In [1]:
import os
# Tell PySpark which JVM and Spark installation to use, and make the PySpark
# launcher start a local[*] master with the RAPIDS accelerator and cuDF jars
# passed through --jars. Must run before the Spark session is created.
os.environ.update({
    'JAVA_HOME': "/usr/lib/jvm/java-8-openjdk-amd64",
    'SPARK_HOME': "/home/bdai/spark_work/spark-3.2.4-bin-hadoop3.2",
    'PYSPARK_SUBMIT_ARGS': "--jars /home/bdai/spark_work/rapids-4-spark_2.12-23.06.0.jar,/home/bdai/spark_work/cudf-23.06.0-cuda12.jar --master local[*] pyspark-shell",
})

In [2]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql.types import *
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer
import shutil
import warnings
import time
import torch

# Suppress every Python-level warning raised from this point on.
# NOTE(review): this also hides deprecation warnings from torch/torchvision/petastorm;
# consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

In [3]:
def timing(start):
    """Print the wall-clock time elapsed since ``start``.

    Args:
        start: A timestamp previously obtained from ``time.time()``.

    Returns:
        None. The elapsed seconds are printed with two decimals.
    """
    elapsed = time.time() - start
    print(f'Elapsed time: {elapsed:.2f} s')
# start = time.time()

# Start Session

In [4]:
start = time.time()

# Create (or reuse) the session with the RAPIDS SQL plugin loaded and a 15g driver.
spark = (
    SparkSession.builder
    .appName('SparkTrain')
    .config('spark.plugins', 'com.nvidia.spark.SQLPlugin')
    .config("spark.driver.memory", "15g")
    .getOrCreate()
)

# Register both accelerator jars with the SparkContext.
for jar_path in (
    '/home/bdai/spark_work/rapids-4-spark_2.12-23.06.0.jar',
    '/home/bdai/spark_work/cudf-23.06.0-cuda12.jar',
):
    spark.sparkContext.addPyFile(jar_path)

# Switch on GPU SQL execution, incompatible ops and CSV support at runtime.
for rapids_flag in (
    'spark.rapids.sql.enabled',
    'spark.rapids.sql.incompatibleOps.enabled',
    'spark.rapids.sql.format.csv.read.enabled',
    'spark.rapids.sql.format.csv.enabled',
):
    spark.conf.set(rapids_flag, 'true')

timing(start)

23/08/01 09:32:48 WARN Utils: Your hostname, bdai-desktop resolves to a loopback address: 127.0.1.1; using 165.132.118.198 instead (on interface enp0s31f6)
23/08/01 09:32:48 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
23/08/01 09:32:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/01 09:32:49 WARN RapidsPluginUtils: RAPIDS Accelerator 23.06.0 using cudf 23.06.0.
23/08/01 09:32:49 WARN RapidsPluginUtils: RAPIDS Accelerator is enabled, to disable GPU support set `spark.rapids.sql.enabled` to false.
23/08/01 09:32:49 WARN RapidsPluginUtils: spark.rapids.sql.explain is set to `NOT_ON_GPU`. Set it to 'NONE' to suppress the diagnostics logging about the query placement on the 

Elapsed time: 27.56 s


# 1. Load Dataset

In [5]:
# Parquet directories read with spark.read.parquet in the next cell.
train_image_path = "/home/bdai/spark_work/spark-warehouse/covid_train_binary"
test_image_path = "/home/bdai/spark_work/spark-warehouse/covid_test_binary"
# Local-filesystem URL later registered as Petastorm's parent cache directory.
# NOTE(review): all three are absolute, machine-specific paths; consider deriving
# them from a single configurable base directory.
cache_path = "file:///home/bdai/spark_work/petastorm"

In [6]:
start = time.time()

# Read the held-out test set and the full training set from Parquet.
df_test = spark.read.parquet(test_image_path)
train_df = spark.read.parquet(train_image_path)

# Carve a validation split (80/20) out of the training data; the fixed seed
# keeps the split repeatable.
df_train, df_val = train_df.randomSplit(weights=[0.8, 0.2], seed=12345)

timing(start)

                                                                                

Elapsed time: 2.70 s


# 2. Image preprocessing

In [8]:
import torchvision.transforms as transforms
import io
import numpy as np
import pandas as pd
from PIL import Image

from petastorm.spark import SparkDatasetConverter, make_spark_converter
from petastorm import TransformSpec

# Shape of each preprocessed image; declared as the shape of the `features`
# field in get_transform_spec. Presumably (channels, height, width) — TODO confirm.
image_shape = (3, 224, 224)

## 1) Cache the Spark DataFrame using Petastorm Spark converter

In [9]:
# Register the local cache directory (cache_path, a file:// URL) in which
# Petastorm materialises the intermediate copies of the DataFrames.
start = time.time()

spark.conf.set(SparkDatasetConverter.PARENT_CACHE_DIR_URL_CONF, cache_path)

# One converter per split, created in train -> val -> test order.
converter_train, converter_val, converter_test = (
    make_spark_converter(split_df) for split_df in (df_train, df_val, df_test)
)

print(f"train: {len(converter_train)}, val: {len(converter_val)}, test : {len(converter_test)}")

timing(start)

Converting floating-point columns to float32
23/08/01 09:33:57 WARN GpuOverrides: 
    !Exec <SampleExec> cannot run on GPU because unsupported data types in output: BinaryType [content#3]; unsupported data types in input: BinaryType [content#3]
      !Exec <SortExec> cannot run on GPU because not all expressions can be replaced
        @Expression <SortOrder> path#0 ASC NULLS FIRST could run on GPU
          @Expression <AttributeReference> path#0 could run on GPU
        @Expression <SortOrder> file_name#1 ASC NULLS FIRST could run on GPU
          @Expression <AttributeReference> file_name#1 could run on GPU
        @Expression <SortOrder> size#2 ASC NULLS FIRST could run on GPU
          @Expression <AttributeReference> size#2 could run on GPU
        !Expression <SortOrder> content#3 ASC NULLS FIRST cannot run on GPU because input expression AttributeReference content#3 (BinaryType is not supported); expression SortOrder content#3 ASC NULLS FIRST produces an unsupported type Binar

train: 23912, val: 6074, test : 400
Elapsed time: 291.73 s


## 2) Preprocess images
Before feeding the dataset into the model, we need to decode the raw image bytes and apply standard ImageNet transforms. We recommend not doing this transformation on the Spark DataFrame since that will substantially increase the size of the intermediate files and might harm the performance. Instead, we recommend doing this transformation in a TransformSpec function in petastorm.

In [10]:
def preprocess(content):
    """Decode raw image bytes into a channel-first numpy array.

    The image is resized to the spatial size given by the module-level
    ``image_shape`` (read as ``(channels, height, width)``), converted to RGB
    and turned into an array with torchvision's ``ToTensor``.

    Args:
        content: Encoded image bytes (anything ``PIL.Image.open`` can read).

    Returns:
        numpy.ndarray produced by ``ToTensor`` for the resized RGB image.
    """
    _, target_height, target_width = image_shape
    # PIL's resize expects (width, height), whereas image_shape is ordered
    # (channels, height, width); pass the axes in PIL order so non-square
    # targets still match the shape declared in the TransformSpec.
    resized = Image.open(io.BytesIO(content)).resize((target_width, target_height))
    # Resize first, then convert to RGB, as before.
    tensor = transforms.ToTensor()(resized.convert('RGB'))
    return tensor.numpy()
    

def transform_row(pd_batch):
  """Turn a raw Petastorm batch into model-ready columns.

  The input and output of this function must be pandas dataframes.

  Args:
    pd_batch: DataFrame with at least a ``content`` column (encoded image
      bytes) and a ``label`` column.

  Returns:
    A new DataFrame with exactly the columns ``features`` (the output of
    ``preprocess`` for each image) and ``label`` (cast to ``int``), in that
    order and with the input's index. The input frame is left untouched.
  """
  # Build a fresh frame instead of adding columns to the caller's batch; the
  # raw `content` column is dropped simply by not being copied over.
  return pd.DataFrame({
      'features': pd_batch['content'].map(preprocess),
      'label': pd_batch['label'].map(int),
  })

def get_transform_spec():
  """Build the Petastorm ``TransformSpec`` that applies ``transform_row``.

  Petastorm cannot infer the shape of columns created inside the transform,
  so the new ``features`` column is declared explicitly (float32, shaped like
  ``image_shape``), and the output column order is pinned via
  ``selected_fields``.

  Returns:
    A ``TransformSpec`` yielding the columns ``features`` and ``label``.
  """
  output_columns = ['features', 'label']
  # (name, dtype, shape, flag); the trailing False is presumably the
  # "nullable" flag — TODO confirm against the Petastorm docs.
  features_field = ('features', np.float32, image_shape, False)
  return TransformSpec(transform_row,
                       edit_fields=[features_field],
                       selected_fields=output_columns)

## 3) Examining the execution time for loading and transforming a batch

In [11]:
# Measure how long it takes to open the training dataloader and pull the first
# batches through the transform (decode + resize + ToTensor).
start = time.time()

with converter_train.make_torch_dataloader(transform_spec=get_transform_spec(), batch_size=16) as train_dataloader:
    train_dataloader_iter = iter(train_dataloader)
    for idx, batch in enumerate(train_dataloader_iter):
        # Stops once the second batch (idx 0 and idx 1) has been fetched.
        if idx == 1: break

# NOTE(review): train_dataloader_iter stays in the kernel namespace after the
# `with` block has closed the dataloader; later cells should not reuse it.
timing(start)

Elapsed time: 5.36 s


# 3. Train Model

In [10]:
import numpy as np
import torch
import torchvision

## 1) Get the ResNet-50 model from torchvision

In [12]:
def get_model(lr=0.001):
  """Create a ResNet-50 set up for two-class transfer learning.

  Args:
    lr: Accepted for interface compatibility; not used when building the model.

  Returns:
    A torchvision ResNet-50 loaded with ``pretrained=True`` in which every
    existing parameter is frozen and the final ``fc`` layer is replaced by a
    fresh, trainable ``Linear(in_features, 2)``.
  """
  # Load the pretrained ResNet-50 backbone from torchvision.
  backbone = torchvision.models.resnet50(pretrained=True)

  # Freeze all pretrained weights (same effect as looping over the parameters).
  backbone.requires_grad_(False)

  # Swap the classification head; a newly constructed module has
  # requires_grad=True by default, so only this layer is trained.
  backbone.fc = torch.nn.Linear(backbone.fc.in_features, 2)

  return backbone

## 2) Define the train and evaluate function for the model

In [13]:
def train_one_epoch(model, criterion, optimizer, scheduler, 
                    train_dataloader_iter, steps_per_epoch, epoch, 
                    device):
  """Run one training epoch and report sample-averaged loss and accuracy.

  Args:
    model: The ``torch.nn.Module`` to train (switched to train mode here).
    criterion: Loss callable taking ``(outputs, labels)``; its value is
      multiplied by the batch size, which assumes it returns a per-batch mean
      (as ``CrossEntropyLoss`` does by default).
    optimizer: Optimizer stepped once per batch.
    scheduler: LR scheduler stepped once at the end of the epoch.
    train_dataloader_iter: Iterator yielding mappings with the keys
      ``'features'`` and ``'label'`` holding tensors.
    steps_per_epoch: Number of batches to draw from the iterator.
    epoch: Index of the current epoch (currently unused).
    device: Device the batch tensors are moved to.

  Returns:
    ``(epoch_loss, epoch_acc)``: a Python float and a double tensor, both
    averaged over the samples actually processed.

  Raises:
    StopIteration: If the iterator yields fewer than ``steps_per_epoch`` batches.
    ZeroDivisionError: If no samples were processed (e.g. ``steps_per_epoch == 0``).
  """
  model.train()  # Set model to training mode

  # statistics
  running_loss = 0.0
  running_corrects = 0
  samples_seen = 0

  # Iterate over the data for one epoch.
  for _ in range(steps_per_epoch):
    pd_batch = next(train_dataloader_iter)
    inputs, labels = pd_batch['features'].to(device), pd_batch['label'].to(device)
    batch_len = inputs.size(0)

    # Track history in training
    with torch.set_grad_enabled(True):
      # zero the parameter gradients
      optimizer.zero_grad()

      # forward
      outputs = model(inputs)
      _, preds = torch.max(outputs, 1)
      loss = criterion(outputs, labels)

      # backward + optimize
      loss.backward()
      optimizer.step()

    # statistics
    running_loss += loss.item() * batch_len
    running_corrects += torch.sum(preds == labels)
    samples_seen += batch_len

  scheduler.step()

  # Normalise by the number of samples actually seen rather than by a global
  # batch-size constant, so short final batches and other batch sizes are
  # handled correctly.
  epoch_loss = running_loss / samples_seen
  epoch_acc = running_corrects.double() / samples_seen

  print('Train Loss: {:.4f} Acc: {:.4f}'.format(epoch_loss, epoch_acc))
  return epoch_loss, epoch_acc

def evaluate(model, criterion, val_dataloader_iter, validation_steps, device, 
             metric_agg_fn=None):
  """Evaluate the model and report sample-averaged loss and accuracy.

  Args:
    model: The ``torch.nn.Module`` to evaluate (switched to eval mode here).
    criterion: Loss callable taking ``(outputs, labels)``; its value is
      multiplied by the batch size, which assumes it returns a per-batch mean
      (as ``CrossEntropyLoss`` does by default).
    val_dataloader_iter: Iterator yielding mappings with the keys
      ``'features'`` and ``'label'`` holding tensors.
    validation_steps: Number of batches to draw from the iterator.
    device: Device the batch tensors are moved to.
    metric_agg_fn: Optional callable ``(value, name) -> value`` applied to the
      loss (name ``'avg_loss'``) and the accuracy (name ``'avg_acc'``), e.g. to
      aggregate metrics across workers in distributed training.

  Returns:
    ``(epoch_loss, epoch_acc)`` averaged over the samples actually processed,
    after ``metric_agg_fn`` has been applied if one was given.

  Raises:
    StopIteration: If the iterator yields fewer than ``validation_steps`` batches.
    ZeroDivisionError: If no samples were processed (e.g. ``validation_steps == 0``).
  """
  model.eval()  # Set model to evaluate mode

  # statistics
  running_loss = 0.0
  running_corrects = 0
  samples_seen = 0

  # Iterate over all the validation data.
  for _ in range(validation_steps):
    pd_batch = next(val_dataloader_iter)
    inputs, labels = pd_batch['features'].to(device), pd_batch['label'].to(device)
    batch_len = inputs.size(0)

    # Do not track history in evaluation to save memory
    with torch.set_grad_enabled(False):
      # forward
      outputs = model(inputs)
      _, preds = torch.max(outputs, 1)
      loss = criterion(outputs, labels)

    # statistics: weight each batch by its size so that a short batch does not
    # count as much as a full one.
    running_loss += loss.item() * batch_len
    running_corrects += torch.sum(preds == labels)
    samples_seen += batch_len

  # Normalise by the samples actually seen instead of a global batch-size constant.
  epoch_loss = running_loss / samples_seen
  epoch_acc = running_corrects.double() / samples_seen

  # metric_agg_fn is used in the distributed training to aggregate the metrics on all workers
  if metric_agg_fn is not None:
    epoch_loss = metric_agg_fn(epoch_loss, 'avg_loss')
    epoch_acc = metric_agg_fn(epoch_acc, 'avg_acc')

  print('Validation Loss: {:.4f} Acc: {:.4f}'.format(epoch_loss, epoch_acc))
  return epoch_loss, epoch_acc

## 3) Train and evaluate the model on the local machine
Use converter.make_torch_dataloader(...) to create the dataloader.

In [16]:
# hyperparameters
# NUM_EPOCHS: how many times the training loop below iterates.
NUM_EPOCHS = 2
# BATCH_SIZE: batch size passed to make_torch_dataloader; also used to derive
# the number of steps per epoch.
BATCH_SIZE = 16

In [24]:
def train_and_evaluate(lr=0.001):
    """Fine-tune the classifier head on the training split and validate each epoch.

    Trains for ``NUM_EPOCHS`` epochs on ``converter_train`` and evaluates on
    ``converter_val`` after every epoch. The held-out ``converter_test`` data
    is deliberately not used for training.

    Args:
        lr: Learning rate for the SGD optimizer (momentum 0.9).

    Returns:
        The training loss of the last epoch, or ``None`` if ``NUM_EPOCHS`` is 0.
    """
    # Fall back to the CPU when no CUDA device is available.
    # NOTE(review): the recorded run failed with CUDA OOM with only ~39 MiB free
    # on a 3.8 GiB GPU; presumably the RAPIDS Spark plugin already holds most of
    # the GPU memory — confirm, and stop or reconfigure Spark before training.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = get_model(lr=lr)
    model = model.to(device)

    criterion = torch.nn.CrossEntropyLoss()

    # Only parameters of final layer are being optimized.
    optimizer = torch.optim.SGD(model.fc.parameters(), lr=lr, momentum=0.9)

    # Decay LR by a factor of 0.1 every 7 epochs
    exp_lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

    # Defined up front so the return below is valid even when NUM_EPOCHS == 0.
    train_loss = None

    with converter_train.make_torch_dataloader(transform_spec=get_transform_spec(),
                                               batch_size=BATCH_SIZE) as train_dataloader, \
         converter_val.make_torch_dataloader(transform_spec=get_transform_spec(),
                                             batch_size=BATCH_SIZE) as val_dataloader:

        train_dataloader_iter = iter(train_dataloader)
        steps_per_epoch = len(converter_train) // BATCH_SIZE

        val_dataloader_iter = iter(val_dataloader)
        validation_steps = max(1, len(converter_val) // BATCH_SIZE)

        for epoch in range(NUM_EPOCHS):
            print('Epoch {}/{}'.format(epoch + 1, NUM_EPOCHS))
            print('-' * 10)

            train_loss, train_acc = train_one_epoch(model, criterion, optimizer, exp_lr_scheduler,
                                                    train_dataloader_iter, steps_per_epoch, epoch,
                                                    device)
            val_loss, val_acc = evaluate(model, criterion, val_dataloader_iter,
                                         validation_steps, device)
    return train_loss

loss = train_and_evaluate()


RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 3.80 GiB total capacity; 68.83 MiB already allocated; 38.69 MiB free; 78.00 MiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Inline (non-function) version of the train/validate loop, using lr=0.01.
lr = 0.01
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = get_model(lr=lr)
model = model.to(device)

criterion = torch.nn.CrossEntropyLoss()

# Only parameters of final layer are being optimized.
# get_model() returns a ResNet whose trainable head is `model.fc`; there is no
# `classifier` attribute on it (that belongs to MobileNetV2-style models).
optimizer = torch.optim.SGD(model.fc.parameters(), lr=lr, momentum=0.9)

# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

with converter_train.make_torch_dataloader(transform_spec=get_transform_spec(),
                                           batch_size=BATCH_SIZE) as train_dataloader, \
     converter_val.make_torch_dataloader(transform_spec=get_transform_spec(),
                                         batch_size=BATCH_SIZE) as val_dataloader:

    train_dataloader_iter = iter(train_dataloader)
    steps_per_epoch = len(converter_train) // BATCH_SIZE

    val_dataloader_iter = iter(val_dataloader)
    validation_steps = max(1, len(converter_val) // BATCH_SIZE)

    for epoch in range(NUM_EPOCHS):
        print('Epoch {}/{}'.format(epoch + 1, NUM_EPOCHS))
        print('-' * 10)

        train_loss, train_acc = train_one_epoch(model, criterion, optimizer, exp_lr_scheduler,
                                                train_dataloader_iter, steps_per_epoch, epoch,
                                                device)
        val_loss, val_acc = evaluate(model, criterion, val_dataloader_iter, validation_steps, device)

# NOTE(review): this starts a second, independent training run with the
# function's default lr right after the inline loop above — TODO confirm it is
# still wanted, otherwise drop one of the two.
loss = train_and_evaluate()


In [55]:
# Scratch probe: pulls one more batch from whichever `train_dataloader_iter`
# is left in the kernel namespace. The recorded run raised StopIteration,
# presumably because the owning dataloader context had already been closed —
# TODO confirm, then remove this cell or recreate the iterator inside a `with` block.
next(train_dataloader_iter)

StopIteration: 

  self._filesystem = pyarrow.localfs
  dataset = pq.ParquetDataset(path_or_paths, filesystem=fs, validate_schema=False, metadata_nthreads=10)
  dataset = pq.ParquetDataset(path_or_paths, filesystem=fs, validate_schema=False, metadata_nthreads=10)
  if not dataset.common_metadata:
  self.dataset = pq.ParquetDataset(dataset_path, filesystem=pyarrow_filesystem,
  self.dataset = pq.ParquetDataset(dataset_path, filesystem=pyarrow_filesystem,
  meta = parquet_dataset.pieces[0].get_metadata()
  for partition in (parquet_dataset.partitions or []):
  metadata = dataset.metadata
  common_metadata = dataset.common_metadata
  futures_list = [thread_pool.submit(_split_piece, piece, dataset.fs.open) for piece in dataset.pieces]
  futures_list = [thread_pool.submit(_split_piece, piece, dataset.fs.open) for piece in dataset.pieces]
  return [pq.ParquetDatasetPiece(piece.path, open_file_func=fs_open,
  self._dataset = pq.ParquetDataset(
  parquet_file = ParquetFile(self._dataset.fs.open(piece.path))
  

TypeError: 'TorchDatasetContextManager' object is not iterable

In [91]:
# NOTE(review): leftover scratch cell. `train_iter` is not defined anywhere in
# the visible notebook and the recorded run raised NameError; presumably
# `train_dataloader_iter` was meant — TODO remove or fix.
train_iter

NameError: name 'train_iter' is not defined