In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql.types import *
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer
import shutil
import warnings
import time
import os
import torch

warnings.filterwarnings('ignore')

In [2]:
def timing(start):
    """Print the wall-clock time elapsed since `start`.

    Args:
        start: A timestamp previously obtained from `time.time()`.
    """
    elapsed = time.time() - start
    print('Elapsed time: {:.2f} s'.format(elapsed))
# start = time.time()

# Start Session

In [3]:
start = time.time()

# Create (or reuse, via getOrCreate) a Spark session named 'SparkCPU' with
# the driver memory set to 15 GB.
spark = SparkSession.builder.appName('SparkCPU').config("spark.driver.memory", "15g").getOrCreate()

timing(start)

23/08/01 11:14:48 WARN Utils: Your hostname, bdai-desktop resolves to a loopback address: 127.0.1.1; using 165.132.118.198 instead (on interface enp0s31f6)
23/08/01 11:14:48 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/01 11:14:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Elapsed time: 2.56 s


# 1. Load Dataset

In [4]:
# Locations of the input Parquet datasets and of the Petastorm cache.
# NOTE: these are absolute paths specific to the machine the notebook was run
# on; adjust them before running elsewhere.
train_image_path = "/home/bdai/spark_work/spark-warehouse/covid_train_binary"
test_image_path = "/home/bdai/spark_work/spark-warehouse/covid_test_binary"
# Passed to Petastorm as the parent cache directory URL (note the file:// scheme).
cache_path = "file:///home/bdai/spark_work/petastorm"

In [5]:
start = time.time()

# Read the train and test datasets from Parquet.
train_df = spark.read.parquet(train_image_path)
df_test = spark.read.parquet(test_image_path)

# Split the training data into train/validation subsets with weights 0.8/0.2;
# the fixed seed keeps the split repeatable across runs.
df_train, df_val = train_df.randomSplit([0.8, 0.2], seed=12345)

timing(start)

Elapsed time: 2.61 s


# 2. Image preprocessing

In [6]:
import torchvision.transforms as transforms
import io
import numpy as np
import pandas as pd
from PIL import Image

from petastorm.spark import SparkDatasetConverter, make_spark_converter
from petastorm import TransformSpec

# Shape declared to Petastorm for the decoded 'features' column and used by
# preprocess() as the resize target (channels first, then the two spatial sizes).
image_shape = (3, 224, 224)

## 1) Cache the Spark DataFrame using Petastorm Spark converter

In [7]:
# Point Petastorm at the parent cache directory (cache_path, a local file://
# URL) where it stores the intermediate copies of the DataFrames.
start = time.time()

spark.conf.set(SparkDatasetConverter.PARENT_CACHE_DIR_URL_CONF, cache_path)

# Build one converter per split. This is the slow step of the notebook
# (about 230 s in the recorded run below).
converter_train = make_spark_converter(df_train)
converter_val = make_spark_converter(df_val)
converter_test = make_spark_converter(df_test)

# len(converter) gives the number of rows available through each converter.
print(f"train: {len(converter_train)}, val: {len(converter_val)}, test : {len(converter_test)}")

timing(start)

Converting floating-point columns to float32
23/08/01 11:15:33 WARN InternalParquetRecordWriter: Too much memory used: Store {
 [class] optional binary class (STRING) {
  r:0 bytes
  d:0 bytes
   data: FallbackValuesWriter{
   data: initial: DictionaryValuesWriter{
   data: initial: dict:24
   data: initial: values:120
   data: initial:}

   data: fallback: PLAIN CapacityByteArrayOutputStream 0 slabs, 0 bytes
   data:}

   pages: ColumnChunkPageWriter ConcatenatingByteArrayCollector 0 slabs, 0 bytes
   total: 360/144
 }
 [content] optional binary content {
  r:0 bytes
  d:0 bytes
   data: FallbackValuesWriter{
   data: initial: DictionaryValuesWriter{
   data: initial: dict:0
   data: initial: values:0
   data: initial:}

   data: fallback: PLAIN CapacityByteArrayOutputStream 60 slabs, 114,587,997 bytes
   data:}

   pages: ColumnChunkPageWriter ConcatenatingByteArrayCollector 0 slabs, 0 bytes
   total: 114,587,997/114,587,997
 }
 [file_name] optional binary file_name (STRING) {
  r:0 

train: 23931, val: 6055, test : 400
Elapsed time: 230.71 s


## 2) Preprocess images
Before feeding the dataset into the model, we need to decode the raw image bytes and apply standard ImageNet transforms. We recommend not doing this transformation on the Spark DataFrame since that will substantially increase the size of the intermediate files and might harm the performance. Instead, we recommend doing this transformation in a TransformSpec function in petastorm.

In [10]:
def preprocess(content):
    """Decode raw image bytes into a channels-first numpy array.

    The image is resized to the spatial size given by the module-level
    `image_shape` (channels, height, width), converted to RGB, and turned
    into an array with torchvision's `ToTensor`.

    Args:
        content: Encoded image file contents (bytes) readable by PIL.

    Returns:
        The numpy array produced by `ToTensor()(image).numpy()`; for an RGB
        image this is laid out as (3, height, width).
    """
    _, height, width = image_shape
    # PIL's resize() expects (width, height), whereas image_shape is stored
    # channels-first as (C, H, W), so the two spatial sizes must be swapped.
    # For the square 224x224 default this is identical to the previous code.
    image = Image.open(io.BytesIO(content)).resize((width, height))
    image = image.convert('RGB')
    return transforms.ToTensor()(image).numpy()
    

def transform_row(pd_batch):
  """
  Turn a batch of raw rows into model-ready rows.

  The input and output of this function must be pandas dataframes.

  Args:
    pd_batch: DataFrame with a 'content' column (encoded image bytes) and a
      'label' column whose values can be converted with int().

  Returns:
    A new DataFrame with exactly the columns ['features', 'label'], where
    'features' holds the output of preprocess() for each row and 'label'
    holds the integer label. The input frame is left unmodified.
  """
  # assign() returns a new frame, so the caller's batch is not mutated.
  transformed = pd_batch.assign(
      features=pd_batch['content'].map(preprocess),
      label=pd_batch['label'].map(int),
  )
  # Selecting the two output columns also discards 'content' and any others.
  return transformed[['features', 'label']]

def get_transform_spec():
  """Build the petastorm TransformSpec that applies `transform_row`.

  Petastorm cannot infer the shape of columns created inside the transform,
  so the new 'features' column is described explicitly via `edit_fields`,
  and the order of the output columns is fixed via `selected_fields`.
  """
  # Field description for the new column: name, dtype, shape, and a final
  # flag (presumably "nullable" -- confirm against the petastorm docs).
  features_field = ('features', np.float32, image_shape, False)
  output_columns = ['features', 'label']
  return TransformSpec(transform_row,
                       edit_fields=[features_field],
                       selected_fields=output_columns)

## 3) Examining execution time for loading and transforming a batch

In [11]:
start = time.time()

# Measure how long it takes to open the Petastorm dataloader and pull the
# first batches (batch size 16) through the transform spec.
with converter_train.make_torch_dataloader(transform_spec=get_transform_spec(), batch_size=16) as train_dataloader:
    train_dataloader_iter = iter(train_dataloader)
    for idx, batch in enumerate(train_dataloader_iter):
        # The loop stops once the batch with idx == 1 (the second batch) has
        # been fetched; the batch contents themselves are not used.
        if idx == 1: break

timing(start)

Elapsed time: 2.26 s


# 3. Train Model

In [13]:
import numpy as np
import torch
import torchvision

## 1) Get the ResNet model from torchvision

In [14]:
def get_model(lr=0.001):
  """Return a pretrained ResNet50 set up for 2-class transfer learning.

  All pretrained parameters are frozen and the final fully connected layer
  is replaced by a fresh `Linear` layer with 2 outputs, which is the only
  trainable part of the returned model.

  Args:
    lr: Not used by this function; kept so existing callers that pass it
      continue to work.
  """
  backbone = torchvision.models.resnet50(pretrained=True)

  # Freeze every parameter of the feature-extraction layers.
  backbone.requires_grad_(False)

  # Swap in a new classifier head. Parameters of a newly constructed module
  # have requires_grad=True by default, so only this layer will be trained.
  in_features = backbone.fc.in_features
  backbone.fc = torch.nn.Linear(in_features, 2)

  return backbone

## 2) Define the train and evaluate function for the model

In [15]:
def train_one_epoch(model, criterion, optimizer, scheduler,
                    train_dataloader_iter, steps_per_epoch, epoch,
                    device):
  """Train `model` for one epoch and report the average loss and accuracy.

  Metrics are averaged over the number of samples actually consumed, so the
  function does not rely on any module-level batch-size constant and stays
  correct when batches differ in size.

  Args:
    model: The torch module to train.
    criterion: Loss function called as criterion(outputs, labels). The loss
      is weighted by the batch size, which assumes a mean-reduced loss.
    optimizer: Optimizer stepped once per batch.
    scheduler: LR scheduler stepped once at the end of the epoch.
    train_dataloader_iter: Iterator yielding batches that can be indexed with
      'features' and 'label', each providing `.to(device)`.
    steps_per_epoch: Number of batches to consume; must be at least 1.
    epoch: Index of the current epoch (not used inside this function).
    device: Device the batches are moved to.

  Returns:
    (epoch_loss, epoch_acc): a Python float and a 0-dim double tensor.

  Raises:
    ValueError: If `steps_per_epoch` is less than 1.
  """
  if steps_per_epoch < 1:
    raise ValueError('steps_per_epoch must be at least 1')

  model.train()  # Set model to training mode

  # statistics
  running_loss = 0.0
  running_corrects = torch.zeros((), dtype=torch.long, device=device)
  samples_seen = 0

  # Iterate over the data for one epoch.
  for _ in range(steps_per_epoch):
    pd_batch = next(train_dataloader_iter)
    inputs, labels = pd_batch['features'].to(device), pd_batch['label'].to(device)
    batch_size = inputs.size(0)

    # Track history in training
    with torch.set_grad_enabled(True):
      # zero the parameter gradients
      optimizer.zero_grad()

      # forward
      outputs = model(inputs)
      _, preds = torch.max(outputs, 1)
      loss = criterion(outputs, labels)

      # backward + optimize
      loss.backward()
      optimizer.step()

    # statistics: weight the batch loss by the batch size so that the final
    # division by samples_seen yields a per-sample average.
    running_loss += loss.item() * batch_size
    running_corrects += torch.sum(preds == labels)
    samples_seen += batch_size

  scheduler.step()

  epoch_loss = running_loss / samples_seen
  epoch_acc = running_corrects.double() / samples_seen

  print('Train Loss: {:.4f} Acc: {:.4f}'.format(epoch_loss, epoch_acc))
  return epoch_loss, epoch_acc

def evaluate(model, criterion, val_dataloader_iter, validation_steps, device,
             metric_agg_fn=None):
  """Evaluate `model` on validation batches and report average loss/accuracy.

  Loss and accuracy are averaged over the number of samples actually
  consumed, matching `train_one_epoch`, so no module-level batch-size
  constant is needed and unequal batch sizes are handled correctly.

  Args:
    model: The torch module to evaluate.
    criterion: Loss function called as criterion(outputs, labels). The loss
      is weighted by the batch size, which assumes a mean-reduced loss.
    val_dataloader_iter: Iterator yielding batches that can be indexed with
      'features' and 'label', each providing `.to(device)`.
    validation_steps: Number of batches to consume; must be at least 1.
    device: Device the batches are moved to.
    metric_agg_fn: Optional callable `(value, name) -> value`, applied to the
      loss (name 'avg_loss') and the accuracy (name 'avg_acc'); used in
      distributed training to aggregate the metrics on all workers.

  Returns:
    (epoch_loss, epoch_acc), after `metric_agg_fn` has been applied if given.

  Raises:
    ValueError: If `validation_steps` is less than 1.
  """
  if validation_steps < 1:
    raise ValueError('validation_steps must be at least 1')

  model.eval()  # Set model to evaluate mode

  # statistics
  running_loss = 0.0
  running_corrects = torch.zeros((), dtype=torch.long, device=device)
  samples_seen = 0

  # Iterate over all the validation data.
  for _ in range(validation_steps):
    pd_batch = next(val_dataloader_iter)
    inputs, labels = pd_batch['features'].to(device), pd_batch['label'].to(device)
    batch_size = inputs.size(0)

    # Do not track history in evaluation to save memory
    with torch.no_grad():
      # forward
      outputs = model(inputs)
      _, preds = torch.max(outputs, 1)
      loss = criterion(outputs, labels)

    # statistics: weight each batch by its size to get a per-sample average.
    running_loss += loss.item() * batch_size
    running_corrects += torch.sum(preds == labels)
    samples_seen += batch_size

  epoch_loss = running_loss / samples_seen
  epoch_acc = running_corrects.double() / samples_seen

  # metric_agg_fn is used in the distributed training to aggregate the metrics on all workers
  if metric_agg_fn is not None:
    epoch_loss = metric_agg_fn(epoch_loss, 'avg_loss')
    epoch_acc = metric_agg_fn(epoch_acc, 'avg_acc')

  print('Validation Loss: {:.4f} Acc: {:.4f}'.format(epoch_loss, epoch_acc))
  return epoch_loss, epoch_acc

## 3) Train and evaluate the model on the local machine
Use converter.make_torch_dataloader(...) to create the dataloader.

In [18]:
# Hyperparameters: module-level constants used by train_and_evaluate().
NUM_EPOCHS = 2   # number of train + validation passes
BATCH_SIZE = 64  # batch size requested from the Petastorm dataloaders

In [19]:
def train_and_evaluate(lr=0.001):
    """Fine-tune the classifier head locally and return the last validation loss.

    Runs on CUDA when available, otherwise on CPU. Training data comes from
    `converter_train` and validation data from `converter_val`, both read
    through Petastorm torch dataloaders with batch size `BATCH_SIZE`, for
    `NUM_EPOCHS` epochs.

    Args:
        lr: Learning rate for the SGD optimizer.

    Returns:
        The validation loss from the final epoch.
    """
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    model = get_model(lr=lr).to(device)
    criterion = torch.nn.CrossEntropyLoss()

    # Only the parameters of the final layer are optimized.
    optimizer = torch.optim.SGD(model.fc.parameters(), lr=lr, momentum=0.9)

    # Decay the learning rate by a factor of 0.1 every 7 epochs.
    exp_lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

    with converter_train.make_torch_dataloader(transform_spec=get_transform_spec(),
                                               batch_size=BATCH_SIZE) as train_dataloader:
        with converter_val.make_torch_dataloader(transform_spec=get_transform_spec(),
                                                 batch_size=BATCH_SIZE) as val_dataloader:

            # The same iterators are reused across epochs; each epoch consumes
            # a fixed number of batches from them.
            train_batches = iter(train_dataloader)
            steps_per_epoch = len(converter_train) // BATCH_SIZE

            val_batches = iter(val_dataloader)
            validation_steps = max(1, len(converter_val) // BATCH_SIZE)

            for epoch in range(NUM_EPOCHS):
                print(f'Epoch {epoch + 1}/{NUM_EPOCHS}')
                print('----------')

                train_loss, train_acc = train_one_epoch(
                    model, criterion, optimizer, exp_lr_scheduler,
                    train_batches, steps_per_epoch, epoch, device)
                val_loss, val_acc = evaluate(
                    model, criterion, val_batches, validation_steps, device)
    return val_loss

# Run training with the default learning rate; `loss` receives the value
# returned by train_and_evaluate(), i.e. the last epoch's validation loss.
loss = train_and_evaluate()


Epoch 1/2
----------
Train Loss: 2.7889 Acc: 0.5425
Validation Loss: 4.8211 Acc: 0.5354
Epoch 2/2
----------
Train Loss: 2.8595 Acc: 0.5657
Validation Loss: 5.4731 Acc: 0.5424


In [12]:
# Stop the Spark session created at the start of the notebook.
spark.stop()