# Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
from google.colab import drive
import keras
from keras import layers
import tensorflow as tf
from sklearn.utils import resample

## Data Reader

In [None]:
class DataReader:
  """Reads the ISIC 2018 Task 3 ground-truth CSVs from the project folder."""

  def __init__(self):
    pass

  def capture_df(self, df_type):
    """Load the ground-truth CSV for one split and add an image path per record.

    Args:
      df_type: "train" selects the training CSV and "validate" the
        validation CSV; every other value selects the test CSV.

    Returns:
      The dataframe read from the CSV with an extra ``img_path`` column,
      built from the project ``images/`` folder, the record's ``image``
      value and a ``.jpg`` suffix.
    """
    project_root = '/content/drive/MyDrive/Colab Notebooks/skin-cancer-project/'

    # Start from the test split: it is the fallback for any unrecognised df_type.
    split_dir = 'datasets/test/'
    ground_truth_file = 'ISIC2018_Task3_Test_GroundTruth.csv'
    if df_type == "train":
      split_dir = 'datasets/train/'
      ground_truth_file = 'ISIC2018_Task3_Training_GroundTruth.csv'
    elif df_type == "validate":
      split_dir = 'datasets/validate/'
      ground_truth_file = 'ISIC2018_Task3_Validation_GroundTruth.csv'

    records = pd.read_csv(f"{project_root}{split_dir}{ground_truth_file}")

    # All splits resolve their images against the same images/ folder.
    image_dir = project_root + 'images/'
    records['img_path'] = image_dir + records['image'] + '.jpg'

    return records

## Data Balancing

In [2]:
class DataBalancer:
  """Resamples a labelled dataframe to a fixed number of rows per class."""

  # Seed used for both the per-class resampling and the final shuffle,
  # so repeated runs over the same input give the same output.
  RANDOM_STATE = 42

  def __init__(self, balance_target, classes):
    """Store the balancing configuration.

    Args:
      balance_target: Number of rows every class has after balancing.
      classes: Names of the class columns in the dataframe.
    """
    self.BALANCE_COUNT_PER_CLASS = balance_target
    self.CLASSES = classes

  def balance_data(self, df):
    """Return a shuffled copy of ``df`` with the target row count per class.

    Each row is assigned to the class column holding its largest value
    (for one-hot rows, the column set to 1). Classes with fewer rows than
    the target are sampled with replacement; all others are sampled
    without replacement.

    Args:
      df: Dataframe containing every column named in ``self.CLASSES``.
        It is not modified.

    Returns:
      A new dataframe with the columns of ``df`` plus a ``label`` column,
      rows shuffled and the index reset to 0..n-1.
    """
    # Work on a copy so the caller's dataframe does not silently gain a
    # 'label' column.
    frame = df.copy()
    frame['label'] = frame[self.CLASSES].idxmax(axis=1)

    # Collect the per-class samples and concatenate once. Growing a frame
    # seeded with an empty DataFrame re-copies the data on every iteration
    # and lets the empty object-dtype seed influence the result dtypes.
    per_class_samples = []
    for label in self.CLASSES:
      class_rows = frame[frame['label'] == label]
      # Only classes short of the target need sampling with replacement.
      needs_upsampling = len(class_rows) < self.BALANCE_COUNT_PER_CLASS
      per_class_samples.append(
          resample(
              class_rows,
              replace=needs_upsampling,
              n_samples=self.BALANCE_COUNT_PER_CLASS,
              random_state=self.RANDOM_STATE,
          )
      )

    if per_class_samples:
      balanced = pd.concat(per_class_samples)
    else:
      # No classes configured: keep the previous result of an empty frame
      # that still carries the expected columns.
      balanced = pd.DataFrame(columns=frame.columns)

    return balanced.sample(frac=1, random_state=self.RANDOM_STATE).reset_index(drop=True)

## Data Preparation Flow

In [None]:
class DataPreparer:
  """Builds batched ``tf.data`` pipelines of preprocessed images and labels."""

  def __init__(self, aug_rotation, batch_size, channels, image_resize, classes):
    """Store the pipeline settings and build the augmentation model.

    Args:
      aug_rotation: ``factor`` passed to ``keras.layers.RandomRotation``.
      batch_size: Number of examples per batch.
      channels: ``channels`` passed to ``tf.image.decode_jpeg``.
      image_resize: Target size passed to ``tf.image.resize``
        (presumably ``(height, width)`` - confirm against the caller).
      classes: Names of the dataframe columns used as labels.
    """
    self.AUGMENTATION_ROTATION = aug_rotation
    self.BATCH_SIZE = batch_size
    self.CHANNELS = channels
    self.IMAGE_RESIZE = image_resize
    self.CLASSES = classes
    self.augmenter = keras.Sequential([keras.layers.RandomRotation(factor=self.AUGMENTATION_ROTATION),])

  def load_and_preprocess_image(self, path, label, training=False):
    """Read one JPEG and apply augmentation, resizing and pixel scaling.

    Args:
      path: Path of the JPEG file to read.
      label: Label passed through unchanged.
      training: When true, the random rotation is applied before resizing.

    Returns:
      A tuple ``(image, label)`` where ``image`` has been resized to
      ``self.IMAGE_RESIZE`` and divided by 255.
    """
    image = tf.io.read_file(path)
    image = tf.image.decode_jpeg(image, channels=self.CHANNELS)

    if training:
      # Keras random preprocessing layers only perturb their input in
      # training mode. Called standalone inside Dataset.map there is no
      # fit() context to supply that mode, so request it explicitly;
      # otherwise the rotation can silently become a no-op.
      image = self.augmenter(image, training=True)

    image = tf.image.resize(image, self.IMAGE_RESIZE)
    image = image / 255.0
    return image, label

  def create_dataset(self, df, training=False):
    """Create a batched, prefetched dataset of images and labels from ``df``.

    Args:
      df: Dataframe with an ``img_path`` column and every column named in
        ``self.CLASSES``.
      training: When true, examples are shuffled and augmented.

    Returns:
      A ``tf.data.Dataset`` yielding batches of ``(image, label)``.
    """
    image_path = df['img_path'].values
    labels = df[self.CLASSES].values
    dataset = tf.data.Dataset.from_tensor_slices((image_path, labels))

    if training:
      # Shuffle the (path, label) pairs before decoding so the shuffle
      # buffer holds short strings rather than 512 decoded images.
      dataset = dataset.shuffle(512)

    dataset = dataset.map(
        lambda x, y: self.load_and_preprocess_image(x, y, training),
        num_parallel_calls=tf.data.AUTOTUNE
      )

    dataset = dataset.batch(self.BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
    return dataset