# Step 1: Importing Essential Libraries

In [1]:
import pandas as pd
import numpy as np
import warnings 
warnings.filterwarnings('ignore')



# Step 2: Loading data and Making labels

In [2]:
#Dataset used: https://www.kaggle.com/kmader/skin-cancer-mnist-ham10000
path='archive/hmnist_28_28_RGB.csv'
meta_path = "archive/HAM10000_metadata.csv"

In [3]:
df = pd.read_csv(path)
metadata = pd.read_csv(meta_path)

In [4]:
#drop NA values
meta_set = metadata[["age", "sex", "localization"]]
df = pd.concat([meta_set, df], axis=1)
df = df.dropna()
df = df.iloc[:,3:]
df.tail()

Unnamed: 0,pixel0000,pixel0001,pixel0002,pixel0003,pixel0004,pixel0005,pixel0006,pixel0007,pixel0008,pixel0009,...,pixel2343,pixel2344,pixel2345,pixel2346,pixel2347,pixel2348,pixel2349,pixel2350,pixel2351,label
10010,183,165,181,182,165,180,184,166,182,188,...,208,185,187,208,186,186,206,187,189,0
10011,2,3,1,38,33,32,121,104,103,132,...,96,79,76,24,23,21,3,4,1,0
10012,132,118,118,167,149,149,175,156,160,184,...,204,181,178,181,159,153,172,151,145,0
10013,160,124,146,164,131,152,167,127,146,169,...,185,162,167,184,157,166,185,162,172,0
10014,175,142,121,181,150,134,181,150,133,178,...,159,79,82,174,137,125,175,139,126,6


# Step 3: Train Test Split

In [5]:
# Split into train, validation, and test.
np.random.seed(2070404)

# Shuffle all records.
df_shuffle = df.sample(frac = 1)

# Create split counts.
splits = np.multiply(len(df_shuffle), (0.6,0.2,0.2)).astype(int)
print(f"Split counts (train/ validation/ test): {splits}")

# Create split data sets.
train_set, valid_set, test_set = np.split(df_shuffle, [splits[0], splits[0] + splits[1]])

# Reset split set indicies.
train_set.reset_index(drop = True, inplace = True)
valid_set.reset_index(drop = True, inplace = True)
test_set.reset_index(drop = True, inplace = True)

Split counts (train/ validation/ test): [5974 1991 1991]


In [6]:
print(len(train_set))

5974


In [7]:
print(len(test_set))

1993


In [8]:
df.label.unique()

array([2, 4, 3, 6, 5, 1, 0])

In [10]:
#separate features and labels

y_train_clean = train_set['label']
x_train_clean = train_set.drop(columns=['label'])
y_valid = valid_set['label']
x_valid = valid_set.drop(columns=['label'])
y_test = test_set['label']
x_test = test_set.drop(columns=['label'])

columns = list(x_train_clean)

# Step 4: Preprocessing

In [11]:
#create a new train set that over-sample the minority class(es) by picking samples at random with replacement
from imblearn.over_sampling import RandomOverSampler

oversample = RandomOverSampler()
x_train_oversample, y_train_oversample  = oversample.fit_resample(x_train_clean, y_train_clean)

In [12]:
import matplotlib.pyplot as plt
import random

x_train_clean = np.array(x_train_clean, dtype=np.uint8).reshape(-1,28,28,3)
x_train_oversample = np.array(x_train_oversample, dtype=np.uint8).reshape(-1,28,28,3)
x_valid = np.array(x_valid, dtype=np.uint8).reshape(-1,28,28,3)

In [13]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Flatten, Dense, MaxPool2D, Dropout, BatchNormalization
from tensorflow.keras.layers import GlobalAveragePooling2D
from tensorflow import keras
import tensorflow as tf

2023-12-05 23:46:20.261116: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [14]:
#create new train set that contains augmented images

#create augmented validation set with greyscale
x_valid_augm = x_valid / 255.0

#create augmented test set with greyscale
x_test_augm = x_test / 255.0

def aug_image(x_train_set, y_train_set, contrast_factor = 2, delta = 0.1, flip_flag = True):
    """ 
    apply transformaions and augmentations
    
    Args:
    x_train_set: dataset to be augmented and transformed
    y_train_set: labels of the dataset
    contrast_factor: contrast factor to be used
    delta: delta value to be used
    flip_flag: whether to apply flipping transformation to the dataset
    
    
    Returns:
    x_train_set_augm: transformed and augmented dataset
    y_train_set_augm: labels of the transformed and augmented dataset
    
    """
    
    tf.random.set_seed(1234)

    #transform image to greyscale
    x_train_set_augm = x_train_set / 255.0

    #change delta
    x_train_set_augm = tf.image.adjust_brightness(x_train_set_augm, delta = delta)

    #change contrast
    x_train_set_augm = tf.image.adjust_contrast(x_train_set_augm, contrast_factor = contrast_factor)

    #flip images
    if flip_flag:
        x_train_set_augm = tf.image.random_flip_left_right(x_train_set_augm)

    #concatenate original dataset and augmented dataset
    x_train_set_augm = tf.concat([x_train_set, x_train_set_augm],axis = 0)

    #concatenate original dataset labels and augmented dataset labels
    y_train_set_augm = y_train_set
    y_train_set_augm = tf.concat([y_train_set, y_train_set_augm],axis = 0)


    # shuffle dataset
    shuffle = tf.random.shuffle(tf.range(tf.shape(x_train_set_augm)[0], dtype = tf.int32))
    x_train_set_augm = tf.gather(x_train_set_augm, shuffle)
    y_train_set_augm = tf.gather(y_train_set_augm, shuffle).numpy() #also transforms y_train to numpy array
    
    return x_train_set_augm, y_train_set_augm

# Step 5: Model Building (CNN)

In [15]:
def build_model(kernel_size = 2, pool_size = 2, learning_rate = 0.001, optimizer_name = "Adam", additional_dense = True):
    """
    Build a CNN model using Keras.

    Args:
    kernel_size: convolution layer kernel size
    pool_size: pooling layer pool size
    learning_rate: optimizer learning rate
    optimizer_name: optimizer used to compile the model
    additional_dense: whether to add an additional dense layer

    Returns:
    model: A tf.keras model
    """
    
    #clear session
    tf.keras.backend.clear_session()
    model = tf.keras.Sequential()

    #add input layer
    model.add(keras.layers.Input(shape=[28, 28, 3]))

    #add convolution layer 1
    model.add(tf.keras.layers.Conv2D(
        filters = 32, kernel_size = (kernel_size, kernel_size),
        strides=(1,1), padding='same',
        data_format = 'channels_last',
        input_shape = (28, 28, 3),  # Updated input shape for RGB
        name='conv_1', activation='relu'))

    #add pooling layer 1
    model.add(tf.keras.layers.MaxPool2D(
        pool_size= (pool_size, pool_size), name = 'pool_1'))

    #add convolution layer 2
    model.add(tf.keras.layers.Conv2D(
        filters = 64, kernel_size = (kernel_size, kernel_size),
        strides = (1,1), padding = 'same',
        name = 'conv_2', activation = 'relu'))

    #add pooling layer 2
    model.add(tf.keras.layers.MaxPool2D(
        pool_size = (pool_size, pool_size), name = 'pool_2'))

    #add flattening layer
    model.add(tf.keras.layers.Flatten())


    #add dense layer 1
    model.add(tf.keras.layers.Dense(
        units = 1024, name = 'fc_1', 
        activation = 'relu'))

    #dropout
    model.add(tf.keras.layers.Dropout(rate = 0.5))
    
    if additional_dense:
        #add dense layer 2
        model.add(tf.keras.layers.Dense(
            units = 1024, name = 'fc_2', 
            activation = 'relu'))

    #add output layer
    model.add(tf.keras.layers.Dense(
        units = 7, name = 'fc_3',
        activation = 'softmax'))

    if optimizer_name == "Adam":
        optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate)
    elif optimizer_name == "SGD":
        optimizer = tf.keras.optimizers.SGD(learning_rate = learning_rate)
    elif optimizer_name == "Adagrad":
        optimizer = tf.keras.optimizers.Adagrad(learning_rate = learning_rate)

    #compile model
    model.compile(loss = 'sparse_categorical_crossentropy',
                  optimizer = optimizer,
                  metrics = ['accuracy'])
                
    return model


# Step 6: Fitting the Model Using Different Datasets

### 6.1: fit models using oversampling dataset

In [16]:
from datetime import datetime

#test 4 hyperparameter combinations based on results from testing with clean (original) dataset
kernel_list = [2, 3]
pool_size = [2, 3]

over_dict = {}

for i in kernel_list:
    for j in pool_size:
        
        start_time = datetime.now()
        print("-----kernel size: " + str(i) + ", pool size: " + str(j) + "-----")
        model = build_model(kernel_size = i, pool_size = j, learning_rate = 0.001, optimizer_name = "Adam", additional_dense = True)
        history_true = model.fit(x_train_oversample,
                            y_train_oversample,
                            validation_data=(x_valid, y_valid),
                            batch_size = 128,
                            epochs = 10)
        end_time = datetime.now()
        print('Duration: {}'.format(end_time - start_time))
        
        start_time = datetime.now()
        print("-----kernel size: " + str(i) + ", pool size: " + str(j) + "-----")
        model = build_model(kernel_size = i, pool_size = j, learning_rate = 0.001, optimizer_name = "Adam", additional_dense = False)
        history_false = model.fit(x_train_oversample,
                            y_train_oversample,
                            validation_data=(x_valid, y_valid),
                            batch_size = 128,
                            epochs = 10)
        end_time = datetime.now()
        print('Duration: {}'.format(end_time - start_time))
        
        over_dict[(i, j)] = [history_true.history['accuracy'][-1], history_true.history['val_accuracy'][-1], history_false.history['accuracy'][-1], history_false.history['val_accuracy'][-1]]
        

-----kernel size: 2, pool size: 2-----
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Duration: 0:02:20.255588
-----kernel size: 2, pool size: 2-----
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Duration: 0:01:58.491016
-----kernel size: 2, pool size: 3-----
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Duration: 0:01:04.174316
-----kernel size: 2, pool size: 3-----
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Duration: 0:00:40.458274
-----kernel size: 3, pool size: 2-----
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Duration: 0:02:48.961299
-----kernel size: 3, pool size: 2-----
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/1

### 6.2: fit models using image augmentaion dataset

In [17]:
#test out 18 image augmentaion parameter-combinations
contrast_list = [1, 2, 3]
delta_list = [0.1, 0.2, 0.3]
flip_list = [True, False]

image_dict = {}

for cl in contrast_list:
    for dl in delta_list:
        for fl in flip_list:
            
            print("-----contrast: " + str(cl) + ", delta: " + str(dl) + ", flip: " + str(fl) + "-----")
            x_train_augm, y_train_augm = aug_image(x_train_clean, y_train_clean, contrast_factor = cl, delta = dl, flip_flag = fl)

            for i in kernel_list:
                for j in pool_size:

                    start_time = datetime.now()
                    print("-----kernel size: " + str(i) + ", pool size: " + str(j) + "-----")
                    model = build_model(kernel_size = i, pool_size = j, learning_rate = 0.001, optimizer_name = "Adam", additional_dense = True)
                    history_true = model.fit(x_train_augm,
                                        y_train_augm,
                                        validation_data=(x_valid_augm, y_valid),
                                        batch_size = 128,
                                        epochs = 10)
                    
                    end_time = datetime.now()
                    print('Duration: {}'.format(end_time - start_time))
                    
                    start_time = datetime.now()
                    print("-----kernel size: " + str(i) + ", pool size: " + str(j) + "-----")
                    model = build_model(kernel_size = i, pool_size = j, learning_rate = 0.001, optimizer_name = "Adam", additional_dense = False)
                    history_false = model.fit(x_train_augm,
                                        y_train_augm,
                                        validation_data=(x_valid_augm, y_valid),
                                        batch_size = 128,
                                        epochs = 10)
                    
                    end_time = datetime.now()
                    print('Duration: {}'.format(end_time - start_time))
                    
                    image_dict[(i, j, cl, dl, fl)] = [history_true.history['accuracy'][-1], history_true.history['val_accuracy'][-1], history_false.history['accuracy'][-1], history_false.history['val_accuracy'][-1]]
    

-----contrast: 1, delta: 0.1, flip: True-----
-----kernel size: 2, pool size: 2-----
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Duration: 0:01:01.541430
-----kernel size: 2, pool size: 2-----
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Duration: 0:00:50.124066
-----kernel size: 2, pool size: 3-----
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Duration: 0:00:28.268085
-----kernel size: 2, pool size: 3-----
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Duration: 0:00:18.092206
-----kernel size: 3, pool size: 2-----
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Duration: 0:01:14.241162
-----kernel size: 3, pool size: 2-----
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4

### 6.3: fit models using oversample and image augmentaion dataset

In [18]:

image_over_dict = {}

for cl in contrast_list:
    for dl in delta_list:
        for fl in flip_list:
            
            print("-----contrast: " + str(cl) + ", delta: " + str(dl) + ", flip: " + str(fl) + "-----")
            x_train_oversample_augm, y_train_oversample_augm = aug_image(x_train_oversample, y_train_oversample, contrast_factor = cl, delta = dl, flip_flag = fl)

            for i in kernel_list:
                for j in pool_size:

                    start_time = datetime.now()
                    print("-----kernel size: " + str(i) + ", pool size: " + str(j) + "-----")
                    model = build_model(kernel_size = i, pool_size = j, learning_rate = 0.001, optimizer_name = "Adam", additional_dense = True)
                    history_true = model.fit(x_train_oversample_augm,
                                        y_train_oversample_augm,
                                        validation_data=(x_valid_augm, y_valid),
                                        batch_size = 128,
                                        epochs = 10)
                    
                    end_time = datetime.now()
                    print('Duration: {}'.format(end_time - start_time))
                    
                    start_time = datetime.now()
                    print("-----kernel size: " + str(i) + ", pool size: " + str(j) + "-----")
                    model = build_model(kernel_size = i, pool_size = j, learning_rate = 0.001, optimizer_name = "Adam", additional_dense = False)
                    history_false = model.fit(x_train_oversample_augm,
                                        y_train_oversample_augm,
                                        validation_data=(x_valid_augm, y_valid),
                                        batch_size = 128,
                                        epochs = 10)
                    
                    end_time = datetime.now()
                    print('Duration: {}'.format(end_time - start_time))
                    
                    image_over_dict[(i, j, cl, dl, fl)] = [history_true.history['accuracy'][-1], history_true.history['val_accuracy'][-1], history_false.history['accuracy'][-1], history_false.history['val_accuracy'][-1]]
        

-----contrast: 1, delta: 0.1, flip: True-----
-----kernel size: 2, pool size: 2-----
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Duration: 0:04:44.821101
-----kernel size: 2, pool size: 2-----
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Duration: 0:04:03.441279
-----kernel size: 2, pool size: 3-----
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Duration: 0:02:11.876615
-----kernel size: 2, pool size: 3-----
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Duration: 0:01:22.727227
-----kernel size: 3, pool size: 2-----
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Duration: 0:05:48.358821
-----kernel size: 3, pool size: 2-----
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4

# Step 7: Output Results

In [20]:
import csv

over_table = []

for key in over_dict:
    over_table.append([key[0], key[1], over_dict[key][0], over_dict[key][1], over_dict[key][2], over_dict[key][3]])

with open('oversampler.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(over_table)
    

In [21]:

image_table = []

for key in image_dict:
    image_table.append([key[0], key[1], key[2], key[3], key[4], image_dict[key][0], image_dict[key][1], image_dict[key][2], image_dict[key][3]])

with open('image_augm.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(image_table)
    

In [22]:

image_over_table = []

for key in image_over_dict:
    image_over_table.append([key[0], key[1], key[2], key[3], key[4], image_over_dict[key][0], image_over_dict[key][1], image_over_dict[key][2], image_over_dict[key][3]])

with open('image_augm_over.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(image_over_table)
    