This notebook is meant to serve as a summary notebook for the performance of the models trained in `model_training/model_training.ipynb` for ease of readability and comparison.

Notes:
We implemented data augmentation techniques like SMOTE, SVMSMOTE, and ADASYNC but performance was not improved. We also tried dimensionality reduction with PCA but performance worsened. We excluded these techniques from the final models. This indicates that the models may benefit from additional feature engineering, hyperparameter tuning, or use of other model architectures. However, due to computational and time constraints, we did not explore these options further but will provide detail in the final report.

In [1]:
# Environment Setup
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score)
from sklearn.pipeline import Pipeline
import joblib
import os
import xgboost as xgb
import keras
import tensorflow as tf
import keras_tuner as kt
import itertools
from sklearn.ensemble import VotingClassifier
import itertools
import math
import gzip
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

In [2]:
# Set random seed for reproducibility
SEED = 42
np.random.seed(SEED)

In [3]:
# Define paths
base_path = "dataset/data_exploration/"
metadata_path = "dataset/dataverse_files/"

# Load feature files
color_var = pd.read_csv(os.path.join(base_path, "color_variance_features.csv"))
color_hist = pd.read_csv(os.path.join(base_path, "combined_color_histogram_features.csv"))
lbp = pd.read_csv(os.path.join(base_path, "combined_lbp_features.csv"))
glcm = pd.read_csv(os.path.join(base_path, "glcm_features.csv"))
metadata = pd.read_csv(os.path.join(metadata_path, "HAM10000_metadata"))

# Function to extract image_id from file_name
def extract_image_id(file_name):
    # Extract the base name (e.g., 'ISIC_0024306.jpg')
    base_name = file_name.split('\\')[-1]
    # Remove the file extension (e.g., '.jpg')
    image_id = os.path.splitext(base_name)[0]
    return image_id

# Apply the function to extract image_id
color_var['image_id'] = color_var['file_name'].apply(extract_image_id)
color_hist['image_id'] = color_hist['file_name'].apply(extract_image_id)
lbp['image_id'] = lbp['file_name'].apply(extract_image_id)
glcm['image_id'] = glcm['file_name'].apply(extract_image_id)

# Sort feature DataFrames by image_id
color_var_sorted = color_var.sort_values(by='image_id').reset_index(drop=True)
color_hist_sorted = color_hist.sort_values(by='image_id').reset_index(drop=True)
lbp_sorted = lbp.sort_values(by='image_id').reset_index(drop=True)
glcm_sorted = glcm.sort_values(by='image_id').reset_index(drop=True)

# Sort metadata by image_id
metadata_sorted = metadata.sort_values(by='image_id').reset_index(drop=True)

In [4]:
# Merge all features
def merge_features(df_list):
    # Start with the first DataFrame
    merged = df_list[0]
    # Merge the rest
    for df in df_list[1:]:
        merged = pd.merge(merged, df, on='image_id', how='inner', suffixes=('', '_dup'))
        # Drop duplicate columns
        merged = merged.loc[:, ~merged.columns.str.endswith('_dup')]
    return merged

# List of sorted feature DataFrames
feature_dfs = [color_var_sorted, color_hist_sorted, lbp_sorted, glcm_sorted]
merged_features = merge_features(feature_dfs)

# Add metadata (dx column)
full_data = merged_features.copy()
full_data['dx'] = metadata_sorted['dx']

In [5]:
# Data Preprocessing
# Encode labels
le = LabelEncoder()
full_data['label'] = le.fit_transform(full_data['dx'])

# Define X and y (features and target)
X = full_data.drop(columns=['label'])
y = full_data['label']

# Drop non-feature columns
X_numeric = X.select_dtypes(include=['number'])

# Output removed columns
removed_cols = list(set(X.columns) - set(X_numeric.columns))
print("removed_cols:", removed_cols)

# Update X
X = X_numeric

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=SEED
)

removed_cols: ['dx', 'folder', 'image_id', 'file_name']


In [6]:
# Model Evaluation
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted', zero_division=0):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=le.classes_, zero_division=0))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

In [7]:
# Load models
models = {
    'SVM': joblib.load('model_training/best_svm_model.pkl'),
    'RandomForest': joblib.load('model_training/best_rf_model.pkl'),
    'XGBoost': joblib.load('model_training/xgboost_model.pkl'),
    'Bagging': joblib.load('model_training/bagging_dtc_model.pkl'),
}

# Decompress Ensemble Model
ensemble_model_path = 'model_training/best_ensemble_model.pkl.gz'
with gzip.open(ensemble_model_path, 'rb') as f:
    ensemble_model = joblib.load(f)
models['Ensemble'] = ensemble_model # Add to models dictionary

In [8]:
# Evaluate models
for model_name, model in models.items():
    print(f"Model: {model_name}")
    evaluate_model(model, X_test, y_test)
    print("="*50)

Model: SVM
Accuracy: 0.7559
F1 Score: 0.7441

Classification Report:
              precision    recall  f1-score   support

       akiec       0.54      0.49      0.52        65
         bcc       0.47      0.47      0.47       103
         bkl       0.55      0.48      0.51       220
          df       0.75      0.13      0.22        23
         mel       0.51      0.40      0.45       223
          nv       0.84      0.91      0.88      1341
        vasc       0.68      0.46      0.55        28

    accuracy                           0.76      2003
   macro avg       0.62      0.48      0.51      2003
weighted avg       0.74      0.76      0.74      2003


Confusion Matrix:
[[  32   14    4    0    3   11    1]
 [  11   48    6    0    4   33    1]
 [   4   17  106    1   25   66    1]
 [   4    3    3    3    0   10    0]
 [   2    0   39    0   89   93    0]
 [   6   19   35    0   55 1223    3]
 [   0    2    1    0    0   12   13]]
Model: RandomForest
Accuracy: 0.7264
F1 Score: 0

### Neural Networks

We also trained neural networks with transfer learning utilizing VGG16 and MobileNetV2 architectures. VGG16 is a deep convolutional neural network architecture that was trained on the ImageNet dataset and was chosen for its complexity. MobileNetV2 is a lightweight model designed for mobile and edge devices, chosen for its efficiency.

We trained these models on image data with some data augmentation and tuning, but accuracy was a bit low. The models would likely benefit from additional hyperparameter tuning, more epochs, and more combinations of data augmentation techniques. However, due to computational and time constraints, we did not explore these options further but will provide detail in the final report.

In [9]:
from datasets import load_dataset

# load dataset with images from huggingface
train_dataset = load_dataset("marmal88/skin_cancer", split="train")
valid_dataset = load_dataset("marmal88/skin_cancer", split="validation")
test_dataset = load_dataset("marmal88/skin_cancer", split="test")

In [10]:
# save lengths of datasets for later use
train_length = len(train_dataset) - 1 # Remove the first row
valid_length = len(valid_dataset) - 1
test_length = len(test_dataset) - 1

In [11]:
# Function to convert dataset to TensorFlow dataset
def convert_to_tf_dataset(dataset):
    def generator():
        for data in dataset:
            yield {'image': data['image'], 'dx': data['dx']}

    return tf.data.Dataset.from_generator(
        generator,
        output_signature={
            'image': tf.TensorSpec(shape=(None, None, 3), dtype=tf.uint8),
            'dx': tf.TensorSpec(shape=(), dtype=tf.string)
        }
    )

# Data Augmentation for Images
data_augmentation = keras.Sequential([
    keras.layers.RandomFlip('horizontal_and_vertical'),
    keras.layers.RandomRotation(0.1),
    keras.layers.RandomZoom(0.1),
    keras.layers.RandomContrast(0.1),
])

# Define normalization layer
normalization_layer = keras.layers.Rescaling(1./255)

# Extract unique labels from the dataset
unique_labels = train_dataset.unique('dx')

# Create a lookup table mapping each label to a unique integer
label_to_index = {label: index for index, label in enumerate(unique_labels)}

# Retrieve number of classes
num_classes = len(train_dataset.unique('dx'))

# Function to preprocess and augment the dataset
def preprocess_data(dataset, augment=False):
    def preprocess_image(image, label):
        image = tf.image.resize(image, (128, 128))
        image = normalization_layer(image)
        if augment:
            image = data_augmentation(image)
        label = label_to_index[label.numpy().decode('utf-8')]  # Convert string labels to integers
        label = tf.one_hot(label, num_classes)  # One-hot encode the label
        return image, label

    def map_fn(data):
        image = data['image']
        label = data['dx']
        image, label = tf.py_function(preprocess_image, [image, label], [tf.float32, tf.float32])
        image.set_shape((128, 128, 3))
        label.set_shape((num_classes,))
        return image, label

    dataset = dataset.map(map_fn)
    dataset = dataset.batch(32)
    return dataset

In [12]:
# Convert datasets to TensorFlow datasets
train_dataset = convert_to_tf_dataset(train_dataset)
valid_dataset = convert_to_tf_dataset(valid_dataset)
test_dataset = convert_to_tf_dataset(test_dataset)

In [13]:
# Preprocess the datasets
train_dataset = preprocess_data(train_dataset, augment=True) # Repeat the dataset 50 times to match epochs for training
valid_dataset = preprocess_data(valid_dataset)
test_dataset = preprocess_data(test_dataset)

# Remove the first row from each dataset
train_dataset = train_dataset.skip(1)
valid_dataset = valid_dataset.skip(1)
test_dataset = test_dataset.skip(1)

AUTOTUNE = tf.data.AUTOTUNE

# Cache and prefetch the datasets to improve performance
train_dataset = train_dataset.take(train_length).cache().repeat().prefetch(buffer_size=tf.data.AUTOTUNE)
valid_dataset = valid_dataset.cache().take(valid_length).repeat().prefetch(buffer_size=tf.data.AUTOTUNE)
test_dataset = test_dataset.cache().take(test_length).repeat().prefetch(buffer_size=tf.data.AUTOTUNE)

# Calculate steps per epoch
steps_per_epoch = math.ceil(train_length / 32)
validation_steps = math.ceil(valid_length / 32)

In [15]:
# Load the models
vgg16_model = keras.models.load_model('model_training/best_vgg16_model.keras')
mobilenet_model = keras.models.load_model('model_training/best_mobilenetv2_model.keras')

# Evaluate the models
print('VGG16 Model Evaluation:')
vgg16_model.evaluate(test_dataset, steps=test_length)

print('\nMobileNetV2 Model Evaluation:')
mobilenet_model.evaluate(test_dataset, steps=test_length)

  saveable.load_own_variables(weights_store.get(inner_path))


VGG16 Model Evaluation:
[1m1284/1284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2551s[0m 2s/step - accuracy: 0.6776 - loss: 1.6830

MobileNetV2 Model Evaluation:
[1m1284/1284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 81ms/step - accuracy: 0.6789 - loss: 1.4435


[1.43679678440094, 0.6849641799926758]