In [1]:
# Environment Setup
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, classification_report, 
                             confusion_matrix, f1_score)
from sklearn.pipeline import Pipeline
import joblib
import os


In [2]:
# Set random seed for reproducibility
SEED = 42
np.random.seed(SEED)

In [3]:
# Define paths
base_path = "../dataset/data_exploration/"
metadata_path = "../dataset/dataverse_files/"

# Load feature files
color_var = pd.read_csv(os.path.join(base_path, "color_variance_features.csv"))
color_hist = pd.read_csv(os.path.join(base_path, "combined_color_histogram_features.csv"))
lbp = pd.read_csv(os.path.join(base_path, "combined_lbp_features.csv"))
glcm = pd.read_csv(os.path.join(base_path, "glcm_features.csv"))
metadata = pd.read_csv(os.path.join(metadata_path, "HAM10000_metadata"))

# Function to extract image_id from file_name
def extract_image_id(file_name):
    # Extract the base name (e.g., 'ISIC_0024306.jpg')
    base_name = file_name.split('\\')[-1]
    # Remove the file extension (e.g., '.jpg')
    image_id = os.path.splitext(base_name)[0]
    return image_id

# Apply the function to extract image_id
color_var['image_id'] = color_var['file_name'].apply(extract_image_id)
color_hist['image_id'] = color_hist['file_name'].apply(extract_image_id)
lbp['image_id'] = lbp['file_name'].apply(extract_image_id)
glcm['image_id'] = glcm['file_name'].apply(extract_image_id)

# Sort feature DataFrames by image_id
color_var_sorted = color_var.sort_values(by='image_id').reset_index(drop=True)
color_hist_sorted = color_hist.sort_values(by='image_id').reset_index(drop=True)
lbp_sorted = lbp.sort_values(by='image_id').reset_index(drop=True)
glcm_sorted = glcm.sort_values(by='image_id').reset_index(drop=True)

# Sort metadata by image_id
metadata_sorted = metadata.sort_values(by='image_id').reset_index(drop=True)


In [4]:
# Check for consistency
print("Feature image_id after sorting:", color_var_sorted['image_id'].head())
print("Metadata image_id after sorting:", metadata_sorted['image_id'].head())

Feature image_id after sorting: 0    ISIC_0024306
1    ISIC_0024307
2    ISIC_0024308
3    ISIC_0024309
4    ISIC_0024310
Name: image_id, dtype: object
Metadata image_id after sorting: 0    ISIC_0024306
1    ISIC_0024307
2    ISIC_0024308
3    ISIC_0024309
4    ISIC_0024310
Name: image_id, dtype: object


In [5]:
# Merge all features
def merge_features(df_list):
    # Start with the first DataFrame
    merged = df_list[0]
    # Merge the rest
    for df in df_list[1:]:
        merged = pd.merge(merged, df, on='image_id', how='inner', suffixes=('', '_dup'))
        # Drop duplicate columns
        merged = merged.loc[:, ~merged.columns.str.endswith('_dup')]
    return merged

# List of sorted feature DataFrames
feature_dfs = [color_var_sorted, color_hist_sorted, lbp_sorted, glcm_sorted]
merged_features = merge_features(feature_dfs)

# Add metadata (dx column)
full_data = merged_features.copy()
full_data['dx'] = metadata_sorted['dx']

In [6]:
# Check the full data
print("Full Data Shape:", full_data.shape)
print("Full Data Columns:", full_data.columns)
print("Class Distribution:\n", full_data['dx'].value_counts())

Full Data Shape: (10015, 122)
Full Data Columns: Index(['file_name', 'folder', 'mean_r', 'mean_g', 'mean_b', 'var_r', 'var_g',
       'var_b', 'overall_var', 'image_id',
       ...
       'lbp_6', 'lbp_7', 'lbp_8', 'lbp_9', 'contrast', 'dissimilarity',
       'homogeneity', 'energy', 'correlation', 'dx'],
      dtype='object', length=122)
Class Distribution:
 dx
nv       6705
mel      1113
bkl      1099
bcc       514
akiec     327
vasc      142
df        115
Name: count, dtype: int64


In [7]:
# Data Preprocessing
# Encode labels
le = LabelEncoder()
full_data['label'] = le.fit_transform(full_data['dx'])

# Define X and y (features and target)
X = full_data.drop(columns=['label'])
y = full_data['label']

# Drop non-feature columns
X_numeric = X.select_dtypes(include=['number'])

# Output removed columns
removed_cols = list(set(X.columns) - set(X_numeric.columns))
print("removed_cols:", removed_cols)

# Update X
X = X_numeric

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    stratify=y,
    random_state=SEED
)

print("X Shape:", X.shape)
print("y Shape:", y.shape)
print("X Columns:", X.columns)
print("Class Distribution:\n", y.value_counts())

removed_cols: ['dx', 'image_id', 'file_name', 'folder']
X Shape: (10015, 118)
y Shape: (10015,)
X Columns: Index(['mean_r', 'mean_g', 'mean_b', 'var_r', 'var_g', 'var_b', 'overall_var',
       'hist_r_0', 'hist_r_1', 'hist_r_2',
       ...
       'lbp_5', 'lbp_6', 'lbp_7', 'lbp_8', 'lbp_9', 'contrast',
       'dissimilarity', 'homogeneity', 'energy', 'correlation'],
      dtype='object', length=118)
Class Distribution:
 label
5    6705
4    1113
2    1099
1     514
0     327
6     142
3     115
Name: count, dtype: int64


In [8]:
# Model Pipeline Setup
scaler = StandardScaler()

# SVM Pipeline
svm_pipe = Pipeline([
    ('scaler', scaler),
    ('svm', SVC(probability=True, random_state=SEED))
])

# Random Forest Pipeline
rf_pipe = Pipeline([
    ('scaler', scaler),
    ('rf', RandomForestClassifier(random_state=SEED))
])


In [9]:
# Hyperparameter Tuning

# # SVM Hyperparameters
# svm_params = {
#     'svm__C': [0.01],
#     'svm__kernel': ['linear'],
#     'svm__gamma': ['scale'],
#     'svm__class_weight': [None, 'balanced']
# }

svm_params = {
    'svm__C': [0.001, .001, .01, 0.1],
    'svm__kernel': ['linear', 'rbf'],
    'svm__gamma': ['scale', 'auto'],
    'svm__class_weight': [None, 'balanced']
}

# # Random Forest Hyperparameters
# rf_params = {
#     'rf__n_estimators': [100],
#     'rf__max_depth': [None, 10],
#     'rf__min_samples_split': [2],
#     'rf__min_samples_leaf': [2],
#     'rf__class_weight': [None, 'balanced']
# }

rf_params = {
    'rf__n_estimators': [100, 200, 500],
    'rf__max_depth': [None, 10, 20, 30],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4],
    'rf__class_weight': [None, 'balanced', 'balanced_subsample']
}

In [10]:
# Tuning Strategy
def tune_model(pipe, params, X, y):
    search = RandomizedSearchCV(
        pipe,
        params,
        n_iter=50,
        cv=5,
        scoring='f1_weighted',
        n_jobs=-1,
        verbose=1,
        random_state=SEED
    )
    search.fit(X, y)
    return search.best_estimator_, search.best_params_

In [11]:
# SVM Tuning
print("Tuning SVM...")
best_svm, svm_best_params = tune_model(svm_pipe, svm_params, X_train, y_train)

Tuning SVM...
Fitting 5 folds for each of 32 candidates, totalling 160 fits




In [12]:
# Random Forest Tuning
print("\nTuning Random Forest...")
best_rf, rf_best_params = tune_model(rf_pipe, rf_params, X_train, y_train)


Tuning Random Forest...
Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [13]:
# Model Evaluation
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=le.classes_))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

# Evaluate SVM
print("SVM Performance:")
evaluate_model(best_svm, X_test, y_test)

# Evaluate Random Forest
print("\nRandom Forest Performance:")
evaluate_model(best_rf, X_test, y_test)

SVM Performance:
Accuracy: 0.7209
F1 Score: 0.6767

Classification Report:
              precision    recall  f1-score   support

       akiec       0.49      0.42      0.45        65
         bcc       0.40      0.26      0.32       103
         bkl       0.50      0.35      0.41       220
          df       0.00      0.00      0.00        23
         mel       0.46      0.13      0.21       223
          nv       0.77      0.95      0.85      1341
        vasc       0.71      0.36      0.48        28

    accuracy                           0.72      2003
   macro avg       0.48      0.35      0.39      2003
weighted avg       0.67      0.72      0.68      2003


Confusion Matrix:
[[  27   10    2    0    2   24    0]
 [  10   27    5    0    1   60    0]
 [   3   13   78    0   13  112    1]
 [   4    2    3    0    0   14    0]
 [   2    1   39    0   30  151    0]
 [   9   12   26    0   19 1272    3]
 [   0    3    4    0    0   11   10]]

Random Forest Performance:
Accuracy: 0.74

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
# Feature Importance Analysis (RF Specific)

# Get feature importances
importances = best_rf.named_steps['rf'].feature_importances_
feature_names = X.columns
feat_imp = pd.DataFrame({'feature': feature_names, 'importance': importances})
feat_imp = feat_imp.sort_values(by='importance', ascending=False)

print("\nTop 10 Important Features:")
print(feat_imp.head(10))


Top 10 Important Features:
      feature  importance
78   hist_b_7    0.021240
45   hist_g_6    0.019861
103     lbp_0    0.019531
46   hist_g_7    0.018812
44   hist_g_5    0.018754
79   hist_b_8    0.018018
77   hist_b_6    0.017859
108     lbp_5    0.017817
106     lbp_3    0.017438
112     lbp_9    0.017391


In [15]:
# Save models
joblib.dump(best_svm, 'best_svm_model.pkl')
joblib.dump(best_rf, 'best_rf_model.pkl')
joblib.dump(le, 'label_encoder.pkl')

# Save feature names
with open('feature_names.txt', 'w') as f:
    f.write('\n'.join(feature_names))

In [35]:
# # Key Parameter Adjustment Strategies
# """
# For SVM:
# 1. Regularization (C):
#    - Start with log scale values (0.1, 1, 10, 100)
#    - Higher C = less regularization, might overfit
#
# 2. Kernel Selection:
#    - Try linear first for baseline
#    - RBF for non-linear relationships
#    - Poly for complex patterns (but needs more data)
#
# 3. Gamma:
#    - Controls decision boundary curvature
#    - Lower values = larger influence radius
#    - Use 'scale' (1/(n_features * X.var())) as baseline
#
# For Random Forest:
# 1. n_estimators:
#    - Start with 100-500 trees
#    - More trees = better performance but longer training"
#
# 2. max_depth:
#    - Control tree complexity
#    - None for full expansion (watch for overfitting)
#
# 3. class_weight:
#    - Crucial for imbalanced datasets
#    - 'balanced' adjusts weights inversely proportional to class frequencies
#
# 4. min_samples_split:
#    - Higher values prevent overfitting
#    - Start with 2 (default), try 5-10 for regularization
# """


In [11]:
# See how the svm model and rf models can potentially be improved with data augmentation

# Data Augmentation
def augment_data(data, num_augmentations=1, noise_level=0.01):
    augmented_data = data.copy()
    for _ in range(num_augmentations):
        noise = np.random.normal(0, noise_level, data.shape)
        augmented_sample = data + noise
        augmented_data = pd.concat([augmented_data, augmented_sample], ignore_index=True)
    return augmented_data

# Augment the training data
X_train_augmented = augment_data(X_train, num_augmentations=2, noise_level=0.01)

# Repeat the target labels to match the augmented data
y_train_augmented = pd.concat([y_train] * (2 + 1), ignore_index=True)  # Original + 2 augmentations

print("Augmented X_train shape:", X_train_augmented.shape)
print("Augmented y_train shape:", y_train_augmented.shape)

Augmented X_train shape: (24036, 119)
Augmented y_train shape: (24036,)


In [12]:
# Convert to numpy arrays
X_train_augmented = np.array(X_train_augmented)
y_train_augmented = np.array(y_train_augmented)

# Standardize the data
X_train_augmented = scaler.fit_transform(X_train_augmented)
X_test = scaler.transform(X_test)



In [13]:
# SVM Model
svm_model_augmented = SVC(probability=True, random_state=SEED)
svm_model_augmented.fit(X_train_augmented, y_train_augmented)

# Random Forest Model
rf_model_augmented = RandomForestClassifier(random_state=SEED)
rf_model_augmented.fit(X_train_augmented, y_train_augmented)

In [18]:
# Tune SVM
print("Tuning SVM...")
best_svm_aug, svm_best_params_aug = tune_model(svm_pipe, svm_params, X_train_augmented, y_train_augmented)

Tuning SVM...
Fitting 5 folds for each of 24 candidates, totalling 120 fits



KeyboardInterrupt



In [None]:
# Tune Random Forest
print("\nTuning Random Forest...")
best_rf_aug, rf_best_params_aug = tune_model(rf_pipe, rf_params, X_train_augmented, y_train_augmented)

In [None]:
# Evaluate models
print("SVM with Augmented Data Performance:")
evaluate_model(best_svm_aug, X_test, y_test)

print("\nRandom Forest with Augmented Data Performance:")
evaluate_model(best_rf_aug, X_test, y_test)

In [None]:
# Save models
joblib.dump(svm_model_augmented, 'svm_model_augmented.pkl')
joblib.dump(rf_model_augmented, 'rf_model_augmented.pkl')

We will try to use a CNN model to classify the skin cancer images. We will use the VGG16 architecture with transfer learning.

In [36]:
from datasets import load_dataset

# load dataset from huggingface
train_dataset = load_dataset("marmal88/skin_cancer", split="train")
valid_dataset = load_dataset("marmal88/skin_cancer", split="validation")
test_dataset = load_dataset("marmal88/skin_cancer", split="test")

In [37]:
# save lengths of datasets for later use
train_length = len(train_dataset) - 1 # Remove the first row
valid_length = len(valid_dataset) - 1
test_length = len(test_dataset) - 1

# check the dataset
print(train_dataset.shape)
print(train_dataset.features)

(9577, 8)
{'image': Image(mode=None, decode=True, id=None), 'image_id': Value(dtype='string', id=None), 'lesion_id': Value(dtype='string', id=None), 'dx': Value(dtype='string', id=None), 'dx_type': Value(dtype='string', id=None), 'age': Value(dtype='float64', id=None), 'sex': Value(dtype='string', id=None), 'localization': Value(dtype='string', id=None)}


In [3]:
# apply CNNs and transfer learning
import keras
import tensorflow as tf

# Retrieve number of classes
num_classes = len(train_dataset.unique('dx'))

# Define a learning rate scheduler
def scheduler(epoch, lr):
    if epoch < 10:
        return float(lr)
    else:
        return float(lr * tf.math.exp(-0.1))

lr_scheduler = keras.callbacks.LearningRateScheduler(scheduler)

def build_model(hp):
    base_model = keras.applications.VGG16(weights='imagenet', include_top=False, input_shape=(256, 128, 3))
    for layer in base_model.layers:
        layer.trainable = False

    x = base_model.output
    x = keras.layers.Flatten()(x)
    x = keras.layers.Dense(hp.Int('units', min_value=128, max_value=512, step=64), activation='relu')(x)
    x = keras.layers.Dropout(hp.Float('dropout', min_value=0.2, max_value=0.5, step=0.1))(x)
    predictions = keras.layers.Dense(num_classes, activation='softmax')(x)

    model = keras.Model(inputs=base_model.input, outputs=predictions)
    model.compile(
        optimizer=keras.optimizers.Adam(hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='LOG')),
        loss=keras.losses.CategoricalCrossentropy(from_logits=False),
        metrics=['accuracy']
    )
    return model

In [5]:
# Function to convert dataset to TensorFlow dataset
def convert_to_tf_dataset(dataset):
    def generator():
        for data in dataset:
            yield {'image': data['image'], 'dx': data['dx']}

    return tf.data.Dataset.from_generator(
        generator,
        output_signature={
            'image': tf.TensorSpec(shape=(None, None, 3), dtype=tf.uint8),
            'dx': tf.TensorSpec(shape=(), dtype=tf.string)
        }
    )

# Data Augmentation for Images
data_augmentation = keras.Sequential([
    keras.layers.RandomFlip('horizontal_and_vertical'),
    keras.layers.RandomRotation(0.2),
    keras.layers.RandomZoom(0.2),
    keras.layers.RandomContrast(0.2),
])

# Define normalization layer
normalization_layer = keras.layers.Rescaling(1./255)

# Extract unique labels from the dataset
unique_labels = train_dataset.unique('dx')

# Create a lookup table mapping each label to a unique integer
label_to_index = {label: index for index, label in enumerate(unique_labels)}

# Function to preprocess and augment the dataset
def preprocess_data(dataset, augment=False):
    def preprocess_image(image, label):
        image = tf.image.resize(image, (128, 128))
        image = normalization_layer(image)
        if augment:
            image = data_augmentation(image)
        label = label_to_index[label.numpy().decode('utf-8')]  # Convert string labels to integers
        label = tf.one_hot(label, num_classes)  # One-hot encode the label
        return image, label

    def map_fn(data):
        image = data['image']
        label = data['dx']
        image, label = tf.py_function(preprocess_image, [image, label], [tf.float32, tf.float32])
        image.set_shape((128, 128, 3))
        label.set_shape((num_classes,))
        return image, label

    dataset = dataset.map(map_fn)
    dataset = dataset.batch(32)
    return dataset

In [6]:
# Convert datasets to TensorFlow datasets
train_dataset = convert_to_tf_dataset(train_dataset)
valid_dataset = convert_to_tf_dataset(valid_dataset)
test_dataset = convert_to_tf_dataset(test_dataset)

In [7]:
import math

# Preprocess the datasets
train_dataset = preprocess_data(train_dataset, augment=True) # Repeat the dataset 50 times to match epochs for training
valid_dataset = preprocess_data(valid_dataset)
test_dataset = preprocess_data(test_dataset)

# Remove the first row from each dataset
train_dataset = train_dataset.skip(1)
valid_dataset = valid_dataset.skip(1)
test_dataset = test_dataset.skip(1)

AUTOTUNE = tf.data.AUTOTUNE

# Cache and prefetch the datasets to improve performance
train_dataset = train_dataset.take(train_length).cache().repeat().prefetch(buffer_size=tf.data.AUTOTUNE)
valid_dataset = valid_dataset.cache().take(valid_length).repeat().prefetch(buffer_size=tf.data.AUTOTUNE)
test_dataset = test_dataset.cache().take(test_length).repeat().prefetch(buffer_size=tf.data.AUTOTUNE)

# Calculate steps per epoch
steps_per_epoch = math.ceil(train_length / 32)
validation_steps = math.ceil(valid_length / 32)

# Debugging Statement: Print dataset shapes and steps
print(f"Train dataset length: {train_length}, Steps per epoch: {steps_per_epoch}")
print(f"Validation dataset length: {valid_length}, Validation steps: {validation_steps}")

Train dataset length: 9576, Steps per epoch: 300
Validation dataset length: 2491, Validation steps: 78


In [8]:
# Check the normalization
normalized_ds = train_dataset.take(1).cache().map(lambda x, y: (normalization_layer(x), y))
image_batch, labels_batch = next(iter(normalized_ds))
first_image = image_batch[0]
# Notice the pixel values are now in [0,1].
print(np.min(first_image), np.max(first_image))

0.0008656473 0.0033541827


2025-03-28 13:44:03.934699: W tensorflow/core/kernels/data/cache_dataset_ops.cc:916] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [16]:
import keras_tuner as kt

tuner = kt.Hyperband(
    build_model,
    objective='val_accuracy',
    max_epochs=10,
    factor=4,
    directory='my_dir',
    project_name='cnn_tuning'
)

tuner.search(train_dataset, validation_data=valid_dataset, epochs=50, steps_per_epoch=steps_per_epoch, validation_steps=validation_steps)

Trial 11 Complete [01h 10m 28s]
val_accuracy: 0.6685393452644348

Best val_accuracy So Far: 0.6689406037330627
Total elapsed time: 08h 03m 07s


In [17]:
best_model = tuner.get_best_models(num_models=1)[0]
best_hyperparameters = tuner.get_best_hyperparameters(num_trials=1)[0]

print("Best Hyperparameters:")
print(best_hyperparameters.values)

Best Hyperparameters:
{'units': 192, 'dropout': 0.2, 'learning_rate': 0.0010158927858354443, 'tuner/epochs': 3, 'tuner/initial_epoch': 0, 'tuner/bracket': 1, 'tuner/round': 0}


In [18]:
# save best model
best_model.save('best_cnn_model.keras')