In [1]:
import itertools

# Environment Setup
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, classification_report, 
                             confusion_matrix, f1_score)
from sklearn.pipeline import Pipeline
import joblib
import os

from sklearn.tree import DecisionTreeClassifier


In [2]:
# Set random seed for reproducibility
SEED = 42
np.random.seed(SEED)

In [3]:
# Define paths
base_path = "../dataset/data_exploration/"
metadata_path = "../dataset/dataverse_files/"

# Load feature files
color_var = pd.read_csv(os.path.join(base_path, "color_variance_features.csv"))
color_hist = pd.read_csv(os.path.join(base_path, "combined_color_histogram_features.csv"))
lbp = pd.read_csv(os.path.join(base_path, "combined_lbp_features.csv"))
glcm = pd.read_csv(os.path.join(base_path, "glcm_features.csv"))
metadata = pd.read_csv(os.path.join(metadata_path, "HAM10000_metadata"))

# Function to extract image_id from file_name
def extract_image_id(file_name):
    # Extract the base name (e.g., 'ISIC_0024306.jpg')
    base_name = file_name.split('\\')[-1]
    # Remove the file extension (e.g., '.jpg')
    image_id = os.path.splitext(base_name)[0]
    return image_id

# Apply the function to extract image_id
color_var['image_id'] = color_var['file_name'].apply(extract_image_id)
color_hist['image_id'] = color_hist['file_name'].apply(extract_image_id)
lbp['image_id'] = lbp['file_name'].apply(extract_image_id)
glcm['image_id'] = glcm['file_name'].apply(extract_image_id)

# Sort feature DataFrames by image_id
color_var_sorted = color_var.sort_values(by='image_id').reset_index(drop=True)
color_hist_sorted = color_hist.sort_values(by='image_id').reset_index(drop=True)
lbp_sorted = lbp.sort_values(by='image_id').reset_index(drop=True)
glcm_sorted = glcm.sort_values(by='image_id').reset_index(drop=True)

# Sort metadata by image_id
metadata_sorted = metadata.sort_values(by='image_id').reset_index(drop=True)


In [4]:
# Check for consistency
print("Feature image_id after sorting:", color_var_sorted['image_id'].head())
print("Metadata image_id after sorting:", metadata_sorted['image_id'].head())

Feature image_id after sorting: 0    ISIC_0024306
1    ISIC_0024307
2    ISIC_0024308
3    ISIC_0024309
4    ISIC_0024310
Name: image_id, dtype: object
Metadata image_id after sorting: 0    ISIC_0024306
1    ISIC_0024307
2    ISIC_0024308
3    ISIC_0024309
4    ISIC_0024310
Name: image_id, dtype: object


In [5]:
# Merge all features
def merge_features(df_list):
    # Start with the first DataFrame
    merged = df_list[0]
    # Merge the rest
    for df in df_list[1:]:
        merged = pd.merge(merged, df, on='image_id', how='inner', suffixes=('', '_dup'))
        # Drop duplicate columns
        merged = merged.loc[:, ~merged.columns.str.endswith('_dup')]
    return merged

# List of sorted feature DataFrames
feature_dfs = [color_var_sorted, color_hist_sorted, lbp_sorted, glcm_sorted]
merged_features = merge_features(feature_dfs)

# Add metadata (dx column)
full_data = merged_features.copy()
full_data['dx'] = metadata_sorted['dx']

In [6]:
# Check the full data
print("Full Data Shape:", full_data.shape)
print("Full Data Columns:", full_data.columns)
print("Class Distribution:\n", full_data['dx'].value_counts())

Full Data Shape: (10015, 122)
Full Data Columns: Index(['file_name', 'folder', 'mean_r', 'mean_g', 'mean_b', 'var_r', 'var_g',
       'var_b', 'overall_var', 'image_id',
       ...
       'lbp_6', 'lbp_7', 'lbp_8', 'lbp_9', 'contrast', 'dissimilarity',
       'homogeneity', 'energy', 'correlation', 'dx'],
      dtype='object', length=122)
Class Distribution:
 dx
nv       6705
mel      1113
bkl      1099
bcc       514
akiec     327
vasc      142
df        115
Name: count, dtype: int64


In [7]:
# Data Preprocessing
# Encode labels
le = LabelEncoder()
full_data['label'] = le.fit_transform(full_data['dx'])

# Define X and y (features and target)
X = full_data.drop(columns=['label'])
y = full_data['label']

# Drop non-feature columns
X_numeric = X.select_dtypes(include=['number'])

# Output removed columns
removed_cols = list(set(X.columns) - set(X_numeric.columns))
print("removed_cols:", removed_cols)

# Update X
X = X_numeric

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    stratify=y,
    random_state=SEED
)

print("X Shape:", X.shape)
print("y Shape:", y.shape)
print("X Columns:", X.columns)
print("Class Distribution:\n", y.value_counts())

removed_cols: ['file_name', 'folder', 'image_id', 'dx']
X Shape: (10015, 118)
y Shape: (10015,)
X Columns: Index(['mean_r', 'mean_g', 'mean_b', 'var_r', 'var_g', 'var_b', 'overall_var',
       'hist_r_0', 'hist_r_1', 'hist_r_2',
       ...
       'lbp_5', 'lbp_6', 'lbp_7', 'lbp_8', 'lbp_9', 'contrast',
       'dissimilarity', 'homogeneity', 'energy', 'correlation'],
      dtype='object', length=118)
Class Distribution:
 label
5    6705
4    1113
2    1099
1     514
0     327
6     142
3     115
Name: count, dtype: int64


In [8]:
# Model Pipeline Setup
scaler = StandardScaler()

# SVM Pipeline
svm_pipe = Pipeline([
    ('scaler', scaler),
    ('svm', SVC(probability=True, random_state=SEED))
])

# Random Forest Pipeline
rf_pipe = Pipeline([
    ('scaler', scaler),
    ('rf', RandomForestClassifier(random_state=SEED))
])


In [9]:
# Hyperparameter Tuning

# # SVM Hyperparameters
# svm_params = {
#     'svm__C': [0.01],
#     'svm__kernel': ['linear'],
#     'svm__gamma': ['scale'],
#     'svm__class_weight': [None, 'balanced']
# }

svm_params = {
    'svm__C': [5, 10, 15],
    'svm__kernel': ['poly', 'rbf'],
    'svm__gamma': ['scale', 'auto'],
    'svm__class_weight': [None, 'balanced']
}

# # Random Forest Hyperparameters
# rf_params = {
#     'rf__n_estimators': [100],
#     'rf__max_depth': [None, 10],
#     'rf__min_samples_split': [2],
#     'rf__min_samples_leaf': [2],
#     'rf__class_weight': [None, 'balanced']
# }

rf_params = {
    'rf__n_estimators': [100, 200, 300, 400, 500],
    'rf__max_depth': [None, 10, 20, 30, 40, 50],
    'rf__min_samples_split': [2, 5, 10, 15, 20],
    'rf__min_samples_leaf': [1, 2, 4, 6, 8],
    'rf__class_weight': [None, 'balanced', 'balanced_subsample']
}

In [10]:
# Callback function to save the progress
def save_progress(search, filename='search_progress.pkl'):
    joblib.dump(search, filename)

# Define the tuning function with progress saving and loading
def tune_model(pipe, params, X, y, n_iter=50, save_every=10, filename='search_progress.pkl'):
    if os.path.exists(filename):
        search = joblib.load(filename)
        completed_iter = search.n_iter
    else:
        search = RandomizedSearchCV(
            pipe,
            params,
            n_iter=n_iter,
            cv=5,
            scoring='f1_weighted',
            n_jobs=-1,
            verbose=4,
            random_state=SEED
        )
        completed_iter = 0

    remaining_iter = n_iter - completed_iter
    if remaining_iter > 0:
        for i in range(0, remaining_iter, save_every):
            iter_chunk = min(save_every, remaining_iter - i)
            search.n_iter = iter_chunk
            search.fit(X, y)
            save_progress(search, filename)

    return search.best_estimator_, search.best_params_

In [11]:
# Model Evaluation
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted', zero_division=0):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=le.classes_, zero_division=0))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

In [16]:
# SVM Tuning
print("Tuning SVM...")
best_svm, svm_best_params = tune_model(svm_pipe, svm_params, X_train, y_train, n_iter=100, save_every=10, filename='svm_search_progress.pkl')

Tuning SVM...
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [40]:
# Random Forest Tuning
print("\nTuning Random Forest...")
best_rf, rf_best_params = tune_model(rf_pipe, rf_params, X_train, y_train, n_iter=100, save_every=10, filename='rf_search_progress.pkl')


Tuning Random Forest...
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [17]:
# Evaluate SVM
print("SVM Performance:")
evaluate_model(best_svm, X_test, y_test)

# Evaluate Random Forest
print("\nRandom Forest Performance:")
evaluate_model(best_rf, X_test, y_test)

SVM Performance:
Accuracy: 0.7559
F1 Score: 0.7441

Classification Report:
              precision    recall  f1-score   support

       akiec       0.54      0.49      0.52        65
         bcc       0.47      0.47      0.47       103
         bkl       0.55      0.48      0.51       220
          df       0.75      0.13      0.22        23
         mel       0.51      0.40      0.45       223
          nv       0.84      0.91      0.88      1341
        vasc       0.68      0.46      0.55        28

    accuracy                           0.76      2003
   macro avg       0.62      0.48      0.51      2003
weighted avg       0.74      0.76      0.74      2003


Confusion Matrix:
[[  32   14    4    0    3   11    1]
 [  11   48    6    0    4   33    1]
 [   4   17  106    1   25   66    1]
 [   4    3    3    3    0   10    0]
 [   2    0   39    0   89   93    0]
 [   6   19   35    0   55 1223    3]
 [   0    2    1    0    0   12   13]]


In [42]:
# Feature Importance Analysis (RF Specific)

# Get feature importances
importances = best_rf.named_steps['rf'].feature_importances_
feature_names = X.columns
feat_imp = pd.DataFrame({'feature': feature_names, 'importance': importances})
feat_imp = feat_imp.sort_values(by='importance', ascending=False)

print("\nTop 10 Important Features:")
print(feat_imp.head(10))


Top 10 Important Features:
      feature  importance
45   hist_g_6    0.022971
78   hist_b_7    0.021679
77   hist_b_6    0.020622
44   hist_g_5    0.020284
103     lbp_0    0.019746
104     lbp_1    0.019382
46   hist_g_7    0.019250
108     lbp_5    0.019242
79   hist_b_8    0.018713
76   hist_b_5    0.017528


In [43]:
# Save models
joblib.dump(best_svm, 'best_svm_model.pkl')
joblib.dump(best_rf, 'best_rf_model.pkl')
joblib.dump(le, 'label_encoder.pkl')

# Save feature names
with open('feature_names.txt', 'w') as f:
    f.write('\n'.join(feature_names))

In [44]:
# best params
print("Best SVM Parameters:")
print(svm_best_params)
print("\nBest Random Forest Parameters:")
print(rf_best_params)

Best SVM Parameters:
{'svm__kernel': 'rbf', 'svm__gamma': 'auto', 'svm__class_weight': None, 'svm__C': 10}

Best Random Forest Parameters:
{'rf__n_estimators': 500, 'rf__min_samples_split': 15, 'rf__min_samples_leaf': 2, 'rf__max_depth': 40, 'rf__class_weight': 'balanced'}


In [35]:
# # Key Parameter Adjustment Strategies
# """
# For SVM:
# 1. Regularization (C):
#    - Start with log scale values (0.1, 1, 10, 100)
#    - Higher C = less regularization, might overfit
#
# 2. Kernel Selection:
#    - Try linear first for baseline
#    - RBF for non-linear relationships
#    - Poly for complex patterns (but needs more data)
#
# 3. Gamma:
#    - Controls decision boundary curvature
#    - Lower values = larger influence radius
#    - Use 'scale' (1/(n_features * X.var())) as baseline
#
# For Random Forest:
# 1. n_estimators:
#    - Start with 100-500 trees
#    - More trees = better performance but longer training"
#
# 2. max_depth:
#    - Control tree complexity
#    - None for full expansion (watch for overfitting)
#
# 3. class_weight:
#    - Crucial for imbalanced datasets
#    - 'balanced' adjusts weights inversely proportional to class frequencies
#
# 4. min_samples_split:
#    - Higher values prevent overfitting
#    - Start with 2 (default), try 5-10 for regularization
# """


In [29]:
import xgboost as xgb

xgb_model = xgb.XGBClassifier(random_state=SEED)

# Train the XGBoost model
xgb_model.fit(X_train, y_train)

# Evaluate the XGBoost model
evaluate_model(xgb_model, X_test, y_test)

Accuracy: 0.7708
F1 Score: 0.7536

Classification Report:
              precision    recall  f1-score   support

       akiec       0.52      0.40      0.45        65
         bcc       0.43      0.34      0.38       103
         bkl       0.58      0.48      0.53       220
          df       0.67      0.09      0.15        23
         mel       0.58      0.44      0.50       223
          nv       0.84      0.94      0.89      1341
        vasc       0.94      0.57      0.71        28

    accuracy                           0.77      2003
   macro avg       0.65      0.47      0.52      2003
weighted avg       0.75      0.77      0.75      2003


Confusion Matrix:
[[  26   17    3    0    2   17    0]
 [  14   35    9    0    1   44    0]
 [   4   12  106    1   29   68    0]
 [   3    1    4    2    1   12    0]
 [   0    2   30    0   98   93    0]
 [   3   12   28    0   36 1261    1]
 [   0    2    2    0    1    7   16]]


In [30]:
# Save the XGBoost model
joblib.dump(xgb_model, 'xgboost_model.pkl')

['xgboost_model.pkl']

In [24]:
# Bagging
from sklearn.ensemble import BaggingClassifier

bagging_model_RF = BaggingClassifier(
    estimator=rf_pipe,
    n_estimators=50,
    random_state=SEED
)

bagging_model_RF.fit(X_train, y_train)

In [25]:
from sklearn.tree import DecisionTreeClassifier

DTC = DecisionTreeClassifier(random_state=SEED)

bagging_model_DT = BaggingClassifier(
    estimator = DTC,
    n_estimators=50,
    random_state=SEED
)

bagging_model_DT.fit(X_train, y_train)

In [26]:
# Evaluate the Bagging models
print("Bagging with Random Forest Performance:")
evaluate_model(bagging_model_RF, X_test, y_test)

print("\nBagging with Decision Tree Performance:")
evaluate_model(bagging_model_DT, X_test, y_test)

Bagging with Random Forest Performance:
Accuracy: 0.7379
F1 Score: 0.6965

Classification Report:
              precision    recall  f1-score   support

       akiec       0.53      0.31      0.39        65
         bcc       0.48      0.30      0.37       103
         bkl       0.58      0.33      0.42       220
          df       1.00      0.04      0.08        23
         mel       0.55      0.24      0.34       223
          nv       0.77      0.96      0.86      1341
        vasc       1.00      0.21      0.35        28

    accuracy                           0.74      2003
   macro avg       0.70      0.34      0.40      2003
weighted avg       0.71      0.74      0.70      2003


Confusion Matrix:
[[  20   10    6    0    2   27    0]
 [   7   31    5    0    1   59    0]
 [   4   10   73    0   21  112    0]
 [   3    3    2    1    1   13    0]
 [   1    0   21    0   54  147    0]
 [   3    8   18    0   19 1293    0]
 [   0    3    1    0    0   18    6]]

Bagging with Decis

In [28]:
# Save Bagging DTC model
joblib.dump(bagging_model_DT, 'bagging_dtc_model.pkl')

['bagging_dtc_model.pkl']

In [9]:
from sklearn.ensemble import VotingClassifier
import itertools

# Load all of the models
best_svm = joblib.load('best_svm_model.pkl')
best_rf_model = joblib.load('best_rf_model.pkl')
xgb_model = joblib.load('xgboost_model.pkl')
bagging_model_DT = joblib.load('bagging_dtc_model.pkl')

models = [
    ('svm', best_svm),
    ('rf', best_rf_model),
    ('xgb', xgb_model),
    ('bagging', bagging_model_DT)
]

# A function to try all combinations of models with soft and hard voting
def ensemble_combinations(models, X_train, y_train, X_test, y_test):
    voting_types = ['soft', 'hard']
    results = []

    for voting in voting_types:
        for i in range(1, len(models) + 1):
            for combination in itertools.combinations(models, i):
                estimators = [(name, model) for name, model in combination]
                ensemble_model = VotingClassifier(estimators=estimators, voting=voting)
                ensemble_model.fit(X_train, y_train)
                estimator_names = [name for name, _ in estimators]
                print(f'Ensemble Model with {estimator_names} with voting {voting}')
                evaluation = evaluate_model(ensemble_model, X_test, y_test)
                results.append({
                    'estimators': estimator_names,
                    'voting': voting,
                    'model': ensemble_model,
                    'evaluation': evaluation
                })

    return results

In [43]:
ensemble_models = ensemble_combinations(models, X_train, y_train, X_test, y_test)

Ensemble Model with ['svm'] with voting soft
Accuracy: 0.7539
F1 Score: 0.7346

Classification Report:
              precision    recall  f1-score   support

       akiec       0.57      0.37      0.45        65
         bcc       0.42      0.36      0.39       103
         bkl       0.57      0.47      0.51       220
          df       0.75      0.13      0.22        23
         mel       0.52      0.35      0.42       223
          nv       0.83      0.93      0.88      1341
        vasc       0.67      0.50      0.57        28

    accuracy                           0.75      2003
   macro avg       0.62      0.44      0.49      2003
weighted avg       0.73      0.75      0.73      2003


Confusion Matrix:
[[  24   17    5    0    3   15    1]
 [   8   37    6    0    4   47    1]
 [   2   15  103    1   26   72    1]
 [   3    3    3    3    0   11    0]
 [   1    1   34    0   79  107    1]
 [   4   14   29    0   41 1250    3]
 [   0    2    1    0    0   11   14]]
Ensemble Model

The ensemble model with 'svm', 'rf', and 'xgb' with hard voting achieved the best performance.

In [15]:
# Generate best ensemble model
best_ensemble_model = VotingClassifier(
    estimators=[('svm', best_svm), ('rf', best_rf_model), ('xgb', xgb_model)],
    voting='hard'
)

# Train the best ensemble model
best_ensemble_model.fit(X_train, y_train)

# Save the best ensemble model
joblib.dump(best_ensemble_model, 'best_ensemble_model.pkl')

['best_ensemble_model.pkl']

We will try to use a CNN model to classify the skin cancer images. We will use the VGG16 architecture as well as the MobileNetV2 architecture. We will use the transfer learning method to train the model.

In [36]:
from datasets import load_dataset

# load dataset with images from huggingface
train_dataset = load_dataset("marmal88/skin_cancer", split="train")
valid_dataset = load_dataset("marmal88/skin_cancer", split="validation")
test_dataset = load_dataset("marmal88/skin_cancer", split="test")

In [37]:
# save lengths of datasets for later use
train_length = len(train_dataset) - 1 # Remove the first row
valid_length = len(valid_dataset) - 1
test_length = len(test_dataset) - 1

# check the dataset
print(train_dataset.shape)
print(train_dataset.features)

(9577, 8)
{'image': Image(mode=None, decode=True, id=None), 'image_id': Value(dtype='string', id=None), 'lesion_id': Value(dtype='string', id=None), 'dx': Value(dtype='string', id=None), 'dx_type': Value(dtype='string', id=None), 'age': Value(dtype='float64', id=None), 'sex': Value(dtype='string', id=None), 'localization': Value(dtype='string', id=None)}


In [38]:
# apply CNNs and transfer learning
import keras
import tensorflow as tf

# Retrieve number of classes
num_classes = len(train_dataset.unique('dx'))

# Define a learning rate scheduler
def scheduler(epoch, lr):
    if epoch < 10:
        return float(lr)
    else:
        return float(lr * tf.math.exp(-0.1))

lr_scheduler = keras.callbacks.LearningRateScheduler(scheduler)

# Build a VGG16 model
def build_model(hp):
    base_model = keras.applications.VGG16(weights='imagenet', include_top=False, input_shape=(256, 128, 3))
    for layer in base_model.layers:
        layer.trainable = False

    x = base_model.output
    x = keras.layers.Flatten()(x)
    x = keras.layers.Dense(hp.Int('units', min_value=128, max_value=512, step=64), activation='relu')(x)
    x = keras.layers.Dropout(hp.Float('dropout', min_value=0.2, max_value=0.5, step=0.1))(x)
    predictions = keras.layers.Dense(num_classes, activation='softmax')(x)

    model = keras.Model(inputs=base_model.input, outputs=predictions)
    model.compile(optimizer=keras.optimizers.Adam(1e-3), loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [39]:
# Function to convert dataset to TensorFlow dataset
def convert_to_tf_dataset(dataset):
    def generator():
        for data in dataset:
            yield {'image': data['image'], 'dx': data['dx']}

    return tf.data.Dataset.from_generator(
        generator,
        output_signature={
            'image': tf.TensorSpec(shape=(None, None, 3), dtype=tf.uint8),
            'dx': tf.TensorSpec(shape=(), dtype=tf.string)
        }
    )

# Data Augmentation for Images
data_augmentation = keras.Sequential([
    keras.layers.RandomFlip('horizontal_and_vertical'),
    keras.layers.RandomRotation(0.1),
    keras.layers.RandomZoom(0.1),
    keras.layers.RandomContrast(0.1),
])

# Define normalization layer
normalization_layer = keras.layers.Rescaling(1./255)

# Extract unique labels from the dataset
unique_labels = train_dataset.unique('dx')

# Create a lookup table mapping each label to a unique integer
label_to_index = {label: index for index, label in enumerate(unique_labels)}

# Function to preprocess and augment the dataset
def preprocess_data(dataset, augment=False):
    def preprocess_image(image, label):
        image = tf.image.resize(image, (128, 128))
        image = normalization_layer(image)
        if augment:
            image = data_augmentation(image)
        label = label_to_index[label.numpy().decode('utf-8')]  # Convert string labels to integers
        label = tf.one_hot(label, num_classes)  # One-hot encode the label
        return image, label

    def map_fn(data):
        image = data['image']
        label = data['dx']
        image, label = tf.py_function(preprocess_image, [image, label], [tf.float32, tf.float32])
        image.set_shape((128, 128, 3))
        label.set_shape((num_classes,))
        return image, label

    dataset = dataset.map(map_fn)
    dataset = dataset.batch(32)
    return dataset

In [40]:
# Convert datasets to TensorFlow datasets
train_dataset = convert_to_tf_dataset(train_dataset)
valid_dataset = convert_to_tf_dataset(valid_dataset)
test_dataset = convert_to_tf_dataset(test_dataset)

In [41]:
import math

# Preprocess the datasets
train_dataset = preprocess_data(train_dataset, augment=True) # Repeat the dataset 50 times to match epochs for training
valid_dataset = preprocess_data(valid_dataset)
test_dataset = preprocess_data(test_dataset)

# Remove the first row from each dataset
train_dataset = train_dataset.skip(1)
valid_dataset = valid_dataset.skip(1)
test_dataset = test_dataset.skip(1)

AUTOTUNE = tf.data.AUTOTUNE

# Cache and prefetch the datasets to improve performance
train_dataset = train_dataset.take(train_length).cache().repeat().prefetch(buffer_size=tf.data.AUTOTUNE)
valid_dataset = valid_dataset.cache().take(valid_length).repeat().prefetch(buffer_size=tf.data.AUTOTUNE)
test_dataset = test_dataset.cache().take(test_length).repeat().prefetch(buffer_size=tf.data.AUTOTUNE)

# Calculate steps per epoch
steps_per_epoch = math.ceil(train_length / 32)
validation_steps = math.ceil(valid_length / 32)

# Debugging Statement: Print dataset shapes and steps
print(f"Train dataset length: {train_length}, Steps per epoch: {steps_per_epoch}")
print(f"Validation dataset length: {valid_length}, Validation steps: {validation_steps}")

Train dataset length: 9576, Steps per epoch: 300
Validation dataset length: 2491, Validation steps: 78


In [42]:
# Check the normalization
normalized_ds = train_dataset.take(1).cache().map(lambda x, y: (normalization_layer(x), y))
image_batch, labels_batch = next(iter(normalized_ds))
first_image = image_batch[0]
# Notice the pixel values are now in [0,1].
print(np.min(first_image), np.max(first_image))

0.0009795828 0.003293625


2025-04-04 20:17:30.065392: W tensorflow/core/kernels/data/cache_dataset_ops.cc:916] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [43]:
import keras_tuner as kt

tuner = kt.Hyperband(
    build_model,
    objective='val_accuracy',
    max_epochs=10,
    factor=4,
    directory='my_dir',
    project_name='cnn_tuning'
)

tuner.search(train_dataset, validation_data=valid_dataset, epochs=50, steps_per_epoch=steps_per_epoch, validation_steps=validation_steps)

Reloading Tuner from my_dir/cnn_tuning/tuner0.json


In [None]:
best_model = tuner.get_best_models(num_models=1)[0]
best_hyperparameters = tuner.get_best_hyperparameters(num_trials=1)[0]

print("Best Hyperparameters:")
print(best_hyperparameters.values)

In [18]:
# save best model
best_model.save('best_vgg16_model.keras')

In [30]:
# Evaluate CNN model
vgg16_model = keras.models.load_model('best_vgg16_model.keras')
vgg16_model.evaluate(test_dataset, steps=test_length)

  saveable.load_own_variables(weights_store.get(inner_path))


[1m1284/1284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2019s[0m 2s/step - accuracy: 0.6776 - loss: 1.6830


[1.6803847551345825, 0.6833730936050415]

In [22]:
import keras
import tensorflow as tf

def build_mobilenetv2_model(hp):
    base_model = keras.applications.MobileNetV2(weights='imagenet', include_top=False, input_shape=(128, 128, 3))
    for layer in base_model.layers:
        layer.trainable = False

    x = base_model.output
    x = keras.layers.GlobalAveragePooling2D()(x)
    x = keras.layers.Dense(hp.Int('units', min_value=128, max_value=512, step=64), activation='relu')(x)
    x = keras.layers.Dropout(hp.Float('dropout', min_value=0.2, max_value=0.5, step=0.1))(x)
    predictions = keras.layers.Dense(num_classes, activation='softmax')(x)

    model = keras.Model(inputs=base_model.input, outputs=predictions)
    model.compile(
        optimizer=keras.optimizers.Adam(hp.Float('learning_rate', min_value=1e-5, max_value=1e-2, sampling='LOG')),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

In [23]:
# build and train the model
mobilenetv2_model = build_mobilenetv2_model

In [24]:
import keras_tuner as kt

tuner = kt.Hyperband(
    build_mobilenetv2_model,
    objective='val_accuracy',
    max_epochs=10,
    factor=3,
    directory='my_dir',
    project_name='mobilenetv2_tuning'
)

tuner.search(train_dataset, validation_data=valid_dataset, epochs=50, steps_per_epoch=steps_per_epoch, validation_steps=validation_steps)

Reloading Tuner from my_dir/mobilenetv2_tuning/tuner0.json


In [29]:
best_mobilenetv2_model = tuner.get_best_models(num_models=1)[0]
best_hyperparameters = tuner.get_best_hyperparameters(num_trials=1)[0]

# evaluate
best_mobilenetv2_model.evaluate(test_dataset, steps=test_length)

print("Best Hyperparameters:")
print(best_hyperparameters.values)

  saveable.load_own_variables(weights_store.get(inner_path))


[1m1284/1284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 79ms/step - accuracy: 0.6789 - loss: 1.4435
Best Hyperparameters:
{'units': 384, 'dropout': 0.2, 'learning_rate': 0.0009789457230138046, 'tuner/epochs': 10, 'tuner/initial_epoch': 0, 'tuner/bracket': 0, 'tuner/round': 0}


In [31]:
# Save the best model
best_mobilenetv2_model.save('best_mobilenetv2_model.keras')