In [1]:
# Environment Setup
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, classification_report, 
                             confusion_matrix, f1_score)
from sklearn.pipeline import Pipeline
import joblib
import os


In [2]:
# Set random seed for reproducibility
SEED = 42
np.random.seed(SEED)

In [3]:
# Define paths
base_path = "../dataset/data_exploration/"
metadata_path = "../dataset/dataverse_files/"

# Load feature files
color_var = pd.read_csv(os.path.join(base_path, "color_variance_features.csv"))
color_hist = pd.read_csv(os.path.join(base_path, "combined_color_histogram_features.csv"))
lbp = pd.read_csv(os.path.join(base_path, "combined_lbp_features.csv"))
glcm = pd.read_csv(os.path.join(base_path, "glcm_features.csv"))
metadata = pd.read_csv(os.path.join(metadata_path, "HAM10000_metadata"))

# Function to extract image_id from file_name
def extract_image_id(file_name):
    # Extract the base name (e.g., 'ISIC_0024306.jpg')
    base_name = file_name.split('\\')[-1]
    # Remove the file extension (e.g., '.jpg')
    image_id = os.path.splitext(base_name)[0]
    return image_id

# Apply the function to extract image_id
color_var['image_id'] = color_var['file_name'].apply(extract_image_id)
color_hist['image_id'] = color_hist['file_name'].apply(extract_image_id)
lbp['image_id'] = lbp['file_name'].apply(extract_image_id)
glcm['image_id'] = glcm['file_name'].apply(extract_image_id)

# Sort feature DataFrames by image_id
color_var_sorted = color_var.sort_values(by='image_id').reset_index(drop=True)
color_hist_sorted = color_hist.sort_values(by='image_id').reset_index(drop=True)
lbp_sorted = lbp.sort_values(by='image_id').reset_index(drop=True)
glcm_sorted = glcm.sort_values(by='image_id').reset_index(drop=True)

# Sort metadata by image_id
metadata_sorted = metadata.sort_values(by='image_id').reset_index(drop=True)


In [4]:
# Check for consistency
print("Feature image_id after sorting:", color_var_sorted['image_id'].head())
print("Metadata image_id after sorting:", metadata_sorted['image_id'].head())

Feature image_id after sorting: 0    ISIC_0024306
1    ISIC_0024307
2    ISIC_0024308
3    ISIC_0024309
4    ISIC_0024310
Name: image_id, dtype: object
Metadata image_id after sorting: 0    ISIC_0024306
1    ISIC_0024307
2    ISIC_0024308
3    ISIC_0024309
4    ISIC_0024310
Name: image_id, dtype: object


In [5]:
# Merge all features
def merge_features(df_list):
    # Start with the first DataFrame
    merged = df_list[0]
    # Merge the rest
    for df in df_list[1:]:
        merged = pd.merge(merged, df, on='image_id', how='inner', suffixes=('', '_dup'))
        # Drop duplicate columns
        merged = merged.loc[:, ~merged.columns.str.endswith('_dup')]
    return merged

# List of sorted feature DataFrames
feature_dfs = [color_var_sorted, color_hist_sorted, lbp_sorted, glcm_sorted]
merged_features = merge_features(feature_dfs)

# Add metadata (dx column)
full_data = merged_features.copy()
full_data['dx'] = metadata_sorted['dx']

In [6]:
# Check the full data
print("Full Data Shape:", full_data.shape)
print("Full Data Columns:", full_data.columns)
print("Class Distribution:\n", full_data['dx'].value_counts())

Full Data Shape: (10015, 122)
Full Data Columns: Index(['file_name', 'folder', 'mean_r', 'mean_g', 'mean_b', 'var_r', 'var_g',
       'var_b', 'overall_var', 'image_id',
       ...
       'lbp_6', 'lbp_7', 'lbp_8', 'lbp_9', 'contrast', 'dissimilarity',
       'homogeneity', 'energy', 'correlation', 'dx'],
      dtype='object', length=122)
Class Distribution:
 dx
nv       6705
mel      1113
bkl      1099
bcc       514
akiec     327
vasc      142
df        115
Name: count, dtype: int64


In [7]:
# Data Preprocessing
# Encode labels
le = LabelEncoder()
full_data['label'] = le.fit_transform(full_data['dx'])

# Define X and y (features and target)
X = full_data
y = full_data['label']

# Drop non-feature columns
X_numeric = X.select_dtypes(include=['number'])

# Output removed columns
removed_cols = list(set(X.columns) - set(X_numeric.columns))
print("removed_cols:", removed_cols)

# Update X
X = X_numeric

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    stratify=y,
    random_state=SEED
)

print("X Shape:", X.shape)
print("y Shape:", y.shape)
print("X Columns:", X.columns)
print("Class Distribution:\n", y.value_counts())

removed_cols: ['folder', 'image_id', 'file_name', 'dx']
X Shape: (10015, 119)
y Shape: (10015,)
X Columns: Index(['mean_r', 'mean_g', 'mean_b', 'var_r', 'var_g', 'var_b', 'overall_var',
       'hist_r_0', 'hist_r_1', 'hist_r_2',
       ...
       'lbp_6', 'lbp_7', 'lbp_8', 'lbp_9', 'contrast', 'dissimilarity',
       'homogeneity', 'energy', 'correlation', 'label'],
      dtype='object', length=119)
Class Distribution:
 label
5    6705
4    1113
2    1099
1     514
0     327
6     142
3     115
Name: count, dtype: int64


In [8]:
# Model Pipeline Setup
scaler = StandardScaler()

# SVM Pipeline
svm_pipe = Pipeline([
    ('scaler', scaler),
    ('svm', SVC(probability=True, random_state=SEED))
])

# Random Forest Pipeline
rf_pipe = Pipeline([
    ('scaler', scaler),
    ('rf', RandomForestClassifier(random_state=SEED))
])


In [9]:
# Hyperparameter Tuning

# SVM Hyperparameters
svm_params = {
    'svm__C': [0.01],
    'svm__kernel': ['linear'],
    'svm__gamma': ['scale'],
    'svm__class_weight': [None, 'balanced']
}

# svm_params = {
#     'svm__C': [0.0001, .001, .01],
#     'svm__kernel': ['linear', 'rbf'],
#     'svm__gamma': ['scale', 'auto', 0.1],
#     'svm__class_weight': [None, 'balanced']
# }

# Random Forest Hyperparameters
rf_params = {
    'rf__n_estimators': [100],
    'rf__max_depth': [None, 10],
    'rf__min_samples_split': [2],
    'rf__min_samples_leaf': [2],
    'rf__class_weight': [None, 'balanced']
}

# rf_params = {
#     'rf__n_estimators': [100, 200, 500],
#     'rf__max_depth': [None, 10, 20, 30],
#     'rf__min_samples_split': [2, 5, 10],
#     'rf__min_samples_leaf': [1, 2, 4],
#     'rf__class_weight': [None, 'balanced', 'balanced_subsample']
# }

In [10]:
# Tuning Strategy
def tune_model(pipe, params, X, y):
    search = RandomizedSearchCV(
        pipe,
        params,
        n_iter=50,
        cv=5,
        scoring='f1_weighted',
        n_jobs=-1,
        verbose=1,
        random_state=SEED
    )
    search.fit(X, y)
    return search.best_estimator_, search.best_params_

In [11]:
# SVM Tuning
print("Tuning SVM...")
best_svm, svm_best_params = tune_model(svm_pipe, svm_params, X_train, y_train)

Tuning SVM...
Fitting 5 folds for each of 2 candidates, totalling 10 fits




In [12]:
# Random Forest Tuning
print("\nTuning Random Forest...")
best_rf, rf_best_params = tune_model(rf_pipe, rf_params, X_train, y_train)


Tuning Random Forest...
Fitting 5 folds for each of 4 candidates, totalling 20 fits




In [13]:
# Model Evaluation
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=le.classes_))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

# Evaluate SVM
print("SVM Performance:")
evaluate_model(best_svm, X_test, y_test)

# Evaluate Random Forest
print("\nRandom Forest Performance:")
evaluate_model(best_rf, X_test, y_test)

SVM Performance:
Accuracy: 0.9960
F1 Score: 0.9961

Classification Report:
              precision    recall  f1-score   support

       akiec       1.00      1.00      1.00        65
         bcc       1.00      1.00      1.00       103
         bkl       1.00      1.00      1.00       220
          df       0.96      0.96      0.96        23
         mel       1.00      1.00      1.00       223
          nv       1.00      1.00      1.00      1341
        vasc       0.87      0.93      0.90        28

    accuracy                           1.00      2003
   macro avg       0.97      0.98      0.98      2003
weighted avg       1.00      1.00      1.00      2003


Confusion Matrix:
[[  65    0    0    0    0    0    0]
 [   0  103    0    0    0    0    0]
 [   0    0  220    0    0    0    0]
 [   0    0    0   22    1    0    0]
 [   0    0    0    1  222    0    0]
 [   0    0    0    0    0 1337    4]
 [   0    0    0    0    0    2   26]]

Random Forest Performance:
Accuracy: 0.92

In [14]:
# Feature Importance Analysis (RF Specific)

# Get feature importances
importances = best_rf.named_steps['rf'].feature_importances_
feature_names = X.columns
feat_imp = pd.DataFrame({'feature': feature_names, 'importance': importances})
feat_imp = feat_imp.sort_values(by='importance', ascending=False)

print("\nTop 10 Important Features:")
print(feat_imp.head(10))


Top 10 Important Features:
      feature  importance
118     label    0.224848
45   hist_g_6    0.018343
78   hist_b_7    0.015904
77   hist_b_6    0.015608
79   hist_b_8    0.015291
46   hist_g_7    0.014663
103     lbp_0    0.014481
106     lbp_3    0.013772
108     lbp_5    0.013731
112     lbp_9    0.013513


In [15]:
# Save models
joblib.dump(best_svm, 'best_svm_model.pkl')
joblib.dump(best_rf, 'best_rf_model.pkl')
joblib.dump(le, 'label_encoder.pkl')

# Save feature names
with open('feature_names.txt', 'w') as f:
    f.write('\n'.join(feature_names))

In [16]:
# Key Parameter Adjustment Strategies
"""
For SVM:
1. Regularization (C): 
   - Start with log scale values (0.1, 1, 10, 100)
   - Higher C = less regularization, might overfit
   
2. Kernel Selection:
   - Try linear first for baseline
   - RBF for non-linear relationships
   - Poly for complex patterns (but needs more data)
   
3. Gamma:
   - Controls decision boundary curvature
   - Lower values = larger influence radius
   - Use 'scale' (1/(n_features * X.var())) as baseline

For Random Forest:
1. n_estimators:
   - Start with 100-500 trees
   - More trees = better performance but longer training"

2. max_depth:
   - Control tree complexity
   - None for full expansion (watch for overfitting)
   
3. class_weight:
   - Crucial for imbalanced datasets
   - 'balanced' adjusts weights inversely proportional to class frequencies
   
4. min_samples_split:
   - Higher values prevent overfitting
   - Start with 2 (default), try 5-10 for regularization
"""


'\nFor SVM:\n1. Regularization (C): \n   - Start with log scale values (0.1, 1, 10, 100)\n   - Higher C = less regularization, might overfit\n   \n2. Kernel Selection:\n   - Try linear first for baseline\n   - RBF for non-linear relationships\n   - Poly for complex patterns (but needs more data)\n   \n3. Gamma:\n   - Controls decision boundary curvature\n   - Lower values = larger influence radius\n   - Use \'scale\' (1/(n_features * X.var())) as baseline\n\nFor Random Forest:\n1. n_estimators:\n   - Start with 100-500 trees\n   - More trees = better performance but longer training"\n\n2. max_depth:\n   - Control tree complexity\n   - None for full expansion (watch for overfitting)\n   \n3. class_weight:\n   - Crucial for imbalanced datasets\n   - \'balanced\' adjusts weights inversely proportional to class frequencies\n   \n4. min_samples_split:\n   - Higher values prevent overfitting\n   - Start with 2 (default), try 5-10 for regularization\n'