In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import cv2
import json
import time
import joblib
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from imblearn.over_sampling import RandomOverSampler

import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model, model_from_json
from tensorflow.keras.layers import Conv2D, Flatten, Dense, MaxPool2D, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator


In [3]:
# Load and inspect data
data = pd.read_csv('input/hmnist_28_28_RGB.csv')
print("Dataset shape:", data.shape)
data.head()

Dataset shape: (10015, 2353)


Unnamed: 0,pixel0000,pixel0001,pixel0002,pixel0003,pixel0004,pixel0005,pixel0006,pixel0007,pixel0008,pixel0009,...,pixel2343,pixel2344,pixel2345,pixel2346,pixel2347,pixel2348,pixel2349,pixel2350,pixel2351,label
0,192,153,193,195,155,192,197,154,185,202,...,173,124,138,183,147,166,185,154,177,2
1,25,14,30,68,48,75,123,93,126,158,...,60,39,55,25,14,28,25,14,27,2
2,192,138,153,200,145,163,201,142,160,206,...,167,129,143,159,124,142,136,104,117,2
3,38,19,30,95,59,72,143,103,119,171,...,44,26,36,25,12,17,25,12,15,2
4,158,113,139,194,144,174,215,162,191,225,...,209,166,185,172,135,149,109,78,92,2


In [4]:
# Extract features and labels
y = data['label']
X = data.drop(columns=['label'])
print("Features shape:", X.shape)


Features shape: (10015, 2352)


In [5]:
# Load metadata
metadata = pd.read_csv('input/HAM10000_metadata.csv')
metadata.head()



Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear


In [6]:
# Define class dictionary
classes = {
    0: ('akiec', 'Actinic keratoses and intraepithelial carcinomae'),  
    1: ('bcc', 'Basal cell carcinoma'), 
    2: ('bkl', 'Benign keratosis-like lesions'), 
    3: ('df', 'Dermatofibroma'),
    4: ('nv', 'Melanocytic nevi'),  
    5: ('vasc', 'Pyogenic granulomas and hemorrhage'), 
    6: ('mel', 'Melanoma')
}

In [7]:
# Visualize data distribution
plt.figure(figsize=(12, 6))
sns.countplot(x='dx', data=metadata)
plt.xlabel('Disease', size=16)
plt.ylabel('Frequency', size=16)
plt.title('Frequency Distribution of Classes', size=18)
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('disease_distribution.png')
plt.close()

In [8]:
# Visualize gender distribution
plt.figure(figsize=(8, 8))
plt.pie(metadata['sex'].value_counts(), 
        labels=metadata['sex'].value_counts().index, 
        autopct="%.1f%%")
plt.title('Gender of Patient', size=18)
plt.savefig('gender_distribution.png')
plt.close()

In [9]:
# Visualize age distribution
plt.figure(figsize=(10, 6))
sns.histplot(metadata['age'], bins=20)
plt.title('Age Distribution of Patients', size=18)
plt.xlabel('Age', size=14)
plt.ylabel('Count', size=14)
plt.savefig('age_distribution.png')
plt.close()

In [10]:
# Balance dataset using oversampling
print("Class distribution before oversampling:", y.value_counts())
oversample = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversample.fit_resample(X, y)
print("Class distribution after oversampling:", pd.Series(y_resampled).value_counts())

Class distribution before oversampling: label
4    6705
6    1113
2    1099
1     514
0     327
5     142
3     115
Name: count, dtype: int64
Class distribution after oversampling: label
2    6705
4    6705
3    6705
6    6705
5    6705
1    6705
0    6705
Name: count, dtype: int64


In [12]:
# Reshape data for CNN input (28×28×3)
X_resampled = np.array(X_resampled).reshape(-1, 28, 28, 3)
print('Reshaped data shape:', X_resampled.shape)

Reshaped data shape: (46935, 28, 28, 3)


In [13]:
# Visualize some samples
plt.figure(figsize=(12, 8))
for i in range(10):
    plt.subplot(2, 5, i+1)
    plt.imshow(X_resampled[i])
    plt.title(f"Class: {y_resampled[i]}")
    plt.axis('off')
plt.tight_layout()
plt.savefig('sample_images.png')
plt.close()

In [14]:
# Normalize pixel values
X_normalized = (X_resampled - np.mean(X_resampled)) / np.std(X_resampled)


In [15]:
# Split data into train and test sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(
    X_normalized, y_resampled, 
    test_size=0.2,
    random_state=42,
    stratify=y_resampled  # Ensures balanced classes in both train and test sets
)

In [16]:
print(f"Training set: {X_train.shape}, {y_train.shape}")
print(f"Testing set: {X_test.shape}, {y_test.shape}")


Training set: (37548, 28, 28, 3), (37548,)
Testing set: (9387, 28, 28, 3), (9387,)


In [17]:
# Create data augmentation for training set to improve generalization
datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
    fill_mode='nearest'
)

datagen.fit(X_train)

In [18]:
# Build improved CNN model
def create_model():
    model = Sequential()
    
    # First convolution block
    model.add(Conv2D(32, kernel_size=(3, 3), activation='relu', padding='same', input_shape=(28, 28, 3)))
    model.add(BatchNormalization())
    model.add(Conv2D(32, kernel_size=(3, 3), activation='relu'))
    model.add(BatchNormalization())
    model.add(MaxPool2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))
    
    # Second convolution block
    model.add(Conv2D(64, kernel_size=(3, 3), activation='relu', padding='same'))
    model.add(BatchNormalization())
    model.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
    model.add(BatchNormalization())
    model.add(MaxPool2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))
    
    # Third convolution block
    model.add(Conv2D(128, kernel_size=(3, 3), activation='relu', padding='same'))
    model.add(BatchNormalization())
    model.add(Conv2D(128, kernel_size=(3, 3), activation='relu'))
    model.add(BatchNormalization())
    model.add(MaxPool2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))
    
    # Fully connected layers
    model.add(Flatten())
    model.add(Dense(256, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(128, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(7, activation='softmax'))
    
    return model

In [19]:
# Create model
model = create_model()
model.summary()

# Callbacks for training
checkpoint = ModelCheckpoint(
    'best_model.keras',
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1
)

early_stopping = EarlyStopping(
    monitor='val_accuracy',
    mode='max',
    patience=10,
    verbose=1,
    restore_best_weights=True
)

reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=3,
    min_lr=1e-6,
    verbose=1
)

tensorboard = TensorBoard(
    log_dir=f"logs/skin_cancer_{time.strftime('%Y%m%d_%H%M%S')}"
)

callbacks = [checkpoint, early_stopping, reduce_lr, tensorboard]

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 28, 28, 32)        896       
                                                                 
 batch_normalization (BatchN  (None, 28, 28, 32)       128       
 ormalization)                                                   
                                                                 
 conv2d_1 (Conv2D)           (None, 26, 26, 32)        9248      
                                                                 
 batch_normalization_1 (Batc  (None, 26, 26, 32)       128       
 hNormalization)                                                 
                                                                 
 max_pooling2d (MaxPooling2D  (None, 13, 13, 32)       0         
 )                                                               
                                                        

In [20]:
# Compile model with optimized hyperparameters
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

In [21]:
# Train the model with data augmentation
batch_size = 64
epochs = 50

start_time = time.time()
history = model.fit(
    datagen.flow(X_train, y_train, batch_size=batch_size),
    steps_per_epoch=len(X_train) // batch_size,
    epochs=epochs,
    validation_data=(X_test, y_test),
    callbacks=callbacks,
    verbose=1
)
training_time = time.time() - start_time
print(f"Training completed in {training_time:.2f} seconds")

# Load the best model for evaluation
model = load_model('best_model.keras')


Epoch 1/50
Epoch 1: val_accuracy improved from -inf to 0.59007, saving model to best_model.keras
Epoch 2/50
Epoch 2: val_accuracy improved from 0.59007 to 0.70619, saving model to best_model.keras
Epoch 3/50
Epoch 3: val_accuracy did not improve from 0.70619
Epoch 4/50
Epoch 4: val_accuracy improved from 0.70619 to 0.73442, saving model to best_model.keras
Epoch 5/50
Epoch 5: val_accuracy improved from 0.73442 to 0.75711, saving model to best_model.keras
Epoch 6/50
Epoch 6: val_accuracy improved from 0.75711 to 0.77810, saving model to best_model.keras
Epoch 7/50
Epoch 7: val_accuracy improved from 0.77810 to 0.81016, saving model to best_model.keras
Epoch 8/50
Epoch 8: val_accuracy did not improve from 0.81016
Epoch 9/50
Epoch 9: val_accuracy improved from 0.81016 to 0.83445, saving model to best_model.keras
Epoch 10/50
Epoch 10: val_accuracy did not improve from 0.83445
Epoch 11/50
Epoch 11: val_accuracy improved from 0.83445 to 0.84628, saving model to best_model.keras
Epoch 12/50
E

In [22]:
# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=1)
print(f"Test accuracy: {test_accuracy:.4f}")
print(f"Test loss: {test_loss:.4f}")

Test accuracy: 0.9518
Test loss: 0.1296


In [23]:
# Make predictions
y_pred_prob = model.predict(X_test)
y_pred = np.argmax(y_pred_prob, axis=1)



In [24]:
# Generate classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=[classes[i][0] for i in range(7)]))



Classification Report:
              precision    recall  f1-score   support

       akiec       0.97      1.00      0.99      1341
         bcc       0.98      1.00      0.99      1341
         bkl       0.91      0.95      0.93      1341
          df       1.00      1.00      1.00      1341
          nv       0.98      0.74      0.84      1341
        vasc       1.00      1.00      1.00      1341
         mel       0.84      0.97      0.90      1341

    accuracy                           0.95      9387
   macro avg       0.96      0.95      0.95      9387
weighted avg       0.96      0.95      0.95      9387



In [25]:
plt.figure(figsize=(10, 8))
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(
    cm, 
    annot=True, 
    fmt="d", 
    cmap="Blues", 
    xticklabels=[classes[i][0] for i in range(7)],
    yticklabels=[classes[i][0] for i in range(7)]
)
plt.xlabel("Predicted Label", size=14)
plt.ylabel("True Label", size=14)
plt.title("Confusion Matrix", size=16)
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('confusion_matrix.png')
plt.close()

In [26]:
# Plot training history
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy', size=14)
plt.xlabel('Epoch', size=12)
plt.ylabel('Accuracy', size=12)
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss', size=14)
plt.xlabel('Epoch', size=12)
plt.ylabel('Loss', size=12)
plt.legend()

plt.tight_layout()
plt.savefig('training_history.png')
plt.close()

In [27]:
# Export the model for deployment

# 1. Create export directory
export_dir = "exported_model"
if not os.path.exists(export_dir):
    os.makedirs(export_dir)

In [28]:
# 2. Save the model architecture as JSON
model_json = model.to_json()
with open(os.path.join(export_dir, "model.json"), "w") as json_file:
    json_file.write(model_json)
print("Model architecture saved as model.json")

# 3. Save the model weights
model.save_weights(os.path.join(export_dir, "weights.h5"))
print("Model weights saved as weights.h5")

# 4. Save class labels
label_json = {}
for class_id, (class_code, class_name) in classes.items():
    label_json[str(class_id)] = {
        "code": class_code,
        "name": class_name
    }

with open(os.path.join(export_dir, "labels.json"), "w") as label_file:
    json.dump(label_json, label_file, indent=4)
print("Class labels saved as labels.json")

# 5. Save the full model in Keras format
model.save(os.path.join(export_dir, "full_model.h5"))
print("Full model saved as full_model.h5")

# 6. Function to load the exported model
def load_exported_model(export_dir="exported_model"):
    # Load model architecture from JSON
    with open(os.path.join(export_dir, "model.json"), "r") as json_file:
        loaded_model_json = json_file.read()
    
    loaded_model = model_from_json(loaded_model_json)
    
    # Load weights
    loaded_model.load_weights(os.path.join(export_dir, "weights.h5"))
    
    # Compile the model
    loaded_model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    
    print("Model loaded successfully from exported files!")
    return loaded_model

Model architecture saved as model.json
Model weights saved as weights.h5
Class labels saved as labels.json
Full model saved as full_model.h5


In [29]:
# 7. Convert model to TFLite format
def convert_to_tflite(export_dir="exported_model"):
    # Load the model
    model = load_model(os.path.join(export_dir, "full_model.h5"))
    
    # Convert to TFLite format
    converter = tf.lite.TFLiteConverter.from_keras_model(model)
    
    # Enable optimizations
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    
    # Convert the model
    tflite_model = converter.convert()
    
    # Save the TFLite model
    with open(os.path.join(export_dir, "model.tflite"), "wb") as f:
        f.write(tflite_model)
    
    print("Model successfully converted to TFLite format")
    return os.path.join(export_dir, "model.tflite")

In [30]:
# Convert to TFLite
tflite_model_path = convert_to_tflite(export_dir)

# Function to make predictions on new images
def predict_image(image_path, model_path="exported_model/full_model.h5"):
    # Load the model
    model = load_model(model_path)
    
    # Load and preprocess the image
    img = Image.open(image_path)
    img = img.resize((28, 28))
    img_array = np.array(img)
    
    # Normalize the image
    img_normalized = (img_array - np.mean(img_array)) / np.std(img_array)
    
    # Reshape for prediction
    img_reshaped = img_normalized.reshape(1, 28, 28, 3)
    
    # Make prediction
    predictions = model.predict(img_reshaped)
    predicted_class = np.argmax(predictions)
    
    # Get class name and code
    class_code, class_name = classes[predicted_class]
    
    # Calculate confidence
    confidence = np.max(predictions) * 100
    
    print(f"Predicted class: {predicted_class} - {class_code} ({class_name})")
    print(f"Confidence: {confidence:.2f}%")
    
    # Display the image with prediction
    plt.figure(figsize=(6, 6))
    plt.imshow(img)
    plt.title(f"Prediction: {class_code}\n{class_name}\nConfidence: {confidence:.2f}%")
    plt.axis('off')
    plt.tight_layout()
    plt.show()
    
    return predicted_class, class_code, class_name, confidence



INFO:tensorflow:Assets written to: C:\Users\gokul\AppData\Local\Temp\tmp923dzflp\assets


INFO:tensorflow:Assets written to: C:\Users\gokul\AppData\Local\Temp\tmp923dzflp\assets


Model successfully converted to TFLite format
