In [None]:
# Skin Cancer Detection - Version 5: Training (84% Validation Accuracy)

import math
import os
import shutil

import numpy as np
import pandas as pd
import tensorflow as tf
from google.colab import drive
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.applications import ResNet50, DenseNet121
from tensorflow.keras.applications.resnet50 import preprocess_input as resnet_preprocess
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.layers import Input, Dense, GlobalAveragePooling2D, Dropout, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical

# 1. Setup: Mount Drive, Copy Data to Local Storage & Verify Paths
# The dataset lives on Google Drive; it is copied once to the Colab VM's local
# disk and every later step reads from the local copy.
drive.mount('/content/drive')
drive_base_path = "/content/drive/MyDrive/SkinCancerDetection"
local_base_path = "/content/SkinCancerDetection"

if not os.path.exists(local_base_path):
    print("Copying dataset from Google Drive to local storage...")
    # copytree raises if the copy fails; the previous os.system("cp -r ...")
    # call discarded the exit status, so a failed copy went unnoticed here.
    shutil.copytree(drive_base_path, local_base_path)
else:
    print("Dataset already exists in local storage.")

base_path = local_base_path
print(f"Using base path: {base_path}")

train_meta_file = os.path.join(base_path, "train_metadata.csv")
val_meta_file = os.path.join(base_path, "val_metadata.csv")
test_meta_file = os.path.join(base_path, "test_metadata.csv")

# Fail fast if any split's metadata CSV is missing.
for file in [train_meta_file, val_meta_file, test_meta_file]:
    if not os.path.exists(file):
        raise FileNotFoundError(f"Metadata file not found: {file}")
    print(f"Found metadata file: {file}")

train_folder = os.path.join(base_path, "train")
val_folder = os.path.join(base_path, "val")
test_folder = os.path.join(base_path, "test")

# Fail fast if any split's image folder is missing.
for folder in [train_folder, val_folder, test_folder]:
    if not os.path.exists(folder):
        raise FileNotFoundError(f"Image folder not found: {folder}")
    print(f"Found image folder: {folder}")

# 2. Load & Preprocess Metadata
def load_metadata(csv_path):
    """Read a metadata CSV and verify it has the columns the pipeline needs.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        ValueError: If the ``image_id`` or ``dx`` column is absent.
    """
    frame = pd.read_csv(csv_path)
    absent = []
    for column in ('image_id', 'dx'):
        if column not in frame.columns:
            absent.append(column)
    if absent:
        raise ValueError(f"Missing columns in {csv_path}: {absent}")
    return frame

# Read the train / validation / test metadata, in that order.
train_df, val_df, test_df = (
    load_metadata(meta_path)
    for meta_path in (train_meta_file, val_meta_file, test_meta_file)
)

# TEST_MODE is a smoke-test switch: small random subsets, small batches and
# only a couple of epochs. Leave it False for a real training run.
TEST_MODE = False
batch_size, epochs = (8, 2) if TEST_MODE else (32, 20)
if TEST_MODE:
    train_df = train_df.sample(100, random_state=42)
    val_df = val_df.sample(50, random_state=42)

# The encoder is fitted on the training labels only; validation labels are
# mapped through that same fitted encoder.
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_df['dx'])
val_labels = label_encoder.transform(val_df['dx'])
num_classes = len(label_encoder.classes_)
print("Number of classes:", num_classes)

# 3. Image Loading & Preprocessing
def load_and_preprocess_image(image_path, augment=False):
    """Load one image from disk and prepare it as ResNet50-style model input.

    Args:
        image_path: Path of the image file to read.
        augment: When True, apply a random horizontal flip followed by small
            random brightness and contrast jitter before preprocessing.

    Returns:
        A float tensor of shape (224, 224, 3) that has been passed through
        ``resnet_preprocess``.
    """
    img = tf.io.read_file(image_path)
    # decode_image detects the file format itself. The generator in this file
    # passes ".png" paths, so a format-specific JPEG decoder was misleading.
    # expand_animations=False keeps the result 3-D so resize accepts it.
    img = tf.io.decode_image(img, channels=3, expand_animations=False)
    img = tf.image.resize(img, (224, 224))
    if augment:
        img = tf.image.random_flip_left_right(img)
        # After resize the pixels are floats in [0, 255], and the brightness
        # delta is added as-is to float images. A max_delta of 0.1 was
        # therefore a shift of at most 0.1 grey levels; scale it to 10% of
        # the value range instead.
        img = tf.image.random_brightness(img, max_delta=0.1 * 255.0)
        img = tf.image.random_contrast(img, lower=0.9, upper=1.1)
        # Keep the jittered pixels inside the valid range before preprocessing.
        img = tf.clip_by_value(img, 0.0, 255.0)
    img = resnet_preprocess(img)
    return img

# 4. Data Generator
def data_generator(df, image_folder, labels, augment=False):
    """Yield shuffled ``(images, one_hot_labels)`` batches indefinitely.

    The module-level ``batch_size`` and ``num_classes`` are read while the
    generator runs. Rows whose image file is missing or fails to load are
    reported and skipped, so a batch may hold fewer than ``batch_size`` items.

    Args:
        df: DataFrame with an ``image_id`` column; rows are addressed by position.
        image_folder: Directory holding ``<image_id stem>.png`` files.
        labels: Integer class labels, positionally aligned with ``df``.
        augment: Forwarded to ``load_and_preprocess_image``.

    Yields:
        A tuple of a stacked image tensor and a float32 one-hot label tensor.

    Raises:
        RuntimeError: If a full pass over ``df`` produces no batch at all.
            Without this guard the endless loop would spin forever when the
            frame is empty or every image is missing.
    """
    num_samples = len(df)
    indices = np.arange(num_samples)
    # Read the id column once instead of doing a df.iloc lookup per sample.
    image_ids = [str(value) for value in df['image_id']]
    while True:
        np.random.shuffle(indices)
        produced_batch = False
        for start in range(0, num_samples, batch_size):
            batch_indices = indices[start:start + batch_size]
            batch_images = []
            batch_labels = []
            for idx in batch_indices:
                try:
                    # Metadata ids may carry another extension; files on disk are .png.
                    filename = os.path.splitext(image_ids[idx])[0] + ".png"
                    image_path = os.path.join(image_folder, filename)
                    if not os.path.exists(image_path):
                        print(f"Warning: {image_path} not found. Skipping.")
                        continue
                    img = load_and_preprocess_image(image_path, augment=augment)
                    batch_images.append(img)
                    batch_labels.append(labels[idx])
                except Exception as e:
                    # Deliberately best-effort: one unreadable sample must not
                    # abort training, so report it and move on.
                    print(f"Error processing index {idx}: {e}")
                    continue
            if not batch_images:
                continue
            images_tensor = tf.stack(batch_images)
            labels_tensor = tf.convert_to_tensor(to_categorical(batch_labels, num_classes=num_classes), dtype=tf.float32)
            produced_batch = True
            yield images_tensor, labels_tensor
        if not produced_batch:
            raise RuntimeError(
                f"No usable images found in {image_folder}; cannot produce any batch."
            )

# Element spec shared by both datasets: a variable-length batch of 224x224 RGB
# images and the matching one-hot labels.
output_signature = (
    tf.TensorSpec(shape=(None, 224, 224, 3), dtype=tf.float32),
    tf.TensorSpec(shape=(None, num_classes), dtype=tf.float32),
)

# output_signature replaces the deprecated output_types/output_shapes pair
# that was passed here before (the signature above was built but never used).
train_dataset = tf.data.Dataset.from_generator(
    lambda: data_generator(train_df, train_folder, train_labels, augment=True),
    output_signature=output_signature,
).prefetch(tf.data.AUTOTUNE)

val_dataset = tf.data.Dataset.from_generator(
    lambda: data_generator(val_df, val_folder, val_labels, augment=False),
    output_signature=output_signature,
).prefetch(tf.data.AUTOTUNE)

# The generators never terminate, so Keras needs explicit step counts to know
# where an epoch ends.
steps_per_epoch = math.ceil(len(train_df) / batch_size)
validation_steps = math.ceil(len(val_df) / batch_size)
print(f"Steps per epoch (train): {steps_per_epoch}")
print(f"Steps per epoch (val): {validation_steps}")

# 5. Build the ResNet50 + DenseNet121 Hybrid Model
# Google Drive location the checkpoint is loaded from and copied back to.
checkpoint_drive_path = "/content/drive/MyDrive/model_checkpoint.keras"
# Working copy in the current directory; ModelCheckpoint writes to this file.
local_checkpoint_path = "model_checkpoint.keras"

def build_model(num_classes):
    """Build the hybrid classifier: frozen ResNet50 and DenseNet121 plus a dense head.

    Both ImageNet backbones receive the same input image. Their globally
    average-pooled features are concatenated and passed through
    Dropout -> Dense(256, relu) -> Dropout -> softmax.

    Args:
        num_classes: Size of the softmax output layer.

    Returns:
        An uncompiled ``Model`` taking a (224, 224, 3) image input.
    """
    image_input = Input(shape=(224, 224, 3), name="image_input")
    resnet_model = ResNet50(include_top=False, weights="imagenet", name="resnet50")
    densenet_model = DenseNet121(include_top=False, weights="imagenet", name="densenet121")
    # Freeze both backbones so only the head below is trained.
    for layer in resnet_model.layers:
        layer.trainable = False
    for layer in densenet_model.layers:
        layer.trainable = False
    # NOTE(review): the data pipeline in this file preprocesses images with the
    # ResNet50 preprocess_input only, and DenseNet121 is fed the same tensor.
    # Keras documents a separate densenet.preprocess_input - confirm that
    # sharing the ResNet preprocessing is intended.
    resnet_features = GlobalAveragePooling2D()(resnet_model(image_input))
    densenet_features = GlobalAveragePooling2D()(densenet_model(image_input))
    combined_features = Concatenate()([resnet_features, densenet_features])
    x = Dropout(0.5)(combined_features)
    # Feed the dropout output into the dense layer. Previously Dense was
    # applied to combined_features, which silently discarded the dropout above.
    x = Dense(256, activation="relu")(x)
    x = Dropout(0.5)(x)
    output = Dense(num_classes, activation="softmax", name="output")(x)
    return Model(inputs=image_input, outputs=output)

# Validation accuracy a newly trained checkpoint has to beat; replaced below
# by the loaded model's measured accuracy when a usable checkpoint exists.
previous_best_accuracy = 0.83
if not os.path.exists(checkpoint_drive_path):
    print("No checkpoint found. Building a new model...")
    model_hybrid = build_model(num_classes)
else:
    print("Found checkpoint on Google Drive. Copying to local storage...")
    os.system(f'cp "{checkpoint_drive_path}" "{local_checkpoint_path}"')
    try:
        model_hybrid = tf.keras.models.load_model(local_checkpoint_path)
        # Accept the checkpoint only if it has a single input and its output
        # width matches the current label set.
        if len(model_hybrid.inputs) == 1 and model_hybrid.output_shape[-1] == num_classes:
            print("Model loaded from checkpoint.")
            initial_val_loss, initial_val_accuracy = model_hybrid.evaluate(
                val_dataset,
                steps=validation_steps,
                verbose=2,
            )
            print(f"Loaded model initial validation accuracy: {initial_val_accuracy:.4f}")
            if initial_val_accuracy < 0.80:
                # The checkpoint scores below the 0.80 floor: start over.
                print("Warning: Loaded accuracy is too low. Building new model...")
                model_hybrid = build_model(num_classes)
            else:
                previous_best_accuracy = initial_val_accuracy
        else:
            print("Warning: Loaded model mismatch. Building new model...")
            model_hybrid = build_model(num_classes)
    except Exception as err:
        # Any failure while loading or evaluating falls back to a fresh model.
        print(f"Error loading checkpoint: {err}. Building new model...")
        model_hybrid = build_model(num_classes)

# (Re)compile whichever model was selected, with a small fixed learning rate.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-6)
model_hybrid.compile(
    optimizer=optimizer,
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
model_hybrid.summary()

# 6. Evaluate Initial Model
# When a Drive checkpoint exists, the accuracy chosen during loading is reused;
# otherwise the freshly built model is measured once before training.
if os.path.exists(checkpoint_drive_path):
    initial_val_accuracy = previous_best_accuracy
else:
    initial_val_loss, initial_val_accuracy = model_hybrid.evaluate(
        val_dataset,
        steps=validation_steps,
        verbose=2,
    )
    print("Initial validation accuracy: {:.4f}".format(initial_val_accuracy))

# 7. Train the Model
checkpoint_callback = ModelCheckpoint(
    filepath=local_checkpoint_path,
    monitor='val_accuracy',
    save_best_only=True,
    save_weights_only=False,
    mode='max',
    verbose=1
)
# Seed the callback's running best with the accuracy to beat, so the local
# checkpoint file is meant to be overwritten only by a better epoch.
checkpoint_callback.best = previous_best_accuracy

early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=10,
    verbose=1,
    restore_best_weights=True
)

print(f"Training with previous best accuracy set to: {previous_best_accuracy:.4f}")
# Both modes run the same fit() call; TEST_MODE only changed the data subset,
# batch size and epoch count earlier in the notebook.
print("Running in TEST MODE..." if TEST_MODE else "Running in FULL TRAINING MODE...")
history = model_hybrid.fit(
    train_dataset,
    steps_per_epoch=steps_per_epoch,
    validation_data=val_dataset,
    validation_steps=validation_steps,
    epochs=epochs,
    verbose=1,
    callbacks=[checkpoint_callback, early_stopping]
)

new_best_accuracy = checkpoint_callback.best
print(f"New best validation accuracy: {new_best_accuracy:.4f}")
if new_best_accuracy > previous_best_accuracy:
    print("New accuracy beats previous best. Copying checkpoint to Google Drive...")
    # shutil.copy raises if the copy fails, so the success message below is
    # only printed when the file really reached Drive.
    shutil.copy(local_checkpoint_path, checkpoint_drive_path)
    print("Checkpoint saved to Google Drive.")
else:
    print("New accuracy does not exceed previous best. Checkpoint not saved to Google Drive.")

# 8. Evaluate Model Performance
def get_predictions_and_labels(model, dataset, steps):
    """Collect predicted and true class indices over ``steps`` batches.

    Args:
        model: Object exposing ``predict(images)`` that returns per-class scores.
        dataset: Object exposing ``take(steps)`` that iterates
            ``(images, one_hot_labels)`` batches.
        steps: Number of batches to consume.

    Returns:
        Two 1-D NumPy arrays, ``(predicted_indices, true_indices)``, each the
        arg-max over the class axis.
    """
    predicted, actual = [], []
    for batch_images, batch_onehot in dataset.take(steps):
        scores = model.predict(batch_images)
        predicted += list(np.argmax(scores, axis=1))
        actual += list(np.argmax(batch_onehot, axis=1))
    return np.array(predicted), np.array(actual)

# Score the in-memory model on the validation batches, then print the
# confusion matrix followed by the per-class report.
preds, true_labels = get_predictions_and_labels(
    model_hybrid, val_dataset, validation_steps
)
cm = confusion_matrix(true_labels, preds)
print("Confusion Matrix:", cm, sep="\n")
report = classification_report(
    true_labels,
    preds,
    target_names=label_encoder.classes_,
)
print("Classification Report:", report, sep="\n")

In [None]:
# Skin Cancer Detection - Version 5: Test Dataset Evaluation

import os
import math
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import confusion_matrix, classification_report
from google.colab import drive
from sklearn.preprocessing import LabelEncoder

# Mount Google Drive
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

# Load the pre-trained model
checkpoint_path = '/content/drive/MyDrive/model_checkpoint.keras'
if not os.path.exists(checkpoint_path):
    raise FileNotFoundError(f'Model checkpoint not found at {checkpoint_path}')
model_hybrid = tf.keras.models.load_model(checkpoint_path)
print('Model loaded successfully from checkpoint')

# Load and prepare test data
train_meta_file = '/content/SkinCancerDetection/train_metadata.csv'
test_meta_file = '/content/SkinCancerDetection/test_metadata.csv'
test_folder = '/content/SkinCancerDetection/test'
test_df = pd.read_csv(test_meta_file)

# Initialize and fit LabelEncoder
# The class -> index mapping must be the one used in training. Fitting on the
# test labels alone shifts the indices whenever the test split lacks a class,
# so the encoder is fitted on the training metadata when it is available.
label_encoder = LabelEncoder()
if os.path.exists(train_meta_file):
    label_encoder.fit(pd.read_csv(train_meta_file)['dx'])
else:
    print(f'Warning: {train_meta_file} not found. Fitting label encoder on test labels.')
    label_encoder.fit(test_df['dx'])
# transform raises ValueError if the test split has a label the encoder never saw.
test_labels = label_encoder.transform(test_df['dx'])
num_classes = len(label_encoder.classes_)

# Catch a label-set / model mismatch here instead of as a shape error later.
if model_hybrid.output_shape[-1] != num_classes:
    raise ValueError(
        f'Model outputs {model_hybrid.output_shape[-1]} classes but the label encoder has {num_classes}'
    )

# Define data preprocessing and generator
def load_and_preprocess_image(image_path, augment=False):
    """Load one image from disk and prepare it as ResNet50-style model input.

    Args:
        image_path: Path of the image file to read.
        augment: Accepted so callers can pass it through, but unused here; no
            augmentation is applied in this evaluation version.

    Returns:
        A float tensor of shape (224, 224, 3) after ResNet50 ``preprocess_input``.
    """
    img = tf.io.read_file(image_path)
    # decode_image detects the file format itself. The generator in this cell
    # passes ".png" paths, so a format-specific JPEG decoder was misleading.
    # expand_animations=False keeps the result 3-D so resize accepts it.
    img = tf.io.decode_image(img, channels=3, expand_animations=False)
    img = tf.image.resize(img, (224, 224))
    img = tf.keras.applications.resnet50.preprocess_input(img)
    return img

def data_generator(df, image_folder, labels, augment=False):
    """Yield shuffled ``(images, one_hot_labels)`` batches indefinitely.

    The module-level ``batch_size`` and ``num_classes`` are read while the
    generator runs, so both must be defined before the first batch is
    requested. Rows whose image file is missing are reported and skipped, so a
    batch may hold fewer than ``batch_size`` items.

    Args:
        df: DataFrame with an ``image_id`` column; rows are addressed by position.
        image_folder: Directory holding ``<image_id stem>.png`` files.
        labels: Integer class labels, positionally aligned with ``df``.
        augment: Forwarded to ``load_and_preprocess_image``.

    Yields:
        A tuple of a stacked image tensor and a float32 one-hot label tensor.

    Raises:
        RuntimeError: If a full pass over ``df`` produces no batch at all.
            Without this guard the endless loop would spin forever when the
            frame is empty or every image is missing.
    """
    num_samples = len(df)
    indices = np.arange(num_samples)
    # Read the id column once instead of doing a df.iloc lookup per sample.
    image_ids = [str(value) for value in df['image_id']]
    while True:
        np.random.shuffle(indices)
        produced_batch = False
        for start in range(0, num_samples, batch_size):
            batch_indices = indices[start:start + batch_size]
            batch_images = []
            batch_labels = []
            for idx in batch_indices:
                # Metadata ids may carry another extension; files on disk are .png.
                filename = os.path.splitext(image_ids[idx])[0] + '.png'
                image_path = os.path.join(image_folder, filename)
                if not os.path.exists(image_path):
                    print(f'Warning: {image_path} not found. Skipping.')
                    continue
                img = load_and_preprocess_image(image_path, augment=augment)
                batch_images.append(img)
                batch_labels.append(labels[idx])
            if not batch_images:
                continue
            images_tensor = tf.stack(batch_images)
            labels_tensor = tf.convert_to_tensor(tf.keras.utils.to_categorical(batch_labels, num_classes=num_classes), dtype=tf.float32)
            produced_batch = True
            yield images_tensor, labels_tensor
        if not produced_batch:
            raise RuntimeError(
                f'No usable images found in {image_folder}; cannot produce any batch.'
            )

# Batch size and step count come first: data_generator reads batch_size as a
# module-level name once the dataset is iterated.
batch_size = 16
test_steps = math.ceil(len(test_df) / batch_size)

# Define output signature
output_signature = (
    tf.TensorSpec(shape=(None, 224, 224, 3), dtype=tf.float32),
    tf.TensorSpec(shape=(None, num_classes), dtype=tf.float32),
)

# Wrap the generator in a tf.data pipeline, then add prefetching.
test_dataset = tf.data.Dataset.from_generator(
    lambda: data_generator(test_df, test_folder, test_labels, augment=False),
    output_signature=output_signature,
)
test_dataset = test_dataset.prefetch(tf.data.AUTOTUNE)

print(f'Test steps: {test_steps}')

# Evaluate on test dataset
test_loss, test_accuracy = model_hybrid.evaluate(
    test_dataset,
    steps=test_steps,
    verbose=1,
)
print(f'Test Loss: {test_loss:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')

# Get predictions and labels
def get_predictions_and_labels(model, dataset, steps):
    """Collect predicted and true class indices over ``steps`` batches.

    Args:
        model: Object exposing ``predict(images)`` that returns per-class scores.
        dataset: Object exposing ``take(steps)`` that iterates
            ``(images, one_hot_labels)`` batches.
        steps: Number of batches to consume.

    Returns:
        Two 1-D NumPy arrays, ``(predicted_indices, true_indices)``, each the
        arg-max over the class axis.
    """
    pred_chunks = []
    label_chunks = []
    for images, labels in dataset.take(steps):
        pred_chunks.append(np.argmax(model.predict(images), axis=1))
        label_chunks.append(np.argmax(labels, axis=1))
    # Flatten the per-batch index arrays into one flat list each.
    flat_preds = [index for chunk in pred_chunks for index in chunk]
    flat_labels = [index for chunk in label_chunks for index in chunk]
    return np.array(flat_preds), np.array(flat_labels)

# Re-run the model batch by batch to get per-sample predictions, and
# cross-check the accuracy reported by evaluate().
test_preds, test_true_labels = get_predictions_and_labels(
    model_hybrid, test_dataset, test_steps
)
manual_accuracy = np.mean(test_preds == test_true_labels)
print(f'Manually Calculated Test Accuracy: {manual_accuracy:.4f}')

# Compute and print confusion matrix and classification report
test_cm = confusion_matrix(test_true_labels, test_preds)
print('\nConfusion Matrix (Test):', test_cm, sep='\n')

test_report = classification_report(
    test_true_labels,
    test_preds,
    target_names=label_encoder.classes_,
)
print('\nClassification Report (Test):', test_report, sep='\n')

# Save results to file
output_file = '/content/drive/MyDrive/test_evaluation_detailed2.txt'
# Same pieces, in the same order, as the printed summary above.
report_parts = [
    f'Test Loss: {test_loss:.4f}\n',
    f'Test Accuracy: {test_accuracy:.4f}\n',
    f'Manually Calculated Test Accuracy: {manual_accuracy:.4f}\n',
    '\nConfusion Matrix (Test):\n',
    np.array2string(test_cm),
    '\n\nClassification Report (Test):\n',
    test_report,
]
with open(output_file, 'w') as out:
    out.writelines(report_parts)
print(f'Test evaluation results saved to {output_file}')