In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.preprocessing.image import ImageDataGenerator

import kagglehub
from kagglehub import KaggleDatasetAdapter


import random
# Pin every random source this notebook touches to the same fixed value.
# NOTE: PYTHONHASHSEED is read at interpreter start-up, so assigning it here
# only reaches child processes spawned later, not the running kernel.
os.environ['PYTHONHASHSEED'] = '0'
for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
    apply_seed(0)

# Show which accelerators TensorFlow can see before any model is built.
print("GPU Available: ", tf.config.list_physical_devices('GPU'))


In [None]:
# Fetch the dataset through kagglehub and remember where it was placed.
dataset_path = kagglehub.dataset_download('ambarish/breakhis')
print(f"Dataset downloaded to: {dataset_path}")

# Print an indented directory tree. Every directory is listed, but file names
# are only shown for the top three levels, and at most five per directory.
for current_dir, _subdirs, filenames in os.walk(dataset_path):
    depth = current_dir.replace(dataset_path, '').count(os.sep)
    pad = ' ' * (4 * depth)
    print(f"{pad}{os.path.basename(current_dir)}/")

    if depth >= 3:
        continue

    for filename in filenames[:5]:
        print(f"{pad}    {filename}")

    hidden_count = len(filenames) - 5
    if hidden_count > 0:
        print(f"{pad}    ... ({hidden_count} more files)")


In [None]:
# Both class folders hang off the same nested directory inside the download;
# build that shared prefix once and branch off it per class.
breast_slides_dir = os.path.join(dataset_path, 'BreaKHis_v1', 'BreaKHis_v1', 'histology_slides', 'breast')
benign_path = os.path.join(breast_slides_dir, 'benign', 'SOB')
malignant_path = os.path.join(breast_slides_dir, 'malignant', 'SOB')

def collect_image_paths(base_path, label, magnifications=('40X', '100X', '200X', '400X')):
    """Collect PNG files laid out as ``base_path/<subtype>/<patient>/<magnification>/``.

    Directory listings are sorted at every level so the returned order is the
    same on every machine. ``os.listdir`` makes no ordering guarantee, and a
    seeded split of the resulting rows is only reproducible if the row order is.

    Args:
        base_path: Directory whose immediate children are subtype folders.
        label: Value repeated once per collected image in the second return list.
        magnifications: Names of the magnification sub-folders to scan inside
            each patient folder, visited in the given order.

    Returns:
        A ``(image_paths, labels)`` tuple of equal-length lists. Both are empty
        when nothing matches.

    Raises:
        OSError: If ``base_path`` cannot be listed (for example, it does not exist).
    """
    image_paths = []

    for subtype in sorted(os.listdir(base_path)):
        subtype_path = os.path.join(base_path, subtype)
        if not os.path.isdir(subtype_path):
            continue

        for patient in sorted(os.listdir(subtype_path)):
            patient_path = os.path.join(subtype_path, patient)
            if not os.path.isdir(patient_path):
                continue

            for magnification in magnifications:
                mag_path = os.path.join(patient_path, magnification)
                # isdir (not exists): a stray file with a magnification name
                # would otherwise make os.listdir raise.
                if not os.path.isdir(mag_path):
                    continue

                image_paths.extend(
                    os.path.join(mag_path, img_file)
                    for img_file in sorted(os.listdir(mag_path))
                    # Accept .PNG as well as .png.
                    if img_file.lower().endswith('.png')
                )

    return image_paths, [label] * len(image_paths)


# Gather paths per class: benign images are labelled 0, malignant images 1.
benign_paths, benign_labels = collect_image_paths(benign_path, 0)
malignant_paths, malignant_labels = collect_image_paths(malignant_path, 1)

# Benign rows come first, then malignant rows; paths and labels stay aligned.
all_image_paths = [*benign_paths, *malignant_paths]
all_labels = [*benign_labels, *malignant_labels]

print(f"Total images: {len(all_image_paths)}")
print(f"Benign images: {len(benign_paths)}")
print(f"Malignant images: {len(malignant_paths)}")

# One row per image: where it lives and which class it belongs to.
df = pd.DataFrame(dict(image_path=all_image_paths, label=all_labels))

# Bar chart of how many images each class has.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x='label', data=df, ax=ax)
ax.set_title('Class Distribution')
ax.set_xlabel('Class (0: Benign, 1: Malignant)')
ax.set_ylabel('Count')
plt.show()


In [None]:
from sklearn.model_selection import StratifiedGroupKFold, train_test_split


def _patient_ids(frame):
    """Return the patient folder name for every row of ``frame``.

    ``image_path`` is ``<...>/<subtype>/<patient>/<magnification>/<file>``, so
    the patient folder is two directory levels above the file.
    NOTE(review): if one patient can appear under several differently named
    folders, normalise the id here before grouping — confirm against the data.
    """
    return frame['image_path'].map(
        lambda path: os.path.basename(os.path.dirname(os.path.dirname(path)))
    )


def _split_by_patient(frame, n_folds=5, seed=42):
    """Hold out roughly ``1 / n_folds`` of ``frame`` without splitting a patient.

    A plain image-level split places images of the same patient (and the same
    slide at other magnifications) on both sides, which leaks information into
    validation/test and inflates the reported metrics. Here the split is grouped
    by patient and stratified by label. Because whole patients are moved, the
    held-out share is only approximately ``1 / n_folds``.

    Args:
        frame: DataFrame with ``image_path`` and ``label`` columns.
        n_folds: Number of folds; the first fold is used as the held-out part.
        seed: Random state for the fold assignment.

    Returns:
        ``(kept, held_out)`` as independent DataFrame copies.
    """
    splitter = StratifiedGroupKFold(n_splits=n_folds, shuffle=True, random_state=seed)
    kept_idx, held_out_idx = next(
        splitter.split(frame, frame['label'], groups=_patient_ids(frame))
    )
    # .copy() so later column assignments do not act on a view of `frame`.
    return frame.iloc[kept_idx].copy(), frame.iloc[held_out_idx].copy()


# ~20% of the data is held out for testing, then ~20% of the rest for validation.
train_val_df, test_df = _split_by_patient(df)
train_df, val_df = _split_by_patient(train_val_df)

# Guard against leakage: no patient may appear in more than one split.
_train_patients = set(_patient_ids(train_df))
_val_patients = set(_patient_ids(val_df))
_test_patients = set(_patient_ids(test_df))
assert _train_patients.isdisjoint(_val_patients), "patient shared by train and validation"
assert _train_patients.isdisjoint(_test_patients), "patient shared by train and test"
assert _val_patients.isdisjoint(_test_patients), "patient shared by validation and test"
assert len(train_df) + len(val_df) + len(test_df) == len(df)

print(f"Training set: {len(train_df)} images")
print(f"Validation set: {len(val_df)} images")
print(f"Testing set: {len(test_df)} images")


In [None]:
IMG_SIZE = 224
BATCH_SIZE = 32

# The ImageNet ResNet50 weights were trained on inputs passed through
# resnet50.preprocess_input (RGB -> BGR plus per-channel mean subtraction, no
# scaling). Feeding 1/255-rescaled images instead gives the pretrained backbone
# inputs it was never trained on, so the same preprocessing is applied to
# every split.
resnet_preprocess = tf.keras.applications.resnet50.preprocess_input

# Augmentation is applied to the training images only.
train_datagen = ImageDataGenerator(
    preprocessing_function=resnet_preprocess,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    vertical_flip=True,
    fill_mode='nearest'
)

val_test_datagen = ImageDataGenerator(preprocessing_function=resnet_preprocess)

# class_mode='binary' needs string labels. `.assign` builds new frames rather
# than writing into the split slices, so this cell can be re-run safely.
train_df = train_df.assign(label=train_df['label'].astype(str))
val_df = val_df.assign(label=val_df['label'].astype(str))
test_df = test_df.assign(label=test_df['label'].astype(str))


def create_generator(dataframe, generator, batch_size=BATCH_SIZE, shuffle=True):
    """Build a binary-label image iterator over the files listed in ``dataframe``.

    Args:
        dataframe: Frame with an ``image_path`` column (file paths) and a
            ``label`` column (string class labels).
        generator: ``ImageDataGenerator`` supplying preprocessing/augmentation.
        batch_size: Number of images per batch.
        shuffle: Whether to reshuffle the rows every epoch. Turn this off for
            evaluation so predictions stay aligned with the iterator's
            ``classes`` / ``filenames``.

    Returns:
        The iterator produced by ``generator.flow_from_dataframe``, yielding
        images resized to ``IMG_SIZE x IMG_SIZE``.
    """
    return generator.flow_from_dataframe(
        dataframe=dataframe,
        x_col='image_path',
        y_col='label',
        target_size=(IMG_SIZE, IMG_SIZE),
        batch_size=batch_size,
        class_mode='binary',
        shuffle=shuffle
    )


train_generator = create_generator(train_df, train_datagen)
# Evaluation data is iterated in a fixed order.
val_generator = create_generator(val_df, val_test_datagen, shuffle=False)
test_generator = create_generator(test_df, val_test_datagen, batch_size=1, shuffle=False)



In [None]:
strategy = tf.distribute.MirroredStrategy()
print(f"Number of devices: {strategy.num_replicas_in_sync}")

with strategy.scope():
    # ImageNet-pretrained ResNet50 without its classification top.
    base_model = ResNet50(
        include_top=False,
        weights='imagenet',
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
    )

    # Freeze every backbone layer so only the new head below is trained.
    for backbone_layer in base_model.layers:
        backbone_layer.trainable = False

    # Classification head: pooling -> 1024 -> dropout -> 512 -> dropout -> sigmoid.
    pooled = GlobalAveragePooling2D()(base_model.output)
    fc1 = Dense(1024, activation='relu')(pooled)
    fc1_dropped = Dropout(0.5)(fc1)
    fc2 = Dense(512, activation='relu')(fc1_dropped)
    fc2_dropped = Dropout(0.3)(fc2)
    predictions = Dense(1, activation='sigmoid')(fc2_dropped)

    # Backbone input through to the single sigmoid output.
    model = Model(inputs=base_model.input, outputs=predictions)

    tracked_metrics = [
        'accuracy',
        tf.keras.metrics.AUC(),
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
    ]
    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(learning_rate=0.0001),
        metrics=tracked_metrics,
    )

model.summary()


In [None]:
# Phase-1 callbacks.
# Keep on disk only the weights with the highest validation accuracy so far.
checkpoint = ModelCheckpoint(
    filepath='resnet50_breakhis_best.keras',
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)

# The other two callbacks both watch the validation loss.
val_loss_watch = {'monitor': 'val_loss', 'verbose': 1}

# Stop after 10 epochs without improvement and roll back to the best weights.
early_stopping = EarlyStopping(patience=10, restore_best_weights=True, **val_loss_watch)

# Multiply the learning rate by 0.2 after 5 stagnant epochs, down to 1e-6.
reduce_lr = ReduceLROnPlateau(factor=0.2, patience=5, min_lr=1e-6, **val_loss_watch)

callbacks = [checkpoint, early_stopping, reduce_lr]


In [None]:
# Phase 1: train the model (head only) for up to 20 epochs; one epoch is one
# full pass over each generator.
history = model.fit(
    x=train_generator,
    validation_data=val_generator,
    epochs=20,
    steps_per_epoch=len(train_generator),
    validation_steps=len(val_generator),
    callbacks=callbacks,
    verbose=1,
)

# Snapshot of the model as it stands at the end of phase 1.
model.save('resnet50_breakhis_phase1.keras')


In [None]:
print("Unfreezing layers for fine-tuning...")
for layer in base_model.layers[-30:]:  # Unfreeze the last 30 layers
    # Leave BatchNormalization frozen: an unfrozen BN layer starts updating its
    # moving statistics from the small fine-tuning batches, which can undo the
    # pretrained features the head was just fitted on.
    if isinstance(layer, tf.keras.layers.BatchNormalization):
        continue
    layer.trainable = True

# Report how many layers of the full model are now trainable vs. frozen.
trainable_count = sum(1 for layer in model.layers if layer.trainable)
non_trainable_count = sum(1 for layer in model.layers if not layer.trainable)
print(f"Trainable layers: {trainable_count}")
print(f"Non-trainable layers: {non_trainable_count}")

# Recompile so the changed `trainable` flags take effect, with a 10x smaller
# learning rate than phase 1. Metrics are named explicitly so the history keys
# do not pick up auto-generated suffixes such as `auc_1`.
with strategy.scope():
    model.compile(
        optimizer=Adam(learning_rate=1e-5),
        loss='binary_crossentropy',
        metrics=[
            'accuracy',
            tf.keras.metrics.AUC(name='auc'),
            tf.keras.metrics.Precision(name='precision'),
            tf.keras.metrics.Recall(name='recall'),
        ]
    )


In [None]:
# Define callbacks for the second phase
# Keep on disk only the fine-tuned weights with the best validation accuracy.
checkpoint_ft = ModelCheckpoint(
    'resnet50_breakhis_finetuned.keras',
    monitor='val_accuracy',
    save_best_only=True,
    mode='max',
    verbose=1
)

# Stop after 10 epochs without a better validation loss; restore the best weights.
early_stopping_ft = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
    verbose=1
)

# Multiply the learning rate by 0.2 after 5 stagnant epochs, down to 1e-7.
reduce_lr_ft = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=5,
    min_lr=1e-7,
    verbose=1
)

callbacks_ft = [checkpoint_ft, early_stopping_ft, reduce_lr_ft]

# Number of epochs to run in this phase.
FINE_TUNE_EPOCHS = 30

# Epoch numbering continues after the last epoch phase 1 actually ran (it may
# have stopped early). Falls back to 0 if phase 1 recorded no epochs.
fine_tune_start = history.epoch[-1] + 1 if history.epoch else 0

# Continue training with unfrozen layers
print("Fine-tuning the model with unfrozen layers...")
history_fine_tune = model.fit(
    train_generator,
    steps_per_epoch=len(train_generator),
    # With `initial_epoch`, `epochs` is the index of the final epoch, not a
    # count. A bare 30 would give only `30 - fine_tune_start` epochs here.
    epochs=fine_tune_start + FINE_TUNE_EPOCHS,
    validation_data=val_generator,
    validation_steps=len(val_generator),
    callbacks=callbacks_ft,
    initial_epoch=fine_tune_start  # Continue from where we left off
)

# Save the fine-tuned model
model.save('resnet50_breakhis_final.keras')


In [None]:
# Plot the training history of both phases on a shared epoch axis.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

training_phases = (
    ('Phase 1', history),
    ('Phase 2', history_fine_tune),
)
# (history key, panel title, y-axis label) for each subplot.
panels = (
    ('accuracy', 'Model Accuracy', 'Accuracy'),
    ('loss', 'Model Loss', 'Loss'),
)

for ax, (metric, title, ylabel) in zip(axes, panels):
    for phase_name, phase_history in training_phases:
        # `History.epoch` holds the real epoch indices, so phase 2 is drawn
        # after phase 1 instead of on top of it starting from x=0.
        ax.plot(phase_history.epoch, phase_history.history[metric],
                label=f'{phase_name} Train')
        ax.plot(phase_history.epoch, phase_history.history[f'val_{metric}'],
                label=f'{phase_name} Validation')

    # Mark where fine-tuning begins (skipped if phase 2 recorded no epochs).
    if history_fine_tune.epoch:
        ax.axvline(history_fine_tune.epoch[0], color='gray', linestyle='--',
                   label='Start Fine-Tuning')

    ax.set_title(title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(ylabel)
    ax.legend()

fig.tight_layout()
fig.savefig('training_history.png')
plt.show()


In [None]:
# Export the architecture as JSON. It is serialised before the file is opened,
# so a failure in to_json() leaves no empty file behind.
architecture_json = model.to_json()
with open("resnet50_breakhis_model.json", "w") as architecture_file:
    architecture_file.write(architecture_json)
print("Model architecture saved to disk")

# Export the weights on their own, separate from the architecture.
model.save_weights("resnet50_breakhis_weights.weights.h5")
print("Model weights saved to disk")

print("Training and fine-tuning complete. Check the 'Data' tab to download your models.")
