<font color='red'>**Introduction:**</font>

Histopathologic cancer detection involves identifying metastatic cancer in pathology scans. The dataset used in this project consists of labeled images, where detections are labeled 1 and absense 0. 

# <font color='red'>1. EDA
  </font>

## <font color='green'>1.1 Image Visualization</font>

In [None]:
# Required Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import os

# Load train labels
train_labels = pd.read_csv('/kaggle/input/histopathologic-cancer-detection/train_labels.csv')

In [None]:
from tensorflow.keras.preprocessing.image import load_img, img_to_array

# Define dimensions to resize image
IMG_WIDTH, IMG_HEIGHT = 96, 96

def resize_image(img_path, target_width=IMG_WIDTH, target_height=IMG_HEIGHT):
    img = load_img(img_path, target_size=(target_width, target_height))
    return img_to_array(img)

# Example usage
sample_path = train_labels['id'].iloc[0]
resized_img = resize_image(f'../input/histopathologic-cancer-detection/train/{sample_path}.tif')
print(f"Resized Image Shape: {resized_img.shape}")

In [None]:
def normalize_image(img):
    return img / 255.0

In [None]:
from sklearn.model_selection import train_test_split
train_data, temp_data, train_labels, temp_labels = train_test_split(train_labels['id'].values, train_labels['label'].values, test_size=0.2, random_state=42)
val_data, test_data, val_labels, test_labels = train_test_split(temp_data, temp_labels, test_size=0.5, random_state=42)

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator, array_to_img, load_img, img_to_array
import matplotlib.pyplot as plt

# Data augmentation configuration
datagen = ImageDataGenerator(
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

model = Sequential()

# Convolutional layers
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(IMG_WIDTH, IMG_HEIGHT, 3)))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

# Fully connected layers
model.add(Flatten())
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))  # Binary classification

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 94, 94, 32)        896       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 47, 47, 32)       0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 45, 45, 64)        18496     
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 22, 22, 64)       0         
 2D)                                                             
                                                                 
 conv2d_2 (Conv2D)           (None, 20, 20, 128)       73856     
                                                                 
 max_pooling2d_2 (MaxPooling  (None, 10, 10, 128)      0

In [9]:
from tensorflow.keras.optimizers import Adam

optimizer = Adam(learning_rate=0.0001)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

In [10]:
train_labels_df = pd.read_csv('../input/histopathologic-cancer-detection/train_labels.csv')
train_labels_df['label'] = train_labels_df['label'].astype(str)  # Convert label column to string

In [11]:

train_labels_df['id'] = train_labels_df['id'].apply(lambda x: f"{x}.tif")


train_datagen = ImageDataGenerator(rescale=1./255, validation_split=0.2)  # Normalize images

batch_size = 32
train_steps = 8000 // batch_size  # 8000 images for training
val_steps = 2000 // batch_size    # 2000 images for validation

train_gen = train_datagen.flow_from_dataframe(
    dataframe=train_labels_df.head(10000),
    directory='../input/histopathologic-cancer-detection/train/',
    x_col='id',
    y_col='label',
    target_size=(IMG_WIDTH, IMG_HEIGHT),
    class_mode='binary',
    batch_size=batch_size,
    subset='training'
)

val_gen = train_datagen.flow_from_dataframe(
    dataframe=train_labels_df.head(10000),
    directory='../input/histopathologic-cancer-detection/train/',
    x_col='id',
    y_col='label',
    target_size=(IMG_WIDTH, IMG_HEIGHT),
    class_mode='binary',
    batch_size=batch_size,
    subset='validation'
)

history = model.fit(
    train_gen,
    steps_per_epoch=train_steps,
    validation_data=val_gen,
    validation_steps=val_steps,
    epochs=10
)

Found 8000 validated image filenames belonging to 2 classes.
Found 2000 validated image filenames belonging to 2 classes.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [12]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve

# Adjust the val_steps
val_steps = np.ceil(len(val_gen.classes) / batch_size)

# Predict classes
val_predictions = model.predict(val_gen, steps=val_steps)
val_pred_classes = (val_predictions > 0.5).astype(int).flatten()

# True labels
true_labels = val_gen.classes

# Ensure the lengths match
val_pred_classes = val_pred_classes[:len(true_labels)]

# Calculate metrics
accuracy = accuracy_score(true_labels, val_pred_classes)
precision = precision_score(true_labels, val_pred_classes)
recall = recall_score(true_labels, val_pred_classes)
f1 = f1_score(true_labels, val_pred_classes)
roc_auc = roc_auc_score(true_labels, val_predictions)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")dd
print(f"Recall: {recall:.4f}")dd
print(f"F1 Score: {f1:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")

Accuracy: 0.5335
Precision: 0.4102
Recall: 0.3932
F1 Score: 0.4015
ROC-AUC: 0.5140


In [None]:
!pip install keras-tuner

from kerastuner import RandomSearch
from kerastuner.engine.hyperparameters import HyperParameters

In [None]:
def build_model(hp):
    model = Sequential()
    
    # Convolutional layers
    model.add(Conv2D(hp.Int('input_units', min_value=32, max_value=64, step=32), (3, 3), activation='relu', input_shape=(IMG_WIDTH, IMG_HEIGHT, 3)))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    
    for i in range(hp.Int('n_layers', 1, 3)):  # adding between 1 and 3 convolutional layers
        model.add(Conv2D(hp.Int(f'conv_{i}_units', min_value=32, max_value=64, step=32), (3, 3), activation='relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))
    
    # Fully connected layers
    model.add(Flatten())
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))  # Binary classification
    
    optimizer = Adam(learning_rate=hp.Choice('learning_rate', [1e-3, 1e-4]))
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

tuner = RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=3,  # reduced number of model configurations to test
    executions_per_trial=1,
    directory='output',
    project_name='HistoPathologicCancerDetection'
)

# Train for fewer epochs during hyperparameter tuning
tuner.search(train_gen, epochs=5, validation_data=val_gen)

In [None]:
from tensorflow.keras.applications import VGG16

# Load the VGG16 model with weights pre-trained on ImageNet
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(IMG_WIDTH, IMG_HEIGHT, 3))

# Freeze the layers of the base model
for layer in base_model.layers:
    layer.trainable = False

# Create a custom model on top
model = Sequential()
model.add(base_model)
model.add(Flatten())
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
from tensorflow.keras.applications import VGG16
from tensorflow.keras.regularizers import l2

# Load the VGG16 model with weights pre-trained on ImageNet
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(IMG_WIDTH, IMG_HEIGHT, 3))

# Freeze the layers of the base model
for layer in base_model.layers:
    layer.trainable = False

model = Sequential()
model.add(base_model)

# Convolutional layers
# Removed pooling layers and adjusted convolutional layers
model.add(Conv2D(32, (3, 3), activation='relu', kernel_regularizer=l2(0.01), padding='same'))
model.add(Conv2D(64, (3, 3), activation='relu', kernel_regularizer=l2(0.01), padding='same'))
model.add(Conv2D(128, (3, 3), activation='relu', kernel_regularizer=l2(0.01), padding='same'))

# Fully connected layers
model.add(Flatten())
model.add(Dense(512, activation='relu', kernel_regularizer=l2(0.01)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))  # Binary classification

optimizer = Adam(learning_rate=0.0001)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])