Copied from Basel Anaya - Cesar Pereiro Garcia

## Prerequisites and Dependencies

In [None]:
! pip install kaggle 

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by the notebook.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
! mkdir ~/.kaggle

In [None]:
# Copy the Kaggle API token from the mounted Drive folder to ~/.kaggle/kaggle.json.
! cp /content/drive/MyDrive/Stat_Docs/kaggle.json ~/.kaggle/kaggle.json

In [None]:
# Restrict the token file to owner read/write only (mode 600).
! chmod 600 ~/.kaggle/kaggle.json

## Dataset Download 

Link to dataset: https://www.kaggle.com/datasets/paultimothymooney/breast-histopathology-images

In [None]:
# Download the dataset archive with the Kaggle CLI.
# NOTE(review): the unzip step expects the archive at /content/breast-histopathology-images.zip — confirm the working directory is /content.
! kaggle datasets download paultimothymooney/breast-histopathology-images

In [None]:

# unzip content
! unzip /content/breast-histopathology-images.zip -d /content/breast-histopathology-images    

In [None]:
# Check disk usage (human-readable sizes)
! df -h

In [None]:
! pip install plotly 
! pip install seaborn 

In [None]:
import os
from os import listdir
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import matplotlib.patches as patches
import plotly.express as px
import seaborn as sns

import cv2
from matplotlib.image import imread
import tensorflow.keras as keras
import tensorflow as tf

from keras.preprocessing import image
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix


import glob
import PIL
import random

# Seed every random number generator the notebook relies on, not only Python's:
# NumPy and TensorFlow (weight initialisation, augmentation layers, dropout)
# keep their own generators.
SEED = 100
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
# Root folder that is scanned for image patches. It defaults to the directory the
# archive is extracted into; adjust it if your Colab layout differs. An empty
# string would make os.walk yield nothing and silently produce an empty dataset.
# NOTE(review): if the archive contains a nested copy of the patient folders,
# point this at a root that holds each patch only once — confirm after unzipping.
path = '/content/breast-histopathology-images'
breast_imgs = []


In [None]:
# Collect the path of every PNG file found under `path`.
# The list is rebuilt from scratch (not appended to) so that re-running this
# cell does not add every image a second time.
breast_imgs = [
    os.path.join(root, file)
    for root, dirs, files in os.walk(path)
    for file in files
    if file.endswith('.png')
]



In [None]:
# Six independent, initially empty accumulators for the per-image metadata.
patient_numbers, cancer_status, x_coords, y_coords, file_names, file_paths = (
    [] for _ in range(6)
)

In [None]:
# Parse every image path into patient id, patch coordinates and class label.
# Expected layout (example):
#   /content/breast-histopathology-images/10264/1/10264_idx5_x1601_y1451_class1.png
#   -> patient '10264', x 1601, y 1451, status 1
for img in breast_imgs:
    # Last path component is the file name.
    filename = os.path.basename(img)
    # The patient folder sits two levels above the file (<patient>/<class>/<file>).
    # os.path is used instead of split('/') so the separator is not hard-coded.
    patient_number = os.path.basename(os.path.dirname(os.path.dirname(img)))
    # splitext removes the extension as a suffix; str.rstrip('.png') strips any
    # trailing run of the characters '.', 'p', 'n', 'g' and can eat into the name.
    # Example result: ['10264', 'idx5', 'x1601', 'y1451', 'class1'].
    info = os.path.splitext(filename)[0].split('_')
    x_coord = int(info[2][1:])  # 'x1601' -> 1601 (drop the leading 'x')
    y_coord = int(info[3][1:])  # 'y1451' -> 1451 (drop the leading 'y')
    status = int(info[4][-1])   # 'class1' -> 1 (last character is the label)
    patient_numbers.append(patient_number)
    cancer_status.append(status)
    x_coords.append(x_coord)
    y_coords.append(y_coord)
    file_names.append(filename)
    file_paths.append(img)


In [None]:
# Assemble one row per image patch, ordered by patient and then by grid position.
df = pd.DataFrame(
    dict(
        Patient_Number=patient_numbers,
        Cancer_Status=cancer_status,
        X_Coord=x_coords,
        Y_Coord=y_coords,
        File_Name=file_names,
        File_Path=file_paths,
    )
).sort_values(by=['Patient_Number', 'X_Coord', 'Y_Coord'])

print(df.head())


In [None]:
# Scatter the patch coordinates of the first 15 patients (5 x 3 grid of axes),
# coloured by class: blue = no cancer (0), red = cancer (1).
patient_ids = df['Patient_Number'].unique()  # column name uses an underscore
fig, axs = plt.subplots(5, 3, figsize=(20, 27))  # figsize must be a (width, height) tuple

# axs.flat walks the grid row by row, i.e. in the same order as index 3 * i + j.
for ax, patient_id in zip(axs.flat, patient_ids):
    patient_df = df[df["Patient_Number"] == patient_id]
    negatives = patient_df[patient_df['Cancer_Status'] == 0]
    positives = patient_df[patient_df['Cancer_Status'] == 1]
    ax.scatter(negatives['X_Coord'], negatives['Y_Coord'], c='blue', label='No Cancer', s=20)
    ax.scatter(positives['X_Coord'], positives['Y_Coord'], c='red', label='Cancer', s=20)
    ax.set_title('Patient' + str(patient_id))
    ax.set_xlabel('X Coord')
    ax.set_ylabel("Y Coord")
    ax.legend()

# Hide the axes that stay empty when there are fewer than 15 patients.
for ax in axs.flat[len(patient_ids):]:
    ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# check image dataframe 
# Sanity check: print the first seven entries of the collected image-path list.
for imgname in breast_imgs[:7]:
    print(imgname)

In [None]:
# Expose the image path under the 'Full_Path' name used by the loaders.
# The source column is 'File_Path' (capital P); 'File_path' raises a KeyError.
df['Full_Path'] = df['File_Path']

# Load one image together with its label and coordinates
def load_image_and_coords_from_path(file_path, label, x_coord, y_coord):
    """Read a PNG file and return it with its label and coordinates as float32.

    Args:
        file_path: path of the PNG file to read.
        label: class label of the image.
        x_coord: x coordinate associated with the image.
        y_coord: y coordinate associated with the image.

    Returns:
        Tuple ``(image, label, x_coord, y_coord)``; the image is decoded as
        3-channel RGB and resized to 50x50, the other three values are cast
        to ``tf.float32``.
    """
    raw_bytes = tf.io.read_file(file_path)
    rgb = tf.image.decode_png(raw_bytes, channels=3)
    resized = tf.image.resize(rgb, [50, 50])
    label_f32 = tf.cast(label, tf.float32)
    x_f32 = tf.cast(x_coord, tf.float32)
    y_f32 = tf.cast(y_coord, tf.float32)
    return (resized, label_f32, x_f32, y_f32)


In [None]:
def create_dataset(df, batch_size=512):
    """Build a batched tf.data pipeline from the image DataFrame.

    Args:
        df: DataFrame with the columns 'Full_Path', 'Cancer_Status',
            'X_Coord' and 'Y_Coord'.
        batch_size: number of samples per batch (default 512, the value that
            was previously hard-coded).

    Returns:
        A batched dataset whose elements are the tuples produced by
        ``load_image_and_coords_from_path``: (image, label, x_coord, y_coord).
    """
    # The dataset API lives in tf.data; `tf.Dataset` does not exist.
    path_ds = tf.data.Dataset.from_tensor_slices((
        df['Full_Path'].values,
        df['Cancer_Status'].values,
        df['X_Coord'].values,
        df['Y_Coord'].values
    ))
    dataset = path_ds.map(load_image_and_coords_from_path)
    return dataset.batch(batch_size)


In [None]:
# Regroup a batch into the (inputs, target) structure used for training
def unpack_features_labels(image, label, x_coord, y_coord):
    """Return ``((image, coords), label)`` for a batched element.

    ``coords`` stacks ``x_coord`` and ``y_coord`` along axis 1, giving one
    (x, y) pair per sample in the batch.
    """
    coords = tf.stack([x_coord, y_coord], axis=1)
    features = (image, coords)
    return features, label

In [None]:
# Split the DataFrame into train / validation / test sets.
# First split: 85% train, 15% held out. Second split: the held-out part is
# divided 85% validation / 15% test (about 12.75% / 2.25% of all rows).
train_df, temp_df = train_test_split(df, test_size=0.15, random_state=42, stratify=df['Patient_Number'])
# The stratify labels must come from the frame being split (temp_df); passing
# df['Patient_Number'] here has a different length and raises a ValueError.
valid_df, test_df = train_test_split(temp_df, test_size=0.15, random_state=42, stratify=temp_df['Patient_Number'])

train_dataset = create_dataset(train_df)
train_dataset = train_dataset.map(unpack_features_labels)

# valid_dataset keeps the raw 4-tuples; val_dataset is the ((image, coords), label)
# form that is passed to model.fit.
valid_dataset = create_dataset(valid_df)
val_dataset = valid_dataset.map(unpack_features_labels)

test_dataset = create_dataset(test_df)
test_dataset = test_dataset.map(unpack_features_labels)

In [None]:
# Verify the element order using the first training batch.
# Batch-local names (batch_x / batch_y) are used so the module-level x_coords /
# y_coords lists that feed the DataFrame are not overwritten with tensors.
for (images, coords), labels in train_dataset.take(1):
    batch_x, batch_y = tf.unstack(coords, axis=1)
    for i in range(int(tf.shape(labels)[0])):
        print(f'Image {i}: Label: {labels[i].numpy()}, X_Coord: {batch_x[i].numpy()}, Y_Coord: {batch_y[i].numpy()}')

In [None]:
# load image function
def load_image(file_path):
    """Read a PNG file and return it as a 50x50 RGB NumPy array.

    Args:
        file_path: path of the PNG file to read.

    Returns:
        The decoded 3-channel image, resized to 50x50, as a NumPy array.
    """
    image = tf.io.read_file(file_path)
    image = tf.image.decode_png(image, channels=3)
    # Assign the resized tensor: the previous `image - tf.image.resize(...)`
    # computed a subtraction and threw the result away, so images were never resized.
    image = tf.image.resize(image, [50, 50])
    return image.numpy()

In [None]:
# Build a 4x4 mosaic of 50x50 patches around a chosen patch.
def get_surrounding_images_with_coords(df, center_idx):
    """Compose the patches surrounding one patch into a single 200x200 image.

    The grid starts one tile above/left of the centre patch and extends two
    tiles below/right of it, so the centre patch occupies grid cell (1, 1).

    Args:
        df: DataFrame with 'X_Coord', 'Y_Coord', 'Cancer_Status' and
            'Full_Path' columns (and optionally 'Patient_Number').
        center_idx: positional index (``df.iloc``) of the centre patch.

    Returns:
        Tuple ``(image_patch, used_coords_labels)``: the uint8 mosaic of shape
        (200, 200, 3) with a green tint on the centre tile, and one
        ``(x, y, cancer_status, 'Original' | 'Espejo')`` entry per grid cell,
        where 'Espejo' marks a cell filled with a flipped nearest neighbour.
    """
    tile = 50        # side length of one patch in pixels
    grid = 4         # the mosaic is grid x grid tiles
    center_cell = 1  # row/column of the centre patch inside the grid

    center_row = df.iloc[center_idx]
    center_x = int(center_row['X_Coord'])
    center_y = int(center_row['Y_Coord'])

    # The lookup below matches on (X, Y) only. Restrict it to the centre patch's
    # patient so a tile from another patient with the same coordinates is never used.
    if 'Patient_Number' in df.columns:
        df = df[df['Patient_Number'] == center_row['Patient_Number']]

    # Top-left coordinates of every grid cell, row by row.
    first_x = center_x - center_cell * tile
    first_y = center_y - center_cell * tile
    patch_coords = [
        (first_x + col * tile, first_y + row * tile)
        for row in range(grid)
        for col in range(grid)
    ]

    # Empty RGB canvas that receives the tiles.
    image_patch = np.zeros((grid * tile, grid * tile, 3), dtype=np.uint8)
    # Coordinates, label and origin ('Original' / 'Espejo') of every tile used.
    used_coords_labels = []

    for i, (x, y) in enumerate(patch_coords):
        row = i // grid
        col = i % grid
        image_df = df[(df['X_Coord'] == x) & (df['Y_Coord'] == y)]
        if image_df.empty:
            # No patch at these coordinates: fall back to the nearest patch
            # by Manhattan distance (argmin is positional, hence iloc).
            nearest_idx = ((df['X_Coord'] - x).abs() + (df['Y_Coord'] - y).abs()).argmin()
            nearest_image_df = df.iloc[nearest_idx]
            image = load_image(nearest_image_df['Full_Path'])
            # Flip the substitute depending on which side of the centre it lies.
            if nearest_image_df['X_Coord'] < center_x:
                image = np.fliplr(image)
            elif nearest_image_df['X_Coord'] > center_x:
                image = np.flipud(image)
            # Record that this cell holds a mirrored substitute.
            used_coords_labels.append((nearest_image_df['X_Coord'], nearest_image_df['Y_Coord'], nearest_image_df['Cancer_Status'], 'Espejo'))
        else:
            # A patch exists at these coordinates: use it as is.
            image = load_image(image_df.iloc[0]['Full_Path'])
            used_coords_labels.append((x, y, image_df.iloc[0]['Cancer_Status'], 'Original'))

        # Place the tile at its grid position in the mosaic.
        image_patch[row * tile:(row + 1) * tile, col * tile:(col + 1) * tile, :] = image

    # Tint the centre tile green. The centre patch is at grid cell (1, 1), i.e.
    # pixels 50..100, not 100..150 (which is the tile diagonally below/right of it).
    start = center_cell * tile
    stop = start + tile
    green_mask = np.full((tile, tile, 3), [0, 255, 0], dtype=np.uint8)
    image_patch[start:stop, start:stop, :] = np.clip(image_patch[start:stop, start:stop, :] + green_mask * 0.2, 0, 255)

    return image_patch, used_coords_labels

# Build the mosaic around one example patch.
# Index 55 (positional, into the sorted DataFrame) is used as the center image.
center_image_idx = 55
patch_image, patch_coords_labels = get_surrounding_images_with_coords(df, center_image_idx)

# Print the coordinates and labels of all tiles used, then display the composite image.
print("Coordinates and labels of the images in the patch:")
for coord_label in patch_coords_labels:
    print(coord_label)


plt.imshow(patch_image)
plt.axis('off')
plt.show()

In [None]:
# Accelerator set-up (Colab): pick the tf.distribute strategy the model is built under.
USO_TPU = True   # connect to and initialise a TPU
USO_GPU = False  # mirror the model across the available GPUs

# Start from the default strategy so `strategy` is always defined, even when
# both flags are False (the model cell uses `strategy.scope()`).
strategy = tf.distribute.get_strategy()

if USO_TPU:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # tf.distribute.TPUStrategy replaces the deprecated experimental alias.
    strategy = tf.distribute.TPUStrategy(tpu)

# Kept as a separate `if`: when both flags are set the GPU strategy wins, as before.
if USO_GPU:
    strategy = tf.distribute.MirroredStrategy()
    print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

In [None]:
# model build up
import keras
from tensorflow.keras.layers import Input, LSTM, Dense, Flatten, Dropout, Concatenate, BatchNormalization, Add, Resizing, Cropping2D, RandomRotation, RandomBrightness, RandomFlip
from tensorflow.keras.models import Model
from tensorflow.keras.applications import ConvNeXtTiny, NASNetMobile, VGG16, EfficientNetV2M, EfficientNetB3
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications.efficientnet_v2 import preprocess_input

from tensorflow.keras.layers import Input, Dense, Flatten, BatchNormalization, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications import VGG16
from tensorflow.keras.applications.efficientnet import preprocess_input
from tensorflow.keras.layers import Input, RandomFlip, RandomRotation, RandomBrightness, GaussianNoise
from tensorflow.keras.applications import EfficientNetB3
from tensorflow.keras.applications.efficientnet import preprocess_input


In [None]:
# Build and compile the classifier inside the distribution strategy scope.
with strategy.scope():
    # input for images: 50x50 pixels, 3 channels (RGB)
    image_input = Input(shape=(50, 50, 3), name='image_input')
    # input layer for the (x, y) coordinates
    # NOTE(review): coords_input is declared as a model input but no layer
    # consumes it, so the coordinates do not influence the prediction —
    # confirm whether it should be concatenated with the image features.
    coords_input = Input(shape=(2,), name='coords_input')
    # data augmentation: random brightness, random flips, additive Gaussian noise
    x = RandomBrightness(0.2)(image_input)
    x = RandomFlip()(x)
    x = GaussianNoise(0.2)(x)
    # preprocessing function that matches the EfficientNet backbone used below
    processed = preprocess_input(x)
    # load the EfficientNetB3 backbone with ImageNet weights and without its classifier head
    base_model = EfficientNetB3(include_top=False, weights='imagenet', input_tensor=processed)
    # fine-tune the whole backbone: every layer is left trainable (nothing is frozen)
    for layer in base_model.layers:
        layer.trainable = True
    # flatten the output of the backbone
    flattened_base_model = Flatten()(base_model.output)
    # dense head: 128 -> 64 -> 32 units with batch normalisation and dropout
    dense1 = Dense(128, activation='relu')(flattened_base_model)
    batch_norm1 = BatchNormalization()(dense1)
    dropout1 = Dropout(0.1)(batch_norm1)

    dense2 = Dense(64, activation='relu')(dropout1)
    batch_norm2 = BatchNormalization()(dense2)
    dropout2 = Dropout(0.2)(batch_norm2)

    dense3 = Dense(32, activation='relu')(dropout2)
    batch_norm3 = BatchNormalization()(dense3)

    # output layer: single sigmoid unit for binary classification
    output = Dense(1, activation='sigmoid')(batch_norm3)

    # The keyword is `inputs` (plural); `input=` is not accepted by Model.
    model = Model(inputs=[image_input, coords_input], outputs=output)

    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])





In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.callbacks import LearningRateScheduler


from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, LearningRateScheduler

In [None]:
# Stop training once the validation loss has not improved for 10 epochs,
# and roll the model back to the best weights seen.
early_stopping = EarlyStopping(
    restore_best_weights=True,
    patience=10,
    monitor='val_loss',
)

In [None]:
# Multiply the learning rate by 0.1 after 5 epochs without validation-loss improvement.
plateau = ReduceLROnPlateau(
    patience=5,
    factor=0.1,
    monitor='val_loss',
)

In [None]:
# custom learning rate scheduler
import math

from tensorflow.keras.callbacks import Callback


class LRScheduler(Callback):
    """Callback that sets the optimizer's learning rate at the start of each epoch.

    Args:
        schedule: callable ``schedule(epoch, lr)`` that receives the zero-based
            epoch index and the current learning rate and returns the learning
            rate to use for that epoch.
    """

    def __init__(self, schedule):
        super().__init__()
        self.schedule = schedule

    def on_epoch_begin(self, epoch, logs=None):
        """Apply the schedule before the epoch starts.

        Raises:
            ValueError: if the optimizer has no ``learning_rate`` attribute.
        """
        optimizer = self.model.optimizer
        # Check the same attribute that is read and written below.
        if not hasattr(optimizer, "learning_rate"):
            raise ValueError('Optimizer must have a "learning_rate" attribute.')
        # get current learning rate from the model
        lr = float(tf.keras.backend.get_value(optimizer.learning_rate))
        # call the schedule function to get the scheduled learning rate
        scheduled_lr = float(self.schedule(epoch, lr))
        # set the value back on the optimizer before the epoch starts
        tf.keras.backend.set_value(optimizer.learning_rate, scheduled_lr)
        print(f"Epoch {epoch+1}: Learning rate is {scheduled_lr}")


# custom scheduler function (module level, not a method of the callback)
def lr_scheduler(epoch, lr):
    """Keep the learning rate for the first 20 epochs, then decay it by exp(-0.1) per epoch."""
    if epoch < 20:
        return lr
    # math.exp returns a plain float; tf.math.exp would return a tensor.
    return lr * math.exp(-0.1)


# instantiate LRScheduler with the schedule function
lr_scheduler_callback = LRScheduler(lr_scheduler)

# class weights for imbalanced classes: errors on class 1 weigh 5x those on class 0
class_weights = {0: 1.0, 1: 5.0}

In [None]:
# Train the model; the returned History object holds the per-epoch metrics.
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=100,  # tune here
    verbose=1,
    class_weight=class_weights,
    callbacks=[early_stopping, plateau, lr_scheduler_callback])

In [None]:
# plot loss
# Training curves: loss on the left, accuracy on the right (when it was recorded).
loss = history.history['loss']
val = history.history['val_loss']

plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
plt.plot(loss, label='Training Loss')
plt.plot(val, label='Validation Loss')
plt.title('Training Loss Curves')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

# The figure is laid out as 1 x 2 but the second slot was left empty; use it
# for the accuracy metric. Guarded so the cell still works if the metric is absent.
if 'accuracy' in history.history:
    plt.subplot(1, 2, 2)
    plt.plot(history.history['accuracy'], label='Training Accuracy')
    if 'val_accuracy' in history.history:
        plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title('Training Accuracy Curves')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()

plt.tight_layout()
plt.show()