# Imports

In [1]:
import os
from PIL import Image
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split

# Data Loading

In [2]:
def display_image(image_path):
    """Open the image at ``image_path`` and show it in the default viewer.

    Args:
        image_path: Path of the image file to display.
    """
    # The context manager closes the underlying file once the viewer has
    # been handed the image, instead of leaving that to garbage collection.
    with Image.open(image_path) as img:
        img.show()

# Folders scanned for the tumor-positive ("yes") and tumor-negative ("no") images.
yes_path, no_path = './data/yes', "./data/no"

In [3]:
def create_dataframe(yes_path, no_path):
    """Build a table of image paths and their tumor labels.

    Args:
        yes_path: Folder whose files are labelled ``"yes"``.
        no_path: Folder whose files are labelled ``"no"``.

    Returns:
        A DataFrame with the columns ``image_path`` and ``tumor_present``.
        Rows for ``yes_path`` come first, each folder's files in sorted
        order. The two columns are present even when both folders are empty.
    """
    records = []

    for folder, label in ((yes_path, "yes"), (no_path, "no")):
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # resulting frame identical from run to run.
        for name in sorted(os.listdir(folder)):
            full_path = os.path.join(folder, name)
            # Sub-directories are not images, so do not record them.
            if not os.path.isfile(full_path):
                continue
            records.append({
                "image_path": full_path,
                "tumor_present": label
            })

    # Naming the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(records, columns=["image_path", "tumor_present"])

## Train, Test, Validation Splits

In [4]:
def create_train_test_val_splits(df, test_size=0.2, val_size=0.1):
    """Split ``df`` into training, validation and test frames.

    Args:
        df: The full dataset.
        test_size: Share of ``df`` reserved for the test frame.
        val_size: Share of ``df`` reserved for the validation frame.

    Returns:
        A ``(train_df, val_df, test_df)`` tuple.
    """
    seed = 42

    # Carve the test frame off first; what is left feeds train + validation.
    remainder_df, test_df = train_test_split(df, test_size=test_size, random_state=seed)

    # val_size is expressed relative to the full dataset, so rescale it to the
    # smaller frame that remains after the test rows were removed.
    val_share_of_remainder = val_size / (1 - test_size)

    train_df, val_df = train_test_split(
        remainder_df,
        test_size=val_share_of_remainder,
        random_state=seed,
    )

    return train_df, val_df, test_df

In [7]:
total_data = create_dataframe(yes_path, no_path)
# Seed the shuffle: the later splits use a fixed random_state, but they are
# only reproducible if the row order they start from is reproducible too.
total_data = total_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [9]:
# Split the full table into training, validation and test frames
train_data, val_data, test_data = create_train_test_val_splits(total_data)

# Report how many rows landed in each frame, one line per frame
print(
    f"Training Set: {len(train_data)}",
    f"Validation Set: {len(val_data)}",
    f"Test Set: {len(test_data)}",
    sep="\n",
)

# Peek at the first training rows
print(train_data.head())

Training Set: 2100
Validation Set: 300
Test Set: 600
               image_path tumor_present
921   ./data/no/no761.jpg            no
339   ./data/yes/y312.jpg           yes
1984  ./data/no/no373.jpg            no
2439  ./data/no/no555.jpg            no
259   ./data/yes/y118.jpg           yes


The CNN below follows this step-by-step guide: https://www.analyticsvidhya.com/blog/2021/01/image-classification-using-convolutional-neural-networks-a-step-by-step-guide/

# Make images the same size and normalize

In [None]:
from PIL import Image, ImageOps
import numpy as np

# Module-level counters: how many images were dropped, and how many were
# attempted in total. Both are updated by preprocess_image.
excluded_images_count = 0
all_images = 0

def preprocess_image(image_path, target_size):
    """Load one image, fit it to ``target_size`` and scale pixels to [0, 1].

    Every call increments ``all_images``; every call that returns ``None``
    also increments ``excluded_images_count``.

    Args:
        image_path: Path of the image file.
        target_size: ``(width, height)`` in pixels, as taken by
            ``ImageOps.fit``.

    Returns:
        An array of shape ``(height, width, 3)`` with values divided by 255,
        or ``None`` when the image cannot be read or does not end up with the
        expected shape.
    """
    global excluded_images_count
    global all_images
    all_images += 1

    try:
        # Close the file as soon as the pixel data has been read.
        with Image.open(image_path) as source:
            # Convert grayscale / palette / RGBA inputs to three channels so
            # they are kept instead of being rejected by the shape check.
            rgb = source.convert("RGB")
            fitted = ImageOps.fit(rgb, target_size, Image.Resampling.LANCZOS)  # Resizing with LANCZOS
        pixels = np.array(fitted)
        # PIL sizes are (width, height) while the array is (height, width,
        # channels), so the two target dimensions swap places here.
        expected_shape = (target_size[1], target_size[0], 3)
        if pixels.shape != expected_shape:
            excluded_images_count += 1
            return None
        return pixels / 255.0  # Normalize pixel values
    except Exception as e:
        # Best effort: report the unreadable file, count it, and carry on.
        print(f"Error processing image {image_path}: {e}")
        excluded_images_count += 1
        return None

def encode_label(label):
    """Map the label ``'yes'`` to 1 and every other value to 0."""
    if label == 'yes':
        return 1
    return 0

def prepare_dataset(df, target_size=(224, 224)):
    """Turn a frame of image paths and labels into model-ready arrays.

    Args:
        df: Frame with the columns ``image_path`` and ``tumor_present``.
        target_size: Size handed to ``preprocess_image`` for every image.

    Returns:
        ``(images, labels)`` as NumPy arrays. Rows whose image was rejected
        by ``preprocess_image`` appear in neither array.
    """
    loaded = df['image_path'].apply(lambda path: preprocess_image(path, target_size))
    # Rejected images come back as None; drop them here.
    usable = loaded.dropna()
    # Keep only the labels whose image survived, in the same order.
    encoded = df['tumor_present'][usable.index].apply(encode_label)
    features = np.array(usable.tolist())
    targets = np.array(encoded)
    return features, targets

In [None]:
#Function to visualize some sample images
import matplotlib.pyplot as plt

def show_sample_images(data, preprocess_func, num_images=5, target_size=(224, 224)):
    """Plot the first few images of ``data`` in one row, titled by label.

    Args:
        data: Frame with the columns ``image_path`` and ``tumor_present``.
        preprocess_func: Callable taking ``(image_path, target_size)`` and
            returning an image array, or ``None`` for an unusable image.
        num_images: Maximum number of images to show.
        target_size: Size passed through to ``preprocess_func``.
    """
    # Never ask for more rows than the frame holds.
    count = min(num_images, len(data))
    plt.figure(figsize=(10, 10))
    for i in range(count):
        row = data.iloc[i]
        plt.subplot(1, count, i + 1)
        img = preprocess_func(row['image_path'], target_size)
        if img is None:
            # The preprocessor rejected this file; mark the slot instead of
            # passing None to imshow, which would raise.
            plt.text(0.5, 0.5, "excluded", ha="center", va="center")
        else:
            plt.imshow(img)
        plt.title(row['tumor_present'])
        plt.axis("off")

# Prepare Data and Visualize some samples

In [None]:
# Start from zero so that re-running this cell does not add to the totals
# left over from an earlier run.
excluded_images_count = 0
all_images = 0

X_train, y_train = prepare_dataset(train_data)
X_val, y_val = prepare_dataset(val_data)
X_test, y_test = prepare_dataset(test_data)

# Report the counters before plotting: show_sample_images calls
# preprocess_image again, which would otherwise inflate both totals.
print(f"Total images excluded: {excluded_images_count}")
print(f"all images : {all_images}")

show_sample_images(train_data, preprocess_image)

# Custom Model

In [None]:
# Two convolution + max-pooling stages, then dropout and a small dense head
# that ends in a single sigmoid unit.
model = Sequential([
    Conv2D(32, (3, 3), padding='same', activation=tf.nn.relu, input_shape=(224, 224, 3)),
    MaxPooling2D((2, 2), strides=2),
    Conv2D(32, (3, 3), padding='same', activation=tf.nn.relu),
    MaxPooling2D((2, 2), strides=2),
    Dropout(0.5),
    Flatten(),
    Dense(128, activation=tf.nn.relu),
    Dense(1, activation='sigmoid'),
])

# Compile for binary classification
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train, scoring the validation set after every epoch
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=5,
    batch_size=16,
    verbose=1,
)

# One panel per metric: (training key, validation key, training label,
# validation label, panel title)
curve_panels = [
    ('accuracy', 'val_accuracy', 'Training Accuracy', 'Validation Accuracy', 'Accuracy over Epochs'),
    ('loss', 'val_loss', 'Training Loss', 'Validation Loss', 'Loss over Epochs'),
]

plt.figure(figsize=(12, 4))
for position, (train_key, val_key, train_label, val_label, panel_title) in enumerate(curve_panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.legend()
    plt.title(panel_title)

# Visualize Model

In [None]:
conv_layers = [layer for layer in model.layers if isinstance(layer, tf.keras.layers.Conv2D)]

for layer in conv_layers:
    # The kernel is the first weight array; indexing it (rather than unpacking
    # kernel and bias together) also works for layers built without a bias.
    filters = layer.get_weights()[0]
    f_min, f_max = filters.min(), filters.max()
    value_range = f_max - f_min
    if value_range > 0:
        filters = (filters - f_min) / value_range  # normalize filter values
    else:
        # All weights equal: avoid dividing by zero.
        filters = np.zeros_like(filters)

    # Show at most 6 filters and, for each, at most its first 3 input
    # channels. Later conv layers have more input channels than the RGB
    # input; only the first three are drawn.
    n_filters = min(6, filters.shape[3])
    n_channels = min(3, filters.shape[2])

    ix = 1
    plt.figure(figsize=(8, 8))
    for i in range(n_filters):
        f = filters[:, :, :, i]
        for j in range(n_channels):
            ax = plt.subplot(n_filters, n_channels, ix)
            ax.set_xticks([])
            ax.set_yticks([])
            ax.imshow(f[:, :, j], cmap='gray')
            ix += 1
    # Label the figure so the per-layer plots can be told apart.
    plt.suptitle(layer.name)
    plt.show()


In [None]:
# Model interpretability with Grad-CAM (uses gradients from the trained model to generate heatmaps)

import matplotlib.cm as cm
from tf_keras_vis.gradcam import Gradcam
from tf_keras_vis.utils import normalize

def make_gradcam_heatmap(img_array, model, last_conv_layer_name, pred_index=None):
    """Compute a normalized Grad-CAM heatmap for a batch of images.

    Args:
        img_array: Batch of preprocessed images fed to ``model``.
        model: The trained Keras model to explain.
        last_conv_layer_name: Name or index of the layer handed to
            tf-keras-vis as ``penultimate_layer``. ``None`` falls back to
            ``-1`` (the last layer).
        pred_index: Index of the output unit to explain. ``None`` scores the
            whole model output.

    Returns:
        The heatmap returned by ``Gradcam`` after ``normalize``.
    """
    def score(output):
        # Compare against None explicitly so that output unit 0 can be chosen,
        # and index the unit axis rather than the batch axis.
        if pred_index is None:
            return output
        return output[:, pred_index]

    # NOTE(review): per the tf-keras-vis docs, when the given layer is not a
    # convolution the library searches backwards from it for one - confirm
    # against the installed version.
    penultimate_layer = -1 if last_conv_layer_name is None else last_conv_layer_name

    gradcam = Gradcam(model, model_modifier=None, clone=False)
    cam = gradcam(
        score,
        img_array,
        penultimate_layer=penultimate_layer,
    )
    heatmap = normalize(cam)
    return heatmap

# Hand-picked files to show next to their Grad-CAM heatmaps: four from the
# "yes" folder and one from the "no" folder.
image_paths = [
    './data/yes/y1471.jpg',
    './data/yes/y1434.jpg',
    './data/yes/y624.jpg',
    './data/yes/y392.jpg',
    './data/no/no1094.jpg'
]

import matplotlib.pyplot as plt

def show_images_with_heatmaps(image_paths, model, preprocess_func, last_conv_layer_name):
    """Plot each image next to the same image overlaid with its heatmap.

    One row is drawn per entry of ``image_paths``: the preprocessed image on
    the left, the heatmap overlay on the right.

    Args:
        image_paths: Paths of the images to show.
        model: Trained model passed on to ``make_gradcam_heatmap``.
        preprocess_func: Callable taking ``(image_path, target_size)`` and
            returning an image array, or ``None`` for an unusable image.
        last_conv_layer_name: Passed on to ``make_gradcam_heatmap``.
    """
    # Size the grid from the input instead of assuming exactly five images.
    n_rows = len(image_paths)
    plt.figure(figsize=(10, 2 * max(n_rows, 1)))

    for i, image_path in enumerate(image_paths):
        file_name = os.path.basename(image_path)

        # Original image
        original_img = preprocess_func(image_path, (224, 224))
        ax = plt.subplot(n_rows, 2, 2 * i + 1)
        ax.axis('off')
        if original_img is None:
            # The preprocessor rejected the file; label the empty slot and
            # skip the heatmap rather than crashing in imshow.
            ax.set_title(f"Original - {file_name} (could not be loaded)")
            continue
        ax.imshow(original_img)
        ax.set_title(f"Original - {file_name}")

        # Heatmap drawn semi-transparently on top of the same image
        img_array = np.array([original_img])
        heatmap = make_gradcam_heatmap(img_array, model, last_conv_layer_name)
        ax = plt.subplot(n_rows, 2, 2 * i + 2)
        ax.imshow(original_img)
        ax.imshow(heatmap.squeeze(), cmap='jet', alpha=0.5)
        ax.set_title("Heatmap")
        ax.axis('off')

    plt.tight_layout()
    plt.show()


# Look up the model's last Conv2D layer and pass its name, instead of the
# hard-coded index 3 (which is a MaxPooling2D layer in this model).
last_conv_layer = [layer for layer in model.layers if isinstance(layer, tf.keras.layers.Conv2D)][-1]
show_images_with_heatmaps(image_paths, model, preprocess_image, last_conv_layer.name)


In [None]:
# Score the trained model on the held-out test arrays and report both numbers
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(
    f"Test Loss: {test_loss}",
    f"Test Accuracy: {test_accuracy}",
    sep="\n",
)