In [None]:
import tensorflow as tf
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  # Restrict TensorFlow to only allocate 4GB of memory on the first GPU
  try:
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4096)])
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Virtual devices must be set before GPUs have been initialized
    print(e)

TensorFlow version: 2.10.0
GPUs available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


0

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from pathlib import Path
from sklearn.model_selection import train_test_split


from sklearn.metrics import confusion_matrix, classification_report

# Directories to Images

In [3]:
from PIL import Image
print("Pillow imported successfully!")

Pillow imported successfully!


In [4]:
positive_dir = Path('Dataset/Positive')
negative_dir = Path('Dataset/Negative')

In [5]:
print(f"Positive directory exists: {positive_dir.exists()}")
print(f"Is it a directory: {positive_dir.is_dir()}")

print(f"Negative directory exists: {negative_dir.exists()}")
print(f"Is it a directory: {negative_dir.is_dir()}")

Positive directory exists: True
Is it a directory: True
Negative directory exists: True
Is it a directory: True


# Creating DataFrames

In [6]:
list(positive_dir.glob(r'*.jpg'))

[WindowsPath('Dataset/Positive/00001.jpg'),
 WindowsPath('Dataset/Positive/00002.jpg'),
 WindowsPath('Dataset/Positive/00003.jpg'),
 WindowsPath('Dataset/Positive/00004.jpg'),
 WindowsPath('Dataset/Positive/00005.jpg'),
 WindowsPath('Dataset/Positive/00006.jpg'),
 WindowsPath('Dataset/Positive/00007.jpg'),
 WindowsPath('Dataset/Positive/00008.jpg'),
 WindowsPath('Dataset/Positive/00009.jpg'),
 WindowsPath('Dataset/Positive/00010.jpg'),
 WindowsPath('Dataset/Positive/00011.jpg'),
 WindowsPath('Dataset/Positive/00012.jpg'),
 WindowsPath('Dataset/Positive/00013.jpg'),
 WindowsPath('Dataset/Positive/00014.jpg'),
 WindowsPath('Dataset/Positive/00015.jpg'),
 WindowsPath('Dataset/Positive/00016.jpg'),
 WindowsPath('Dataset/Positive/00017.jpg'),
 WindowsPath('Dataset/Positive/00018.jpg'),
 WindowsPath('Dataset/Positive/00019.jpg'),
 WindowsPath('Dataset/Positive/00020.jpg'),
 WindowsPath('Dataset/Positive/00021.jpg'),
 WindowsPath('Dataset/Positive/00022.jpg'),
 WindowsPath('Dataset/Positive/0

# Dataframe Function

In [7]:
def generate_df(image_dir, label):
    filepaths = pd.Series(list(image_dir.glob(r'*.jpg')), name = 'Filepath').astype(str)
    labels = pd.Series(label, name = 'label', index = filepaths.index)

    #Combining filepaths and labels dataframes
    df = pd.concat([filepaths, labels], axis = 1)
    return df

In [8]:
positive_df = generate_df(positive_dir, label = 'POSITIVE')
negative_df =  generate_df(negative_dir, label = 'NEGATIVE')

#Create a dataframe that is a combination of both positive and negative dataframes
all_df = pd.concat([positive_df, negative_df], axis = 0).sample(frac = 1.0, random_state = 1).reset_index(drop = True)
all_df

Unnamed: 0,Filepath,label
0,Dataset\Positive\03842.jpg,POSITIVE
1,Dataset\Positive\12899_1.jpg,POSITIVE
2,Dataset\Positive\15033_1.jpg,POSITIVE
3,Dataset\Negative\16782.jpg,NEGATIVE
4,Dataset\Positive\09202.jpg,POSITIVE
...,...,...
39995,Dataset\Positive\07814.jpg,POSITIVE
39996,Dataset\Negative\12512.jpg,NEGATIVE
39997,Dataset\Positive\05193.jpg,POSITIVE
39998,Dataset\Positive\12173_1.jpg,POSITIVE


# Splitting data into Training and Test sets

In [9]:
train_df, test_df = train_test_split(
    all_df,
    train_size = 0.7,
    shuffle = True,
    random_state = 1
)

# Loading Image Data

In [10]:

label_map = {'POSITIVE': 1, 'NEGATIVE': 0} # Adjust if your labels are different

if not train_df.empty:
    # Create a true validation set from the original train_df
    # Original train_df (70% of all_df) is split 80% for new_train_df, 20% for val_df_from_train
    train_df_for_tf_dataset, val_df_for_tf_dataset = train_test_split(
        train_df,
        train_size = 0.8,
        shuffle = True,
        random_state = 42
    )

    train_df_for_tf_dataset = train_df_for_tf_dataset.copy()
    val_df_for_tf_dataset = val_df_for_tf_dataset.copy()
    test_df_for_tf_dataset = test_df.copy() # Assuming test_df also needs mapping

    train_df_for_tf_dataset['label_numeric'] = train_df_for_tf_dataset['label'].map(label_map)
    val_df_for_tf_dataset['label_numeric'] = val_df_for_tf_dataset['label'].map(label_map)
    if not test_df_for_tf_dataset.empty:
        test_df_for_tf_dataset['label_numeric'] = test_df_for_tf_dataset['label'].map(label_map)

    print(f"Shape of new training df for tf.data: {train_df_for_tf_dataset.shape}")
    print(f"Shape of new validation df for tf.data: {val_df_for_tf_dataset.shape}")

else:
    print("Original train_df is empty. Training and validation datasets will be empty.")
    # Create empty DataFrames with the necessary column to avoid errors later if you proceed
    train_df_for_tf_dataset = pd.DataFrame(columns=['Filepath', 'label_numeric'])
    val_df_for_tf_dataset = pd.DataFrame(columns=['Filepath', 'label_numeric'])
    # test_df might also be empty if all_df was empty
    if test_df.empty:
        test_df_for_tf_dataset = pd.DataFrame(columns=['Filepath', 'label_numeric'])
    else:
        test_df_for_tf_dataset = test_df.copy()
        test_df_for_tf_dataset['label_numeric'] = test_df_for_tf_dataset['label'].map(label_map)



IMG_HEIGHT = 120
IMG_WIDTH = 120
BATCH_SIZE = 4
CHANNELS = 3

def load_and_preprocess_image(filepath, label):
    # Read the image file
    img = tf.io.read_file(filepath)
    # Decode the image. decode_image tries to infer format.
    # For more control, use tf.image.decode_jpeg or tf.image.decode_png
    img = tf.image.decode_image(img, channels=CHANNELS, expand_animations=False)
    # Resize to the target dimensions
    img = tf.image.resize(img, [IMG_HEIGHT, IMG_WIDTH])
    # Ensure the shape is set if decode_image doesn't set it fully
    img.set_shape([IMG_HEIGHT, IMG_WIDTH, CHANNELS])
    # Rescale pixel values from [0, 255] to [0, 1]
    img = img / 255.0
    return img, label


AUTOTUNE = tf.data.AUTOTUNE # For parallel processing optimization

# Training Dataset
if not train_df_for_tf_dataset.empty:
    train_dataset = tf.data.Dataset.from_tensor_slices(
        (train_df_for_tf_dataset['Filepath'].values, train_df_for_tf_dataset['label_numeric'].astype('float32').values)
    )
    train_dataset = train_dataset.shuffle(buffer_size=len(train_df_for_tf_dataset), seed=1, reshuffle_each_iteration=True)
    train_dataset = train_dataset.map(load_and_preprocess_image, num_parallel_calls=AUTOTUNE)
    train_dataset = train_dataset.batch(BATCH_SIZE)
    train_dataset = train_dataset.cache()  # Cache data after first epoch (if it fits in RAM or for faster disk reads)
    train_dataset = train_dataset.prefetch(buffer_size=AUTOTUNE) # Crucial for performance
    print(f"Training dataset created successfully. Element spec: {train_dataset.element_spec}")
else:
    # Create an empty dataset with the correct spec if source df is empty
    train_dataset = tf.data.Dataset.from_tensor_slices(
        (tf.constant([], dtype=tf.string), tf.constant([], dtype=tf.float32))
    ).map(load_and_preprocess_image, num_parallel_calls=AUTOTUNE).batch(BATCH_SIZE).prefetch(buffer_size=AUTOTUNE)
    print("Empty training dataset created (source DataFrame was empty).")


# Validation Dataset
if not val_df_for_tf_dataset.empty:
    val_dataset = tf.data.Dataset.from_tensor_slices(
        (val_df_for_tf_dataset['Filepath'].values, val_df_for_tf_dataset['label_numeric'].astype('float32').values)
    )
    val_dataset = val_dataset.map(load_and_preprocess_image, num_parallel_calls=AUTOTUNE)
    val_dataset = val_dataset.batch(BATCH_SIZE)
    val_dataset = val_dataset.cache() # Cache validation data as well
    val_dataset = val_dataset.prefetch(buffer_size=AUTOTUNE)
    print(f"Validation dataset created successfully. Element spec: {val_dataset.element_spec}")
else:
    val_dataset = tf.data.Dataset.from_tensor_slices(
        (tf.constant([], dtype=tf.string), tf.constant([], dtype=tf.float32))
    ).map(load_and_preprocess_image, num_parallel_calls=AUTOTUNE).batch(BATCH_SIZE).prefetch(buffer_size=AUTOTUNE)
    print("Empty validation dataset created (source DataFrame was empty).")


# Test Dataset
if not test_df_for_tf_dataset.empty and 'label_numeric' in test_df_for_tf_dataset.columns:
    test_dataset = tf.data.Dataset.from_tensor_slices(
        (test_df_for_tf_dataset['Filepath'].values, test_df_for_tf_dataset['label_numeric'].astype('float32').values)
    )
    test_dataset = test_dataset.map(load_and_preprocess_image, num_parallel_calls=AUTOTUNE)
    test_dataset = test_dataset.batch(BATCH_SIZE)
    test_dataset = test_dataset.cache()
    test_dataset = test_dataset.prefetch(buffer_size=AUTOTUNE)
    print(f"Test dataset created successfully. Element spec: {test_dataset.element_spec}")
else:
    test_dataset = tf.data.Dataset.from_tensor_slices(
        (tf.constant([], dtype=tf.string), tf.constant([], dtype=tf.float32))
    ).map(load_and_preprocess_image, num_parallel_calls=AUTOTUNE).batch(BATCH_SIZE).prefetch(buffer_size=AUTOTUNE)
    print("Empty test dataset created (source DataFrame was empty or 'label_numeric' missing).")

print("--- tf.data.Dataset Pipelines Prepared ---")


Shape of new training df for tf.data: (22400, 3)
Shape of new validation df for tf.data: (5600, 3)
Training dataset created successfully. Element spec: (TensorSpec(shape=(None, 120, 120, 3), dtype=tf.float32, name=None), TensorSpec(shape=(None,), dtype=tf.float32, name=None))
Validation dataset created successfully. Element spec: (TensorSpec(shape=(None, 120, 120, 3), dtype=tf.float32, name=None), TensorSpec(shape=(None,), dtype=tf.float32, name=None))
Test dataset created successfully. Element spec: (TensorSpec(shape=(None, 120, 120, 3), dtype=tf.float32, name=None), TensorSpec(shape=(None,), dtype=tf.float32, name=None))
--- tf.data.Dataset Pipelines Prepared ---


# Defining and Training the Convolutional Neural Network

In [None]:
#Defining Input Layer
inputs = tf.keras.Input(shape = (120, 120, 3))  #Defines the shape of the input samples and the number of colour channels

x = tf.keras.layers.Conv2D(filters = 16, kernel_size = (3, 3), activation = 'relu')(inputs)
x = tf.keras.layers.MaxPool2D(pool_size = (2, 2))(x)

x = tf.keras.layers.Conv2D(filters = 32, kernel_size = (3, 3), activation = 'relu')(x)
x = tf.keras.layers.MaxPool2D(pool_size = (2, 2))(x)

x = tf.keras.layers.GlobalAveragePooling2D()(x)

#Defining Output Layer
outputs = tf.keras.layers.Dense(1, activation = 'sigmoid')(x)

model = tf.keras.Model(inputs = inputs, outputs = outputs)

model.compile(
    optimizer = 'adam',
    loss = 'binary_crossentropy',
    metrics = ['accuracy']
)

print(model.summary())

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 120, 120, 3)]     0         
                                                                 
 conv2d (Conv2D)             (None, 118, 118, 16)      448       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 59, 59, 16)       0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 57, 57, 32)        4640      
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 28, 28, 32)       0         
 2D)                                                             
                                                                 
 global_average_pooling2d (G  (None, 32)               0     

: 

In [None]:
history = model.fit(
    train_dataset,
    validation_data = val_dataset,
    epochs = 100,
    callbacks = [            #Functions to run at the start or end of each epoch
        tf.keras.callbacks.EarlyStopping(
            monitor = 'val_loss',
            patience = 10,       #If val_loss doesn't increase for 3 epochs, training will be halted
            restore_best_weights = True      #Sets the model's weights to that of the epoch that had the lowest loss
        )
    ]
)

Epoch 1/100

In [None]:
fig = px.line(
    history.history,
    y = ['loss', 'val_loss'],
    labels = {'index' : "Epoch", 'value' : "Loss"},
    title = "Training and Validation Loss over Time"
)

fig.show()

In [None]:

def evaluate_model(model, test_data_input): # Renamed parameter to avoid confusion

    # --- 1. Evaluate Model to get Loss and Accuracy ---
    print("Evaluating model...")
    results = model.evaluate(test_data_input) # Use the input parameter
    loss = results[0]
    accuracy = results[1]

    print(f'Test Loss: {loss:.5f}')
    print(f'Test Accuracy: {accuracy * 100:.2f}%')

    # --- 2. Get Model Predictions ---
    print("Generating predictions...")
    raw_predictions = model.predict(test_data_input) # Use the input parameter
    y_pred = np.squeeze((raw_predictions >= 0.5).astype(int))
    print(f"Generated {len(y_pred)} predictions.")

    # --- 3. Get True Labels ---
    true_labels = []
    print("Extracting true labels from test_data_input...")
    # Iterate through the test_data_input to get all true labels.
    for images_batch, labels_batch in test_data_input: # Use the input parameter
        true_labels.extend(labels_batch.numpy())
    true_labels = np.array(true_labels)
    print(f"Extracted {len(true_labels)} true labels.")

    # Ensure lengths match (can sometimes be an issue with the last batch if not handled perfectly)
    if len(true_labels) != len(y_pred):
        print(f"Warning: Mismatch in number of true labels ({len(true_labels)}) and predictions ({len(y_pred)}).")
        # Adjusting the longer array to the shorter length for metrics calculation
        min_len = min(len(true_labels), len(y_pred))
        true_labels = true_labels[:min_len]
        y_pred = y_pred[:min_len]
        print(f"Adjusted lengths to {min_len} for metrics calculation.")


    # --- 4. Calculate and Display Confusion Matrix ---
    print("Calculating confusion matrix...")
    cm = confusion_matrix(true_labels, y_pred) # Use the extracted 'true_labels'

    # --- 5. Calculate and Display Classification Report ---
    print("Calculating classification report...")
    target_names = ['Negative', 'Positive'] # Ensure these match your label encoding
    clr = classification_report(true_labels, y_pred, target_names=target_names) # Use 'true_labels'

    # --- 6. Plot Confusion Matrix ---
    plt.figure(figsize=(6, 6))
    sns.heatmap(cm, annot=True, fmt='g', vmin=0, cmap='Blues', cbar=False)
    plt.xticks(ticks=np.arange(len(target_names)) + 0.5, labels=target_names)
    plt.yticks(ticks=np.arange(len(target_names)) + 0.5, labels=target_names)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.show()

    # --- 7. Print Classification Report ---
    print('Classification Report:\n----------------------------------------------------------------')
    print(clr)

In [None]:
evaluate_model(model, test_dataset)