# Imports

In [1]:
import os
from PIL import Image
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split

# Data Loading

In [2]:
def display_image(image_path):
    """Open the image at ``image_path`` and show it in the default viewer.

    Parameters
    ----------
    image_path : str
        Path of the image file to display.
    """
    # The context manager releases the file handle once the image has been
    # handed to the viewer, instead of leaving it open until garbage collection.
    with Image.open(image_path) as img:
        img.show()

# Directories holding the images labelled "yes" and "no" respectively.
yes_path, no_path = "./data/yes", "./data/no"

In [3]:
def create_dataframe(yes_path, no_path):
    """Build a table of image file paths labelled by tumor presence.

    Parameters
    ----------
    yes_path : str
        Directory whose files are labelled ``"yes"``.
    no_path : str
        Directory whose files are labelled ``"no"``.

    Returns
    -------
    pandas.DataFrame
        One row per regular file, with columns ``"image_path"`` and
        ``"tumor_present"``. The ``"yes"`` rows come first and each group is
        sorted by file name. Both columns exist even when no file is found.
    """
    records = []
    for directory, label in ((yes_path, "yes"), (no_path, "no")):
        # os.listdir returns entries in arbitrary order; sorting keeps the row
        # order (and any seeded shuffle/split applied later) reproducible.
        for file_name in sorted(os.listdir(directory)):
            image_path = os.path.join(directory, file_name)
            # Sub-directories cannot be images, so leave them out of the table.
            if not os.path.isfile(image_path):
                continue
            records.append({"image_path": image_path, "tumor_present": label})

    # Naming the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(records, columns=["image_path", "tumor_present"])

## Train, Test, Validation Splits

In [4]:
def create_train_test_val_splits(df, test_size=0.2, val_size=0.1,
                                 random_state=42, stratify_col=None):
    """Split ``df`` into training, validation and test subsets.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to split.
    test_size : float, default 0.2
        Fraction of ``df`` assigned to the test set.
    val_size : float, default 0.1
        Fraction of ``df`` (not of the remainder) assigned to the validation set.
    random_state : int, default 42
        Seed passed to both ``train_test_split`` calls.
    stratify_col : str or None, default None
        When given, both splits are stratified on this column of ``df``.
        ``None`` keeps the plain, unstratified split.

    Returns
    -------
    tuple
        ``(train_df, val_df, test_df)``.

    Raises
    ------
    ValueError
        If either fraction is outside (0, 1) or the two fractions leave no
        room for a training set.
    """
    # Fail early with a clear message instead of deep inside the second split.
    if not (0 < test_size < 1 and 0 < val_size < 1):
        raise ValueError(
            f"test_size and val_size must be fractions in (0, 1); "
            f"got test_size={test_size}, val_size={val_size}"
        )
    if test_size + val_size >= 1:
        raise ValueError(
            f"test_size + val_size must be < 1 to leave training data; "
            f"got {test_size} + {val_size}"
        )

    # First, carve the test set off the full table.
    train_val_df, test_df = train_test_split(
        df,
        test_size=test_size,
        random_state=random_state,
        stratify=None if stratify_col is None else df[stratify_col],
    )

    # val_size is expressed relative to the full table, so rescale it to the
    # fraction of the remaining (train + validation) rows.
    adjusted_val_size = val_size / (1 - test_size)

    # Then split the remainder into training and validation sets.
    train_df, val_df = train_test_split(
        train_val_df,
        test_size=adjusted_val_size,
        random_state=random_state,
        stratify=None if stratify_col is None else train_val_df[stratify_col],
    )

    return train_df, val_df, test_df

In [7]:
# Build the full image/label table, then shuffle it. The shuffle is seeded so
# that the seeded train/validation/test split applied afterwards yields the
# same partitions on every run.
total_data = create_dataframe(yes_path, no_path)
total_data = total_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [9]:
# Split the table into training, validation and test subsets using the
# default proportions of create_train_test_val_splits.
train_data, val_data, test_data = create_train_test_val_splits(total_data)

# Report how many rows ended up in each subset
print(f"Training Set: {len(train_data)}")
print(f"Validation Set: {len(val_data)}")
print(f"Test Set: {len(test_data)}")

# Show the first few training rows as a quick sanity check
print(train_data.head())

Training Set: 2100
Validation Set: 300
Test Set: 600
               image_path tumor_present
921   ./data/no/no761.jpg            no
339   ./data/yes/y312.jpg           yes
1984  ./data/no/no373.jpg            no
2439  ./data/no/no555.jpg            no
259   ./data/yes/y118.jpg           yes


The following guide was used as a reference for building the CNN: https://www.analyticsvidhya.com/blog/2021/01/image-classification-using-convolutional-neural-networks-a-step-by-step-guide/

# Make images the same size and normalize

In [None]:
from PIL import Image, ImageOps
import numpy as np

# Running tallies updated by preprocess_image:
#   all_images            - every image it was asked to process
#   excluded_images_count - images it rejected (returned None for)
all_images = 0
excluded_images_count = 0

def preprocess_image(image_path, target_size):
    """Load an image, crop/resize it to ``target_size`` and divide it by 255.

    Parameters
    ----------
    image_path : str
        Path of the image file to load.
    target_size : tuple of int
        Requested output size as ``(width, height)``, the order expected by
        ``ImageOps.fit``.

    Returns
    -------
    numpy.ndarray or None
        Array of shape ``(height, width, 3)`` holding the pixel values divided
        by 255, or ``None`` when the file cannot be processed or the resized
        image does not have exactly three channels.

    Notes
    -----
    Increments the module-level ``all_images`` on every call and
    ``excluded_images_count`` whenever ``None`` is returned.
    """
    global excluded_images_count
    global all_images
    all_images += 1

    try:
        # The context manager closes the file handle even if resizing fails.
        with Image.open(image_path) as source:
            fitted = ImageOps.fit(source, target_size, Image.Resampling.LANCZOS)
        pixels = np.array(fitted)

        # NumPy lays an image out as (rows, cols, channels), i.e.
        # (height, width, 3), whereas target_size is (width, height). Comparing
        # in the wrong order would reject every image for non-square targets.
        expected_shape = (target_size[1], target_size[0], 3)
        if pixels.shape != expected_shape:
            # Anything without exactly three channels is excluded.
            # NOTE(review): if such images should be kept instead, convert
            # with img.convert("RGB") before building the array.
            excluded_images_count += 1
            return None

        return pixels / 255.0  # Normalize pixel values
    except Exception as e:
        # Best effort: report the unreadable file and count it as excluded
        # rather than aborting the whole dataset build.
        print(f"Error processing image {image_path}: {e}")
        excluded_images_count += 1
        return None

def encode_label(label):
    """Map the label ``'yes'`` to 1 and every other value to 0."""
    if label == 'yes':
        return 1
    return 0

def prepare_dataset(df, target_size=(224, 224)):
    """Turn a table of image paths and labels into model-ready arrays.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``'image_path'`` and ``'tumor_present'``.
    target_size : tuple of int, default (224, 224)
        Size forwarded to ``preprocess_image`` for every image.

    Returns
    -------
    tuple of numpy.ndarray
        ``(images, labels)``. ``images`` stacks every successfully
        preprocessed image along a new first axis; ``labels`` holds the
        matching ``encode_label`` values as int64. Rows whose image was
        rejected by ``preprocess_image`` are dropped from both arrays.
    """
    images = []
    labels = []

    # Walk paths and labels in lockstep so that every kept image stays paired
    # with its own label, whatever the DataFrame's index looks like.
    for image_path, label in zip(df['image_path'], df['tumor_present']):
        pixels = preprocess_image(image_path, target_size)
        if pixels is None:
            continue
        images.append(pixels)
        labels.append(encode_label(label))

    if not images:
        # Keep the array rank stable when nothing survives, reading
        # target_size as (width, height).
        width, height = target_size
        return np.empty((0, height, width, 3)), np.empty((0,), dtype=np.int64)

    return np.array(images), np.array(labels, dtype=np.int64)