# DSC 180B CNN Notebook (Kaggle Dataset)

### Importing Needed Libraries

In [1]:
!pip uninstall opencv-python-headless -y 
!pip uninstall opencv-python -y
!pip uninstall opencv-contrib-python -y
!pip install opencv-python-headless
!pip install kaggle

[0mCollecting opencv-python-headless
  Using cached opencv-python-headless-4.11.0.86.tar.gz (95.2 MB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: opencv-python-headless
  Building wheel for opencv-python-headless (pyproject.toml) ... [?25l\^C
[?25canceled
[31mERROR: Operation cancelled by user[0m[31m
[0mCollecting kaggle
  Obtaining dependency information for kaggle from https://files.pythonhosted.org/packages/83/a9/3208f2007dd57d47329f766b3813bb61975170a7ed52627a338566a6c490/kaggle-1.7.4-py3-none-any.whl.metadata
  Using cached kaggle-1.7.4-py3-none-any.whl.metadata (17 kB)
Using cached kaggle-1.7.4-py3-none-any.whl (173 kB)
Installing collected packages: kaggle
Successfully installed kaggle-1.7.4


In [3]:
import json
import pandas as pd
import matplotlib.pylab as plt
import numpy as np
import torch
import cv2
import tensorflow as tf
from sklearn.model_selection import train_test_split
import os
from pathlib import Path
import zipfile
from torchvision import models
import torch.nn as nn
from tqdm import tqdm
import torchvision.transforms as transforms
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications.resnet50 import preprocess_input
from PIL import Image

In [None]:
# Paths for storing images
local_raw_download_path = '../../data/raw/'
local_processed_download_path = '../../data/processed/'
os.makedirs(os.path.join(local_raw_download_path, 'train'), exist_ok=True)
os.makedirs(os.path.join(local_raw_download_path, 'val'), exist_ok=True)
os.makedirs(os.path.join(local_raw_download_path, 'test'), exist_ok=True)

os.makedirs(local_processed_download_path, exist_ok=True)
os.makedirs(os.path.join(local_processed_download_path, 'train'), exist_ok=True)
os.makedirs(os.path.join(local_processed_download_path, 'val'), exist_ok=True)
os.makedirs(os.path.join(local_processed_download_path, 'test'), exist_ok=True)

In [5]:
def setup_kaggle(username: str, key: str):
    """
    Configures Kaggle API access and downloads the Chest X-ray Pneumonia dataset.
    :param username: Kaggle username
    :param key: Kaggle API key
    """
    # Define the kaggle credentials path
    kaggle_dir = Path.home() / ".kaggle"  # Use the user's home directory
    kaggle_dir.mkdir(parents=True, exist_ok=True)
    kaggle_json_path = kaggle_dir / "kaggle.json"
    
    # Write Kaggle credentials
    kaggle_credentials = {"username": username, "key": key}
    with open(kaggle_json_path, "w") as f:
        json.dump(kaggle_credentials, f)
    
    # Set permissions for security
    os.chmod(kaggle_json_path, 0o600)
    
    # Ensure kaggle CLI is in the path
    os.environ["KAGGLE_CONFIG_DIR"] = str(kaggle_dir)
        
    # Download the dataset
    dataset_path = Path("chest_xray_pneumonia")
    dataset_path.mkdir(exist_ok=True)

    os.system(f"kaggle datasets download -d paultimothymooney/chest-xray-pneumonia -p {dataset_path} --unzip")
    
    print("Dataset downloaded successfully!")
    
    # Define the unpack path
    unpack_path = Path("../../data/raw")
    unpack_path.mkdir(parents=True, exist_ok=True)
    
    # Unpack the zipped dataset into the specified path
    zip_file = dataset_path / "chest-xray-pneumonia.zip"
    
    if zip_file.exists():
        with zipfile.ZipFile(zip_file, 'r') as zip_ref:
            zip_ref.extractall(unpack_path)
        print(f"Dataset unpacked successfully to {unpack_path}")
        
        # Delete the zip file after unpacking
        zip_file.unlink()
        print("Zip file deleted after unpacking.")
    else:
        print("Zipped dataset not found!")

# Get user input for Kaggle credentials
username = input("Enter your Kaggle Username: ")
key = input("Enter your generated Kaggle API key: ")

setup_kaggle(username, key)

Protocol message DatasetInfo has no "info" field.
Dataset downloaded successfully!
Zipped dataset not found!


In [None]:
def preprocess_img(img, img_size=256, crop_size=224, is_train=True):
    # Resize and crop
    img = tf.image.resize(img, (img_size, img_size))
    img = tf.image.central_crop(img, crop_size / img_size)

    if is_train:
        # Random horizontal flip
        img = tf.image.random_flip_left_right(img)
    
        # Random brightness and contrast
        img = tf.image.random_brightness(img, max_delta=0.25)
        img = tf.image.random_contrast(img, lower=0.75, upper=1.25)
    
        # Random affine transformation (rotation & translation)
        def random_affine(img):
            transform = transforms.RandomAffine(
                degrees=15, translate=(0.1, 0.1), scale=(0.9, 1.1)
            )
            img = img.numpy().astype(np.uint8)  # Ensure the image is in uint8 format
            img_pil = Image.fromarray(img)
            img_pil = transform(img_pil)  # Apply affine transform
            img = np.array(img_pil).astype(np.float32)  # Convert back to float32
            return img
        
        img = random_affine(img)
        
    img = preprocess_input(img)
    
    return img

In [None]:
train_df = {'filename': [], 'label': []}
val_df = {'filename': [], 'label': []}
test_df = {'filename': [], 'label': []}

def extract_label_from_filename(filename):
    """Extracts label from filename based on known keywords."""
    if "bacteria" in filename:
        return "bacteria"
    elif "virus" in filename:
        return "virus"
    else:
        return "normal"

for folder_prefix, dataset in zip(["train/", "val/", "test/"], [train_df, val_df, test_df]):
    for file in os.listdir(os.path.join("../../data/raw/", folder_prefix)):
        filename = file.split("/")[-1]
        label = extract_label_from_filename(filename)
        dataset['filename'].append(filename)
        dataset['label'].append(label)

        # Define local paths for raw and processed data
        processed_local_folder = os.path.join("../../data/processed", folder_prefix)

        # Create the directories if they don't exist
        os.makedirs(processed_local_folder, exist_ok=True)

        # Load and process the image
        img = tf.io.read_file(file)
        
        img = tf.image.decode_jpeg(img, channels=3)
        if img is None:
            print(f"Failed to load image: {file}")
            continue

        is_train = (folder_prefix == "train/")
            
        img = preprocess_img(img, 256, 224, is_train)

        img = img.numpy() if isinstance(img, tf.Tensor) else img

        # Save processed image
        processed_image_path = os.path.join(processed_local_folder, filename)
        cv2.imwrite(processed_image_path, img)
                    
print("Completed fetching all objects.")
print(f"Train: {len(train_df['filename'])} images")
print(f"Validation: {len(val_df['filename'])} images")
print(f"Test: {len(test_df['filename'])} images")

In [None]:
train_datagen = ImageDataGenerator()
val_datagen = ImageDataGenerator()
test_datagen = ImageDataGenerator()

train_df = pd.DataFrame(train_df)
val_df = pd.DataFrame(val_df)
test_df = pd.DataFrame(test_df)

train_df.head()

In [None]:
# Converting to binary labels
train_df['binary_label'] = train_df['label'].apply(lambda x: '0' if x == 'normal' else '1')
val_df['binary_label'] = val_df['label'].apply(lambda x: '0' if x == 'normal' else '1')
test_df['binary_label'] = test_df['label'].apply(lambda x: '0' if x == 'normal' else '1')

In [None]:
# Create data generators
train_generator = train_datagen.flow_from_dataframe(
    train_df, directory='../../data/processed/train', x_col='filename', y_col='binary_label', target_size=(224, 224), batch_size=32, class_mode='binary', validate_filenames=False)

val_generator = val_datagen.flow_from_dataframe(
    val_df, directory='../../data/processed/val', x_col='filename', y_col='binary_label', target_size=(224, 224), batch_size=32, class_mode='binary', validate_filenames=False)

test_generator = test_datagen.flow_from_dataframe(
    test_df, directory='../../data/processed/test', x_col='filename', y_col='binary_label', target_size=(224, 224), batch_size=32, class_mode='binary', validate_filenames=False)

### Healthy vs Unhealthy Lung Comparison

In [None]:
import matplotlib.image as mpimg

healthy_x_rays = train_df[train_df['binary_label'] == '0'][:10]
abnormal_x_rays = train_df[train_df['binary_label'] == '1'][:10]

raw_paths = "../../data/raw/train/"

for i in range(10):
    fig, ax = plt.subplots(1, 2, figsize=(12, 6))

    # Load the healthy X-ray image
    healthy_image_path = raw_paths + healthy_x_rays.iloc[i]['filename']
    healthy_img = mpimg.imread(healthy_image_path)
    ax[0].imshow(healthy_img)
    ax[0].set_title('Healthy X-Ray')

    # Load the abnormal X-ray image
    abnormal_image_path = raw_paths + abnormal_x_rays.iloc[i]['filename']
    abnormal_img = mpimg.imread(abnormal_image_path)
    ax[1].imshow(abnormal_img)
    ax[1].set_title('Abnormal X-Ray')

    plt.show()

In [None]:
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout, Activation
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.applications import ResNet50V2
from tensorflow.keras.optimizers.schedules import CosineDecayRestarts

# Load ResNet50V2
resnet50 = ResNet50V2(include_top=False, weights='imagenet', pooling='max', input_shape=(224, 224, 3))

for layer in resnet50.layers:
    layer.trainable = True
    
# Define the model
model = Sequential([
    resnet50,
    Dense(512, kernel_regularizer=l2(5e-4)),
    BatchNormalization(),
    Activation('relu'),
    Dropout(0.6),  
    Dense(256, kernel_regularizer=l2(5e-4)),
    BatchNormalization(),
    Activation('relu'),
    Dropout(0.5),
    Dense(128, kernel_regularizer=l2(5e-4)),
    Activation('relu'),
    Dense(1, activation="sigmoid")
])

# Implement Cosine Annealing
initial_lr = 0.0001  # Start higher to encourage exploration
first_decay_steps = 3 * 163

cosine_decay_restarts = CosineDecayRestarts(
    initial_learning_rate=initial_lr,
    first_decay_steps=first_decay_steps,
    t_mul=2.0,  # Double cycle length after each restart
    m_mul=0.8,  # Reduce LR after each restart
    alpha=1e-6  # Small floor LR to prevent zero updates
)

optimizer = Adam(learning_rate=cosine_decay_restarts)

model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy', 'AUC']
)

checkpoint_filepath = 'final_model_1.keras'

# Callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, verbose=1)
model_checkpoint = ModelCheckpoint(checkpoint_filepath, 
                                   monitor='val_loss', 
                                   save_best_only=True, 
                                   verbose=1, mode='min')

# Train the model
history = model.fit(
        train_generator,
        validation_data=test_generator,
        epochs=100,
        callbacks=[early_stopping, model_checkpoint],
        verbose=1
)