In [1]:
import pandas as pd
import numpy as np
import os

# Define a function to load and combine the abnormality description files
def load_abnormal_data(base_path, mass_train, mass_test, calc_train, calc_test):
    """
    Read the four abnormality-description CSVs and stack them into one frame.

    Every file is read from ``base_path`` and tagged with two extra columns:
    ``split`` ('Training' or 'Test') and ``abnormality_type`` ('Mass' or
    'Calc'). Rows are stacked in the order mass-train, mass-test, calc-train,
    calc-test with a fresh integer index. Two string keys are then derived:

    * ``PatientID_key_full``    e.g. ``Mass-Training_P_00001_LEFT_CC``
    * ``PatientID_key_cropped`` e.g. ``Mass-Training_P_00001_LEFT_CC_1``
      (the full key plus ``_<abnormality id>``)

    Returns the combined DataFrame, or ``None`` after printing a message when
    a file is missing or any other exception is raised while loading.
    """
    # (file name, split tag, abnormality tag) in stacking order.
    sources = (
        (mass_train, 'Training', 'Mass'),
        (mass_test, 'Test', 'Mass'),
        (calc_train, 'Training', 'Calc'),
        (calc_test, 'Test', 'Calc'),
    )

    try:
        tagged_frames = []
        for csv_name, split_tag, kind_tag in sources:
            frame = pd.read_csv(os.path.join(base_path, csv_name))
            frame['split'] = split_tag
            frame['abnormality_type'] = kind_tag
            tagged_frames.append(frame)

        df_abnormal = pd.concat(tagged_frames, ignore_index=True)

        # Both keys share the same prefix; the cropped key only appends the
        # abnormality id.
        image_key = (
            df_abnormal['abnormality_type'] + '-'
            + df_abnormal['split'] + '_'
            + df_abnormal['patient_id'] + '_'
            + df_abnormal['left or right breast'] + '_'
            + df_abnormal['image view']
        )
        abnormality_suffix = '_' + df_abnormal['abnormality id'].astype(str)

        df_abnormal['PatientID_key_cropped'] = image_key + abnormality_suffix
        df_abnormal['PatientID_key_full'] = image_key

        return df_abnormal

    except FileNotFoundError as e:
        print(f"Error loading file: {e}. Make sure all description CSVs are present.")
        return None
    except Exception as e:
        print(f"An error occurred while loading abnormal data: {e}")
        return None

# --- Main execution ---
# Builds 'image_pathology_manifest.csv': one row per full-mammogram or cropped
# image listed in dicom_info.csv, carrying its pathology label, its series
# description and an absolute path to the image file.
try:
    # Base folder that holds the description CSVs and dicom_info.csv.
    # If they are in a subfolder, change this path.
    gdrive_base_path = "/Users/joshuarauf/Library/CloudStorage/GoogleDrive-jrauf7@gmail.com/My Drive/4600 Data"

    # 1. Load and process the abnormality data
    df_abnormal = load_abnormal_data(
        gdrive_base_path,
        "mass_case_description_train_set.csv",
        "mass_case_description_test_set.csv",
        "calc_case_description_train_set.csv",
        "calc_case_description_test_set.csv"
    )

    # load_abnormal_data() has already printed a message when it returns None.
    if df_abnormal is not None:
        # 2. Load the main dicom_info CSV
        dicom_info_path = os.path.join(gdrive_base_path, "dicom_info.csv")
        df_dicom_info = pd.read_csv(dicom_info_path)

        # 3. Create label DataFrames for merging.
        # Each one maps a 'PatientID' key to a pathology; duplicate keys keep
        # their first row so the left merges below cannot multiply rows.

        # Labels for cropped images
        df_cropped_labels = df_abnormal[['PatientID_key_cropped', 'pathology']].copy()
        df_cropped_labels.rename(columns={'PatientID_key_cropped': 'PatientID'}, inplace=True)
        df_cropped_labels = df_cropped_labels.drop_duplicates(subset=['PatientID'])

        # Labels for full images
        df_full_labels = df_abnormal[['PatientID_key_full', 'pathology']].copy()
        df_full_labels.rename(columns={'PatientID_key_full': 'PatientID'}, inplace=True)
        df_full_labels = df_full_labels.drop_duplicates(subset=['PatientID'])

        # 4. Merge labels onto dicom_info.
        # The second merge meets an existing 'pathology' column, so the two
        # label columns come out as 'pathology_cropped' / 'pathology_full'.
        df_merged = pd.merge(df_dicom_info, df_cropped_labels, on='PatientID', how='left')
        df_merged = pd.merge(df_merged, df_full_labels, on='PatientID', how='left', suffixes=('_cropped', '_full'))

        # 5. Consolidate pathology columns (cropped label first, full label as fallback)
        df_merged['pathology'] = df_merged['pathology_cropped'].fillna(df_merged['pathology_full'])
        df_merged.drop(columns=['pathology_cropped', 'pathology_full'], inplace=True)

        # 6. Fill 'NORMAL' cases
        # (This identifies images that are not masks and have no abnormality label)
        image_filter = df_merged['SeriesDescription'].isin(['full mammogram images', 'cropped images'])
        df_merged.loc[image_filter & df_merged['pathology'].isna(), 'pathology'] = 'NORMAL'

        # 7. Create the final DataFrame
        final_df = df_merged.loc[
            image_filter & df_merged['pathology'].notna(),
            ['image_path', 'pathology', 'SeriesDescription']
        ].copy()

        # 8. Turn the dataset-relative image_path into an absolute local path:
        # 'CBIS-DDSM/jpeg/...' -> '<gdrive_base_path>/jpeg/...'
        # The separator is inserted explicitly: plain concatenation would glue
        # 'jpeg' onto the last folder name ('.../4600 Datajpeg/...').
        relative_path = final_df['image_path'].str.replace('CBIS-DDSM/', '', regex=False)
        final_df['full_image_path'] = gdrive_base_path.rstrip('/') + '/' + relative_path.str.lstrip('/')

        # Select final columns
        final_data_manifest = final_df[['full_image_path', 'pathology', 'SeriesDescription']]

        # 9. Report and Save
        print("--- Final Data Manifest Info ---")
        # DataFrame.info() prints the summary itself and returns None, so it is
        # called directly instead of being wrapped in print().
        final_data_manifest.info()

        print("\n--- Value Counts for Pathology ---")
        print(final_data_manifest['pathology'].value_counts())

        print("\n--- Value Counts for Image Type ---")
        print(final_data_manifest['SeriesDescription'].value_counts())

        # Save the manifest next to the notebook (current working directory)
        output_filename = "image_pathology_manifest.csv"
        final_data_manifest.to_csv(output_filename, index=False)
        print(f"\nSuccessfully created '{output_filename}' in your Colab environment.")
        print("You can now load this file in your next step.")

except FileNotFoundError as e:
    print(f"Error loading file: {e}. Please ensure 'dicom_info.csv' and the other CSVs are in '{gdrive_base_path}'")
except Exception as e:
    print(f"An error occurred during the main execution: {e}")

--- Final Data Manifest Info ---
<class 'pandas.core.frame.DataFrame'>
Index: 6424 entries, 0 to 10236
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   full_image_path    6424 non-null   object
 1   pathology          6424 non-null   object
 2   SeriesDescription  6424 non-null   object
dtypes: object(3)
memory usage: 200.8+ KB
None

--- Value Counts for Pathology ---
pathology
MALIGNANT                  2725
BENIGN                     2660
BENIGN_WITHOUT_CALLBACK    1039
Name: count, dtype: int64

--- Value Counts for Image Type ---
SeriesDescription
cropped images           3567
full mammogram images    2857
Name: count, dtype: int64

Successfully created 'image_pathology_manifest.csv' in your Colab environment.
You can now load this file in your next step.


In [None]:
import pandas as pd
import cv2
import numpy as np
import os
from tqdm.notebook import tqdm

# ==========================================
# USER CONFIGURATION
# ==========================================
BASE_IMAGE_DIR = "/Users/joshuarauf/Library/CloudStorage/GoogleDrive-jrauf7@gmail.com/My Drive/4600 Data/jpeg"
MANIFEST_PATH = "image_pathology_manifest.csv"
# ==========================================

# --- 1. Load Manifest ---
# Stop right here when the manifest is missing: every later step needs `df`,
# and carrying on would only fail further down with an unrelated NameError.
if not os.path.exists(MANIFEST_PATH):
    raise FileNotFoundError(f"Error: Manifest file not found at {MANIFEST_PATH}")

df = pd.read_csv(MANIFEST_PATH)
print(f"Loaded manifest with {len(df)} entries.")

# --- 2. Smart File Indexing ---
print("Indexing local files... this may take a moment...")

file_map = {}            # file name -> local path (last one seen wins)
path_index = {}          # (parent folder name, file name) -> local path
ambiguous_names = set()  # file names that occur in more than one folder
for root, dirs, files in os.walk(BASE_IMAGE_DIR):
    parent_name = os.path.basename(root)
    for file in files:
        if file.lower().endswith(('.jpg', '.jpeg', '.png')):
            local_file = os.path.join(root, file)
            if file in file_map:
                ambiguous_names.add(file)
            file_map[file] = local_file
            path_index[(parent_name, file)] = local_file

print(f"Found {len(path_index)} images in your local folder.")

# --- 3. Process Data ---
# Only the cropped images are loaded; without a SeriesDescription column
# the whole manifest is used.
if 'SeriesDescription' in df.columns:
    df_cropped = df[df['SeriesDescription'] == 'cropped images'].copy()
else:
    df_cropped = df.copy()

# The loop below reads row['label'] for every image, so a manifest without
# labels is reported up front instead of surfacing as one error per image.
if 'pathology' not in df_cropped.columns:
    raise KeyError("Manifest has no 'pathology' column; cannot build labels.")
# Collapse BENIGN_WITHOUT_CALLBACK into BENIGN.
df_cropped['label'] = df_cropped['pathology'].replace({'BENIGN_WITHOUT_CALLBACK': 'BENIGN'})

images = []
labels = []
IMG_SIZE = 224

# Counters
success_count = 0
error_count = 0
failures = []  # (manifest path, reason) for every entry that could not be loaded
# The loop iterates over df_cropped, so that is the total shown in the bar.
total_manifest_entries = len(df_cropped)

print(f"Starting image load for {len(df_cropped)} manifest entries...")

# Create a tqdm progress bar
pbar = tqdm(df_cropped.iterrows(), total=len(df_cropped))

for i, (_, row) in enumerate(pbar):
    csv_path = str(row['full_image_path'])
    filename = os.path.basename(csv_path)
    parent_name = os.path.basename(os.path.dirname(csv_path))

    # Prefer the (folder, file) match. A bare file name is accepted only when
    # it is unique locally; otherwise an image from another folder could be
    # paired with this row's label.
    local_path = path_index.get((parent_name, filename))
    if local_path is None and filename not in ambiguous_names:
        local_path = file_map.get(filename)

    if local_path is None:
        error_count += 1
        failures.append((csv_path, "not found locally (or file name is ambiguous)"))
    else:
        try:
            # cv2.imread signals an unreadable file by returning None.
            img = cv2.imread(local_path, cv2.IMREAD_GRAYSCALE)

            if img is None:
                error_count += 1
                failures.append((csv_path, "cv2.imread could not decode the file"))
            else:
                img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
                # float32 in [0, 1]; halves memory compared with the float64
                # that `img / 255.0` would produce.
                img = img.astype(np.float32) / 255.0

                images.append(img)
                labels.append(row['label'])
                success_count += 1

        except cv2.error as e:
            error_count += 1
            failures.append((csv_path, f"OpenCV error: {e}"))

    # Update the progress bar description
    pbar.set_description(f"Success: {success_count} | Errors: {error_count} | Processed: {i+1}/{total_manifest_entries}")

# --- 4. Finalize ---
X = np.array(images)
y = np.array(labels)

if X.size > 0:
    # Add the trailing channel axis: (N, H, W) -> (N, H, W, 1)
    X = X.reshape(-1, IMG_SIZE, IMG_SIZE, 1)
    print(f"\nSuccess!")
    print(f"Images Loaded: {X.shape}")
    print(f"Labels Loaded: {y.shape}")
    print(f"Total Successful: {success_count}")
    print(f"Total Errors/Missing: {error_count}")
else:
    print("No images found. Please check the BASE_IMAGE_DIR path.")

# Show a few concrete failures so problems are diagnosable.
for failed_path, reason in failures[:5]:
    print(f"  failed: {failed_path} -> {reason}")

Loaded manifest with 6424 entries.
Indexing local files... this may take a moment...


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf

# --- Prerequisite: Encode Labels ---
# `y` holds string labels; LabelEncoder maps them to integer codes.
# The mapping can be inspected afterwards with `le.classes_`.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Split fractions:
#   test       = 20% of all samples
#   validation = 25% of the remaining 80%  (= 20% of all samples)
#   training   = 75% of the remaining 80%  (= 60% of all samples)
TEST_SPLIT_SIZE = 0.20
VALIDATION_SPLIT_SIZE = 0.25

# NOTE(review): both splits are drawn per image. If several images belong to
# the same patient they can land in different splits -- consider a grouped
# split if patient ids are available. TODO confirm.

# --- Step 1: hold out the test set, stratified on the encoded label ---
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X,
    y_encoded,
    stratify=y_encoded,
    test_size=TEST_SPLIT_SIZE,
    random_state=42,  # fixed seed for reproducibility
)

# --- Step 2: carve the validation set out of the remaining data ---
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    stratify=y_train_full,
    test_size=VALIDATION_SPLIT_SIZE,
    random_state=42,  # fixed seed for reproducibility
)

# --- Check your results ---
print(f"Original X shape: {X.shape}")
print(f"Original y shape: {y_encoded.shape}\n")

# (split name, features, targets, text appended after the y line)
split_summary = (
    ("Training", X_train, y_train, "\n"),
    ("Validation", X_val, y_val, "\n"),
    ("Test", X_test, y_test, ""),
)
for split_name, split_X, split_y, trailer in split_summary:
    # Labels are padded to a common width of 20 so the shapes line up.
    print(f"{split_name} X shape:".ljust(20) + f"{split_X.shape}")
    print(f"{split_name} y shape:".ljust(20) + f"{split_y.shape}{trailer}")

In [None]:
# 1. Define the augmentation layers
# Random transforms applied to training batches only (see step 3).
data_augmentation = tf.keras.Sequential([
    # Add 'input_shape' only if this is the first layer of your model
    # tf.keras.layers.InputLayer(input_shape=(IMG_SIZE, IMG_SIZE, 1)),

    # --- random rotations ---
    # The factor is a fraction of a full turn: 0.1 -> up to +/- 36 degrees.
    tf.keras.layers.RandomRotation(0.1),

    # --- horizontal flips ---
    tf.keras.layers.RandomFlip("horizontal"),

    # --- scaling (zoom) ---
    tf.keras.layers.RandomZoom(0.1), # zoom in/out by up to 10%

    # --- slight adjustments in brightness and contrast ---
    # The images were divided by 255, so they live in [0, 1]. RandomBrightness
    # scales its shift by the width of `value_range`, which defaults to
    # (0, 255); without the override a factor of 0.1 would shift pixels by up
    # to +/- 25.5 instead of +/- 0.1.
    tf.keras.layers.RandomBrightness(0.1, value_range=(0.0, 1.0)),
    tf.keras.layers.RandomContrast(0.1)  # contrast factor of up to +/- 10%
], name="data_augmentation")


# 2. Build your tf.data pipelines
BATCH_SIZE = 32

# Create datasets from your NumPy arrays
train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train))
val_ds = tf.data.Dataset.from_tensor_slices((X_val, y_val))
test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test))

# 3. Apply augmentation ONLY to the training set
# Shuffle over the whole training set, batch, augment each batch
# (training=True keeps the random layers active), then prefetch.
train_ds = train_ds.shuffle(buffer_size=len(X_train)) \
                   .batch(BATCH_SIZE) \
                   .map(lambda batch_images, batch_labels: (data_augmentation(batch_images, training=True), batch_labels),
                        num_parallel_calls=tf.data.AUTOTUNE) \
                   .prefetch(buffer_size=tf.data.AUTOTUNE)

# Validation and Test sets: DO NOT augment. Just batch and prefetch.
val_ds = val_ds.batch(BATCH_SIZE) \
               .prefetch(buffer_size=tf.data.AUTOTUNE)

test_ds = test_ds.batch(BATCH_SIZE) \
                 .prefetch(buffer_size=tf.data.AUTOTUNE)

print("\nSuccessfully created tf.data pipelines.")
print(f"Training dataset:   {train_ds}")
print(f"Validation dataset: {val_ds}")
print(f"Test dataset:       {test_ds}")

In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score

# --- 1. Flatten the Data ---
# The sklearn estimators take 2D input (samples, features), so every
# (224, 224, 1) image becomes one row of 50,176 pixel features.

print("Flattening data for sklearn models...")
n_samples_train, n_samples_val = X_train.shape[0], X_val.shape[0]

X_train_flat = X_train.reshape(n_samples_train, -1)
X_val_flat = X_val.reshape(n_samples_val, -1)

print(f"Original X_train shape: {X_train.shape}")
print(f"New X_train_flat shape: {X_train_flat.shape}")
print(f"Original X_val shape: {X_val.shape}")
print(f"New X_val_flat shape: {X_val_flat.shape}\n")


def _report_validation(title, estimator):
    """Predict on the validation split, print accuracy and the per-class
    report under ``title``, and return the predictions."""
    predictions = estimator.predict(X_val_flat)
    print(f"{title} - Validation Results:")
    print(f"Accuracy: {accuracy_score(y_val, predictions):.4f}")
    print(classification_report(y_val, predictions, target_names=le.classes_))
    return predictions


# --- 2. Model 1: Logistic Regression ---
# Standardise the pixels, then fit; max_iter is raised to 1000 to give the
# solver room to converge on this many features.
print("--- Training: Logistic Regression ---")
lr_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(max_iter=1000, random_state=42)),
])
lr_pipeline.fit(X_train_flat, y_train)
y_pred_lr = _report_validation("Logistic Regression", lr_pipeline)


# --- 3. Model 2: k-Nearest Neighbors (kNN) ---
# Distance-based, so the features are standardised first; with 50k+
# dimensions it may be slow and/or inaccurate. k=5 is used as a default.
print("--- Training: k-Nearest Neighbors (kNN) ---")
knn_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', KNeighborsClassifier(n_neighbors=5)),
])
knn_pipeline.fit(X_train_flat, y_train)
y_pred_knn = _report_validation("kNN", knn_pipeline)


# --- 4. Model 3: Gaussian Naive Bayes ---
# Assumes independent features (not true for pixels) but is a very fast
# baseline; fitted on the unscaled pixels.
print("--- Training: Gaussian Naive Bayes ---")
gnb = GaussianNB()
gnb.fit(X_train_flat, y_train)
y_pred_gnb = _report_validation("Gaussian Naive Bayes", gnb)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.metrics import classification_report, accuracy_score

# --- 1. Create a Custom PyTorch Dataset ---
# This class will handle our numpy arrays and convert them to
# the (N, C, H, W) format PyTorch expects.

class MammoDataset(Dataset):
    """In-memory dataset of channels-last images and integer labels.

    ``images`` is converted to a float32 tensor and permuted from
    (N, H, W, C) to (N, C, H, W); ``labels`` becomes an int64 tensor, the
    dtype CrossEntropyLoss expects for class indices.
    """

    def __init__(self, images, labels):
        channels_last = torch.tensor(images, dtype=torch.float32)
        # Move the channel axis in front of height and width.
        self.X = channels_last.permute(0, 3, 1, 2)
        self.y = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        # One label per sample.
        return len(self.y)

    def __getitem__(self, idx):
        sample, target = self.X[idx], self.y[idx]
        return sample, target

# --- 2. Define the Baseline CNN Architecture ---

class BaseCNN(nn.Module):
    """Three conv/ReLU/max-pool stages followed by a two-layer classifier.

    Shapes for a (B, 1, 224, 224) input:
        stage 1 -> (B, 16, 112, 112)
        stage 2 -> (B, 32, 56, 56)
        stage 3 -> (B, 64, 28, 28)
        flatten -> (B, 50176) -> fc1 -> (B, 512) -> fc2 -> (B, 2)

    The output is raw logits; no softmax is applied in the model.
    """

    def __init__(self):
        super().__init__()

        # 3x3 convolutions with padding 1 keep the spatial size; each
        # 2x2 / stride-2 pool halves it.
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)
        self.relu1 = nn.ReLU()
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.relu3 = nn.ReLU()
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)

        # 64 channels * 28 * 28 = 50,176 features after flattening.
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(64 * 28 * 28, 512)
        self.relu4 = nn.ReLU()

        # Output layer: 2 classes.
        self.fc2 = nn.Linear(512, 2)

    def forward(self, x):
        stages = (
            (self.conv1, self.relu1, self.pool1),
            (self.conv2, self.relu2, self.pool2),
            (self.conv3, self.relu3, self.pool3),
        )
        for conv, activation, pool in stages:
            x = pool(activation(conv(x)))

        hidden = self.relu4(self.fc1(self.flatten(x)))
        # Logits only; the loss function is expected to apply softmax.
        return self.fc2(hidden)

# --- 3. Set up Data, Model, Loss, and Optimizer ---

# Set device (use GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}\n")

# Create Datasets
# X_train, y_train, X_val, y_val are from your sklearn train_test_split
train_dataset = MammoDataset(X_train, y_train)
val_dataset = MammoDataset(X_val, y_val)

# Create DataLoaders (only the training data is shuffled)
BATCH_SIZE = 32
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Initialize model, loss, and optimizer
model = BaseCNN().to(device)
criterion = nn.CrossEntropyLoss() # Handles softmax internally
optimizer = optim.Adam(model.parameters(), lr=0.001)

print("Model architecture:")
print(model)
print("\nStarting training...")

# --- 4. Training and Validation Loop ---
NUM_EPOCHS = 10 # You can increase this

for epoch in range(NUM_EPOCHS):
    # --- Training Phase ---
    model.train() # Set model to training mode
    train_loss = 0.0

    for images, labels in train_loader:
        # Move data to the device
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight it by the batch size so
        # the epoch figure is a per-sample average even with a short last batch.
        train_loss += loss.item() * images.size(0)

    train_loss = train_loss / len(train_loader.dataset)

    # --- Validation Phase ---
    model.eval() # Set model to evaluation mode
    val_loss = 0.0
    val_corrects = 0  # plain Python int; .item() below keeps it one
    all_preds = []
    all_labels = []

    with torch.no_grad(): # No gradients needed for validation
        for images, labels in val_loader:
            images = images.to(device)
            labels = labels.to(device)

            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Get predictions (class with the highest score)
            _, preds = torch.max(outputs, 1)

            val_loss += loss.item() * images.size(0)
            val_corrects += (preds == labels).sum().item()

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    val_loss = val_loss / len(val_loader.dataset)
    val_acc = val_corrects / len(val_loader.dataset)

    print(f"Epoch {epoch+1}/{NUM_EPOCHS} | "
          f"Train Loss: {train_loss:.4f} | "
          f"Val Loss: {val_loss:.4f} | "
          f"Val Acc: {val_acc:.4f}")

# --- 5. Final Evaluation ---
# all_preds / all_labels are rebuilt every epoch, so this report describes
# the validation pass of the last epoch.
print("\nTraining complete.")
print("Final Validation Results:")
# 'le' is the LabelEncoder you fit earlier
print(classification_report(all_labels, all_preds, target_names=le.classes_))

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.metrics import classification_report, accuracy_score
import torchvision.models as models

# --- 1. Create a Custom PyTorch Dataset ---
# We must modify this Dataset to handle 3 channels, as ResNet
# expects a 3-channel (RGB) input.

class MammoDataset(Dataset):
    """In-memory dataset that serves channels-first images tiled to 3 channels.

    ``images`` is converted to float32 and permuted from (N, H, W, C) to
    (N, C, H, W); the channel axis is then tiled three times, so
    single-channel input comes out as (N, 3, H, W). ``labels`` is stored as
    an int64 tensor.
    """

    def __init__(self, images, labels):
        as_float = torch.tensor(images, dtype=torch.float32)
        channels_first = as_float.permute(0, 3, 1, 2)

        # Tile along the channel axis only: (N, 1, H, W) -> (N, 3, H, W)
        tiling = (1, 3, 1, 1)
        self.X = channels_first.repeat(*tiling)

        self.y = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        # One label per sample.
        return len(self.y)

    def __getitem__(self, idx):
        sample, target = self.X[idx], self.y[idx]
        return sample, target

# --- 2. Define the Pre-trained Model ---
# Imported once here rather than on every epoch inside the training loop.
from tqdm.notebook import tqdm

# Load a ResNet-50 model pre-trained on ImageNet
model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)

# We need to replace the final layer (the "head")
# The original model was trained on 1000 classes. We only have 2.
num_features = model.fc.in_features # Get size of features before the last layer

# Replace the final layer with a new, untrained layer for our 2 classes
model.fc = nn.Linear(num_features, 2)

# --- 3. Set up Data, Model, Loss, and Optimizer ---

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}\n")

# Move the model to the selected device
model = model.to(device)

# Create Datasets and DataLoaders
# (Assumes X_train, y_train, etc. are in memory)
# NOTE(review): no ImageNet mean/std normalisation is applied to the inputs
# in this cell; the pre-trained weights were presumably trained on normalised
# RGB -- consider applying the weights' own preprocessing. TODO confirm.
train_dataset = MammoDataset(X_train, y_train)
val_dataset = MammoDataset(X_val, y_val)

BATCH_SIZE = 32
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
# Every parameter (backbone and new head) is handed to the optimizer, so the
# whole network is fine-tuned; the learning rate is kept small for that reason.
optimizer = optim.Adam(model.parameters(), lr=0.0001)

print("Model architecture: ResNet-50 (Fine-Tuned)")
print("Starting training...")

# --- 4. Training and Validation Loop ---
NUM_EPOCHS = 10

for epoch in range(NUM_EPOCHS):
    model.train()
    train_loss = 0.0

    for images, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS} [Train]"):
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean; weight it by the batch size so
        # the epoch figure is a per-sample average.
        train_loss += loss.item() * images.size(0)

    train_loss = train_loss / len(train_loader.dataset)

    # --- Validation ---
    model.eval()
    val_loss = 0.0
    val_corrects = 0  # plain Python int; .item() below keeps it one
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for images, labels in tqdm(val_loader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS} [Val]"):
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)

            # Predicted class = index of the largest logit
            _, preds = torch.max(outputs, 1)

            val_loss += loss.item() * images.size(0)
            val_corrects += (preds == labels).sum().item()

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    val_loss = val_loss / len(val_loader.dataset)
    val_acc = val_corrects / len(val_loader.dataset)

    print(f"Epoch {epoch+1}/{NUM_EPOCHS} | "
          f"Train Loss: {train_loss:.4f} | "
          f"Val Loss: {val_loss:.4f} | "
          f"Val Acc: {val_acc:.4f}")

# --- 5. Final Evaluation ---
# all_preds / all_labels are rebuilt every epoch, so this report describes
# the validation pass of the last epoch.
print("\nTraining complete.")
print("Final Validation Results (ResNet-50 Fine-Tuned):")
print(classification_report(all_labels, all_preds, target_names=le.classes_))