In [1]:
import os
import pandas as pd
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score
import numpy as np
import torch.nn as nn
import torchvision.models as models
import matplotlib.pyplot as plt

In [2]:
class CustomDataset(Dataset):
    """Image dataset backed by a DataFrame of per-image rows.

    Column 0 of ``dataframe`` is read as the image file name (relative to
    ``root_dir``) and column 1 as the label. The label is returned exactly as
    stored in the DataFrame; no encoding is applied here.

    Args:
        dataframe: Table with the image file name in column 0 and the label in
            column 1. Rows are addressed positionally via ``iloc``.
        root_dir: Directory that the file names in column 0 are joined onto.
        transform: Optional callable applied to the loaded PIL image.
    """

    def __init__(self, dataframe, root_dir, transform=None):
        self.dataframe = dataframe
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples) in the backing DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Load the image at positional row ``idx`` and return ``(image, label)``.

        The image is always converted to 3-channel RGB before the optional
        transform is applied.
        """
        img_name = os.path.join(self.root_dir, self.dataframe.iloc[idx, 0])
        # Image.open is lazy and keeps the file handle open. Converting inside
        # the context manager loads the pixels and then releases the handle.
        # Forcing RGB guarantees three channels even for grayscale, palette or
        # RGBA files, which a 3-channel Normalize / ResNet input requires.
        with Image.open(img_name) as img:
            image = img.convert('RGB')
        label = self.dataframe.iloc[idx, 1]

        if self.transform:
            image = self.transform(image)

        return image, label

In [3]:
# Training-time pipeline: random augmentation, then tensor conversion and
# per-channel normalisation.
train_transform = transforms.Compose(
    [
        transforms.RandomResizedCrop(size=224),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(
            brightness=0.5,
            contrast=0.5,
            saturation=0.5,
            hue=0.2,
        ),
        # RandomChoice picks a single transform from this list per call.
        transforms.RandomChoice(
            [
                transforms.RandomRotation(degrees=30),
                transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
                transforms.RandomAffine(degrees=0, scale=(0.8, 1.2)),
            ]
        ),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=(0.485, 0.456, 0.406),
            std=(0.229, 0.224, 0.225),
        ),
    ]
)

In [4]:
# Deterministic evaluation pipeline: fixed resize, tensor conversion and the
# same per-channel normalisation constants as the training pipeline.
test_transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=(0.485, 0.456, 0.406),
            std=(0.229, 0.224, 0.225),
        ),
    ]
)

In [5]:
# Read the per-image metadata table from CSV.
csv_path = 'er_status_all_data.csv'
dataframe = pd.read_csv(filepath_or_buffer=csv_path)

In [6]:
# Split on unique 'sample' values rather than on rows: 20% are held out for
# test, then 12.5% of the remaining 80% (= 10% overall) go to validation,
# leaving 70% for training.
train_val_patients, test_patients = train_test_split(
    dataframe['sample'].unique(),
    test_size=0.2,
    random_state=42,
)
train_patients, val_patients = train_test_split(
    train_val_patients,
    test_size=0.125,
    random_state=42,
)

# Select rows by membership of their 'sample' value in each subset.
train_data = dataframe.loc[dataframe['sample'].isin(train_patients)]
val_data = dataframe.loc[dataframe['sample'].isin(val_patients)]
test_data = dataframe.loc[dataframe['sample'].isin(test_patients)]

In [7]:
# Initialize datasets
# Single definition of the image directory instead of three copies of an
# absolute local path. The default is the original location; set the
# TCGA_BRCA_ROOT environment variable to run on another machine without
# editing the notebook.
IMAGE_ROOT_DIR = os.environ.get('TCGA_BRCA_ROOT', '/Users/VanKhai/Desktop/TCGA_BRCA_Histology/')
BATCH_SIZE = 32

train_dataset = CustomDataset(train_data, root_dir=IMAGE_ROOT_DIR, transform=train_transform)
val_dataset = CustomDataset(val_data, root_dir=IMAGE_ROOT_DIR, transform=test_transform)
test_dataset = CustomDataset(test_data, root_dir=IMAGE_ROOT_DIR, transform=test_transform)
# Initialize dataloaders
# Only the training loader shuffles; validation and test keep row order.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [8]:
# Prefer a CUDA device when one is visible to PyTorch; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [9]:
# Define CNN model
class CustomCNN(nn.Module):
    """ResNet-50 with its final layer replaced by a single-output linear head.

    The backbone is loaded with pretrained weights and its ``fc`` layer is
    swapped for ``nn.Linear(in_features, 1)``, so ``forward`` returns one raw
    (pre-sigmoid) value per input sample.
    """

    def __init__(self):
        super().__init__()
        # torchvision >= 0.13 deprecates ``pretrained=True`` in favour of the
        # ``weights`` enum (``IMAGENET1K_V1`` is what ``pretrained=True``
        # loaded). Fall back to the legacy flag on older versions.
        weights_enum = getattr(models, "ResNet50_Weights", None)
        if weights_enum is not None:
            self.resnet = models.resnet50(weights=weights_enum.IMAGENET1K_V1)
        else:
            self.resnet = models.resnet50(pretrained=True)
        num_ftrs = self.resnet.fc.in_features
        # Replace the original classification head with a single-logit output.
        self.resnet.fc = nn.Linear(num_ftrs, 1)

    def forward(self, x):
        """Run ``x`` through the backbone and return the head's raw output."""
        x = self.resnet(x)
        return x

In [10]:
# Build the network, then move it to the selected device.
model = CustomCNN()
model = model.to(device)




In [11]:
# Adam over all model parameters, paired with binary cross-entropy computed
# directly on logits (the sigmoid is applied inside the loss).
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

In [12]:
# Train the model
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for images, labels_tuple in train_dataloader:
        # The dataset yields string labels; encode 'Positive' as 1.0 and every
        # other value as 0.0.
        labels = torch.tensor([1 if label == 'Positive' else 0 for label in labels_tuple]).float().to(device)
        inputs = images.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        # Drop only the trailing logit dimension: (N, 1) -> (N,). A bare
        # .squeeze() would also collapse the batch dimension when the last
        # batch holds a single sample, and the loss would then raise on the
        # input/target shape mismatch.
        loss = criterion(outputs.squeeze(1), labels)
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean; weight by batch size so the epoch
        # figure is a per-sample average even with a smaller final batch.
        running_loss += loss.item() * inputs.size(0)
    epoch_loss = running_loss / len(train_dataset)
    print(f'Training Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

Training Epoch [1/5], Loss: 0.6578
Training Epoch [2/5], Loss: 0.5891
Training Epoch [3/5], Loss: 0.5193
Training Epoch [4/5], Loss: 0.4864
Training Epoch [5/5], Loss: 0.4789


In [13]:
# Debug check: `labels_tuple` is the loop variable left over from the training
# loop, so this prints the raw string labels of the final training batch only.
print(labels_tuple)

('Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Negative', 'Positive', 'Positive', 'Negative', 'Negative', 'Negative', 'Negative', 'Positive', 'Negative', 'Negative', 'Negative', 'Positive', 'Positive')


In [14]:
# Evaluate the trained model on the test set.
# (numpy is already imported as `np` in the notebook's import cell.)
test_loss = 0.0
correct_predictions = 0
total_samples = len(test_dataset)

model.eval()
all_labels = []
all_predictions = []

with torch.no_grad():
    for inputs, labels_tuple in test_dataloader:
        # Encode the string labels: 'Positive' -> 1, anything else -> 0.
        labels = torch.tensor([1 if label == 'Positive' else 0 for label in labels_tuple]).to(device)
        inputs = inputs.to(device)
        outputs = model(inputs)
        # Drop only the trailing logit dimension: (N, 1) -> (N,). A bare
        # .squeeze() would also collapse the batch dimension for a
        # single-sample batch and make the loss raise on a shape mismatch.
        logits = outputs.squeeze(1)
        predictions = torch.sigmoid(logits)
        all_labels.extend(labels.cpu().numpy())
        # Flat per-sample probabilities (scalars, not length-1 arrays).
        all_predictions.extend(predictions.cpu().numpy())
        loss = criterion(logits, labels.float())
        # criterion returns the batch mean; weight by batch size so the final
        # figure is a per-sample average.
        test_loss += loss.item() * inputs.size(0)

test_loss /= total_samples

# Threshold the probabilities at 0.5 to obtain hard class predictions.
binary_predictions = [1 if pred > 0.5 else 0 for pred in all_predictions]
correct_predictions = int(np.sum(np.array(all_labels) == np.array(binary_predictions)))
accuracy = correct_predictions / len(all_labels)

print(f'Test Loss: {test_loss:.4f}')
print(f'Accuracy: {accuracy:.4f}')

Test Loss: 0.9171
Accuracy: 0.5188
