<a href="https://www.kaggle.com/code/chihsiangliao/cancer-classification-cnn?scriptVersionId=213459508" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import cv2
import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
#import CNN model
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import classification_report
from tqdm import tqdm
# Read the image metadata table.
# NOTE(review): file_path is empty here — presumably it should point at the
# metadata CSV; confirm before running.
file_path = ''
data = pd.read_csv(file_path)

# Binary target taken from the file name: 0 when the name contains 'benign',
# 1 for everything else.
data['label'] = data['filename'].map(lambda name: int('benign' not in name))

# Keep only the rows recorded at magnification ('mag') 40.
data = data.loc[data['mag'] == 40]

# Dataset preparation parameters
target_size = (224, 224)  # size every image is resized to before entering the CNN

#Dataset class for train and test
class BreastCancerDataset(Dataset):
    """Map-style dataset that loads and preprocesses images listed in a dataframe.

    Every row of ``dataframe`` must provide a ``'filename'`` entry (image path)
    and a ``'label'`` entry (integer class index).

    Args:
        dataframe: Table of samples; rows are accessed positionally via ``.iloc``.
        target_size: Size handed to ``cv2.resize`` as the output size
            (OpenCV reads it as ``(width, height)``).
        path_prefix: String prepended verbatim to each ``'filename'`` before
            reading, e.g. ``'/data/images/'`` (include the trailing separator).
            Defaults to ``''`` so filenames are used as-is.
    """

    def __init__(self, dataframe, target_size, path_prefix=''):
        self.dataframe = dataframe
        self.target_size = target_size
        self.path_prefix = path_prefix

    def __len__(self):
        """Return the number of samples (rows) in the dataset."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image_tensor, label_tensor)``.

        Returns:
            A float32 tensor in (C, H, W) layout scaled to [0, 1], and a
            scalar long tensor holding the class index.

        Raises:
            FileNotFoundError: If OpenCV cannot read the image file.
        """
        row = self.dataframe.iloc[idx]
        img_path = self.path_prefix + row['filename']
        label = row['label']

        # cv2.imread signals failure by returning None instead of raising.
        img = cv2.imread(img_path)
        if img is None:
            raise FileNotFoundError(f"Image not found: {img_path}")

        # Resize and preprocess
        img = cv2.resize(img, self.target_size)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # OpenCV loads BGR; model gets RGB
        img = img / 255.0  # scale 8-bit pixel values to [0, 1]
        img = np.transpose(img, (2, 0, 1))  # (H, W, C) -> (C, H, W)

        # Convert to tensors
        img_tensor = torch.tensor(img, dtype=torch.float32)
        label_tensor = torch.tensor(label, dtype=torch.long)

        return img_tensor, label_tensor
    
# Training pipeline: rows whose 'grp' column equals 'train', wrapped in a
# dataset and served in shuffled mini-batches of 32.
train_df = data.loc[data['grp'].eq('train')]
train_dataset = BreastCancerDataset(dataframe=train_df, target_size=target_size)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)

# Test pipeline: rows whose 'grp' column equals 'test', served in their
# original order (no shuffling) in mini-batches of 32.
test_df = data.loc[data['grp'].eq('test')]
test_dataset = BreastCancerDataset(dataframe=test_df, target_size=target_size)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

#Define CNN model
class CNNModel(nn.Module):
    """Small CNN classifier: two conv/ReLU/max-pool stages and a two-layer head.

    Args:
        num_classes: Number of output logits (one per class).
        input_size: ``(height, width)`` of the images the model will receive.
            Used to size the first fully connected layer. Defaults to
            ``(224, 224)``, which yields the original ``64 * 56 * 56`` features.

    Raises:
        ValueError: If ``input_size`` is too small to survive the two 2x2
            poolings (either side shrinks to zero).
    """

    def __init__(self, num_classes, input_size=(224, 224)):
        super(CNNModel, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

        # The 3x3 convolutions use padding=1 and keep the spatial size; each
        # 2x2/stride-2 pooling floor-halves it, and the pool is applied twice.
        height, width = input_size
        feat_h = height // 2 // 2
        feat_w = width // 2 // 2
        if feat_h < 1 or feat_w < 1:
            raise ValueError(
                f"input_size {input_size} is too small for two 2x2 poolings"
            )

        self.fc1 = nn.Linear(64 * feat_h * feat_w, 128)
        self.fc2 = nn.Linear(128, num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return raw class logits of shape (batch, num_classes) for images ``x``."""
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        x = x.view(x.size(0), -1)  # flatten the feature map per sample
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x
    
# Initialize the model, loss function, and optimizer.
# The number of classes is the highest label index + 1 (labels are 0-based
# class indices as required by CrossEntropyLoss) — NOT the number of rows,
# which would give the classifier one output logit per sample.
num_classes = int(data['label'].max()) + 1
model = CNNModel(num_classes)

# Move the model to GPU if available, before the optimizer captures its
# parameters.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

#Training function
def train_model(model, train_loader, criterion, optimizer, epochs, device=None):
    """Train ``model`` in place and print the mean batch loss after every epoch.

    Args:
        model: Network to optimize; switched to training mode.
        train_loader: Iterable yielding ``(images, labels)`` tensor batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        epochs: Number of full passes over ``train_loader``.
        device: Device the batches are moved to. When ``None`` (default) it is
            taken from the model's first parameter, falling back to CPU for a
            parameter-free model, so the function does not depend on a
            module-level ``device`` variable.

    Raises:
        ValueError: If an epoch yields no batches (the mean loss is undefined).
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        num_batches = 0
        for images, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            images, labels = images.to(device), labels.to(device)

            # Clear gradients left over from the previous step
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_batches += 1

        # Fail with a clear message instead of an accidental ZeroDivisionError.
        if num_batches == 0:
            raise ValueError("train_loader yielded no batches; cannot compute an epoch loss")

        print(f"Epoch {epoch+1}, Loss: {running_loss / num_batches}")

# Number of full passes over the training loader
epochs = 10
train_model(
    model=model,
    train_loader=train_loader,
    criterion=criterion,
    optimizer=optimizer,
    epochs=epochs,
)


# Evaluation function
def evaluate_model(model, test_loader, device=None):
    """Run ``model`` over ``test_loader`` and collect labels and predictions.

    The model is switched to evaluation mode and gradients are disabled for
    the whole pass. The predicted class is the index of the largest logit.

    Args:
        model: Network to evaluate.
        test_loader: Iterable yielding ``(images, labels)`` tensor batches.
        device: Device the image batches are moved to. When ``None`` (default)
            it is taken from the model's first parameter, falling back to CPU
            for a parameter-free model, so the function does not depend on a
            module-level ``device`` variable.

    Returns:
        ``(all_labels, all_preds)``: two flat lists with one entry per sample,
        in loader order.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for images, labels in test_loader:
            outputs = model(images.to(device))
            preds = outputs.argmax(dim=1)
            all_preds.extend(preds.cpu().numpy())
            # Labels are only collected, never fed to the model, so they can
            # stay on the host.
            all_labels.extend(labels.cpu().numpy())
    return all_labels, all_preds

# Evaluate the model on the held-out test loader
y_test, y_pred = evaluate_model(model, test_loader)

# Display names for label 0 (filename contains 'benign') and label 1 (otherwise).
target_names = ['Benign', 'Malignant']
# Pass labels explicitly so target_names always lines up with classes 0 and 1,
# even if one class is absent from the test labels or the predictions.
print(classification_report(y_test, y_pred, labels=[0, 1], target_names=target_names))
