# Load Data & Train Model
This notebook allows us to load the entire dataset, split it into a proper train/test split and trains the model using K-Fold cross validation.

## Imports

In [1]:
# Essentials
import math
import random
import os
import copy
import glob
import shutil
import numpy as np
import json
import importlib
import time as time

# Torch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Sequential as Seq, Linear as Lin, Conv2d

import torchvision.models as models
from torchvision import datasets, transforms

from torch.utils.data import Dataset, DataLoader

import torch.optim as optim
import timm
from timm.models import create_model
from timm.data import create_transform
from sklearn.metrics import accuracy_score

# Images
import albumentations
import albumentations.pytorch

import cv2

from PIL import Image

# Machine Learning
from sklearn.model_selection import KFold

from barbar import Bar

import utils
importlib.reload(utils)

from utils import get_data, select_gpu
from utils import My_data, CustomTransforms

## Set Device

In [2]:
device = torch.device(f"cuda:{select_gpu()}" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Selecting GPU 1 with 9716MB free memory
Using device: cuda:1


In [3]:
torch.cuda.device_count()

2

In [4]:
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print(torch.cuda.memory_summary(device=None, abbreviated=False))
    torch.cuda.manual_seed(42)
    torch.cuda.manual_seed_all(42)
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = False

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |       0 B  |       0 B  |       0 B  |       0 B  |
|       from large pool |       0 B  |       0 B  |       0 B  |       0 B  |
|       from small pool |       0 B  |       0 B  |       0 B  |       0 B  |
|---------------------------------------------------------------------------|
| Active memory         |       0 B  |       0 B  |       0 B  |       0 B  |
|       from large pool |       0 B  |       0 B  |       0 B  |       0 B  |
|       from small pool |       0 B  |       0 B  |       0 B  |       0 B  |
|---------------------------------------------------------------

## Load Dataset

This downloads the dataset to the server and unzips it so we can use it. Check the size of the folder before running the rest of the code. Should be around 4GB

In [5]:
full_dataset = './BreaKHis_v1/histology_slides/breast/**/SOB/**/**/**/*.png'

In [6]:
# All eight classes
classes = ["A", "F", "TA", "PT", "DC", "LC", "MC", "PC"]
benign_classes = classes[:4]
malignant_classes = classes[4:]

# All four zoom levels
zooms = ["40", "100", "200", "400"]

train_dict, test_dict = get_data(full_dataset)

Opening Train.txt
Opening Train.txt


## Prepare Data
In this section we do the following:
1. Store all image paths in a dictionary that is filtered on class and zoom level
2. Create a stratified train/test split (based on class and zoom level) and store the image paths of both sets again in a filtered dictionary
3. Copy all images from the raw data source to the structured data folder "dataset/"
4. Create a 5-fold cross validation split for the train dataset
5. Prepare all splits for forward pass of the model

### K-Fold Split
Explanation of K-Fold: https://isheunesu48.medium.com/cross-validation-using-k-fold-with-scikit-learn-cfc44bf1ce6

In [7]:
# Determine number of folds
n_folds = 5

# Store train and validate for every fold
folds = {str(i): {"train": [], "validate": []} for i in range(n_folds)}

kf = KFold(n_splits=n_folds)

for c in classes:
    for z in zooms:

        # For every class and zoom, create a 5-fold split
        for i, (train_index, validate_index) in enumerate(kf.split(train_dict[c][z])):

            # Store paths of all train images in fold i
            fold_train_img = [img for i, img in enumerate(train_dict[c][z]) if i in train_index]

            # Store paths of all validate images in fold i
            fold_validate_img = [img for i, img in enumerate(train_dict[c][z]) if i in validate_index]

            # Add paths to fold i
            folds[str(i)]["train"] += fold_train_img
            folds[str(i)]["validate"] += fold_validate_img

# Shuffle images in each train/validate fold to make sure order of classes is mixed
for i in range(n_folds):
    random.shuffle(folds[str(i)]["train"])
    random.shuffle(folds[str(i)]["validate"])

# Check number of train and validate items per fold
for k, v in folds.items():
    print("Fold", int(k)+1)
    print("Train:", len(v["train"]))
    print("Validate:", len(v["validate"]))
    print()

Fold 1
Train: 5694
Validate: 1438

Fold 2
Train: 5698
Validate: 1434

Fold 3
Train: 5704
Validate: 1428

Fold 4
Train: 5713
Validate: 1419

Fold 5
Train: 5719
Validate: 1413



In [8]:
# Check for duplicates in validation set
sets = []
for i in range(n_folds):
    sets.append(set(folds[str(i)]["validate"]))

for i in sets:
    for j in sets:
        if i == j:
            continue

        duplicates = i.intersection(j)
        if duplicates:
            print("Duplicates found:", len(duplicates))

### Prepare Data for Forward Pass

In [9]:
# initialize transformers
transform = CustomTransforms()

# Create datasets for each fold
train_folds = [My_data(folds[str(i)]["train"], transforms=transform.get_transform('train')) for i in range(n_folds)]
validate_folds = [My_data(folds[str(i)]["validate"], transforms=transform.get_transform('valid')) for i in range(n_folds)]

# Create data loaders for each fold
train_dataloaders = [DataLoader(dataset=train_folds[i], batch_size=4,shuffle=True,num_workers=2,
                                              pin_memory=True,prefetch_factor=2) for i in range(n_folds)]

validate_dataloaders = [DataLoader(dataset=validate_folds[i], batch_size=4,shuffle=True,num_workers=2,
                                              pin_memory=True,prefetch_factor=2) for i in range(n_folds)]

## Create Model

The output of the below cell is softcoded as possible initialization class_weights in the class utils.FocalLoss

In [10]:
# Number of samples in each class
# Assumption is made that in each fold, these are the same (since we do stratified split)
class_samples = [sum([len(v) for v in z.values()]) for z in train_dict.values()]

# Hardcoded from paper
# class_samples = [367, 803, 456, 370, 2763, 492, 629, 449]  # Number of samples in each class for training

total_samples = sum(class_samples)
samples = total_samples/len(class_samples)
class_weights = [samples / (s + 1e-8) for s in class_samples]
class_weights = torch.tensor(class_weights)
print(class_weights)

print(sum(class_samples))

tensor([2.2232, 0.9754, 1.7378, 2.1744, 0.2868, 1.5779, 1.2469, 1.7619])
7132


In [11]:
model = timm.create_model(
    'timm/resnet18.a1_in1k',
    pretrained=True,
    features_only=False,
    num_classes=8,
    # TODO: find out what these mean
    drop_path_rate=0.2,
    drop_rate=0.5
)

print(model.get_classifier())
model

model.safetensors:   0%|          | 0.00/46.8M [00:00<?, ?B/s]

Linear(in_features=512, out_features=8, bias=True)


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act1): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (drop_block): Identity()
      (act1): ReLU(inplace=True)
      (aa): Identity()
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act2): ReLU(inplace=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, m

In [12]:
print(len([param for param in model.named_parameters()]))

# Iterate over the parameters and check requires_grad
for name, param in model.named_parameters():
    if param.requires_grad:
        print(f"Parameter '{name}' requires grad.")
    else:
        print(f"Parameter '{name}' does not require grad.")
    
    param.requires_grad = True

62
Parameter 'conv1.weight' requires grad.
Parameter 'bn1.weight' requires grad.
Parameter 'bn1.bias' requires grad.
Parameter 'layer1.0.conv1.weight' requires grad.
Parameter 'layer1.0.bn1.weight' requires grad.
Parameter 'layer1.0.bn1.bias' requires grad.
Parameter 'layer1.0.conv2.weight' requires grad.
Parameter 'layer1.0.bn2.weight' requires grad.
Parameter 'layer1.0.bn2.bias' requires grad.
Parameter 'layer1.1.conv1.weight' requires grad.
Parameter 'layer1.1.bn1.weight' requires grad.
Parameter 'layer1.1.bn1.bias' requires grad.
Parameter 'layer1.1.conv2.weight' requires grad.
Parameter 'layer1.1.bn2.weight' requires grad.
Parameter 'layer1.1.bn2.bias' requires grad.
Parameter 'layer2.0.conv1.weight' requires grad.
Parameter 'layer2.0.bn1.weight' requires grad.
Parameter 'layer2.0.bn1.bias' requires grad.
Parameter 'layer2.0.conv2.weight' requires grad.
Parameter 'layer2.0.bn2.weight' requires grad.
Parameter 'layer2.0.bn2.bias' requires grad.
Parameter 'layer2.0.downsample.0.weig

In [13]:
 class FocalLoss(nn.Module):
    def __init__(self, gamma=2.0, class_weights=None):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.class_weights = class_weights

    def forward(self, logits, labels):
        probs = torch.sigmoid(logits)
        ce_loss = nn.BCELoss()(probs, labels)
        # print(type(probs), probs, self.gamma)
        weight = (1 - probs).pow(self.gamma)
        loss = ce_loss  # Initialize loss with cross-entropy loss
        if self.class_weights is not None:
            weight = weight * self.class_weights
            loss = loss * weight
        return loss

## Train Model

This section performs the actual training of the model. We first determine methods fit and validate that will be called during the training. Afterwards, we define the loop that optimizes the model.

In [14]:
def fit(model, dataloader, optimizer,scheduler, criterion):
    #print('Training')
    model.train()
    train_running_loss = 0.0
    train_running_correct = 0
    accum_iter = 4

    for i, (inputs, labels) in enumerate(Bar(dataloader)):
        inputs = inputs.to(device)
        labels = labels.float().to(device)
        optimizer.zero_grad()
        #model.zero_grad(set_to_none=True)
        # Forward pass - compute outputs on input data using the model
        outputs = model(inputs)
        thresholds = [0.5, 0.5, 0.5,0.5,0.5,0.5,0.5,0.5]
        # Compute loss
        loss = criterion(outputs, labels)
        train_running_loss += loss.item()* inputs.size(0)
        # _ , preds = torch.max(outputs.data, 1)
        # Apply sigmoid activation to obtain probabilities
        #preds = (outputs > 0.5).float()
        probs = torch.sigmoid(outputs)
        preds = torch.zeros_like(probs)

        # Set predicted labels based on the threshold
        for i, threshold in enumerate(thresholds):
            preds[:, i] = (probs[:, i] >= threshold).float()
        train_running_correct += (preds == labels).all(dim=1).float().sum()
        # Backpropagate the gradients
        loss /= accum_iter
        loss.backward()

        if ((i + 1) % accum_iter == 0) :
            optimizer.step()
            optimizer.zero_grad()

    scheduler.step()

    train_loss = train_running_loss/len(dataloader.dataset)
    train_accuracy = 100. * train_running_correct/len(dataloader.dataset)
    return train_loss, train_accuracy

In [15]:
def validate(model, dataloader, optimizer, criterion):
    #print('Validating')
    model.eval()
    val_running_loss = 0.0
    val_running_correct = 0
    with torch.no_grad():
        for i, (inputs, labels) in enumerate(dataloader):
            inputs = inputs.to(device)
            labels = labels.float()
            labels = labels.to(device)
            outputs = model(inputs)
            thresholds = [0.5, 0.5, 0.5,0.5,0.5,0.5,0.5,0.5]
            loss = criterion(outputs, labels)

            val_running_loss += loss.item()*inputs.size(0)
            #_, preds = torch.max(outputs.data, 1)
            #preds = (outputs > 0.5).float()
            probs = torch.sigmoid(outputs)
            preds = torch.zeros_like(probs)
            # Set predicted labels based on the threshold
            for i, threshold in enumerate(thresholds):
                preds[:, i] = (probs[:, i] >= threshold).float()
            val_running_correct += (preds == labels).all(dim=1).float().sum()

    val_loss = val_running_loss/len(dataloader.dataset)
    val_accuracy = 100. * val_running_correct/len(dataloader.dataset)
    return val_loss, val_accuracy

In [16]:
history=[]

if torch.cuda.is_available():
    torch.cuda.empty_cache()

model = model.to(device)

best_model_wts = copy.deepcopy(model.state_dict())

class_weights = class_weights.to(device)

criterion = FocalLoss(class_weights)

optimizer = optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-4)

scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.5)

best_optimizer_state = optimizer.state_dict()

#best_optimizer_state = copy.deepcopy(optimizer.state_dict())

import time as time
history=[]
#best_optimizer_state = copy.deepcopy(optimizer.state_dict())
best_acc = 0.0
epochs=50

train_dataloader = train_dataloaders[0]
validate_dataloader = validate_dataloaders[0]

for epoch in range(epochs):
    epoch_start = time.time()
    print('Epoch-{0}/{1} lr: {2}'.format(epoch+1,epochs ,optimizer.param_groups[0]['lr']))
    
    if epoch % 10 == 0:
        train_dataloader = train_dataloaders[int(epoch / 10)]
        valid_dataloader = validate_dataloaders[int(epoch / 10)]

    # Why is this here???
    if  epoch > 14:
        for param in model.parameters():
            param.requires_grad = True
    #print(f"Epoch {epoch+1} of {epochs}")
    train_epoch_loss, train_epoch_accuracy = fit(model,train_dataloader,optimizer,scheduler,criterion)
    val_epoch_loss, val_epoch_accuracy = validate(model,valid_dataloader,optimizer,criterion)

    epoch_end = time.time()
    history.append([epoch+1,train_epoch_loss, train_epoch_accuracy, val_epoch_loss, val_epoch_accuracy,(epoch_end-epoch_start)])
    print(f"Train Loss: {train_epoch_loss:.4f}, Train Acc: {train_epoch_accuracy:.2f},Val Loss: {val_epoch_loss:.4f}, Val Acc: {val_epoch_accuracy:.2f},time : {epoch_end-epoch_start:.2f}")
    torch.save({'history':history},'Master_resnet_his.pth')
    if val_epoch_accuracy > best_acc:
        best_acc = val_epoch_accuracy
        best_model_wts = copy.deepcopy(model.state_dict())

        best_epoch=epoch
        torch.save({
            'epoch': epoch+1,
            'model_state_dict': best_model_wts,
            'loss': criterion,
            'history':history,
            'best_epoch': best_epoch+1,

            }, 'Master_resnet.pth')

Epoch-1/50 lr: 0.0001
Train Loss: 0.3384, Train Acc: 21.88,Val Loss: 0.2906, Val Acc: 29.62,time : 80.44
Epoch-2/50 lr: 0.0001
Train Loss: 0.2953, Train Acc: 29.40,Val Loss: 0.2556, Val Acc: 41.10,time : 76.09
Epoch-3/50 lr: 0.0001
Train Loss: 0.2772, Train Acc: 31.44,Val Loss: 0.2285, Val Acc: 44.71,time : 79.22
Epoch-4/50 lr: 0.0001
Train Loss: 0.2651, Train Acc: 33.39,Val Loss: 0.2107, Val Acc: 52.36,time : 76.66
Epoch-5/50 lr: 0.0001
Train Loss: 0.2515, Train Acc: 36.49,Val Loss: 0.1939, Val Acc: 54.87,time : 67.01
Epoch-6/50 lr: 0.0001
Train Loss: 0.2443, Train Acc: 37.69,Val Loss: 0.1902, Val Acc: 56.19,time : 61.75
Epoch-7/50 lr: 0.0001
Train Loss: 0.2380, Train Acc: 39.43,Val Loss: 0.1892, Val Acc: 51.81,time : 62.45
Epoch-8/50 lr: 0.0001
Train Loss: 0.2297, Train Acc: 40.87,Val Loss: 0.1624, Val Acc: 62.03,time : 84.95
Epoch-9/50 lr: 0.0001
Train Loss: 0.2224, Train Acc: 43.01,Val Loss: 0.1517, Val Acc: 65.23,time : 84.21
Epoch-10/50 lr: 0.0001
Train Loss: 0.2188, Train Acc: 4