# Load Data & Train Model
This notebook allows us to load the entire dataset, split it into a proper train/test split and trains the model using K-Fold cross validation.

## Imports

In [15]:
# Essentials
import math
import random
import os
import copy
import glob
import shutil
import numpy as np
import json
import importlib
import time as time

# Torch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Sequential as Seq, Linear as Lin, Conv2d

import torchvision.models as models
from torchvision import datasets, transforms

from torch.utils.data import Dataset, DataLoader

import torch.optim as optim
import timm
from timm.models import create_model
from timm.data import create_transform
from sklearn.metrics import accuracy_score

# Images
import albumentations
import albumentations.pytorch

import cv2

from PIL import Image

# Machine Learning
from sklearn.model_selection import KFold

from barbar import Bar

import utils
importlib.reload(utils)

from utils import get_data, select_gpu, get_model, get_class_weigths, get_files
from utils import My_data, CustomTransforms

## Set Device

In [16]:
device = torch.device(f"cuda:{select_gpu()}" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Selecting GPU 1 with 16967MB free memory
Using device: cuda:1


In [17]:
torch.cuda.device_count()

2

In [18]:
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print(torch.cuda.memory_summary(device=None, abbreviated=False))
    torch.cuda.manual_seed(42)
    torch.cuda.manual_seed_all(42)
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = False

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  222970 KB |    1404 MB |   11168 GB |   11168 GB |
|       from large pool |  206080 KB |    1377 MB |   10067 GB |   10067 GB |
|       from small pool |   16890 KB |      35 MB |    1100 GB |    1100 GB |
|---------------------------------------------------------------------------|
| Active memory         |  222970 KB |    1404 MB |   11168 GB |   11168 GB |
|       from large pool |  206080 KB |    1377 MB |   10067 GB |   10067 GB |
|       from small pool |   16890 KB |      35 MB |    1100 GB |    1100 GB |
|---------------------------------------------------------------

## Load Dataset

This downloads the dataset to the server and unzips it so we can use it. Check the size of the folder before running the rest of the code. Should be around 4GB

In [19]:
model_name = "resnet"

In [20]:
full_dataset = './BreaKHis_v1/histology_slides/breast/**/SOB/**/**/**/*.png'

In [21]:
# All eight classes
classes = ["A", "F", "TA", "PT", "DC", "LC", "MC", "PC"]
benign_classes = classes[:4]
malignant_classes = classes[4:]

# All four zoom levels
zooms = ["40", "100", "200", "400"]

train_dict, test_dict = get_data(full_dataset)

Opening /home/jovyan/txt/train.txt.txt
Opening /home/jovyan/txt/test.txt.txt


In [22]:
# Below is for cross validation with retraining
org_dataset = './dataset/train/original/**/**/*.png'
pert_dataset = f'./dataset/train/pgd_attack/{model_name}/**/**/*.png'

dataset = get_files(org_dataset) + get_files(pert_dataset)
print(len(dataset))

# Create the train_dict
train_dict = {c: {z: [path for path in dataset if path.split("_")[-1].split("-")[0] == c and path.split("_")[-1].split("-")[3] == z] for z in zooms} for c in classes}

14264


## Prepare Data
In this section we do the following:
1. Store all image paths in a dictionary that is filtered on class and zoom level
2. Create a stratified train/test split (based on class and zoom level) and store the image paths of both sets again in a filtered dictionary
3. Copy all images from the raw data source to the structured data folder "dataset/"
4. Create a 5-fold cross validation split for the train dataset
5. Prepare all splits for forward pass of the model

### K-Fold Split
Explanation of K-Fold: https://isheunesu48.medium.com/cross-validation-using-k-fold-with-scikit-learn-cfc44bf1ce6

In [25]:
# Determine number of folds
n_folds = 10

# Store train and validate for every fold
folds = {str(i): {"train": [], "validate": []} for i in range(n_folds)}

kf = KFold(n_splits=n_folds)

for c in classes:
    for z in zooms:

        # For every class and zoom, create a 5-fold split
        for i, (train_index, validate_index) in enumerate(kf.split(train_dict[c][z])):

            # Store paths of all train images in fold i
            fold_train_img = [img for i, img in enumerate(train_dict[c][z]) if i in train_index]

            # Store paths of all validate images in fold i
            fold_validate_img = [img for i, img in enumerate(train_dict[c][z]) if i in validate_index]

            # Add paths to fold i
            folds[str(i)]["train"] += fold_train_img
            folds[str(i)]["validate"] += fold_validate_img

# Shuffle images in each train/validate fold to make sure order of classes is mixed
for i in range(n_folds):
    random.shuffle(folds[str(i)]["train"])
    random.shuffle(folds[str(i)]["validate"])

# Check number of train and validate items per fold
for k, v in folds.items():
    print("Fold", int(k)+1)
    print("Train:", len(v["train"]))
    print("Validate:", len(v["validate"]))
    print()

Fold 1
Train: 12826
Validate: 1438

Fold 2
Train: 12826
Validate: 1438

Fold 3
Train: 12830
Validate: 1434

Fold 4
Train: 12830
Validate: 1434

Fold 5
Train: 12836
Validate: 1428

Fold 6
Train: 12836
Validate: 1428

Fold 7
Train: 12845
Validate: 1419

Fold 8
Train: 12845
Validate: 1419

Fold 9
Train: 12851
Validate: 1413

Fold 10
Train: 12851
Validate: 1413



In [26]:
# Check for duplicates in validation set
sets = []
for i in range(n_folds):
    sets.append(set(folds[str(i)]["validate"]))

for i in sets:
    for j in sets:
        if i == j:
            continue

        duplicates = i.intersection(j)
        if duplicates:
            print("Duplicates found:", len(duplicates))

### Prepare Data for Forward Pass

In [27]:
# initialize transformers
transform = CustomTransforms()

# Create datasets for each fold
train_folds = [My_data(folds[str(i)]["train"], transforms=transform.get_transform('train')) for i in range(n_folds)]
validate_folds = [My_data(folds[str(i)]["validate"], transforms=transform.get_transform('valid')) for i in range(n_folds)]

# Create data loaders for each fold
train_dataloaders = [DataLoader(dataset=train_folds[i], batch_size=4,shuffle=True,num_workers=2,
                                              pin_memory=True,prefetch_factor=2) for i in range(n_folds)]

validate_dataloaders = [DataLoader(dataset=validate_folds[i], batch_size=4,shuffle=True,num_workers=2,
                                              pin_memory=True,prefetch_factor=2) for i in range(n_folds)]

test_dataloader = DataLoader(My_data(get_files("./dataset/test/original/**/**/*.png"), transforms=transform.get_transform('test')))

## Create Model

In [28]:
class FocalLoss(nn.Module):
    def __init__(self, gamma=2.0, class_weights=None):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.class_weights = class_weights

    def forward(self, logits, labels):
        probs = torch.sigmoid(logits)
        ce_loss = nn.BCELoss()(probs, labels)
        # print(type(probs), probs, self.gamma)
        weight = (1 - probs).pow(self.gamma)
        loss = ce_loss  # Initialize loss with cross-entropy loss
        if self.class_weights is not None:
            weight = weight * self.class_weights
            loss = loss * weight
        return loss

## Train Model

This section performs the actual training of the model. We first determine methods fit and validate that will be called during the training. Afterwards, we define the loop that optimizes the model.

In [29]:
def fit(model, dataloader, optimizer,scheduler, criterion):
    #print('Training')
    model.train()
    train_running_loss = 0.0
    train_running_correct = 0
    accum_iter = 4

    for i, (inputs, labels) in enumerate(Bar(dataloader)):
        inputs = inputs.to(device)
        labels = labels.float().to(device)
        optimizer.zero_grad()
        #model.zero_grad(set_to_none=True)
        # Forward pass - compute outputs on input data using the model
        outputs = model(inputs)
        thresholds = [0.5, 0.5, 0.5,0.5,0.5,0.5,0.5,0.5]
        # Compute loss
        loss = criterion(outputs, labels)
        train_running_loss += loss.item()* inputs.size(0)
        # _ , preds = torch.max(outputs.data, 1)
        # Apply sigmoid activation to obtain probabilities
        #preds = (outputs > 0.5).float()
        probs = torch.sigmoid(outputs)
        preds = torch.zeros_like(probs)

        # Set predicted labels based on the threshold
        for i, threshold in enumerate(thresholds):
            preds[:, i] = (probs[:, i] >= threshold).float()
        train_running_correct += (preds == labels).all(dim=1).float().sum()
        # Backpropagate the gradients
        loss /= accum_iter
        loss.backward()

        if ((i + 1) % accum_iter == 0) :
            optimizer.step()
            optimizer.zero_grad()

    scheduler.step()

    train_loss = train_running_loss/len(dataloader.dataset)
    train_accuracy = 100. * train_running_correct/len(dataloader.dataset)
    return train_loss, train_accuracy

In [30]:
def validate(model, dataloader, optimizer, criterion):
    #print('Validating')
    model.eval()
    val_running_loss = 0.0
    val_running_correct = 0
    with torch.no_grad():
        for i, (inputs, labels) in enumerate(dataloader):
            inputs = inputs.to(device)
            labels = labels.float()
            labels = labels.to(device)
            outputs = model(inputs)
            thresholds = [0.5, 0.5, 0.5,0.5,0.5,0.5,0.5,0.5]
            loss = criterion(outputs, labels)

            val_running_loss += loss.item()*inputs.size(0)
            #_, preds = torch.max(outputs.data, 1)
            #preds = (outputs > 0.5).float()
            probs = torch.sigmoid(outputs)
            preds = torch.zeros_like(probs)
            # Set predicted labels based on the threshold
            for i, threshold in enumerate(thresholds):
                preds[:, i] = (probs[:, i] >= threshold).float()
            val_running_correct += (preds == labels).all(dim=1).float().sum()

    val_loss = val_running_loss/len(dataloader.dataset)
    val_accuracy = 100. * val_running_correct/len(dataloader.dataset)
    return val_loss, val_accuracy 

In [31]:
def reset_weights(m):
  '''
    Try resetting model weights to avoid
    weight leakage.
  '''
  for layer in m.children():
   if hasattr(layer, 'reset_parameters'):
    # print(f'Reset trainable parameters of layer = {layer}')
    layer.reset_parameters()

In [32]:
if torch.cuda.is_available():
    torch.cuda.empty_cache()

criterion = FocalLoss(get_class_weigths(train_dict).to(device))
# For fold results
results = {}
epochs=1

for fold in range(n_folds):
    print("-----------------------------------------------")
    print(f"FOLD {fold}")
    print("-----------------------------------------------")

    history=[]
    best_acc = 0.0

    train_dataloader = train_dataloaders[fold]
    valid_dataloader = validate_dataloaders[fold]

    model = get_model(device, model=model_name)
    model.apply(reset_weights)

    optimizer = optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-4)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.5)


    for epoch in range(epochs):
        epoch_start = time.time()
        print('Epoch-{0}/{1} lr: {2}'.format(epoch+1,epochs ,optimizer.param_groups[0]['lr']))
        
        # Why is this here???
        # if  epoch > 14:
        #     for param in model.parameters():
        #         param.requires_grad = True
        #print(f"Epoch {epoch+1} of {epochs}")
        train_epoch_loss, train_epoch_accuracy = fit(model,train_dataloader,optimizer,scheduler,criterion)
        val_epoch_loss, val_epoch_accuracy = validate(model,valid_dataloader,optimizer,criterion)

        epoch_end = time.time()
        history.append([epoch+1,train_epoch_loss, train_epoch_accuracy, val_epoch_loss, val_epoch_accuracy,(epoch_end-epoch_start)])
        print(f"Train Loss: {train_epoch_loss:.4f}, Train Acc: {train_epoch_accuracy:.2f},Val Loss: {val_epoch_loss:.4f}, Val Acc: {val_epoch_accuracy:.2f},time : {epoch_end-epoch_start:.2f}")
        torch.save({'history':history}, f'models/cv/Master_{model_name}-cv-{fold}_his.pth')
        if val_epoch_accuracy > best_acc:
            best_acc = val_epoch_accuracy
            best_model_wts = copy.deepcopy(model.state_dict())

            best_epoch=epoch
            torch.save({
                'epoch': epoch+1,
                'model_state_dict': best_model_wts,
                'loss': criterion,
                'history':history,
                'best_epoch': best_epoch+1,

                }, f'models/cv/Master_{model_name}-best-{fold}.pth')
            
     # Process is complete.
    print('Training process has finished. Saving trained model.')
    # Print about testing
    print('Starting testing')
    
    # Saving the model
    torch.save(model.state_dict(), f'models/cv/Master_{model_name}-final-{fold}.pth')

    # Evaluationfor this fold
    correct, total = 0, 0
    with torch.no_grad():

      # Iterate over the test data and generate predictions
      for i, data in enumerate(test_dataloader, 0):
        
        # Get inputs
        inputs, targets = data

        true_label = torch.argmax(targets.to(device), dim=1)
        confs = model(inputs.to(device))
        pred_label = torch.argmax(confs, dim=1)
    
        total += targets.size(0)
        correct += (pred_label == true_label).sum().item()

      # Print accuracy
      print('Accuracy for fold %d: %d %%' % (fold, 100.0 * correct / total))
      print('--------------------------------')
      results[fold] = 100.0 * (correct / total)
    
    # Print fold results
print(f'K-FOLD CROSS VALIDATION RESULTS FOR {n_folds} FOLDS')
print('--------------------------------')
sum = 0.0
for key, value in results.items():
    print(f'Fold {key}: {value} %')
    sum += value
print(f'Average: {sum/len(results.items())} %')

-----------------------------------------------
FOLD 0
-----------------------------------------------
Epoch-1/1 lr: 0.0001
Train Loss: 0.3014, Train Acc: 28.26,Val Loss: 0.3369, Val Acc: 9.46,time : 135.35
Training process has finished. Saving trained model.
Starting testing
Accuracy for fold 0: 34 %
--------------------------------
-----------------------------------------------
FOLD 1
-----------------------------------------------
Epoch-1/1 lr: 0.0001
Train Loss: 0.2997, Train Acc: 28.94,Val Loss: 0.2913, Val Acc: 30.53,time : 94.75
Training process has finished. Saving trained model.
Starting testing
Accuracy for fold 1: 45 %
--------------------------------
-----------------------------------------------
FOLD 2
-----------------------------------------------
Epoch-1/1 lr: 0.0001
Train Loss: 0.2950, Train Acc: 29.74,Val Loss: 0.3344, Val Acc: 29.85,time : 100.81
Training process has finished. Saving trained model.
Starting testing
Accuracy for fold 2: 48 %
------------------------