In [1]:
import builtins
builtins.seed = 1
import torch
torch.manual_seed(builtins.seed)
#
from torch.optim import SGD, RMSprop
import torchvision.models as models 
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
import pickle
import json

In [2]:
base_path = '/data/lindsay/HECellClassification/'
import sys
sys.path.append(base_path)

In [3]:
from nn.learner import Learner, Config
from image.data import HistoDataset
from slidescore.data import CellClassificationDataset 
from slidescore.client import SlidescoreClient
from nn.models import iv3, densenet161
from image.utils import *

In [4]:
config = Config()
config.minibatch_size=16
config.ExponentialLR_gamma = .9
config.normalize_loss = False
config.device = 'cuda:0'

## Assign label class number

In [5]:
with open(base_path + 'classification/cell_labels_translate.json', 'r') as f:
    reverse_translate_labels = json.load(f)

annotation_classes = {}
i = 0
for v in reverse_translate_labels.values():
    if annotation_classes.get(v, -1) == -1:
        annotation_classes[v] = i
        i += 1

print(annotation_classes)

{'tumor': 0, 'lymphocyte': 1}


## Create training (and validation) dataset from all trainig samples 

In [6]:
annotation_fpath= '/data/lindsay/HECellClassification/IID/mrr.txt'
a = pd.read_csv(annotation_fpath, sep='\t', dtype=str)
annotation_by=['l.leek@nki.nl']
scores = a[a['By'].apply(lambda b: b in annotation_by)].copy().reset_index(drop=True)
annotation_classes = {v: i for i, v in enumerate(scores['Question'].unique())}

In [7]:
# See the example annotation under classification/. This can be directly downloaded from slidescore
cell = CellClassificationDataset(key_fpath='/data/lindsay/HECellClassification/IID/IID_slidescore_training.key', 
                                 annotation_fpath= '/data/lindsay/HECellClassification/IID/mrr.txt', 
                                 annotation_by=['l.leek@nki.nl'],#, 'name_of_annotator_2', '...'], 
                                 shuffle=True, #for the batches
                                 server='https://slidescore.nki.nl', 
                                 channel_first=True,  # (channel last is possible but not default in pytorch)
                                 sample_size=256) #width and height, bcs only squares
  
# change string labels to class numbers that was created in the previous block
cell.labels.label = cell.labels.label.apply(lambda x: reverse_translate_labels[x])
cell.annotation_classes = annotation_classes



## Load previously saved slide score tiles cache to avoid downloading them

In [20]:
#cell.client.load_cache(path='/data/lindsay/HECellClassification/IID/')

## Split the training and validation

In [21]:
training_objects, validation_objects = cell.split(r=.9)

## Create data loaders

In [22]:
#from pytorch; transformers, Siamak already tested and selected those but you can play around
training_data = HistoDataset(training_objects.get_samples,
                             provides_minibatches=False,
                             minibatch_size=config.minibatch_size,
                             data_transforms=HistoDataset.histology_transforms,
                             im_normalize=True,
                             output_type='long')

validation_data = HistoDataset(validation_objects.get_samples,
                               provides_minibatches=False,
                               minibatch_size=config.minibatch_size,
                               im_normalize=True,
                               output_type='long')

## Create a simple feed-forward model. (And load previously trained weights) 

In [23]:
# model = iv3(num_classes=len(cell.annotation_classes))
model = densenet161(num_classes=6)#len(cell.annotation_classes))

## Setup the training

In [24]:

#apply model weights
config.Adam_lr = 0.00001
learner = Learner(model=model,
                  # weights for the classe samples can be passed here
                  criterion=torch.nn.CrossEntropyLoss(weight=torch.Tensor([1., 1., 1., 1., 1., 1.])),
                  training_dataset=training_data,
                  validation_dataset=validation_data,
                  config=config)

# If learning rate warm-up needs to be done, it should be here by accessing learner.optimizer and learner.lr_scheduler.

In [25]:
#knowledge distillation

learner.load_model_state('/data/lindsay/HECellClassification/IID/cc_dense161_256p_6class_v01_3adam1e5.pkl')
# if some layers need to be frozen, it should be done here

In [26]:
learner.model.classifier = torch.nn.Linear(in_features=2208, out_features=2, bias = True)

In [27]:
learner.model.classifier

Linear(in_features=2208, out_features=2, bias=True)

## Start the training

In [28]:
#Validation epochs; you can set to 1; or 5 is also good
#Logs and stuff will be here

learner.train(num_epochs=1, validation_epochs=1)
# validation_epochs arguments says run validation epoch every validation_epochs epochs. Did that make sense?



RuntimeError: Tensor for 'out' is on CPU, Tensor for argument #1 'self' is on CPU, but expected them to be on GPU (while checking arguments for addmm)

## Save the cache

In [None]:
# Save immidiately after training is done (at least for 1 epoch) to avoid downloading in future. Later load the cache as was shown above.
cell.client.save_cache(path='/data/lindsay/HECellClassification/IID/cache.pkl')

## Save model

In [None]:
learner.save_model_state('/data/lindsay/HECellClassification/IID/trained_weights.pkl')