In [1]:
%matplotlib inline
from pytorch_metric_learning import losses, miners, samplers, trainers, testers
from pytorch_metric_learning.utils import common_functions
import pytorch_metric_learning.utils.logging_presets as logging_presets
import sys
sys.path.append('../')
from panda_challenge.train_utils import QWKCallback
from panda_challenge.utils import tile
from panda_challenge.dataset import MetricLearningAndClassifcationDatasetMultiCrop
import umap
import torch
from torch import nn
from torch.nn import functional as F
from catalyst.contrib.nn.modules import GlobalMaxPool2d, Flatten, Lambda
import timm
from catalyst import dl
from catalyst.dl import utils
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm
import numpy as np
import pandas as pd
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import train_test_split
import albumentations as A
import json
import os
import warnings
import PIL
import gc
import pickle
from catalyst.contrib.nn.optimizers import RAdam, Lookahead

  from pandas import Panel

numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject


numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject


numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject


unclosed file <_io.BufferedReader name='/home/skolchenko/.conda/envs/deeplearning/lib/python3.7/site-packages/torchvision/_C.so'>


numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject



Define models

In [2]:
class TrunkMultiCropModel(nn.Module):
    def __init__(self, 
                 model_name='resnet34',
                 num_classes=6,
                 embeddings_size=64,
                     **kwargs):
        super().__init__()
        m = timm.create_model(
            model_name,
            **kwargs)
        self.enc = nn.Sequential(*list(m.children())[:-2])
        nc = list(m.children())[-1].in_features
        self.pool = nn.AdaptiveAvgPool2d(output_size=1)
        self.embedder = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(nc, nc//4),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(nc//4, embedding_size))  
        self.classifier = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(nc, nc//4),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(nc//4, num_classes))
        
    def forward(self, x):
        shape = x[0].shape
        n = len(x[0])
        x = x.view(-1, shape[1], shape[2], shape[3])
        # x: bs*N x 3 x 128 x 128
        x = self.enc(x)
        # should be: bs*N x C x 4 x 4
        shape = x.shape
        # concatenate the output for tiles into a single map
        x = x.view(-1, n, shape[1], shape[2], shape[3])
        x = x.permute(0, 2, 1, 3, 4).contiguous()
        x = x.view(-1, shape[1], shape[2]*n, shape[3])
        x = self.pool(x)
        x = torch.squeeze(x)
        x_cls = self.classifier(x)
        x_embedding = self.embedder(x)
        # should be: bs x C x N*4 x 4
        return x_cls, x_embedding
    
    
    
class PickleMetricLearningAndClassifcationDatasetMultiCrop(Dataset):
    def __init__(
        self,
        data_df,
        transforms_json,
        image_dir,
        mean=np.array([0.90949707, 0.8188697, 0.87795304]),
        std=np.array([0.36357649, 0.49984502, 0.40477625]),
        N=16,
        zoom_level=2,
        crop_size=128,
        label_smoothing=0.15,
        output_type='classification',
        *args,
            **kwargs):
        """Prepares pytorch dataset for training
        Generates tiles from coarse slide and returns it

        Args:
            data_df (pd.DataFrame): data.frame with slides id and labels.
            augmentations (albumentations.compose): augmentations.
            image_dir (str): folder with images.
            mask_dir (str): folder with masks.
            crop_size(int): crop size around mask. Default: 128
        Returns
            Dataset

        """
        self.data_df = pd.read_csv(data_df)
        self.image_dir = image_dir
        self.crop_size = crop_size
        self.N = N
        self.transforms = self._get_aug(transforms_json)
        self.mean = mean
        self.std = std
        self.zoom_level = zoom_level
        self.output_type = output_type
        self.label_smoothing = label_smoothing
        self.targets = self.data_df.isup_grade.unique().astype(str)

    def _get_aug(self, arg):
        with open(arg) as f:
            augs = A.from_dict(json.load(f))
        target = {}
        for i in range(1, self.N):
            target['image' + str(i)] = 'image'
        return A.Compose(augs, p=1, additional_targets=target)

    def __len__(self):
        return(len(self.data_df))

    def __getitem__(self, idx):
        """Will load the mask, get random coordinates around/with the mask,
        load the image by coordinates
        """
        slide_id = self.data_df.image_id.values[idx]
        isup_grade = self.data_df.isup_grade.values[idx]
        isup_grade_class = self.data_df.isup_grade.values[idx]
        input_file = os.path.join(self.image_dir, f"{slide_id}_{self.zoom_level}_{self.N}_{self.crop_size}.pkl")
        with open(input_file, "rb") as input_file:
            tiled_images = pickle.load(input_file)
        target_names = ['image' + str(i) if i > 0 else 'image'
                        for i in range(len(tiled_images))]
        tiled_images = dict(zip(
            target_names,
            tiled_images))
        augmented = self.transforms(**tiled_images)
        tiled_images = [augmented[target] for target in target_names]
        tiled_images = np.stack(tiled_images)
        tiled_images = (1.0 - tiled_images/255.0)
        tiled_images = (tiled_images - self.mean)/self.std
        assert len(tiled_images) == self.N
        tiled_images = tiled_images.transpose(0, 3, 1, 2)
        # Fix outputs for each of the tasks
        # To do: move as separate function
        if self.output_type == 'regression':
            isup_grade = np.expand_dims(isup_grade, 0)
            isup_grade = torch.from_numpy(isup_grade).float()
        elif self.output_type == 'classification':
            isup_grade = torch.tensor(isup_grade)
        elif self.output_type == 'ordinal':
            raise NotImplementedError
        elif self.output_type == 'ohe_classification':
            '''
            We can make it asymetric if we use the assumption that
            errors 4-5 are more likely than errors 0-5
            '''
            ohe_isup_grade = np.zeros(6)
            ohe_isup_grade[isup_grade] = 1
            isup_grade = ohe_isup_grade
            isup_grade = isup_grade * (1 - self.label_smoothing) + \
                self.label_smoothing / 6
            isup_grade = torch.from_numpy(isup_grade).float()
        else:
            raise NotImplementedError

        data = {'features': torch.from_numpy(tiled_images).float(),
                'targets': isup_grade,
                'metric_learning_targets': isup_grade_class}
        return(data)

In [None]:
#timm.list_models()

In [7]:
model_name = 'resnet34'
embedding_size = 128
pretrained = True
num_classes = 1

# Create the trunk
model = TrunkMultiCropModel(
    model_name,
    num_classes,
    embedding_size,
    pretrained=pretrained)

In [8]:
TRAIN_LABELS = '/data/personal_folders/skolchenko/panda/train_cleaned.csv'
train_labes = pd.read_csv(TRAIN_LABELS)
data_train, data_val = train_test_split(
    train_labes,
    test_size=0.25,
    random_state=42)
data_train.to_csv(
    '/data/personal_folders/skolchenko/panda/data_train_cleaned.csv',
    index=False)
data_val.to_csv(
    '/data/personal_folders/skolchenko/panda/data_val_cleaned.csv',
    index=False)

In [9]:
train_dataset = PickleMetricLearningAndClassifcationDatasetMultiCrop(
    "/data/personal_folders/skolchenko/panda/data_train_cleaned.csv",
    "../configs/light_transforms_noNorm.json",
    "/data/personal_folders/skolchenko/panda/pickled_tiled_images",
    N=16, 
    zoom_level=1,
    crop_size=128, 
    output_type='regression')

val_dataset = PickleMetricLearningAndClassifcationDatasetMultiCrop(
    "/data/personal_folders/skolchenko/panda/data_val_cleaned.csv",
    "../configs/validation_augs_noNorm.json",
    "/data/personal_folders/skolchenko/panda/pickled_tiled_images",
    N=16, 
    zoom_level=1,
    crop_size=128, 
    output_type='regression')

In [10]:
loss = losses.TripletMarginLoss(margin=0.1)
classification_loss = torch.nn.MSELoss()
miner = miners.MultiSimilarityMiner(epsilon=0.1)
sampler = samplers.MPerClassSampler(train_dataset.targets, m=4, length_before_new_iter=len(train_dataset))
batch_size = 24
loss_funcs = {"metric_loss": loss, "classifier_loss": classification_loss}
mining_funcs = {"tuple_miner": miner}
loss_weights = {"metric_loss": 10, "classifier_loss": 1}                             

In [11]:
class MetricLearningClassifierRunner(dl.Runner):
    def __init__(
        self, 
        mining_funcs,
        loss_funcs,
        loss_weights,
        model = None, 
        device = None,  
      ):
        super().__init__(model, device)
        self.mining_funcs = mining_funcs
        self.loss_funcs = loss_funcs
        self.loss_weights = loss_weights
        
    def get_final_embeddings(self, base_output):
        return self.model["embedder"](base_output.cuda())

    def get_model_output(self, data):
        return self.model(data.cuda())

    def mine_embeddings(self, embeddings, labels):
        return self.mining_funcs["tuple_miner"](embeddings.cuda(), labels.cuda())
    
    def get_classifier_loss(self, logits, labels):
        return self.loss_funcs["classifier_loss"](logits.cuda(), labels.cuda())

    def get_logits(self, embeddings):
        return self.model["classifier"](embeddings)
  
    def get_metric_loss(self, embeddings, labels, indices_tuple):
        return self.loss_funcs["metric_loss"](embeddings, labels, indices_tuple)
    
    def _handle_batch(self, batch):
        batch_metrics = {}
        data  = batch['features']
        labels_classifier = batch['targets']
        labels = batch['metric_learning_targets']
        cls_output, embedding_output = self.get_model_output(data) # trunk output
        indices_tuple = self.mine_embeddings(embedding_output, labels) # mining for metric learning
        metric_loss = self.loss_weights['metric_loss']*self.get_metric_loss(embedding_output, labels, indices_tuple)
        classifier_loss = self.loss_weights['classifier_loss']*self.get_classifier_loss(cls_output, labels_classifier)
        loss = metric_loss + classifier_loss
        batch_metrics["metric_loss"] = metric_loss
        batch_metrics["classifier_loss"] = classifier_loss
        batch_metrics["loss"] = loss
        self.state.batch_metrics.update(**batch_metrics)  
        self.state.output = {"logits": cls_output}
        if self.state.is_train_loader:
            loss.backward()
            self.state.optimizer.step()
            self.state.optimizer.zero_grad()

In [12]:
loaders = {
  "train": DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=64),
  "valid": DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=64)
}

runner = MetricLearningClassifierRunner(
    mining_funcs=mining_funcs,
    loss_weights=loss_weights,
    loss_funcs=loss_funcs)

In [13]:
logdir="./exp_1"

In [14]:
num_epochs = 25
optimizer = RAdam(model.parameters(), lr=1e-4)
optimizer_la = Lookahead(optimizer)
runner.train(
    model=model, 
    optimizer=optimizer_la,
    scheduler=torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, patience=5, factor=0.1),
    loaders=loaders,
    main_metric="classifier_loss",
    num_epochs=num_epochs,
    callbacks=[QWKCallback(qwk_name='simple')],
    verbose=True,
    logdir=logdir
)

1/25 * Epoch (train): 100% 311/312 [02:29<00:00,  2.17it/s, batch_qwk=0.395, classifier_loss=2.078, loss=3.459, metric_loss=1.380]     

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [None]:
num_epochs = 25
optimizer = RAdam(model.parameters(), lr=1e-5)
optimizer_la = Lookahead(optimizer)
runner.train(
    model=model, 
    optimizer=optimizer_la,
    scheduler=torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, patience=5, factor=0.1),
    loaders=loaders,
    main_metric="classifier_loss",
    num_epochs=num_epochs,
    callbacks=[QWKCallback(qwk_name='simple')],
    verbose=True,
    logdir=logdir
)

In [None]:
num_epochs = 25
optimizer = RAdam(model.parameters(), lr=1e-6)
optimizer_la = Lookahead(optimizer)
runner.train(
    model=model, 
    optimizer=optimizer_la,
    scheduler=torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, patience=5, factor=0.1),
    loaders=loaders,
    main_metric="classifier_loss",
    num_epochs=num_epochs,
    callbacks=[QWKCallback(qwk_name='simple')],
    verbose=True,
    logdir=logdir
)

### Check embeddings

In [None]:
model.eval()

In [None]:
train_dataset = PickleMetricLearningAndClassifcationDatasetMultiCrop(
    "/data/personal_folders/skolchenko/panda/data_train_cleaned.csv",
    "../configs/validation_augs_noNorm.json",
    "/data/personal_folders/skolchenko/panda/pickled_tiled_images",
    N=16, 
    zoom_level=1,
    crop_size=128, 
    output_type='regression')

val_dataset = PickleMetricLearningAndClassifcationDatasetMultiCrop(
    "/data/personal_folders/skolchenko/panda/data_val_cleaned.csv",
    "../configs/validation_augs_noNorm.json",
    "/data/personal_folders/skolchenko/panda/pickled_tiled_images",
    N=16, 
    zoom_level=1,
    crop_size=128, 
    output_type='regression')
loaders = {
  "train": DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=48),
  "valid": DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=48)
}

In [None]:
predicted_cls = []
embeddings = []
targets = []
for batch in tqdm(loaders['train'], total=len(loaders['train'])):
    with torch.no_grad():
        pred_cls, pred_emb = model(batch['features'].cuda())
        pred_cls = pred_cls.cpu().numpy()
        pred_emb = pred_emb.cpu().numpy()
        embeddings.extend(pred_emb)
        predicted_cls.extend(pred_cls)
        targets.extend(batch['targets'].cpu().numpy())

In [None]:
preds = np.array([x[0] for x in predicted_cls])
preds[preds<0] = 0
preds[preds>5] = 5
preds = np.round(preds).astype(int)
cohen_kappa_score(targets, preds, weights='quadratic')

In [None]:
predicted_cls_valid = []
embeddings_valid = []
targets_valid = []
for batch in tqdm(loaders['valid'], total=len(loaders['valid'])):
    with torch.no_grad():
        pred_cls, pred_emb = model(batch['features'].cuda())
        pred_cls = pred_cls.cpu().numpy()
        pred_emb = pred_emb.cpu().numpy()
        embeddings_valid.extend(pred_emb)
        predicted_cls_valid.extend(pred_cls)
        targets_valid.extend(batch['targets'].cpu().numpy())

In [None]:
preds_valid = np.array([x[0] for x in predicted_cls_valid])
preds_valid[preds_valid<0] = 0
preds_valid[preds_valid>5] = 5
preds_valid = np.round(preds_valid).astype(int)
cohen_kappa_score(targets_valid, preds_valid, weights='quadratic')

In [None]:
embeddings_valid = np.stack(embeddings_valid)
predicted_cls_valid = np.stack(predicted_cls_valid)
targets_valid = np.stack(targets_valid)
embeddings = np.stack(embeddings)
predicted_cls = np.stack(predicted_cls)
targets = np.stack(targets)

In [None]:
import matplotlib.pyplot as plt
import umap
%matplotlib inline

In [None]:
transformer = umap.UMAP(n_neighbors=15,
                        min_dist=0.01, 
                        spread=2)
transformer.fit(embeddings)
embeddings_umap = transformer.transform(embeddings)
embeddings_valid_umap = transformer.transform(embeddings_valid)

In [None]:
plt.scatter(embeddings_umap[:, 0], embeddings_umap[:, 1], c=[x[0] for x in targets], cmap='YlOrRd', s=0.25)

In [None]:
plt.scatter(embeddings_valid_umap[:, 0], embeddings_valid_umap[:, 1], c=[x[0] for x in targets_valid],
            cmap='YlOrRd', marker='*', s=0.25)

In [None]:
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK
from hyperopt.pyll import scope as ho_scope
from hyperopt.pyll.stochastic import sample as ho_sample
from sklearn.model_selection import StratifiedKFold, cross_val_score
from hyperopt.pyll.stochastic import sample as ho_sample
from functools import partial

In [None]:
def opt_knn(hps, X, y, ncv=10, SEED=42):
    """
    Source: https://www.kaggle.com/fanvacoolt/tutorial-on-hyperopt
    """
    model = KNeighborsRegressor(**hps)
    cv_res = cross_val_score(model, X, y, cv=StratifiedKFold(ncv, random_state=SEED, shuffle=True),
                             scoring='neg_mean_squared_error', n_jobs=-1)

    return {
        'loss': -cv_res.mean(),
        'cv_std': cv_res.std(),
        'status': STATUS_OK
    } 

In [None]:
SEED = 42
hp_space_knn = {
            'weights': hp.choice('weights', ['uniform', 'distance']),
            'n_neighbors': hp.choice('n_neighbors', range(1, 50)),
            }

trials_knn = Trials() 
best_knn = fmin(partial(opt_knn, X=embeddings_umap, y=targets), 
               hp_space_knn, 
               algo=tpe.suggest,
               max_evals=25, # Should be enough 
               trials=trials_knn, 
               rstate=np.random.RandomState(SEED))
print(best_knn)

In [None]:
clf = KNeighborsRegressor(n_neighbors=18, weights='distance')
clf.fit(embeddings_umap, targets.astype(int))
predictions_knn = clf.predict(embeddings_valid_umap)

In [None]:
preds_valid = np.array([x[0] for x in predicted_cls_valid])
preds_valid[preds_valid<0] = 0
preds_valid[preds_valid>5] = 5
preds_valid = np.round(preds_valid).astype(int)
cohen_kappa_score([int(x[0]) for x in targets_valid], preds_valid, weights='quadratic')

In [None]:
preds_valid_knn = np.array([x[0] for x in predictions_knn])
preds_valid_knn[preds_valid_knn<0] = 0
preds_valid_knn[preds_valid_knn>5] = 5
preds_valid_knn = np.round(preds_valid_knn).astype(int)
cohen_kappa_score([int(x[0]) for x in targets_valid], preds_valid_knn, weights='quadratic')

In [None]:
mixed_preds = np.array([x[0] for x in predicted_cls_valid]) + np.array([x[0] for x in predictions_knn])
mixed_preds = mixed_preds / 2
mixed_preds = np.round(mixed_preds)
mixed_preds[mixed_preds<0] = 0
mixed_preds[mixed_preds>5] = 5
mixed_preds = mixed_preds.astype(int)
cohen_kappa_score([int(x[0]) for x in targets_valid], mixed_preds, weights='quadratic')

In [15]:
Lookahead(optimizer)

Lookahead (
Parameter Group 0
    betas: (0.9, 0.999)
    counter: 0
    eps: 1e-08
    lr: 0.0001
    weight_decay: 0
)

In [16]:
optimizer

RAdam (
Parameter Group 0
    betas: (0.9, 0.999)
    counter: 0
    eps: 1e-08
    lr: 0.0001
    weight_decay: 0
)