In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import sys
sys.path.append("..")

In [None]:
# Maps each class index to its human-readable name.
# The position of a name in the list below is its class index (0-27).
label_to_string = dict(enumerate([
    'Nucleoplasm',
    'Nuclear membrane',
    'Nucleoli',
    'Nucleoli fibrillar center',
    'Nuclear speckles',
    'Nuclear bodies',
    'Endoplasmic reticulum',
    'Golgi apparatus',
    'Peroxisomes',
    'Endosomes',
    'Lysosomes',
    'Intermediate filaments',
    'Actin filaments',
    'Focal adhesion sites',
    'Microtubules',
    'Microtubule ends',
    'Cytokinetic bridge',
    'Mitotic spindle',
    'Microtubule organizing center',
    'Centrosome',
    'Lipid droplets',
    'Plasma membrane',
    'Cell junctions',
    'Mitochondria',
    'Aggresome',
    'Cytosol',
    'Cytoplasmic bodies',
    'Rods & rings',
]))

### First load in the data

In [None]:
import random
from pathlib import Path

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.optimize as opt
import torch
import torch.utils.data
from tqdm import tqdm_notebook

from pytorch_toolbox.utils.core import to_numpy
from src.data import DataPaths, Image, open_rgby

#### Load the training images

In [None]:
from pytorch_toolbox.fastai.fastai import vision

In [None]:
# Several files share the same 36-character id prefix, so collapse the file
# names to their unique prefixes before building one path per image id.
train_image_ids = np.unique([p.name[:36] for p in DataPaths.TRAIN_IMAGES.glob("*")])
train_paths = [Path(DataPaths.TRAIN_IMAGES, img_id) for img_id in train_image_ids]

test_image_ids = np.unique([p.name[:36] for p in DataPaths.TEST_IMAGES.glob("*")])
test_paths = [Path(DataPaths.TEST_IMAGES, img_id) for img_id in test_image_ids]

In [None]:
# Pick one training image at random (no seed is set, so the choice changes
# between runs), load it with open_rgby and display it.
sample_path = random.sample(train_paths, 1)[0]
sample_img = open_rgby(sample_path)
plt.imshow(sample_img)

##### Image augmentation

In [None]:
from functools import partial

In [None]:
from albumentations import (
    HorizontalFlip, IAAPerspective, ShiftScaleRotate, CLAHE, RandomRotate90,
    Transpose, ShiftScaleRotate, Blur, OpticalDistortion, GridDistortion, HueSaturationValue,
    IAAAdditiveGaussianNoise, GaussNoise, MotionBlur, MedianBlur, IAAPiecewiseAffine,
    IAASharpen, IAAEmboss, RandomContrast, RandomBrightness, Flip, OneOf, Compose, ElasticTransform,
    Resize
)
import numpy as np

def strong_aug(p=0.5):
    """Build the albumentations pipeline used to augment training images.

    The pipeline applies, in order: a random 90-degree rotation, a random
    flip, a shift/scale/rotate step and a resize to 256x256. The last two
    steps are created with ``always_apply=True``.

    Args:
        p: Probability handed to ``Compose`` for the pipeline as a whole.

    Returns:
        The composed albumentations transform.
    """
    geometric_jitter = ShiftScaleRotate(
        shift_limit=0.0625,
        scale_limit=0.2,
        rotate_limit=45,
        always_apply=True,
    )
    to_fixed_size = Resize(height=256, width=256, always_apply=True)
    # ElasticTransform(p=0.5) is currently disabled.
    pipeline = [RandomRotate90(), Flip(), geometric_jitter, to_fixed_size]
    return Compose(pipeline, p=p)

def albumentations_transform_wrapper(image, augment_fn):
    """Apply an albumentations-style transform and return only the image.

    Args:
        image: The image to augment; passed to ``augment_fn`` as ``image=``.
        augment_fn: Callable accepting ``image=...`` and returning a mapping
            that contains an ``'image'`` entry.

    Returns:
        The value stored under ``'image'`` in the transform's result.
    """
    return augment_fn(image=image)['image']

# Bind the pipeline (built with p=1) so that callers only have to pass the image.
augment_fn = partial(albumentations_transform_wrapper, augment_fn=strong_aug(p=1))

In [None]:
# Show one augmented version of the sample image.
plt.imshow(augment_fn(sample_img))

Plot several independently augmented versions of the sample image

In [None]:
# 2x4 grid; every panel calls augment_fn again on the same sample image.
_, axs = plt.subplots(2, 4, figsize=(12, 12))
axs = axs.flatten()
for ax in axs:
    ax.imshow(augment_fn(sample_img))

#### Load the training labels

In [None]:
# Read the label table and turn each whitespace-separated 'Target' string
# into a list of integer class indices.
def _parse_target(target_string):
    return [int(class_id) for class_id in target_string.split()]

labels_df = pd.read_csv(DataPaths.TRAIN_LABELS)
labels_df['Target'] = list(map(_parse_target, labels_df['Target']))

In [None]:
# Preview the first rows of the label table.
labels_df.head()

In [None]:
# Look up the target list of the sampled image by its id (the path's file name).
is_sample_row = labels_df["Id"] == sample_path.name
label = labels_df.loc[is_sample_row, 'Target'].values[0]

In [None]:
# Display the class indices found for the sampled image.
label

##### Generate one hot labels

In [None]:
def _one_hot(class_ids, n_classes):
    """Return a float32 vector of length ``n_classes`` with ones at ``class_ids``."""
    encoded = np.zeros(n_classes, dtype=np.float32)
    encoded[np.asarray(class_ids, dtype=int)] = 1
    return encoded


# Encode in Id order: the encoded labels are later paired index-for-index with
# the Id-sorted `train_paths`, so encoding in raw CSV row order could attach
# labels to the wrong images whenever the CSV is not already sorted by Id.
train_labels = labels_df.sort_values("Id")["Target"]
# The loop variable is deliberately not called `label`, so the notebook-level
# `label` of the sampled image is not overwritten.
train_labels_one_hot = [
    _one_hot(class_ids, len(label_to_string))
    for class_ids in tqdm_notebook(train_labels)
]

In [None]:
# Inspect the first encoded label vector (uncomment the next line to compare
# it with the corresponding raw target list).
# train_labels[0]
train_labels_one_hot[0]

#### Now generate training input/label pairs

In [None]:
# Rebuild the list of training image paths (one per unique 36-character id)
# and order it by file name.
train_image_ids = np.unique([p.name[:36] for p in DataPaths.TRAIN_IMAGES.glob("*")])
train_paths = sorted(
    [Path(DataPaths.TRAIN_IMAGES, img_id) for img_id in train_image_ids],
    key=lambda p: p.name,
)

In [None]:
# Order the label rows by image id, ascending.
labels_df = labels_df.sort_values(by="Id", ascending=True)

In [None]:
# Enforce (rather than merely display) that image paths and label rows line up
# one-to-one by Id; a mismatch now stops a Run All instead of passing silently.
paths_match_labels = [p.name for p in train_paths] == labels_df["Id"].tolist()
assert paths_match_labels, "train_paths and labels_df['Id'] are not in the same order"
paths_match_labels

In [None]:
# Target lists as an array, in the current row order of labels_df.
train_labels = labels_df["Target"].values

In [None]:
# Every image path needs exactly one label entry; fail loudly if the counts differ.
lengths_match = len(train_paths) == len(train_labels)
assert lengths_match, f"{len(train_paths)} image paths but {len(train_labels)} label rows"
lengths_match

#### Load data into dataset

In [None]:
from functools import partial
from src.data import ProteinClassificationDataset
from pytorch_toolbox.utils.image import normalize, denormalize, tensor2img

In [None]:
# ImageNet RGB statistics extended to four channels: the fourth channel reuses
# the red-channel mean and standard deviation.
# The blue-channel standard deviation is 0.225 (it was mistyped as 0.224).
four_channel_image_net_stats = {
    'mean': [0.485, 0.456, 0.406, 0.485],
    'sd': [0.229, 0.224, 0.225, 0.229]
}
# Normalisation and its inverse share the same statistics so they stay consistent.
four_channel_image_net_normalize = partial(normalize, **four_channel_image_net_stats)
four_channel_image_net_denormalize = partial(denormalize, **four_channel_image_net_stats)

In [None]:
# Dataset over all training images, with augmentation and normalisation.
# NOTE(review): relies on train_labels_one_hot being in the same order as train_paths.
ds = ProteinClassificationDataset(train_paths, labels=train_labels_one_hot, augment_fn=augment_fn, normalization_fn=four_channel_image_net_normalize)

In [None]:
# Despite the name, `batch` is the first item yielded by iterating the dataset
# directly (no DataLoader is involved here); it is read as a mapping with
# 'input', 'name' and 'label' entries.
batch = next(iter(ds))
inp, name, label = batch['input'], batch['name'], batch['label']

In [None]:
# Undo the normalisation and display the dataset item as an image.
plt.imshow(tensor2img(inp, denorm_fn=four_channel_image_net_denormalize))

#### Define the loss to be used

In [None]:
import torch.nn as nn
from pytorch_toolbox.fastai_extensions.loss import LossWrapper, FocalLoss

#### Create the data bunch

In [None]:
from functools import partial
from sklearn.model_selection import ShuffleSplit
from pytorch_toolbox.fastai_extensions.basic_data import DataBunch

In [None]:
# Single shuffled split holding out 10% of the samples, with a fixed seed (42).
# X is pre-bound to train_paths, so the result is called with no arguments.
shuffle_split_method = partial(ShuffleSplit(n_splits=1, test_size=0.1, random_state=42).split, X=train_paths)

In [None]:
def get_splits(split_method):
    """Return the first split produced by ``split_method``.

    Args:
        split_method: Zero-argument callable returning an iterable of splits.

    Returns:
        The first element of that iterable.

    Raises:
        StopIteration: If ``split_method()`` yields nothing.
    """
    splits = iter(split_method())
    first_split = next(splits)
    return first_split

In [None]:
def create_data_bunch(split_method, augment_fn, normalization_fn, num_workers=None, train_bs=64, val_bs=None, test_bs=None):
    """Build a DataBunch with train, validation and test datasets.

    The train/validation indices come from the first split of ``split_method``.
    Samples are taken from the notebook-level ``train_paths``,
    ``train_labels_one_hot`` and ``test_paths``.

    Args:
        split_method: Zero-argument callable yielding ``(train_idx, val_idx)`` pairs.
        augment_fn: Augmentation applied to the training dataset only.
        normalization_fn: Normalisation applied to all three datasets.
        num_workers: Worker count for the data loaders. ``None`` (the default)
            resolves to ``fastai.defaults.cpus``.
        train_bs: Training batch size.
        val_bs: Validation batch size; defaults to ``2 * train_bs``.
        test_bs: Test batch size; defaults to ``2 * train_bs``.

    Returns:
        The object returned by ``DataBunch.create``.
    """
    if num_workers is None:
        # Resolved at call time: a `fastai.defaults.cpus` default in the
        # signature is evaluated when this cell runs, which on a fresh kernel is
        # before the notebook imports `fastai` and therefore a NameError.
        import pytorch_toolbox.fastai.fastai as fastai
        num_workers = fastai.defaults.cpus

    train_idx, val_idx = get_splits(split_method)
    if val_bs is None: val_bs = train_bs * 2
    if test_bs is None: test_bs = train_bs * 2

    # Convert once so both the train and validation subsets index the same arrays.
    all_paths = np.array(train_paths)
    all_labels = np.array(train_labels_one_hot)

    train_ds = ProteinClassificationDataset(inputs=all_paths[train_idx],
                                            labels=all_labels[train_idx],
                                            augment_fn=augment_fn,
                                            normalization_fn=normalization_fn)
    # Validation and test data are not augmented.
    val_ds = ProteinClassificationDataset(inputs=all_paths[val_idx],
                                          labels=all_labels[val_idx],
                                          normalization_fn=normalization_fn)
    test_ds = ProteinClassificationDataset(inputs=np.array(test_paths),
                                           normalization_fn=normalization_fn)

    data = DataBunch.create(train_ds, val_ds, test_ds,
                            num_workers=num_workers,
                            collate_fn=ProteinClassificationDataset.collate_fn,
                            train_bs=train_bs,
                            val_bs=val_bs,
                            test_bs=test_bs)
    return data

In [None]:
# Build the data bunch; num_workers=0 is passed explicitly instead of relying
# on the function's default.
data = create_data_bunch(split_method=shuffle_split_method,
                         augment_fn=augment_fn,
                         normalization_fn=four_channel_image_net_normalize,
                         num_workers=0)

In [None]:
# Pull the first batch from the training data loader.
batch = next(iter(data.train_dl))

In [None]:
# The batch is read as (inputs, targets), where targets holds 'name' and 'label'.
inp, name, label = batch[0], batch[1]['name'], batch[1]['label']

In [None]:
# Display the fifth sample of the batch after undoing the normalisation.
plt.imshow(tensor2img(inp[4], denorm_fn=four_channel_image_net_denormalize))

#### Initializing the model

In [54]:
import pytorch_toolbox.fastai.fastai as fastai
from pytorch_toolbox.fastai_extensions.basic_train import Learner
from src.models import cbam_resnet50_four_channel_input, resnet50_four_channel_input, resnet34_four_channel_input

##### Initialize the pretrained ResNet34 model

In [55]:
# Build the four-channel ResNet34 and count the flattened layers in three
# slices of the model: [:6], [6:9] and [9:].
model = resnet34_four_channel_input(pretrained=True)
n_starting_layers = len(fastai.flatten_model(model[:6]))
n_middle_layers = len(fastai.flatten_model(model[6:9]))
# n_head is computed for reference only; it is not used by the split below.
n_head = len(fastai.flatten_model(model[9:]))
# Split at the two slice boundaries (presumably yielding three layer groups,
# matching the three-element learning-rate lists used later -- TODO confirm).
layer_groups = fastai.split_model_idx(model, [n_starting_layers, n_starting_layers + n_middle_layers])

##### Initialize the pretrained ResNet50 model

In [56]:
# model = resnet50_four_channel_input(pretrained=True)
# n_starting_layers = len(fastai.flatten_model(model[:6]))
# n_middle_layers = len(fastai.flatten_model(model[6:9]))
# n_head = len(fastai.flatten_model(model[9:]))
# layer_groups = fastai.split_model_idx(model, [n_starting_layers, n_starting_layers + n_middle_layers])

##### Initialize the CBAM-ResNet50 model (`pretrained=False`)

In [57]:
# model = cbam_resnet50_four_channel_input(pretrained=False)
# n_starting_layers = len(fastai.flatten_model(model[:6]))
# n_middle_layers = len(fastai.flatten_model(model[6:9]))
# n_head = len(fastai.flatten_model(model[9:]))
# layer_groups = fastai.split_model_idx(model, [n_starting_layers, n_starting_layers + n_middle_layers])

#### Initialize callbacks

##### For extracting relevant information from output of our data bunch

In [None]:
from pytorch_toolbox.fastai_extensions.callbacks import NameExtractionTrainer

##### For saving the results of our training

In [None]:
from collections import defaultdict

In [None]:
class OutputRecorder(fastai.LearnerCallback):
    """Record the misclassified validation samples of every epoch.

    For each batch the callback keeps the denormalised inputs, the sample
    names, the labels (when present), the sigmoid probabilities, the
    thresholded predictions and the loss. At the end of every validation batch,
    the samples whose prediction differs from the label in at least one class
    are appended to ``history[('VAL', epoch)]``.
    """
    _order = -10

    def __init__(self, learn):
        super().__init__(learn)
        # Maps (phase, epoch) to a list of per-sample dicts.
        self.history = defaultdict(list)
        self.phase = None
        self.key = None
        self.current_batch = dict()

    def on_batch_begin(self, last_input, last_target, epoch, train, **kwargs):
        """Determine the phase and capture the batch's inputs, names and labels."""
        if train:
            self.phase = 'TRAIN'
        elif last_target.get('label') is not None:
            self.phase = 'VAL'
        else:
            # No labels available: this is a test batch.
            self.phase = 'TEST'
        self.key = (self.phase, epoch)

        # Start from an empty dict so nothing recorded for the previous batch
        # (for example its labels) can leak into this one.
        self.current_batch = {
            'input': tensor2img(last_input, denorm_fn=four_channel_image_net_denormalize),
            'name': last_target['name'],
        }
        if self.phase in ('TRAIN', 'VAL'):
            self.current_batch['label'] = to_numpy(last_target['label'])

    def on_loss_begin(self, last_output, epoch, **kwargs):
        """Store the probabilities and the 0/1 predictions for the batch."""
        # The model output is treated as logits elsewhere in this notebook (the
        # metric thresholds it at 0.0 and the test recorder applies a sigmoid),
        # so it is converted to probabilities before thresholding at 0.5.
        probs = to_numpy(torch.sigmoid(last_output))
        self.current_batch['prediction_probs'] = probs
        self.current_batch['prediction'] = (probs >= 0.5).astype(probs.dtype)

    def on_batch_end(self, epoch, **kwargs):
        """Store the loss and, during validation, save misclassified samples."""
        self.current_batch['loss'] = to_numpy(self.learn.loss_func.focal_loss.loss)
        # Only validation batches are recorded; returning early also avoids
        # touching labels that do not exist for test batches.
        if self.phase != "VAL":
            return

        prediction = self.current_batch['prediction']
        label = self.current_batch['label']
        # A sample is kept when at least one class is predicted incorrectly.
        misclassified = np.where((prediction != label).any(axis=1))[0]
        for idx in misclassified:
            sample_to_save = {k: v[idx] for k, v in self.current_batch.items()}
            self.history[self.key].append(sample_to_save)
    

##### Define metric to track

In [None]:
def accuracy(preds,targs,th=0.0):
    """Element-wise accuracy of thresholded predictions.

    Args:
        preds: Tensor of model outputs; entries greater than ``th`` count as 1.
        targs: Tensor of targets; cast to int before comparison.
        th: Decision threshold applied to ``preds``.

    Returns:
        Scalar tensor with the fraction of entries where prediction and target agree.
    """
    predicted = preds.gt(th).int()
    matches = predicted.eq(targs.int())
    return matches.float().mean()

#### Initialize Learner

In [None]:
# Assemble the learner: focal loss (wrapped in LossWrapper), the name-extraction
# callback, an OutputRecorder created per learner via callback_fns, and the
# thresholded accuracy metric.
learner = Learner(data,
                  layer_groups=layer_groups,
                  model=model, 
                  loss_func=LossWrapper([
                      FocalLoss()
                  ]),
                  callbacks=[NameExtractionTrainer()],
                  callback_fns=[OutputRecorder],
                  metrics=[accuracy])

##### Find optimal learning rate

In [None]:
import warnings
# NOTE: this silences every warning for the rest of the kernel session, not
# only those raised during the learning-rate search.
warnings.filterwarnings('ignore')
# Three-element start/end learning rates (presumably one per layer group),
# explored over 100 iterations.
learner.lr_find(start_lr=[1e-5] * 3, end_lr=[10] * 3, num_it=100)

In [None]:
# Plot what the learner's recorder collected during the learning-rate search.
learner.recorder.plot()

##### Fit the model for real this time

In [None]:
# Train for a cycle length of 3 with a maximum learning rate of 2e-2 in each of
# the three entries.
learner.fit_one_cycle(cyc_len=3, max_lr=[2e-2] * 3)

#### Define a recorder for the test results

In [None]:
from pytorch_toolbox.utils.core import to_numpy

In [None]:
class TestRecorder(fastai.Callback):
    """Collect sample names and sigmoid probabilities while predicting.

    After a prediction run, ``names`` holds the names seen in every batch and
    ``prob_preds`` holds one probability array per sample. Inputs are not stored.
    """
    _order = -10

    def __init__(self):
        self.names, self.prob_preds = [], []

    def on_batch_begin(self, last_input, last_target, train, **kwargs):
        """Remember the names of the samples in the current batch."""
        batch_names = last_target['name']
        self.names.extend(batch_names)

    def on_loss_begin(self, last_output, **kwargs):
        """Convert the batch output to probabilities and store them per sample."""
        batch_probs = torch.sigmoid(last_output)
        self.prob_preds.extend(to_numpy(batch_probs))

##### Run prediction on learner

In [None]:
# Run the learner over the test data loader; the recorder accumulates the
# sample names and predicted probabilities batch by batch.
test_recorder = TestRecorder()
learner.predict_on_dl(dl=learner.data.test_dl, callbacks=[test_recorder])

In [None]:
def _sigmoid_np(x):
    """Element-wise logistic function for NumPy inputs."""
    return 1.0 / (1.0 + np.exp(-x))


def F1_soft(preds,targs,th=0.5,d=50.0):
    """Differentiable per-class F1 score.

    Predictions are soft-thresholded with a steep sigmoid, ``sigmoid(d * (preds - th))``,
    instead of a hard cut, so the score varies smoothly with ``th``.

    Args:
        preds: Array of shape (n_samples, n_classes) with predicted probabilities.
        targs: Array of the same shape with 0/1 targets.
        th: Threshold, a scalar or one value per class.
        d: Steepness of the soft threshold.

    Returns:
        Array of length n_classes with the soft F1 score of each class.
    """
    preds = _sigmoid_np(d*(preds - th))
    # `np.float` was removed in NumPy 1.24; the builtin `float` is the same dtype.
    targs = targs.astype(float)
    # The small constant keeps the ratio defined for classes with no positives.
    score = 2.0*(preds*targs).sum(axis=0)/((preds+targs).sum(axis=0) + 1e-6)
    return score

def fit_val(x,y):
    """Fit one decision threshold per class by maximising the soft F1 score.

    Args:
        x: Array of shape (n_samples, n_classes) with predicted probabilities.
        y: Array of the same shape with 0/1 targets.

    Returns:
        Array of length n_classes with the fitted thresholds.
    """
    # One threshold per class, taken from the data rather than from a global
    # lookup table, all starting at 0.5.
    params = 0.5*np.ones(np.shape(x)[-1])
    wd = 1e-5
    # Residuals: distance of each class's F1 from 1, plus a small penalty that
    # pulls the thresholds towards 0.5.
    error = lambda p: np.concatenate((F1_soft(x,y,p) - 1.0,
                                      wd*(p - 0.5)), axis=None)
    p, success = opt.leastsq(error, params)
    return p

In [None]:
# Turn each row of class probabilities into the submission string: the most
# likely class is always included, together with every class above 0.5,
# de-duplicated, sorted and joined with spaces.
x = np.stack(test_recorder.prob_preds)
predicted = []
for class_probs in tqdm_notebook(x):
    top_class = [np.argmax(class_probs)]
    confident_classes = np.where(class_probs > 0.5)[0]
    chosen = np.unique(np.concatenate((top_class, confident_classes)))
    predicted.append(" ".join(str(class_id) for class_id in chosen))

In [None]:
# One row per test sample: the recorded name and its space-separated class indices.
submission_df = pd.DataFrame({
    "Id": test_recorder.names,
    "Predicted": predicted
})

In [None]:
# Write the submission next to the notebook, without the DataFrame index column.
submission_df.to_csv("submission.csv", index=False)