# Library

In [1]:
import os
import sys
import gc
import time
import random
import cv2
import glob
import requests
import json

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from PIL import Image
from multiprocessing import cpu_count
# from tqdm import tqdm_notebook as tqdm
from tqdm import tqdm
from multiprocessing import Pool

import torch
import torch.nn as nn
import torch.nn.init as init
import torch.optim as optim
import torch.nn.functional as F
import torchvision.models as models
import torchvision.transforms as transforms

from torch.utils.data import DataLoader, Dataset, SubsetRandomSampler

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score

import albumentations as A
from albumentations import torch as AT

# Configuration

In [2]:
config = {
    # settings
    'seed': 43,
    'num_workers': cpu_count(),
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
    
    # data
    'data_path': os.path.join('..', 'input'),
    
    # optimizer
    'optimizer_name': 'Adam',
    'lr': 0.0001,
    
    # model
    'model_name': 'resnet34',
    'pretrained': True,
    'weight_path': None,
    
    # loss
    'loss_name': 'CrossEntropy',
    
    # transforms
#     'transforms': 'pytorch',
    'transforms': 'albumentations',
    
    'pytorch': {
        'resize': {'train': False, 'test': False, 'train_size': 256, 'test_size': 224},
        'centerCrop': {'train': True, 'test': False, 'train_size': 224,'test_size': 224},
        'randomHorizontalFlip': {'train': True, 'test': False},
        'randomRotation': {'train': True, 'test': True, 'degrees': 180},
        'toTensor': {'train': True, 'test': True},
        'normalize': {'train': True, 'test': True},
    },
    
    'albumentations': {
        'resize': {'train': False, 'test': False, 'train_size': 256, 'test_size': 224},
        'centerCrop': {'train': True, 'test': False, 'train_size': 224,'test_size': 224},
        'horizontalFlip': {'train': True, 'test': False},
        'rotate': {'train': True, 'test': False, 'limit': 180},
        'clahe': {'train': True, 'test': True},
        'gaussNoise': {'train': True, 'test': True},
        'randomBrightness': {'train': True, 'test': True},
        'randomContrast': {'train': True, 'test': True},
        'randomBrightnrssContrast': {'train': False, 'test': False},
        'hueSaturationValue': {'train': True, 'test': True},
        'toTensor': {'train': True, 'test': True},
        'normalize': {'train': True, 'test': True},
    },
    
    # train settings
    'image_size': 256,
    'epochs': 100,
    'patience': 10,
    'verbose': True,
    'batch_size': 32,
    'valid_size': 0.1,
}

# Function

## seed

In [3]:
def seed_everything():
    seed = config['seed']
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

## preprocess

In [4]:
def crop_image_from_gray(image, tol=7):
    if image.ndim == 2:
        mask = image > tol
        return image[np.ix_(mask.any(1), mask.any(0))]
   
    elif image.ndim == 3:
        gray_image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
        mask = gray_image > tol
        
        check_shape = image[:,:,0][np.ix_(mask.any(1), mask.any(0))].shape[0]
        if (check_shape == 0): 
            return image
        
        else:
            imageR = image[:,:,0][np.ix_(mask.any(1), mask.any(0))]
            imageG = image[:,:,1][np.ix_(mask.any(1), mask.any(0))]
            imageB = image[:,:,2][np.ix_(mask.any(1), mask.any(0))]
            image = np.stack([imageR, imageG, imageB], axis=-1)
        
        return image
    
def preprocess(image_name):
    image = cv2.imread(image_name)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = crop_image_from_gray(image)
    image = cv2.resize(image, (config['image_size'], config['image_size']))
    image = cv2.addWeighted(image, 4, cv2.GaussianBlur(image, (0, 0), 30), -4, 128)

    return image

## getter

In [5]:
def get_model():
    model_name = config['model_name']
    pretrained = config['pretrained']
    if model_name == 'resnet18':
        model = models.resnet18(pretrained=pretrained)
    elif model_name == 'resnet34':
        model = models.resnet34(pretrained=pretrained)
    elif model_name == 'resnet50':
        model = models.resnet50(pretrained=pretrained)
    elif model_name == 'resnet101':
        model = models.resnet101(pretrained=pretrained)
    elif model_name == 'resnet152':
        model = models.resnet152(pretrained=pretrained)
    elif model_name == 'vgg11_bn':
        model = models.vgg11_bn(pretrained=pretrained)
    elif model_name == 'vgg13_bn':
        model = models.vgg13_bn(pretrained=pretrained)
    elif model_name == 'vgg16_bn':
        model = models.vgg16_bn(pretrained=pretrained)
    elif model_name == 'vgg19_bn':
        model = models.vgg19_bn(pretrained=pretrained)
    elif model_name == 'inception_v3':
        model = models.inception_v3(pretrained=pretrain)
        
    return model

def get_optimizer(params): 
    optimizer_name = config['optimizer_name']
    lr = config['lr']
    if optimizer_name == 'SGD':
        optimizer = optim.SGD(params=params, lr=lr)
    elif optimizer_name == 'Adam':
        optimizer = optim.Adam(params=params, lr=lr)
    elif optimizer_name == 'Adagrad':
        optimizer = optim.Adagrad(params=params, lr=lr)
    elif optimizer_name == 'Adadelta':
        optimizer = optim.Adadelta(params=params, lr=lr)
    elif optimizer_name == 'RMSprop':
        optimizer = optim.RMSprop(params=params, lr=lr)
    
    return optimizer

def get_loss():
    loss_name = config['loss_name']
    if loss_name == 'MSE':
        loss = nn.MSELoss()
    elif loss_name == 'CrossEntropy':
        loss = nn.CrossEntropyLoss()
    elif loss_name == 'BCE':
        loss = nn.BCELoss()
    elif loss_name == 'BCEWithLogits':
        loss = nn.BCEWithLogitsLoss()
        
    return loss

def get_label_data(train):
    
    return train['diagnosis'].values
                                
def get_transforms_train():
    transforms_train_list = []

    if config['transforms'] == 'pytorch':
        if config['pytorch']['resize']['train']:
            transforms_train_list.append(transforms.Resize(size=(config['pytorch']['resize']['train_size'], config['pytorch']['resize']['train_size'])))
        if config['pytorch']['centerCrop']['train']:
            transforms_train_list.append(transforms.Resize(size=(config['pytorch']['centerCrop']['train_size'], config['pytorch']['centerCrop']['train_size'])))
        if config['pytorch']['randomHorizontalFlip']['train']:
            transforms_train_list.append(transforms.RandomHorizontalFlip())
        if config['pytorch']['randomRotation']['train']:
            transforms_train_list.append(transforms.RandomRotation(degrees=config['pytorch']['randomRotation']['degrees']))
        if config['pytorch']['toTensor']['train']:
            transforms_train_list.append(transforms.ToTensor())
        if config['pytorch']['normalize']['train']:
            transforms_train_list.append(transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]))
        train_transforms = transforms.Compose(transforms_train_list)
    else:
        if config['albumentations']['resize']['train']:
            transforms_train_list.append(A.Resize(config['albumentations']['resize']['train_size'], config['albumentations']['resize']['train_size']))
        if config['pytorch']['centerCrop']['train']:
            transforms_train_list.append(transforms.Resize(config['pytorch']['centerCrop']['train_size'], config['pytorch']['centerCrop']['train_size']))
        if config['albumentations']['horizontalFlip']['train']:
            transforms_train_list.append(A.HorizontalFlip())
        if config['albumentations']['rotate']['train']:
            transforms_train_list.append(A.Rotate(config['albumentations']['rotate']['limit']))
        if config['albumentations']['clahe']['train']:
            transforms_train_list.append(A.CLAHE())
        if config['albumentations']['gaussNoise']['train']:
            transforms_train_list.append(A.GaussNoise())
        if config['albumentations']['randomBrightness']['train']:
            transforms_train_list.append(A.RandomBrightness())
        if config['albumentations']['randomContrast']['train']:
            transforms_train_list.append(A.RandomContrast())
        if config['albumentations']['randomBrightnrssContrast']['train']:
            transforms_train_list.append(A.RandomBrightnessContrast())
        if config['albumentations']['hueSaturationValue']['train']:
            transforms_train_list.append(A.HueSaturationValue())
        if config['albumentations']['normalize']['train']:
            transforms_train_list.append(A.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]))
        if config['albumentations']['toTensor']['train']:
            transforms_train_list.append(AT.ToTensor())
        train_transforms = A.Compose(transforms_train_list)
        
    return train_transforms
                                
def get_transforms_test():
    transforms_test_list = []

    if config['transforms'] == 'pytorch':
        if config['pytorch']['resize']['test']:
            transforms_test_list.append(transforms.Resize(size=(config['pytorch']['resize']['test_size'], config['pytorch']['resize']['test_size'])))
        if config['pytorch']['centerCrop']['test']:
            transforms_test_list.append(transforms.Resize(size=(config['pytorch']['centerCrop']['test_size'], config['pytorch']['centerCrop']['test_size'])))
        if config['pytorch']['randomHorizontalFlip']['test']:
            transforms_test_list.append(transforms.RandomHorizontalFlip())
        if config['pytorch']['randomRotation']['test']:
            transforms_test_list.append(transforms.RandomRotation(degrees=config['pytorch']['randomRotation']['degrees']))
        if config['pytorch']['toTensor']['test']:
            transforms_test_list.append(transforms.ToTensor())
        if config['pytorch']['normalize']['test']:
            transforms_test_list.append(transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]))
        test_transforms = transforms.Compose(transforms_test_list)
    else:
        if config['albumentations']['resize']['test']:
            transforms_test_list.append(A.Resize(config['albumentations']['resize']['test_size'], config['albumentations']['resize']['test_size']))
        if config['pytorch']['centerCrop']['test']:
            transforms_test_list.append(transforms.Resize(config['albumentations']['centerCrop']['test_size'], config['albumentations']['centerCrop']['test_size']))
        if config['albumentations']['horizontalFlip']['test']:
            transforms_test_list.append(A.HorizontalFlip())
        if config['albumentations']['rotate']['test']:
            transforms_test_list.append(A.Rotate(config['albumentations']['rotate']['limit']))
        if config['albumentations']['clahe']['test']:
            transforms_test_list.append(A.CLAHE())
        if config['albumentations']['gaussNoise']['test']:
            transforms_test_list.append(A.GaussNoise())
        if config['albumentations']['randomBrightness']['test']:
            transforms_test_list.append(A.RandomBrightness())
        if config['albumentations']['randomContrast']['test']:
            transforms_test_list.append(A.RandomContrast())
        if config['albumentations']['randomBrightnrssContrast']['test']:
            transforms_test_list.append(A.RandomBrightnessContrast())
        if config['albumentations']['hueSaturationValue']['test']:
            transforms_test_list.append(A.HueSaturationValue())
        if config['albumentations']['normalize']['test']:
            transforms_test_list.append(A.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]))
        if config['albumentations']['toTensor']['test']:
            transforms_test_list.append(AT.ToTensor())
        test_transforms = A.Compose(transforms_test_list)
        
    return test_transforms                       
        
def get_train_data():
    train = pd.read_csv(os.path.join(config['data_path'], 'train.csv'))
    y = get_label_data(train)
    
    train_dataset = TrainDataset(id_code=train['id_code'].values, transform=get_transforms_train(), y=y)
    print(len(train_dataset)) # debug
    tr, val = train_test_split(train['diagnosis'], stratify=train['diagnosis'], test_size=config['valid_size'])

    train_sampler = SubsetRandomSampler(list(tr.index))
    valid_sampler = SubsetRandomSampler(list(val.index))
                                
    train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], sampler=train_sampler, num_workers=config['num_workers'])
    valid_loader = DataLoader(train_dataset, batch_size=config['batch_size'], sampler=valid_sampler, num_workers=config['num_workers'])

    return train_loader, valid_loader
    
def get_test_data():
    test = pd.read_csv(os.path.join(config['data_path'], 'test.csv'))

    test_dataset = TestDataset(id_code=test['id_code'].values, transform=get_transforms_test())
    print(len(test_dataset)) # debug
    test_loader = DataLoader(test_dataset, batch_size=config['batch_size'], num_workers=config['num_workers'])
    
    return test_loader

## train & valid & test

In [6]:
def train(model, criterion, optimizer, train_loader):
    model.train()
    
    running_loss = 0
    for _, (data, target) in enumerate(train_loader):
        data, target = data.to(config['device']), target.to(config['device'])
        
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output.float(), target)
        running_loss += loss.data
        
        loss.backward()
        optimizer.step()
        
    return running_loss / len(train_loader)

def valid(model, criterion, optimizer, valid_loader):
    model.eval()
    
    running_loss = 0
    for _, (data, target) in enumerate(valid_loader):
        data, target = data.to(config['device']), target.to(config['device'])
        
        output = model(data)
        loss = criterion(output.float(), target)
        running_loss += loss.data
        
    # TODO: implementation
#         output = output.cpu().detach().numpy()
#         target = target.cpu().detach().numpy()
#         res = np.zeros(output.shape[0])
#         for i, e in enumerate(output):
#             res[i] = np.argmax(e)
        
#         score = 1 - cohen_kappa_score(res, target)
#         print(score)
        
    return running_loss / len(valid_loader)

def test(model, test_loader, sub):
    model.eval()
    
    for (data, _, name) in test_loader:
        data = data.to(config['device'])
            
        output = model(data)
        output = output.cpu().detach().numpy()
        
        for i, (e, n) in enumerate(list(zip(output, name))):
#             sub.loc[sub['id_code'] == n.split('/')[-1].split('.')[0], 'diagnosis'] = le.inverse_transform([np.argmax(e)])
            sub.loc[sub['id_code'] == n.split('/')[-1].split('.')[0], 'diagnosis'] = np.argmax(e)
    
    return sub

# Class

## model

In [7]:
class DRModel(nn.Module):
    
    def __init__(self):
        
        super(DRModel, self).__init__()
        self.model = get_model()
        if config['weight_path'] != None:
            self.model.load_state_dict(torch.load(config['weight_path']))
        self.model.fc = nn.Linear(in_features=self.model.fc.in_features, out_features=5, bias=True)
        
    def forward(self, x):
        
        x = self.model(x)
        x = F.softmax(x)
        
        return x

## dataset

In [8]:
class TrainDataset(Dataset):
    
    def __init__(self, id_code, transform, y):
        
        self.image_name_list = [os.path.join(config['data_path'], 'train_images', f'{image_name}.png') for image_name in tqdm(id_code)]
        self.image_list = [preprocess(image_name) for image_name in tqdm(self.image_name_list)]
        
        self.transform = transform
        self.labels = y
        
    def __len__(self):
        
        return len(self.image_name_list)
    
    def __getitem__(self, idx):
        
        image_name = self.image_name_list[idx]
        image = self.image_list[idx]
        
        if config['transforms'] == 'pytorch':
            image = transforms.ToPILImage()(image)
            image = self.transform(image)
        elif config['transforms'] == 'albumentations':
            image = self.transform(image)
            image = image['image']
            
        label = self.labels[idx]
        
        return image, label
    
    
class TestDataset(Dataset):
    
    def __init__(self, id_code, transform):
        
        self.image_name_list = [os.path.join(config['data_path'], 'test_images', f'{i}.png') for i in tqdm(id_code)]
        
        self.transform = transform
        self.labels = np.zeros((len(self.image_name_list), 5))
        
    def __len__(self):
        
        return len(self.image_name_list)

    def __getitem__(self, idx):
        
        image_name = self.image_name_list[idx]
        image = preprocess(image_name)
        
        if config['transforms'] == 'pytorch':
            image = transforms.ToPILImage()(image)
            image = self.transform(image)
        elif config['transforms'] == 'albumentations':
            image = self.transform(image=image)
            image = image['image']
            
        label = self.labels[idx]
        
        return image, label, image_name

## early stopping

In [9]:
class EarlyStopping:

    def __init__(self):
 
        self.patience = config['patience']
        self.verbose = config['verbose']
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf

    def __call__(self, val_loss, model):

        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):

        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), 'best.pth')
        self.val_loss_min = val_loss

# Run

In [10]:
seed_everything()

In [11]:
model = DRModel().to(config['device'])

In [12]:
train_loader, valid_loader = get_train_data()

100%|██████████| 3662/3662 [00:00<00:00, 413658.16it/s]
100%|██████████| 3662/3662 [27:24<00:00,  2.12it/s]

3662





In [13]:
criterion = get_loss()
optimizer = get_optimizer(params=model.parameters())
early_stopping = EarlyStopping()

In [26]:
train_loader

<torch.utils.data.dataloader.DataLoader at 0x7f78f0ad4630>

In [25]:
for k, (i, j) in enumerate(train_loader):
    print(i, j)
    break

ValueError: Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.6/site-packages/torch/utils/data/_utils/worker.py", line 99, in _worker_loop
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "/home/ubuntu/.local/lib/python3.6/site-packages/torch/utils/data/_utils/worker.py", line 99, in <listcomp>
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "<ipython-input-8-1e49aa23ba9f>", line 24, in __getitem__
    image = self.transform(image)
  File "/home/ubuntu/.local/lib/python3.6/site-packages/albumentations/core/composition.py", line 164, in __call__
    need_to_run = force_apply or random.random() < self.p
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()


In [14]:
for epoch in tqdm(range(config['epochs'])):
    train_loss = train(model, criterion, optimizer, train_loader)
    val_loss = valid(model, criterion, optimizer, valid_loader)

    print('epoch {:d}, loss: {:.4f} val_loss: {:.4f}'.format(epoch, train_loss, val_loss))

    early_stopping(val_loss, model)
    if early_stopping.early_stop:
        print("Early stopping")
        break

  0%|          | 0/100 [00:00<?, ?it/s]


ValueError: Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.6/site-packages/torch/utils/data/_utils/worker.py", line 99, in _worker_loop
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "/home/ubuntu/.local/lib/python3.6/site-packages/torch/utils/data/_utils/worker.py", line 99, in <listcomp>
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "<ipython-input-8-1e49aa23ba9f>", line 24, in __getitem__
    image = self.transform(image)
  File "/home/ubuntu/.local/lib/python3.6/site-packages/albumentations/core/composition.py", line 164, in __call__
    need_to_run = force_apply or random.random() < self.p
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()


In [None]:
# del train_loader, valid_loader
# gc.collect()

In [None]:
# test_loader = get_test_data()

# sub = pd.read_csv(os.path.join(config['data_path'], 'sample_submission.csv'))
# sub = test(model, test_loader, sub)
# sub.to_csv('submission.csv', index=False)

In [None]:
WEB_HOOK_URL = 'https://hooks.slack.com/services/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'
requests.post(WEB_HOOK_URL, data = json.dumps({
    'text': f'{config}\nbest score: {early_stopping.best_score}',  
    'username': 'Kaggle Notification',  
    'icon_emoji': ':kerneler:',  
    'link_names': 1,  
}))