Reference : https://www.kaggle.com/manabendrarout/transformers-classifier-method-starter-train

### Imports

In [1]:
import sys
sys.path.append('../pytorch-image-models/pytorch-image-models-master')
# Asthetics
import warnings
import sklearn.exceptions
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)

# General
from tqdm.auto import tqdm
from collections import defaultdict
import pandas as pd
import numpy as np
import os
import random
import gc
import cv2
import glob
import math
gc.enable()
pd.set_option('display.max_columns', None)

# Visialisation
import matplotlib.pyplot as plt
%matplotlib inline

# Image Aug
from albumentations.pytorch import ToTensorV2
import albumentations as A

# import albumentations
# from albumentations.pytorch.transforms import ToTensorV2

# Deep Learning
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, OneCycleLR, CosineAnnealingLR
import torch
import torchvision
import timm
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.optimizer import Optimizer

#Metrics
from sklearn.metrics import mean_squared_error

# Random Seed Initialize
RANDOM_SEED = 42

def seed_everything(seed=RANDOM_SEED):
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    
seed_everything()

# Device Optimization
if torch.cuda.is_available():
    device = torch.device('cuda:1')
else:
    device = torch.device('cpu')
    
print(f'Using device: {device}')

Using device: cuda:1


In [2]:
csv_dir = '/mnt/hdd1/wearly/kaggle/petfinder/data/'
train_dir = '/mnt/hdd1/wearly/kaggle/petfinder/data/train/'
test_dir = '/mnt/hdd1/wearly/kaggle/petfinder/data/test/'

train_file_path = '/mnt/hdd1/wearly/kaggle/petfinder/data/datasets/pawpular-folds/train_5folds.csv'
sample_sub_file_path = os.path.join(csv_dir, 'sample_submission.csv')

print(f'Train file : {train_file_path}')
print(f'Train file : {sample_sub_file_path}')
      

Train file : /mnt/hdd1/wearly/kaggle/petfinder/data/datasets/pawpular-folds/train_5folds.csv
Train file : /mnt/hdd1/wearly/kaggle/petfinder/data/sample_submission.csv


In [3]:
train_df = pd.read_csv(train_file_path)
test_df = pd.read_csv(sample_sub_file_path)

def return_filpath(name, folder=train_dir):
    path = os.path.join(folder, f'{name}.jpg')
    return path

In [4]:
train_df['image_path'] = train_df['Id'].apply(lambda x: return_filpath(x))
test_df['image_path'] = test_df['Id'].apply(lambda x: return_filpath(x, folder=test_dir))

In [5]:
target = ['Pawpularity']
not_features = ['Id', 'kfold', 'image_path', 'Pawpularity']
cols = list(train_df.columns)
features = [feat for feat in cols if feat not in not_features]
print(features)

['Subject Focus', 'Eyes', 'Face', 'Near', 'Action', 'Accessory', 'Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur']


### CFG

In [6]:
TRAIN_FOLDS = [0,1,2,3,4,5,6,7,8,9]

params = {
    'model': 'vit_large_patch32_384', #'vit_large_patch32_384'
    'dense_features': features,
    'pretrained': True,
    'inp_channels': 3,
    'im_size': 384,
    'device': device,
    'lr': 1e-5,
    'weight_decay': 1e-6,
    'batch_size': 16,
    'num_workers' : 0,
    'epochs': 10,
    'out_features': 1,
    'dropout': 0.2,
    'num_fold': len(TRAIN_FOLDS),
    'mixup': False,
    'mixup_alpha': 1.0,
    'scheduler_name': 'CosineAnnealingWarmRestarts',
    'T_0': 5,
    'T_max': 5,
    'T_mult': 1,
    'min_lr': 1e-7,
    'max_lr': 1e-4,
    'multi-gpu' : False#True
}

## 1. Train Augmentations

In [7]:
# def get_train_transforms(DIM = params['im_size']):
#     cut = random.randint(230,280) #추가
#     p = 0.1
    
#     return albumentations.Compose(
#         [
#             albumentations.Resize(DIM,DIM),
#             albumentations.Normalize(
#                 mean=[0.485, 0.456, 0.406],
#                 std=[0.229, 0.224, 0.225],
#             ),
#             albumentations.ShiftScaleRotate(shift_limit=0.2, scale_limit=0, rotate_limit=0,p=p,border_mode=cv2.INTER_NEAREST), 
#             albumentations.HorizontalFlip(p=p),
#             albumentations.VerticalFlip(p=p),
#             albumentations.Rotate(limit=180, p=p),
#             albumentations.CoarseDropout(max_holes=1, max_height=30, max_width=30,fill_value=0, p=0.3),
# #             albumentations.HueSaturationValue(
# #                 hue_shift_limit=0.2, sat_shift_limit=0.2,
# #                 val_shift_limit=0.2, p=0.5
# #             ),
# #             albumentations.RandomBrightnessContrast(
# #                 brightness_limit=(-0.1, 0.1),
# #                 contrast_limit=(-0.1, 0.1), p=0.5
# #             ),
#             ToTensorV2(p=1.0),
#         ]
#     )

In [8]:
def get_train_transforms(epoch, dim = params['im_size']):
    return A.Compose(
        [             
            # resize like Resize in fastai
            A.SmallestMaxSize(max_size=dim, p=1.0),
            A.RandomCrop(height=dim, width=dim, p=1.0),
            A.VerticalFlip(p = 0.5),
            A.HorizontalFlip(p = 0.5)
            #A.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
        ]
  )

def get_inference_fixed_transforms(mode=0, dim = params['im_size']):
    if mode == 0: # do not original aspects, colors and angles
        return A.Compose([
                A.SmallestMaxSize(max_size=dim, p=1.0),
                A.CenterCrop(height=dim, width=dim, p=1.0),
                #A.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
            ], p=1.0)
    elif mode == 1:
        return A.Compose([
                A.SmallestMaxSize(max_size=dim, p=1.0),
                A.CenterCrop(height=dim, width=dim, p=1.0),
                #A.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),,
                A.VerticalFlip(p = 1.0)
            ], p=1.0)    
    elif mode == 2:
        return A.Compose([
                A.SmallestMaxSize(max_size=dim, p=1.0),
                A.CenterCrop(height=dim, width=dim, p=1.0),
                #A.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
                A.HorizontalFlip(p = 1.0)
            ], p=1.0)
    elif mode == 3:
        return A.Compose([
                A.SmallestMaxSize(max_size=dim, p=1.0),
                A.CenterCrop(height=dim, width=dim, p=1.0),
                #A.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
                A.Transpose(p=1.0)
            ], p=1.0)
        
def get_inference_random_transforms(mode=0, dim = params['im_size']):
    if mode == 0: # do not original aspects, colors and angles
        return A.Compose([
                A.SmallestMaxSize(max_size=dim, p=1.0),
                A.CenterCrop(height=dim, width=dim, p=1.0),
                #A.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
            ], p=1.0)
    else:
        return A.Compose(
            [            
                A.SmallestMaxSize(max_size=dim, p=1.0),
                A.CenterCrop(height=dim, width=dim, p=1.0),
                A.VerticalFlip(p = 0.5),
                A.HorizontalFlip(p = 0.5)
                #A.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
            ]
      )

In [9]:
# def get_train_transforms(DIM = params['im_size']):
#     return albumentations.Compose(
#         [
#             albumentations.Resize(DIM,DIM),
#             albumentations.Normalize(
#                 mean=[0.485, 0.456, 0.406],
#                 std=[0.229, 0.224, 0.225],
#             ),
#             albumentations.HorizontalFlip(p=0.5),
#             albumentations.VerticalFlip(p=0.5),
#             albumentations.Rotate(limit=180, p=0.7),
#             albumentations.ShiftScaleRotate(
#                 shift_limit = 0.1, scale_limit=0.1, rotate_limit=45, p=0.5
#             ),
#             albumentations.HueSaturationValue(
#                 hue_shift_limit=0.2, sat_shift_limit=0.2,
#                 val_shift_limit=0.2, p=0.5
#             ),
#             albumentations.RandomBrightnessContrast(
#                 brightness_limit=(-0.1, 0.1),
#                 contrast_limit=(-0.1, 0.1), p=0.5
#             ),
#             ToTensorV2(p=1.0),
#         ]
#     )

## 2. Mixup

In [10]:
def mixup_data(x, z, y, params):
    if params['mixup_alpha'] > 0:
        lam = np.random.beta(
            params['mixup_alpha'], params['mixup_alpha']
        )
    else:
        lam = 1

    batch_size = x.size()[0]
    if params['device'].type == 'cuda':
        index = torch.randperm(batch_size).cuda()
    else:
        index = torch.randperm(batch_size)

    mixed_x = lam * x + (1 - lam) * x[index, :]
    mixed_z = lam * z + (1 - lam) * z[index, :]
    y_a, y_b = y, y[index]
    return mixed_x, mixed_z, y_a, y_b, lam

def mixup_criterion(criterion, pred, y_a, y_b, lam):
    return lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)    

## 3.Valid Augmentations

In [11]:
# def get_valid_transforms(DIM= params['im_size']):
#     return albumentations.Compose(
#         [
#             albumentations.Resize(DIM,DIM),
#             albumentations.Normalize(
#                 mean = [0.485,0.456,0.406],
#                 std = [0.229,0.224,0.225],
#             ),
#             ToTensorV2(p=1.0)
#         ]
#     )

## 4. Dataset

In [12]:
class CuteDataset(Dataset):
    def __init__(self, images_filepaths, dense_features, targets, transform=None):
        self.images_filepaths = images_filepaths
        self.dense_features = dense_features
        self.targets = targets
        self.transform = transform

    def __len__(self):
        return len(self.images_filepaths)

    def __getitem__(self, idx):
        image_filepath = self.images_filepaths[idx]
        image = cv2.imread(image_filepath)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.transform is not None:
            image = self.transform(image=image)['image']
        
        image = image / 255 # convert to 0-1
        image = np.transpose(image, (2, 0, 1)).astype(np.float32)
        image = torch.tensor(image, dtype = torch.float)
        dense = self.dense_features[idx, :]
        label = torch.tensor(self.targets[idx]).float()
        return image, dense, label

## 5. Activation Function

In [13]:
#credit : https://github.com/tyunist/memory_efficient_mish_swish/blob/master/mish.py

''' I just wanted to understand and implement custom backward activation in PyTorch so I choose this.
    You can also simply use this function below too.

class Mish(nn.Module):
    def __init__(self):
        super(Mish, self).__init__()

    def forward(self, input):
        return input * (torch.tanh(F.softplus(input)))
'''

class Mish_func(torch.autograd.Function):
    
    @staticmethod
    def forward(ctx, i):
        result = i * torch.tanh(F.softplus(i))
        ctx.save_for_backward(i)
        return result

    @staticmethod
    def backward(ctx, grad_output):
        i = ctx.saved_tensors[0]
  
        v = 1. + i.exp()
        h = v.log() 
        grad_gh = 1./h.cosh().pow_(2) 

        # Note that grad_hv * grad_vx = sigmoid(x)
        #grad_hv = 1./v  
        #grad_vx = i.exp()
        
        grad_hx = i.sigmoid()

        grad_gx = grad_gh *  grad_hx #grad_hv * grad_vx 
        
        grad_f =  torch.tanh(F.softplus(i)) + i * grad_gx 
        
        return grad_output * grad_f 


class Mish(nn.Module):
    def __init__(self, **kwargs):
        super().__init__()
        print("Mish initialized")
        pass
    def forward(self, input_tensor):
        return Mish_func.apply(input_tensor)

In [14]:
def replace_activations(model, existing_layer, new_layer):
    for name, module in reversed(model._modules.items()):
        if len(list(module.children())) > 0:
            model._modules[name] = replace_activations(module, existing_layer, new_layer)

        if type(module) == existing_layer:
            layer_old = module
            layer_new = new_layer
            model._modules[name] = layer_new
    return model

## 6. Optimizer

In [15]:
#credit : https://github.com/Yonghongwei/Gradient-Centralization

def centralized_gradient(x, use_gc=True, gc_conv_only=False):
    if use_gc:
        if gc_conv_only:
            if len(list(x.size())) > 3:
                x.add_(-x.mean(dim=tuple(range(1, len(list(x.size())))), keepdim=True))
        else:
            if len(list(x.size())) > 1:
                x.add_(-x.mean(dim=tuple(range(1, len(list(x.size())))), keepdim=True))
    return x


class Ranger(Optimizer):

    def __init__(self, params, lr=1e-3,                       # lr
                 alpha=0.5, k=5, N_sma_threshhold=5,           # Ranger options
                 betas=(.95, 0.999), eps=1e-5, weight_decay=0,  # Adam options
                 # Gradient centralization on or off, applied to conv layers only or conv + fc layers
                 use_gc=True, gc_conv_only=False, gc_loc=True
                 ):

        # parameter checks
        if not 0.0 <= alpha <= 1.0:
            raise ValueError(f'Invalid slow update rate: {alpha}')
        if not 1 <= k:
            raise ValueError(f'Invalid lookahead steps: {k}')
        if not lr > 0:
            raise ValueError(f'Invalid Learning Rate: {lr}')
        if not eps > 0:
            raise ValueError(f'Invalid eps: {eps}')

        # parameter comments:
        # beta1 (momentum) of .95 seems to work better than .90...
        # N_sma_threshold of 5 seems better in testing than 4.
        # In both cases, worth testing on your dataset (.90 vs .95, 4 vs 5) to make sure which works best for you.

        # prep defaults and init torch.optim base
        defaults = dict(lr=lr, alpha=alpha, k=k, step_counter=0, betas=betas,
                        N_sma_threshhold=N_sma_threshhold, eps=eps, weight_decay=weight_decay)
        super().__init__(params, defaults)

        # adjustable threshold
        self.N_sma_threshhold = N_sma_threshhold

        # look ahead params

        self.alpha = alpha
        self.k = k

        # radam buffer for state
        self.radam_buffer = [[None, None, None] for ind in range(10)]

        # gc on or off
        self.gc_loc = gc_loc
        self.use_gc = use_gc
        self.gc_conv_only = gc_conv_only
        # level of gradient centralization
        #self.gc_gradient_threshold = 3 if gc_conv_only else 1

        print(
            f"Ranger optimizer loaded. \nGradient Centralization usage = {self.use_gc}")
        if (self.use_gc and self.gc_conv_only == False):
            print(f"GC applied to both conv and fc layers")
        elif (self.use_gc and self.gc_conv_only == True):
            print(f"GC applied to conv layers only")

    def __setstate__(self, state):
        print("set state called")
        super(Ranger, self).__setstate__(state)

    def step(self, closure=None):
        loss = None
        # note - below is commented out b/c I have other work that passes back the loss as a float, and thus not a callable closure.
        # Uncomment if you need to use the actual closure...

        # if closure is not None:
        #loss = closure()

        # Evaluate averages and grad, update param tensors
        for group in self.param_groups:

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data.float()

                if grad.is_sparse:
                    raise RuntimeError(
                        'Ranger optimizer does not support sparse gradients')

                p_data_fp32 = p.data.float()

                state = self.state[p]  # get state dict for this param

                if len(state) == 0:  # if first time to run...init dictionary with our desired entries
                    # if self.first_run_check==0:
                    # self.first_run_check=1
                    #print("Initializing slow buffer...should not see this at load from saved model!")
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(p_data_fp32)
                    state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)

                    # look ahead weight storage now in state dict
                    state['slow_buffer'] = torch.empty_like(p.data)
                    state['slow_buffer'].copy_(p.data)

                else:
                    state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
                    state['exp_avg_sq'] = state['exp_avg_sq'].type_as(
                        p_data_fp32)

                # begin computations
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']

                # GC operation for Conv layers and FC layers
                # if grad.dim() > self.gc_gradient_threshold:
                #    grad.add_(-grad.mean(dim=tuple(range(1, grad.dim())), keepdim=True))
                if self.gc_loc:
                    grad = centralized_gradient(grad, use_gc=self.use_gc, gc_conv_only=self.gc_conv_only)

                state['step'] += 1

                # compute variance mov avg
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

                # compute mean moving avg
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)

                buffered = self.radam_buffer[int(state['step'] % 10)]

                if state['step'] == buffered[0]:
                    N_sma, step_size = buffered[1], buffered[2]
                else:
                    buffered[0] = state['step']
                    beta2_t = beta2 ** state['step']
                    N_sma_max = 2 / (1 - beta2) - 1
                    N_sma = N_sma_max - 2 * \
                        state['step'] * beta2_t / (1 - beta2_t)
                    buffered[1] = N_sma
                    if N_sma > self.N_sma_threshhold:
                        step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (
                            N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step'])
                    else:
                        step_size = 1.0 / (1 - beta1 ** state['step'])
                    buffered[2] = step_size

                # if group['weight_decay'] != 0:
                #    p_data_fp32.add_(-group['weight_decay']
                #                     * group['lr'], p_data_fp32)

                # apply lr
                if N_sma > self.N_sma_threshhold:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])
                    G_grad = exp_avg / denom
                else:
                    G_grad = exp_avg

                if group['weight_decay'] != 0:
                    G_grad.add_(p_data_fp32, alpha=group['weight_decay'])
                # GC operation
                if self.gc_loc == False:
                    G_grad = centralized_gradient(G_grad, use_gc=self.use_gc, gc_conv_only=self.gc_conv_only)

                p_data_fp32.add_(G_grad, alpha=-step_size * group['lr'])
                p.data.copy_(p_data_fp32)

                # integrated look ahead...
                # we do it at the param level instead of group level
                if state['step'] % group['k'] == 0:
                    # get access to slow param tensor
                    slow_p = state['slow_buffer']
                    # (fast weights - slow weights) * alpha
                    slow_p.add_(p.data - slow_p, alpha=self.alpha)
                    # copy interpolated weights to RAdam param tensor
                    p.data.copy_(slow_p)

        return loss

## Visualize Some Examples

In [16]:
X_train = train_df['image_path']
X_train_dense = train_df[params['dense_features']]
y_train = train_df['Pawpularity']
train_dataset = CuteDataset(
    images_filepaths=X_train.values,
    dense_features=X_train_dense.values,
    targets=y_train.values,
    transform=get_train_transforms(0)
)

In [17]:
def show_image(train_dataset=train_dataset, inline=4):
    plt.figure(figsize=(20,10))
    for i in range(inline):
        rand = random.randint(0, len(train_dataset))
        image, dense, label = train_dataset[rand]
        plt.subplot(1, inline, i%inline +1)
        plt.axis('off')
        plt.imshow(image.permute(2, 1, 0))
        plt.title(f'Pawpularity: {label}')

In [18]:
# for i in range(3):
#     show_image(inline=4)

In [19]:
del X_train, X_train_dense, y_train, train_dataset


## Metrics

In [20]:
def usr_rmse_score(output,target):
    y_pred = torch.sigmoid(output).cpu()
    y_pred = y_pred.detach().numpy()*100
    target = target.cpu()*100
    
    return mean_squared_error(target, y_pred, squared=False)

In [21]:
class MetricMonitor:
    def __init__(self, float_precision=3):
        self.float_precision = float_precision
        self.reset()

    def reset(self):
        self.metrics = defaultdict(lambda: {"val": 0, "count": 0, "avg": 0})

    def update(self, metric_name, val):
        metric = self.metrics[metric_name]

        metric["val"] += val
        metric["count"] += 1
        metric["avg"] = metric["val"] / metric["count"]

    def __str__(self):
        return " | ".join(
            [
                "{metric_name}: {avg:.{float_precision}f}".format(
                    metric_name=metric_name, avg=metric["avg"],
                    float_precision=self.float_precision
                )
                for (metric_name, metric) in self.metrics.items()
            ]
        )

## Scheduler¶
Scheduler is essentially an function that changes our learning rate over epochs/steps. But why do we need to do that?

The first reason is that our network may become stuck in either saddle points or local minima, and the low learning rate may not be sufficient to break out of the area and descend into areas of the loss landscape with lower loss.
Secondly, our model and optimizer may be very sensitive to our initial learning rate choice. If we make a poor initial choice in learning rate, our model may be stuck from the very start.
Instead, we can use Schedulers and specifically Cyclical Learning Rates(CLR) to oscillate our learning rate between upper and lower bounds, enabling us to:

Have more freedom in our initial learning rate choices.
Break out of saddle points and local minima.
In practice, using CLRs leads to far fewer learning rate tuning experiments along with near identical accuracy to exhaustive hyperparameter tuning.

In [22]:
def get_scheduler(optimizer, scheduler_params=params):
    if scheduler_params['scheduler_name'] == 'CosineAnnealingWarmRestarts':
        scheduler = CosineAnnealingWarmRestarts(
            optimizer,
            T_0=scheduler_params['T_0'],
            eta_min=scheduler_params['min_lr'],
            last_epoch=-1
        )
    elif scheduler_params['scheduler_name'] == 'OneCycleLR':
        scheduler = OneCycleLR(
            optimizer,
            max_lr=scheduler_params['max_lr'],
            steps_per_epoch=int(((scheduler_params['num_fold']-1) * train_df.shape[0]) / (scheduler_params['num_fold'] * scheduler_params['batch_size'])) + 1,
            epochs=scheduler_params['epochs'],
        )

    elif scheduler_params['scheduler_name'] == 'CosineAnnealingLR':
        scheduler = CosineAnnealingLR(
            optimizer,
            T_max=scheduler_params['T_max'],
            eta_min=scheduler_params['min_lr'],
            last_epoch=-1
        )
    return scheduler

## CNN Model

In [23]:
class PetNet(nn.Module):
    def __init__(self, model_name=params['model'], out_features=params['out_features'], inp_channels=params['inp_channels'],
                 pretrained=params['pretrained'], num_dense=len(params['dense_features'])):
        super().__init__()
        self.model = timm.create_model(model_name, pretrained=pretrained, in_chans=inp_channels)
        n_features = self.model.head.in_features #swin : 1536
        self.model.head = nn.Linear(n_features, 128)
        #self.fc = nn.Sequential(
        #    nn.Linear(128 + num_dense, 64), #dense feature 적용
        #    nn.ReLU(),
        #    nn.Linear(64, out_features)
        #)
        self.fc = nn.Sequential(
            nn.Linear(128 + num_dense, 64), #dense feature 적용
            nn.ReLU(),
            nn.Linear(64, out_features)
            ##nn.BatchNorm1d(64),
            #nn.GELU(),
            #nn.Linear(64, 32),
            ##nn.BatchNorm1d(32),
            #nn.GELU(),
            #nn.Linear(32, out_features),
        )
        self.dropout = nn.Dropout(params['dropout'])
        #self.BatchNorm = nn.BatchNorm1d(128 + num_dense)
    
    def forward(self, image, dense):
        embeddings = self.model(image)
        x = self.dropout(embeddings)
        x = torch.cat([x, dense], dim=1)
        output = self.fc(x)
        return output

## Train and Valid functions

### 1. Train Functions

In [24]:
def train_fn(train_loader, model, criterion, optimizer, epoch, params, scheduler=None):
    metric_monitor = MetricMonitor()
    model.train()
    stream = tqdm(train_loader)
    
    for i, (images, dense, target) in enumerate(stream, start=1):
        if params['mixup']:
            images, dense, target_a, target_b, lam = mixup_data(images, dense, target.view(-1, 1), params)
            images = images.to(params['device'], dtype=torch.float)
            dense = dense.to(params['device'], dtype=torch.float)
            target_a = target_a.to(params['device'], dtype=torch.float)
            target_b = target_b.to(params['device'], dtype=torch.float)
        else:
            images = images.to(params['device'], non_blocking=True)
            dense = dense.to(params['device'], non_blocking=True)
            target = target.to(params['device'], non_blocking=True).float().view(-1, 1)
            
        output = model(images, dense)
        
        if params['mixup']:
            loss = mixup_criterion(criterion, output, target_a, target_b, lam)
        else:
            loss = criterion(output, target)
            
        rmse_score = usr_rmse_score(output, target)
        metric_monitor.update('Loss', loss.item())
        metric_monitor.update('RMSE', rmse_score)
        loss.backward()
        optimizer.step()
            
        if scheduler is not None:
            scheduler.step()
        
        optimizer.zero_grad()
        stream.set_description(f"Epoch: {epoch:02}. Train. {metric_monitor}")

### 2. Valid functions

In [25]:
def validate_fn(val_loader, model, criterion, epoch, params):
    metric_monitor = MetricMonitor()
    model.eval()
    stream = tqdm(val_loader)
    final_targets = []
    final_outputs = []
    with torch.no_grad():
        for i, (images, dense, target) in enumerate(stream, start=1):
            images = images.to(params['device'], non_blocking=True)
            dense = dense.to(params['device'], non_blocking=True)
            target = target.to(params['device'], non_blocking=True).float().view(-1, 1)
            output = model(images, dense)
            loss = criterion(output, target)
            rmse_score = usr_rmse_score(output, target)
            metric_monitor.update('Loss', loss.item())
            metric_monitor.update('RMSE', rmse_score)
            stream.set_description(f"Epoch: {epoch:02}. Valid. {metric_monitor}")
            
            targets = (target.detach().cpu().numpy()*100).tolist()
            outputs = (torch.sigmoid(output).detach().cpu().numpy()*100).tolist()
            
            final_targets.extend(targets)
            final_outputs.extend(outputs)
    return final_outputs, final_targets

### Run

In [26]:
best_models_of_each_fold = []
rmse_tracker = []

In [27]:
for fold in TRAIN_FOLDS:
    print(''.join(['#']*50))
    print(f"{''.join(['=']*15)} TRAINING FOLD: {fold+1}/{train_df['kfold'].nunique()} {''.join(['=']*15)}")
    # Data Split to train and Validation
    train = train_df[train_df['kfold'] != fold]
    valid = train_df[train_df['kfold'] == fold]
    
    X_train = train['image_path']
    X_train_dense = train[params['dense_features']]
    y_train = train['Pawpularity']/100
    X_valid = valid['image_path']
    X_valid_dense = valid[params['dense_features']]
    y_valid = valid['Pawpularity']/100
    
    # Pytorch Dataset Creation
    train_dataset = CuteDataset(
        images_filepaths=X_train.values,
        dense_features=X_train_dense.values,
        targets=y_train.values,
        transform=get_train_transforms(0)
    )

    valid_dataset = CuteDataset(
        images_filepaths=X_valid.values,
        dense_features=X_valid_dense.values,
        targets=y_valid.values,
        #transform=get_valid_transforms(),
        transform=get_inference_fixed_transforms(),
        
    )
    
    # Pytorch Dataloader creation
    train_loader = DataLoader(
        train_dataset, batch_size=params['batch_size'], shuffle=True,
        num_workers=params['num_workers'], pin_memory=True
        )

    val_loader = DataLoader(
        valid_dataset, batch_size=params['batch_size'], shuffle=False,
        num_workers=params['num_workers'], pin_memory=True
        )
    
    # Model, cost function and optimizer instancing
    model = PetNet()
    #model = replace_activations(model, torch.nn.SiLU, Mish())
    if params['multi-gpu'] == True:
        model = nn.DataParallel(model) #for multi-gpu processing
    else:
        pass
    
    model = model.to(params['device'])
    criterion = nn.BCEWithLogitsLoss()
    #optimizer = Ranger(model.parameters(), lr = params['lr'])
    optimizer = torch.optim.AdamW(model.parameters(), lr=params['lr'],
                                  weight_decay=params['weight_decay'],
                                  amsgrad=False)
    scheduler = get_scheduler(optimizer)
    
    # Training and Validation Loop
    best_rmse = np.inf
    best_epoch = np.inf
    best_model_name = None
    for epoch in range(1, params['epochs'] + 1):
        train_fn(train_loader, model, criterion, optimizer, epoch, params, scheduler)
        predictions, valid_targets = validate_fn(val_loader, model, criterion, epoch, params)
        rmse = round(mean_squared_error(valid_targets, predictions, squared=False), 3)
        if rmse < best_rmse:
            best_rmse = rmse
            best_epoch = epoch
            if best_model_name is not None:
                os.remove(best_model_name)
            torch.save(model.state_dict(),
                       f"weights/crop_{params['model']}_{epoch}_epoch_f{fold+1}_{rmse}_rmse.pth")
            best_model_name = f"weights/crop_{params['model']}_{epoch}_epoch_f{fold+1}_{rmse}_rmse.pth"

    # Print summary of this fold
    print('')
    print(f'The best RMSE: {best_rmse} for fold {fold+1} was achieved on epoch: {best_epoch}.')
    print(f'The Best saved model is: {best_model_name}')
    best_models_of_each_fold.append(best_model_name)
    rmse_tracker.append(best_rmse)
    print(''.join(['#']*50))
    del model
    gc.collect()
    torch.cuda.empty_cache()

print('')
print(f'Average RMSE of all folds: {round(np.mean(rmse_tracker), 4)}')

##################################################


HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))



The best RMSE: 17.9 for fold 1 was achieved on epoch: 4.
The Best saved model is: weights/crop_vit_large_patch32_384_4_epoch_f1_17.9_rmse.pth
##################################################
##################################################


HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))



The best RMSE: 18.361 for fold 2 was achieved on epoch: 3.
The Best saved model is: weights/crop_vit_large_patch32_384_3_epoch_f2_18.361_rmse.pth
##################################################
##################################################


HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))



The best RMSE: 18.075 for fold 3 was achieved on epoch: 3.
The Best saved model is: weights/crop_vit_large_patch32_384_3_epoch_f3_18.075_rmse.pth
##################################################
##################################################


HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))



The best RMSE: 17.873 for fold 4 was achieved on epoch: 4.
The Best saved model is: weights/crop_vit_large_patch32_384_4_epoch_f4_17.873_rmse.pth
##################################################
##################################################


HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))



The best RMSE: 18.08 for fold 5 was achieved on epoch: 4.
The Best saved model is: weights/crop_vit_large_patch32_384_4_epoch_f5_18.08_rmse.pth
##################################################
##################################################


HBox(children=(FloatProgress(value=0.0, max=620.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…




ValueError: Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required.

In [28]:
for i, name in enumerate(best_models_of_each_fold):
    print(f'Best model of fold {i+1}: {name}')

Best model of fold 1: weights/crop_vit_large_patch32_384_4_epoch_f1_17.9_rmse.pth
Best model of fold 2: weights/crop_vit_large_patch32_384_3_epoch_f2_18.361_rmse.pth
Best model of fold 3: weights/crop_vit_large_patch32_384_3_epoch_f3_18.075_rmse.pth
Best model of fold 4: weights/crop_vit_large_patch32_384_4_epoch_f4_17.873_rmse.pth
Best model of fold 5: weights/crop_vit_large_patch32_384_4_epoch_f5_18.08_rmse.pth


In [29]:
np.mean([np.float(x.split('_')[-2]) for x in best_models_of_each_fold])

18.0578