Credits

Tez Training by Abhishek - https://www.kaggle.com/abhishek/tez-pawpular-training/notebook

RAPIDS SVR by Chris - https://www.kaggle.com/cdeotte/rapids-svr-boost-17-8?scriptVersionId=76428219

Fold creation - https://www.kaggle.com/abhishek/same-old-creating-folds 

Install RAPIDS in Colab - https://is.gd/KOS1nC 



# Arguments

In [12]:
# Simple class to hold globals 
class args:
    
    # Where are you developing? So we can use P100 from both. =)
    platform = 'colab' # colab or kaggle
    
    # Source is where we download dataset
    # Sink location is where we upload and download outputs, should preserve outputs even if runtime disconns.
    source = 'drive' # 's3' or 'drive' or 'kaggle'
    sink = 's3' # 's3' or kaggle
    
    # Depending on where, we init diff paths
    if platform == 'kaggle':
        csv_path = "../input/petfinder-pawpularity-score/"
        model_path = "../input/pawpularitymodelfiles/"
        image_path = "../input/petfinder-pawpularity-score/train/"

    if platform == 'colab':
        csv_path = '/content/'
        model_path = '/content/'
        image_path = '/content/img/'
    
    # Model training
    model_list = ['swin_large_patch4_window12_384'
                  , 'swin_large_patch4_window12_384_in22k' 
                  , 'tf_efficientnet_b0_ns' 
                  , 'tf_efficientnetv2_l_in21k']
    nn_model_name = model_list[1]
    nn_model_filename = "nn_"+nn_model_name
    svr_filename = "SVR_"+nn_model_name
    nn_prefix_name = 'train_25Nov/'
    svr_prefix_name = 'train_25Nov/'

    # NN trg
    if nn_model_name == model_list[0] or model_list[1]:
        batch_size = 10 # Effnet: 32 ; SWIN: 10
        image_size = 384 # EffNet: 256 ; SWIN: 384
    else:
        batch_size = 32 # Effnet: 32 ; SWIN: 10
        image_size = 384 # EffNet: 256 ; SWIN: 384        
    epochs = 20
    fold = [0,1,2,3]
    num_splits = 4
    rid_dup = 1
    low_vif = 1

    # SVR trg
    regularization = 20
    kernel = 'rbf' # poly , linear , sigmoid
    degree = 2
    
    # Weightage between NN and SVR regression results
    nn_svr_weight = 0.5

    # Activity control switches
    isNNtraining = 1
    extract_embeds = 1
    isSVRfitting = 0
    isOOF = 0
    isSUBMIT = 0
    

    # Hardware ; Not in use 25 Nov 2021
    isGPU = 1
    isTPU = 0

# Installs & Imports

In [2]:
# Credit https://www.kaggle.com/abhishek/tez-pawpular-swin-ference
# based on the post here: https://www.kaggle.com/c/petfinder-pawpularity-score/discussion/275094

# Silencing this cell's output.
# %%capture

# Standard ones
import os 
import sys
import math
import re
import gc
from tqdm import tqdm
import pickle 
import subprocess

# Installs & path includes. Different platforms have different defaults. Argh.
if args.platform == 'kaggle':
    sys.path.append("../input/tez-lib/")
    sys.path.append("../input/timmmaster/")
    
if args.platform == 'colab':
    subprocess.run('pip install tez', shell=True)
    subprocess.run('pip install timm', shell=True)
    subprocess.run('pip install boto3', shell=True)
    subprocess.run('apt install unzip', shell=True)

# Analysis
import pandas as pd
import cv2
import numpy as np
from sklearn import metrics
from sklearn import datasets
from sklearn import model_selection
import albumentations # https://is.gd/ngksFx ; for image augmentation
import torch
import torch.nn as nn
import timm # https://is.gd/suAm9l ; Pytorch vision library
import tez # https://git.io/J1KCq ; convenience library for using Pytorch
from tez.callbacks import EarlyStopping

# For GPU enabled support vector machines
if args.platform == 'kaggle':
    # We can direclty import in kaggle as it is included by default.
    import cudf, cuml # Doc - https://is.gd/oshcbU ; 
    from cuml.svm import SVR
    print('RAPIDS version',cuml.__version__,'\n')



# Helpers

In [3]:
# Saving strings of name of metadata given in train.csv
# We use this when we instantiate a Pawpular dataset, to tell it what metadata to expect.
# From data exlore: some metadata features have very high VIF (measure of multicolinearity)
dense_features_full = [
    'Subject Focus', 'Eyes', 'Face', 'Near', 'Action', 'Accessory',
    'Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur'
]

# Metadata features with VIF < 2
dense_features_lowVIF = [
    'Subject Focus', 'Action', 'Accessory',
    'Group', 'Collage', 'Info', 'Blur'
]

if args.low_vif:
    dense_features = dense_features_lowVIF
else:
    dense_features = dense_features_full

# Duplicate image handling
duplicates = ['13d215b4c71c3dc603cd13fc3ec80181', '373c763f5218610e9b3f82b12ada8ae5', '5ef7ba98fc97917aec56ded5d5c2b099', '67e97de8ec7ddcda59a58b027263cdcc', '839087a28fa67bf97cdcaf4c8db458ef', 'a8f044478dba8040cc410e3ec7514da1', '1feb99c2a4cac3f3c4f8a4510421d6f5', '264845a4236bc9b95123dde3fb809a88', '3c50a7050df30197e47865d08762f041', 'def7b2f2685468751f711cc63611e65b', '37ae1a5164cd9ab4007427b08ea2c5a3', '3f0222f5310e4184a60a7030da8dc84b', '5a642ecc14e9c57a05b8e010414011f2', 'c504568822c53675a4f425c8e5800a36', '2a8409a5f82061e823d06e913dee591c', '86a71a412f662212fe8dcd40fdaee8e6', '3c602cbcb19db7a0998e1411082c487d', 'a8bb509cd1bd09b27ff5343e3f36bf9e', '0422cd506773b78a6f19416c98952407', '0b04f9560a1f429b7c48e049bcaffcca', '68e55574e523cf1cdc17b60ce6cc2f60', '9b3267c1652691240d78b7b3d072baf3', '1059231cf2948216fcc2ac6afb4f8db8', 'bca6811ee0a78bdcc41b659624608125', '5da97b511389a1b62ef7a55b0a19a532', '8ffde3ae7ab3726cff7ca28697687a42', '78a02b3cb6ed38b2772215c0c0a7f78e', 'c25384f6d93ca6b802925da84dfa453e', '08440f8c2c040cf2941687de6dc5462f', 'bf8501acaeeedc2a421bac3d9af58bb7', '0c4d454d8f09c90c655bd0e2af6eb2e5', 'fe47539e989df047507eaa60a16bc3fd', '5a5c229e1340c0da7798b26edf86d180', 'dd042410dc7f02e648162d7764b50900', '871bb3cbdf48bd3bfd5a6779e752613e', '988b31dd48a1bc867dbc9e14d21b05f6', 'dbf25ce0b2a5d3cb43af95b2bd855718', 'e359704524fa26d6a3dcd8bfeeaedd2e', '43bd09ca68b3bcdc2b0c549fd309d1ba', '6ae42b731c00756ddd291fa615c822a1', '43ab682adde9c14adb7c05435e5f2e0e', '9a0238499efb15551f06ad583a6fa951', 'a9513f7f0c93e179b87c01be847b3e4c', 'b86589c3e85f784a5278e377b726a4d4', '38426ba3cbf5484555f2b5e9504a6b03', '6cb18e0936faa730077732a25c3dfb94', '589286d5bfdc1b26ad0bf7d4b7f74816', 'cd909abf8f425d7e646eebe4d3bf4769', '9f5a457ce7e22eecd0992f4ea17b6107', 'b967656eb7e648a524ca4ffbbc172c06', 'b148cbea87c3dcc65a05b15f78910715', 'e09a818b7534422fb4c688f12566e38f', '3877f2981e502fe1812af38d4f511fd2', '902786862cbae94e890a090e5700298b', '8f20c67f8b1230d1488138e2adbb0e64', 'b190f25b33bd52a8aae8fd81bd069888', '221b2b852e65fe407ad5fd2c8e9965ef', '94c823294d542af6e660423f0348bf31', '2b737750362ef6b31068c4a4194909ed', '41c85c2c974cc15ca77f5ababb652f84', '01430d6ae02e79774b651175edd40842', '6dc1ae625a3bfb50571efedc0afc297c', '72b33c9c368d86648b756143ab19baeb', '763d66b9cf01069602a968e573feb334', '03d82e64d1b4d99f457259f03ebe604d', 'dbc47155644aeb3edd1bd39dba9b6953', '851c7427071afd2eaf38af0def360987', 'b49ad3aac4296376d7520445a27726de', '54563ff51aa70ea8c6a9325c15f55399', 'b956edfd0677dd6d95de6cb29a85db9c', '87c6a8f85af93b84594a36f8ffd5d6b8', 'd050e78384bd8b20e7291b3efedf6a5b', '04201c5191c3b980ae307b20113c8853', '16d8e12207ede187e65ab45d7def117b']

# Helper: sigmoid, math for 1 of many types of activation function.
def sigmoid(x):
    return 1 / (1 + math.exp(-x))

def tanh(x):
    return np.tanh(x)

def gauss(x):
    return np.exp(-(x*x))

# Clear out memory that GPU is hogging
def clear_trash():
    gc.collect()
    torch.cuda.empty_cache()
    subprocess.run('export PYTORCH_CUDA_ALLOC_CONF=0', shell=True)
    return

def rid_duplicates(df):
    for id in duplicates:
        df = df.drop(df[df.Id == id].index)
    return df.reset_index(drop=True)

# Credit: https://is.gd/zwVtpa ; Takes in pandas dataframe, returns pd Series ready for cross-val
def create_folds(data, num_splits):

    # Added col "kfold", assigned val -1 for starters
    data["kfold"] = -1 

    # https://is.gd/wgZBrH & https://www.statisticshowto.com/?p=7678
    # Rule-thumb for setting bin sizes, most likely Sturge rule used here
    # bins are akin to giving labels to equal width range of pscore. 
    num_bins = int(np.floor(1 + np.log2(len(data)))) 
    
    # https://is.gd/U6dVgC, sort by pscore, segment into num_bins, output into col "bins": bin 1, bin 2 ...
    data.loc[:, "bins"] = pd.cut(data["Pawpularity"], bins=num_bins, labels=False) 

    # Doc: https://is.gd/6MJHst, What: https://is.gd/pG4oqH , Why: https://is.gd/bVYvsS
    # Instantiate a stratifed k-fold object, 10 folds gives us 90%/10% split betw train/valid
    kf = model_selection.StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=42)
    
    # .split generates indices for split betw train/valid, accord to bin labels
    # enumerate(kf.split(X=data, y=data.bins.values)) is of shape (10,2) , col 1 is fold label, col 2 is a tuple of entry-index of (train, valid)
    # What's enumerate: https://www.programiz.com/node/600 : adds col 1's fold label
    # For each fold label, use .loc to assign list of valid-index corresponding fold label
    for f, (t_, v_) in enumerate(kf.split(X=data, y=data.bins.values)):
        data.loc[v_, 'kfold'] = f
    
    # Delte bin label col as we no longer need it.
    data = data.drop("bins", axis=1)

    # Return an edited dataframe
    return data

# Data & Model Class

In [4]:
# Class to represent dataset's image , metadata & labels , sushi rolled into 1 object. 
# Returns dict of torch tensors form of image, metadata & labels
class PawpularDataset:

    # Method called when object is instantiated, here, we set what params to take in
    # https://www.geeksforgeeks.org/?p=360686 
    def __init__(self, image_paths, dense_features, targets, augmentations):
        self.image_paths = image_paths
        self.dense_features = dense_features
        self.targets = targets
        self.augmentations = augmentations
    
    # Gives length of an attribute here when using len() on an instance
    # https://www.analyticsvidhya.com/?p=83204#h2_5 
    def __len__(self):
        return len(self.image_paths)
    
    # https://www.geeksforgeeks.org/?p=385574 
    # https://www.codespeedy.com/?p=28884 : setitem vs getitem 
    # __getitem__ called when we -> InstanceOfDataSet[0] 
    # __setitem__ called when we -> InstanceOfDataSet[0] = *something*
    def __getitem__(self, item):

        # Use cv2 to read image with path
        image = cv2.imread(self.image_paths[item])

        # Why: https://is.gd/eSX1xj
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 
        
        # If augm obj passed in, we perform augm and return augmented images
        if self.augmentations is not None:

            # https://is.gd/01LBW6, do the augm
            augmented = self.augmentations(image=image)

            # Albumentation returns a dictionary after augm, only a single key. 
            image = augmented["image"]

        # https://is.gd/nPDsJf, why are we transposing here? 
        image = np.transpose(image, (2, 0, 1)).astype(np.float32)

        # Get image's dense_features
        features = self.dense_features[item, :]

        # Get image's targets
        targets = self.targets[item]
        
        return {
            "image": torch.tensor(image, dtype=torch.float),
            "features": torch.tensor(features, dtype=torch.float),
            "targets": torch.tensor(targets, dtype=torch.float),
        }
    
class PawpularModel(tez.Model):
    def __init__(self, get_pretrained, model_string):
        
        # Inherit tez.Model, https://git.io/J1VAJ , tez.Model in turn inherits nn.Module
        super().__init__()
        
        self.get_pretrained = get_pretrained
        self.model_string = model_string
        
        # Use of pretrained tf_efficientnet_b0_ns as base
        # https://is.gd/pOZdAH : List of models supported by timm 
        # https://git.io/J1VNa : Useful links to Efficient net family
        # https://is.gd/zCLbMt: What create_model does
        # https://git.io/J1weq : where create_model is in timm's code
        # create_model > load_checkpoint > load_pretrained 
        # A sort of object returned, seems to download from somewhere, rather than actually creating it. Bunch of checks along the way        
        self.model = timm.create_model(self.model_string, pretrained=self.get_pretrained, in_chans=3)
        
        # self.model.classifier is a module of the model above, a pre-trained model created with timm, consisting of deep sequence of diff modules
        # https://git.io/J1wmw : EfficientNet class in Pytorch ; https://git.io/J1wmH : nn.Module class in torch
        # Change output feature value from default of 1280 to 128
        if re.match('tf_efficient', self.model_string): # CNN
            self.model.classifier = nn.Linear(self.model.classifier.in_features, 128)
        if re.match('swin_', self.model_string): # Transformers
            self.model.head = nn.Linear(self.model.head.in_features, 128) 
        if re.match('vit_', self.model_string): # Transformers
            self.model.head = nn.Linear(self.model.head.in_features, 128)

        # https://is.gd/joBsSF : form of regularization technique, zero-ing elements in a tensor with a Bernoulli distri with param p=0.1
        self.dropout = nn.Dropout(0.1)

        # https://is.gd/TnIKzT : Defining MLP layers
        # 128+12 input as from model above
        # 1 output since we're doing regreession of pscore here. 
        self.dense1 = nn.Linear(128+len(dense_features), 64)
        self.dense2 = nn.Linear(64, 1)
        
        self.step_scheduler_after = "epoch"

    # RMSE - https://is.gd/1700Cd 
    # RMSE reporting when target is passed in. 
    def monitor_metrics(self, outputs, targets):
        if args.isNNtraining:
            outputs = outputs.cpu().detach().numpy()
            targets = targets.cpu().detach().numpy()
            rmse = metrics.mean_squared_error(targets, outputs, squared=False)
            return {"rmse": rmse}
        else:
            return 

    # https://is.gd/gMaUfO : control Learning rate decay, similar to that we learnt in SGD-Module 2-MLPy in MM, more spohisticated. 
    def fetch_scheduler(self):
        sch = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
            self.optimizer, T_0=10, T_mult=1, eta_min=1e-6, last_epoch=-1
        )
        return sch

    # https://is.gd/N2C9eB : ADAM optimizer
    def fetch_optimizer(self):
        opt = torch.optim.Adam(self.parameters(), lr=1e-4)
        return opt

    def forward(self, image, features, targets=None):
        
        # send an image into a model
        x1 = self.model(image)
        
        # Apply dropout @ 10%
        x = self.dropout(x1)
        
        # Concatenate dropout output & metadata features given, 
        x = torch.cat([x, features], dim=1)
        
        # A MLP's single layer: 128+1 in , 64 out
        x = self.dense1(x)
        
        # A MLP's single layer: 64 in, 1 out
        x = self.dense2(x)

        if not args.isNNtraining:
            # https://is.gd/YIDi42
            # If we're not training, return MLP output, image, metadata features
            x = torch.cat([x, x1, features], dim=1)

        if targets is not None:
            loss = nn.MSELoss()(x, targets.view(-1, 1))
            metrics = self.monitor_metrics(x, targets)
            return x, loss, metrics        
        
        return x, 0, {}

# Data 

## Setup sources

In [5]:
if args.source == 'drive':
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True) 

if args.source == 's3' or args.sink == 's3':

    # Setting up for AWS S3

    import boto3 # Doc - https://is.gd/K9zpHw 
    from getpass import getpass

    BUCKET_NAME = 'pawpularity-data'

    # Using getpass from here https://is.gd/yN9yap for security. 

    print('Input AWS access key ID:')
    aws_access_key_id = getpass()
    print('Input AWS secret access key:')
    aws_secret_access_key = getpass()

    s3r = boto3.resource('s3', 
                        aws_access_key_id = aws_access_key_id, 
                        aws_secret_access_key= aws_secret_access_key) 

    s3c = boto3.client('s3', 
                        aws_access_key_id = aws_access_key_id, 
                        aws_secret_access_key= aws_secret_access_key)


Mounted at /content/drive
Input AWS access key ID:
··········
Input AWS secret access key:
··········


## CSV, dataframe & images from Drive

In [6]:
# Get CSVs and images.

if args.source == 's3':
    
    # Download CSVs from S3
    s3r.Object(BUCKET_NAME, 'train.csv').download_file('train.csv')
    s3r.Object(BUCKET_NAME, 'test.csv').download_file('test.csv')
    s3r.Object(BUCKET_NAME, 'train_10folds.csv').download_file('train_10folds.csv')

    # Read original csv
    df = pd.read_csv(args.path+"train_10folds.csv")

    # Read test csv
    df_test = pd.read_csv(args.path+"test.csv")

if args.source == 'drive':
    
    # Unzip and copy over
    subprocess.run('unzip -j /content/drive/MyDrive/Pawpularity_TC_Drive/petfinder-pawpularity-score.zip -d img/', shell=True)
    subprocess.run('mv /content/img/train.csv /content', shell=True)
    subprocess.run('mv /content/img/test.csv /content', shell=True)
    subprocess.run('rm -rf /content/img/test /content/img/sample_submission.csv', shell=True)
    
    # Read original csv
    df = create_folds( pd.read_csv("train.csv") , num_splits=10)

    # Read test csv
    df_test = pd.read_csv("test.csv")
    
if args.source == 'kaggle':
    # Read original csv
    df = create_folds( pd.read_csv(args.csv_path+"train.csv") , num_splits=10)

    # Read test csv
    df_test = pd.read_csv(args.csv_path+"test.csv")

if args.rid_dup:
    df = create_folds(rid_duplicates(pd.read_csv(args.csv_path+"train.csv")) , num_splits=args.num_splits)

## Images from S3

In [7]:
# Credit and edits made: https://is.gd/jEPCoQ
# mknod : https://is.gd/OVXJsr 
# https://masnun.com/?p=3009: Tutorial on Python's concurrent & futures
# https://is.gd/S1x8tA : When to ThresdPool and ProcessPool

# Get images from S3. Peeled out because this takes longest and costs tiny $$
if args.source == 's3':
    from concurrent import futures

    prefix = 'img'
    bucket_name = 'pawpularity-data'
    max_workers = 20000

    # Saving strings of keys of images ; Since we want to be S3 compatible, we'll need prefix + "/" + image name + .jpg
    img_keys = [prefix+"/"+str(x)+".jpg" for x in df["Id"].values]
    abs_path = os.path.abspath('')

    try:
        os.makedirs('./'+ prefix)
    except:
        print("Directory already created. Moving on ...")

    def fetch(key):
        file = f'{abs_path}/{key}'
        os.mknod(file, mode=384)  
        with open(file, 'wb') as data:
            s3c.download_fileobj(bucket_name, key, data)
        return file

    def fetch_all(keys):

        with futures.ThreadPoolExecutor(max_workers=max_workers) as executor:

            print("Hang on ... submitting file downloads")

            future_to_key = {executor.submit(fetch, key): key for key in keys}

            print("All URLs submitted.")

            for future in futures.as_completed(future_to_key):

                key = future_to_key[future]
                exception = future.exception()

                if not exception:
                    yield key, future.result()
                else:
                    yield key, exception

    i=0
    for key, result in fetch_all(img_keys):
        i+=1

    print('Number of images downloaded: ', i)

# Image Augmentation

In [8]:
# Albumentation is a lib for image augmentation operations. 
# .Compose is the way we define an augmentation pipe line with Albumentation, See https://is.gd/tn84mO , https://is.gd/07Sh95 
# list of transforms supported : https://is.gd/weLPx8

train_aug = albumentations.Compose(
    [
        albumentations.Resize(args.image_size, args.image_size, p=1), # https://is.gd/J26M4R
        albumentations.HueSaturationValue( # https://is.gd/NlVygf
            hue_shift_limit=0.2, sat_shift_limit=0.2, val_shift_limit=0.2, p=0.5
        ),
        albumentations.RandomBrightnessContrast( # https://is.gd/rSiA5P
            brightness_limit=(-0.1, 0.1), contrast_limit=(-0.1, 0.1), p=0.5
        ),
        albumentations.Normalize( # https://is.gd/GQ4pFo, values used here are defaults
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
            max_pixel_value=255.0,
            p=1.0,
        ),
    ],
    p=1.0,
)

# Validation set performs only resizing & normalize. Presumably to match images from train set
valid_aug = albumentations.Compose(
    [
        albumentations.Resize(args.image_size, args.image_size, p=1),
        albumentations.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
            max_pixel_value=255.0,
            p=1.0,
        ),
    ],
    p=1.0,
)

test_aug = albumentations.Compose(
    [
        albumentations.Resize(args.image_size, args.image_size, p=1),
        albumentations.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
            max_pixel_value=255.0,
            p=1.0,
        ),
    ],
    p=1.0,
)

In [9]:
# timm.list_models()

# Neural Net Training

In [None]:
clear_trash()

if args.isNNtraining:

    # Instantiate model, in prep for train.
    model = PawpularModel(True, args.nn_model_name)
    
    for i in args.fold:
        print("\nTraining fold number:", i)

        # What name?
        nn_model_filename = args.nn_model_filename+f"_f{i}.bin"
        nn_prefix_name = args.nn_prefix_name

        ##############################################################################
        # Setting up dataframe for this particular fold
        df_train = df[df.kfold != i].reset_index(drop=True)
        df_valid = df[df.kfold == i].reset_index(drop=True)

        # Adding in full path so model will take this in to know where to expect to find images for training
        # remove '/content/' if running on Kaggle 
        train_img_paths = [args.image_path+f"{x}.jpg" for x in df_train["Id"].values]
        valid_img_paths = [args.image_path+f"{x}.jpg" for x in df_valid["Id"].values]

        # Instantiating PawpularDataset objects: 1 for training, 1 for validation
        train_dataset = PawpularDataset(
            image_paths=train_img_paths,
            dense_features=df_train[dense_features].values,
            targets=df_train.Pawpularity.values/100.0,
            augmentations=train_aug,
        )

        valid_dataset = PawpularDataset(
            image_paths=valid_img_paths,
            dense_features=df_valid[dense_features].values,
            targets=df_valid.Pawpularity.values/100.0,
            augmentations=valid_aug,
        )

        # Defining an early stop callback function
        es = EarlyStopping(
            monitor="valid_rmse",
            model_path= nn_model_filename,
            patience=3,
            mode="min",
            save_weights_only=True,
        )

        # Hit the gym and train!!
        model.fit(
            train_dataset,
            valid_dataset=valid_dataset,
            train_bs=args.batch_size,
            valid_bs=2*args.batch_size,
            device="cuda",
            epochs=args.epochs,
            callbacks=[es],
            fp16=True,
        )
        
        if args.sink == 's3':
            # Send model bin file to S3
            s3r.meta.client.upload_file(nn_model_filename, BUCKET_NAME, nn_prefix_name+nn_model_filename)
            print("\nUploaded trained model to S3. Fold : ", i)
        
        if args.sink == 'kaggle':
            print('Model output sink set to kaggle. Are you sure? Rmb to manual download it *future improvement')

clear_trash()

100%|██████████| 738/738 [17:33<00:00,  1.43s/it, loss=0.0396, rmse=0.19, stage=train]
  cpuset_checked))
100%|██████████| 123/123 [02:04<00:00,  1.02s/it, loss=0.0353, rmse=0.183, stage=valid]


Validation score improved (inf --> 0.183226142108925). Saving model!


  cpuset_checked))
100%|██████████| 738/738 [17:32<00:00,  1.43s/it, loss=0.0326, rmse=0.173, stage=train]
  cpuset_checked))
100%|██████████| 123/123 [02:04<00:00,  1.01s/it, loss=0.035, rmse=0.182, stage=valid]


Validation score improved (0.183226142108925 --> 0.1819872346108522). Saving model!


  cpuset_checked))
100%|██████████| 738/738 [17:32<00:00,  1.43s/it, loss=0.0225, rmse=0.144, stage=train]
  cpuset_checked))
100%|██████████| 123/123 [02:04<00:00,  1.01s/it, loss=0.0398, rmse=0.195, stage=valid]


EarlyStopping counter: 1 out of 3


  cpuset_checked))
100%|██████████| 738/738 [17:32<00:00,  1.43s/it, loss=0.0124, rmse=0.107, stage=train]
  cpuset_checked))
100%|██████████| 123/123 [02:04<00:00,  1.01s/it, loss=0.0372, rmse=0.188, stage=valid]


EarlyStopping counter: 2 out of 3


  cpuset_checked))
100%|██████████| 738/738 [17:32<00:00,  1.43s/it, loss=0.0075, rmse=0.0834, stage=train]
  cpuset_checked))
100%|██████████| 123/123 [02:04<00:00,  1.01s/it, loss=0.0364, rmse=0.187, stage=valid]


EarlyStopping counter: 3 out of 3

Uploaded trained model to S3. Fold :  [0, 1, 2, 3]

Training fold number: 1


  cpuset_checked))
100%|██████████| 738/738 [17:31<00:00,  1.43s/it, loss=0.0044, rmse=0.0641, stage=train]
  cpuset_checked))
100%|██████████| 123/123 [02:03<00:00,  1.01s/it, loss=0.0352, rmse=0.183, stage=valid]


Validation score improved (inf --> 0.18277397782095078). Saving model!


  cpuset_checked))
100%|██████████| 738/738 [17:32<00:00,  1.43s/it, loss=0.00255, rmse=0.0486, stage=train]
  cpuset_checked))
100%|██████████| 123/123 [02:04<00:00,  1.01s/it, loss=0.0352, rmse=0.183, stage=valid]


EarlyStopping counter: 1 out of 3


  cpuset_checked))
100%|██████████| 738/738 [17:32<00:00,  1.43s/it, loss=0.00157, rmse=0.0381, stage=train]
  cpuset_checked))
100%|██████████| 123/123 [02:04<00:00,  1.01s/it, loss=0.0347, rmse=0.182, stage=valid]


Validation score improved (0.18277397782095078 --> 0.181542109062032). Saving model!


  cpuset_checked))
100%|██████████| 738/738 [17:32<00:00,  1.43s/it, loss=0.0011, rmse=0.0318, stage=train]
  cpuset_checked))
100%|██████████| 123/123 [02:04<00:00,  1.01s/it, loss=0.0349, rmse=0.182, stage=valid]


EarlyStopping counter: 1 out of 3


  cpuset_checked))
100%|██████████| 738/738 [17:32<00:00,  1.43s/it, loss=0.000809, rmse=0.0272, stage=train]
  cpuset_checked))
100%|██████████| 123/123 [02:04<00:00,  1.01s/it, loss=0.0345, rmse=0.181, stage=valid]


EarlyStopping counter: 2 out of 3


  cpuset_checked))
100%|██████████| 738/738 [17:32<00:00,  1.43s/it, loss=0.0197, rmse=0.13, stage=train]
  cpuset_checked))
100%|██████████| 123/123 [02:03<00:00,  1.01s/it, loss=0.0407, rmse=0.197, stage=valid]


EarlyStopping counter: 3 out of 3

Uploaded trained model to S3. Fold :  [0, 1, 2, 3]

Training fold number: 2


  cpuset_checked))
100%|██████████| 738/738 [17:32<00:00,  1.43s/it, loss=0.0105, rmse=0.098, stage=train]
  cpuset_checked))
100%|██████████| 123/123 [02:04<00:00,  1.01s/it, loss=0.0346, rmse=0.181, stage=valid]


Validation score improved (inf --> 0.1813010650315905). Saving model!


  cpuset_checked))
100%|██████████| 738/738 [17:32<00:00,  1.43s/it, loss=0.00544, rmse=0.071, stage=train]
  cpuset_checked))
100%|██████████| 123/123 [02:04<00:00,  1.01s/it, loss=0.0362, rmse=0.185, stage=valid]


EarlyStopping counter: 1 out of 3


  cpuset_checked))
100%|██████████| 738/738 [17:32<00:00,  1.43s/it, loss=0.00365, rmse=0.058, stage=train]
  cpuset_checked))
100%|██████████| 123/123 [02:04<00:00,  1.01s/it, loss=0.0347, rmse=0.182, stage=valid]


EarlyStopping counter: 2 out of 3


  cpuset_checked))
100%|██████████| 738/738 [17:32<00:00,  1.43s/it, loss=0.00258, rmse=0.0488, stage=train]
  cpuset_checked))
100%|██████████| 123/123 [02:04<00:00,  1.01s/it, loss=0.0346, rmse=0.181, stage=valid]


EarlyStopping counter: 3 out of 3

Uploaded trained model to S3. Fold :  [0, 1, 2, 3]

Training fold number: 3


  cpuset_checked))
 19%|█▊        | 137/738 [03:16<14:16,  1.42s/it, loss=0.00217, rmse=0.0441, stage=train]

# Neural Net + Support Vector Regression 

In [11]:
# Incorporating RAPIDS's Support Vector Regression
clear_trash()

# Placeholders for Out Of Folda & Test predictions; 
# Needs to be here as we take mean of all folds and use as valid or submit results.
allfolds_oof_nnreg_preds = []
allfolds_oof_svrreg_preds = []
allfolds_oof_validation_tgts = []
allfolds_submit_nnreg_preds = []
allfolds_submit_svrreg_preds = []

# Adding in full path so model will take this in to know where to expect to find images for training
test_img_paths = [args.csv_path+"test/"+f"{x}.jpg" for x in df_test["Id"].values]

# for fold in range(10):
for i in args.fold:

    # We re-initialize the model at every fold
    model = PawpularModel(False, args.nn_model_name) # swin_large_patch4_window12_384 , tf_efficientnetv2_b0 , tf_efficientnetv2_l_in21ft1k

    # What names?
    nn_model_filename = args.nn_model_filename+f"_f{i}.bin"
    svr_filename = args.svr_filename+f"_f{i}.pkl"
    nn_prefix_name = args.nn_prefix_name
    svr_prefix_name = args.svr_prefix_name

##############################################################################    
    # Are we doing a submission? Are we doing an SVR fitting?
    if not args.isSVRfitting and not args.isSUBMIT:
        if args.sink == 's3':
            try:
                s3r.Object(BUCKET_NAME, svr_prefix_name+svr_filename).download_file(svr_filename)
                LOAD_SVR_FROM_PATH = './'
            except:
                print('File name : ', nn_prefix_name+nn_model_filename)
                print("Did not manage to download model bin for fold. Exiting")                
        if args.sink == 'kaggle':
            LOAD_SVR_FROM_PATH = args.model_path
        
    if args.isSUBMIT:
        LOAD_SVR_FROM_PATH = args.model_path
    
    # Get neural net model file to reap the hard work we put in 
    if args.sink == 's3':
        try:
            # Downloading it from S3
            s3r.Object(BUCKET_NAME, nn_prefix_name+nn_model_filename).download_file(nn_model_filename) 
            model.load(nn_model_filename, device="cuda", weights_only=True)
        except:
            print('File name : ', nn_prefix_name+nn_model_filename)
            print("Did not manage to download model bin for fold. Exiting")
            break

    if args.sink == 'kaggle':
        try:
            # For this to work, must manually "Add Data"
            model.load(args.model_path+nn_model_filename, device="cuda", weights_only=True)
        except:
            print("Did not manage to load model bin for fold. Sure its there? Exiting")
            break            
    
    # Setting up dataframe for this particular fold
    df_train = df[df.kfold != i].reset_index(drop=True)
    df_valid = df[df.kfold == i].reset_index(drop=True)

    # Adding in full path so model will take this in to know where to expect to find images for training
    train_img_paths = [args.image_path+f"{x}.jpg" for x in df_train["Id"].values]
    valid_img_paths = [args.image_path+f"{x}.jpg" for x in df_valid["Id"].values]


##############################################################################    
    if args.extract_embeds:
        # Extracting embeddings from trained model
        print('Extracting train embedding...')
        
        # Why are we using test_aug (same as valid_aug)
        # Targets divided by 100 here due to use of sigmoid at final output. 100 is multiplied back after.
        train_dataset = PawpularDataset(
            image_paths=train_img_paths,
            dense_features=df_train[dense_features].values,
            targets=df_train.Pawpularity.values/100.00,
            augmentations=valid_aug,
        )

        # Record our predictions from nerual net 
        train_predictions = model.predict(train_dataset, batch_size=2*args.batch_size, n_jobs=-1)

        # Prepare a container to store embeddings
        embed = np.array([]).reshape((0,128+len(dense_features)))

        # For each prediction, we store all rows and cols 1 to the rest.
        # This step takes ~7 mins. Pred does not seem to occur until actually accessing the preds.
        for preds in train_predictions:
            embed = np.concatenate([embed,preds[:,1:]],axis=0)
            
        pickle.dump(embed, open('embed_'+nn_model_filename,"wb"))

        clear_trash()

##############################################################################        
    # Train SVR if no path defined
    if args.isSVRfitting:
        
        # Fit RAPIDS SVR
        print('Fitting SVR...')

        # Init SVR machine. C value here is the regularization parameter ; https://is.gd/hsYtDM 
        # Poly with deg2 performed better than RBF. Cache size above 10k unstable. Gamma: scale is better
        clf = SVR(C=args.regularization, kernel=args.kernel, degree=args.degree, cache_size=8192)        

        # Open extracted embedding pickel
        embed_pkl = pickle.load(open('embed_'+nn_model_filename, "rb"))

        # Fit SVR machine. Essentially Multi In, Single Out here.
        clf.fit(embed_pkl.astype('float32'), df_train.Pawpularity.values.astype('int32'))

        # Save RAPIDS SVR
        print('Saving SVR...')
        pickle.dump(clf, open(svr_filename, "wb"))
        

        if args.sink == 's3':
            s3r.meta.client.upload_file(svr_filename, BUCKET_NAME, svr_prefix_name+svr_filename)
            
        if args.sink == 'kaggle':
            print('Model output sink set to kaggle. Are you sure? *future improvement*')
    
    # Load SVR if we have it.
    else:
        
        print('Loading SVR...',LOAD_SVR_FROM_PATH+svr_filename)
        
        if args.source == 's3':
            s3r.Object(BUCKET_NAME, svr_prefix_name+svr_filename).download_file(svr_filename)
            LOAD_SVR_FROM_PATH = './'
            
        if args.source == 'kaggle':
            if args.isSUBMIT:
                LOAD_SVR_FROM_PATH = args.model_path
        
        clf = pickle.load(open(LOAD_SVR_FROM_PATH+svr_filename, "rb"))

        clear_trash()

##############################################################################
    if args.isOOF:
        # Out of Fold [OOF] Predictions, What is OOF https://is.gd/99qW1p
        print('Predicting Out of Fold...')

        # Instantiate validation dataset objects
        valid_dataset = PawpularDataset(
            image_paths=valid_img_paths,
            dense_features=df_valid[dense_features].values,
            targets=df_valid['Pawpularity'].values/100.00,
            augmentations=valid_aug,
        )

        valid_predictions = model.predict(valid_dataset, batch_size=2*args.batch_size, n_jobs=-1)

        sglfold_oof_nnreg_pred = []
        embed = np.array([]).reshape((0,128+len(dense_features)))
        for preds in valid_predictions:
            sglfold_oof_nnreg_pred.extend(preds[:,:1].ravel().tolist())
            embed = np.concatenate([embed,preds[:,1:]],axis=0)

        sglfold_oof_nnreg_pred = [x * 100 for x in sglfold_oof_nnreg_pred] # [sigmoid(x) * 100 for x in sglfold_oof_nnreg_pred]
        sglfold_oof_svrreg_pred = clf.predict(embed)    
        allfolds_oof_nnreg_preds.append(sglfold_oof_nnreg_pred)
        allfolds_oof_svrreg_preds.append(sglfold_oof_svrreg_pred)

        sglfold_oof_validation_tgts = df_valid['Pawpularity'].values
        allfolds_oof_validation_tgts.append(sglfold_oof_validation_tgts)

        #################################################
        # Compute RMSE
        rsme = np.sqrt( np.mean( (allfolds_oof_validation_tgts[-1] - np.array(allfolds_oof_nnreg_preds[-1]))**2.0 ) )
        print('\nNN RSME =',rsme,'\n')

        rsme = np.sqrt( np.mean( (allfolds_oof_validation_tgts[-1] - np.array(allfolds_oof_svrreg_preds[-1]))**2.0 ) )
        print('SVR RSME =',rsme,'\n')

        oof2 = (1-args.nn_svr_weight)*np.array(allfolds_oof_nnreg_preds[-1]) + args.nn_svr_weight*np.array(allfolds_oof_svrreg_preds[-1])
        rsme = np.sqrt( np.mean( (allfolds_oof_validation_tgts[-1] - oof2)**2.0 ) )
        print('Ensemble RSME =',rsme,'\n')

        clear_trash()

##############################################################################
    if args.isSUBMIT:
        # Testing our predictions
        print('Predicting test...')

        # Initialize test dataset. Actual test images are only used after code submit
        # Notice also that we init targets as array of ones
        test_dataset = PawpularDataset(
            image_paths=test_img_paths,
            dense_features=df_test[dense_features].values,
            targets=np.ones(len(test_img_paths)),
            augmentations=test_aug,
        )
        
        # Store our predictions of test images
        test_predictions = model.predict(test_dataset, batch_size=2*args.batch_size, n_jobs=-1)

        # Store emebddings from test predictions.
        sglfold_submit_nnreg_pred = []
        embed = np.array([]).reshape((0,128+len(dense_features)))
        for preds in test_predictions: #tqdm
            sglfold_submit_nnreg_pred.extend(preds[:,:1].ravel().tolist())
            embed = np.concatenate([embed,preds[:,1:]],axis=0)


        # Final compute for predictions out of NN
        sglfold_submit_nnreg_pred = [x * 100 for x in sglfold_submit_nnreg_pred]

        # Take embeddings from NN, and use SVR to get predictions.
        sglfold_submit_svrreg_pred = clf.predict(embed)

        # Store both predictions above
        allfolds_submit_nnreg_preds.append(sglfold_submit_nnreg_pred)
        allfolds_submit_svrreg_preds.append(sglfold_submit_svrreg_pred)



  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


ClientError: ignored

In [None]:
if args.isOOF:

    true = np.hstack(allfolds_oof_validation_tgts)

    oof = np.hstack(allfolds_oof_nnreg_preds)
    rsme = np.sqrt( np.mean( (oof - true)**2.0 ))
    print('Overall CV NN head RSME =',rsme)

    oof2 = np.hstack(allfolds_oof_svrreg_preds)
    rsme = np.sqrt( np.mean( (oof2 - true)**2.0 ))
    print('Overall CV SVR head RSME =',rsme)

    oof3 = (1-args.nn_svr_weight)*oof + args.nn_svr_weight*oof2
    rsme = np.sqrt( np.mean( (oof3 - true)**2.0 ))
    print('Overall CV Ensemble heads RSME with 50% NN and 50% SVR =',rsme)

In [None]:
if args.isSUBMIT:
    # FORCE SVR WEIGHT TO LOWER VALUE TO HELP PUBLIC LB

    allfolds_submit_nnreg_preds = np.mean(np.column_stack(allfolds_submit_nnreg_preds), axis=1)
    allfolds_submit_svrreg_preds = np.mean(np.column_stack(allfolds_submit_svrreg_preds), axis=1)
    df_test["Pawpularity"] = (1-args.nn_svr_weight)*allfolds_submit_nnreg_preds + args.nn_svr_weight*allfolds_submit_svrreg_preds
    df_test = df_test[["Id", "Pawpularity"]]
    df_test.to_csv("submission.csv", index=False)