In [None]:
!pip install ../input/pretrainedmodelspython/pretrainedmodels-0.7.4-py3-none-any.whl

In [None]:
import gc
import glob
import os
import sys
import json
import random
import time
import re
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np
from scipy.stats import kurtosis, iqr, skew
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix as sk_cmatrix
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import category_encoders as ce

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data
from torch.optim import Optimizer

from joblib import Parallel, delayed
from tqdm import tqdm

import gensim
from gensim.models import FastText
from keras.preprocessing.sequence import pad_sequences
import spacy

In [None]:
def set_seed(seed=4334, deterministic=False):
    """Seed every random number generator the notebook relies on.

    Args:
        seed: Integer seed applied to `random`, NumPy and PyTorch (CPU and
            every CUDA device); it is also exported as ``PYTHONHASHSEED``.
        deterministic: Value assigned to
            ``torch.backends.cudnn.deterministic``. The default ``False``
            keeps the notebook's original setting; pass ``True`` to ask
            cuDNN for deterministic kernels.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # cuda.manual_seed only covers the current device; seed all of them so
    # multi-GPU runs start from the same state as well.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = deterministic
    
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Global run configuration: RNG seed, batch size and number of CV folds.
SEED = 4334
BATCH_SIZE = 512
n_fold = 5

# Seed all RNGs once, before any data or model code runs.
set_seed(SEED)

In [None]:
start_time = time.time()
print("Loading data ...")

# Competition tables: training rows, test rows and the submission template.
train = pd.read_csv('../input/petfinder-adoption-prediction/train/train.csv')
test = pd.read_csv('../input/petfinder-adoption-prediction/test/test.csv')
sample_submission = pd.read_csv('../input/petfinder-adoption-prediction/test/sample_submission.csv')

# Row counts are reused later to size the per-pet arrays and to split the
# concatenated train+test frame back apart.
train_size = train.shape[0]
test_size = test.shape[0]

print(train.shape)
print(test.shape)

print("--- %s seconds ---" % (time.time() - start_time))

# Image feature extraction

In [None]:
import cv2
import os
import pretrainedmodels
import torchvision.transforms as transforms

import fastai
from fastai.imports import *
from fastai.vision import *
from fastai.metrics import *
from fastai.gen_doc.nbdoc import *

In [None]:
# Per-channel mean / std normalisation applied by load_image() after to_tensor().
# NOTE(review): these look like the usual ImageNet RGB statistics — confirm
# they match what the pretrained weights expect.
standardize = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])

"""
Define Dataset to load image
"""

def resize_to_square(im, size=None):
    """Fit an image into a square canvas, padding the remainder with black.

    The longer side is scaled to `size` with the aspect ratio preserved, and
    the shorter side is centred between constant zero-valued borders.

    Args:
        im: Image array whose first two dimensions are (height, width).
        size: Side length of the output square. When omitted, the
            module-level `img_size` is used (looked up at call time).

    Returns:
        The padded image returned by ``cv2.copyMakeBorder``.
    """
    if size is None:
        size = img_size
    old_h, old_w = im.shape[:2]
    ratio = float(size) / max(old_h, old_w)
    # int() truncates, so a very thin image could end up with a 0-pixel side,
    # which cv2.resize rejects; keep at least one pixel.
    new_h = max(1, int(old_h * ratio))
    new_w = max(1, int(old_w * ratio))
    im = cv2.resize(im, (new_w, new_h))  # cv2 takes (width, height)
    delta_w = size - new_w
    delta_h = size - new_h
    # Split the padding as evenly as possible; any odd pixel goes to bottom/right.
    top, bottom = delta_h // 2, delta_h - (delta_h // 2)
    left, right = delta_w // 2, delta_w - (delta_w // 2)
    color = [0, 0, 0]
    return cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
    
def load_image(img_dir):
    """Read one image file and return it as a normalised CUDA tensor.

    Args:
        img_dir: Path of the image file (a file path, despite the name).

    Returns:
        The image after `resize_to_square`, converted with ``to_tensor``,
        normalised with `standardize` and moved to the GPU.

    Raises:
        ValueError: If OpenCV cannot read or decode the file.
    """
    image = cv2.imread(img_dir, 1)  # flag 1: load as a 3-channel colour image
    if image is None:
        # cv2.imread reports failure by returning None instead of raising;
        # fail here with the offending path rather than on `None.shape` later.
        raise ValueError('could not read image: {!r}'.format(img_dir))
    # NOTE(review): cv2 returns channels in BGR order while `standardize`
    # looks like RGB statistics — confirm whether a BGR->RGB conversion is
    # intended; it is left unchanged because downstream features depend on it.
    image = resize_to_square(image)
    image = standardize(transforms.functional.to_tensor(image)).cuda()
    return image

def get_keys(img_dir):
    """Split an image path into its pet id and image number.

    The path is expected to contain ``_images/<PetID>-<number>.jpg``.

    Args:
        img_dir: Path of the image file.

    Returns:
        Tuple ``(pet_id, img_num)``; both are strings.

    Raises:
        ValueError: If the path does not have the expected form.
    """
    # The dot is escaped so only a literal ".jpg" extension matches.
    match = re.search(r'_images/(.*)\.jpg', img_dir)
    if match is None:
        raise ValueError('unexpected image path: {!r}'.format(img_dir))
    # Split on the LAST hyphen so an id that itself contains '-' stays intact.
    pet_id, sep, img_num = match.group(1).rpartition('-')
    if not sep:
        raise ValueError('no "-<number>" suffix in image path: {!r}'.format(img_dir))
    return pet_id, img_num

## Classification Model (seResNeXt50)

In [None]:
# Build the SE-ResNeXt50 architecture with a 1000-class output and load its
# weights from the local file (pretrained=None presumably skips any download — confirm).
seresnext50 = pretrainedmodels.__dict__['se_resnext50_32x4d'](num_classes=1000, pretrained=None)
seresnext50.load_state_dict(torch.load('../input/pretrainedmodelsweights/seresnext50.pth'))
# The extraction loops allocate their batches with .cuda(), so the model is moved to the GPU too.
seresnext50 = seresnext50.cuda()

In [None]:
# Side length of the square produced by resize_to_square (read there as a global).
img_size = 224
# Number of images sent through the classifier per forward pass.
img_batch_size = 48
# Normalises the classifier outputs along the class axis (dim=1).
softmax = nn.Softmax(dim=1)

In [None]:
train_img_dir = glob.glob(f'../input/petfinder-adoption-prediction/train_images/*.jpg')
# Ceiling division: one extra batch when the image count is not a multiple of the batch size.
n_batches = len(train_img_dir) // img_batch_size + (len(train_img_dir) % img_batch_size != 0)

# PetID -> {image number -> predicted class index + 1}. The +1 leaves 0 free,
# which is the fill value of the matrices these predictions are copied into later.
seresnext50_predict_train = {}
seresnext50.eval()
# Inference only: no_grad stops autograd from recording every forward pass,
# which would otherwise keep activations alive on the GPU for no benefit.
with torch.no_grad():
    for b in tqdm(range(n_batches)):
        start = b*img_batch_size
        end = (b+1)*img_batch_size
        batch_pets = train_img_dir[start:end]
        batch_images = torch.zeros((len(batch_pets), 3, img_size, img_size)).cuda()
        for i, img_dir in enumerate(batch_pets):
            batch_images[i] = load_image(img_dir)
        batch_hats = softmax(seresnext50(batch_images))
        # One argmax and one device->host copy per batch instead of one per image.
        batch_classes = batch_hats.argmax(dim=1).cpu().numpy() + 1
        for i, img_dir in enumerate(batch_pets):
            pet_id, img_num = get_keys(img_dir)
            if pet_id not in seresnext50_predict_train:
                seresnext50_predict_train[pet_id] = {}
            seresnext50_predict_train[pet_id][int(img_num)] = batch_classes[i]

In [None]:
test_img_dir = glob.glob(f'../input/petfinder-adoption-prediction/test_images/*.jpg')
# Ceiling division: one extra batch when the image count is not a multiple of the batch size.
n_batches = len(test_img_dir) // img_batch_size + (len(test_img_dir) % img_batch_size != 0)

# PetID -> {image number -> predicted class index + 1}. The +1 leaves 0 free,
# which is the fill value of the matrices these predictions are copied into later.
seresnext50_predict_test = {}
seresnext50.eval()
# Inference only: no_grad stops autograd from recording every forward pass,
# which would otherwise keep activations alive on the GPU for no benefit.
with torch.no_grad():
    for b in tqdm(range(n_batches)):
        start = b*img_batch_size
        end = (b+1)*img_batch_size
        batch_pets = test_img_dir[start:end]
        batch_images = torch.zeros((len(batch_pets), 3, img_size, img_size)).cuda()
        for i, img_dir in enumerate(batch_pets):
            batch_images[i] = load_image(img_dir)
        batch_hats = softmax(seresnext50(batch_images))
        # One argmax and one device->host copy per batch instead of one per image.
        batch_classes = batch_hats.argmax(dim=1).cpu().numpy() + 1
        for i, img_dir in enumerate(batch_pets):
            pet_id, img_num = get_keys(img_dir)
            if pet_id not in seresnext50_predict_test:
                seresnext50_predict_test[pet_id] = {}
            seresnext50_predict_test[pet_id][int(img_num)] = batch_classes[i]

In [None]:
# Process image prediction features

# Keep at most this many images per pet; column j-1 holds image number j.
img_seq_len = 12
pet_ids = train['PetID'].values
seresnext50_pred_train = np.zeros((train_size, img_seq_len))

for i, pet_id in enumerate(tqdm(pet_ids)):
    # Pets without any image keep their all-zero row.
    for j, pred in seresnext50_predict_train.get(pet_id, {}).items():
        # Image numbers are treated as 1-based; the lower bound stops a stray
        # 0 from wrapping around to the last column through index -1.
        if 1 <= j <= img_seq_len:
            seresnext50_pred_train[i, j-1] = pred

pet_ids = test['PetID'].values
seresnext50_pred_test = np.zeros((test_size, img_seq_len))

for i, pet_id in enumerate(tqdm(pet_ids)):
    for j, pred in seresnext50_predict_test.get(pet_id, {}).items():
        if 1 <= j <= img_seq_len:
            seresnext50_pred_test[i, j-1] = pred

In [None]:
# Drop the classifier and the per-image dicts; only the seresnext50_pred_* matrices are kept.
del seresnext50, seresnext50_predict_train, seresnext50_predict_test
gc.collect()

## Cuteness Model (DenseNet201)

In [None]:
def fastai2pytorch(fastai_model, n_class):
    """Assemble a fastai-style body + head network as a plain ``nn.Sequential``.

    Args:
        fastai_model: Architecture constructor passed to fastai's ``create_body``.
        n_class: Number of outputs of the classification head.

    Returns:
        ``nn.Sequential(body, head)``.
    """
    backbone = create_body(fastai_model, False, None)
    # The head takes twice the body's feature count.
    # NOTE(review): presumably because the fastai head concatenates average
    # and max pooling — confirm against the fastai version in use.
    head_in = 2 * callbacks.hooks.num_features_model(backbone)
    classifier = create_head(head_in, n_class, None, ps=0.5, bn_final=False)
    return nn.Sequential(backbone, classifier)

In [None]:
# Two-output DenseNet201 assembled by fastai2pytorch; the weights are taken
# from the 'model' entry of the saved checkpoint.
cute_model = fastai2pytorch(models.densenet201, 2)
cute_model.load_state_dict(torch.load('../input/cat-and-dog-pretrained-weights/densenet201_cuteness.pth')['model'])

In [None]:
class CuteBottom(nn.Module):
    """Split a body+head network so that hidden features can be extracted.

    The head (second child of `cute_model`) is cut four layers from its end:
    everything before the cut becomes `features`, the last four layers stay
    available as `head` for producing class probabilities.
    """

    def __init__(self, cute_model):
        super().__init__()

        parts = list(cute_model.children())
        head_layers = list(parts[1].children())

        self.backbone = parts[0]
        self.features = nn.Sequential(*head_layers[:-4])
        self.head = nn.Sequential(*head_layers[-4:])
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return the hidden features: backbone followed by the truncated head."""
        return self.features(self.backbone(x))

    def predict_on_hidden(self, x):
        """Run the remaining head layers on hidden features and apply softmax."""
        logits = self.head(x)
        return self.softmax(logits)
    
# Wrap the loaded model for feature extraction and move it to the GPU.
cute_bottom = CuteBottom(cute_model)
cute_bottom = cute_bottom.cuda()

# cute_bottom holds references to the layers it needs, so the original name can go.
del cute_model
gc.collect()

In [None]:
# Same input resolution as the classifier pass, with a smaller batch for the DenseNet.
img_size = 224
img_batch_size = 32

In [None]:
train_img_dir = glob.glob(f'../input/petfinder-adoption-prediction/train_images/*.jpg')
# Ceiling division: one extra batch when the image count is not a multiple of the batch size.
n_batches = len(train_img_dir) // img_batch_size + (len(train_img_dir) % img_batch_size != 0)

# PetID -> {image number -> hidden feature vector} and
# PetID -> {image number -> predicted class index + 1}.
img_features_train = {}
img_cute_train = {}
cute_bottom.eval()
# Inference only: no_grad stops autograd from recording every forward pass,
# which would otherwise keep activations alive on the GPU for no benefit.
with torch.no_grad():
    for b in tqdm(range(n_batches)):
        start = b*img_batch_size
        end = (b+1)*img_batch_size
        batch_pets = train_img_dir[start:end]
        batch_images = torch.zeros((len(batch_pets), 3, img_size, img_size)).cuda()
        for i, img_dir in enumerate(batch_pets):
            batch_images[i] = load_image(img_dir)
        batch_preds = cute_bottom(batch_images)
        batch_preds_arr = batch_preds.cpu().numpy()
        # One argmax and one device->host copy per batch instead of one per image.
        batch_classes = cute_bottom.predict_on_hidden(batch_preds).argmax(dim=1).cpu().numpy() + 1
        for i, img_dir in enumerate(batch_pets):
            pet_id, img_num = get_keys(img_dir)
            if pet_id not in img_features_train:
                img_features_train[pet_id] = {}
                img_cute_train[pet_id] = {}
            img_features_train[pet_id][int(img_num)] = batch_preds_arr[i]
            img_cute_train[pet_id][int(img_num)] = batch_classes[i]

In [None]:
test_img_dir = glob.glob(f'../input/petfinder-adoption-prediction/test_images/*.jpg')
# Ceiling division: one extra batch when the image count is not a multiple of the batch size.
n_batches = len(test_img_dir) // img_batch_size + (len(test_img_dir) % img_batch_size != 0)

# PetID -> {image number -> hidden feature vector} and
# PetID -> {image number -> predicted class index + 1}.
img_features_test = {}
img_cute_test = {}
cute_bottom.eval()
# Inference only: no_grad stops autograd from recording every forward pass,
# which would otherwise keep activations alive on the GPU for no benefit.
with torch.no_grad():
    for b in tqdm(range(n_batches)):
        start = b*img_batch_size
        end = (b+1)*img_batch_size
        batch_pets = test_img_dir[start:end]
        batch_images = torch.zeros((len(batch_pets), 3, img_size, img_size)).cuda()
        for i, img_dir in enumerate(batch_pets):
            batch_images[i] = load_image(img_dir)
        batch_preds = cute_bottom(batch_images)
        batch_preds_arr = batch_preds.cpu().numpy()
        # One argmax and one device->host copy per batch instead of one per image.
        batch_classes = cute_bottom.predict_on_hidden(batch_preds).argmax(dim=1).cpu().numpy() + 1
        for i, img_dir in enumerate(batch_pets):
            pet_id, img_num = get_keys(img_dir)
            if pet_id not in img_features_test:
                img_features_test[pet_id] = {}
                img_cute_test[pet_id] = {}
            img_features_test[pet_id][int(img_num)] = batch_preds_arr[i]
            img_cute_test[pet_id][int(img_num)] = batch_classes[i]

In [None]:
# Process image features

# Width of one hidden feature vector and the number of images kept per pet.
img_feat_dim = 512
img_seq_len = 12
pet_ids = train['PetID'].values
img_seq_train = np.zeros((train_size, img_seq_len, img_feat_dim))

for i, pet_id in enumerate(tqdm(pet_ids)):
    # Pets without any image keep their all-zero sequence.
    for j, feat in img_features_train.get(pet_id, {}).items():
        # Image numbers are treated as 1-based; the lower bound stops a stray
        # 0 from wrapping around to the last slot through index -1.
        if 1 <= j <= img_seq_len:
            img_seq_train[i, j-1, :] = feat

pet_ids = test['PetID'].values
img_seq_test = np.zeros((test_size, img_seq_len, img_feat_dim))

for i, pet_id in enumerate(tqdm(pet_ids)):
    for j, feat in img_features_test.get(pet_id, {}).items():
        if 1 <= j <= img_seq_len:
            img_seq_test[i, j-1, :] = feat

In [None]:
# Process image prediction features

# Keep at most this many images per pet; column j-1 holds image number j.
img_seq_len = 12
pet_ids = train['PetID'].values
cuteness_train = np.zeros((train_size, img_seq_len))

for i, pet_id in enumerate(tqdm(pet_ids)):
    # Pets without any image keep their all-zero row.
    for j, cute in img_cute_train.get(pet_id, {}).items():
        # Image numbers are treated as 1-based; the lower bound stops a stray
        # 0 from wrapping around to the last column through index -1.
        if 1 <= j <= img_seq_len:
            cuteness_train[i, j-1] = cute

pet_ids = test['PetID'].values
cuteness_test = np.zeros((test_size, img_seq_len))

for i, pet_id in enumerate(tqdm(pet_ids)):
    for j, cute in img_cute_test.get(pet_id, {}).items():
        if 1 <= j <= img_seq_len:
            cuteness_test[i, j-1] = cute

In [None]:
# The dense img_seq_* / cuteness_* arrays now hold everything; drop the model and the dicts.
del cute_bottom, img_features_train, img_features_test, img_cute_train, img_cute_test
gc.collect()

# Description processing

In [None]:
start_time = time.time()
print("Loading text data ...")
# Missing descriptions become a single space so that every entry is a string.
train_text = train['Description'].fillna(' ')
test_text = test['Description'].fillna(' ')
# Train rows first, then test rows; later cells split on train_size and rely on this order.
text_list = pd.concat([train_text, test_text])

# Whitespace-token count per description; the red line marks 200, the
# max_length used when the sequences are padded further down.
plt.hist([len(sentence.split()) for sentence in text_list], bins=50)
plt.axvline(x=200, color='r')
plt.show()
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
start_time = time.time()
print("Spacy NLP ...")
nlp = spacy.load('en_core_web_lg', disable=['parser','ner','tagger'])
nlp.vocab.add_flag(lambda s: s.lower() in spacy.lang.en.stop_words.STOP_WORDS, spacy.attrs.IS_STOP)
# word_dict maps token text -> integer id, starting at 1 (0 is left for padding).
word_dict = {}
word_index = 1
lemma_dict = {}
docs = nlp.pipe(text_list, n_threads = 6)
word_sequences = []  # one list of token ids per description
sentences = []       # one list of token strings per description (fastText input)
for doc in tqdm(docs):
    word_seq = []
    sentence_seq = []
    for token in doc:
        # Compare by value: `is not "PUNCT"` tested object identity, whose
        # outcome depends on string interning rather than on the tag itself.
        # NOTE(review): 'tagger' is disabled in spacy.load above, so pos_ may
        # always be empty and this filter may never fire — if punctuation is
        # meant to be dropped, token.is_punct is worth considering; confirm.
        if token.pos_ == "PUNCT":
            continue
        if token.text not in word_dict:
            word_dict[token.text] = word_index
            word_index += 1
            lemma_dict[token.text] = token.lemma_
        word_seq.append(word_dict[token.text])
        sentence_seq.append(token.text)
    word_sequences.append(word_seq)
    sentences.append(sentence_seq)
del docs
gc.collect()
# text_list is train followed by test, so the split point is train_size.
train_word_sequences = word_sequences[:train_size]
test_word_sequences = word_sequences[train_size:]
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Every description is brought to max_length token ids by pad_sequences;
# embed_size is the width of the fastText vectors trained below.
max_length = 200
embed_size = 300

# padding='pre' puts the padding in front of sequences shorter than max_length.
train_word_sequences = pad_sequences(train_word_sequences, maxlen=max_length, padding='pre')
test_word_sequences = pad_sequences(test_word_sequences, maxlen=max_length, padding='pre')
print(train_word_sequences[:1])
print(test_word_sequences[:1])
# NOTE(review): pred_prob is not used in the visible part of the notebook — confirm it is still needed.
pred_prob = np.zeros((len(test_word_sequences),), dtype=np.float32)

In [None]:
start_time = time.time()
print("Training fastText ...")

# Train fastText vectors from scratch on the tokenised descriptions (train and test together).
# NOTE(review): `size=` is the gensim 3.x keyword (gensim 4 calls it vector_size) — confirm the pinned version.
ft_model = FastText(size=embed_size, min_count=1)
ft_model.build_vocab(sentences=sentences)
ft_model.train(sentences=sentences, total_examples=len(sentences), epochs=10)
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
start_time = time.time()
print("Loading embedding matrix ...")

def load_fasttext(word_dict):
    """Build the embedding matrix for the vocabulary from the fastText model.

    Args:
        word_dict: Mapping of token text to its integer row index.

    Returns:
        Tuple ``(matrix, n_rows)``. The matrix has one row per vocabulary
        entry plus one extra row; rows that no token maps to stay all-zero.
    """
    vocab_rows = len(word_dict) + 1
    matrix = np.zeros((vocab_rows, embed_size), dtype=np.float32)

    for token, row in tqdm(word_dict.items()):
        matrix[row] = ft_model.wv[token]
    return matrix, vocab_rows

embedding_matrix, nb_words = load_fasttext(word_dict)
n_words = embedding_matrix.shape[0]
print(embedding_matrix.shape)
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# The vectors have been copied into embedding_matrix; the fastText model is no longer needed.
del ft_model
gc.collect()

# Name text processing

In [None]:
start_time = time.time()
print("Loading name data ...")
# Missing names become a single space so that every entry is a string.
train_name = train['Name'].fillna(' ')
test_name = test['Name'].fillna(' ')
# Train rows first, then test rows; later cells split on train_size and rely on this order.
name_list = pd.concat([train_name, test_name])

# Character count per name; the red line marks 20, the maxlen_name used for padding below.
plt.hist([len(name) for name in name_list], bins=40)
plt.axvline(x=20, color='r')
plt.show()
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
start_time = time.time()
print("Generating name sequences ...")

# Known character set. `\\|` spells the same backslash + pipe pair as before
# without relying on the invalid escape sequence `\|`.
all_letters = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 -,;.!?:’/\\|_@#$%ˆ&*˜‘+=()[]{}'
n_letters = len(all_letters)

# Character ids start at 1. Id 0 is what pad_sequences fills with, and row 0
# of name_embed_mat (np.eye(..., k=-1)) is all zeros, so giving 'a' the id 0
# made it indistinguishable from padding. Unknown characters take the next
# free id, n_letters + 1, which is the last row of name_embed_mat.
letter_dict = {letter: i + 1 for i, letter in enumerate(all_letters)}
unknown_index = n_letters + 1
letter_sequences = [
    [letter_dict.get(letter, unknown_index) for letter in name]
    for name in name_list.values
]

# name_list is train followed by test, so the split point is train_size.
train_name_sequences = letter_sequences[:train_size]
test_name_sequences = letter_sequences[train_size:]
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Names are brought to maxlen_name character ids by pad_sequences.
maxlen_name = 20
# One one-hot column per known character plus one for unknown characters.
embed_size_name = n_letters + 1

# padding='pre' puts the padding in front of names shorter than maxlen_name.
train_name_sequences = pad_sequences(train_name_sequences, maxlen=maxlen_name, padding='pre')
test_name_sequences = pad_sequences(test_name_sequences, maxlen=maxlen_name, padding='pre')
print(train_name_sequences[:1])
print(test_name_sequences[:1])

# Shifted identity: row 0 is all zeros and row i is one-hot at column i-1.
# The shape is derived from the alphabet instead of the literals 94 and 93,
# which equal n_letters + 2 and n_letters + 1 for the alphabet defined above.
name_embed_mat = np.eye(embed_size_name + 1, embed_size_name, k=-1, dtype=np.float32)

# Main table feature engineering

In [None]:
# Work on copies so that the raw train/test frames stay untouched.
train_proc = train.copy()
test_proc = test.copy()

In [None]:
# Stack train on top of test so that feature engineering sees both at once;
# the first train_size rows are the training rows.
df = pd.concat([train_proc, test_proc], ignore_index=True, sort=False)
X_temp = df.copy()

# Separate the target from the features.
# NOTE(review): the test rows presumably carry NaN in AdoptionSpeed after the concat — confirm.
y_temp = X_temp['AdoptionSpeed']
X_temp = X_temp.drop(['AdoptionSpeed'], axis=1)

print('NaN structure:\n{}'.format(np.sum(pd.isnull(df))))

## Additional State Data

In [None]:
# External per-state statistics, joined onto every pet row through the State code.
state_info = pd.read_csv('../input/stateinfo/State info.csv')

# Left join keeps every pet row; state columns whose names clash with existing
# ones get the '_state' suffix (presumably the origin of 'Type_state' used later — confirm).
X_temp = X_temp.merge(
    state_info, how='left', on='State',
    suffixes=('', '_state')
)

X_temp['population_density'] = X_temp['Population'] / X_temp['Total Area']

## is_mixed_breed, n_color, contains_chinese

In [None]:
# A pet counts as NOT mixed only when no second breed is given and the first
# breed is not code 307; everything else is flagged 1.
# NOTE(review): 307 is presumably the "Mixed Breed" label id — confirm against the breed table.
# Vectorised: gives the same 0/1 values as the former row-wise apply without a Python call per row.
_not_mixed = (X_temp['Breed2'] == 0) & (X_temp['Breed1'] != 307)
X_temp['is_mixed_breed'] = (~_not_mixed).astype(np.int64)
# Number of colour slots that are filled in (a slot equal to 0 means "no colour").
X_temp['n_color'] = (X_temp[['Color1', 'Color2', 'Color3']] != 0).sum(axis=1).astype(np.int64)

# contains_chinese
def isChinese(s):
    """Classify a value by whether it contains CJK unified ideographs.

    Returns:
        2 if `s` is not a ``str`` (for example a NaN description),
        1 if it contains at least one character in U+4E00..U+9FFF,
        0 otherwise.
    """
    if type(s) is not str:
        return 2
    return 1 if re.search(u'[\u4e00-\u9fff]', s) else 0
# Encoding produced by isChinese: 2 = not a string, 1 = contains a CJK ideograph, 0 = otherwise.
X_temp['contains_chinese'] = X_temp['Description'].map(isChinese)

## is_nameless

In [None]:
names = X_temp.Name.unique()
# Names that are really placeholders ("No Name", "Kitten 3", "Puppies", ...):
# any name whose lowercase form contains one of these fragments.
_placeholder_fragments = ('name', 'kitt', 'pupp', 'cats', 'dogs')
no_names = [
    name for name in names
    if type(name) is not float  # NaN (missing name) shows up as a float
    and any(fragment in name.lower() for fragment in _placeholder_fragments)
]

# Nameless = missing name OR placeholder name. One vectorised pass replaces
# the per-row loop, which re-scanned the no_names list for every pet and only
# wrote to the right rows while the index was 0..n-1.
X_temp['is_nameless'] = (
    pd.isnull(X_temp['Name']) | X_temp['Name'].isin(no_names)
).astype(np.int64)

# Final Processing

In [None]:
# Diagnostic only: NaN count per column, then the columns grouped by dtype
# (object columns are the ones that still need encoding).
print('NaN structure:\n{}'.format(np.sum(pd.isnull(X_temp))))
print('-'*99)

column_types = X_temp.dtypes

print('\tinteger columns:\n{}'.format(column_types[column_types == np.int64]))
print('\n\tfloat columns:\n{}'.format(column_types[column_types == 'float']))
print('\n\tto encode categorical columns:\n{}'.format(column_types[column_types == 'object']))

## Label encoding for categorical features

In [None]:
# X_temp.loc[X_temp['PhotoAmt'] > 10, ['PhotoAmt']] = 11
X_temp['PhotoAmt'] = X_temp['PhotoAmt'].astype(np.int64)

# Number of pets listed by each rescuer, joined back onto every pet row.
rescuer_count = (
    X_temp.groupby(['RescuerID'])['PetID']
    .count()
    .reset_index()
    .rename(columns={'PetID': 'RescuerID_COUNT'})
)
X_temp = X_temp.merge(rescuer_count, how='left', on='RescuerID')

# Columns treated as categorical fields (each is label-encoded below).
cat_cols = [
    'Type', 'Age', 'Breed1', 'Breed2', 'Gender',
    'Color1', 'Color2', 'Color3',
    'MaturitySize', 'FurLength',
    'Vaccinated', 'Dewormed', 'Sterilized', 'Health',
    'Quantity', 'Fee', 'State', 'VideoAmt',
    'is_mixed_breed', 'n_color', 'contains_chinese', 'is_nameless',
    'RescuerID_COUNT', 'Type_state', 'Region',
]

# A fresh encoder per column maps its values to 0..n_unique-1.
for f in cat_cols:
    lbe = LabelEncoder()
    X_temp[f] = lbe.fit_transform(X_temp[f])

In [None]:
# Model configuration: number of categorical fields and the number of distinct values in each.
config = {}
config['field_size'] = len(cat_cols)
config['feature_sizes'] = list(X_temp[cat_cols].nunique())

In [None]:
# Diagnostic only: re-check NaNs and dtypes after label encoding.
print('NaN structure:\n{}'.format(np.sum(pd.isnull(X_temp))))
print('-'*99)

column_types = X_temp.dtypes

print('\tint64 columns:\n{}'.format(column_types[column_types == np.int64]))
print('\tint32 columns:\n{}'.format(column_types[column_types == np.int32]))
print('\n\tfloat columns:\n{}'.format(column_types[column_types == 'float']))
print('\n\tto encode categorical columns:\n{}'.format(column_types[column_types == 'object']))

## Numerical feature processing (normalization, percent encoding)

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# Rescale the state-level numeric columns; the scaler is fitted on the
# combined train+test frame (X_temp holds both).
scaler = MinMaxScaler()
num_cols = [
    'population_density', 'Population', 'Total Area', '2016 GDP(RM Million)', '2016 GDPper capita(RM)',
    'HDI', 'GDP Growth', 'Service', 'Manufacturing', 'Agriculture', 'Mining', 
    'Construction', 'Import'
]
X_temp[num_cols] = scaler.fit_transform(X_temp[num_cols])

In [None]:
def percent_encoder(col_name, main_df):
    """Attach the relative frequency of each value of `col_name` as a new column.

    Args:
        col_name: Column whose values are frequency-encoded.
        main_df: Frame containing `col_name` and a ``PetID`` column.

    Returns:
        A new frame: `main_df` left-joined with ``<col_name>_PERCENT``, the
        count of ``PetID`` entries per value divided by the total row count.
    """
    n_rows = main_df.shape[0]
    shares = (
        main_df.groupby([col_name])['PetID']
        .count()
        .div(n_rows)
        .rename(col_name + '_PERCENT')
        .reset_index()
    )
    return main_df.merge(shares, how='left', on=col_name)

# Add a <column>_PERCENT frequency feature for every categorical column.
for c in cat_cols:
    X_temp = percent_encoder(c, X_temp)

In [None]:
# Diagnostic only: re-check NaNs and dtypes after adding the *_PERCENT columns.
print('NaN structure:\n{}'.format(np.sum(pd.isnull(X_temp))))
print('-'*99)

column_types = X_temp.dtypes

print('\tint64 columns:\n{}'.format(column_types[column_types == np.int64]))
print('\tint32 columns:\n{}'.format(column_types[column_types == np.int32]))
print('\n\tfloat columns:\n{}'.format(column_types[column_types == 'float']))
print('\n\tto encode categorical columns:\n{}'.format(column_types[column_types == 'object']))

# Print categorical and numerical feature names

In [None]:
# Every categorical column has a '<name>_PERCENT' companion produced by
# percent_encoder, so the names are derived from cat_cols rather than
# repeated by hand (same names, same order). Skipping names that are already
# present keeps this cell safe to re-run without duplicating entries.
num_cols.extend(
    name
    for name in [c + '_PERCENT' for c in cat_cols]
    if name not in num_cols
)

print("Categorical features:\n")
print(cat_cols)
print("# categorical features: ", len(cat_cols))

print("Numerical features:\n")
print(num_cols)
print("# numerical features: ", len(num_cols))
print('\nTotal matrix size: ', X_temp.shape)

# Split train/test

In [None]:
# X_temp is train stacked on test, so the first train_size rows are the training set.
train_X, test_X = X_temp.iloc[:train_size, :], X_temp.iloc[train_size:, :]
train_y = y_temp.iloc[:train_size]
train_y = train_y.astype(int)

# Model

In [None]:
# Width of one image feature vector; Image_Model uses it as the Conv1d input channel count.
img_feat_dim = 512

from fastai.text.models import EmbeddingDropout

class Dense(nn.Module):
    """Fully connected block: Linear (no bias) -> PReLU -> Dropout -> BatchNorm1d.

    Args:
        in_channel: Number of input features.
        out_channel: Number of output features.
        dropout: Dropout probability applied after the activation.
    """

    def __init__(self, in_channel, out_channel, dropout=0.25):
        super().__init__()

        self.linear = nn.Linear(in_channel, out_channel, bias=False)
        self.activation = nn.PReLU()
        self.dropout = nn.Dropout(dropout)
        self.bn = nn.BatchNorm1d(out_channel)

    def forward(self, x):
        """Apply the four layers in order and return the normalised activations."""
        activated = self.activation(self.linear(x))
        return self.bn(self.dropout(activated))

class RNNModel(nn.Module):
    """Thin wrapper around a bidirectional, batch-first recurrent layer.

    Args:
        rnn_type: Name of the ``torch.nn`` recurrent class to build (e.g. "LSTM").
        input_size: Number of input features per time step.
        hidden_size: Hidden size per direction.
    """

    def __init__(self, rnn_type, input_size, hidden_size):
        super().__init__()
        rnn_cls = getattr(nn, rnn_type)
        self.rnn = rnn_cls(input_size, hidden_size, bidirectional=True, batch_first=True)

    def init_weights(self):
        """Re-initialise the parameters: Xavier-uniform for input-hidden
        weights, orthogonal for hidden-hidden weights, zeros for biases."""
        # Three separate passes, in this order, so that the random draws
        # happen in the same sequence for a given seed.
        for name, param in self.named_parameters():
            if 'weight_ih' in name:
                nn.init.xavier_uniform_(param.data)
        for name, param in self.named_parameters():
            if 'weight_hh' in name:
                nn.init.orthogonal_(param.data)
        for name, param in self.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param.data, 0)

    def forward(self, x):
        """Return whatever the wrapped recurrent layer returns for `x`."""
        return self.rnn(x)
    
class Image_Model(nn.Module):
    """Encode a sequence of image feature vectors with parallel WideConv branches.

    Args:
        hidden_size: Output channels of every convolution in a branch.
        out_size: Size of the final Dense output.
        kernel_sizes: Kernel sizes used inside each WideConv branch.
        dropout: Dropout probability applied to the input features.
        width: Number of parallel WideConv branches.
    """

    def __init__(self, hidden_size, out_size, kernel_sizes, dropout, width=5):
        super(Image_Model, self).__init__()

        self.feat_dropout = nn.Dropout(dropout)
        self.conv_list = nn.ModuleList([WideConv(img_feat_dim, hidden_size, kernel_sizes) for _ in range(width)])
        self.dense = Dense(hidden_size*len(kernel_sizes)*width, out_size, dropout=0)

    def forward(self, x):
        """Return the Dense encoding of `x`, one row per input sample.

        `x` is transposed so that its last axis becomes the Conv1d channel
        axis (img_feat_dim channels).
        """
        batch = x.size(0)
        out = self.feat_dropout(x.transpose(1, 2))

        # Bring every branch to (batch, features) explicitly. A dimension-less
        # torch.squeeze also removed the batch axis when the batch held a
        # single sample, which broke the concatenation and BatchNorm1d.
        branch_feats = [conv(out).reshape(batch, -1) for conv in self.conv_list]
        return self.dense(torch.cat(branch_feats, 1))
    
class WideConv(nn.Module):
    """Parallel Conv1d -> BatchNorm1d -> ReLU -> global max-pool branches.

    One branch is built per entry of `kernel_sizes`; the pooled outputs of
    all branches are concatenated along the channel axis.

    Args:
        in_channels: Channels of the input sequence.
        out_channels: Channels produced by each branch.
        kernel_sizes: Iterable of convolution kernel sizes, one per branch.
    """

    def __init__(self, in_channels, out_channels, kernel_sizes):
        super(WideConv, self).__init__()

        self.conv_list = nn.ModuleList([nn.Sequential(
            nn.Conv1d(in_channels, out_channels, k, bias=False),
            nn.BatchNorm1d(out_channels),
            nn.ReLU(inplace=True),
            nn.AdaptiveMaxPool1d(1)
        ) for k in kernel_sizes])

    def forward(self, emb):
        """Return a (batch, out_channels * n_branches) tensor for a
        (batch, in_channels, length) input."""
        pooled = [conv(emb) for conv in self.conv_list]
        # Drop only the length-1 axis left by AdaptiveMaxPool1d(1). A
        # dimension-less squeeze also removed the batch axis for a batch of one.
        return torch.cat(pooled, 1).squeeze(-1)
    
class Img_Pred(nn.Module):
    """Encode a sequence of per-image class ids with an embedding and WideConv branches.

    Args:
        n_class: Number of rows in the embedding table (id 0 is the padding id).
        hidden_size: Output channels of every convolution in a branch.
        out_size: Size of the final Dense output.
        embed_size: Embedding dimension of each class id.
        width: Number of parallel WideConv branches.
        kernel_sizes: Kernel sizes used inside each WideConv branch.
        dropout: Dropout probability of the final Dense block.
    """

    def __init__(self, n_class, hidden_size, out_size, embed_size, width, kernel_sizes, dropout):
        super(Img_Pred, self).__init__()

        self.width = width
        self.embed = nn.Embedding(n_class, embed_size, padding_idx=0)

        self.conv_list = nn.ModuleList([WideConv(embed_size, hidden_size, kernel_sizes) for _ in range(width)])
        self.dense = Dense(hidden_size*len(kernel_sizes)*width, out_size, dropout=dropout)

    def forward(self, x):
        """Return the Dense encoding of the id sequences in `x`, one row per sample."""
        batch = x.size(0)
        # Embedding output is (batch, seq, embed); Conv1d wants channels on axis 1.
        emb = self.embed(x).transpose(1, 2)

        # Bring every branch to (batch, features) explicitly: WideConv squeezes
        # its output, which for a single-sample batch can drop the batch axis
        # and make the concatenation along dim 1 fail.
        branch_feats = [conv(emb).reshape(batch, -1) for conv in self.conv_list]
        return self.dense(torch.cat(branch_feats, 1))
    
class LSTM_TextCNN(nn.Module):
    """Text encoder: frozen embeddings -> BiLSTM -> four parallel convolutions -> Dense.

    Args:
        hidden_size: Hidden size per LSTM direction and channels of each convolution.
        out_size: Size of the final Dense output.
        embedding_matrix: Pretrained embedding weights, one row per token id.
        embed_size: Embedding dimension (the LSTM input size).
        dropout: Probability passed to EmbeddingDropout.
        initialization: Unused; kept so existing call sites keep working.
    """

    def __init__(self, hidden_size, out_size, embedding_matrix, embed_size, dropout, initialization=True):
        super(LSTM_TextCNN, self).__init__()

        # Size the table from the matrix that is actually loaded instead of
        # the notebook-level global n_words, so the module does not depend on
        # hidden state (n_words is len(embedding_matrix) in this notebook).
        weights = torch.tensor(embedding_matrix, dtype=torch.float32)
        self.embedding = nn.Embedding(weights.size(0), embed_size, padding_idx=0)
        self.embedding.weight = nn.Parameter(weights)
        self.embedding.weight.requires_grad = False  # embeddings stay frozen

        self.embedding_dropout = EmbeddingDropout(self.embedding, dropout)

        self.lstm = RNNModel("LSTM", embed_size, hidden_size)
        self.lstm.init_weights()

        # Kernel sizes 1-4 over the BiLSTM output (2 * hidden_size channels).
        self.conv1 = nn.Conv1d(hidden_size*2, hidden_size, 1, bias=False)
        self.bn1 = nn.BatchNorm1d(hidden_size)
        self.conv2 = nn.Conv1d(hidden_size*2, hidden_size, 2, bias=False)
        self.bn2 = nn.BatchNorm1d(hidden_size)
        self.conv3 = nn.Conv1d(hidden_size*2, hidden_size, 3, bias=False)
        self.bn3 = nn.BatchNorm1d(hidden_size)
        self.conv4 = nn.Conv1d(hidden_size*2, hidden_size, 4, bias=False)
        self.bn4 = nn.BatchNorm1d(hidden_size)
        self.relu = nn.ReLU(inplace=True)

        self.dense = Dense(hidden_size*4, out_size, dropout=0)

    def forward(self, x):
        """Return the Dense encoding of the token-id sequences in `x`, one row per sample."""
        embed = self.embedding_dropout(x)
        out, _ = self.lstm(embed)

        # batch_first LSTM output is (batch, seq, channels); Conv1d wants channels on axis 1.
        out = out.transpose(1, 2)

        pooled = []
        for conv, bn in ((self.conv1, self.bn1), (self.conv2, self.bn2),
                         (self.conv3, self.bn3), (self.conv4, self.bn4)):
            feat = bn(self.relu(conv(out)))
            pooled.append(F.adaptive_max_pool1d(feat, 1))

        # Drop only the pooled length-1 axis. A dimension-less squeeze also
        # removed the batch axis for a single-sample batch.
        conc = torch.cat(pooled, 1).squeeze(-1)
        return self.dense(conc)

class NFM(torch.nn.Module):
    """
    :parameter
    -------------
    field_size: size of the feature fields
    feature_sizes: a field_size-dim array, sizes of the feature dictionary
    embedding_size: size of the feature embedding
    is_shallow_dropout: bool, shallow part(fm or ffm part) uses dropout or not?
    dropout_shallow: an array of the size of 1, example:[0.5], the element is for the-first order part
    h_depth: deep network's hidden layers' depth
    deep_layers: a h_depth-dim array, each element is the size of corresponding hidden layers. example:[32,32] h_depth = 2
    is_deep_dropout: bool, deep part uses dropout or not?
    dropout_deep: an array of dropout factors,example:[0.5,0.5,0.5] h_depth=2
    deep_layers_activation: relu or sigmoid etc
    is_batch_norm：bool,  use batch_norm or not ?
    random_seed: random_seed=950104 someone's birthday, my lukcy number
    use_fm: bool
    use_ffm: bool
    interation_type: bool, When it's true, the element-wise product of the fm or ffm embeddings will be added together, otherwise, the element-wise prodcut of embeddings will be concatenated.
    use_cuda: bool use gpu or cpu?
    out_size: output size

    Attention: only support logsitcs regression
    """
    def __init__(self,field_size, feature_sizes, embedding_size = 8, is_shallow_dropout = True, dropout_shallow = [0.25],
                 h_depth = 2, deep_layers = [128, 128], is_deep_dropout = True, dropout_deep=[0.25, 0.25, 0.25],
                 deep_layers_activation = 'relu', is_batch_norm = False, random_seed = SEED,
                 use_fm = True, use_ffm = False, interation_type = True,
                 use_cuda = True, out_size = 1
                 ):
        super(NFM, self).__init__()
        self.field_size = field_size
        self.feature_sizes = feature_sizes
        self.out_size = out_size
        self.embedding_size = embedding_size
        self.is_shallow_dropout = is_shallow_dropout
        self.dropout_shallow = dropout_shallow
        self.h_depth = h_depth
        self.deep_layers = deep_layers
        self.is_deep_dropout = is_deep_dropout
        self.dropout_deep = dropout_deep
        self.deep_layers_activation = deep_layers_activation
        self.is_batch_norm = is_batch_norm
        self.random_seed = random_seed
        self.use_fm = use_fm
        self.use_ffm = use_ffm
        self.interation_type = interation_type
        self.use_cuda = use_cuda
        self.out_size = out_size

        torch.manual_seed(self.random_seed)

        """
            check cuda
        """
        if self.use_cuda and not torch.cuda.is_available():
            self.use_cuda = False
            print("Cuda is not available, automatically changed into cpu model")

        """
            check use fm or ffm
        """
        if self.use_fm and self.use_ffm:
            print("only support one type only, please make sure to choose only fm or ffm part")
            exit(1)
        elif self.use_fm:
            print("The model is nfm(fm+nn layers)")
        elif self.use_ffm:
            print("The model is nffm(ffm+nn layers)")
        else:
            print("You have to choose more than one of (fm, ffm) models to use")
            exit(1)
        """
            bias
        """
        self.bias = torch.nn.Parameter(torch.randn(1, self.out_size))

        """
            fm part
        """
        if self.use_fm:
#             print("Init fm part")
            self.fm_first_order_embeddings = nn.ModuleList([nn.Embedding(feature_size,1) for feature_size in self.feature_sizes])
            if self.dropout_shallow:
                self.fm_first_order_dropout = nn.Dropout(self.dropout_shallow[0])
            self.fm_second_order_embeddings = nn.ModuleList([nn.Embedding(feature_size, self.embedding_size) for feature_size in self.feature_sizes])
#             print("Init fm part succeed")

        """
            ffm part
        """
        if self.use_ffm:
#             print("Init ffm part")
            self.ffm_first_order_embeddings = nn.ModuleList([nn.Embedding(feature_size,1) for feature_size in self.feature_sizes])
            if self.dropout_shallow:
                self.ffm_first_order_dropout = nn.Dropout(self.dropout_shallow[0])
            self.ffm_second_order_embeddings = nn.ModuleList([nn.ModuleList([nn.Embedding(feature_size, self.embedding_size) for i in range(self.field_size)]) for feature_size in self.feature_sizes])
#             print("Init ffm part succeed")

        """
            deep part
        """
#         print("Init deep part")

        if self.is_deep_dropout:
            self.linear_0_dropout = nn.Dropout(self.dropout_deep[0])
        if self.interation_type:
            self.linear_1 = nn.Linear(self.embedding_size, deep_layers[0])
        else:
            self.linear_1 = nn.Linear(self.field_size*(self.field_size-1)//2, deep_layers[0])
        if self.is_batch_norm:
            self.batch_norm_1 = nn.BatchNorm1d(deep_layers[0])
        if self.is_deep_dropout:
            self.linear_1_dropout = nn.Dropout(self.dropout_deep[1])
        for i, h in enumerate(self.deep_layers[1:], 1):
            setattr(self, 'linear_' + str(i + 1), nn.Linear(self.deep_layers[i - 1], self.deep_layers[i]))
            if self.is_batch_norm:
                setattr(self, 'batch_norm_' + str(i + 1), nn.BatchNorm1d(deep_layers[i]))
            if self.is_deep_dropout:
                setattr(self, 'linear_' + str(i + 1) + '_dropout', nn.Dropout(self.dropout_deep[i + 1]))

#         print("Init deep part succeed")
        
        self.dense = Dense(deep_layers[1], out_size, dropout=0)

#         print("Init succeed")

    def forward(self, Xi, Xv):
        """
        Compute the enabled FM / FFM interaction terms and push the second-order
        interaction through the deep layers and the final ``self.dense`` block.

        :param Xi: index input tensor, batch_size * k * 1
        :param Xv: value input tensor, batch_size * k (``Xv[:, i]`` scales field i)
        :return: the last output
        """
        # ---- fm part ----
        if self.use_fm:
            # NOTE(review): the first-order term is computed (and optionally dropped
            # out) but is not consumed by the deep part or the output below.
            fm_first_order_emb_arr = [(torch.sum(emb(Xi[:, i, :]), 1).t() * Xv[:, i]).t()
                                      for i, emb in enumerate(self.fm_first_order_embeddings)]
            fm_first_order = torch.cat(fm_first_order_emb_arr, 1)
            if self.is_shallow_dropout:
                fm_first_order = self.fm_first_order_dropout(fm_first_order)

            # One value-scaled embedding per field; shared by both interaction modes.
            fm_second_order_emb_arr = [(torch.sum(emb(Xi[:, i, :]), 1).t() * Xv[:, i]).t()
                                       for i, emb in enumerate(self.fm_second_order_embeddings)]
            if self.interation_type:
                # use 2xy = (x+y)^2 - x^2 - y^2 reduce calculation
                fm_sum_second_order_emb = sum(fm_second_order_emb_arr)
                fm_sum_second_order_emb_square = fm_sum_second_order_emb * fm_sum_second_order_emb  # (x+y)^2
                fm_second_order_emb_square = [item * item for item in fm_second_order_emb_arr]
                fm_second_order_emb_square_sum = sum(fm_second_order_emb_square)  # x^2+y^2
                fm_second_order = (fm_sum_second_order_emb_square - fm_second_order_emb_square_sum) * 0.5
            else:
                # Keep every pairwise element-wise product separately.
                fm_wij_arr = []
                for i in range(self.field_size):
                    for j in range(i + 1, self.field_size):
                        fm_wij_arr.append(fm_second_order_emb_arr[i] * fm_second_order_emb_arr[j])

        # ---- ffm part ----
        if self.use_ffm:
            # NOTE(review): as for FM, the first-order term is not used further down.
            ffm_first_order_emb_arr = [(torch.sum(emb(Xi[:, i, :]), 1).t() * Xv[:, i]).t()
                                       for i, emb in enumerate(self.ffm_first_order_embeddings)]
            ffm_first_order = torch.cat(ffm_first_order_emb_arr, 1)
            if self.is_shallow_dropout:
                ffm_first_order = self.ffm_first_order_dropout(ffm_first_order)
            # ffm_second_order_emb_arr[i][j]: embedding of field i taken from its j-th table.
            ffm_second_order_emb_arr = [[(torch.sum(emb(Xi[:, i, :]), 1).t() * Xv[:, i]).t() for emb in f_embs]
                                        for i, f_embs in enumerate(self.ffm_second_order_embeddings)]
            ffm_wij_arr = []
            for i in range(self.field_size):
                for j in range(i + 1, self.field_size):
                    ffm_wij_arr.append(ffm_second_order_emb_arr[i][j] * ffm_second_order_emb_arr[j][i])
            ffm_second_order = sum(ffm_wij_arr)

        # ---- deep part ----
        # FM takes precedence over FFM when both are enabled.
        if self.use_fm and self.interation_type:
            deep_emb = fm_second_order
        elif self.use_ffm and self.interation_type:
            deep_emb = ffm_second_order
        elif self.use_fm:
            # One scalar per field pair: sum each pairwise product over the embedding axis.
            deep_emb = torch.cat([torch.sum(fm_wij, 1).view([-1, 1]) for fm_wij in fm_wij_arr], 1)
        else:
            deep_emb = torch.cat([torch.sum(ffm_wij, 1).view([-1, 1]) for ffm_wij in ffm_wij_arr], 1)

        # torch.sigmoid / torch.tanh replace the deprecated F.sigmoid / F.tanh aliases.
        if self.deep_layers_activation == 'sigmoid':
            activation = torch.sigmoid
        elif self.deep_layers_activation == 'tanh':
            activation = torch.tanh
        else:
            activation = F.relu

        if self.is_deep_dropout:
            deep_emb = self.linear_0_dropout(deep_emb)
        x_deep = self.linear_1(deep_emb)
        if self.is_batch_norm:
            x_deep = self.batch_norm_1(x_deep)
        x_deep = activation(x_deep)
        if self.is_deep_dropout:
            x_deep = self.linear_1_dropout(x_deep)
        # Remaining deep layers are registered as linear_2, linear_3, ... by name.
        for i in range(1, len(self.deep_layers)):
            x_deep = getattr(self, 'linear_' + str(i + 1))(x_deep)
            if self.is_batch_norm:
                x_deep = getattr(self, 'batch_norm_' + str(i + 1))(x_deep)
            x_deep = activation(x_deep)
            if self.is_deep_dropout:
                x_deep = getattr(self, 'linear_' + str(i + 1) + '_dropout')(x_deep)

        # ---- output ----
        out = self.dense(x_deep)
        return out

In [None]:
def kappa_loss(p, y, n_classes=5, eps=1e-10):
    """
    QWK loss function as described in https://arxiv.org/pdf/1612.00775.pdf

    Arguments:
        p: a tensor with probability predictions, [batch_size, n_classes],
        y: a tensor with one-hot encoded class labels, [batch_size, n_classes]
        n_classes: number of rating classes (size of the weight matrix)
        eps: small constant added to the denominator for numerical stability
    Returns:
        QWK loss: weighted observed disagreement divided by weighted expected
        disagreement (0 for perfect agreement, about 1 for chance-level agreement)
    """
    # Quadratic penalty W[i, j] = (i - j)^2, built on the same device as the
    # predictions so the loss does not depend on a module-level `device`.
    ratings = torch.arange(n_classes, dtype=torch.float32, device=p.device)
    W = (ratings.view(-1, 1) - ratings.view(1, -1)) ** 2

    # O: soft confusion matrix; E: outer product of the marginals, scaled so that
    # it carries the same total mass as O.
    O = torch.matmul(y.t(), p)
    E = torch.matmul(y.sum(dim=0).view(-1, 1), p.sum(dim=0).view(1, -1)) / O.sum()

    return (W * O).sum() / ((W * E).sum() + eps)

def one_hot(batch, depth):
    """
    One-hot encode a tensor of integer class indices.

    Arguments:
        batch: integer (long) tensor of class indices in [0, depth)
        depth: number of classes
    Returns:
        float tensor of shape batch.shape + (depth,), on the same device as
        `batch`, that does not require gradients
    """
    # Row lookup in an identity matrix. Unlike building a throwaway nn.Embedding,
    # this draws no random numbers and follows the device of the input.
    return torch.eye(depth, device=batch.device)[batch]

def label_smoother(batch, depth, epsilon):
    """
    Encode integer class indices as label-smoothed target distributions.

    The true class receives 1 - epsilon and every other class receives
    epsilon / (depth - 1), so each row sums to 1.

    Arguments:
        batch: integer (long) tensor of class indices in [0, depth)
        depth: number of classes (must be greater than 1)
        epsilon: total probability mass moved away from the true class
    Returns:
        float tensor of shape batch.shape + (depth,), on the same device as `batch`
    """
    off_value = epsilon / (depth - 1)
    # Lookup table built directly on the input's device; no throwaway nn.Embedding
    # (and hence no random initialisation) is needed.
    table = (torch.empty(depth, depth, device=batch.device).fill_(off_value)
             + torch.eye(depth, device=batch.device) * (1 - off_value * depth))
    return table[batch]

In [None]:
# Based on https://github.com/pytorch/pytorch/pull/3740
import math

class Nadam(Optimizer):
    """Implements Nadam algorithm (a variant of Adam based on Nesterov momentum).
    It has been proposed in `Incorporating Nesterov Momentum into Adam`__.
    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 2e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        schedule_decay (float, optional): momentum schedule decay (default: 4e-3)
    __ http://cs229.stanford.edu/proj2015/054_report.pdf
    __ http://www.cs.toronto.edu/~fritz/absps/momentum.pdf
    """

    def __init__(self, params, lr=2e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, schedule_decay=4e-3):
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, schedule_decay=schedule_decay)
        super(Nadam, self).__init__(params, defaults)

    def step(self, closure=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        Returns:
            The value returned by `closure`, or None when no closure is given.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    state['m_schedule'] = 1.
                    state['exp_avg'] = torch.zeros_like(grad)
                    state['exp_avg_sq'] = torch.zeros_like(grad)

                # Warming momentum schedule
                m_schedule = state['m_schedule']
                schedule_decay = group['schedule_decay']
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']
                eps = group['eps']
                state['step'] += 1
                t = state['step']

                # The tensor ops below use the keyword forms (alpha= / value=);
                # the scalar-first positional overloads are deprecated.
                if group['weight_decay'] != 0:
                    # L2 penalty: out-of-place so p.grad itself is left untouched.
                    grad = grad.add(p.data, alpha=group['weight_decay'])

                momentum_cache_t = beta1 * \
                    (1. - 0.5 * (0.96 ** (t * schedule_decay)))
                momentum_cache_t_1 = beta1 * \
                    (1. - 0.5 * (0.96 ** ((t + 1) * schedule_decay)))
                # Running product of the momentum coefficients up to step t / t + 1.
                m_schedule_new = m_schedule * momentum_cache_t
                m_schedule_next = m_schedule * momentum_cache_t * momentum_cache_t_1
                state['m_schedule'] = m_schedule_new

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1. - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1. - beta2)
                exp_avg_sq_prime = exp_avg_sq / (1. - beta2 ** t)
                denom = exp_avg_sq_prime.sqrt_().add_(eps)

                # Nesterov-style update: one part from the current gradient, one
                # part from the (look-ahead) first-moment estimate.
                p.data.addcdiv_(grad, denom,
                                value=-group['lr'] * (1. - momentum_cache_t) / (1. - m_schedule_new))
                p.data.addcdiv_(exp_avg, denom,
                                value=-group['lr'] * momentum_cache_t_1 / (1. - m_schedule_next))

        return loss

class CosineLRWithRestarts():
    """Decays learning rate with cosine annealing, normalizes weight decay
    hyperparameter value, implements restarts.
    https://arxiv.org/abs/1711.05101
    Args:
        optimizer (Optimizer): Wrapped optimizer.
        batch_size: minibatch size
        epoch_size: training samples per epoch
        restart_period: epoch count in the first restart period
        t_mult: multiplication factor by which the next restart period will extend/shrink
        last_epoch: -1 starts fresh and records each group's current lr as
            'initial_lr'; any other value requires 'initial_lr' to be present
            in every param group already
        eta_threshold: number of restarts after which the [eta_min, eta_max]
            range is narrowed by 0.09 per additional restart
        verbose: print a message on every restart
    Example:
        >>> scheduler = CosineLRWithRestarts(optimizer, 32, 1024, restart_period=5, t_mult=1.2)
        >>> for epoch in range(100):
        >>>     scheduler.step()
        >>>     train(...)
        >>>         ...
        >>>         optimizer.zero_grad()
        >>>         loss.backward()
        >>>         optimizer.step()
        >>>         scheduler.batch_step()
        >>>     validate(...)
    """

    def __init__(self, optimizer, batch_size, epoch_size, restart_period=100,
                 t_mult=2, last_epoch=-1, eta_threshold=1000, verbose=False):
        if not isinstance(optimizer, Optimizer):
            raise TypeError('{} is not an Optimizer'.format(
                type(optimizer).__name__))
        self.optimizer = optimizer
        if last_epoch == -1:
            for group in optimizer.param_groups:
                group.setdefault('initial_lr', group['lr'])
        else:
            for i, group in enumerate(optimizer.param_groups):
                if 'initial_lr' not in group:
                    raise KeyError("param 'initial_lr' is not specified "
                                   "in param_groups[{}] when resuming an"
                                   " optimizer".format(i))
        self.base_lrs = [group['initial_lr'] for group in optimizer.param_groups]

        self.last_epoch = last_epoch
        self.batch_size = batch_size
        self.epoch_size = epoch_size
        self.eta_threshold = eta_threshold
        self.t_mult = t_mult
        self.verbose = verbose
        self.base_weight_decays = [group['weight_decay']
                                   for group in optimizer.param_groups]
        self.restart_period = restart_period
        self.restarts = 0
        self.t_epoch = -1
        # Stays empty until step() is called, so a premature batch_step() fails
        # with the explanatory StopIteration instead of an AttributeError.
        self.batch_increment = iter(())

    def _schedule_eta(self):
        """
        Threshold value could be adjusted to shrink eta_min and eta_max values.
        """
        eta_min = 0
        eta_max = 1
        if self.restarts <= self.eta_threshold:
            return eta_min, eta_max
        else:
            d = self.restarts - self.eta_threshold
            k = d * 0.09
            return (eta_min + k, eta_max - k)

    def get_lr(self, t_cur):
        """Return an iterator of (lr, weight_decay) pairs, one per param group,
        for the fractional epoch position `t_cur` within the current period.

        Side effect: when the current period is exhausted, the restart
        bookkeeping (restart_period, restarts, t_epoch) is updated here, after
        the values for `t_cur` have been computed.
        """
        eta_min, eta_max = self._schedule_eta()

        eta_t = (eta_min + 0.5 * (eta_max - eta_min)
                 * (1. + math.cos(math.pi *
                                  (t_cur / self.restart_period))))

        # Weight decay normalisation from the AdamW paper (see class docstring).
        weight_decay_norm_multi = math.sqrt(self.batch_size /
                                            (self.epoch_size *
                                             self.restart_period))
        lrs = [base_lr * eta_t for base_lr in self.base_lrs]
        weight_decays = [base_weight_decay * eta_t * weight_decay_norm_multi
                         for base_weight_decay in self.base_weight_decays]

        # True exactly when t_epoch has reached restart_period.
        if self.t_epoch % self.restart_period < self.t_epoch:
            if self.verbose:
                print("Restart at epoch {}".format(self.last_epoch))
            self.restart_period *= self.t_mult
            self.restarts += 1
            self.t_epoch = 0

        return zip(lrs, weight_decays)

    def _set_batch_size(self):
        """Prepare the within-epoch fractions consumed by batch_step()."""
        d, r = divmod(self.epoch_size, self.batch_size)
        # One value per batch plus one extra that step() consumes itself.
        batches_in_epoch = d + 2 if r > 0 else d + 1
        self.batch_increment = iter(torch.linspace(0, 1, batches_in_epoch))

    def step(self):
        """Advance to the next epoch; call once before the epoch's batches."""
        self.last_epoch += 1
        self.t_epoch += 1
        self._set_batch_size()
        self.batch_step()

    def batch_step(self):
        """Advance within the epoch and write lr / weight_decay to the optimizer.

        Raises:
            StopIteration: when called more often than the epoch has batches
                (or before the first step()).
        """
        try:
            t_cur = self.t_epoch + next(self.batch_increment)
        except StopIteration:
            raise StopIteration(
                "batch_step() was called more times than there are batches in "
                "an epoch: call step() at the start of every epoch and use the "
                "same batch_size/epoch_size as the training loop.")
        for param_group, (lr, weight_decay) in zip(self.optimizer.param_groups,
                                                   self.get_lr(t_cur)):
            param_group['lr'] = lr
            param_group['weight_decay'] = weight_decay

## Mean Target Encoding: In-Fold Operation

In [None]:
"""
Define Encoding Functions
"""

def add_noise(series, noise_level):
    """Scale every element of `series` by (1 + noise_level * z), z ~ N(0, 1).

    One standard-normal draw per element is taken from numpy's global RNG,
    even when `noise_level` is 0.
    """
    jitter = np.random.randn(len(series))
    scale = 1 + noise_level * jitter
    return series * scale

def target_encoder(trn_series,
                   val_series,
                   tst_series,
                   target,
                   min_samples_leaf=1,
                   smoothing=1,
                   noise_level=0):
    """
    Smoothed mean target encoding fitted on the training series only.

    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    val_series : validation categorical feature as a pd.Series (same name)
    tst_series : test categorical feature as a pd.Series (same name)
    target : target data as a pd.Series, aligned with trn_series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior
    noise_level (float) : relative Gaussian noise applied to the encodings via add_noise

    Returns a tuple of three numpy arrays (train, validation, test encodings).
    Categories not seen in trn_series are encoded with the global target mean.
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == val_series.name
    assert trn_series.name == tst_series.name
    key = trn_series.name

    temp = pd.concat([trn_series, target], axis=1)
    # Per-category target mean and sample count
    averages = temp.groupby(by=key)[target.name].agg(["mean", "count"])
    # Sigmoid weight in (0, 1): the bigger the count, the more the category mean
    # is trusted over the prior (kept under its own name so the `smoothing`
    # argument is not shadowed).
    weight = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    prior = target.mean()
    # Two-column lookup table: [key, 'average']
    lookup = (prior * (1 - weight) + averages["mean"] * weight).rename('average').reset_index()

    def _encode(series):
        # Left-join the lookup onto the series; unseen categories fall back to the prior.
        encoded = pd.merge(
            series.to_frame(key),
            lookup,
            on=key,
            how='left')['average'].rename(key + '_mean').fillna(prior)
        # pd.merge does not keep the index so restore it
        encoded.index = series.index
        return encoded

    ft_trn_series = _encode(trn_series)
    ft_val_series = _encode(val_series)
    ft_tst_series = _encode(tst_series)

    # Final processing (noise is drawn in train, validation, test order)
    return (add_noise(ft_trn_series, noise_level).values,
            add_noise(ft_val_series, noise_level).values,
            add_noise(ft_tst_series, noise_level).values)

def target_encoding_process(cat_cols, trn_df, val_df, tst_df, trn_y):
    """
    Target-encode every column in `cat_cols` and min-max scale the result.

    Each column is encoded with target_encoder (fitted on trn_df / trn_y), then
    the train, validation and test encodings are scaled together to [0, 1].
    A column whose encodings are all identical is mapped to zeros instead of
    producing 0/0 = NaN.

    Returns:
        (trn_mat, val_mat, tst_mat): float arrays of shape
        (len(df), len(cat_cols)), one column per entry of `cat_cols`.
    """
    n_trn, n_val = len(trn_df), len(val_df)
    trn_mat = np.empty((n_trn, len(cat_cols)))
    val_mat = np.empty((n_val, len(cat_cols)))
    tst_mat = np.empty((len(tst_df), len(cat_cols)))

    for i, c in enumerate(cat_cols):
        whole_arr = np.concatenate(target_encoder(trn_df[c], val_df[c], tst_df[c], trn_y))
        lo = whole_arr.min()
        span = whole_arr.max() - lo
        if span == 0:
            # Constant column: nothing to scale, and dividing would yield NaN.
            whole_arr = np.zeros_like(whole_arr)
        else:
            whole_arr = (whole_arr - lo) / span
        trn_mat[:, i] = whole_arr[:n_trn]
        val_mat[:, i] = whole_arr[n_trn:(n_trn + n_val)]
        tst_mat[:, i] = whole_arr[(n_trn + n_val):]

    return trn_mat, val_mat, tst_mat

In [None]:
# Model input order: Xi, Xv, Xnum, Xtext, Xname, Ximg, Xpred, Xcute

# Fixed, seeded folds shared by the training cells below.
splits = list(StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=SEED).split(train_X, train_y))

# Test tensors are created once, directly on `device`, so the notebook also
# runs on a CPU-only machine.
xi_test = torch.tensor(test_X[cat_cols].values, dtype=torch.long, device=device)
# Every categorical field gets value 1.0; built before xi_test gains its trailing axis.
xv_test = torch.ones_like(xi_test, dtype=torch.float)
xi_test = xi_test.unsqueeze(2)
# Raw numeric columns only; the Model 1 training cell rebuilds xnum_test per
# fold with the target-encoded columns appended.
xnum_test = torch.tensor(test_X[num_cols].values, dtype=torch.float, device=device)
xtext_test = torch.tensor(test_word_sequences, dtype=torch.long, device=device)
xname_test = torch.tensor(test_name_sequences, dtype=torch.long, device=device)
ximg_test = torch.tensor(img_seq_test, dtype=torch.float, device=device)
xpred_test = torch.tensor(seresnext50_pred_test, dtype=torch.long, device=device)
xcute_test = torch.tensor(cuteness_test, dtype=torch.long, device=device)

# The test TensorDataset / DataLoader are built inside the per-fold training loop.

# Training

In [None]:
# Evaluation Functions

def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Returns the confusion matrix between rater's ratings

    rater_a, rater_b: equal-length sequences (lists or numpy arrays) of integer ratings
    min_rating, max_rating: rating range; inferred from both raters when None

    The result is a list of lists where entry [i][j] counts the items rated
    (min_rating + i) by rater_a and (min_rating + j) by rater_b.
    """
    assert(len(rater_a) == len(rater_b))
    # Take min/max per rater: `rater_a + rater_b` concatenates lists but adds
    # numpy arrays elementwise, which would yield a wrong range.
    if min_rating is None:
        min_rating = min(min(rater_a), min(rater_b))
    if max_rating is None:
        max_rating = max(max(rater_a), max(rater_b))
    num_ratings = int(max_rating - min_rating + 1)
    conf_mat = [[0] * num_ratings for _ in range(num_ratings)]
    for a, b in zip(rater_a, rater_b):
        conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat

def histogram(ratings, min_rating=None, max_rating=None):
    """
    Count how often each rating occurs.

    Slot k of the returned list holds the number of ratings equal to
    min_rating + k; the bounds default to the smallest / largest rating seen.
    """
    lowest = min(ratings) if min_rating is None else min_rating
    highest = max(ratings) if max_rating is None else max_rating
    counts = [0] * int(highest - lowest + 1)
    for rating in ratings:
        counts[rating - lowest] += 1
    return counts

def quadratic_weighted_kappa(y, y_pred):
    """
    Calculates the quadratic weighted kappa, a measure of inter-rater agreement
    between two raters that provide discrete numeric ratings.

    Potential values range from -1 (representing complete disagreement) to 1
    (representing complete agreement). A kappa value of 0 is expected if all
    agreement is due to chance.

    y, y_pred: equal-length sequences of integer ratings (values are cast to
    int). The rating range is taken as the min / max over both sequences, so
    they are assumed to cover the complete range of possible ratings.

    When both sequences contain one and the same single rating value, the two
    raters agree completely and 1.0 is returned (the general formula would
    divide by zero in that case).
    """
    rater_a = np.array(y, dtype=int)
    rater_b = np.array(y_pred, dtype=int)
    assert(len(rater_a) == len(rater_b))
    min_rating = min(min(rater_a), min(rater_b))
    max_rating = max(max(rater_a), max(rater_b))
    conf_mat = confusion_matrix(rater_a, rater_b,
                                min_rating, max_rating)
    num_ratings = len(conf_mat)
    if num_ratings == 1:
        # Only one distinct rating overall: identical, constant raters.
        return 1.0
    num_scored_items = float(len(rater_a))

    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)

    numerator = 0.0
    denominator = 0.0

    for i in range(num_ratings):
        for j in range(num_ratings):
            # Count expected for cell (i, j) if the raters were independent.
            expected_count = (hist_rater_a[i] * hist_rater_b[j]
                              / num_scored_items)
            # Quadratic disagreement weight, normalised to [0, 1].
            d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0)
            numerator += d * conf_mat[i][j] / num_scored_items
            denominator += d * expected_count / num_scored_items

    return (1.0 - numerator / denominator)

## Model 1

In [None]:
# Input width of Model 1's numeric branch: the raw numeric columns plus one
# target-encoded column per categorical column (the training cell concatenates
# both per fold). PetFinderModel reads this global when it is constructed.
n_num_feats = len(num_cols) + len(cat_cols)

class PetFinderModel(nn.Module):
    """Model 1: seven per-input sub-models whose outputs are concatenated and
    classified by two Dense blocks, a linear layer and a softmax.

    The *_size arguments are the output widths of the corresponding branches
    (cat, num, text, name, image features, image predictions, cuteness);
    `in_plane` is the width of the first fusion block and `n_class` the number
    of output classes. `dropout` is forwarded to the categorical and text
    branches; the remaining branches are built with dropout=0.
    The numeric branch width comes from the module-level `n_num_feats`.
    """

    def __init__(self, config, cat_size, num_size, text_size, name_size, img_feat_size, img_pred_size, cute_size, in_plane, n_class, dropout=0.25):
        super().__init__()

        # Sub-models are created in a fixed order (cat, num, text, name, img,
        # pred, cute) so parameter registration and initialisation stay stable.
        nfm_options = dict(
            embedding_size=8,
            is_shallow_dropout=False,
            deep_layers=[128, 128],
            use_cuda=True,
            deep_layers_activation='relu',
            dropout_deep=[0, dropout, dropout],
            is_batch_norm=True,
            use_fm=False,
            use_ffm=True,
            interation_type=False,
            out_size=cat_size,
        )
        self.cat_model = NFM(config['field_size'], config['feature_sizes'], **nfm_options)
        self.num_model = Dense(n_num_feats, num_size, dropout=0)
        self.text_model = LSTM_TextCNN(256, text_size, embedding_matrix, embed_size, dropout)
        self.name_model = LSTM_TextCNN(128, name_size, name_embed_mat, embed_size_name, dropout=0)
        # Image_Model(hidden_size, out_size, kernel_sizes, dropout, width)
        self.img_model = Image_Model(256, img_feat_size, [1, 2, 3], dropout=0, width=5)
        # Img_Pred(n_class, hidden_size, out_size, embed_size, width, kernel_sizes, dropout)
        self.pred_model = Img_Pred(1001, 64, img_pred_size, 8, 5, [1, 2, 3], dropout=0)
        self.cute_model = Img_Pred(3, 64, cute_size, 8, 5, [1, 2, 3], dropout=0)

        # Width of the concatenated branch outputs.
        fused_width = cat_size + num_size + text_size + name_size + img_feat_size + img_pred_size + cute_size
        self.dense1 = Dense(fused_width, in_plane, dropout=0)
        self.dense2 = Dense(in_plane, in_plane*2, dropout=0)
        self.logit = nn.Linear(in_plane*2, n_class)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, Xi, Xv, Xnum, Xtext, Xname, Ximg, Xpred, Xcute):
        """Return class probabilities of shape (batch, n_class)."""
        branch_outputs = (
            self.cat_model(Xi, Xv),
            self.num_model(Xnum),
            self.text_model(Xtext),
            self.name_model(Xname),
            self.img_model(Ximg),
            self.pred_model(Xpred),
            self.cute_model(Xcute),
        )
        fused = torch.cat(branch_outputs, 1)
        hidden = self.dense2(self.dense1(fused))
        return self.softmax(self.logit(hidden))

    def unfreeze(self):
        """Make the text branch's embedding weights trainable."""
        self.text_model.embedding.weight.requires_grad = True

In [None]:
TRAIN_EPOCHS = 8
LOG_INTERVAL = 6
# NOTE(review): not referenced in this cell; Nadam below is created without a
# weight_decay argument.
weight_decay = 0.025

# Out-of-fold class probabilities for train, fold-averaged probabilities for test.
train_preds1 = np.zeros((len(train_X), 5))
test_preds1 = np.zeros((len(test_X), 5))

for fold, (train_idx, valid_idx) in enumerate(splits):
    # All tensors are created on `device`, so the cell also runs without CUDA.
    xi_train_fold = torch.tensor(train_X[cat_cols].values[train_idx], dtype=torch.long, device=device)
    xv_train_fold = torch.ones_like(xi_train_fold, dtype=torch.float)
    xi_train_fold = xi_train_fold.unsqueeze(2)
    xtext_train_fold = torch.tensor(train_word_sequences[train_idx], dtype=torch.long, device=device)
    xname_train_fold = torch.tensor(train_name_sequences[train_idx], dtype=torch.long, device=device)
    ximg_train_fold = torch.tensor(img_seq_train[train_idx], dtype=torch.float, device=device)
    xpred_train_fold = torch.tensor(seresnext50_pred_train[train_idx], dtype=torch.long, device=device)
    xcute_train_fold = torch.tensor(cuteness_train[train_idx], dtype=torch.long, device=device)
    y_train_fold = torch.tensor(train_y.values[train_idx], dtype=torch.long, device=device)

    xi_valid_fold = torch.tensor(train_X[cat_cols].values[valid_idx], dtype=torch.long, device=device)
    xv_valid_fold = torch.ones_like(xi_valid_fold, dtype=torch.float)
    xi_valid_fold = xi_valid_fold.unsqueeze(2)
    xtext_valid_fold = torch.tensor(train_word_sequences[valid_idx], dtype=torch.long, device=device)
    xname_valid_fold = torch.tensor(train_name_sequences[valid_idx], dtype=torch.long, device=device)
    ximg_valid_fold = torch.tensor(img_seq_train[valid_idx], dtype=torch.float, device=device)
    xpred_valid_fold = torch.tensor(seresnext50_pred_train[valid_idx], dtype=torch.long, device=device)
    xcute_valid_fold = torch.tensor(cuteness_train[valid_idx], dtype=torch.long, device=device)
    y_valid_fold = torch.tensor(train_y.values[valid_idx], dtype=torch.long, device=device)

    # Target-encoded features are fitted on this fold's training rows only and
    # appended to the raw numeric columns.
    xtge_train_fold, xtge_valid_fold, xtge_test_fold = target_encoding_process(
        cat_cols, train_X.iloc[train_idx], train_X.iloc[valid_idx], test_X, train_y.iloc[train_idx])
    xnum_train_fold = torch.tensor(np.concatenate((train_X[num_cols].values[train_idx], xtge_train_fold), axis=1), dtype=torch.float, device=device)
    xnum_valid_fold = torch.tensor(np.concatenate((train_X[num_cols].values[valid_idx], xtge_valid_fold), axis=1), dtype=torch.float, device=device)
    xnum_test = torch.tensor(np.concatenate((test_X[num_cols].values, xtge_test_fold), axis=1), dtype=torch.float, device=device)

    # The test loader is rebuilt per fold because xnum_test depends on the fold.
    test_ds = torch.utils.data.TensorDataset(xi_test, xv_test, xnum_test, xtext_test, xname_test, ximg_test, xpred_test, xcute_test)
    test_loader = torch.utils.data.DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, pin_memory=False)

    # cat_size, num_size, text_size, name_size, img_feat_size, img_pred_size, cute_size, in_plane, n_class
    model = PetFinderModel(config, 128, 128, 128, 128, 128, 128, 128, 64, 5, dropout=0.25).to(device)

    criterion = kappa_loss
    optimizer = Nadam(model.parameters(), lr=0.01)
    scheduler = CosineLRWithRestarts(optimizer, BATCH_SIZE, len(xi_train_fold), restart_period=TRAIN_EPOCHS, t_mult=1, verbose=True)

    # input order: Xi, Xv, Xnum, Xtext, Xname, Ximg, Xpred, Xcute (+ y for train/valid)
    train_ds = torch.utils.data.TensorDataset(
        xi_train_fold, xv_train_fold, xnum_train_fold, xtext_train_fold,
        xname_train_fold, ximg_train_fold, xpred_train_fold, xcute_train_fold, y_train_fold)
    valid_ds = torch.utils.data.TensorDataset(
        xi_valid_fold, xv_valid_fold, xnum_valid_fold, xtext_valid_fold,
        xname_valid_fold, ximg_valid_fold, xpred_valid_fold, xcute_valid_fold, y_valid_fold)

    train_loader = torch.utils.data.DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, pin_memory=False)
    valid_loader = torch.utils.data.DataLoader(valid_ds, batch_size=BATCH_SIZE, shuffle=False, pin_memory=False)

    print(f'Fold {fold + 1}')

    for epoch in range(TRAIN_EPOCHS):
        start_time = time.time()
        start_time2 = time.time()

        # Epoch-level scheduler step first, then one batch_step() per batch.
        scheduler.step()
        model.train()
        total_loss = 0.
        avg_loss = 0.

        # From the second epoch on, the text embedding weights are trained too.
        if epoch == 1:
            model.unfreeze()

        for batch_idx, (xi_batch, xv_batch, xnum_batch, xtext_batch, xname_batch, ximg_batch, xpred_batch, xcute_batch, y_batch) in enumerate(train_loader):
            y_pred = model(xi_batch, xv_batch, xnum_batch, xtext_batch, xname_batch, ximg_batch, xpred_batch, xcute_batch)
            loss = criterion(y_pred, one_hot(y_batch, 5))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.batch_step()

            avg_loss += loss.item() / len(train_loader)
            total_loss += loss.item()

            if batch_idx % LOG_INTERVAL == 0 and batch_idx > 0:
                cur_loss = total_loss / LOG_INTERVAL
                elapsed_time2 = time.time() - start_time2
                print('| epoch {:3d} | {:5d}/{:5d} batches | ms/batch {:5.2f} | '
                    'loss {:5.4f}'.format(
                        epoch+1, batch_idx, len(train_loader),
                        elapsed_time2 * 1000 / LOG_INTERVAL, cur_loss))
                total_loss = 0
                start_time2 = time.time()

        model.eval()
        valid_preds_fold = np.zeros((xi_valid_fold.size(0), 5))
        avg_val_loss = 0.

        # Inference only: no autograd graph is needed. The loop index has its
        # own name so it cannot clobber the fold counter.
        with torch.no_grad():
            for batch_idx, (xi_batch, xv_batch, xnum_batch, xtext_batch, xname_batch, ximg_batch, xpred_batch, xcute_batch, y_batch) in enumerate(valid_loader):
                y_pred = model(xi_batch, xv_batch, xnum_batch, xtext_batch, xname_batch, ximg_batch, xpred_batch, xcute_batch)
                avg_val_loss += criterion(y_pred, one_hot(y_batch, 5)).item() / len(valid_loader)
                valid_preds_fold[batch_idx * BATCH_SIZE:(batch_idx+1) * BATCH_SIZE] = y_pred.cpu().numpy()

        elapsed_time = time.time() - start_time
        print('Epoch {}/{} | train_loss={:.4f} | val_loss={:.4f} | time={:.2f}s'.format(
            epoch + 1, TRAIN_EPOCHS, avg_loss, avg_val_loss, elapsed_time))

    # Predict the test set with the model from the last epoch of this fold.
    model.eval()
    test_preds_fold = np.zeros((len(test_X), 5))
    with torch.no_grad():
        for batch_idx, (xi_batch, xv_batch, xnum_batch, xtext_batch, xname_batch, ximg_batch, xpred_batch, xcute_batch) in enumerate(test_loader):
            y_pred = model(xi_batch, xv_batch, xnum_batch, xtext_batch, xname_batch, ximg_batch, xpred_batch, xcute_batch)
            test_preds_fold[batch_idx * BATCH_SIZE:(batch_idx+1) * BATCH_SIZE] = y_pred.cpu().numpy()

    # Validation predictions kept here are those of the last epoch.
    train_preds1[valid_idx] = valid_preds_fold
    test_preds1 += test_preds_fold / n_fold

> ### Model 1 Evaluation

In [None]:
# Out-of-fold QWK for Model 1: the arg-max class of each row of train_preds1
# is compared against the true labels.
print(
    "CV: ",
    quadratic_weighted_kappa(train_y.values, np.argmax(train_preds1, axis=1)),
)

## Model 2

In [None]:
# Input width of Model 2's numeric branch: raw numeric columns only
# (presumably Model 2 is trained without the target-encoded columns; confirm
# against its training cell).
# NOTE: this rebinds the same global that PetFinderModel (Model 1) reads at
# construction time, so Model 1 must not be re-instantiated after this cell
# without restoring the Model 1 value.
n_num_feats = len(num_cols)

class PetFinderModel2(nn.Module):
    def __init__(self, config, cat_size, num_size, text_size, name_size, img_feat_size, img_pred_size, cute_size, in_plane, n_class, dropout=0.25):
        super(PetFinderModel2, self).__init__()
        
        self.cat_model = NFM(config['field_size'], 
                             config['feature_sizes'], 
                             embedding_size=8,
                             is_shallow_dropout=False,
                             deep_layers=[128, 128],
                             use_cuda=True,
                             deep_layers_activation = 'relu',
                             dropout_deep = [0, dropout, dropout],
                             is_batch_norm = True,
                             use_fm=False, 
                             use_ffm=True, 
                             interation_type=True,
                             out_size=cat_size)
        self.num_model = Dense(n_num_feats, num_size, dropout=0)
        self.text_model = LSTM_TextCNN(256, text_size, embedding_matrix, embed_size, dropout)
        self.name_model = LSTM_TextCNN(128, name_size, name_embed_mat, embed_size_name, dropout=0)
        self.img_model = Image_Model(256, img_feat_size, [1, 2, 3], dropout=0, width=5) # hidden_size, out_size, kernel_sizes, dropout, width=5
        self.pred_model = Img_Pred(1001, 64, img_pred_size, 8, 5, [1, 2, 3], dropout=0) # n_class, hidden_size, out_size, embed_size, width, kernel_sizes, dropout
        self.cute_model = Img_Pred(3, 64, cute_size, 8, 5, [1, 2, 3], dropout=0) # n_class, hidden_size, out_size, embed_size, width, kernel_sizes, dropout
        
        self.dense1 = Dense(cat_size + num_size + text_size + name_size + img_feat_size + img_pred_size + cute_size, in_plane, dropout=0)
        self.dense2 = Dense(in_plane, in_plane*2, dropout=0)
        self.logit = nn.Linear(in_plane*2, n_class)
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, Xi, Xv, Xnum, Xtext, Xname, Ximg, Xpred, Xcute):
        cat_out = self.cat_model(Xi, Xv)
        num_out = self.num_model(Xnum)
        text_out = self.text_model(Xtext)
        name_out = self.name_model(Xname)
        img_out = self.img_model(Ximg)
        pred_out = self.pred_model(Xpred)
        cute_out = self.cute_model(Xcute)
        out = torch.cat((cat_out, num_out, text_out, name_out, img_out, pred_out, cute_out), 1)
        out = self.dense1(out)
        out = self.dense2(out)
        out = self.logit(out)
        return self.softmax(out)
    
    def unfreeze(self):
        self.text_model.embedding.weight.requires_grad = True

In [None]:
TRAIN_EPOCHS = 8
LOG_INTERVAL = 6
weight_decay = 0.025  # not used by the loop below (manual weight decay is disabled)

# Out-of-fold predictions for the training rows and fold-averaged test predictions.
train_preds2 = np.zeros((len(train_X), 5))
test_preds2 = np.zeros((len(test_X), 5))


def _fold_tensors(row_idx):
    """Build the CUDA tensors for the training rows selected by ``row_idx``.

    Returns a tuple ordered as (Xi, Xv, Xnum, Xtext, Xname, Ximg, Xpred, Xcute, y):
    the eight model inputs in ``forward`` order, followed by the target.
    """
    xi = torch.tensor(train_X[cat_cols].values[row_idx], dtype=torch.long).cuda()
    # Xv is all ones with Xi's pre-unsqueeze shape; Xi then gets a trailing axis.
    xv = torch.ones_like(xi, dtype=torch.float)
    return (
        xi.unsqueeze(2),
        xv,
        torch.tensor(train_X[num_cols].values[row_idx], dtype=torch.float).cuda(),
        torch.tensor(train_word_sequences[row_idx], dtype=torch.long).cuda(),
        torch.tensor(train_name_sequences[row_idx], dtype=torch.long).cuda(),
        torch.tensor(img_seq_train[row_idx], dtype=torch.float).cuda(),
        torch.tensor(seresnext50_pred_train[row_idx], dtype=torch.long).cuda(),
        torch.tensor(cuteness_train[row_idx], dtype=torch.long).cuda(),
        torch.tensor(train_y.values[row_idx], dtype=torch.long).cuda(),
    )


# Per-fold target encoding is disabled, so the test inputs do not depend on the
# fold and the test loader is built once.
xnum_test = torch.tensor(test_X[num_cols].values, dtype=torch.float).cuda()
test_ds = torch.utils.data.TensorDataset(xi_test, xv_test, xnum_test, xtext_test, xname_test, ximg_test, xpred_test, xcute_test)
test_loader = torch.utils.data.DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, pin_memory=False)

for i, (train_idx, valid_idx) in enumerate(splits):
    train_tensors = _fold_tensors(train_idx)
    valid_tensors = _fold_tensors(valid_idx)

    # Model 2 is the PetFinderModel2 class defined in the preceding cell.
    # args: cat_size, num_size, text_size, name_size, img_feat_size, img_pred_size, cute_size, in_plane, n_class
    model = PetFinderModel2(config, 128, 128, 128, 128, 128, 128, 128, 64, 5, dropout=0.25).cuda()

    criterion = kappa_loss
    optimizer = Nadam(model.parameters(), lr=0.01)
    scheduler = CosineLRWithRestarts(optimizer, BATCH_SIZE, train_tensors[0].size(0), restart_period=TRAIN_EPOCHS, t_mult=1, verbose=True)

    train_ds = torch.utils.data.TensorDataset(*train_tensors)
    valid_ds = torch.utils.data.TensorDataset(*valid_tensors)

    train_loader = torch.utils.data.DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, pin_memory=False)
    # shuffle=False so batch k maps to rows [k * BATCH_SIZE, (k + 1) * BATCH_SIZE).
    valid_loader = torch.utils.data.DataLoader(valid_ds, batch_size=BATCH_SIZE, shuffle=False, pin_memory=False)

    print(f'Fold {i + 1}')

    for epoch in range(TRAIN_EPOCHS):
        epoch_start = time.time()
        window_start = time.time()

        scheduler.step()
        model.train()
        total_loss = 0.
        avg_loss = 0.
        batches_in_window = 0  # batches accumulated since the last progress line

        # The text embedding is made trainable from the second epoch onwards.
        if epoch == 1:
            model.unfreeze()

        for batch_idx, batch in enumerate(train_loader):
            *inputs, y_batch = batch
            y_pred = model(*inputs)
            loss = criterion(y_pred, one_hot(y_batch, 5))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.batch_step()

            avg_loss += loss.item() / len(train_loader)
            total_loss += loss.item()
            batches_in_window += 1

            if batch_idx % LOG_INTERVAL == 0 and batch_idx > 0:
                # Divide by the real batch count: the first window of an epoch
                # holds LOG_INTERVAL + 1 batches (indices 0..LOG_INTERVAL).
                cur_loss = total_loss / batches_in_window
                elapsed_time2 = time.time() - window_start
                print('| epoch {:3d} | {:5d}/{:5d} batches | ms/batch {:5.2f} | '
                    'loss {:5.4f}'.format(
                        epoch+1, batch_idx, len(train_loader),
                        elapsed_time2 * 1000 / batches_in_window, cur_loss))
                total_loss = 0
                batches_in_window = 0
                window_start = time.time()

        model.eval()
        valid_preds_fold = np.zeros((valid_tensors[0].size(0), 5))
        avg_val_loss = 0.

        # no_grad avoids building the autograd graph during evaluation.
        with torch.no_grad():
            # A separate index name keeps the fold counter `i` intact.
            for batch_no, batch in enumerate(valid_loader):
                *inputs, y_batch = batch
                y_pred = model(*inputs)
                avg_val_loss += criterion(y_pred, one_hot(y_batch, 5)).item() / len(valid_loader)
                valid_preds_fold[batch_no * BATCH_SIZE:(batch_no + 1) * BATCH_SIZE] = y_pred.cpu().numpy()

        elapsed_time = time.time() - epoch_start
        print('Epoch {}/{} | train_loss={:.4f} | val_loss={:.4f} | time={:.2f}s'.format(
            epoch + 1, TRAIN_EPOCHS, avg_loss, avg_val_loss, elapsed_time))

    # Ensure inference mode for the test pass regardless of how the epoch loop ended.
    model.eval()
    test_preds_fold = np.zeros((len(test_X), 5))
    with torch.no_grad():
        for batch_no, inputs in enumerate(test_loader):
            y_pred = model(*inputs)
            test_preds_fold[batch_no * BATCH_SIZE:(batch_no + 1) * BATCH_SIZE] = y_pred.cpu().numpy()

    # Predictions from the final epoch fill this fold's out-of-fold rows; test
    # predictions are averaged over the folds.
    train_preds2[valid_idx] = valid_preds_fold
    test_preds2 += test_preds_fold / n_fold

### Model 2 Evaluation

In [None]:
# Out-of-fold score for Model 2: each row's predicted class is the column with
# the highest score in train_preds2.
print(
    "CV: ",
    quadratic_weighted_kappa(train_y.values, train_preds2.argmax(axis=1)),
)

# Make Submission

In [None]:
# OOF CV of the blend: average the two models' out-of-fold predictions, then
# take the highest-scoring class per row.
print(
    "CV: ",
    quadratic_weighted_kappa(
        train_y.values,
        ((train_preds1 + train_preds2) / 2).argmax(axis=1),
    ),
)

In [None]:
# Generate submission: average the two models' test predictions, pick the
# highest-scoring class per row and write it out as int32 labels.
sample_submission["AdoptionSpeed"] = (
    ((test_preds1 + test_preds2) / 2).argmax(axis=1).astype(np.int32)
)
sample_submission.to_csv("submission.csv", index=False)