# Kaggle [PetFinder.my Adoption Prediction](https://www.kaggle.com/c/petfinder-adoption-prediction) Competition Solution


![](https://storage.googleapis.com/kaggle-media/competitions/Petfinder/PetFinder%20-%20Logo.png)
![](https://s3.amazonaws.com/cdn-origin-etr.akc.org/wp-content/uploads/2017/11/12232719/Golden-Retriever-On-White-05.jpg)

This is our solution for PetFinder.my Kaggle competition which [me](https://www.kaggle.com/aruchomu) and my teammate [Dmitry Voynov](https://www.kaggle.com/vainof) submitted. <br>
The solution scored 0.40767 of [Quadratic Weighted Kappa](https://stats.stackexchange.com/questions/59798/quadratic-weighted-kappa-versus-linear-weighted-kappa?rq=1) (QWK) and reached top 33% on the private leaderboard. <br>


 <a id="top"></a> <br>
## Contents
1. [Preparations](#1)
2. [Feature Extraction from Sentiment and Image Metadata](#2)
3. [Text and Image Features](#3)
4. [Modeling](#4)
5. [Submission](#5)

<a id="1"></a> 
## 1. Preparations

In [21]:
# Dependencies

import numpy as np
import pandas as pd

import os
import json
from joblib import Parallel, delayed
from PIL import Image
from tqdm import tqdm
import random

import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.applications.densenet import preprocess_input, DenseNet121
from tensorflow.keras.layers import GlobalAveragePooling2D, Input, Lambda, AveragePooling1D, \
    MaxPooling1D, Dense, BatchNormalization, Dropout, Embedding, Reshape, Concatenate
from tensorflow.keras import losses
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import regularizers
import tensorflow.keras.backend as K

from sklearn.model_selection import GroupKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.metrics import cohen_kappa_score, mean_squared_error
from sklearn.preprocessing import StandardScaler

import lightgbm as lgb

from gensim.models import KeyedVectors

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Random seed function (thanks to Benjamin Minixhofer)

seed = 73

def seed_everything(seed=seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    tf.set_random_seed(seed)
    np.random.seed(seed)

In [3]:
# Load dataframes
train_df = pd.read_csv('../input/petfinder-adoption-prediction/train/train.csv')
test_df = pd.read_csv('../input/petfinder-adoption-prediction/test/test.csv')

breeds_df = pd.read_csv('../input/petfinder-adoption-prediction/breed_labels.csv')
colors_df = pd.read_csv('../input/petfinder-adoption-prediction/color_labels.csv')
states_df = pd.read_csv('../input/petfinder-adoption-prediction/state_labels.csv')

In [4]:
# Correct possible data errors

# Replace Breed1 with Breed2
train_df['Breed1'].replace(0, train_df['Breed2'], inplace=True)

# Replace Breed1 with 0
ids = ['1bc0f89d8', '15a206d0d', 'f8654865f', '36b20cfb5',
       '699a81c51', '85ec1aac0','6a72cfda7'] 
train_df.loc[train_df['PetID'].isin(ids), 'Breed1'] = 0

# Replace Breed2 with 0
ids = ['f8654865f', '699a81c51', '6a72cfda7']
train_df.loc[train_df['PetID'].isin(ids), 'Breed2'] = 0

# Change Type to 1
train_df.loc[train_df['PetID'] == '6c399cb06', 'Type'] = 1

<a id="2"></a> 
## 2. Feature Extraction from Sentiment and Image Metadata

In [5]:
# Extraction functions

def get_metadata_features(pet_id, dataset):
    """
    Collects the following features from the image metadata for profile images.
    
    1. Image resolution.
    2. Top 3 dominant colors by score.
    """
    json_path = '../input/petfinder-adoption-prediction/{}_metadata/{}-1.json'.format(dataset, pet_id)
    image_path = '../input/petfinder-adoption-prediction/{}_images/{}-1.jpg'.format(dataset, pet_id)
    
    if not os.path.exists(json_path):
        # Test sample with no profile picture
        if os.path.exists('../input/petfinder-adoption-prediction/{}_metadata/{}-2.json'.format(dataset, pet_id)):
            json_path = '../input/petfinder-adoption-prediction/{}_metadata/{}-2.json'.format(dataset, pet_id)
            image_path = '../input/petfinder-adoption-prediction/{}_images/{}-2.jpg'.format(dataset, pet_id)
        else:
            return
    
    row = {}
    
    row['PetID'] = pet_id
    
    with open(json_path) as fp:
        row_json = json.load(fp)
    
    try:
        image = Image.open(image_path)
        row['img_width'], row['img_height'] = image.size
    except:
        row['img_width'], row['img_height'] = np.nan, np.nan
    
    try:
        colors = row_json['imagePropertiesAnnotation']['dominantColors']['colors']
        reds, greens, blues, scores = [], [], [], []
        for color in colors:
            reds.append(color['color'].get('red', 0))
            greens.append(color['color'].get('green', 0))
            blues.append(color['color'].get('blue', 0))
            scores.append(color.get('score', 0))
        colors_df = pd.DataFrame({'red': reds, 'green': greens, 'blue': blues, 'score': scores})
        row.update(dict(zip(['img_color_1_red', 'img_color_1_green', 'img_color_1_blue',
                             'img_color_2_red', 'img_color_2_green', 'img_color_2_blue',
                             'img_color_3_red', 'img_color_3_green', 'img_color_3_blue'],
                            colors_df.sort_values('score', ascending=False).iloc[:3, :-1].values.ravel())))
    except:
        row.update(dict(zip(['img_color_1_red', 'img_color_1_green', 'img_color_1_blue',
                             'img_color_2_red', 'img_color_2_green', 'img_color_2_blue',
                             'img_color_3_red', 'img_color_3_green', 'img_color_3_blue'], [np.nan] * 9)))
    
    return row


def get_sentiment_features(filename, dataset):
    """
    Collects the following features from the sentiment data.
    
    1. Sentences scores mean and variance weighted by magnitude.
    2. Document sentiment magnitude and score.
    """
    path = '../input/petfinder-adoption-prediction/' + dataset + '_sentiment'
    with open(os.path.join(path, filename)) as fp:
        row_json = json.load(fp)
    row = {}

    row['PetID'] = filename.replace('.json', '')
    
    try:
        magnitudes, scores = [], []
        for sentence in row_json['sentences']:
            magnitudes.append(sentence['sentiment']['magnitude'])
            scores.append(sentence['sentiment']['score'])
        sentences_df = pd.DataFrame({'magnitude': magnitudes, 'score': scores})
        sentences_df['score'] = sentences_df['magnitude'] * sentences_df['score']
        epsilon = np.finfo(np.float32).eps
        sentences_df['score'] = sentences_df['magnitude'] / (sentences_df['magnitude'].sum() + epsilon)
        row['sentence_score_mean'] = sentences_df['score'].mean()
        row['sentence_score_var'] = sentences_df['score'].var()
    except:
        row['sentence_score_mean'] = np.nan
        row['sentence_score_var'] = np.nan 

    try:
        row['document_magnitude'] = row_json['documentSentiment']['magnitude']
        row['document_score'] = row_json['documentSentiment']['score']
    except:
        row['document_magnitude'] = np.nan
        row['document_score'] = np.nan
    
    return row

# Use parallel processing
train_metadata_rows = Parallel(n_jobs=-1, verbose=2)(
    delayed(get_metadata_features)(pet_id, 'train') for pet_id in train_df['PetID'])
train_metadata_df = pd.DataFrame([row for row in train_metadata_rows if row is not None])
test_metadata_rows = Parallel(n_jobs=-1, verbose=2)(
    delayed(get_metadata_features)(pet_id, 'test') for pet_id in test_df['PetID'])
test_metadata_df = pd.DataFrame([row for row in test_metadata_rows if row is not None])

train_sentiment_df = pd.DataFrame(Parallel(n_jobs=-1, verbose=2)(
    delayed(get_sentiment_features)(filename,'train') for filename in os.listdir(
        '../input/petfinder-adoption-prediction/train_sentiment')))
test_sentiment_df = pd.DataFrame(Parallel(n_jobs=-1, verbose=2)(
    delayed(get_sentiment_features)(filename, 'test') for filename in os.listdir(
        '../input/petfinder-adoption-prediction/test_sentiment')))

# Merge everything
train_merged = pd.merge(train_df, train_metadata_df, how='left', on='PetID')
train_merged = pd.merge(train_merged, train_sentiment_df, how='left', on='PetID')
test_merged = pd.merge(test_df, test_metadata_df, how='left', on='PetID')
test_merged = pd.merge(test_merged, test_sentiment_df, how='left', on='PetID')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 149 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 2199 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done 6259 tasks      | elapsed:   29.6s
[Parallel(n_jobs=-1)]: Done 11919 tasks      | elapsed:   52.0s
[Parallel(n_jobs=-1)]: Done 14993 out of 14993 | elapsed:  1.1min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 1054 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 3972 out of 3972 | elapsed:   16.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 2132 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 10844 tasks      | elapsed:   25.0s
[Parallel(n_jobs=-1)]: Done 14442 out of 14442 | elapsed:   32.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 239

In [6]:
# New features

# Add name length
train_merged['name_len'] = train_merged['Name'].map(len, na_action='ignore')
test_merged['name_len'] = test_merged['Name'].map(len, na_action='ignore')

# Add description length
train_merged['desc_len'] = train_merged['Description'].map(len, na_action='ignore')
test_merged['desc_len'] = test_merged['Description'].map(len, na_action='ignore')

# Add RescuerID count
train_merged['rescuer_count'] = train_merged['RescuerID'].replace(train_merged.groupby('RescuerID').size())
test_merged['rescuer_count'] = test_merged['RescuerID'].replace(test_merged.groupby('RescuerID').size())

<a id="3"></a> 
## 3. Text and Image Features

### 3.1. Text Features

In [7]:
# We simply average pretrained FastText vectors for description

model = KeyedVectors.load_word2vec_format('../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec')

pet_ids = train_df[~train_df['Description'].isna()]['PetID']
vects = []
found_pet_ids = []
for pet_id in pet_ids:
    desc = train_df[train_df['PetID'] == pet_id]['Description'].values[0].split(' ')
    word_vectors = []
    for word in desc:
        try:
            word_vectors.append(model.get_vector(word))
        except KeyError:
            pass
    if word_vectors:
        mean_vect = np.mean(word_vectors, axis=0)
        vects.append(mean_vect)
        found_pet_ids.append(pet_id)
fasttext_train_df = pd.DataFrame(np.array(vects)).add_prefix('fasttext_')
fasttext_train_df['PetID'] = found_pet_ids
train_merged = pd.merge(train_merged, fasttext_train_df, how='left', on='PetID')

pet_ids = test_df[~test_df['Description'].isna()]['PetID']
vects = []
found_pet_ids = []
for pet_id in pet_ids:
    desc = test_df[test_df['PetID'] == pet_id]['Description'].values[0].split(' ')
    word_vectors = []
    for word in desc:
        try:
            word_vectors.append(model.get_vector(word))
        except KeyError:
            pass
    if word_vectors:
        mean_vect = np.mean(word_vectors, axis=0)
        vects.append(mean_vect)
        found_pet_ids.append(pet_id)
fasttext_test_df = pd.DataFrame(np.array(vects)).add_prefix('fasttext_')
fasttext_test_df['PetID'] = found_pet_ids
test_merged = pd.merge(test_merged, fasttext_test_df, how='left', on='PetID')

In [8]:
text_columns = ['Description']

# Fill nans with empty text
train_merged[text_columns] = train_merged[text_columns].fillna('')
test_merged[text_columns] = test_merged[text_columns].fillna('')

# Text feature extractor class
# We use TF-IDF vectorizer and then extract SVD and NMF vectors with 13 components each

class TextFeatureExtractor():
    """Extracts text features from text columns."""
    def __init__(self, n_components):
        self.tfidf = TfidfVectorizer(min_df=2, max_features=None,
                          strip_accents='unicode', analyzer='word', token_pattern='\w+',
                          ngram_range=(1, 3), use_idf=1, smooth_idf=1, sublinear_tf=1)
        self.svd = TruncatedSVD(n_components=n_components, random_state=seed)
        self.nmf = NMF(n_components=n_components, random_state=seed)
        
    def fit_transform(self, X_text):
        text_features = []
        for col in X_text.columns:
            tfidf_col = self.tfidf.fit_transform(X_text[col])
            
            svd_col = self.svd.fit_transform(tfidf_col)
            svd_col = pd.DataFrame(svd_col)
            svd_col = svd_col.add_prefix('SVD_{}_'.format(col))
            text_features.append(svd_col)
            
            nmf_col = self.nmf.fit_transform(tfidf_col)
            nmf_col = pd.DataFrame(nmf_col)
            nmf_col = nmf_col.add_prefix('NMF_{}_'.format(col))
            text_features.append(nmf_col)
            
        text_features = pd.concat(text_features, axis=1)
        
        return text_features
    
    def transform(self, X_text):
        text_features = []
        for col in X_text.columns:
            tfidf_col = self.tfidf.transform(X_text[col])
            
            svd_col = self.svd.transform(tfidf_col)
            svd_col = pd.DataFrame(svd_col)
            svd_col = svd_col.add_prefix('SVD_{}_'.format(col))
            text_features.append(svd_col)
            
            nmf_col = self.nmf.transform(tfidf_col)
            nmf_col = pd.DataFrame(nmf_col)
            nmf_col = nmf_col.add_prefix('NMF_{}_'.format(col))
            text_features.append(nmf_col)
            
        text_features = pd.concat(text_features, axis=1)
        
        return text_features

    
text_feature_extractor = TextFeatureExtractor(n_components=13)

 ### 3.2 Image Features

In [9]:
# We extract image features using DenseNet121 and apply Average Pooling
# with window_size=4 for profile images and window_size=8 for second images.

weights_path = '../input/densenet121weights/densenet121_weights_tf_dim_ordering_tf_kernels_notop.h5'

class ImageFeatureExtractor():
    def __init__(self,
                 shape=[256, 256, 3],
                 average_pooling_window=4):
        self.shape = shape
        self.size = self.shape[:2]
        input_tensor = Input(shape)
        densenet = DenseNet121(input_tensor=input_tensor,
                               weights=weights_path,
                               include_top=False)
        out = densenet.output
        out = GlobalAveragePooling2D()(out)
        out = Lambda(lambda x: K.expand_dims(x, axis=-1))(out)
        if average_pooling_window:
            out = AveragePooling1D(average_pooling_window)(out)
        out = Lambda(lambda x: x[:,:,0])(out)
        
        self.model = Model(input_tensor, out)
        self.feats_shape = list(map(int, self.model.output.shape[1:]))

    def resize_to_square(self, img):
        return img.resize(self.size) 

    def resize_saving_ratio(self, img):
        # works if self.size represents a square
        # resize initial image
        max_dim = max(img.width, img.height)
        k = self.size[0] / max_dim
        width = int(img.width * k)
        height = int(img.height * k)
        img = img.resize([width, height])
        # concat with black rectangle
        res_img = Image.new('RGB', self.size)
        res_img.paste(img, (0, 0))
        return res_img

    def load_image_by_path(self, filepath, resize_method='square'):
        img = Image.open(filepath)
        if resize_method == 'square':
            img = self.resize_to_square(img)
        else:
            img = self.resize_saving_ratio(img)
        img = np.array(img).astype(np.float32)
        img = preprocess_input(img)
        if len(img.shape) == 2:
            img = np.repeat(np.expand_dims(img, axis=2), repeats=3, axis=2)
        return img

    def extract(self, filepath, resize_method='square'):
        img = self.load_image_by_path(filepath, resize_method='square')
        return self.model.predict(np.expand_dims(img, axis=0))

    def extract_all(self, filepaths, batch_size=16, resize_method='square'):
        res_feats = np.empty(shape=[0]+self.feats_shape, dtype=np.float32)
        num_batches = int(np.ceil(len(filepaths) / batch_size))
        for it in tqdm(range(num_batches)):
            batch_filepaths = filepaths[it * batch_size: (it + 1) * batch_size]
            batch = []
            for fp in batch_filepaths:
                img = self.load_image_by_path(fp, resize_method=resize_method)
                batch.append(img)
            batch = np.array(batch)
            feats = self.model.predict(batch)
            res_feats = np.append(res_feats, feats, axis=0)
        return res_feats


def get_image_filepaths(pet_ids, dataset, img_number):
    filepaths = []
    found_pet_ids = []
    for pet_id in pet_ids:
        path = '../input/petfinder-adoption-prediction/{}_images/{}-{}.jpg'.format(dataset,
                                                                                   pet_id,
                                                                                   img_number)
        if os.path.exists(path):
            filepaths.append(path)
            found_pet_ids.append(pet_id)
        elif os.path.exists('../input/petfinder-adoption-prediction/{}_images/{}-2.jpg'.format(dataset, pet_id)):
            path = '../input/petfinder-adoption-prediction/{}_images/{}-2.jpg'.format(dataset,
                                                                                      pet_id)
            filepaths.append(path)
            found_pet_ids.append(pet_id)
    return filepaths, found_pet_ids

# Profile images
image_feature_extractor = ImageFeatureExtractor()

train_img_filepaths, train_found_pet_ids = get_image_filepaths(train_df['PetID'], 'train', 1)
test_img_filepaths, test_found_pet_ids = get_image_filepaths(test_df['PetID'], 'test', 1)

train_img_feats = image_feature_extractor.extract_all(train_img_filepaths,
                                                      resize_method='square')
train_img_feats_df = pd.DataFrame(train_img_feats).add_prefix('img_feat_')
train_img_feats_df['PetID'] = train_found_pet_ids
test_img_feats = image_feature_extractor.extract_all(test_img_filepaths,
                                                     resize_method='square')
test_img_feats_df = pd.DataFrame(test_img_feats).add_prefix('img_feat_')
test_img_feats_df['PetID'] = test_found_pet_ids
    
train_merged = pd.merge(train_merged, train_img_feats_df, how='left', on='PetID')
test_merged = pd.merge(test_merged, test_img_feats_df, how='left', on='PetID')

Instructions for updating:
Colocations handled automatically by placer.


100%|██████████| 916/916 [02:31<00:00,  4.77it/s]
100%|██████████| 242/242 [00:34<00:00,  6.05it/s]


In [10]:
# Second images
image_feature_extractor = ImageFeatureExtractor(average_pooling_window=8)

train_img_filepaths, train_found_pet_ids = get_image_filepaths(train_df['PetID'], 'train', 2)
test_img_filepaths, test_found_pet_ids = get_image_filepaths(test_df['PetID'], 'test', 2)

train_img_feats = image_feature_extractor.extract_all(train_img_filepaths,
                                                      resize_method='square')
train_img_feats_df = pd.DataFrame(train_img_feats).add_prefix('img2_feat_')
train_img_feats_df['PetID'] = train_found_pet_ids
test_img_feats = image_feature_extractor.extract_all(test_img_filepaths,
                                                     resize_method='square')
test_img_feats_df = pd.DataFrame(test_img_feats).add_prefix('img2_feat_')
test_img_feats_df['PetID'] = test_found_pet_ids
    
train_merged = pd.merge(train_merged, train_img_feats_df, how='left', on='PetID')
test_merged = pd.merge(test_merged, test_img_feats_df, how='left', on='PetID')

100%|██████████| 724/724 [02:11<00:00,  4.42it/s]
100%|██████████| 183/183 [00:31<00:00,  4.88it/s]


<a id="4"></a> 
## 4. Modeling

### 4.1. Metrics

In [11]:
# Regression objective
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Competition metric
def qwk(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

### 4.2. Thresholds Optimization

In [12]:
# We tried different rounding techniques for threshold optimization.
# For us rounding by following train distribution gave best results.

def get_thresholds_from_dist(y_true, y_pred):
    """Calculates thresholds for raw predictions
    so as to follow the true distribution.
    """
    idxs = np.cumsum(np.bincount(y_true))[:-1]
    idxs = (idxs * y_pred.size / y_true.size).astype(int)
    return np.sort(y_pred)[idxs]

def allocate_to_rate(y_pred, thresholds):
    """Allocates raw predictions to adoption rates."""
    rates = np.zeros(y_pred.size, dtype=int)
    for i in range(4):
        rates[y_pred >= thresholds[i]] = i + 1
    return rates

### 4.3. Training

#### 4.3.1 Neural Net Embeddings
We trained a neural net with embeddings for some categorical features. <br>
We then used the embeddings to train the LightGBM model. <br>
This part was mainly done by my teammate.

In [22]:
# Final datasets arrangement

X_train = train_merged.drop(columns=['PetID', 'AdoptionSpeed'])
y_train = train_merged['AdoptionSpeed']

X_test = test_merged.drop(columns=['PetID'])

# We also add the most frequent breed of each rescuer as feature.
X_train['rescuer_breed_mode'] = X_train['RescuerID'].map(X_train.groupby('RescuerID')['Breed1'].agg(
    lambda x:x.value_counts().index[0]))
X_test['rescuer_breed_mode'] = X_test['RescuerID'].map(X_test.groupby('RescuerID')['Breed1'].agg(
    lambda x:x.value_counts().index[0]))

cat_feats = ['Type', 'Breed1', 'Breed2', 'Vaccinated',
             'Dewormed', 'Sterilized', 'State', 'rescuer_breed_mode']

X_train = X_train.drop(columns=['Name', 'RescuerID', 'Description'])
X_test = X_test.drop(columns=['Name', 'RescuerID', 'Description'])

In [23]:
# Some useful dicts

onehot_feats = ['Type', 'Gender',
             'Color1', 'Color2', 'Color3', 'Vaccinated',
             'Dewormed', 'Sterilized']
onehot_sizes = dict(X_train[onehot_feats].nunique())
onehot_sizes['Color2'] = onehot_sizes['Color1']
onehot_sizes['Color3'] = onehot_sizes['Color1']
cat_feats = ['Breed1', 'Breed2', 'rescuer_breed_mode', 'State']
X_concat = pd.concat([X_train, X_test])
embedding_sizes = {
    'Breed1': 32,
    'Breed2': 32,
    'rescuer_breed_mode': 32,
    'State': 8
}
cat_feats_sizes = {
    'Breed1': X_concat['Breed1'].nunique(),
    'Breed2': X_concat['Breed2'].nunique(),
    'rescuer_breed_mode': X_concat['rescuer_breed_mode'].nunique(),
    'State': X_concat['State'].nunique()
}

cat_feats_mappings = {}
for cat_feat in cat_feats:
    mapping = {}
    vals = X_concat[cat_feat].unique()
    vals.sort()
    for i, feat in enumerate(vals):
        mapping[feat] = i
    cat_feats_mappings[cat_feat] = mapping

img_feats = ['img_feat_{}'.format(i) for i in range(256)]
text_feats = ['fasttext_{}'.format(i) for i in range(300)]
numerical_feats = [col for col in X_train.columns 
                   if not col in img_feats and
                   not col in text_feats and
                   not col in onehot_feats and
                   not col in cat_feats]

In [24]:
# Numerical feats
num_scaler = StandardScaler()
X_train[numerical_feats] = num_scaler.fit_transform(X_train[numerical_feats])
X_test[numerical_feats] = num_scaler.transform(X_test[numerical_feats])

# Image feats
img_scaler = StandardScaler()
X_train[img_feats] = img_scaler.fit_transform(X_train[img_feats])
X_test[img_feats] = img_scaler.transform(X_test[img_feats])

# Text feats
text_scaler = StandardScaler()
X_train[text_feats] = text_scaler.fit_transform(X_train[text_feats])
X_test[text_feats] = text_scaler.transform(X_test[text_feats])

X_train[X_train.isna()] = 0
X_test[X_test.isna()] = 0

In [25]:
def get_model():
    K.clear_session()
    # Numerical feats
    num_input = Input(shape=[len(numerical_feats)])

    num_out = Dense(32, activation='relu')(num_input)
    num_out = BatchNormalization()(num_out)
    num_out = Dropout(rate=0.66)(num_out)

    # Image feats
    img_input = Input(shape=[256])

    img_out = Lambda(lambda x: K.expand_dims(x, axis=-1))(img_input)
    img_out = AveragePooling1D(4)(img_out)
    img_out = Lambda(lambda x: x[:,:,0])(img_out)
    img_out = Dense(128, activation='relu')(img_out)
    img_out = BatchNormalization()(img_out)
    img_out = Dropout(rate=0.66)(img_out)

    # Text feats
    text_input = Input(shape=[300])

    text_out = Dense(128, activation='relu')(text_input)
    text_out = BatchNormalization()(text_out)
    text_out = Dropout(rate=0.66)(text_out)

    # Categorical feats
    cat_inputs = []
    cat_outs = []
    for cat_feat in cat_feats:
        cat_input = Input(shape=[1])

        cat_out = Embedding(input_dim=cat_feats_sizes[cat_feat],
                            output_dim=embedding_sizes[cat_feat],
                            input_length=1)(cat_input)
        cat_out = Reshape(target_shape=[embedding_sizes[cat_feat]])(cat_out)
        cat_out = Dense(embedding_sizes[cat_feat], activation='relu')(cat_out)
        cat_out = BatchNormalization()(cat_out)
        cat_out = Dropout(rate=0.66)(cat_out)

        cat_inputs.append(cat_input)
        cat_outs.append(cat_out)

    feat_inputs = []
    feat_outs = []
    for onehot_feat in onehot_feats:
        feat_input = Input(shape=[onehot_sizes[onehot_feat]])

        feat_out = Dense(8, activation='relu')(feat_input)
        feat_out = BatchNormalization()(feat_out)
        feat_out = Dropout(rate=0.66)(feat_out)

        feat_inputs.append(feat_input)
        feat_outs.append(feat_out)

    cat_outs += feat_outs
    cats_out = Concatenate()(cat_outs)
    cats_out = Dense(64, activation='relu')(cats_out)
    cats_out = BatchNormalization()(cats_out)
    cats_out = Dropout(rate=0.66)(cats_out)

    # Concatenate dense outputs from different features
    out = Concatenate()([num_out, img_out, text_out, cats_out])
    out = Dense(192, activation='relu')(out)
    out = BatchNormalization()(out)
    out = Dropout(rate=0.66)(out)
    out = Dense(64, activation='relu')(out)
    out = BatchNormalization()(out)
    out = Dropout(rate=0.66)(out)
    out = Dense(1)(out)

    inputs = [num_input] + [img_input] + [text_input] + cat_inputs + feat_inputs
    outputs = [out]
    model = Model(inputs=inputs,
                  outputs=outputs)

    return model

In [26]:
# Transforming data
train_num_data = X_train[numerical_feats].values
train_img_data = X_train[img_feats].values
train_text_data = X_train[text_feats].values
train_cat_data = X_train[cat_feats].values.T

train_onh_data = []
# Mapping onehot categories to vectors
for onh in onehot_feats:
    vals = X_train[onh].values
    vals = to_categorical(np.clip(vals - 1, 0, np.inf).astype(np.uint8), num_classes=onehot_sizes[onh])
    train_onh_data.append(vals)

# Mapping cat_feats using cat_feats_mappings
for i, cat_feat in enumerate(cat_feats):
    train_feats = train_cat_data[i]
    for j in range(len(train_feats)):
        train_feats[j] = cat_feats_mappings[cat_feat][train_feats[j]]

In [27]:
# Reshape to list of 15 arrays
train_data = \
[train_num_data] + \
[train_img_data] + \
[train_text_data] + \
[d for d in train_cat_data] + \
train_onh_data

In [28]:
# Transforming data
test_num_data = X_test[numerical_feats].values
test_img_data = X_test[img_feats].values
test_text_data = X_test[text_feats].values
test_cat_data = X_test[cat_feats].values.T

# categories range should start from 0

test_onh_data = []
# Mapping onehot categories to vectors
for onh in onehot_feats:
    vals = X_test[onh].values
    vals = to_categorical(np.clip(vals - 1, 0, np.inf).astype(np.uint8), num_classes=onehot_sizes[onh])
    test_onh_data.append(vals)

# Mapping cat_feats using cat_feats_mappings
for i, cat_feat in enumerate(cat_feats):
    test_feats = test_cat_data[i]
    for j in range(len(test_feats)):
        test_feats[j] = cat_feats_mappings[cat_feat][test_feats[j]]

In [29]:
# Reshape to list of 15 arrays
test_data = \
[test_num_data] + \
[test_img_data] + \
[test_text_data] + \
[d for d in test_cat_data] + \
test_onh_data

In [30]:
def rmse_loss(y_true, y_pred):
    diff = y_true - y_pred
    return K.sqrt(K.mean(K.square(diff)))

def map_to_int(y_true, y_pred, preds):
    thresholds = get_thresholds_from_dist(y_true, y_pred)
    return allocate_to_rate(preds, thresholds)

In [31]:
# CV
seed_everything()

n_splits = 5
early_stopping_steps = 5
epochs = 100

X_train = train_data
X_test = test_data

early_stopping = EarlyStopping(monitor='val_loss',
                               min_delta=1e-4,
                               patience=early_stopping_steps,
                               restore_best_weights=True)
callbacks = [early_stopping]

gr_kfold_split = GroupKFold(n_splits=n_splits).split([0] * len(train_df),
                                              y_train,
                                              groups=train_df['RescuerID'])

oof_train = np.zeros(shape=[len(train_df)])
oof_test = np.zeros(shape=[len(test_df), n_splits])

qwks = []
rmses = []

embeddingses = {}
for cat_feat in cat_feats:
    embeddingses[cat_feat] = []
    
for i, (train_inds, test_inds) in enumerate(gr_kfold_split):
    print('---- Fold {} ----'.format(i))

    X_tr = []
    X_val = []
    for X_inp in X_train:
        X_tr.append(X_inp[train_inds])
        X_val.append(X_inp[test_inds])
        
    y_tr = np.array(y_train)[train_inds]
    y_val = np.array(y_train)[test_inds]
          
    model = get_model()
    model.compile(optimizer=tf.train.AdamOptimizer(0.01),
                  loss='mse')
    
    model.fit(X_tr, y_tr,
              batch_size=32,
              validation_data=(X_val, y_val),
              epochs=epochs,
              callbacks=callbacks)
    
    embedding_layers = [l for l in model.layers if isinstance(l, Embedding)]
    for cat_feat, emb_layer in zip(cat_feats, embedding_layers):
        embeddingses[cat_feat].append(emb_layer.get_weights()[0])
    
    tr_pred = model.predict(X_tr)
    val_pred = model.predict(X_val)
    
    tr_pred = np.squeeze(tr_pred)
    val_pred = np.squeeze(val_pred)
    
    tr_rates = map_to_int(y_tr, tr_pred, tr_pred)
    val_rates = map_to_int(y_tr, tr_pred, val_pred)
    
    tr_rmse = rmse(y_tr, tr_pred)
    tr_qwk = qwk(y_tr, tr_rates)
    
    print('TR___RMSE: {:7.5F}___QWK: {:7.5F}'.format(tr_rmse, tr_qwk))
    
    val_rmse = rmse(y_val, val_pred)
    val_qwk = qwk(y_val, val_rates)
    
    print('VAL___RMSE: {:7.5F}___QWK: {:7.5F}'.format(val_rmse, val_qwk))
    
# Out-of-fold predictions
    oof_train[test_inds] = val_pred
    oof_test_pred = model.predict(X_test)
    oof_test_pred = np.squeeze(oof_test_pred)
    oof_test[:, i] = oof_test_pred
    
    qwks.append(val_qwk)
    rmses.append(val_rmse)
    
print('QWK CV: {} +/- {}'.format(np.mean(qwks), np.std(qwks)))
print('RMSE CV: {} +/- {}'.format(np.mean(rmses), np.std(rmses)))

for cat_feat in cat_feats:
    embeddingses[cat_feat] = np.mean(np.array(embeddingses[cat_feat]), axis=0)

---- Fold 0 ----
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Train on 11994 samples, validate on 2999 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
TR___RMSE: 1.01909___QWK: 0.51964
VAL___RMSE: 1.07731___QWK: 0.36042
---- Fold 1 ----
Train on 11994 samples, validate on 2999 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
TR___RMSE: 1.02436___QWK: 0.48312
VAL___RMSE: 1.07343___QWK: 0.40456
---- Fold 2 ----
Train on 11994 samples, validate on 2999 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100

#### 4.3.2 LightGBM Training

In [34]:
# Final datasets arrangement

X_train = train_merged.drop(columns=['PetID', 'AdoptionSpeed'])
y_train = train_merged['AdoptionSpeed']

X_test = test_merged.drop(columns=['PetID'])

# The most frequent breed of each rescuer as feature.
X_train['rescuer_breed_mode'] = X_train['RescuerID'].map(X_train.groupby('RescuerID')['Breed1'].agg(
    lambda x:x.value_counts().index[0]))
X_test['rescuer_breed_mode'] = X_test['RescuerID'].map(X_test.groupby('RescuerID')['Breed1'].agg(
    lambda x:x.value_counts().index[0]))

cat_feats = ['Type', 'Vaccinated',
             'Dewormed', 'Sterilized'] 

X_train = X_train.drop(columns=['Name', 'RescuerID'])
X_test = X_test.drop(columns=['Name', 'RescuerID'])

In [35]:
# Extracting the embeddings and inserting them into the dataframes

breeds = pd.concat([X_train['Breed1'], X_test['Breed1']]).unique()
embed_list = []
for breed in breeds:
    embed_list.append(embeddingses['Breed1'][cat_feats_mappings['Breed1'][breed]])
embed_list = np.array(embed_list)
breed1_embed_df = pd.DataFrame(embed_list).add_prefix('breed1_embed_')
breed1_embed_df['Breed1'] = breeds
X_train = X_train.merge(breed1_embed_df, how='left', on='Breed1')
X_test = X_test.merge(breed1_embed_df, how='left', on='Breed1')

breeds = pd.concat([X_train['Breed2'], X_test['Breed2']]).unique()
embed_list = []
for breed in breeds:
    embed_list.append(embeddingses['Breed2'][cat_feats_mappings['Breed2'][breed]])
embed_list = np.array(embed_list)
breed1_embed_df = pd.DataFrame(embed_list).add_prefix('breed2_embed_')
breed1_embed_df['Breed2'] = breeds
X_train = X_train.merge(breed1_embed_df, how='left', on='Breed2')
X_test = X_test.merge(breed1_embed_df, how='left', on='Breed2')

breeds = pd.concat([X_train['rescuer_breed_mode'], X_test['rescuer_breed_mode']]).unique()
embed_list = []
for breed in breeds:
    embed_list.append(embeddingses['rescuer_breed_mode'][cat_feats_mappings['rescuer_breed_mode'][breed]])
embed_list = np.array(embed_list)
breed1_embed_df = pd.DataFrame(embed_list).add_prefix('rescuer_breed_mode_embed_')
breed1_embed_df['rescuer_breed_mode'] = breeds
X_train = X_train.merge(breed1_embed_df, how='left', on='rescuer_breed_mode')
X_test = X_test.merge(breed1_embed_df, how='left', on='rescuer_breed_mode')

breeds = pd.concat([X_train['State'], X_test['State']]).unique()
embed_list = []
for breed in breeds:
    embed_list.append(embeddingses['State'][cat_feats_mappings['State'][breed]])
embed_list = np.array(embed_list)
breed1_embed_df = pd.DataFrame(embed_list).add_prefix('state_embed_')
breed1_embed_df['State'] = breeds
X_train = X_train.merge(breed1_embed_df, how='left', on='State')
X_test = X_test.merge(breed1_embed_df, how='left', on='State')

X_train = X_train.drop(columns=['Breed1', 'Breed2', 'State', 'rescuer_breed_mode'])
X_test = X_test.drop(columns=['Breed1', 'Breed2', 'State', 'rescuer_breed_mode'])

In [37]:
# Using LightGBM regression

params = {'objective': 'mse',
          'boosting': 'gbdt',
          'metric': 'rmse',
          'num_leaves': 10,
          'max_depth': 5,
          'min_data_in_leaf': 60,
          'learning_rate': 0.01,
          'bagging_fraction': 0.5,
          'bagging_freq': 1,
          'feature_fraction': 0.3,
          'feature_fraction_seed': 73,
          'lambda_l1': 0,
          'lambda_l2': 0.3,
          'verbosity': -1,
          'seed': seed}

seed_everything()

def cross_validation(X_train, y_train,
                     params, 
                     n_splits=5,
                     early_stopping_rounds=500,
                     verbose_eval=100,
                     num_boost_round=10000,
                     seed=seed):

    gr_kfold_split = GroupKFold(n_splits=5).split(X_train, y_train,
                                                  groups=train_df['RescuerID'])

    oof_train = np.zeros((X_train.shape[0]))
    oof_test = np.zeros((X_test.shape[0], n_splits))
    qwks, rmses = [], []
    importances = []
    i = 0
    for train_index, valid_index in gr_kfold_split:
        X_tr = X_train.iloc[train_index, :]
        y_tr = y_train[train_index].values
        X_val = X_train.iloc[valid_index, :]
        y_val = y_train[valid_index].values
        
        # Text features 
        X_tr_text = text_feature_extractor.fit_transform(X_tr[text_columns]).set_index(X_tr.index)
        X_val_text = text_feature_extractor.transform(X_val[text_columns]).set_index(X_val.index)       
        X_tr = pd.concat([X_tr.drop(columns=text_columns),
                          X_tr_text], axis=1)
        X_val = pd.concat([X_val.drop(columns=text_columns),
                          X_val_text], axis=1) 
        
        # LGB datasets
        d_train = lgb.Dataset(X_tr, label=y_tr)
        d_valid = lgb.Dataset(X_val, label=y_val)
        valid_sets = [d_train, d_valid]

        # Training
        print('Fold {}/{}'.format(i + 1, n_splits))
        model = lgb.train(params,
                          train_set=d_train,
                          num_boost_round=num_boost_round,
                          valid_sets=valid_sets,
                          verbose_eval=verbose_eval,
                          early_stopping_rounds=early_stopping_rounds,
                          categorical_feature=cat_feats)

        # Predictions
        tr_pred = model.predict(X_tr)
        val_pred = model.predict(X_val)
       
        # Rounding
        thresholds = get_thresholds_from_dist(y_tr, tr_pred)
        val_pred_rounded = allocate_to_rate(val_pred, thresholds)
        
        # Evaluation
        qwk_val = qwk(y_val, val_pred_rounded)
        rmse_val = rmse(y_val, val_pred)
        qwks.append(qwk_val)
        rmses.append(rmse_val)

        # Out-of-fold predictions
        oof_train[valid_index] = val_pred
        
        # Test predictions
        X_test_text = text_feature_extractor.transform(X_test[text_columns]).set_index(X_test.index)
        X_test_val = pd.concat([X_test.drop(columns=text_columns),
                                X_test_text], axis=1)
        test_pred = model.predict(X_test_val)
        oof_test[:, i] = test_pred
            
        importance = model.feature_importance('gain') 
        importances.append(pd.Series(dict(zip(X_tr.columns, importance))))

        i += 1

        print('QWK: {}, RMSE: {}\n'.format(qwk_val, rmse_val))
    
    return qwks, rmses, oof_train, oof_test, importances

qwks, rmses, oof_train, oof_test, importances = cross_validation(X_train, y_train, params)

print('QWK CV: {} +/- {}'.format(np.mean(qwks), np.std(qwks)))
print('RMSE CV: {} +/- {}'.format(np.mean(rmses), np.std(rmses)))

Fold 1/5
Training until validation scores don't improve for 500 rounds.
[100]	training's rmse: 1.10141	valid_1's rmse: 1.09383
[200]	training's rmse: 1.06191	valid_1's rmse: 1.06861
[300]	training's rmse: 1.03461	valid_1's rmse: 1.05377
[400]	training's rmse: 1.01402	valid_1's rmse: 1.04438
[500]	training's rmse: 0.997284	valid_1's rmse: 1.03885
[600]	training's rmse: 0.982431	valid_1's rmse: 1.03474
[700]	training's rmse: 0.969258	valid_1's rmse: 1.03161
[800]	training's rmse: 0.956835	valid_1's rmse: 1.02992
[900]	training's rmse: 0.945343	valid_1's rmse: 1.02814
[1000]	training's rmse: 0.934452	valid_1's rmse: 1.02731
[1100]	training's rmse: 0.924119	valid_1's rmse: 1.02656
[1200]	training's rmse: 0.914088	valid_1's rmse: 1.02571
[1300]	training's rmse: 0.904211	valid_1's rmse: 1.02538
[1400]	training's rmse: 0.894588	valid_1's rmse: 1.02497
[1500]	training's rmse: 0.885205	valid_1's rmse: 1.02442
[1600]	training's rmse: 0.876385	valid_1's rmse: 1.02421
[1700]	training's rmse: 0.867

<a id="5"></a> 
## 5. Submission

In [38]:
def submit(oof_train, oof_test):
    """Generates submission from test OOF predictions."""
    preds = oof_test.mean(axis=1)
    
    thresholds = get_thresholds_from_dist(y_train, preds)
    preds = allocate_to_rate(preds, thresholds)

    preds = preds.astype(np.int32)
    submission = pd.DataFrame({'PetID': test_df['PetID'].values, 'AdoptionSpeed': preds})
    submission.to_csv('submission.csv', index=False)
    
    return preds
    
preds = submit(oof_train, oof_test)