## Stacking different models together
By combining the predictions of several models, we hope to achieve better scores than any single model gives on its own.

In [76]:
import pandas as pd
import numpy as np

from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import time
import pickle
import warnings
warnings.filterwarnings('ignore')

def evaluate_model(model_name, model, X, y):
    """Score a fitted classifier on ``(X, y)`` and report AUC and log loss.

    The scores are computed from column index 1 of ``model.predict_proba(X)``.
    Both metrics are printed and also returned as a one-row DataFrame.

    Parameters
    ----------
    model_name : label used in the printed lines and as the row index.
    model : object exposing ``predict_proba``.
    X : features passed to ``predict_proba``.
    y : true labels the probabilities are scored against.

    Returns
    -------
    pandas.DataFrame
        One row indexed by ``model_name`` with columns ``AUC`` (rounded to
        4 decimals) and ``LogLoss`` (rounded to 3 decimals).
    """
    second_column_proba = model.predict_proba(X)[:, 1]

    scores = {
        'AUC': roc_auc_score(y, second_column_proba),
        'LogLoss': log_loss(y, second_column_proba),
    }

    print('AUC for', model_name, ': %1.4f' % scores['AUC'])
    print('LogLoss for', model_name, ': %1.3f' % scores['LogLoss'])

    return pd.DataFrame(
        {
            'AUC': [round(scores['AUC'], 4)],
            'LogLoss': [round(scores['LogLoss'], 3)],
        },
        index=[model_name],
    )

In [37]:
# Location of each pickled model, keyed by the short name used to look it up in `models`.
paths = {
    'catb_h1n1': 'models/h1n1_catboost.pkl',
    'catb_seas': 'models/seasonal_catboost.pkl',
    'lgb_h1n1_cat': 'models/h1n1_lighgbm_categoricals.pkl',
    'lgb_seas_cat': 'models/seasonal_lighgbm_categoricals.pkl',
}

# Unpickle every model into `models` under the same key.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
models = {}

for model_name, path in paths.items():
    with open(path, 'rb') as file:
        models[model_name] = pickle.load(file)

#### Functions to prepare data (labeled or test) for prediction

In [42]:
def catboost_prepare(X_):
    """Return a transformed copy of ``X_``; used here as input to the CatBoost models.

    Every ``float64`` column is cast to pandas' nullable ``Int64`` dtype and then
    to ``object``. Afterwards, missing values in every column are replaced by the
    string ``'None'``. The input frame itself is left untouched.
    """
    frame = X_.copy()

    float_columns = [name for name, dtype in frame.dtypes.items() if dtype == 'float64']
    for name in float_columns:
        as_nullable_int = frame[name].astype(pd.Int64Dtype())
        frame[name] = as_nullable_int.astype('O')

    for name in list(frame.columns):
        frame[name] = frame[name].fillna('None')

    return frame


def lgb_cat_prepare(X_):
    """Return a copy of ``X_`` with every column as ``category``; used here for the LightGBM models.

    Steps, in order:
    1. ``object`` columns: missing values become the string ``'None'``.
    2. ``float64`` columns: missing values become ``-1``, then the column is
       cast to pandas' nullable ``Int64`` dtype.
    3. every column is cast to ``category``.

    The input frame itself is left untouched.
    """
    frame = X_.copy()

    def names_with_dtype(kind):
        # Column names whose current dtype compares equal to `kind`.
        return [name for name, dtype in frame.dtypes.items() if dtype == kind]

    for name in names_with_dtype('object'):
        frame[name] = frame[name].fillna('None')

    for name in names_with_dtype('float64'):
        filled = frame[name].fillna(-1)
        frame[name] = filled.astype(pd.Int64Dtype())

    for name in list(frame.columns):
        frame[name] = frame[name].astype('category')

    return frame

## Read labeled dataset, use models for predictions, apply stacker
### H1N1

In [86]:
h1n1 = pd.read_csv('../../data/h1n1_catboost.csv')

# Separate features from the target. `columns=` is used instead of passing the
# axis positionally (`drop([...], 1)`), which pandas 2.0+ rejects with a TypeError.
X_h1n1 = h1n1.drop(columns = ['h1n1_vaccine'])
y_h1n1 = h1n1['h1n1_vaccine'].copy()

# One prepared copy of the features per model, built by the matching helper.
X_h1n1_catboost = catboost_prepare(X_h1n1)
X_h1n1_lgb_cat = lgb_cat_prepare(X_h1n1)

# NOTE: `h1n1` is rebound here, from the raw labeled data to a frame holding each
# model's predicted probability (column index 1 of predict_proba).
h1n1 = pd.DataFrame(models['catb_h1n1'].predict_proba(X_h1n1_catboost)[:,1], columns = ['h1n1_catboost'])
h1n1['h1n1_lightgbm_categoricals'] = models['lgb_h1n1_cat'].predict_proba(X_h1n1_lgb_cat)[:,1]

h1n1.head()

Unnamed: 0,h1n1_catboost,h1n1_lightgbm_categoricals
0,0.038705,0.052832
1,0.509489,0.436289
2,0.023477,0.018908
3,0.040582,0.036509
4,0.039613,0.036054


In [92]:
# Hold out 20% of the per-model predictions; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    h1n1,
    y_h1n1,
    test_size = 0.2,
    random_state = 20202020,
)

# Hold-out AUC, one per line: CatBoost, LightGBM, row-wise max, row-wise min.
print(
    *(
        roc_auc_score(y_test, candidate)
        for candidate in (
            X_test['h1n1_catboost'],
            X_test['h1n1_lightgbm_categoricals'],
            X_test.max(axis = 1),
            X_test.min(axis = 1),
        )
    ),
    sep = '\n',
)

0.8674056758389777
0.8672860581337819
0.8684173542042981
0.8676444885719654


In [88]:
# Meta-model: logistic regression fitted on the two models' predicted probabilities.
h1n1_stacker = LogisticRegression().fit(X_train, y_train)

# Score the stacker on the hold-out part of the split.
evaluate_model('Stacking H1N1', h1n1_stacker, X_test, y_test)

AUC for Stacking H1N1 : 0.8528
LogLoss for Stacking H1N1 : 0.382


Unnamed: 0,AUC,LogLoss
Stacking H1N1,0.8528,0.382


For now, let's take the row-wise maximum of the two models' predicted probabilities: its hold-out AUC (0.8684) is better than the stacker's (0.8528).

### Seasonal

In [93]:
seasonal = pd.read_csv('../../data/seasonal_catboost.csv')

# Separate features from the target. `columns=` is used instead of passing the
# axis positionally (`drop([...], 1)`), which pandas 2.0+ rejects with a TypeError.
X_seasonal = seasonal.drop(columns = ['seasonal_vaccine'])
y_seasonal = seasonal['seasonal_vaccine'].copy()

# One prepared copy of the features per model, built by the matching helper.
X_seasonal_catboost = catboost_prepare(X_seasonal)
X_seasonal_lgb_cat = lgb_cat_prepare(X_seasonal)

# NOTE: `seasonal` is rebound here, from the raw labeled data to a frame holding each
# model's predicted probability (column index 1 of predict_proba).
seasonal = pd.DataFrame(models['catb_seas'].predict_proba(X_seasonal_catboost)[:,1], columns = ['seasonal_catboost'])
seasonal['seasonal_lightgbm_categoricals'] = models['lgb_seas_cat'].predict_proba(X_seasonal_lgb_cat)[:,1]

seasonal.head()

Unnamed: 0,seasonal_catboost,seasonal_lightgbm_categoricals
0,0.070242,0.065949
1,0.141779,0.157418
2,0.077667,0.098619
3,0.884547,0.901017
4,0.126142,0.117315


In [94]:
# Hold out 20% of the per-model predictions; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    seasonal,
    y_seasonal,
    test_size = 0.2,
    random_state = 20202020,
)

# Hold-out AUC, one per line: CatBoost, LightGBM, row-wise max, row-wise min.
print(
    *(
        roc_auc_score(y_test, candidate)
        for candidate in (
            X_test['seasonal_catboost'],
            X_test['seasonal_lightgbm_categoricals'],
            X_test.max(axis = 1),
            X_test.min(axis = 1),
        )
    ),
    sep = '\n',
)

0.8610109270507117
0.8616528320532837
0.8622560986615887
0.8616618570796819


Take the max here too: it has the highest hold-out AUC of the four options (0.8623).

## Read in test and process for predictions

In [95]:
test = pd.read_csv('../../data/originals/test_catboost.csv')

# Keep the ids for the submission file, then remove them from the features.
# `columns=` with reassignment replaces `drop([...], 1, inplace = True)`: the
# positional axis argument raises a TypeError on pandas 2.0+.
respondent_id = test['respondent_id'].copy()
test = test.drop(columns = ['respondent_id'])

# One prepared copy of the test features per model, built by the matching helper.
test_catboost = catboost_prepare(test)
test_lgb = lgb_cat_prepare(test)

#### Make predictions
H1N1

In [96]:
# Predicted probabilities (column index 1 of predict_proba) on the test set, one column per model.
# NOTE: this rebinds `h1n1`, replacing the predictions made on the labeled data.
h1n1 = pd.DataFrame({
    'h1n1_catboost': models['catb_h1n1'].predict_proba(test_catboost)[:, 1],
    'h1n1_lightgbm_categoricals': models['lgb_h1n1_cat'].predict_proba(test_lgb)[:, 1],
})

h1n1.head()

Unnamed: 0,h1n1_catboost,h1n1_lightgbm_categoricals
0,0.121296,0.155108
1,0.041142,0.075113
2,0.14776,0.212651
3,0.620204,0.671908
4,0.362429,0.378022


Seasonal

In [97]:
# Predicted probabilities (column index 1 of predict_proba) on the test set, one column per model.
# NOTE: this rebinds `seasonal`, replacing the predictions made on the labeled data.
seasonal = pd.DataFrame({
    'seasonal_catboost': models['catb_seas'].predict_proba(test_catboost)[:, 1],
    'seasonal_lightgbm_categoricals': models['lgb_seas_cat'].predict_proba(test_lgb)[:, 1],
})

seasonal.head()

Unnamed: 0,seasonal_catboost,seasonal_lightgbm_categoricals
0,0.191064,0.268159
1,0.050702,0.033132
2,0.746198,0.714583
3,0.904706,0.902284
4,0.567235,0.39791


#### Put together submission file by taking max of CatBoost / LightGBM predictions

In [100]:
# One row per respondent id; each target column is the row-wise max over the two models' predictions.
submission = pd.DataFrame(respondent_id).assign(
    h1n1_vaccine = h1n1.max(axis = 1),
    seasonal_vaccine = seasonal.max(axis = 1),
)

submission.head()

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,26707,0.155108,0.268159
1,26708,0.075113,0.050702
2,26709,0.212651,0.746198
3,26710,0.671908,0.904706
4,26711,0.378022,0.567235


In [101]:
# Write the submission to disk; index = False keeps the DataFrame index out of the file,
# so only the columns of `submission` are written.
submission.to_csv('../../data/submissions/catboost_lighgbm_max.csv', index = False)