## Stacking different models together
Stacking combines the predictions of several already-trained models; using this technique we can hope to achieve better scores than with any single model.

In [38]:
import pandas as pd
import numpy as np

from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier

import time
import pickle
import warnings
warnings.filterwarnings('ignore')

def evaluate_model(model_name, model, X, y):
    """Score a fitted model on (X, y), print AUC and log loss, and return them as a table.

    Parameters
    ----------
    model_name : label used in the printed lines and as the index of the returned table.
    model : fitted estimator. A LinearRegression (or a subclass of it) is scored
        through ``predict``; any other model through ``predict_proba(X)[:, 1]``.
    X : features handed to the model.
    y : true labels the scores are compared against.

    Returns
    -------
    pandas.DataFrame with a single row indexed by ``model_name`` and the columns
    ``AUC`` (rounded to 4 decimals) and ``LogLoss`` (rounded to 3 decimals).
    """
    # isinstance instead of an exact type comparison, so subclasses of
    # LinearRegression also take the predict() path rather than failing on
    # a predict_proba() they do not have.
    if isinstance(model, LinearRegression):
        predictions_probas = model.predict(X)
    else:
        # Column 1 of predict_proba is taken as the score of the positive class.
        predictions_probas = model.predict_proba(X)[:, 1]

    AUC = roc_auc_score(y, predictions_probas)
    LogLoss = log_loss(y, predictions_probas)

    print('AUC for', model_name, ': %1.4f' % AUC)
    print('LogLoss for', model_name, ': %1.3f' % LogLoss)

    metrics_table = pd.DataFrame(
        {'AUC' : [round(AUC, 4)], 'LogLoss' : [round(LogLoss, 3)]},
        index = [model_name],
    )

    return metrics_table

In [17]:
# Registry of the pickled base models: short key -> location of the fitted model on disk.
paths = dict(
    catb_h1n1 = 'models/h1n1_catboost.pkl',
    catb_seas = 'models/seasonal_catboost.pkl',
    lgb_h1n1_cat = 'models/h1n1_lighgbm_categoricals.pkl',
    lgb_seas_cat = 'models/seasonal_lighgbm_categoricals.pkl',
    xgb_h1n1_enc = 'models/h1n1_xgb_encoded.pkl',
    xgb_seas_enc = 'models/seasonal_xgb_encoded.pkl',
    xgb_h1n1_cont = 'models/h1n1_xgb_continuous.pkl',
    xgb_seas_cont = 'models/seasonal_xgb_continuous.pkl',
    lgb_h1n1_cont = 'models/h1n1_lighgbm_continuous.pkl',
    lgb_seas_cont = 'models/seasonal_lighgbm_continuous.pkl',
)

# Fitted models, keyed exactly like `paths`.
models = {}

# NOTE: pickle.load executes code embedded in the file, so only point these
# paths at model files you produced or otherwise trust.
for model_name, path in paths.items():
    with open(path, 'rb') as file:
        models[model_name] = pickle.load(file)

#### Functions to prepare a feature table for each model type before predicting on it

In [18]:
def catboost_prepare(X_):
    """Return a copy of ``X_`` shaped for the CatBoost models.

    Every float64 column is cast to nullable integers and then to plain
    Python objects; afterwards missing values in all columns are replaced
    by the string 'None'. The input frame itself is left untouched.
    """
    prepared = X_.copy()

    float_columns = [name for name, dtype in prepared.dtypes.items() if dtype == 'float64']
    for name in float_columns:
        as_nullable_int = prepared[name].astype('Int64')
        prepared[name] = as_nullable_int.astype(object)

    # Missing entries become the literal label 'None' in every column.
    for name in list(prepared.columns):
        prepared[name] = prepared[name].fillna('None')

    return prepared


def lgb_cat_prepare(X_):
    """Return a copy of ``X_`` in which every column is a pandas categorical.

    Object columns get missing values replaced by the string 'None';
    float64 columns get missing values replaced by -1 and are cast to
    nullable integers; finally all columns are converted to 'category'.
    The input frame itself is left untouched.
    """
    frame = X_.copy()

    object_columns = [name for name, dtype in frame.dtypes.items() if dtype == 'object']
    for name in object_columns:
        frame[name] = frame[name].fillna('None')

    # Looked up only after the object columns were filled, as a separate pass.
    float_columns = [name for name, dtype in frame.dtypes.items() if dtype == 'float64']
    for name in float_columns:
        filled = frame[name].fillna(-1)
        frame[name] = filled.astype('Int64')

    for name in list(frame.columns):
        frame[name] = frame[name].astype('category')

    return frame


def xgb_encoded_prepare(X_):
    """Return a copy of ``X_`` prepared for the XGBoost models trained on encoded data.

    Steps, in order:
      1. float64 columns are cast to nullable integers,
      2. the concern / knowledge / opinion columns are one-hot encoded with
         ' = ' between the column name and the level,
      3. a fixed list of columns is cast to float,
      4. '<' is replaced by 'smaller_than' and ',' is removed in every column name.

    The input frame itself is left untouched.
    """
    one_hot_columns = [
        'h1n1_concern', 'h1n1_knowledge',
        'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk', 'opinion_h1n1_sick_from_vacc',
        'opinion_seas_vacc_effective', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc',
    ]
    float_columns = [
        'behavioral_antiviral_meds', 'behavioral_avoidance', 'behavioral_face_mask',
        'behavioral_wash_hands', 'behavioral_large_gatherings', 'behavioral_outside_home',
        'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal', 'chronic_med_condition',
        'child_under_6_months', 'health_worker', 'health_insurance', 'household_adults',
        'household_children', 'marital_status_married', 'rent_or_own_own',
    ]

    def _clean(name):
        # Strip the characters '<' and ',' out of a column name.
        return name.replace('<', 'smaller_than').replace(',', '')

    frame = X_.copy()

    originally_float = [name for name, dtype in frame.dtypes.items() if dtype == 'float64']
    for name in originally_float:
        frame[name] = frame[name].astype('Int64')

    frame = pd.get_dummies(frame, columns = one_hot_columns, prefix_sep = ' = ')

    for name in float_columns:
        frame[name] = frame[name].astype(float)

    frame.columns = list(map(_clean, frame.columns.tolist()))

    return frame


def xgb_continuous_prepare(X_):
    """Return a copy of ``X_`` whose column names contain neither '<' nor ','.

    '<' is spelled out as 'smaller_than' and ',' is dropped; the data and
    the input frame itself are left untouched.
    """
    def _clean(name):
        return name.replace('<', 'smaller_than').replace(',', '')

    renamed = X_.copy()
    renamed.columns = list(map(_clean, renamed.columns.tolist()))

    return renamed

## Read labeled dataset, use models for predictions, apply stacker
### H1N1

In [25]:
# Labeled H1N1 data in the layout consumed by the CatBoost and
# LightGBM-categorical preparations; the labels are taken from this file.
h1n1 = pd.read_csv('../../data/h1n1_catboost.csv')
# `columns=` instead of a positional axis: pandas 2.0 made every argument of
# DataFrame.drop except `labels` keyword-only, so `drop([...], 1)` raises there.
X_h1n1 = h1n1.drop(columns = ['h1n1_vaccine'])
y_h1n1 = h1n1['h1n1_vaccine'].copy()

X_h1n1_catboost = catboost_prepare(X_h1n1)
X_h1n1_lgb_cat = lgb_cat_prepare(X_h1n1)

# Same target, second file layout, used for the XGBoost models and the continuous LightGBM model.
# NOTE(review): y_h1n1 from the first file is later paired with predictions made
# on this file, which presumes both CSVs keep the same row order - confirm.
h1n1 = pd.read_csv('../../data/h1n1_encoded.csv')
X_h1n1 = h1n1.drop(columns = ['h1n1_vaccine'])

X_h1n1_xgb_encoded = xgb_encoded_prepare(X_h1n1)
X_h1n1_xgb_cont = xgb_continuous_prepare(X_h1n1)
# The continuous LightGBM model goes through the same preparation as the continuous XGBoost one.
X_h1n1_lgb_cont = xgb_continuous_prepare(X_h1n1)

# From here on `h1n1` holds one column of positive-class probabilities per base model.
h1n1 = pd.DataFrame(models['catb_h1n1'].predict_proba(X_h1n1_catboost)[:,1], columns = ['h1n1_catboost'])
h1n1['h1n1_lightgbm_categoricals'] = models['lgb_h1n1_cat'].predict_proba(X_h1n1_lgb_cat)[:,1]
h1n1['h1n1_xgb_encoded'] = models['xgb_h1n1_enc'].predict_proba(X_h1n1_xgb_encoded)[:,1]
h1n1['h1n1_xgb_continuous'] = models['xgb_h1n1_cont'].predict_proba(X_h1n1_xgb_cont)[:,1]
h1n1['h1n1_lightgbm_continuous'] = models['lgb_h1n1_cont'].predict_proba(X_h1n1_lgb_cont)[:,1]

h1n1.head()

Unnamed: 0,h1n1_catboost,h1n1_lightgbm_categoricals,h1n1_xgb_encoded,h1n1_xgb_continuous,h1n1_lightgbm_continuous
0,0.038705,0.052832,0.028569,0.028741,0.030537
1,0.509489,0.436289,0.220429,0.287381,0.335314
2,0.023477,0.018908,0.011959,0.009967,0.011712
3,0.040582,0.036509,0.029551,0.029272,0.033814
4,0.039613,0.036054,0.027199,0.024441,0.024135


In [20]:
X_train, X_test, y_train, y_test = train_test_split(
    h1n1, y_h1n1, test_size = 0.2, random_state = 20202020)

# Hold-out AUC of every base model on its own, one line per model in this order.
for column in ['h1n1_catboost', 'h1n1_lightgbm_categoricals', 'h1n1_xgb_encoded',
               'h1n1_xgb_continuous', 'h1n1_lightgbm_continuous']:
    print(roc_auc_score(y_test, X_test[column]))

# Hold-out AUC of simple row-wise blends of the models: max, min, mean, median.
for aggregate in ['max', 'min', 'mean', 'median']:
    print(roc_auc_score(y_test, getattr(X_test, aggregate)(axis = 1)))

0.8674056758389777
0.8672860581337819
0.8633703746675906
0.8642501776830135
0.8656496625660619
0.867963187334218
0.8669644006297048
0.8684530704448954
0.8683480351101212


In [40]:
# Derive the H1N1 hold-out split inside this cell instead of reading the
# notebook-wide X_train / X_test / y_train / y_test: the Seasonal section
# rebinds those names to seasonal data, so running this cell after it would
# silently fit and score the "H1N1" stacker on the wrong table. Seed and
# test_size match the H1N1 split cell, so the partition is the same one.
# NOTE(review): the recorded AUC of 0.7961 is below every single base model
# printed above - presumably a symptom of such stale state; re-run to confirm.
X_train_h1n1, X_test_h1n1, y_train_h1n1, y_test_h1n1 = train_test_split(
    h1n1, y_h1n1, random_state = 20202020, test_size = 0.2)

# Logistic regression over the base models' probabilities acts as the stacker.
h1n1_stacker = LogisticRegression()
h1n1_stacker.fit(X_train_h1n1, y_train_h1n1)

evaluate_model('Stacking H1N1', h1n1_stacker, X_test_h1n1, y_test_h1n1)

AUC for Stacking H1N1 : 0.7961
LogLoss for Stacking H1N1 : 0.540


Unnamed: 0,AUC,LogLoss
Stacking H1N1,0.7961,0.54


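Take the mean: for H1N1 the row-wise mean of the five base models has the best hold-out AUC above (0.8685), ahead of the median, max, min, every single model, and the logistic-regression stacker (0.7961).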

### Seasonal

In [41]:
# Labeled seasonal data in the layout consumed by the CatBoost and
# LightGBM-categorical preparations; the labels are taken from this file.
seasonal = pd.read_csv('../../data/seasonal_catboost.csv')
# `columns=` instead of a positional axis: pandas 2.0 made every argument of
# DataFrame.drop except `labels` keyword-only, so `drop([...], 1)` raises there.
X_seasonal = seasonal.drop(columns = ['seasonal_vaccine'])
y_seasonal = seasonal['seasonal_vaccine'].copy()

X_seasonal_catboost = catboost_prepare(X_seasonal)
X_seasonal_lgb_cat = lgb_cat_prepare(X_seasonal)

# Same target, second file layout, used for the XGBoost models and the continuous LightGBM model.
# NOTE(review): y_seasonal from the first file is later paired with predictions made
# on this file, which presumes both CSVs keep the same row order - confirm.
seasonal = pd.read_csv('../../data/seasonal_encoded.csv')
X_seasonal = seasonal.drop(columns = ['seasonal_vaccine'])

X_seasonal_xgb_encoded = xgb_encoded_prepare(X_seasonal)
X_seasonal_xgb_cont = xgb_continuous_prepare(X_seasonal)
# The continuous LightGBM model goes through the same preparation as the continuous XGBoost one.
X_seasonal_lgb_cont = xgb_continuous_prepare(X_seasonal)

# From here on `seasonal` holds one column of positive-class probabilities per base model.
seasonal = pd.DataFrame(models['catb_seas'].predict_proba(X_seasonal_catboost)[:,1], columns = ['seasonal_catboost'])
seasonal['seasonal_lightgbm_categoricals'] = models['lgb_seas_cat'].predict_proba(X_seasonal_lgb_cat)[:,1]
seasonal['seasonal_xgb_encoded'] = models['xgb_seas_enc'].predict_proba(X_seasonal_xgb_encoded)[:,1]
seasonal['seasonal_xgb_continuous'] = models['xgb_seas_cont'].predict_proba(X_seasonal_xgb_cont)[:,1]
seasonal['seasonal_lightgbm_continuous'] = models['lgb_seas_cont'].predict_proba(X_seasonal_lgb_cont)[:,1]

seasonal.head()

Unnamed: 0,seasonal_catboost,seasonal_lightgbm_categoricals,seasonal_xgb_encoded,seasonal_xgb_continuous,seasonal_lightgbm_continuous
0,0.062891,0.065949,0.103778,0.123237,0.077412
1,0.130829,0.157418,0.390777,0.321423,0.20432
2,0.086799,0.098619,0.132293,0.115932,0.075465
3,0.904319,0.901017,0.872618,0.856674,0.890557
4,0.122953,0.117315,0.138615,0.138363,0.102314


In [42]:
X_train, X_test, y_train, y_test = train_test_split(
    seasonal, y_seasonal, test_size = 0.2, random_state = 20202020)

# Hold-out AUC of every base model on its own, one line per model in this order.
for column in ['seasonal_catboost', 'seasonal_lightgbm_categoricals', 'seasonal_xgb_encoded',
               'seasonal_xgb_continuous', 'seasonal_lightgbm_continuous']:
    print(roc_auc_score(y_test, X_test[column]))

# Hold-out AUC of simple row-wise blends of the models: max, min, mean, median.
for aggregate in ['max', 'min', 'mean', 'median']:
    print(roc_auc_score(y_test, getattr(X_test, aggregate)(axis = 1)))

0.8614477947347995
0.8616528320532837
0.8551859070828407
0.8590857761071451
0.8619333129518153
0.8615144953205238
0.8602425306625272
0.8622032176475366
0.8622537013889515


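Take the median: for Seasonal the row-wise median of the five base models has the best hold-out AUC above (0.8623), marginally ahead of the mean (0.8622) and of every single model.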

## Read in test and process for predictions

In [43]:
# Unlabeled test set in the layout consumed by the CatBoost and LightGBM-categorical preparations.
test = pd.read_csv('../../data/originals/test_catboost.csv')
# Keep the ids aside: they are not a model feature but are needed for the submission file.
respondent_id = test['respondent_id'].copy()
# Keyword-based, non-inplace drop: pandas 2.0 no longer accepts the axis as a
# positional argument of DataFrame.drop, so `drop([...], 1, inplace = True)` raises there.
test = test.drop(columns = ['respondent_id'])

test_catboost = catboost_prepare(test)
test_lgb = lgb_cat_prepare(test)

# Same test set in the second file layout, used for the XGBoost models and the continuous LightGBM model.
test = pd.read_csv('../../data/originals/test_encoded.csv')
test = test.drop(columns = ['respondent_id'])

test_xgb_encoded = xgb_encoded_prepare(test)
test_xgb_cont = xgb_continuous_prepare(test)
# The continuous LightGBM model goes through the same preparation as the continuous XGBoost one.
test_lgb_cont = xgb_continuous_prepare(test)

#### Make predictions
H1N1

In [44]:
# One column of positive-class probabilities per H1N1 base model, computed on the test set.
# This rebinds `h1n1`, which earlier held the predictions on the labeled data.
h1n1 = pd.DataFrame({
    'h1n1_catboost' : models['catb_h1n1'].predict_proba(test_catboost)[:,1],
    'h1n1_lightgbm_categoricals' : models['lgb_h1n1_cat'].predict_proba(test_lgb)[:,1],
    'h1n1_xgb_encoded' : models['xgb_h1n1_enc'].predict_proba(test_xgb_encoded)[:,1],
    'h1n1_xgb_continuous' : models['xgb_h1n1_cont'].predict_proba(test_xgb_cont)[:,1],
    'h1n1_lightgbm_continuous' : models['lgb_h1n1_cont'].predict_proba(test_lgb_cont)[:,1],
})

h1n1.head()

Unnamed: 0,h1n1_catboost,h1n1_lightgbm_categoricals,h1n1_xgb_encoded,h1n1_xgb_continuous,h1n1_lightgbm_continuous
0,0.121296,0.155108,0.095608,0.10268,0.130207
1,0.041142,0.075113,0.055153,0.045703,0.045001
2,0.14776,0.212651,0.288272,0.215291,0.213177
3,0.620204,0.671908,0.672732,0.60822,0.586978
4,0.362429,0.378022,0.214239,0.230761,0.233213


Seasonal

In [45]:
# One column of positive-class probabilities per seasonal base model, computed on the test set.
# This rebinds `seasonal`, which earlier held the predictions on the labeled data.
seasonal = pd.DataFrame({
    'seasonal_catboost' : models['catb_seas'].predict_proba(test_catboost)[:,1],
    'seasonal_lightgbm_categoricals' : models['lgb_seas_cat'].predict_proba(test_lgb)[:,1],
    'seasonal_xgb_encoded' : models['xgb_seas_enc'].predict_proba(test_xgb_encoded)[:,1],
    'seasonal_xgb_continuous' : models['xgb_seas_cont'].predict_proba(test_xgb_cont)[:,1],
    'seasonal_lightgbm_continuous' : models['lgb_seas_cont'].predict_proba(test_lgb_cont)[:,1],
})

seasonal.head()

Unnamed: 0,seasonal_catboost,seasonal_lightgbm_categoricals,seasonal_xgb_encoded,seasonal_xgb_continuous,seasonal_lightgbm_continuous
0,0.215002,0.268159,0.196756,0.260801,0.272264
1,0.040724,0.033132,0.079003,0.079955,0.029423
2,0.802155,0.714583,0.812739,0.783467,0.721784
3,0.884563,0.902284,0.877807,0.868499,0.888846
4,0.474062,0.39791,0.518264,0.516074,0.441687


#### Put together submission file

In [46]:
# Submission table: respondent ids plus one blended probability per target -
# the row-wise mean of the H1N1 models and the row-wise median of the seasonal models.
submission = respondent_id.to_frame().assign(
    h1n1_vaccine = h1n1.mean(axis = 1),
    seasonal_vaccine = seasonal.median(axis = 1),
)

submission.head()

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,26707,0.12098,0.260801
1,26708,0.052422,0.040724
2,26709,0.21543,0.783467
3,26710,0.632009,0.884563
4,26711,0.283733,0.474062


In [47]:
# Write the submission table to disk; index = False keeps the DataFrame index out of the file.
submission.to_csv('../../data/submissions/stack_mean_median_of_models.csv', index = False)