## Modeling the h1n1 and seasonal flu shots
I have 2 target variables:
1. H1N1 vaccination
2. seasonal vaccination

For now, both target variables will be predicted by the same underlying X features.
As most of my features are categorical / binary variables, my goal is to build 4 models and then stack them:
1. CatBoost on original (not one hot encoded) dataset
2. LightGBM on original (not one hot encoded) dataset by setting categorical features
3. XGBoost on one-hot encoded dataset
4. LightGBM / CatBoost / sklearn GBM on one-hot encoded dataset (whichever is best)

### First, I'll build models with CatBoost

In [12]:
import pandas as pd
import numpy as np

from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import time
import pickle
import warnings
warnings.filterwarnings('ignore')

def evaluate_model(model_name, model, X, y):
    """Report AUC and log loss of a fitted classifier on the given data.

    Parameters
    ----------
    model_name : str
        Label used in the printed report and as the row index of the result.
    model : object
        Fitted estimator exposing ``predict_proba``; column 1 of its output
        is used as the predicted score.
    X : features passed to ``model.predict_proba``.
    y : true labels the scores are compared against.

    Returns
    -------
    pandas.DataFrame
        One row indexed by ``model_name`` with the columns ``AUC`` (rounded
        to 4 decimals) and ``LogLoss`` (rounded to 3 decimals).
    """
    positive_scores = model.predict_proba(X)[:, 1]

    auc_value = roc_auc_score(y, positive_scores)
    logloss_value = log_loss(y, positive_scores)

    print('AUC for', model_name, ': %1.4f' % auc_value)
    print('LogLoss for', model_name, ': %1.3f' % logloss_value)

    rounded_scores = {'AUC': round(auc_value, 4), 'LogLoss': round(logloss_value, 3)}
    return pd.DataFrame(rounded_scores, index=[model_name])

#### H1N1

In [3]:
# Load the H1N1 dataset prepared for CatBoost and split it into features and target.
h1n1_cat = pd.read_csv('../../data/h1n1_catboost.csv')

# Use the `columns=` keyword: passing the axis positionally (`drop([...], 1)`) is
# deprecated since pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
X_catboost = h1n1_cat.drop(columns=['h1n1_vaccine'])
y_catboost = h1n1_cat['h1n1_vaccine'].copy()

print('Original shape:', h1n1_cat.shape)
print('X shape:', X_catboost.shape)
print('y shape:', y_catboost.shape)

Original shape: (26707, 34)
X shape: (26707, 33)
y shape: (26707,)


In [4]:
# Cast every float64 column to nullable integers and then to object
# (presumably so codes read as 1 rather than 1.0 once treated as categories - TODO confirm).
float_columns = [name for name, dtype in X_catboost.dtypes.items() if dtype == 'float64']
for float_col in float_columns:
    as_nullable_int = X_catboost[float_col].astype(pd.Int64Dtype())
    X_catboost[float_col] = as_nullable_int.astype('O')

# Replace the remaining missing values in every column with the literal string 'None'.
for feature in X_catboost.columns.tolist():
    X_catboost[feature] = X_catboost[feature].fillna('None')

In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_cat_train, X_cat_test, y_cat_train, y_cat_test = train_test_split(
    X_catboost,
    y_catboost,
    test_size=0.2,
    random_state=20202020,
)

In [34]:
# Time the whole search so the cost of this cell is visible in its output.
start = time.time()
print("Started at", time.ctime(int(start)))

# Every hyper-parameter lists a single value, so the grid holds exactly one candidate;
# GridSearchCV is used here for its 5-fold cross-validated fit.
cat_params = {
    'learning_rate': [0.1],
    'l2_leaf_reg': [0.5],
    'subsample': [0.75],
    'rsm': [2/3],
    'max_depth': [9],  # up to 16 (8 on gpu)
    'grow_policy': ['Lossguide'],
    'min_data_in_leaf': [23],
    'max_leaves': [23],
    'iterations': [100],
}

# All feature columns are declared categorical for CatBoost.
cat = CatBoostClassifier(
    random_state=20202020,
    verbose=0,
    eval_metric='AUC:hints=skip_train~false',
    objective='Logloss',
    cat_features=X_catboost.columns.tolist(),
)

GRID_cat = GridSearchCV(cat, param_grid=cat_params, cv=5, scoring='roc_auc', n_jobs=-1)
GRID_cat.fit(X_cat_train, y_cat_train)

print("Ended at", time.ctime(int(time.time())))
print((time.time() - start) / 60, 'minutes')

Started at Fri Feb  5 11:50:24 2021
Ended at Fri Feb  5 11:51:06 2021
0.6912403861681621 minutes


In [35]:
# Display the hyper-parameter combination selected by the H1N1 grid search.
GRID_cat.best_params_

{'grow_policy': 'Lossguide',
 'iterations': 100,
 'l2_leaf_reg': 0.5,
 'learning_rate': 0.1,
 'max_depth': 9,
 'max_leaves': 23,
 'min_data_in_leaf': 23,
 'rsm': 0.6666666666666666,
 'subsample': 0.75}

In [36]:
# Score the best H1N1 estimator on the held-out split.
evaluate_model(
    model_name='CatBoost h1n1',
    model=GRID_cat.best_estimator_,
    X=X_cat_test,
    y=y_cat_test,
)

AUC for CatBoost h1n1 : 0.8674
LogLoss for CatBoost h1n1 : 0.345


Unnamed: 0,AUC,LogLoss
CatBoost h1n1,0.8674,0.345


In [64]:
# Serialize the best H1N1 estimator found by the grid search to disk.
model_filename = 'models/h1n1_catboost.pkl'
with open(model_filename, mode='wb') as model_file:
    pickle.dump(GRID_cat.best_estimator_, model_file)

#### Seasonal

In [38]:
# Load the seasonal dataset prepared for CatBoost and split it into features and target.
seasonal_cat = pd.read_csv('../../data/seasonal_catboost.csv')

# Use the `columns=` keyword: passing the axis positionally (`drop([...], 1)`) is
# deprecated since pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
X_catboost = seasonal_cat.drop(columns=['seasonal_vaccine'])
y_catboost = seasonal_cat['seasonal_vaccine'].copy()

print('Original shape:', seasonal_cat.shape)
print('X shape:', X_catboost.shape)
print('y shape:', y_catboost.shape)

Original shape: (26707, 34)
X shape: (26707, 33)
y shape: (26707,)


In [39]:
# Cast every float64 column to nullable integers and then to object
# (presumably so codes read as 1 rather than 1.0 once treated as categories - TODO confirm).
float_columns = [name for name, dtype in X_catboost.dtypes.items() if dtype == 'float64']
for float_col in float_columns:
    as_nullable_int = X_catboost[float_col].astype(pd.Int64Dtype())
    X_catboost[float_col] = as_nullable_int.astype('O')

# Replace the remaining missing values in every column with the literal string 'None'.
for feature in X_catboost.columns.tolist():
    X_catboost[feature] = X_catboost[feature].fillna('None')

In [40]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_cat_train, X_cat_test, y_cat_train, y_cat_test = train_test_split(
    X_catboost,
    y_catboost,
    test_size=0.2,
    random_state=20202020,
)

In [46]:
# Time the whole search so the cost of this cell is visible in its output.
start = time.time()
print("Started at", time.ctime(int(start)))

# Every hyper-parameter lists a single value, so the grid holds exactly one candidate;
# GridSearchCV is used here for its 5-fold cross-validated fit.
cat_params = {
    'learning_rate': [0.1],
    'l2_leaf_reg': [0.5],
    'subsample': [0.75],
    'rsm': [2/3],
    'max_depth': [9],  # up to 16 (8 on gpu)
    'grow_policy': ['Lossguide'],
    'min_data_in_leaf': [23],
    'max_leaves': [23],
    'iterations': [100],
}

# All feature columns are declared categorical for CatBoost.
cat = CatBoostClassifier(
    random_state=20202020,
    verbose=0,
    eval_metric='AUC:hints=skip_train~false',
    objective='Logloss',
    cat_features=X_catboost.columns.tolist(),
)

GRID_cat_seasonal = GridSearchCV(cat, param_grid=cat_params, cv=5, scoring='roc_auc', n_jobs=-1)
GRID_cat_seasonal.fit(X_cat_train, y_cat_train)

print("Ended at", time.ctime(int(time.time())))
print((time.time() - start) / 60, 'minutes')

Started at Fri Feb  5 11:54:43 2021
Ended at Fri Feb  5 11:55:29 2021
0.7637951334317525 minutes


In [47]:
# Score the best seasonal estimator on the held-out split.
evaluate_model(
    model_name='CatBoost seasonal',
    model=GRID_cat_seasonal.best_estimator_,
    X=X_cat_test,
    y=y_cat_test,
)

AUC for CatBoost seasonal : 0.8610
LogLoss for CatBoost seasonal : 0.465


Unnamed: 0,AUC,LogLoss
CatBoost seasonal,0.861,0.465


In [65]:
# Serialize the best seasonal estimator found by the grid search to disk.
model_filename = 'models/seasonal_catboost.pkl'
with open(model_filename, mode='wb') as model_file:
    pickle.dump(GRID_cat_seasonal.best_estimator_, model_file)

#### Make test predictions for 1st submit

In [57]:
# Load the test features and apply the same dtype / missing-value treatment
# that the training features received.
test = pd.read_csv('../../data/originals/test_catboost.csv')

for col in test.dtypes[test.dtypes == 'float64'].index.tolist():
    test[col] = test[col].astype(pd.Int64Dtype()).astype('O')

for col in test.columns.tolist():
    test[col] = test[col].fillna('None')

# Keep the ids for the submission file, then remove them from the model inputs.
# `columns=` replaces the positional axis argument (`drop([...], 1)`), which is
# deprecated since pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
respondent_id = test['respondent_id'].copy()
test = test.drop(columns=['respondent_id'])

In [79]:
# Column 1 of predict_proba is used as the predicted score for each target.
h1n1_scores = GRID_cat.best_estimator_.predict_proba(test)[:, 1]
seasonal_scores = GRID_cat_seasonal.best_estimator_.predict_proba(test)[:, 1]

# One row per respondent id, followed by the two score columns.
submission = pd.DataFrame(respondent_id).assign(
    h1n1_vaccine=h1n1_scores,
    seasonal_vaccine=seasonal_scores,
)

submission.head()

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,26707,0.121296,0.191064
1,26708,0.041142,0.050702
2,26709,0.14776,0.746198
3,26710,0.620204,0.904706
4,26711,0.362429,0.567235


In [80]:
# Write the submission without the DataFrame index column.
submission.to_csv(
    '../../data/submissions/first_submission_catboost.csv',
    index=False,
)