In [33]:
# Load packages
import numpy as np
import pandas as pd
import pandas_profiling as pf
import statsmodels.formula.api as smf

import catboost
from catboost import Pool,CatBoostClassifier

import optbinning
from optbinning import Scorecard, BinningProcess
from optbinning.scorecard.plots import plot_ks, plot_auc_roc

from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.model_selection import StratifiedKFold,train_test_split,KFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from category_encoders import OrdinalEncoder as oe
from catboost import CatBoostClassifier
from catboost import Pool, cv
from sklearn.metrics import roc_curve, roc_auc_score
import optuna

import matplotlib.pyplot as plt
%matplotlib inline
from scipy import stats
import re
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore') # if there are any warning due to version mismatch, it will be ignored

In [2]:
## Read & join datasets
# Paths are built with os.path.join so the notebook runs on any OS; the
# previous hard-coded '.\\' prefixes only resolve on Windows.
DATA_DIR = '.'

in_features = pd.read_csv(os.path.join(DATA_DIR, 'training_set_features.csv'))
in_labels = pd.read_csv(os.path.join(DATA_DIR, 'training_set_labels.csv'))

# Evaluate test features and prepare submission output
val = pd.read_csv(os.path.join(DATA_DIR, 'test_set_features.csv'))
submission_format = pd.read_csv(os.path.join(DATA_DIR, 'submission_format.csv'))

# Attach the labels to the features through the shared respondent_id key
train_sample = pd.merge(in_features, in_labels, on='respondent_id')

# Keep two thirds of the labelled rows for training, the rest as an internal test sample
train_orig, test_orig = train_test_split(train_sample, train_size=(2/3), random_state=44)

## 0) Dataset preparation

In [3]:
# Columns imputed with -1 and fed to KMeans further down.
numerical_vars = [
    'h1n1_concern', 'h1n1_knowledge',
    'behavioral_antiviral_meds', 'behavioral_avoidance', 'behavioral_face_mask',
    'behavioral_wash_hands', 'behavioral_large_gatherings', 'behavioral_outside_home',
    'behavioral_touch_face',
    'doctor_recc_h1n1', 'doctor_recc_seasonal',
    'chronic_med_condition', 'child_under_6_months', 'health_worker', 'health_insurance',
    'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk', 'opinion_h1n1_sick_from_vacc',
    'opinion_seas_vacc_effective', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc',
    'household_adults', 'household_children',
]

# Columns imputed with the 'missing_info' label.
char_vars = [
    'age_group', 'education', 'race', 'sex',
    'income_poverty', 'marital_status', 'rent_or_own', 'employment_status',
    'hhs_geo_region', 'census_msa',
    'employment_industry', 'employment_occupation',
]

# The two prediction targets and the row identifier.
target_vars = ['h1n1_vaccine', 'seasonal_vaccine']
id_var = ['respondent_id']

# Flat array of every column kept from the raw frames (axis=None flattens the pieces).
relevant_vars = np.concatenate((numerical_vars, char_vars, target_vars, id_var), axis=None)

In [4]:
# Work on independent copies restricted to the relevant columns so the
# original split frames stay untouched.
train, test = (part[relevant_vars].copy() for part in (train_orig, test_orig))

In [5]:
# Treatment for missing values:
#   numeric columns     -> sentinel -1
#   categorical columns -> explicit 'missing_info' level
# Applied identically to the train, test and val frames.
for frame in (train, test, val):
    for col in numerical_vars:
        frame[col] = frame[col].fillna(-1)
    for col in char_vars:
        frame[col] = frame[col].fillna('missing_info')

New variables

In [6]:
# (new column, left operand, right operand): every engineered feature is the
# sum of two existing columns. Names (including their historical spellings) and
# order are kept exactly as before because downstream cells rely on them.
_PAIR_SUM_FEATURES = (
    ('total_persons', 'household_adults', 'household_children'),
    ('face_mask_wash_hand', 'behavioral_face_mask', 'behavioral_wash_hands'),
    ('face_mask_touch_face', 'behavioral_face_mask', 'behavioral_touch_face'),
    ('wash_hands_touch_face', 'behavioral_wash_hands', 'behavioral_touch_face'),
    ('opinion_seas_eff_risk', 'opinion_seas_vacc_effective', 'opinion_seas_risk'),
    ('opinion_h1n1_eff_risk', 'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk'),
    ('opinio_seas_vacc_eff_sick', 'opinion_seas_vacc_effective', 'opinion_seas_sick_from_vacc'),
    ('opinon_h1n1_vacc_eff_sick', 'opinion_h1n1_vacc_effective', 'opinion_h1n1_sick_from_vacc'),
    ('h1n1_concern_knowledge', 'h1n1_concern', 'h1n1_knowledge'),
    ('behaviour_large_gath_outside_home', 'behavioral_large_gatherings', 'behavioral_outside_home'),
    ('opinion_seas_vaccine', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc'),
    ('opinion_h1n1_vaccine', 'opinion_h1n1_risk', 'opinion_h1n1_sick_from_vacc'),
)


def _add_engineered_features(frame):
    """Add the pairwise-sum features to ``frame`` in place and return it.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain every operand column named in ``_PAIR_SUM_FEATURES``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with one new column per entry
        of ``_PAIR_SUM_FEATURES`` appended in table order.
    """
    # The operands were imputed with -1 in the previous cell, so a sum can mix
    # the missing-value sentinel with a real answer (e.g. -1 + 1 == 0 + 0).
    for new_col, left_col, right_col in _PAIR_SUM_FEATURES:
        frame[new_col] = frame[left_col] + frame[right_col]
    return frame


# One definition applied to all three samples replaces three copy-pasted
# blocks that had to be kept in sync by hand.
for frame in (train, test, val):
    _add_engineered_features(frame)

In [7]:
# Introduce a "clusters" variable: 4-cluster KMeans fitted on the training
# sample's numerical columns only.
# NOTE(review): n_init is left at the library default, which differs between
# scikit-learn releases — confirm the installed version if cluster labels must
# be reproduced exactly.
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(train[numerical_vars])

KMeans(n_clusters=4, random_state=42)

In [8]:
# Label every sample with the cluster predicted by the model fitted on train.
for frame in (train, test, val):
    frame['clusters'] = kmeans.predict(frame[numerical_vars])

In [9]:
# Independent working copies per target: *_A for h1n1_vaccine, *_B for seasonal_vaccine.
train_A, test_A, val_A = train.copy(), test.copy(), val.copy()
train_B, test_B, val_B = train.copy(), test.copy(), val.copy()

In [10]:
# Mean h1n1_vaccine rate per cluster, computed on the training sample only.
# The target column is selected *before* aggregating: calling .mean() on the
# whole grouped frame also tries to average the string columns, which raises
# TypeError on pandas >= 2.0.
target_mean_A = (
    train_A.groupby('clusters')[['h1n1_vaccine']]
    .mean()
    .rename(columns=lambda col: col + "_mean")
)

# Attach the per-cluster rate (h1n1_vaccine_mean) to every sample; a left join
# keeps all rows of each sample.
train_A = train_A.merge(target_mean_A, how='left', on='clusters')
test_A = test_A.merge(target_mean_A, how="left", on='clusters')
val_A = val_A.merge(target_mean_A, how="left", on='clusters')

In [11]:
# Mean seasonal_vaccine rate per cluster, computed on the training sample only.
# The target column is selected *before* aggregating: calling .mean() on the
# whole grouped frame also tries to average the string columns, which raises
# TypeError on pandas >= 2.0.
target_mean_B = (
    train_B.groupby('clusters')[['seasonal_vaccine']]
    .mean()
    .rename(columns=lambda col: col + "_mean")
)

# Attach the per-cluster rate (seasonal_vaccine_mean) to every sample; a left
# join keeps all rows of each sample.
train_B = train_B.merge(target_mean_B, how='left', on='clusters')
test_B = test_B.merge(target_mean_B, how="left", on='clusters')
val_B = val_B.merge(target_mean_B, how='left', on='clusters')

In [12]:
# Bring both per-cluster target means into one frame per sample: start from the
# *_A frame (minus the raw cluster label) and join seasonal_vaccine_mean from
# the matching *_B frame on respondent_id.
seasonal_mean_cols = ['respondent_id', 'seasonal_vaccine_mean']

train_combined = train_A.drop(columns=['clusters']).merge(train_B[seasonal_mean_cols], on='respondent_id')
test_combined = test_A.drop(columns=['clusters']).merge(test_B[seasonal_mean_cols], on='respondent_id')
val_combined = val_A.drop(columns=['clusters']).merge(val_B[seasonal_mean_cols], on='respondent_id')

## 1) Logistic regression

In [13]:
# Feature list: every column of the combined training frame except the two
# targets (categorical and numerical alike).
target_columns = ['h1n1_vaccine', 'seasonal_vaccine']
list_features = train_combined.drop(columns=target_columns).columns.values

# All features are declared categorical for the binning step.
# NOTE(review): list_features still contains 'respondent_id' — confirm the
# identifier is meant to be binned as a feature.
list_categorical = list_features

# Variable selection by information value (IV) for BinningProcess.
selection_criteria = {"iv": {"min": 0.005, 'max': 0.5, "strategy": "highest"}}

# One identically configured BinningProcess per target
# (t1 = h1n1_vaccine, t2 = seasonal_vaccine).
binning_kwargs = dict(
    categorical_variables=list_categorical,
    variable_names=list_features,
    selection_criteria=selection_criteria,
)
binning_process_t1 = BinningProcess(**binning_kwargs)
binning_process_t2 = BinningProcess(**binning_kwargs)

In [14]:
# Linear estimators for the scorecards: same hyper-parameters for both targets
# (t1 = h1n1_vaccine, t2 = seasonal_vaccine), one independent instance each.
logreg_params = dict(C=3, max_iter=1000, random_state=161)
logreg_t1 = LogisticRegression(**logreg_params)
logreg_t2 = LogisticRegression(**logreg_params)

In [15]:
# Define scaling method and values: scores are rescaled to the 0-1000 range
scaling_method = "min_max"
scaling_method_data = {"min": 0, "max": 1000}

# Instantiate the Scorecard for h1n1_vaccine
scorecard_t1 = Scorecard(
    binning_process=binning_process_t1,
    estimator=logreg_t1,
    scaling_method=scaling_method,
    scaling_method_params=scaling_method_data,
    intercept_based=False,
    reverse_scorecard=True,
)

# Instantiate the Scorecard for seasonal_vaccine
scorecard_t2 = Scorecard(
    binning_process=binning_process_t2,
    estimator=logreg_t2,
    scaling_method=scaling_method,
    scaling_method_params=scaling_method_data,
    intercept_based=False,
    reverse_scorecard=True,
)

# Fit both scorecards. Features and targets are read from the SAME frame:
# train_combined was produced by merges and carries a fresh index, so taking y
# from `train` (original shuffled index) only lined up because the merges
# happened to preserve row order.
scorecard_features = train_combined.drop(columns=['h1n1_vaccine','seasonal_vaccine'])

scorecard_t1.fit(scorecard_features, y=train_combined['h1n1_vaccine'])
scorecard_t2.fit(scorecard_features, y=train_combined['seasonal_vaccine'])

Scorecard(binning_process=BinningProcess(categorical_variables=array(['h1n1_concern', 'h1n1_knowledge', 'behavioral_antiviral_meds',
       'behavioral_avoidance', 'behavioral_face_mask',
       'behavioral_wash_hands', 'behavioral_large_gatherings',
       'behavioral_outside_home', 'behavioral_touch_face',
       'doctor_recc_h1n1', 'doctor_recc_seasonal',
       'chronic_med_condition', 'chi...
       'opinon_h1n1_vacc_eff_sick', 'h1n1_concern_knowledge',
       'behaviour_large_gath_outside_home', 'opinion_seas_vaccine',
       'opinion_h1n1_vaccine', 'h1n1_vaccine_mean',
       'seasonal_vaccine_mean'], dtype=object)),
          estimator=LogisticRegression(C=3, max_iter=1000, random_state=161),
          reverse_scorecard=True, scaling_method='min_max',
          scaling_method_params={'max': 1000, 'min': 0})

In [16]:
# Evaluate: write each scorecard's score onto every sample.
# score_t1 comes from the h1n1 scorecard, score_t2 from the seasonal one.
for score_col, fitted_card in (("score_t1", scorecard_t1), ("score_t2", scorecard_t2)):
    for frame in (train_combined, test_combined, val_combined):
        frame.loc[:, score_col] = fitted_card.score(frame)

In [17]:
# h1n1 scorecard diagnostics, train sample first and test sample second:
# Kolmogorov-Smirnov plot, then ROC-AUC plot, then save and close.
# NOTE(review): both plot calls run before the single savefig — confirm they do
# not end up drawn on the same figure.
for sample_frame, figure_path in (
    (train_combined, 'proc_log_train_h1n1.jpg'),
    (test_combined, 'proc_log_test_h1n1.jpg'),
):
    plot_ks(sample_frame['h1n1_vaccine'], sample_frame.score_t1)
    plot_auc_roc(sample_frame['h1n1_vaccine'], sample_frame.score_t1)
    plt.savefig(figure_path)
    plt.close()

**Train:** ![](proc_log_train_h1n1.jpg) **Test:** ![](proc_log_test_h1n1.jpg)

In [18]:
# Seasonal-vaccine scorecard diagnostics, train sample first and test sample second:
# Kolmogorov-Smirnov plot, then ROC-AUC plot, then save and close.
# NOTE(review): both plot calls run before the single savefig — confirm they do
# not end up drawn on the same figure.
for sample_frame, figure_path in (
    (train_combined, 'proc_log_train_seasonal_vaccine.jpg'),
    (test_combined, 'proc_log_test_seasonal_vaccine.jpg'),
):
    plot_ks(sample_frame['seasonal_vaccine'], sample_frame.score_t2)
    plot_auc_roc(sample_frame['seasonal_vaccine'], sample_frame.score_t2)
    plt.savefig(figure_path)
    plt.close()

**Train:** ![](proc_log_train_seasonal_vaccine.jpg) **Test:** ![](proc_log_test_seasonal_vaccine.jpg)

# and now for something completely different

## 2) CatBoost - https://catboost.ai/
The train, test and val samples now carry the logistic-regression scorecard outputs (`score_t1`, `score_t2`), which are passed to CatBoost as additional input features.

In [30]:
# 20-fold stratified, shuffled CV splitter and the CatBoost model for the h1n1 target.
skf = StratifiedKFold(n_splits=20, shuffle=True, random_state=42)
cat = CatBoostClassifier(
    iterations=50000,
    random_state=42,
    reg_lambda=30,
    eval_metric='AUC',
    custom_loss='Logloss',
)

In [31]:
# Per-fold AUCs and the running sum of validation-set predictions for h1n1
h1n1_scores = []
h1n1_pred = np.zeros(len(val_combined))

# Combine train & test, since the k-fold loop splits them again anyway.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
kfold_input = pd.concat([train_combined, test_combined])
# Reorder the validation columns to match the training feature order
val_input = val_combined[kfold_input.drop(columns=['h1n1_vaccine','seasonal_vaccine']).columns.values].copy()

# Target A: h1n1_vaccine
X_a = kfold_input.drop(['h1n1_vaccine','seasonal_vaccine'],axis=1)
y_a = kfold_input['h1n1_vaccine']
test_A = val_input

# Target B: seasonal_vaccine (same feature matrix, different label)
X_b = kfold_input.drop(['h1n1_vaccine','seasonal_vaccine'],axis=1)
y_b = kfold_input['seasonal_vaccine']
test_B = val_input

# Categorical variables = positional indices of the object-dtype columns
categorical_A = np.where(X_a.dtypes == object)[0]
categorical_B = np.where(X_b.dtypes == object)[0]

In [None]:
# Cross-validated training for h1n1_vaccine: each fold refits `cat`, records the
# held-out AUC and adds its validation-set probabilities to the running sum.
for fold, (train_index, test_index) in enumerate(skf.split(X_a, y_a)):
    print(f'====================================== FOLD {fold} ======================================================')
    X_train, y_train = X_a.iloc[train_index], y_a.iloc[train_index]
    X_test, y_test = X_a.iloc[test_index], y_a.iloc[test_index]
    cat.fit(
        X_train,
        y_train,
        cat_features=categorical_A,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        use_best_model=True,
        early_stopping_rounds=500,
        metric_period=500,
    )
    fold_proba = cat.predict_proba(X_test)[:, 1]
    h1n1_scores.append(roc_auc_score(y_test, fold_proba))
    h1n1_pred += cat.predict_proba(test_A)[:, 1]

In [39]:
# Mean held-out AUC across folds for the first target (h1n1_vaccine)
np.asarray(h1n1_scores).mean()

0.8734776095925477

In [40]:
# Fresh splitter and model for the second target (seasonal_vaccine), configured
# exactly like the first ones.
skf = StratifiedKFold(n_splits=20, shuffle=True, random_state=42)
cat = CatBoostClassifier(
    iterations=50000,
    random_state=42,
    reg_lambda=30,
    eval_metric='AUC',
    custom_loss='Logloss',
)

In [41]:
# Per-fold AUCs and the running sum of validation-set predictions for seasonal_vaccine
season_scores = list()
season_pred = np.zeros(len(val_combined))

In [42]:
# Cross-validated training for seasonal_vaccine: each fold refits `cat`, records
# the held-out AUC and adds its validation-set probabilities to the running sum.
for fold, (train_index, test_index) in enumerate(skf.split(X_b, y_b)):
    print(f'====================================== FOLD {fold} ======================================================')
    X_train, y_train = X_b.iloc[train_index], y_b.iloc[train_index]
    X_test, y_test = X_b.iloc[test_index], y_b.iloc[test_index]
    cat.fit(
        X_train,
        y_train,
        cat_features=categorical_B,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        use_best_model=True,
        early_stopping_rounds=500,
        metric_period=500,
    )
    fold_proba = cat.predict_proba(X_test)[:, 1]
    season_scores.append(roc_auc_score(y_test, fold_proba))
    season_pred += cat.predict_proba(test_B)[:, 1]





0:	test: 0.8210236	test1: 0.8149651	best: 0.8149651 (0)	total: 140ms	remaining: 1h 56m 54s
500:	test: 0.8753508	test1: 0.8607163	best: 0.8607704 (495)	total: 38s	remaining: 1h 2m 38s
1000:	test: 0.8835315	test1: 0.8618692	best: 0.8619502 (963)	total: 1m 14s	remaining: 1h 23s
1500:	test: 0.8900258	test1: 0.8623758	best: 0.8624884 (1453)	total: 1m 50s	remaining: 59m 25s
2000:	test: 0.8959003	test1: 0.8628487	best: 0.8628915 (1993)	total: 2m 27s	remaining: 58m 59s
2500:	test: 0.9013143	test1: 0.8629748	best: 0.8631707 (2478)	total: 3m 5s	remaining: 58m 38s
3000:	test: 0.9064170	test1: 0.8626821	best: 0.8632720 (2596)	total: 3m 55s	remaining: 1h 1m 21s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.8632719969
bestIteration = 2596

Shrink model to first 2597 iterations.




0:	test: 0.8230835	test1: 0.7975830	best: 0.7975830 (0)	total: 96.1ms	remaining: 1h 20m 3s
500:	test: 0.8788366	test1: 0.8421195	best: 0.8421690 (484)	total: 51.8s	remaining: 1h 25m 19s
1000:	test: 0.8872158	test1: 0.8435155	best: 0.8435291 (995)	total: 1m 48s	remaining: 1h 28m 6s
1500:	test: 0.8940207	test1: 0.8439073	best: 0.8442383 (1378)	total: 2m 42s	remaining: 1h 27m 15s
2000:	test: 0.8997816	test1: 0.8439299	best: 0.8443081 (1836)	total: 3m 25s	remaining: 1h 22m 1s
2500:	test: 0.9058238	test1: 0.8441550	best: 0.8443779 (2155)	total: 4m 9s	remaining: 1h 19m 5s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.8443779441
bestIteration = 2155

Shrink model to first 2156 iterations.




0:	test: 0.8204234	test1: 0.8284167	best: 0.8284167 (0)	total: 87.7ms	remaining: 1h 13m 6s
500:	test: 0.8767858	test1: 0.8693088	best: 0.8694529 (420)	total: 39.5s	remaining: 1h 5m
1000:	test: 0.8855202	test1: 0.8702478	best: 0.8704234 (844)	total: 1m 19s	remaining: 1h 4m 56s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.8704234105
bestIteration = 844

Shrink model to first 845 iterations.




0:	test: 0.8221704	test1: 0.8145462	best: 0.8145462 (0)	total: 71.3ms	remaining: 59m 25s
500:	test: 0.8765635	test1: 0.8682955	best: 0.8683271 (481)	total: 37.7s	remaining: 1h 2m 4s
1000:	test: 0.8834827	test1: 0.8687301	best: 0.8687887 (586)	total: 1m 12s	remaining: 59m 18s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.868788673
bestIteration = 586

Shrink model to first 587 iterations.




0:	test: 0.8210563	test1: 0.8348881	best: 0.8348881 (0)	total: 76.8ms	remaining: 1h 4m 1s
500:	test: 0.8780540	test1: 0.8652400	best: 0.8653863 (336)	total: 35.9s	remaining: 59m 2s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.8653863475
bestIteration = 336

Shrink model to first 337 iterations.




0:	test: 0.8209372	test1: 0.8189145	best: 0.8189145 (0)	total: 79ms	remaining: 1h 5m 48s
500:	test: 0.8767193	test1: 0.8567195	best: 0.8567195 (500)	total: 36.2s	remaining: 59m 40s
1000:	test: 0.8859173	test1: 0.8578319	best: 0.8578521 (955)	total: 1m 12s	remaining: 59m
1500:	test: 0.8933735	test1: 0.8581021	best: 0.8583363 (1381)	total: 1m 48s	remaining: 58m 39s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.8583362605
bestIteration = 1381

Shrink model to first 1382 iterations.




0:	test: 0.8256062	test1: 0.8295370	best: 0.8295370 (0)	total: 68.3ms	remaining: 56m 57s
500:	test: 0.8762988	test1: 0.8705900	best: 0.8706441 (496)	total: 37.6s	remaining: 1h 1m 56s
1000:	test: 0.8839222	test1: 0.8709751	best: 0.8716686 (805)	total: 1m 15s	remaining: 1h 1m 22s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.8716686031
bestIteration = 805

Shrink model to first 806 iterations.




0:	test: 0.8225253	test1: 0.8416072	best: 0.8416072 (0)	total: 74.8ms	remaining: 1h 2m 18s
500:	test: 0.8748067	test1: 0.8854913	best: 0.8854936 (499)	total: 35.4s	remaining: 58m 15s
1000:	test: 0.8827471	test1: 0.8855229	best: 0.8856649 (912)	total: 1m 11s	remaining: 58m 7s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.8856649364
bestIteration = 912

Shrink model to first 913 iterations.




0:	test: 0.8235997	test1: 0.8256044	best: 0.8256044 (0)	total: 68.2ms	remaining: 56m 47s
500:	test: 0.8761501	test1: 0.8590598	best: 0.8590598 (499)	total: 34.6s	remaining: 57m 2s
1000:	test: 0.8841995	test1: 0.8601264	best: 0.8601264 (1000)	total: 1m 11s	remaining: 58m 23s
1500:	test: 0.8910813	test1: 0.8605954	best: 0.8607735 (1423)	total: 1m 48s	remaining: 58m 33s
2000:	test: 0.8977549	test1: 0.8608457	best: 0.8609020 (1987)	total: 2m 24s	remaining: 57m 57s
2500:	test: 0.9036507	test1: 0.8605367	best: 0.8610351 (2078)	total: 3m	remaining: 57m 15s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.8610350721
bestIteration = 2078

Shrink model to first 2079 iterations.




0:	test: 0.8233279	test1: 0.8273711	best: 0.8273711 (0)	total: 71.8ms	remaining: 59m 49s
500:	test: 0.8748159	test1: 0.8705619	best: 0.8705619 (500)	total: 34s	remaining: 56m 2s
1000:	test: 0.8847113	test1: 0.8711278	best: 0.8711774 (988)	total: 1m 10s	remaining: 57m 14s
1500:	test: 0.8929927	test1: 0.8718922	best: 0.8721132 (1406)	total: 1m 46s	remaining: 57m 26s
2000:	test: 0.8999244	test1: 0.8724154	best: 0.8724605 (1962)	total: 2m 22s	remaining: 56m 51s
2500:	test: 0.9056997	test1: 0.8722868	best: 0.8725822 (2174)	total: 2m 58s	remaining: 56m 36s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.8725822236
bestIteration = 2174

Shrink model to first 2175 iterations.




0:	test: 0.8238729	test1: 0.8112759	best: 0.8112759 (0)	total: 78.2ms	remaining: 1h 5m 8s
500:	test: 0.8771645	test1: 0.8574904	best: 0.8574994 (493)	total: 35.2s	remaining: 57m 56s
1000:	test: 0.8860356	test1: 0.8578918	best: 0.8580812 (773)	total: 1m 11s	remaining: 58m 6s
1500:	test: 0.8931455	test1: 0.8580789	best: 0.8583676 (1422)	total: 1m 48s	remaining: 58m 28s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.8583675697
bestIteration = 1422

Shrink model to first 1423 iterations.




0:	test: 0.8236991	test1: 0.8220181	best: 0.8220181 (0)	total: 71.1ms	remaining: 59m 12s
500:	test: 0.8753962	test1: 0.8682100	best: 0.8682100 (499)	total: 35.2s	remaining: 57m 56s
1000:	test: 0.8837616	test1: 0.8695134	best: 0.8697749 (901)	total: 1m 10s	remaining: 57m 52s
1500:	test: 0.8913168	test1: 0.8697817	best: 0.8701830 (1371)	total: 1m 48s	remaining: 58m 12s
2000:	test: 0.8971656	test1: 0.8702327	best: 0.8703454 (1972)	total: 2m 25s	remaining: 57m 58s
2500:	test: 0.9023498	test1: 0.8700455	best: 0.8703882 (2197)	total: 3m 1s	remaining: 57m 36s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.8703882422
bestIteration = 2197

Shrink model to first 2198 iterations.




0:	test: 0.8244084	test1: 0.8091992	best: 0.8091992 (0)	total: 73.3ms	remaining: 1h 1m 5s
500:	test: 0.8771939	test1: 0.8458102	best: 0.8458102 (499)	total: 35.4s	remaining: 58m 21s
1000:	test: 0.8858819	test1: 0.8473593	best: 0.8474022 (996)	total: 1m 11s	remaining: 58m 22s
1500:	test: 0.8929306	test1: 0.8485927	best: 0.8486333 (1478)	total: 1m 47s	remaining: 58m 2s
2000:	test: 0.8988762	test1: 0.8487100	best: 0.8489580 (1923)	total: 2m 24s	remaining: 57m 43s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.848958028
bestIteration = 1923

Shrink model to first 1924 iterations.




0:	test: 0.8237863	test1: 0.8138983	best: 0.8138983 (0)	total: 86.5ms	remaining: 1h 12m 2s
500:	test: 0.8760664	test1: 0.8555512	best: 0.8555512 (499)	total: 34.8s	remaining: 57m 13s
1000:	test: 0.8849714	test1: 0.8574273	best: 0.8574273 (1000)	total: 1m 10s	remaining: 57m 23s
1500:	test: 0.8923634	test1: 0.8584984	best: 0.8585367 (1457)	total: 1m 46s	remaining: 57m 15s
2000:	test: 0.8985275	test1: 0.8584758	best: 0.8588930 (1837)	total: 2m 22s	remaining: 56m 52s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.8588929527
bestIteration = 1837

Shrink model to first 1838 iterations.




0:	test: 0.8242325	test1: 0.8096017	best: 0.8096017 (0)	total: 78.1ms	remaining: 1h 5m 3s
500:	test: 0.8780607	test1: 0.8468678	best: 0.8469625 (477)	total: 34.6s	remaining: 57m 3s
1000:	test: 0.8854467	test1: 0.8479005	best: 0.8479456 (992)	total: 1m 8s	remaining: 55m 43s
1500:	test: 0.8913781	test1: 0.8482838	best: 0.8483808 (1436)	total: 1m 42s	remaining: 55m 9s
2000:	test: 0.8972895	test1: 0.8484169	best: 0.8486559 (1941)	total: 2m 17s	remaining: 54m 47s
2500:	test: 0.9023605	test1: 0.8485138	best: 0.8487889 (2273)	total: 2m 52s	remaining: 54m 28s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.8487889133
bestIteration = 2273

Shrink model to first 2274 iterations.




0:	test: 0.8231600	test1: 0.8268414	best: 0.8268414 (0)	total: 69.5ms	remaining: 57m 53s
500:	test: 0.8755739	test1: 0.8642607	best: 0.8643058 (483)	total: 33.2s	remaining: 54m 42s
1000:	test: 0.8861269	test1: 0.8662521	best: 0.8662634 (999)	total: 1m 7s	remaining: 54m 57s
1500:	test: 0.8941107	test1: 0.8661980	best: 0.8667235 (1183)	total: 1m 42s	remaining: 55m 15s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.866723501
bestIteration = 1183

Shrink model to first 1184 iterations.




0:	test: 0.8230191	test1: 0.8336096	best: 0.8336096 (0)	total: 70.8ms	remaining: 59m 1s
500:	test: 0.8746623	test1: 0.8773709	best: 0.8773709 (500)	total: 33.7s	remaining: 55m 28s
1000:	test: 0.8832619	test1: 0.8788346	best: 0.8790444 (975)	total: 1m 7s	remaining: 55m 2s
1500:	test: 0.8906769	test1: 0.8789136	best: 0.8791053 (1266)	total: 1m 42s	remaining: 55m 9s
2000:	test: 0.8973601	test1: 0.8788820	best: 0.8792271 (1863)	total: 2m 17s	remaining: 54m 51s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.8792270531
bestIteration = 1863

Shrink model to first 1864 iterations.




0:	test: 0.8245574	test1: 0.8054597	best: 0.8054597 (0)	total: 67.6ms	remaining: 56m 19s
500:	test: 0.8764897	test1: 0.8570572	best: 0.8571023 (496)	total: 33.6s	remaining: 55m 16s
1000:	test: 0.8842480	test1: 0.8578307	best: 0.8579052 (902)	total: 1m 7s	remaining: 55m 1s
1500:	test: 0.8907306	test1: 0.8585367	best: 0.8585457 (1485)	total: 1m 41s	remaining: 54m 51s
2000:	test: 0.8972344	test1: 0.8588005	best: 0.8588434 (1988)	total: 2m 17s	remaining: 54m 52s
2500:	test: 0.9031763	test1: 0.8593012	best: 0.8593869 (2485)	total: 2m 52s	remaining: 54m 37s
3000:	test: 0.9089774	test1: 0.8594117	best: 0.8596733 (2617)	total: 3m 27s	remaining: 54m 10s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.8596733379
bestIteration = 2617

Shrink model to first 2618 iterations.




0:	test: 0.8231457	test1: 0.8308367	best: 0.8308367 (0)	total: 64.3ms	remaining: 53m 34s
500:	test: 0.8753678	test1: 0.8687398	best: 0.8687420 (498)	total: 33.2s	remaining: 54m 38s
1000:	test: 0.8842110	test1: 0.8702238	best: 0.8703185 (948)	total: 1m 6s	remaining: 53m 56s
1500:	test: 0.8914171	test1: 0.8707538	best: 0.8708665 (1460)	total: 1m 40s	remaining: 54m
2000:	test: 0.8974600	test1: 0.8709139	best: 0.8710650 (1804)	total: 2m 16s	remaining: 54m 41s
2500:	test: 0.9033517	test1: 0.8706252	best: 0.8711462 (2031)	total: 2m 54s	remaining: 55m 5s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.8711462041
bestIteration = 2031

Shrink model to first 2032 iterations.




0:	test: 0.8234185	test1: 0.8246695	best: 0.8246695 (0)	total: 75.4ms	remaining: 1h 2m 49s
500:	test: 0.8754124	test1: 0.8774927	best: 0.8774927 (499)	total: 35.5s	remaining: 58m 23s
1000:	test: 0.8834808	test1: 0.8787534	best: 0.8788888 (931)	total: 1m 11s	remaining: 58m 17s
1500:	test: 0.8899432	test1: 0.8795067	best: 0.8796601 (1478)	total: 1m 47s	remaining: 57m 59s
2000:	test: 0.8960333	test1: 0.8801562	best: 0.8803006 (1948)	total: 2m 28s	remaining: 59m 10s
2500:	test: 0.9017671	test1: 0.8802420	best: 0.8806208 (2217)	total: 3m 6s	remaining: 58m 57s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.8806208474
bestIteration = 2217

Shrink model to first 2218 iterations.


In [43]:
# Mean held-out AUC across folds for the second target (seasonal_vaccine)
np.asarray(season_scores).mean()

0.8652161058583807

In [44]:
# Turn the summed per-fold probabilities into one averaged probability per
# validation row. The divisor is the number of folds actually scored instead of
# a hard-coded 20, so it stays right if n_splits changes or a CV loop is re-run
# (scores and predictions accumulate together).
n_folds_h1n1 = len(h1n1_scores)
n_folds_seasonal = len(season_scores)
assert n_folds_h1n1 > 0 and n_folds_seasonal > 0, "run both CatBoost CV loops before averaging"

h1n1_vaccine = h1n1_pred / n_folds_h1n1
seasonal_vaccine = season_pred / n_folds_seasonal

In [45]:
# Submission frame. IDs are read from val_input, the frame the predictions were
# actually computed on, so each ID is guaranteed to sit next to its own
# prediction rather than relying on `val` having the same row order.
output_scored = pd.DataFrame({
    'respondent_id': val_input['respondent_id'],
    'h1n1_vaccine': h1n1_vaccine,
    'seasonal_vaccine': seasonal_vaccine,
})

In [46]:
# Sanity check: summary statistics of the IDs and both predicted probabilities
output_scored.describe()

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
count,26708.0,26708.0,26708.0
mean,40060.5,0.211979,0.44416
std,7710.079831,0.246988,0.313093
min,26707.0,0.004524,0.007646
25%,33383.75,0.041797,0.14551
50%,40060.5,0.097399,0.394915
75%,46737.25,0.287115,0.74761
max,53414.0,0.98182,0.987686


In [47]:
# Pairwise correlation between the ID and the two predicted probabilities
output_scored.corr()

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
respondent_id,1.0,0.002688,0.000156
h1n1_vaccine,0.002688,1.0,0.574622
seasonal_vaccine,0.000156,0.574622,1.0


In [48]:
# Write the predictions to CSV without the DataFrame index column
output_scored.to_csv("output_catboost_v2.csv",index=False)

## Testing CatBoost tuning with optuna

In [32]:
# CatBoost Pool for the h1n1 target, consumed by cv() inside the Optuna objective.
train_dataset_A = Pool(data=X_a, label=y_a, cat_features=categorical_A)

In [34]:
def objective(trial):
    """Optuna objective for model A: cross-validated AUC of one CatBoost configuration.

    Samples a hyper-parameter configuration from ``trial``, runs 5-fold CatBoost
    ``cv`` on the module-level ``train_dataset_A`` pool and scores the
    configuration by the best fold-averaged test AUC.

    Args:
        trial: the Optuna trial object used to sample the hyper-parameters.

    Returns:
        The maximum of the ``'test-AUC-mean'`` column of the cv result.
    """
    # The suggest_* calls are kept in their original order so a seeded sampler
    # produces the same sequence of configurations.
    param = {
        'iterations': trial.suggest_categorical('iterations', [100, 200, 300, 500, 1000, 1200, 1500]),
        'learning_rate': trial.suggest_float("learning_rate", 0.001, 0.3),
        'random_strength': trial.suggest_int("random_strength", 1, 10),
        'bagging_temperature': trial.suggest_int("bagging_temperature", 0, 10),
        'max_bin': trial.suggest_categorical('max_bin', [4, 5, 6, 8, 10, 20, 30]),
        'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
        'min_data_in_leaf': trial.suggest_int("min_data_in_leaf", 1, 10),
        # Fixed overfitting-detector settings.
        # NOTE(review): cv() below is also given early_stopping_rounds=10; check
        # which of the two patience values CatBoost actually applies.
        'od_type': "Iter",
        'od_wait': 100,
        # Registered with Optuna as "max_depth" (the key that appears in
        # trial.params) but passed to CatBoost here under the name "depth".
        "depth": trial.suggest_int("max_depth", 2, 10),
        # suggest_float(..., log=True) is the supported replacement for the
        # deprecated suggest_loguniform and samples the same distribution.
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-8, 100, log=True),
        'one_hot_max_size': trial.suggest_categorical('one_hot_max_size', [5, 10, 12, 100, 500, 1024]),
        # Logloss is optimised; AUC is only tracked so it can be returned below.
        'custom_metric': ['AUC'],
        "loss_function": "Logloss",
        'auto_class_weights': trial.suggest_categorical('auto_class_weights', ['Balanced', 'SqrtBalanced']),
    }

    scores = cv(
        train_dataset_A,
        param,
        fold_count=5,
        early_stopping_rounds=10,
        plot=False,
        verbose=False,
    )

    return scores['test-AUC-mean'].max()

In [36]:
# Seeded TPE sampler so repeated runs suggest the same hyper-parameter sequence.
sampler = optuna.samplers.TPESampler(seed=68)
# Higher cross-validated AUC is better, hence "maximize".
study = optuna.create_study(sampler=sampler, direction="maximize")
study.optimize(objective, n_trials=10)

[32m[I 2022-04-18 16:11:48,559][0m A new study created in memory with name: no-name-dbdb4bc6-222f-46f5-a5cc-1b6432addea6[0m


Training on fold [0/5]

bestTest = 0.4068766478
bestIteration = 675

Training on fold [1/5]

bestTest = 0.426212928
bestIteration = 522

Training on fold [2/5]

bestTest = 0.431274293
bestIteration = 532

Training on fold [3/5]

bestTest = 0.4310591191
bestIteration = 467

Training on fold [4/5]


[32m[I 2022-04-18 16:12:05,650][0m Trial 0 finished with value: 0.8672466586293155 and parameters: {'iterations': 1500, 'learning_rate': 0.029356482739949695, 'random_strength': 8, 'bagging_temperature': 10, 'max_bin': 6, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 1, 'max_depth': 4, 'l2_leaf_reg': 0.001991194871120998, 'one_hot_max_size': 100, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 0 with value: 0.8672466586293155.[0m



bestTest = 0.4217968707
bestIteration = 345

Training on fold [0/5]

bestTest = 0.4098890198
bestIteration = 155

Training on fold [1/5]

bestTest = 0.4281604806
bestIteration = 158

Training on fold [2/5]

bestTest = 0.4304260887
bestIteration = 114

Training on fold [3/5]

bestTest = 0.4296790334
bestIteration = 118

Training on fold [4/5]


[32m[I 2022-04-18 16:12:10,423][0m Trial 1 finished with value: 0.8672804445097292 and parameters: {'iterations': 200, 'learning_rate': 0.1464067066361795, 'random_strength': 10, 'bagging_temperature': 3, 'max_bin': 10, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 1, 'max_depth': 3, 'l2_leaf_reg': 0.028402775147703313, 'one_hot_max_size': 500, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 1 with value: 0.8672804445097292.[0m



bestTest = 0.4137895192
bestIteration = 111

Training on fold [0/5]

bestTest = 0.4096545095
bestIteration = 85

Training on fold [1/5]

bestTest = 0.4254366023
bestIteration = 68

Training on fold [2/5]

bestTest = 0.4310905944
bestIteration = 60

Training on fold [3/5]

bestTest = 0.4294441756
bestIteration = 50

Training on fold [4/5]


[32m[I 2022-04-18 16:12:21,987][0m Trial 2 finished with value: 0.8661650028937672 and parameters: {'iterations': 200, 'learning_rate': 0.27287829596201946, 'random_strength': 8, 'bagging_temperature': 8, 'max_bin': 10, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 1, 'max_depth': 5, 'l2_leaf_reg': 0.027330135035255495, 'one_hot_max_size': 10, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 1 with value: 0.8672804445097292.[0m



bestTest = 0.4150526663
bestIteration = 63

Training on fold [0/5]

bestTest = 0.4099909609
bestIteration = 577

Training on fold [1/5]

bestTest = 0.4272606113
bestIteration = 386

Training on fold [2/5]

bestTest = 0.4346072047
bestIteration = 333

Training on fold [3/5]

bestTest = 0.4305748292
bestIteration = 575

Training on fold [4/5]


[32m[I 2022-04-18 16:12:52,071][0m Trial 3 finished with value: 0.8673197841507765 and parameters: {'iterations': 1200, 'learning_rate': 0.0603209284932487, 'random_strength': 3, 'bagging_temperature': 7, 'max_bin': 4, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 2, 'max_depth': 2, 'l2_leaf_reg': 1.300471404766049e-07, 'one_hot_max_size': 10, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 3 with value: 0.8673197841507765.[0m



bestTest = 0.4142502871
bestIteration = 510

Training on fold [0/5]

bestTest = 0.4087947421
bestIteration = 69

Training on fold [1/5]

bestTest = 0.4280484973
bestIteration = 84

Training on fold [2/5]

bestTest = 0.434562572
bestIteration = 69

Training on fold [3/5]

bestTest = 0.4332405498
bestIteration = 58

Training on fold [4/5]


[32m[I 2022-04-18 16:12:54,857][0m Trial 4 finished with value: 0.8660714672325674 and parameters: {'iterations': 300, 'learning_rate': 0.22423670437233847, 'random_strength': 6, 'bagging_temperature': 2, 'max_bin': 30, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 4, 'max_depth': 4, 'l2_leaf_reg': 0.00010293033487726667, 'one_hot_max_size': 500, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 3 with value: 0.8673197841507765.[0m



bestTest = 0.4145071682
bestIteration = 69

Training on fold [0/5]

bestTest = 0.4710178999
bestIteration = 99

Training on fold [1/5]

bestTest = 0.4795527112
bestIteration = 99

Training on fold [2/5]

bestTest = 0.4822684172
bestIteration = 99

Training on fold [3/5]

bestTest = 0.4825458663
bestIteration = 99

Training on fold [4/5]


[32m[I 2022-04-18 16:12:58,205][0m Trial 5 finished with value: 0.8567829510684252 and parameters: {'iterations': 100, 'learning_rate': 0.06628011038512191, 'random_strength': 4, 'bagging_temperature': 4, 'max_bin': 20, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 3, 'max_depth': 2, 'l2_leaf_reg': 13.751833235431702, 'one_hot_max_size': 100, 'auto_class_weights': 'Balanced'}. Best is trial 3 with value: 0.8673197841507765.[0m



bestTest = 0.4691944216
bestIteration = 99

Training on fold [0/5]

bestTest = 0.415679053
bestIteration = 103

Training on fold [1/5]

bestTest = 0.4318979034
bestIteration = 78

Training on fold [2/5]

bestTest = 0.4335034907
bestIteration = 108

Training on fold [3/5]

bestTest = 0.4350534163
bestIteration = 109

Training on fold [4/5]


[32m[I 2022-04-18 16:13:35,759][0m Trial 6 finished with value: 0.8643847226089525 and parameters: {'iterations': 1200, 'learning_rate': 0.09658215406978513, 'random_strength': 8, 'bagging_temperature': 2, 'max_bin': 30, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 6, 'max_depth': 10, 'l2_leaf_reg': 2.6558249848041764, 'one_hot_max_size': 5, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 3 with value: 0.8673197841507765.[0m



bestTest = 0.4178677513
bestIteration = 96

Training on fold [0/5]

bestTest = 0.4440354
bestIteration = 121

Training on fold [1/5]

bestTest = 0.4614264274
bestIteration = 83

Training on fold [2/5]

bestTest = 0.4650216548
bestIteration = 95

Training on fold [3/5]

bestTest = 0.4602119442
bestIteration = 112

Training on fold [4/5]


[32m[I 2022-04-18 16:13:39,007][0m Trial 7 finished with value: 0.8672250888022962 and parameters: {'iterations': 500, 'learning_rate': 0.2714096381817127, 'random_strength': 4, 'bagging_temperature': 6, 'max_bin': 8, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 1, 'max_depth': 2, 'l2_leaf_reg': 4.9369231964322795, 'one_hot_max_size': 1024, 'auto_class_weights': 'Balanced'}. Best is trial 3 with value: 0.8673197841507765.[0m



bestTest = 0.4456514123
bestIteration = 97

Training on fold [0/5]

bestTest = 0.4123532346
bestIteration = 104

Training on fold [1/5]

bestTest = 0.4271868471
bestIteration = 124

Training on fold [2/5]

bestTest = 0.4362159169
bestIteration = 75

Training on fold [3/5]

bestTest = 0.4300031643
bestIteration = 86

Training on fold [4/5]


[32m[I 2022-04-18 16:13:42,454][0m Trial 8 finished with value: 0.8659015639885024 and parameters: {'iterations': 1500, 'learning_rate': 0.2053434310118264, 'random_strength': 8, 'bagging_temperature': 0, 'max_bin': 20, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 10, 'max_depth': 3, 'l2_leaf_reg': 9.501510078266123e-06, 'one_hot_max_size': 500, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 3 with value: 0.8673197841507765.[0m



bestTest = 0.4162955827
bestIteration = 75

Training on fold [0/5]

bestTest = 0.4233132299
bestIteration = 31

Training on fold [1/5]

bestTest = 0.4344171624
bestIteration = 40

Training on fold [2/5]

bestTest = 0.4460072716
bestIteration = 33

Training on fold [3/5]

bestTest = 0.4471374849
bestIteration = 19

Training on fold [4/5]


[32m[I 2022-04-18 16:14:01,764][0m Trial 9 finished with value: 0.860100549476917 and parameters: {'iterations': 100, 'learning_rate': 0.25900665720714294, 'random_strength': 3, 'bagging_temperature': 0, 'max_bin': 6, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 8, 'max_depth': 7, 'l2_leaf_reg': 1.1694576328936887e-07, 'one_hot_max_size': 12, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 3 with value: 0.8673197841507765.[0m



bestTest = 0.4181117471
bestIteration = 55



In [37]:
# Report the number of trials and the winning configuration of the model-A study.
print(f"Number of finished trials: {len(study.trials)}")
print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}={value},")

Number of finished trials: 10
Best trial:
  Value: 0.8673197841507765
  Params: 
    iterations=1200,
    learning_rate=0.0603209284932487,
    random_strength=3,
    bagging_temperature=7,
    max_bin=4,
    grow_policy=SymmetricTree,
    min_data_in_leaf=2,
    max_depth=2,
    l2_leaf_reg=1.300471404766049e-07,
    one_hot_max_size=10,
    auto_class_weights=SqrtBalanced,


In [39]:
# Build the model-A classifier from the best trial's sampled hyper-parameters.
# trial.params holds only the values suggested by Optuna; the fixed entries of
# the objective's param dict are not forwarded here.
final_model = CatBoostClassifier(
    **trial.params,
    cat_features=categorical_A,
    verbose=False,
)

In [40]:
# Fit on the full A data, then score that same data: the AUC shown below is an
# in-sample figure, not a validation estimate.
final_model.fit(X_a, y_a)
# Column 1 of predict_proba is kept, reshaped to a single-column 2-D array.
predictions_h1 = final_model.predict_proba(X_a)[:, 1].reshape(-1, 1)
roc_auc_score(y_a, predictions_h1)

0.8814719809937959

In [41]:
# CatBoost Pool for model B: features, labels and the categorical columns in one object.
train_dataset_B = Pool(X_b, label=y_b, cat_features=categorical_B)

In [42]:
def objective_b(trial):
    """Optuna objective for model B: cross-validated AUC of one CatBoost configuration.

    Samples a hyper-parameter configuration from ``trial``, runs 5-fold CatBoost
    ``cv`` on the module-level ``train_dataset_B`` pool and scores the
    configuration by the best fold-averaged test AUC.

    Args:
        trial: the Optuna trial object used to sample the hyper-parameters.

    Returns:
        The maximum of the ``'test-AUC-mean'`` column of the cv result.
    """
    # The suggest_* calls are kept in their original order so a seeded sampler
    # produces the same sequence of configurations.
    param = {
        'iterations': trial.suggest_categorical('iterations', [100, 200, 300, 500, 1000, 1200, 1500]),
        'learning_rate': trial.suggest_float("learning_rate", 0.001, 0.3),
        'random_strength': trial.suggest_int("random_strength", 1, 10),
        'bagging_temperature': trial.suggest_int("bagging_temperature", 0, 10),
        'max_bin': trial.suggest_categorical('max_bin', [4, 5, 6, 8, 10, 20, 30]),
        'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
        'min_data_in_leaf': trial.suggest_int("min_data_in_leaf", 1, 10),
        # Fixed overfitting-detector settings.
        # NOTE(review): cv() below is also given early_stopping_rounds=10; check
        # which of the two patience values CatBoost actually applies.
        'od_type': "Iter",
        'od_wait': 100,
        # Registered with Optuna as "max_depth" (the key that appears in
        # trial.params) but passed to CatBoost here under the name "depth".
        "depth": trial.suggest_int("max_depth", 2, 10),
        # suggest_float(..., log=True) is the supported replacement for the
        # deprecated suggest_loguniform and samples the same distribution.
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-8, 100, log=True),
        'one_hot_max_size': trial.suggest_categorical('one_hot_max_size', [5, 10, 12, 100, 500, 1024]),
        # Logloss is optimised; AUC is only tracked so it can be returned below.
        'custom_metric': ['AUC'],
        "loss_function": "Logloss",
        'auto_class_weights': trial.suggest_categorical('auto_class_weights', ['Balanced', 'SqrtBalanced']),
    }

    scores = cv(
        train_dataset_B,
        param,
        fold_count=5,
        early_stopping_rounds=10,
        plot=False,
        verbose=False,
    )

    return scores['test-AUC-mean'].max()

In [43]:
# Seeded TPE sampler so repeated runs suggest the same hyper-parameter sequence.
sampler = optuna.samplers.TPESampler(seed=68)
# Higher cross-validated AUC is better, hence "maximize".
study_b = optuna.create_study(sampler=sampler, direction="maximize")
study_b.optimize(objective_b, n_trials=10)

[32m[I 2022-04-18 16:20:17,956][0m A new study created in memory with name: no-name-6f148739-8805-4a96-8491-ef4eda776c33[0m


Training on fold [0/5]

bestTest = 0.4677161393
bestIteration = 605

Training on fold [1/5]

bestTest = 0.4642576936
bestIteration = 554

Training on fold [2/5]

bestTest = 0.4751103054
bestIteration = 667

Training on fold [3/5]

bestTest = 0.458888983
bestIteration = 601

Training on fold [4/5]


[32m[I 2022-04-18 16:20:39,529][0m Trial 0 finished with value: 0.8604534084210627 and parameters: {'iterations': 1500, 'learning_rate': 0.029356482739949695, 'random_strength': 8, 'bagging_temperature': 10, 'max_bin': 6, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 1, 'max_depth': 4, 'l2_leaf_reg': 0.001991194871120998, 'one_hot_max_size': 100, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 0 with value: 0.8604534084210627.[0m



bestTest = 0.4666320162
bestIteration = 553

Training on fold [0/5]

bestTest = 0.4659458357
bestIteration = 161

Training on fold [1/5]

bestTest = 0.4655820674
bestIteration = 130

Training on fold [2/5]

bestTest = 0.4754752572
bestIteration = 180

Training on fold [3/5]

bestTest = 0.4575895913
bestIteration = 166

Training on fold [4/5]


[32m[I 2022-04-18 16:20:46,370][0m Trial 1 finished with value: 0.860835217064848 and parameters: {'iterations': 200, 'learning_rate': 0.1464067066361795, 'random_strength': 10, 'bagging_temperature': 3, 'max_bin': 10, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 1, 'max_depth': 3, 'l2_leaf_reg': 0.028402775147703313, 'one_hot_max_size': 500, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 1 with value: 0.860835217064848.[0m



bestTest = 0.4634000181
bestIteration = 162

Training on fold [0/5]

bestTest = 0.4709400283
bestIteration = 61

Training on fold [1/5]

bestTest = 0.4689458595
bestIteration = 52

Training on fold [2/5]

bestTest = 0.4749041811
bestIteration = 68

Training on fold [3/5]

bestTest = 0.4618705103
bestIteration = 75

Training on fold [4/5]


[32m[I 2022-04-18 16:20:59,826][0m Trial 2 finished with value: 0.8583071673691268 and parameters: {'iterations': 200, 'learning_rate': 0.27287829596201946, 'random_strength': 8, 'bagging_temperature': 8, 'max_bin': 10, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 1, 'max_depth': 5, 'l2_leaf_reg': 0.027330135035255495, 'one_hot_max_size': 10, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 1 with value: 0.860835217064848.[0m



bestTest = 0.4698336531
bestIteration = 60

Training on fold [0/5]

bestTest = 0.4675610197
bestIteration = 637

Training on fold [1/5]

bestTest = 0.4653707665
bestIteration = 505

Training on fold [2/5]

bestTest = 0.4818324616
bestIteration = 387

Training on fold [3/5]

bestTest = 0.4627337556
bestIteration = 461

Training on fold [4/5]


[32m[I 2022-04-18 16:21:40,455][0m Trial 3 finished with value: 0.8590007530998008 and parameters: {'iterations': 1200, 'learning_rate': 0.0603209284932487, 'random_strength': 3, 'bagging_temperature': 7, 'max_bin': 4, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 2, 'max_depth': 2, 'l2_leaf_reg': 1.300471404766049e-07, 'one_hot_max_size': 10, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 1 with value: 0.860835217064848.[0m



bestTest = 0.4666959527
bestIteration = 645

Training on fold [0/5]

bestTest = 0.4724766085
bestIteration = 62

Training on fold [1/5]

bestTest = 0.4671351426
bestIteration = 72

Training on fold [2/5]

bestTest = 0.477255272
bestIteration = 89

Training on fold [3/5]

bestTest = 0.4606889044
bestIteration = 84

Training on fold [4/5]


[32m[I 2022-04-18 16:21:44,555][0m Trial 4 finished with value: 0.8587186446963686 and parameters: {'iterations': 300, 'learning_rate': 0.22423670437233847, 'random_strength': 6, 'bagging_temperature': 2, 'max_bin': 30, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 4, 'max_depth': 4, 'l2_leaf_reg': 0.00010293033487726667, 'one_hot_max_size': 500, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 1 with value: 0.860835217064848.[0m



bestTest = 0.4666420163
bestIteration = 79

Training on fold [0/5]

bestTest = 0.486724039
bestIteration = 99

Training on fold [1/5]

bestTest = 0.4830672116
bestIteration = 99

Training on fold [2/5]

bestTest = 0.4953518733
bestIteration = 99

Training on fold [3/5]

bestTest = 0.4803112458
bestIteration = 99

Training on fold [4/5]


[32m[I 2022-04-18 16:21:48,471][0m Trial 5 finished with value: 0.8497013466129747 and parameters: {'iterations': 100, 'learning_rate': 0.06628011038512191, 'random_strength': 4, 'bagging_temperature': 4, 'max_bin': 20, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 3, 'max_depth': 2, 'l2_leaf_reg': 13.751833235431702, 'one_hot_max_size': 100, 'auto_class_weights': 'Balanced'}. Best is trial 1 with value: 0.860835217064848.[0m



bestTest = 0.4851971478
bestIteration = 99

Training on fold [0/5]

bestTest = 0.4702186194
bestIteration = 121

Training on fold [1/5]

bestTest = 0.4624518021
bestIteration = 131

Training on fold [2/5]

bestTest = 0.4756433536
bestIteration = 137

Training on fold [3/5]

bestTest = 0.4582297853
bestIteration = 140

Training on fold [4/5]


[32m[I 2022-04-18 16:22:46,962][0m Trial 6 finished with value: 0.8596549654016881 and parameters: {'iterations': 1200, 'learning_rate': 0.09658215406978513, 'random_strength': 8, 'bagging_temperature': 2, 'max_bin': 30, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 6, 'max_depth': 10, 'l2_leaf_reg': 2.6558249848041764, 'one_hot_max_size': 5, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 1 with value: 0.860835217064848.[0m



bestTest = 0.4698047729
bestIteration = 116

Training on fold [0/5]

bestTest = 0.4661159677
bestIteration = 170

Training on fold [1/5]

bestTest = 0.4639159999
bestIteration = 135

Training on fold [2/5]

bestTest = 0.4803234805
bestIteration = 98

Training on fold [3/5]

bestTest = 0.4587321033
bestIteration = 151

Training on fold [4/5]


[32m[I 2022-04-18 16:22:51,779][0m Trial 7 finished with value: 0.8599081685355614 and parameters: {'iterations': 500, 'learning_rate': 0.2714096381817127, 'random_strength': 4, 'bagging_temperature': 6, 'max_bin': 8, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 1, 'max_depth': 2, 'l2_leaf_reg': 4.9369231964322795, 'one_hot_max_size': 1024, 'auto_class_weights': 'Balanced'}. Best is trial 1 with value: 0.860835217064848.[0m



bestTest = 0.468796333
bestIteration = 100

Training on fold [0/5]

bestTest = 0.4707739679
bestIteration = 77

Training on fold [1/5]

bestTest = 0.4663297585
bestIteration = 84

Training on fold [2/5]

bestTest = 0.4755019722
bestIteration = 105

Training on fold [3/5]

bestTest = 0.4586111376
bestIteration = 99

Training on fold [4/5]


[32m[I 2022-04-18 16:22:55,590][0m Trial 8 finished with value: 0.8590965794027419 and parameters: {'iterations': 1500, 'learning_rate': 0.2053434310118264, 'random_strength': 8, 'bagging_temperature': 0, 'max_bin': 20, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 10, 'max_depth': 3, 'l2_leaf_reg': 9.501510078266123e-06, 'one_hot_max_size': 500, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 1 with value: 0.860835217064848.[0m



bestTest = 0.4692711366
bestIteration = 100

Training on fold [0/5]

bestTest = 0.484405731
bestIteration = 21

Training on fold [1/5]

bestTest = 0.4741623335
bestIteration = 34

Training on fold [2/5]

bestTest = 0.490215162
bestIteration = 28

Training on fold [3/5]

bestTest = 0.469051336
bestIteration = 41

Training on fold [4/5]


[32m[I 2022-04-18 16:23:14,833][0m Trial 9 finished with value: 0.8534060466836741 and parameters: {'iterations': 100, 'learning_rate': 0.25900665720714294, 'random_strength': 3, 'bagging_temperature': 0, 'max_bin': 6, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 8, 'max_depth': 7, 'l2_leaf_reg': 1.1694576328936887e-07, 'one_hot_max_size': 12, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 1 with value: 0.860835217064848.[0m



bestTest = 0.4743536141
bestIteration = 39



In [45]:
# Report the number of trials and the winning configuration of the model-B study.
print(f"Number of finished trials: {len(study_b.trials)}")
print("Best trial:")
trial_b = study_b.best_trial
print(f"  Value: {trial_b.value}")
print("  Params: ")
for key, value in trial_b.params.items():
    print(f"    {key}={value},")

Number of finished trials: 10
Best trial:
  Value: 0.860835217064848
  Params: 
    iterations=200,
    learning_rate=0.1464067066361795,
    random_strength=10,
    bagging_temperature=3,
    max_bin=10,
    grow_policy=Depthwise,
    min_data_in_leaf=1,
    max_depth=3,
    l2_leaf_reg=0.028402775147703313,
    one_hot_max_size=500,
    auto_class_weights=SqrtBalanced,


In [46]:
# Build the model-B classifier from the best trial's sampled hyper-parameters.
# trial_b.params holds only the values suggested by Optuna; the fixed entries of
# the objective's param dict are not forwarded here.
final_model_b = CatBoostClassifier(
    **trial_b.params,
    cat_features=categorical_B,
    verbose=False,
)

In [47]:
# Fit on the full B data, then score that same data: the AUC shown below is an
# in-sample figure, not a validation estimate.
final_model_b.fit(X_b, y_b)
# Column 1 of predict_proba is kept, reshaped to a single-column 2-D array.
predictions_se = final_model_b.predict_proba(X_b)[:, 1].reshape(-1, 1)
roc_auc_score(y_b, predictions_se)

0.8770142521380235

In [59]:
# Score each test matrix with the model fitted on the matching feature set.
# final_model was fitted on X_a (its training predictions are predictions_h1) and
# final_model_b on X_b (predictions_se); the original cell crossed them, applying
# final_model_b to test_A and final_model to test_B.
# NOTE(review): assumes test_A / test_B were prepared like X_a / X_b respectively — confirm.
final_se = final_model_b.predict_proba(test_B)
# Keep column 1 of predict_proba as a single-column 2-D array.
final_se = final_se[:,1].reshape(-1,1)

final_h1 = final_model.predict_proba(test_A)
final_h1 = final_h1[:,1].reshape(-1,1)

In [66]:
# Load the submission template with respondent_id as the index.
submission_df = pd.read_csv("./submission_format.csv", index_col="respondent_id")

In [68]:
# Save predictions to submission data frame
# final_h1 / final_se are NumPy arrays, so the values are written by row position,
# not aligned on respondent_id.
# NOTE(review): assumes the scored test rows are in the same order as submission_format.csv — confirm
submission_df["h1n1_vaccine"] = final_h1
submission_df["seasonal_vaccine"] = final_se

submission_df.head()

Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
26707,0.194324,0.292885
26708,0.050537,0.036695
26709,0.349791,0.732783
26710,0.654924,0.830654
26711,0.545811,0.507132


In [72]:
# Write the Optuna-tuned submission; the respondent_id index is kept as the first column.
submission_df.to_csv('output_catboost_optuna.csv', index=True)