In [716]:
import numpy as np
import pandas as pd
# Show every column when displaying wide frames. Use the public pd.set_option
# entry point rather than the incidental `pd.pandas` self-reference.
pd.set_option('display.max_columns', None)
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score, roc_curve

In [717]:
# Load the training features, using respondent_id as the index.
features = pd.read_csv(
    filepath_or_buffer='data/training_set_features.csv',
    index_col='respondent_id',
)

In [718]:
# Load the training labels, using respondent_id as the index.
label = pd.read_csv(
    filepath_or_buffer='data/training_set_labels.csv',
    index_col='respondent_id',
)

In [719]:
# Inner-join features with labels on the shared respondent_id index level,
# then preview the first rows of the combined frame.
df = pd.merge(features, label, how='inner', on='respondent_id')

df.head(5)

Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,doctor_recc_seasonal,chronic_med_condition,child_under_6_months,health_worker,health_insurance,opinion_h1n1_vacc_effective,opinion_h1n1_risk,opinion_h1n1_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,age_group,education,race,sex,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,1.0,2.0,2.0,1.0,2.0,55 - 64 Years,< 12 Years,White,Female,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,,0,0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,5.0,4.0,4.0,4.0,2.0,4.0,35 - 44 Years,12 Years,White,Male,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe,0,1
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,,1.0,0.0,0.0,,3.0,1.0,1.0,4.0,1.0,2.0,18 - 34 Years,College Graduate,White,Male,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo,0,0
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,,3.0,3.0,5.0,5.0,4.0,1.0,65+ Years,12 Years,White,Female,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,,0,1
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,3.0,3.0,2.0,3.0,1.0,4.0,45 - 54 Years,Some College,White,Female,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb,0,0


In [720]:
def df_summary(df):
    """Print the shape of ``df`` and return a per-column summary table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` (in column order) with the fields
        ``Name``, ``dtypes``, ``Uniques`` (distinct non-null values),
        ``Missing Count``, ``Missing Percentage`` (string formatted as
        ``"x.xx%"``) and ``Entropy`` (base-2 entropy of the column's
        non-null value frequencies, rounded to two decimals).
    """
    print(f"Dataset Shape: {df.shape}")

    # Count nulls once and reuse the result for both missing-value columns.
    missing_count = df.isnull().sum()
    missing_share = missing_count / len(df)

    # Walk the columns by position rather than by label: each column is then
    # always a single Series (even when labels repeat) and is visited exactly
    # once, instead of re-scanning the summary with a boolean mask per name.
    entropy = [
        round(float(stats.entropy(df.iloc[:, pos].value_counts(normalize=True), base=2)), 2)
        for pos in range(df.shape[1])
    ]

    # Building the table in one go keeps the 'Entropy' column present even
    # when ``df`` has no columns at all.
    summary = pd.DataFrame({
        'Name': list(df.columns),
        'dtypes': df.dtypes.values,
        'Uniques': df.nunique().values,
        'Missing Count': missing_count.values,
        'Missing Percentage': missing_share.map("{:.2%}".format).values,
        'Entropy': pd.Series(entropy, dtype='float64'),
    })

    return summary

In [721]:
# Per-column overview (dtype, cardinality, missingness, entropy) of the merged frame.
df_summary(df=df)

Dataset Shape: (26707, 37)


Unnamed: 0,Name,dtypes,Uniques,Missing Count,Missing Percentage,Entropy
0,h1n1_concern,float64,4,92,0.34%,1.86
1,h1n1_knowledge,float64,3,116,0.43%,1.33
2,behavioral_antiviral_meds,float64,2,71,0.27%,0.28
3,behavioral_avoidance,float64,2,208,0.78%,0.85
4,behavioral_face_mask,float64,2,19,0.07%,0.36
5,behavioral_wash_hands,float64,2,42,0.16%,0.67
6,behavioral_large_gatherings,float64,2,87,0.33%,0.94
7,behavioral_outside_home,float64,2,82,0.31%,0.92
8,behavioral_touch_face,float64,2,128,0.48%,0.91
9,doctor_recc_h1n1,float64,2,2160,8.09%,0.76


In [722]:
# Numeric predictors: every non-target column whose dtype pandas treats as numeric.
# Comparing the dtype against 'O' would label string- or category-typed columns
# as numeric (e.g. when read_csv infers the dedicated string dtype), which the
# median imputer cannot handle, so ask pandas for the dtype family directly.
X_num_cols = [
    var for var in df.columns
    if var not in ('h1n1_vaccine', 'seasonal_vaccine')
    and pd.api.types.is_numeric_dtype(df[var])
]

In [723]:
# Categorical predictors: every non-target column that is not numeric.
# Testing for "not numeric" instead of dtype == 'O' also catches columns stored
# with the dedicated string dtype or as category, which an object-dtype check
# would silently leave out of the one-hot branch.
X_cat_cols = [
    var for var in df.columns
    if var not in ('h1n1_vaccine', 'seasonal_vaccine')
    and not pd.api.types.is_numeric_dtype(df[var])
]

In [724]:
# Predictor matrix: all columns except the two vaccine targets, in their original order.
X_cols = df.columns[~df.columns.isin(['h1n1_vaccine', 'seasonal_vaccine'])].tolist()
X = df.loc[:, X_cols]

In [725]:
# Target frame: the two vaccine columns, modelled jointly further down.
y_cols = ['h1n1_vaccine', 'seasonal_vaccine']
y = df.loc[:, y_cols]

In [726]:
# Categorical features

In [727]:
# Building blocks for the categorical branch: one-hot encoding that tolerates
# categories unseen at fit time, and an imputer that fills gaps with the
# literal string 'missing'.
ohe = OneHotEncoder(handle_unknown='ignore')
imp_cat = SimpleImputer(fill_value='missing', strategy='constant')

In [728]:
# Categorical branch: impute first, then encode.
pipe_cat = make_pipeline(
    imp_cat,
    ohe,
)

In [729]:
# Numeric features

In [730]:
# Numeric branch: replace missing values with the column median.
imp_num = SimpleImputer(
    strategy="median",
)

In [731]:
# Route each column group to its preprocessing branch. The branch order is
# kept as-is because it determines the auto-generated step names
# ('pipeline', 'simpleimputer') that the grid-search keys refer to.
categorical_branch = (pipe_cat, X_cat_cols)
numeric_branch = (imp_num, X_num_cols)
ct = make_column_transformer(categorical_branch, numeric_branch, remainder='passthrough')

In [732]:
# Base classifier; penalty and C are the values the grid search varies later.
estimator = LogisticRegression(
    penalty='l1',
    C=1,
    solver='liblinear',
    max_iter=1000,
    random_state=123,
)

In [733]:
# Wrap the base classifier so it can be fitted against both target columns.
estimators = MultiOutputClassifier(estimator=estimator)

In [734]:
# Full model: column-wise preprocessing followed by the multi-output classifier.
pipe = make_pipeline(
    ct,
    estimators,
)

In [735]:
# Grid Search

In [736]:
# Search space, keyed by the nested step names of the pipeline.
params = {
    # Toggle missing-value indicator columns for the categorical imputer ...
    'columntransformer__pipeline__simpleimputer__add_indicator': [False, True],
    # ... and for the numeric imputer.
    'columntransformer__simpleimputer__add_indicator': [False, True],
    # Regularisation type and strength of the wrapped logistic regression.
    'multioutputclassifier__estimator__penalty': ['l1', 'l2'],
    'multioutputclassifier__estimator__C': [0.1, 1, 10],
}

In [737]:
# 5-fold grid search scored by ROC AUC. fit() returns the fitted search
# object, so construction and fitting are chained into one assignment.
grid = GridSearchCV(estimator=pipe, param_grid=params, scoring='roc_auc', cv=5).fit(X, y)

In [738]:
# Tabulate the cross-validation results and show the ten best-ranked candidates.
results = pd.DataFrame(data=grid.cv_results_)
results.sort_values(by='rank_test_score', ascending=True).iloc[:10]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_columntransformer__pipeline__simpleimputer__add_indicator,param_columntransformer__simpleimputer__add_indicator,param_multioutputclassifier__estimator__C,param_multioutputclassifier__estimator__penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
19,0.597695,0.014894,0.028517,0.000839,True,True,0.1,l2,{'columntransformer__pipeline__simpleimputer__...,0.860026,0.856462,0.858788,0.864205,0.854192,0.858735,0.003388,1
20,27.283355,1.632368,0.029616,0.001858,True,True,1.0,l1,{'columntransformer__pipeline__simpleimputer__...,0.859843,0.856264,0.858411,0.86446,0.85448,0.858692,0.003415,2
8,26.785982,1.646975,0.021908,0.000585,False,True,1.0,l1,{'columntransformer__pipeline__simpleimputer__...,0.859871,0.856224,0.858496,0.864414,0.85445,0.858691,0.003413,3
7,0.428069,0.023896,0.021107,0.000735,False,True,0.1,l2,{'columntransformer__pipeline__simpleimputer__...,0.859999,0.856441,0.858808,0.864096,0.854084,0.858686,0.003383,4
18,5.28706,0.359087,0.029414,0.000667,True,True,0.1,l1,{'columntransformer__pipeline__simpleimputer__...,0.86069,0.856575,0.858543,0.864033,0.853575,0.858683,0.003555,5
6,4.735975,0.349939,0.022109,0.000736,False,True,0.1,l1,{'columntransformer__pipeline__simpleimputer__...,0.860709,0.856445,0.85867,0.863976,0.853472,0.858654,0.003586,6
9,0.646071,0.044584,0.02171,0.001126,False,True,1.0,l2,{'columntransformer__pipeline__simpleimputer__...,0.859482,0.856099,0.858405,0.864329,0.854311,0.858525,0.003414,7
21,0.805486,0.091786,0.02881,0.000677,True,True,1.0,l2,{'columntransformer__pipeline__simpleimputer__...,0.85948,0.856085,0.858392,0.864334,0.854313,0.858521,0.003417,8
22,8.869996,0.719344,0.029509,0.001582,True,True,10.0,l1,{'columntransformer__pipeline__simpleimputer__...,0.859263,0.855935,0.858303,0.864239,0.854285,0.858405,0.003402,9
10,8.326231,1.964012,0.02141,0.000583,False,True,10.0,l1,{'columntransformer__pipeline__simpleimputer__...,0.859248,0.855938,0.858302,0.864234,0.854287,0.858402,0.003398,10


In [739]:
# Report the winning candidate: its mean cross-validated score, then the
# hyper-parameter combination that produced it.
print(f"Grid best score: {grid.best_score_}")
print('')
# This line prints best_params_, so label it as parameters, not as a score.
print(f"Grid best params: {grid.best_params_}")

Grid best score: 0.8587347412440621

Grid best score: {'columntransformer__pipeline__simpleimputer__add_indicator': True, 'columntransformer__simpleimputer__add_indicator': True, 'multioutputclassifier__estimator__C': 0.1, 'multioutputclassifier__estimator__penalty': 'l2'}


In [740]:
# Load the held-out test features, using respondent_id as the index.
test_features_df = pd.read_csv(
    filepath_or_buffer='data/test_set_features.csv',
    index_col='respondent_id',
)

In [741]:
# Class probabilities for the test rows; the result is indexed per target
# further down (test_probas[0], test_probas[1]).
test_probas = grid.predict_proba(X=test_features_df)

In [742]:
# Load the submission template, using respondent_id as the index.
submission_df = pd.read_csv(
    filepath_or_buffer='data/submission_format.csv',
    index_col='respondent_id',
)

In [743]:
# Preview the template before it is filled with predictions.
submission_df.head(n=5)

Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
26707,0.5,0.7
26708,0.5,0.7
26709,0.5,0.7
26710,0.5,0.7
26711,0.5,0.7


In [744]:
# The predictions are written positionally, so the test features and the
# submission template must list respondents in exactly the same order.
np.testing.assert_array_equal(test_features_df.index.values, submission_df.index.values)

# Copy column 1 of each per-target probability array into the matching
# submission column (assumes column 1 is the positive class - TODO confirm
# against the fitted estimators' classes_).
for position, target in enumerate(("h1n1_vaccine", "seasonal_vaccine")):
    submission_df[target] = test_probas[position][:, 1]

submission_df.head()

Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
26707,0.10298,0.34386
26708,0.039007,0.045409
26709,0.277269,0.558301
26710,0.55529,0.888253
26711,0.265894,0.532675


In [745]:
# Write the filled-in submission, keeping respondent_id as the first column.
submission_df.to_csv(
    path_or_buf='my_submission.csv',
    index=True,
)