In [1]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer
from xgboost import XGBClassifier



In [2]:
# Load the training features; respondent_id becomes the row index.
training_features = pd.read_csv('./Data/training_set_features.csv', index_col = 'respondent_id')

In [None]:
# Preview the first rows of the raw training features.
training_features.head()

In [None]:
# Column dtypes and non-null counts, i.e. which features contain missing values.
training_features.info()

In [3]:
def one_hot_df_column(df_column, df):
    """Replace one column of ``df`` with its one-hot (indicator) encoding.

    Missing values (NaN) are turned into their own ``'Missing'`` category and
    every value is cast to ``str`` before encoding, so numeric codes produce
    labels such as ``<name>_1.0``.

    Parameters
    ----------
    df_column : pandas.Series
        The column to encode. Its ``name`` is dropped from ``df`` and used as
        the prefix of the generated column labels.
    df : pandas.DataFrame
        Frame that contains ``df_column``.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the original column, with one indicator column per
        category appended on the right.

    Notes
    -----
    The indicator frame is created with a default ``0..n-1`` index and joined
    with ``pd.concat(axis=1)``, which aligns rows on index labels. If ``df``
    has any other index the rows do not line up and extra NaN-padded rows are
    produced, so pass a frame whose index is ``0..n-1`` (e.g. after
    ``reset_index(drop=True)``).
    """
    feature_name = [df_column.name]
    remaining = df.drop(columns = feature_name)
    # NaN becomes an explicit category instead of being dropped by the encoder.
    categories = df_column.replace(np.nan, 'Missing').astype('str')
    enc = OneHotEncoder(handle_unknown = 'ignore')
    feature_arr = enc.fit_transform(categories.values.reshape(-1,1)).toarray()
    # get_feature_names was removed in scikit-learn 1.2; prefer the
    # replacement and fall back to the old name on older installations.
    if hasattr(enc, 'get_feature_names_out'):
        feature_labels = enc.get_feature_names_out(feature_name)
    else:
        feature_labels = enc.get_feature_names(feature_name)
    one_hot = pd.DataFrame(feature_arr, columns = feature_labels)
    return pd.concat([remaining, one_hot], axis = 1)
    



In [4]:
# Columns that are replaced by one-hot indicator columns.
# The order here is the order in which the generated columns are appended.
one_hot_columns = [
    'h1n1_concern',
    'h1n1_knowledge',
    'opinion_h1n1_vacc_effective',
    'opinion_h1n1_risk',
    'opinion_h1n1_sick_from_vacc',
    'opinion_seas_vacc_effective',
    'opinion_seas_risk',
    'opinion_seas_sick_from_vacc',
    'age_group',
    'education',
    'race',
    'sex',
    'income_poverty',
    'marital_status',
    'rent_or_own',
    'employment_status',
    'census_msa',
    'employment_industry',
    'employment_occupation',
    'hhs_geo_region',
]

In [5]:
# Work on a copy so the raw training_features frame stays untouched.
final_training_df = training_features.copy()
# Encode one column per pass; each call returns a new frame in which that
# column has been replaced by its indicator columns.
for column in one_hot_columns:
    final_training_df = one_hot_df_column(final_training_df[column], final_training_df)
    

In [6]:
# Preview the encoded training frame.
final_training_df.head()

Unnamed: 0,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,doctor_recc_seasonal,chronic_med_condition,...,hhs_geo_region_atmpeygn,hhs_geo_region_bhuqouqj,hhs_geo_region_dqpwygqj,hhs_geo_region_fpwskwrf,hhs_geo_region_kbazzjca,hhs_geo_region_lrircsnp,hhs_geo_region_lzgpxyit,hhs_geo_region_mlyzmhmf,hhs_geo_region_oxchjgsf,hhs_geo_region_qufhixun
0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
# List every column label produced by the encoding step.
list(final_training_df.columns)

In [7]:
# Rename the generated columns whose labels contain '<', '>' or ','.
# NOTE(review): presumably required because XGBoost rejects feature names
# containing '[', ']' or '<' - confirm.
final_training_df = final_training_df.rename(
    columns = {
        'education_< 12 Years': 'education_less_than_12_years',
        'income_poverty_<= $75,000, Above Poverty': 'income_poverty_Above_Poverty',
        'income_poverty_> $75,000': 'income_poverty_greater_than_$75000',
        'census_msa_MSA, Not Principle  City': 'census_msa_MSA_Not Principle  City',
        'census_msa_MSA, Principle City': 'census_msa_MSA_ Principle City',
    }
)

In [8]:
# Load the training labels, indexed by respondent_id like the features.
training_labels = pd.read_csv('./Data/training_set_labels.csv', index_col = 'respondent_id')

In [None]:
# Preview the label columns.
training_labels.head()

In [129]:
# One XGBoost binary classifier per label column, wrapped so both targets are fitted together.
# NOTE(review): scale_pos_weight below 1 down-weights the positive class, and the
# same value is applied to both labels - confirm this is intended.
clf = OneVsRestClassifier(XGBClassifier(objective='binary:logistic', max_depth = 4, scale_pos_weight = 0.9))

In [10]:
# Feature matrix: a copy of the encoded and renamed training frame.
X = final_training_df.copy()

In [21]:
# Show the feature names the model is trained on.
list(X.columns)

['behavioral_antiviral_meds',
 'behavioral_avoidance',
 'behavioral_face_mask',
 'behavioral_wash_hands',
 'behavioral_large_gatherings',
 'behavioral_outside_home',
 'behavioral_touch_face',
 'doctor_recc_h1n1',
 'doctor_recc_seasonal',
 'chronic_med_condition',
 'child_under_6_months',
 'health_worker',
 'health_insurance',
 'household_adults',
 'household_children',
 'h1n1_concern_0.0',
 'h1n1_concern_1.0',
 'h1n1_concern_2.0',
 'h1n1_concern_3.0',
 'h1n1_concern_Missing',
 'h1n1_knowledge_0.0',
 'h1n1_knowledge_1.0',
 'h1n1_knowledge_2.0',
 'h1n1_knowledge_Missing',
 'opinion_h1n1_vacc_effective_1.0',
 'opinion_h1n1_vacc_effective_2.0',
 'opinion_h1n1_vacc_effective_3.0',
 'opinion_h1n1_vacc_effective_4.0',
 'opinion_h1n1_vacc_effective_5.0',
 'opinion_h1n1_vacc_effective_Missing',
 'opinion_h1n1_risk_1.0',
 'opinion_h1n1_risk_2.0',
 'opinion_h1n1_risk_3.0',
 'opinion_h1n1_risk_4.0',
 'opinion_h1n1_risk_5.0',
 'opinion_h1n1_risk_Missing',
 'opinion_h1n1_sick_from_vacc_1.0',
 'opini

In [11]:
# Target matrix: a copy of the label frame (one column per vaccine).
y = training_labels.copy()

In [None]:
# Class balance of the first target.
y['h1n1_vaccine'].value_counts()


In [None]:
# Class balance of the second target.
y['seasonal_vaccine'].value_counts()

In [28]:
# Display the label frame.
y

Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,0
1,0,1
2,0,0
3,0,1
4,0,0
...,...,...
26702,0,0
26703,0,0
26704,0,1
26705,0,0


In [None]:
# Hold out 30% of the labelled rows; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [None]:
#list(X.columns)

In [114]:
# Fit on ALL labelled rows (X, not X_train).
# NOTE: X_test is a subset of X, so this fitted clf has already seen the held-out rows.
clf.fit(X,y)

OneVsRestClassifier(estimator=XGBClassifier(base_score=None, booster=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None, gamma=None,
                                            gpu_id=None, importance_type='gain',
                                            interaction_constraints=None,
                                            learning_rate=None,
                                            max_delta_step=None, max_depth=4,
                                            min_child_weight=None, missing=nan,
                                            monotone_constraints=None,
                                            n_estimators=100, n_jobs=None,
                                            num_parallel_tree=None,
                                            objective='binary:logistic',
                                            ran

In [None]:
y_pred = clf.predict_proba(X_test)

In [None]:
clf.score(X_test, y_test)

In [None]:
roc_auc_score(y_test, y_pred)

In [None]:
# Inspect the predicted probabilities for the held-out rows.
y_pred

In [115]:
# Load the unlabelled test features, indexed by respondent_id.
actual_test_features = pd.read_csv('./Data/test_set_features.csv', index_col = 'respondent_id')

In [116]:
# Display the raw test features.
actual_test_features

Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26707,2.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,"> $75,000",Not Married,Rent,Employed,mlyzmhmf,"MSA, Not Principle City",1.0,0.0,atmlpfrs,hfxkjkmi
26708,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,Non-MSA,3.0,0.0,atmlpfrs,xqwwgdyp
26709,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,...,"> $75,000",Married,Own,Employed,lrircsnp,Non-MSA,1.0,0.0,nduyfdeo,pvmttkik
26710,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,"<= $75,000, Above Poverty",Married,Own,Not in Labor Force,lrircsnp,"MSA, Not Principle City",1.0,0.0,,
26711,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,lzgpxyit,Non-MSA,0.0,1.0,fcxhlnwr,mxkfnird
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53410,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,,,,,dqpwygqj,"MSA, Principle City",1.0,1.0,,
53411,3.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,Below Poverty,Married,Rent,Employed,qufhixun,Non-MSA,1.0,3.0,fcxhlnwr,vlluhbov
53412,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,qufhixun,"MSA, Not Principle City",1.0,0.0,,
53413,3.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,"<= $75,000, Above Poverty",Married,Own,Not in Labor Force,bhuqouqj,"MSA, Not Principle City",1.0,0.0,,


In [117]:
final_test_df = actual_test_features.copy()
for column in one_hot_columns:
    final_test_df = one_hot_df_column(final_test_df[column], final_test_df)
    

In [118]:
final_test_df = final_test_df.rename(columns = {'education_< 12 Years': 'education_less_than_12_years', 'income_poverty_<= $75,000, Above Poverty': 'income_poverty_Above_Poverty', 'income_poverty_> $75,000': 'income_poverty_greater_than_$75000', 'census_msa_MSA, Not Principle  City': 'census_msa_MSA_Not Principle  City', 'census_msa_MSA, Principle City': 'census_msa_MSA_ Principle City'})

In [119]:
final_test_df = final_test_df.drop(columns = ['age_group_Missing', 'census_msa_Missing', 'hhs_geo_region_Missing', 'race_Missing', 'sex_Missing'])

In [120]:
final_test_df = final_test_df[26707:]

In [121]:
# Display the prepared test features.
final_test_df

Unnamed: 0,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,doctor_recc_seasonal,chronic_med_condition,...,hhs_geo_region_atmpeygn,hhs_geo_region_bhuqouqj,hhs_geo_region_dqpwygqj,hhs_geo_region_fpwskwrf,hhs_geo_region_kbazzjca,hhs_geo_region_lrircsnp,hhs_geo_region_lzgpxyit,hhs_geo_region_mlyzmhmf,hhs_geo_region_oxchjgsf,hhs_geo_region_qufhixun
26707,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
26708,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26709,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
26710,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
26711,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53410,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53411,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
53412,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
53413,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [122]:
# Keep the test respondent ids so the predictions can be labelled with them.
id_range = final_test_df.index

In [123]:
# Inspect the ids that will index the submission.
id_range

Int64Index([26707, 26708, 26709, 26710, 26711, 26712, 26713, 26714, 26715,
            26716,
            ...
            53405, 53406, 53407, 53408, 53409, 53410, 53411, 53412, 53413,
            53414],
           dtype='int64', length=26708)

In [124]:
# Predicted probability for each label on the prepared test features.
final_y_pred = clf.predict_proba(final_test_df)


In [125]:
# Label the probability columns with the target names (same order as the
# columns of y) and index the rows by respondent id.
submission_df = pd.DataFrame(final_y_pred, columns = ['h1n1_vaccine', 'seasonal_vaccine'], index = id_range  )

In [126]:
# Preview the submission rows.
submission_df.head()

Unnamed: 0,h1n1_vaccine,seasonal_vaccine
26707,0.066522,0.187841
26708,0.04279,0.024233
26709,0.111087,0.652013
26710,0.654124,0.82425
26711,0.239596,0.314912


In [127]:
# Name the index so it is written as the 'respondent_id' column header.
submission_df.index.name = 'respondent_id'

In [128]:
# Write the submission file (index included as the first column).
submission_df.to_csv('./Data/submission.csv')