In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

In [2]:
df_train = pd.read_csv('data/training_set_features.csv')
df_test = pd.read_csv('data/test_set_features.csv')
y_train = pd.read_csv('data/training_set_labels.csv')

In [3]:
df_train.dtypes

respondent_id                    int64
h1n1_concern                   float64
h1n1_knowledge                 float64
behavioral_antiviral_meds      float64
behavioral_avoidance           float64
behavioral_face_mask           float64
behavioral_wash_hands          float64
behavioral_large_gatherings    float64
behavioral_outside_home        float64
behavioral_touch_face          float64
doctor_recc_h1n1               float64
doctor_recc_seasonal           float64
chronic_med_condition          float64
child_under_6_months           float64
health_worker                  float64
health_insurance               float64
opinion_h1n1_vacc_effective    float64
opinion_h1n1_risk              float64
opinion_h1n1_sick_from_vacc    float64
opinion_seas_vacc_effective    float64
opinion_seas_risk              float64
opinion_seas_sick_from_vacc    float64
age_group                       object
education                       object
race                            object
sex                      

In [4]:
print(y_train.h1n1_vaccine.value_counts(normalize=True))
print(y_train.seasonal_vaccine.value_counts(normalize=True))

0    0.787546
1    0.212454
Name: h1n1_vaccine, dtype: float64
0    0.534392
1    0.465608
Name: seasonal_vaccine, dtype: float64


In [14]:
from sklearn.metrics import (
    roc_auc_score,
    recall_score,
    f1_score,
    matthews_corrcoef
)

from sklearn.model_selection import cross_val_score

### Preprocessing

In [15]:
df_train.isnull().sum()

respondent_id                      0
h1n1_concern                      92
h1n1_knowledge                   116
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_h1n1                2160
doctor_recc_seasonal            2160
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
health_insurance               12274
opinion_h1n1_vacc_effective      391
opinion_h1n1_risk                388
opinion_h1n1_sick_from_vacc      395
opinion_seas_vacc_effective      462
opinion_seas_risk                514
opinion_seas_sick_from_vacc      537
age_group                          0
education                       1407
race                               0
sex                                0
income_poverty                  4423
m

In [16]:
df_test.isnull().sum()

respondent_id                      0
h1n1_concern                      85
h1n1_knowledge                   122
behavioral_antiviral_meds         79
behavioral_avoidance             213
behavioral_face_mask              19
behavioral_wash_hands             40
behavioral_large_gatherings       72
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_h1n1                2160
doctor_recc_seasonal            2160
chronic_med_condition            932
child_under_6_months             813
health_worker                    789
health_insurance               12228
opinion_h1n1_vacc_effective      398
opinion_h1n1_risk                380
opinion_h1n1_sick_from_vacc      375
opinion_seas_vacc_effective      452
opinion_seas_risk                499
opinion_seas_sick_from_vacc      521
age_group                          0
education                       1407
race                               0
sex                                0
income_poverty                  4497
m

In [18]:
fill_with_unknown = ["education","income_poverty","employment_industry","employment_occupation","employment_status","marital_status","rent_or_own"]
                

for col in fill_with_unknown:
    df_train[col] = df_train[col].fillna(value="Unknown")
    df_test[col] = df_test[col].fillna(value="Unknown")

In [19]:
ages = {"18 - 34 Years" :0 ,
                     "35 - 44 Years" : 1,
                     "45 - 54 Years" : 2,
                     "55 - 64 Years": 3,
                     "65+ Years" : 4}
    
education = {"< 12 Years" :1 ,
                     "12 Years" :2,
                     "Some College" : 3,
                     "College Graduate": 4,
                     "Unknown" : 0}

income_poverty =  {"Unknown" :0,
                    "Below Poverty" :1 ,
                     "<= $75,000" :2,
                     "Above Poverty" : 3,
                     "$75,000": 4 }


df_train['age_group'] = df_train.age_group.map(ages)
df_test['age_group'] = df_test.age_group.map(ages)

df_train['education'] = df_train.education.map(education)
df_test['education'] = df_test.education.map(education)

df_train['income_poverty'] = df_train.income_poverty.map(income_poverty)
df_test['income_poverty'] = df_test.income_poverty.map(income_poverty)

In [20]:
def myfillna(series):
    if series.dtype is pd.np.dtype(float):
        return series.fillna(series.mean())
    elif series.dtype is pd.np.dtype(object):
        return series.fillna(series.mode())
    else:
        return series

In [21]:
df_train = df_train.apply(myfillna)
df_test = df_test.apply(myfillna)

  if series.dtype is pd.np.dtype(float):
  elif series.dtype is pd.np.dtype(object):
  if series.dtype is pd.np.dtype(float):
  elif series.dtype is pd.np.dtype(object):
  if series.dtype is pd.np.dtype(float):
  elif series.dtype is pd.np.dtype(object):
  if series.dtype is pd.np.dtype(float):
  elif series.dtype is pd.np.dtype(object):
  if series.dtype is pd.np.dtype(float):
  elif series.dtype is pd.np.dtype(object):
  if series.dtype is pd.np.dtype(float):
  elif series.dtype is pd.np.dtype(object):
  if series.dtype is pd.np.dtype(float):
  elif series.dtype is pd.np.dtype(object):
  if series.dtype is pd.np.dtype(float):
  elif series.dtype is pd.np.dtype(object):
  if series.dtype is pd.np.dtype(float):
  elif series.dtype is pd.np.dtype(object):
  if series.dtype is pd.np.dtype(float):
  elif series.dtype is pd.np.dtype(object):
  if series.dtype is pd.np.dtype(float):
  elif series.dtype is pd.np.dtype(object):
  if series.dtype is pd.np.dtype(float):
  elif series.dtype is p

In [23]:
from category_encoders import CountEncoder

In [24]:
df_train.select_dtypes('object')

Unnamed: 0,race,sex,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,employment_industry,employment_occupation
0,White,Female,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,Unknown,Unknown
1,White,Male,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",pxcmvdjn,xgwztkwe
2,White,Male,Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",rucpziij,xtkaffoo
3,White,Female,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",Unknown,Unknown
4,White,Female,Married,Own,Employed,qufhixun,"MSA, Not Principle City",wxleyezf,emcorrxb
...,...,...,...,...,...,...,...,...,...
26702,White,Female,Not Married,Own,Not in Labor Force,qufhixun,Non-MSA,Unknown,Unknown
26703,White,Male,Not Married,Rent,Employed,lzgpxyit,"MSA, Principle City",fcxhlnwr,cmhcxjea
26704,White,Female,Not Married,Own,Unknown,lzgpxyit,"MSA, Not Principle City",Unknown,Unknown
26705,Hispanic,Female,Married,Rent,Employed,lrircsnp,Non-MSA,fcxhlnwr,haliazsg


In [25]:
df_train.isnull().sum()

respondent_id                  0
h1n1_concern                   0
h1n1_knowledge                 0
behavioral_antiviral_meds      0
behavioral_avoidance           0
behavioral_face_mask           0
behavioral_wash_hands          0
behavioral_large_gatherings    0
behavioral_outside_home        0
behavioral_touch_face          0
doctor_recc_h1n1               0
doctor_recc_seasonal           0
chronic_med_condition          0
child_under_6_months           0
health_worker                  0
health_insurance               0
opinion_h1n1_vacc_effective    0
opinion_h1n1_risk              0
opinion_h1n1_sick_from_vacc    0
opinion_seas_vacc_effective    0
opinion_seas_risk              0
opinion_seas_sick_from_vacc    0
age_group                      0
education                      0
race                           0
sex                            0
income_poverty                 0
marital_status                 0
rent_or_own                    0
employment_status              0
hhs_geo_re

In [26]:
categorical_cols = df_train.select_dtypes('object').columns

### Modeling

In [29]:
from sklearn.ensemble import RandomForestClassifier

In [30]:
X, y = df_train.drop(columns='respondent_id'), y_train.drop(columns='respondent_id')
X_test = df_test.drop(columns='respondent_id')
y1, y2 = y['h1n1_vaccine'], y['seasonal_vaccine']

#### Pipeline

In [31]:
from sklearn.pipeline import Pipeline

In [32]:
rfc_1_pipeline = Pipeline(steps=[
    ('encoder', CountEncoder()),
    ('rfc', RandomForestClassifier())
])


rfc_1_pipeline.fit(X, y1)

In [33]:
cv_rfc_1_scores_from_pipe = cross_val_score(rfc_1_pipeline, X, y1, scoring='roc_auc', cv=10, n_jobs=-1, verbose=1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    6.4s finished


In [34]:
np.mean(cv_rfc_1_scores_from_pipe)

0.8603255953816955

### Generic pipeline

# Hyperparameter tuning

In [36]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from scipy.stats import uniform, randint

In [38]:
distr_params = {
    'ml__max_features': randint(5, 20),
    'ml__min_samples_leaf':randint(2, 6),
    'ml__max_depth': randint(5, 20)
}

ml_pipeline = Pipeline(steps=[
    ('encoder', CountEncoder()),
    ('ml', RandomForestClassifier())
])


randomized_rfc_1 = RandomizedSearchCV(ml_pipeline, distr_params,cv=10,n_iter=60, scoring='roc_auc', n_jobs=-1, verbose=1, random_state=21,).fit(X,y1)
randomized_rfc_2 = RandomizedSearchCV(ml_pipeline, distr_params,cv=10,n_iter=60, scoring='roc_auc', n_jobs=-1, verbose=1, random_state=21).fit(X,y2)

Fitting 10 folds for each of 60 candidates, totalling 600 fits
Fitting 10 folds for each of 60 candidates, totalling 600 fits


In [40]:
randomized_rfc_1.best_score_

0.865953036901552

In [41]:
best_rfc_1 = randomized_rfc_1.best_estimator_
best_rfc_1

In [42]:
best_rfc_2 = randomized_rfc_2.best_estimator_
best_rfc_2.get_params

<bound method Pipeline.get_params of Pipeline(steps=[('encoder',
                 CountEncoder(cols=['race', 'sex', 'marital_status',
                                    'rent_or_own', 'employment_status',
                                    'hhs_geo_region', 'census_msa',
                                    'employment_industry',
                                    'employment_occupation'],
                              combine_min_nan_groups=True)),
                ('ml',
                 RandomForestClassifier(max_depth=14, max_features=10,
                                        min_samples_leaf=4))])>

In [43]:
y_test1 = best_rfc_1.predict_proba(X_test)[:, 1]
y_test2 = best_rfc_2.predict_proba(X_test)[:, 1]


In [44]:
y_test_df = pd.DataFrame(y_test1, columns =['h1n1_vaccine'], dtype = float) 
arr = np.arange(26707, 53415)
y_test_df['respondent_id'] = arr.tolist()
y_test_df['seasonal_vaccine'] = y_test2.tolist()
y_test_df['seasonal_vaccine'] = y_test_df['seasonal_vaccine'].astype(float)
y_test_df = y_test_df[['respondent_id','h1n1_vaccine','seasonal_vaccine']]
y_test_df.to_csv("y_test.csv", index=False)