In [51]:
# Data import using OHE and pipelines

# Import the data

import pandas as pd

data_features = pd.read_csv('data/training_set_features.csv')  # predictors
data_target = pd.read_csv('data/training_set_labels.csv')  # target

# The two frames are joined purely by row position further down, so make sure
# they really describe the same respondents in the same order *before* the
# identifier column is discarded. Without this check a reordered or truncated
# file would silently pair features with the wrong labels.
if not data_features['respondent_id'].equals(data_target['respondent_id']):
    raise ValueError(
        'respondent_id differs between the features and labels files; '
        'the rows cannot be combined by position'
    )

# Remove unnecessary or redundant variables
# (reassignment instead of inplace=True keeps each step explicit)
data_features = data_features.drop(columns=['respondent_id'])
data_target = data_target.drop(columns=['respondent_id', 'h1n1_vaccine'])

# Combine predictors and the remaining label column(s) side by side
df = pd.concat([data_features, data_target], axis=1)

In [52]:
# Let pandas choose the best nullable extension dtype for every column
# (the summary printed below shows the numeric survey columns as Int64).
# NOTE(review): the modelling cells visible further down build X and y from `df`,
# not from `data`, so this conversion does not reach the model - confirm that is intended.
data = df.convert_dtypes()

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 36 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   h1n1_concern                 26615 non-null  Int64 
 1   h1n1_knowledge               26591 non-null  Int64 
 2   behavioral_antiviral_meds    26636 non-null  Int64 
 3   behavioral_avoidance         26499 non-null  Int64 
 4   behavioral_face_mask         26688 non-null  Int64 
 5   behavioral_wash_hands        26665 non-null  Int64 
 6   behavioral_large_gatherings  26620 non-null  Int64 
 7   behavioral_outside_home      26625 non-null  Int64 
 8   behavioral_touch_face        26579 non-null  Int64 
 9   doctor_recc_h1n1             24547 non-null  Int64 
 10  doctor_recc_seasonal         24547 non-null  Int64 
 11  chronic_med_condition        25736 non-null  Int64 
 12  child_under_6_months         25887 non-null  Int64 
 13  health_worker                25

In [53]:
# Optional step, currently disabled: cast every column to pandas' `category` dtype.
# NOTE(review): the triple-quoted text below is a bare string expression, not a comment;
# as the last expression in the cell it is echoed back as the cell output.
# The "see here" phrases look like link text left over from a copied source - confirm.
'''
The categorical data type is useful in the following cases:

- A string variable consisting of only a few different values. 
Converting such a string variable to a categorical variable will save some memory, see here.
- The lexical order of a variable is not the same as the logical order (“one”, “two”, “three”). 
By converting to a categorical and specifying an order on the categories, sorting and min/max 
will use the logical order instead of the lexical order, see here. 
- As a signal to other Python libraries that this column should be treated as a categorical 
variable (e.g. to use suitable statistical methods or plot types).
'''
# Disabled: convert all columns to `category`, then print the dtype summary.
#data[data.columns] = data[data.columns].astype('category')
#data.info()

'\nThe categorical data type is useful in the following cases:\n\n- A string variable consisting of only a few different values. \nConverting such a string variable to a categorical variable will save some memory, see here.\n- The lexical order of a variable is not the same as the logical order (“one”, “two”, “three”). \nBy converting to a categorical and specifying an order on the categories, sorting and min/max \nwill use the logical order instead of the lexical order, see here. \n- As a signal to other Python libraries that this column should be treated as a categorical \nvariable (e.g. to use suitable statistical methods or plot types).\n'

In [56]:
# Frequency of each household_children value; missing entries are left out of the tally
data['household_children'].value_counts(dropna=True)

0    18672
1     3175
2     2864
3     1747
Name: household_children, dtype: Int64

In [57]:
# Missing values
# Convert all na to 'is_na'
#data.fillna(value='is_na', inplace=True)
#data.isna().sum()

In [72]:
from sklearn.model_selection import train_test_split

# Separate the label column from the predictors
X = df.drop(columns=['seasonal_vaccine'])  # features
y = df['seasonal_vaccine']  # target

# Hold out a quarter of the rows for testing; the fixed seed keeps the split repeatable
SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=SEED
)




In [75]:
# import OneHotEncoder
from sklearn.preprocessing import OneHotEncoder

# One-hot encode every feature; drop='first' omits the first category of each
# column from the output.
ohe = OneHotEncoder(drop='first')

# Fit on the training split only, so the encoding is learned without looking at X_test
ohe.fit(X_train)
X_train_ohe = ohe.transform(X_train).toarray()  # densify the encoder output

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use its replacement when available and fall back only on older installations,
# so the cell runs on both old and current versions.
if hasattr(ohe, 'get_feature_names_out'):
    ohe_feature_names = ohe.get_feature_names_out(X_train.columns)
else:
    ohe_feature_names = ohe.get_feature_names(X_train.columns)

# Dataframe to visualize the encoded matrix with readable column names
X_train_ohe_df = pd.DataFrame(X_train_ohe, columns=ohe_feature_names)

In [76]:
# Decision Tree
# Modeling
# Base model: Decision Tree
from sklearn import tree
from sklearn import metrics
from sklearn.model_selection import cross_val_score

# Baseline model: an entropy-criterion decision tree scored with 5-fold
# cross-validation on the one-hot encoded training split.
# random_state is pinned to the notebook's SEED: a decision tree chooses at
# random between equally good splits, so without it the reported score can
# change from one run to the next.
clf_dt = tree.DecisionTreeClassifier(criterion='entropy', random_state=SEED)
dt_cv_score = cross_val_score(clf_dt, X_train_ohe_df, y_train, cv=5)
mean_dt_cv_score = dt_cv_score.mean()

print(f'Mean Cross Validation Score: {mean_dt_cv_score:.2%}')

Mean Cross Validation Score: 68.63%


In [77]:
# Print a summary of the encoded training frame (index range, column count, dtypes, memory use)
X_train_ohe_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20030 entries, 0 to 20029
Columns: 148 entries, h1n1_concern_1.0 to employment_occupation_nan
dtypes: float64(148)
memory usage: 22.6 MB


In [78]:
# List every one-hot encoded feature name, one per line
for feature_name in X_train_ohe_df.columns:
    print(feature_name)

h1n1_concern_1.0
h1n1_concern_2.0
h1n1_concern_3.0
h1n1_concern_nan
h1n1_knowledge_1.0
h1n1_knowledge_2.0
h1n1_knowledge_nan
behavioral_antiviral_meds_1.0
behavioral_antiviral_meds_nan
behavioral_avoidance_1.0
behavioral_avoidance_nan
behavioral_face_mask_1.0
behavioral_face_mask_nan
behavioral_wash_hands_1.0
behavioral_wash_hands_nan
behavioral_large_gatherings_1.0
behavioral_large_gatherings_nan
behavioral_outside_home_1.0
behavioral_outside_home_nan
behavioral_touch_face_1.0
behavioral_touch_face_nan
doctor_recc_h1n1_1.0
doctor_recc_h1n1_nan
doctor_recc_seasonal_1.0
doctor_recc_seasonal_nan
chronic_med_condition_1.0
chronic_med_condition_nan
child_under_6_months_1.0
child_under_6_months_nan
health_worker_1.0
health_worker_nan
health_insurance_1.0
health_insurance_nan
opinion_h1n1_vacc_effective_2.0
opinion_h1n1_vacc_effective_3.0
opinion_h1n1_vacc_effective_4.0
opinion_h1n1_vacc_effective_5.0
opinion_h1n1_vacc_effective_nan
opinion_h1n1_risk_2.0
opinion_h1n1_risk_3.0
opinion_h1n1_ri