1.  One_hot_encoding for ordinal and categorial columns (rank information on education and poverty is lo
2.  Check for NAs 
    -> delete employment columns, health insurance
    -> delete rows with no information on recommandation of doctor (poor answering behavior) 
    -> take rows of opinion columns and impute NAs with k nearest neighbour 
3.  Delete the same rows in the labels dataset
    

In [4]:
import re
from pathlib import Path
import numpy as np
import pandas as pd

In [5]:
data_path = Path.cwd().parent.parent / "data" / "raw"
raw_df = pd.read_csv(data_path / "training_set_features.csv", index_col="respondent_id")
labels_df = pd.read_csv(data_path / "training_set_labels.csv", index_col="respondent_id")
test_df = pd.read_csv(data_path / "test_set_features.csv", index_col="respondent_id")

In [16]:
raw_df.dtypes != "object"
numeric_cols = raw_df.columns[raw_df.dtypes != "object"].values
raw_numeric_df = raw_df[numeric_cols]

In [22]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
raw_numeric_scaled_df = scaler.fit_transform(raw_numeric_df)

from sklearn.impute import SimpleImputer
imp_mode = SimpleImputer(missing_values=np.nan, strategy='median')
raw_numeric_scaled_imputed_df = imp_mode.fit_transform(raw_numeric_scaled_df)


In [26]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    raw_numeric_scaled_imputed_df,
    labels_df,
    test_size=0.33,
    shuffle=True,
    stratify=labels_df,
    random_state=6
)

In [33]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression

logisticRegr = MultiOutputClassifier(LogisticRegression(penalty="l2", C=1))
logisticRegr.fit(X_train, y_train)

predictions = logisticRegr.predict_proba(X_test)


predictions



[array([[0.70898955, 0.29101045],
        [0.90223115, 0.09776885],
        [0.84459698, 0.15540302],
        ...,
        [0.92338355, 0.07661645],
        [0.89632571, 0.10367429],
        [0.92788007, 0.07211993]]),
 array([[0.5272201 , 0.4727799 ],
        [0.61838567, 0.38161433],
        [0.37024072, 0.62975928],
        ...,
        [0.85937   , 0.14063   ],
        [0.77843316, 0.22156684],
        [0.15114951, 0.84885049]])]

In [34]:
print("predictions[0].shape", predictions[0].shape)
print("predictions[1].shape", predictions[1].shape)

predictions[0].shape (8814, 2)
predictions[1].shape (8814, 2)


In [38]:
y_preds = pd.DataFrame(
    {
        "h1n1_vaccine": predictions[0][:, 1],
        "seasonal_vaccine": predictions[1][:, 1],
    },
    index = y_eval.index
)
print("y_preds.shape:", y_preds.shape)
y_preds.head()

y_preds.shape: (8814, 2)


Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
6728,0.29101,0.47278
16516,0.097769,0.381614
3106,0.155403,0.629759
16981,0.66179,0.857877
19111,0.284478,0.776929


In [39]:
from sklearn.metrics import roc_auc_score
print(roc_auc_score(y_test, y_preds))



0.8294712650703964


In [8]:
# change strings in ordinal data to numbers 

from sklearn.preprocessing import OrdinalEncoder
ordinal_enc = OrdinalEncoder ( 
    categories = [
        ['18 - 34 Years', '35 - 44 Years', '45 - 54 Years', '55 - 64 Years', '65+ Years'],
        ['< 12 Years', '12 Years', 'Some College', 'College Graduate' ],
        ['Below Poverty', '<= $75,000, Above Poverty', '> $75,000'],
     ]
)

raw_df_encoded_ordinal = ordinal_enc.fit_transform(raw_df_encoded[ordinal_list])
raw_df_encoded_ordinal = pd.DataFrame(raw_df_encoded_ordinal)
#ordinal_enc.categories_

raw_df_encoded['age_group'] = raw_df_encoded_ordinal[0]
raw_df_encoded['education'] = raw_df_encoded_ordinal[1]
raw_df_encoded['income_poverty'] = raw_df_encoded_ordinal[2]

In [9]:
#delete values in columns wich where imputed with most frequent values

education_encoded = raw_df_encoded['education'].copy()
education_na = raw_df['education'].isna()
education_encoded[education_na] = np.nan

raw_df_encoded['education'] = education_encoded

income_poverty_encoded = raw_df_encoded['income_poverty'].copy()
income_poverty_na = raw_df['income_poverty'].isna()
income_poverty_encoded[education_na] = np.nan

raw_df_encoded['income_poverty'] = income_poverty_encoded

In [10]:
#caculate KNN for the whole dataset

#import numpy as np
#from sklearn.impute import KNNImputer
#imputer = KNNImputer(
#    missing_values=np.nan,
#    n_neighbors=5,
#    weights="distance",
#    metric="nan_euclidean",
#    copy=True,
#    add_indicator=False,
#)

#imputer.fit(raw_df_encoded)

#knn_array = imputer.transform(raw_df_encoded)

#raw_df_encoded_imputed = pd.DataFrame(knn_array)
#raw_df_encoded_imputed.columns = raw_df_encoded.columns

#change KNN imputations in ordinal data columns to "categories"

#raw_df_encoded_imputed['education'].round()
#raw_df_encoded_imputed['income_poverty'].round()

In [11]:
# alternativly to KNN impute missing values with NaN

from sklearn.impute import SimpleImputer
imp_mode = SimpleImputer(missing_values=np.nan, strategy='median')
raw_df_encoded_imputed = imp_mode.fit_transform(raw_df_encoded)

raw_df_encoded_imputed = pd.DataFrame(raw_df_encoded_imputed, columns = raw_df_encoded.columns)


In [12]:
# renaming
raw_df_encoded_imputed_list = raw_df_encoded_imputed.columns.tolist()

renamed_list = []
for column_name in raw_df_encoded_imputed_list:
    column_name = re.sub(pattern=r"\$(\d\d)....", repl="\\1k", string=column_name)
    column_name = re.sub(pattern=r"\<\=|\<\s", repl="less_", string=column_name)
    column_name = re.sub(pattern=r"\>\=|\>\s", repl="gr_", string=column_name)
    column_name = re.sub(pattern=r"(\w)\,\s(\w)", repl="\\1_\\2", string=column_name)
    column_name = re.sub(pattern=r"\s\-\s", repl="_-_", string=column_name)
    column_name = re.sub(pattern=r"(\w)\s+(\w)", repl="\\1_\\2", string=column_name)
    column_name = re.sub(pattern=r"(\d\+)\s", repl="\\1_", string=column_name)
    column_name = re.sub(pattern=r"\_\_", repl="_", string=column_name)
    renamed_list.append(column_name)
    
raw_df_encoded_imputed.columns = renamed_list

In [13]:
#data_path_processed = Path.cwd().parent.parent / "data" / "processed"
#raw_df_encoded.to_csv(data_path_processed / "training_set_features_1hot_na.csv",)
#test_df_encoded.to_csv(data_path_processed / "test_set_features_1hot_na.csv",)

In [14]:
# check for multi collinarity 

pd.set_option("display.max_rows", None, "display.max_columns", None)
corr_data = raw_df_encoded_imputed.corr().abs()
sorted_corr_data = corr_data.unstack().sort_values(ascending=False)
ones = corr_data.unstack().sort_values(ascending=False) != 1.0
without_ones = sorted_corr_data[ones]
NaNs = corr_data.unstack().sort_values(ascending=False).notna()

without_ones_and_Na = without_ones[NaNs]
big_corrs = corr_data.unstack().sort_values(ascending=False) > 0.5
without_ones_and_Na[big_corrs]

employment_status_nan                 marital_status_nan                      0.876134
marital_status_nan                    employment_status_nan                   0.876134
employment_status_Not_in_Labor_Force  employment_industry_nan                 0.789401
employment_industry_nan               employment_status_Not_in_Labor_Force    0.789401
rent_or_own_nan                       employment_status_nan                   0.770415
employment_status_nan                 rent_or_own_nan                         0.770415
marital_status_nan                    rent_or_own_nan                         0.742348
rent_or_own_nan                       marital_status_nan                      0.742348
health_worker                         employment_industry_fcxhlnwr            0.696227
employment_industry_fcxhlnwr          health_worker                           0.696227
doctor_recc_h1n1                      doctor_recc_seasonal                    0.603152
doctor_recc_seasonal                  docto

In [15]:
#standardize dataset



In [16]:
#split in test und train data

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    raw_df_encoded_imputed_stand,
    labels_df,
    shuffle = True,
    test_size = 0.25,
    random_state = 10)

In [17]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression

logisticRegr = MultiOutputClassifier(LogisticRegression(penalty="l2", C=1))
logisticRegr.fit(X_train, y_train)

test_predictions = logisticRegr.predict(X_test)
train_predictions = logisticRegr.predict(X_train)

from sklearn.metrics import roc_auc_score
print(roc_auc_score(y_train, train_predictions))
print(roc_auc_score(y_test, test_predictions))


0.7354426508244212
0.7339868972864074
