1.  One_hot_encoding for ordinal and categorial columns (rank information on education and poverty is lo
2.  Check for NAs 
    -> delete employment columns, health insurance
    -> delete rows with no information on recommandation of doctor (poor answering behavior) 
    -> take rows of opinion columns and impute NAs with k nearest neighbour 
3.  Delete the same rows in the labels dataset
    

In [1]:
import re
from pathlib import Path
import numpy as np
import pandas as pd

In [2]:
# preprocess raw data
# split raw data into train and eval
# log_reg fit 
# preprocess test data 
# log_reg fit
# submit

In [3]:
data_path = Path.cwd().parent.parent / "data" / "raw"
raw_df = pd.read_csv(data_path / "training_set_features.csv", index_col="respondent_id")
labels_df = pd.read_csv(data_path / "training_set_labels.csv", index_col="respondent_id")
test_df = pd.read_csv(data_path / "test_set_features.csv", index_col="respondent_id")

all_df = raw_df.join(labels_df)
all_df.columns

Index(['h1n1_concern', 'h1n1_knowledge', 'behavioral_antiviral_meds',
       'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk',
       'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
       'education', 'race', 'sex', 'income_poverty', 'marital_status',
       'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
       'household_adults', 'household_children', 'employment_industry',
       'employment_occupation', 'h1n1_vaccine', 'seasonal_vaccine'],
      dtype='object')

In [4]:
pd.set_option("display.max_rows", 20, "display.max_columns", 20)

In [5]:
# one-hot-enconding without employment_occuptation (is employment_occuptation either dropped or Weight of Evidence Encoded)
    # drop column employment_occupation 
    # identify columns with categorial data 
    # just use columns with nominal data
    # getdummies() -> drop first column to dismiss multicollinarity, keep NaN column
raw_df = raw_df.drop(columns = ['employment_occupation'])

non_numerical_obj = raw_df.columns[raw_df.dtypes == "object"]
non_numerical_obj

nominal_list = ['race', 'sex', 
       'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region',
       'census_msa', 'employment_industry']

raw_df_encoded = pd.get_dummies(raw_df, columns=nominal_list, drop_first = True, dummy_na=True)        

In [6]:
# Weight of Evidence Encoding of Occuptation
# 'h1n1_vaccine', 'seasonal_vaccine'
def WOE(var, tar):
    all_df[var] = all_df[var].fillna('NoData')
    k = all_df[[var,tar]].groupby(var)[tar].agg(['count','sum']).reset_index()
    k.columns = [var,'Count','Good']
    k['Bad'] = k['Count'] - k['Good']
    k['Good %'] = (k['Good'] / k['Good'].sum()*100).round(2)
    k['Bad %'] = (k['Bad'] / k['Bad'].sum()*100).round(2)
    k[var+tar+'_WOE'] = np.log(k['Good %'] / k['Bad %']).round(2)
    k = k.sort_values(by=var+tar+'_WOE')
    return(k)
h1n1_WOE = WOE('employment_occupation' , 'h1n1_vaccine')

seasonal_WOE = WOE('employment_occupation' , 'seasonal_vaccine')


WOE_df_season = pd.merge(all_df[['seasonal_vaccine','employment_occupation']],seasonal_WOE[['employment_occupation','employment_occupationseasonal_vaccine_WOE']],
     left_on='employment_occupation',
     right_on='employment_occupation',how='left')

WOE_df_h1n1 = pd.merge(all_df[['h1n1_vaccine','employment_occupation']],h1n1_WOE[['employment_occupation','employment_occupationh1n1_vaccine_WOE']],
     left_on='employment_occupation',
     right_on='employment_occupation',how='left')


WOE_df_both = WOE_df_h1n1.join(WOE_df_season, lsuffix='_h1n1', rsuffix='_seasonal')

WOE_mean = WOE_df_both[['employment_occupationh1n1_vaccine_WOE', 
                           'employment_occupationseasonal_vaccine_WOE']].mean(axis = 1)

raw_df_encoded['employment_occupation_WOE'] = WOE_mean

In [7]:
# encoding of ordinal data 
    # identifiying ordinal data columns 
    # change NaN in ordinal data columns to most frequent value 
    # change strings in ordinal data to numbers 
    # delete values in columns wich where imputed with most frequent values
    # impute NaN values with KNN
    
ordinal_list = ['age_group','education', 'income_poverty' ]



In [8]:
# impute misssing values in ordinal data columns with most frequent value 


from sklearn.impute import SimpleImputer
imp_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
raw_df_encoded_mode = imp_mode.fit_transform(raw_df_encoded[ordinal_list])
raw_df_encoded_mode = pd.DataFrame(raw_df_encoded_mode, index = raw_df_encoded.index)

raw_df_encoded['age_group'] = raw_df_encoded_mode[0]
raw_df_encoded['education'] = raw_df_encoded_mode[1]
raw_df_encoded['income_poverty'] = raw_df_encoded_mode[2]

In [9]:
# change strings in ordinal data to numbers 

from sklearn.preprocessing import OrdinalEncoder
ordinal_enc = OrdinalEncoder ( 
    categories = [
        ['18 - 34 Years', '35 - 44 Years', '45 - 54 Years', '55 - 64 Years', '65+ Years'],
        ['< 12 Years', '12 Years', 'Some College', 'College Graduate' ],
        ['Below Poverty', '<= $75,000, Above Poverty', '> $75,000'],
     ]
)

raw_df_encoded_ordinal = ordinal_enc.fit_transform(raw_df_encoded[ordinal_list])
raw_df_encoded_ordinal = pd.DataFrame(raw_df_encoded_ordinal, raw_df_encoded.index)
#ordinal_enc.categories_

raw_df_encoded['age_group'] = raw_df_encoded_ordinal[0]
raw_df_encoded['education'] = raw_df_encoded_ordinal[1]
raw_df_encoded['income_poverty'] = raw_df_encoded_ordinal[2]

In [10]:
#delete values in columns wich where imputed with most frequent values

education_encoded = raw_df_encoded['education'].copy()
education_na = raw_df['education'].isna()
education_encoded[education_na] = np.nan

raw_df_encoded['education'] = education_encoded

income_poverty_encoded = raw_df_encoded['income_poverty'].copy()
income_poverty_na = raw_df['income_poverty'].isna()
income_poverty_encoded[income_poverty_na] = np.nan

raw_df_encoded['income_poverty'] = income_poverty_encoded

In [11]:
##caculate KNN for the whole dataset  // either this or next cell


#import numpy as np
#from sklearn.impute import KNNImputer
#imputer = KNNImputer(
#    missing_values=np.nan,
#    n_neighbors=5,
#    weights="distance",
#    metric="nan_euclidean",
#    copy=True,
#    add_indicator=False,
#)

#imputer.fit(raw_df_encoded)

#knn_array = imputer.transform(raw_df_encoded)

#raw_df_encoded_imputed = pd.DataFrame(knn_array, raw_df_encoded.index)
#raw_df_encoded_imputed.columns = raw_df_encoded.columns

#change KNN imputations in ordinal data columns to "categories"

#raw_df_encoded_imputed['education'].round()
#raw_df_encoded_imputed['income_poverty'].round()

In [12]:
## alternativly to KNN impute missing values with NaN

from sklearn.impute import SimpleImputer
imp_mode = SimpleImputer(missing_values=np.nan, strategy='mean')
raw_df_encoded_imputed = imp_mode.fit_transform(raw_df_encoded)

raw_df_encoded_imputed = pd.DataFrame(raw_df_encoded_imputed, columns = raw_df_encoded.columns, index = raw_df_encoded.index)


In [13]:
# renaming
raw_df_encoded_imputed_list = raw_df_encoded_imputed.columns.tolist()

renamed_list = []
for column_name in raw_df_encoded_imputed_list:
    column_name = re.sub(pattern=r"\$(\d\d)....", repl="\\1k", string=column_name)
    column_name = re.sub(pattern=r"\<\=|\<\s", repl="less_", string=column_name)
    column_name = re.sub(pattern=r"\>\=|\>\s", repl="gr_", string=column_name)
    column_name = re.sub(pattern=r"(\w)\,\s(\w)", repl="\\1_\\2", string=column_name)
    column_name = re.sub(pattern=r"\s\-\s", repl="_-_", string=column_name)
    column_name = re.sub(pattern=r"(\w)\s+(\w)", repl="\\1_\\2", string=column_name)
    column_name = re.sub(pattern=r"(\d\+)\s", repl="\\1_", string=column_name)
    column_name = re.sub(pattern=r"\_\_", repl="_", string=column_name)
    renamed_list.append(column_name)
    
raw_df_encoded_imputed.columns = renamed_list

In [14]:
#data_path_processed = Path.cwd().parent.parent / "data" / "processed"
#raw_df_encoded.to_csv(data_path_processed / "training_set_features_1hot_na.csv",)
#test_df_encoded.to_csv(data_path_processed / "test_set_features_1hot_na.csv",)

In [15]:
# check for multi collinarity 

pd.set_option("display.max_rows", None, "display.max_columns", None)
corr_data = raw_df_encoded_imputed.corr().abs()
sorted_corr_data = corr_data.unstack().sort_values(ascending=False)
ones = corr_data.unstack().sort_values(ascending=False) != 1.0
without_ones = sorted_corr_data[ones]
NaNs = corr_data.unstack().sort_values(ascending=False).notna()

without_ones_and_Na = without_ones[NaNs]
big_corrs = corr_data.unstack().sort_values(ascending=False) > 0.5
without_ones_and_Na[big_corrs]

marital_status_nan                    employment_status_nan                   0.876134
employment_status_nan                 marital_status_nan                      0.876134
employment_status_Not_in_Labor_Force  employment_industry_nan                 0.789401
employment_industry_nan               employment_status_Not_in_Labor_Force    0.789401
rent_or_own_nan                       employment_status_nan                   0.770415
employment_status_nan                 rent_or_own_nan                         0.770415
rent_or_own_nan                       marital_status_nan                      0.742348
marital_status_nan                    rent_or_own_nan                         0.742348
health_worker                         employment_industry_fcxhlnwr            0.694224
employment_industry_fcxhlnwr          health_worker                           0.694224
doctor_recc_seasonal                  doctor_recc_h1n1                        0.591868
doctor_recc_h1n1                      docto

In [16]:
#standardize dataset

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
raw_df_encoded_imputed_stand = scaler.fit_transform(raw_df_encoded_imputed)

raw_df_encoded_imputed_stand = pd.DataFrame(raw_df_encoded_imputed_stand, columns = raw_df_encoded_imputed.columns, index = raw_df_encoded_imputed.index)

In [17]:
# option to select only most relevant features 

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# feature extraction
SelectBest = SelectKBest(score_func=f_classif, k=4)
bestfeatures_h1n1 = SelectBest.fit(raw_df_encoded_imputed,labels_df['h1n1_vaccine'])

# summarize scores


pd.set_option("display.max_rows", 80) 

bestfeatures_h1n1_df = pd.DataFrame(bestfeatures_h1n1.scores_, index = raw_df_encoded_imputed.columns.tolist())

bestfeatures_h1n1_df.columns = ['h1n1_scores']

bestfeatures_h1n1_df.sort_values(ascending = False, by = 'h1n1_scores')

#features = fit.transform(X)
# summarize selected features
#features

bestfeatures_seasonal = SelectBest.fit(raw_df_encoded_imputed,labels_df['seasonal_vaccine'])

# summarize scores


pd.set_option("display.max_rows", 80) 

bestfeatures_seasonal_df = pd.DataFrame(bestfeatures_seasonal.scores_, index = raw_df_encoded_imputed.columns.tolist())


bestfeatures_seasonal_df.columns = ['seasonal_scores']


bestfeatutures_df = bestfeatures_h1n1_df.join(bestfeatures_seasonal_df)
bestfeatutures_df['combined_scores'] = bestfeatutures_df['h1n1_scores'] + bestfeatutures_df['seasonal_scores']

bestfeatutures_df.sort_values(ascending = False, by = 'combined_scores')
#features = fit.transform(X)
# summarize selected features
#features

  f = msb / msw
  f = msb / msw


Unnamed: 0,h1n1_scores,seasonal_scores,combined_scores
opinion_seas_risk,1875.881784,4687.26229,6563.144074
doctor_recc_h1n1,4637.592333,1007.057887,5644.650221
doctor_recc_seasonal,1170.871533,3834.992552,5005.864085
opinion_seas_vacc_effective,871.637106,3945.960053,4817.597159
opinion_h1n1_risk,3064.265541,1295.316883,4359.582424
opinion_h1n1_vacc_effective,2058.043383,1154.890863,3212.934247
age_group,45.12889,2227.227052,2272.355942
employment_occupation_WOE,797.637296,960.551315,1758.188611
health_worker,770.772913,426.806927,1197.579839
h1n1_concern,401.784811,653.574278,1055.359089


In [18]:
best_featues_list = bestfeatutures_df[bestfeatutures_df['combined_scores'] > 100].index.tolist()

raw_df_encoded_imputed_stand_selected = raw_df_encoded_imputed_stand[best_featues_list]

In [19]:
#split in test und train data

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    raw_df_encoded_imputed_stand_selected,
    labels_df,
    shuffle = True,
    test_size = 0.25,
    random_state = 10)

In [20]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression

logisticRegr = MultiOutputClassifier(LogisticRegression(penalty="l2", C=1))
logisticRegr.fit(X_train, y_train)

predictions = logisticRegr.predict_proba(X_test)

y_preds = pd.DataFrame(
    {
        "h1n1_vaccine": predictions[0][:, 1],
        "seasonal_vaccine": predictions[1][:, 1],
    },
    index = y_test.index
)
print("y_preds.shape:", y_preds.shape)
y_preds.head()

y_preds.shape: (6677, 2)


Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2653,0.161914,0.960908
9506,0.258613,0.731571
23107,0.061176,0.545706
22648,0.079263,0.84958
25589,0.247137,0.7286


In [21]:
predictions

[array([[0.8380861 , 0.1619139 ],
        [0.74138718, 0.25861282],
        [0.93882399, 0.06117601],
        ...,
        [0.80755292, 0.19244708],
        [0.25507498, 0.74492502],
        [0.88507301, 0.11492699]]),
 array([[0.03909211, 0.96090789],
        [0.26842936, 0.73157064],
        [0.45429423, 0.54570577],
        ...,
        [0.7851672 , 0.2148328 ],
        [0.08389102, 0.91610898],
        [0.91760446, 0.08239554]])]

In [22]:
from sklearn.metrics import roc_auc_score
print(roc_auc_score(y_test, y_preds))

0.8416729926405242


In [23]:
test_predictions = logisticRegr.predict(X_test)
train_predictions = logisticRegr.predict(X_train)

In [24]:
from sklearn.metrics import classification_report
#print(classification_report(y_train, train_predictions))
print(classification_report(y_test, test_predictions))

              precision    recall  f1-score   support

           0       0.67      0.45      0.54      1410
           1       0.77      0.74      0.76      3137

   micro avg       0.74      0.65      0.69      4547
   macro avg       0.72      0.60      0.65      4547
weighted avg       0.73      0.65      0.69      4547
 samples avg       0.35      0.33      0.33      4547



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
KNN_imputation = 0.8379218513307902

median_imputation = 0.8398216367540676

mean_imputation = 0.8405460643039464

features_selection = 0.8416729926405242


In [26]:
# preprocess test data 

In [27]:

# one-hot-enconding without employment_occuptation (is employment_occuptation either dropped or Weight of Evidence Encoded)
    # drop column employment_occupation 
    # identify columns with categorial data 
    # just use columns with nominal data
    # getdummies() -> drop first column to dismiss multicollinarity, keep NaN column

non_numerical_obj = test_df.columns[test_df.dtypes == "object"]
non_numerical_obj

nominal_list = ['race', 'sex', 
       'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region',
       'census_msa', 'employment_industry']

test_df_encoded = pd.get_dummies(test_df, columns=nominal_list, drop_first = True, dummy_na=True)        

In [28]:
# Weight of Evidence Encoding of Occuptation
# 'h1n1_vaccine', 'seasonal_vaccine'

test_df_encoded['employment_occupation_new'] =test_df_encoded['employment_occupation'].replace(np.nan, 'NoData')

def WOE(var, tar):
    all_df[var] = all_df[var].fillna('NoData')
    k = all_df[[var,tar]].groupby(var)[tar].agg(['count','sum']).reset_index()
    k.columns = [var,'Count','Good']
    k['Bad'] = k['Count'] - k['Good']
    k['Good %'] = (k['Good'] / k['Good'].sum()*100).round(2)
    k['Bad %'] = (k['Bad'] / k['Bad'].sum()*100).round(2)
    k[var+tar+'_WOE'] = np.log(k['Good %'] / k['Bad %']).round(2)
    k = k.sort_values(by=var+tar+'_WOE')
    return(k)
h1n1_WOE = WOE('employment_occupation' , 'h1n1_vaccine')

seasonal_WOE = WOE('employment_occupation' , 'seasonal_vaccine')


test_WOE_df_season = pd.merge(test_df_encoded['employment_occupation_new'],seasonal_WOE[['employment_occupation','employment_occupationseasonal_vaccine_WOE']],
     left_on='employment_occupation_new',
     right_on='employment_occupation',how='left')
test_WOE_df_season.index =  test_df_encoded.index

test_WOE_df_h1n1 = pd.merge(test_df_encoded[['employment_occupation_new']],h1n1_WOE[['employment_occupation','employment_occupationh1n1_vaccine_WOE']],
     left_on='employment_occupation_new',
     right_on='employment_occupation',how='left')
test_WOE_df_h1n1.index =  test_df_encoded.index


test_WOE_df_both = test_WOE_df_h1n1 .join(test_WOE_df_season, lsuffix='_h1n1', rsuffix='_seasonal')

test_WOE_df_both.index =  test_df_encoded.index


test_df_encoded['employment_occupation_WOE'] = test_WOE_df_both[['employment_occupationh1n1_vaccine_WOE', 
                           'employment_occupationseasonal_vaccine_WOE']].mean(axis = 1)



test_df_encoded = test_df_encoded.drop(columns = ['employment_occupation','employment_occupation_new' ])


In [29]:
# encoding of ordinal data 
    # identifiying ordinal data columns 
    # change NaN in ordinal data columns to most frequent value 
    # change strings in ordinal data to numbers 
    # delete values in columns wich where imputed with most frequent values
    # impute NaN values with KNN
    
ordinal_list = ['age_group','education', 'income_poverty' ]

# impute misssing values in ordinal data columns with most frequent value 


from sklearn.impute import SimpleImputer
test_imp_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
test_df_encoded_mode = test_imp_mode.fit_transform(test_df_encoded[ordinal_list])
test_df_encoded_mode_pd = pd.DataFrame(test_df_encoded_mode, index = test_df_encoded.index )




In [30]:
test_df_encoded['age_group'] = test_df_encoded_mode_pd[0]
test_df_encoded['education'] = test_df_encoded_mode_pd[1]
test_df_encoded['income_poverty'] = test_df_encoded_mode_pd[2]

In [31]:
# change strings in ordinal data to numbers 


from sklearn.preprocessing import OrdinalEncoder
ordinal_enc = OrdinalEncoder ( 
    categories = [
        ['18 - 34 Years', '35 - 44 Years', '45 - 54 Years', '55 - 64 Years', '65+ Years'],
        ['< 12 Years', '12 Years', 'Some College', 'College Graduate' ],
        ['Below Poverty', '<= $75,000, Above Poverty', '> $75,000'],
     ]
)

test_df_encoded_ordinal = ordinal_enc.fit_transform(test_df_encoded[ordinal_list])
test_df_encoded_ordinal = pd.DataFrame(test_df_encoded_ordinal, test_df_encoded.index)



In [32]:

test_df_encoded['age_group'] = test_df_encoded_ordinal[0]
test_df_encoded['education'] = test_df_encoded_ordinal[1]
test_df_encoded['income_poverty'] = test_df_encoded_ordinal[2]

In [33]:
#delete values in columns wich where imputed with most frequent values

test_education_encoded = test_df_encoded['education'].copy()
test_education_na = test_df['education'].isna()
test_education_encoded[test_education_na] = np.nan

test_df_encoded['education'] = test_education_encoded

test_income_poverty_encoded = test_df_encoded['income_poverty'].copy()
test_income_poverty_na = test_df['income_poverty'].isna()
test_income_poverty_encoded[test_income_poverty_na] = np.nan

test_df_encoded['income_poverty'] = test_income_poverty_encoded

In [34]:
## alternativly to KNN impute missing values with NaN

imp_mode = SimpleImputer(missing_values=np.nan, strategy='mean')

raw_df_median = imp_mode.fit(raw_df_encoded)
test_df_encoded_imputed = imp_mode.transform(test_df_encoded)

test_df_encoded_imputed = pd.DataFrame(test_df_encoded_imputed, columns = test_df_encoded.columns, index = test_df_encoded.index)



In [38]:
# renaming
test_df_encoded_imputed_list = test_df_encoded_imputed.columns.tolist()

renamed_list = []
for column_name in raw_df_encoded_imputed_list:
    column_name = re.sub(pattern=r"\$(\d\d)....", repl="\\1k", string=column_name)
    column_name = re.sub(pattern=r"\<\=|\<\s", repl="less_", string=column_name)
    column_name = re.sub(pattern=r"\>\=|\>\s", repl="gr_", string=column_name)
    column_name = re.sub(pattern=r"(\w)\,\s(\w)", repl="\\1_\\2", string=column_name)
    column_name = re.sub(pattern=r"\s\-\s", repl="_-_", string=column_name)
    column_name = re.sub(pattern=r"(\w)\s+(\w)", repl="\\1_\\2", string=column_name)
    column_name = re.sub(pattern=r"(\d\+)\s", repl="\\1_", string=column_name)
    column_name = re.sub(pattern=r"\_\_", repl="_", string=column_name)
    renamed_list.append(column_name)
    
test_df_encoded_imputed.columns = renamed_list

In [39]:
#standardize dataset

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
test_df_encoded_imputed_stand = scaler.fit_transform(test_df_encoded_imputed)

test_df_encoded_imputed_stand = pd.DataFrame(test_df_encoded_imputed_stand, columns = test_df_encoded_imputed.columns, index = test_df_encoded_imputed.index)

In [40]:
test_df_encoded_imputed_stand_selected = test_df_encoded_imputed_stand[best_featues_list]

In [None]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression

logisticRegr = MultiOutputClassifier(LogisticRegression(penalty="l2", C=1))
logisticRegr.fit(raw_df_encoded_imputed_stand_selected, labels_df)



In [41]:
test_predictions = logisticRegr.predict_proba(test_df_encoded_imputed_stand_selected)


In [42]:

y_preds = pd.DataFrame(
    {
        "h1n1_vaccine": test_predictions[0][:, 1],
        "seasonal_vaccine": test_predictions[1][:, 1],
    },
    index = test_df_encoded_imputed_stand.index
)
print("y_preds.shape:", y_preds.shape)
y_preds.head()

y_preds.shape: (26708, 2)


Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
26707,0.110042,0.245974
26708,0.015058,0.025575
26709,0.386582,0.688453
26710,0.465132,0.870906
26711,0.181032,0.503416


In [43]:
submission_df = pd.read_csv(data_path / "submission_format.csv", 
                            index_col="respondent_id")

In [49]:
np.testing.assert_array_equal(test_df_encoded_imputed_stand_selected.index.values, 
                              submission_df.index.values)




In [50]:
y_preds["h1n1_vaccine"].head(10)

respondent_id
26707    0.110042
26708    0.015058
26709    0.386582
26710    0.465132
26711    0.181032
26712    0.506193
26713    0.335879
26714    0.164873
26715    0.035064
26716    0.211157
Name: h1n1_vaccine, dtype: float64

In [46]:
# Save predictions to submission data frame
submission_df["h1n1_vaccine"] = y_preds["h1n1_vaccine"]
submission_df["seasonal_vaccine"] = y_preds["seasonal_vaccine"]

In [48]:
#CHANGE PATH 

output_path = Path.cwd().parent.parent / "models" / "submissions"


submission_df.to_csv(output_path /'submission_logreg_JFR_200723_ordinal_nominal_meanImpute_BestFeatures.csv', index=True)


In [None]:
data_path_preprocessed = Path.cwd().parent.parent / "data" / "processed"

raw_pd_df_encoded_imputed_stand = pd.DataFrame(raw_df_encoded_imputed_stand, columns = raw_df_encoded.columns, index = raw_df_encoded.index)
raw_pd_df_encoded_imputed_stand.to_csv (data_path_preprocessed / 'training_set_features_encoded_imputed_standardized.csv')

In [None]:
data_path_preprocessed = Path.cwd().parent.parent / "data" / "processed"

test_pd_df_encoded_imputed_stand = pd.DataFrame(test_df_encoded_imputed_stand, columns = test_df_encoded.columns, index = test_df_encoded.index)
test_pd_df_encoded_imputed_stand.to_csv (data_path_preprocessed / 'test_set_features_encoded_imputed_standardized.csv')