1. split preprocessed data in train und validation sets
2. create pipeline for Grid Search Algorithm
3. execute grid search on serveral logReg parameters 
4. get parameters with highest potential for predictions
5. fit logistic Regr on complete train dataset with parameters from GridSearch and apply on test data set
6. load submission dataset, sort probabilities accordingly and save submission dataset



In [1]:
import re
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline



In [6]:
data_path_raw = Path.cwd().parent / "data" / "raw"
data_path_preprocessed = Path.cwd().parent / "data" / "processed"

In [8]:
processed_df = pd.read_csv(data_path_preprocessed / "training_set_features_nominal_ordinal_WOE_Impute_Dropped_Stand.csv", index_col="respondent_id")
labels_df = pd.read_csv(data_path_raw / "training_set_labels.csv", index_col="respondent_id")
test_df_processed = pd.read_csv(data_path_preprocessed / "test_set_features_nominal_ordinal_WOE_Impute_Dropped_Stand.csv", index_col="respondent_id")

In [9]:
import time 

from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [12]:
## 1 split preprocessed data in train und validation sets
X_train, X_test, y_train, y_test = train_test_split(
        processed_df,
        labels_df,
        shuffle = True,
        test_size = 0.25,
        random_state = 10)


In [13]:
## 2 create pipeline for Grid Search Algorithm

pipeline = Pipeline([
            ('LogReg', MultiOutputClassifier(LogisticRegression())) 
                 ])

In [14]:
## 3 execute grid search on serveral logReg parameters 

from sklearn.model_selection import GridSearchCV

param = {
    'LogReg__estimator__penalty' : ['l1', 'l2' ,'elasticnet', 'none'],
    'LogReg__estimator__dual' : [False, True],
    'LogReg__estimator__C' : [0.5,1.,2.,5.],
    'LogReg__estimator__fit_intercept' : [False, True], 
    'LogReg__estimator__class_weight' :['balanced', 'None'],
    'LogReg__estimator__solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'LogReg__estimator__max_iter' : [100],
    'LogReg__estimator__l1_ratio' : [0, 0.5, 1]
    }    

clf = GridSearchCV(estimator=pipeline, param_grid = param, cv = 5, n_jobs=-1)
clf.fit(X_train, y_train)




  "(penalty={})".format(self.penalty))
  "(penalty={})".format(self.penalty))


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('LogReg',
                                        MultiOutputClassifier(estimator=LogisticRegression()))]),
             n_jobs=-1,
             param_grid={'LogReg__estimator__C': [0.5, 1.0, 2.0, 5.0],
                         'LogReg__estimator__class_weight': ['balanced',
                                                             'None'],
                         'LogReg__estimator__dual': [False, True],
                         'LogReg__estimator__fit_intercept': [False, True],
                         'LogReg__estimator__l1_ratio': [0, 0.5, 1],
                         'LogReg__estimator__max_iter': [100],
                         'LogReg__estimator__penalty': ['l1', 'l2',
                                                        'elasticnet', 'none'],
                         'LogReg__estimator__solver': ['newton-cg', 'lbfgs',
                                                       'liblinear', 'sag',
                     

In [15]:
# 4 get parameters with highest potential for predictions

clf.best_params_


{'LogReg__estimator__C': 1.0,
 'LogReg__estimator__class_weight': 'None',
 'LogReg__estimator__dual': False,
 'LogReg__estimator__fit_intercept': True,
 'LogReg__estimator__l1_ratio': 0,
 'LogReg__estimator__max_iter': 100,
 'LogReg__estimator__penalty': 'l2',
 'LogReg__estimator__solver': 'newton-cg'}

##  4
{'LogReg__estimator__C': 1.0,
 'LogReg__estimator__class_weight': 'None',
 'LogReg__estimator__dual': False,
 'LogReg__estimator__fit_intercept': True,
 'LogReg__estimator__l1_ratio': 0,
 'LogReg__estimator__max_iter': 100,
 'LogReg__estimator__penalty': 'l2',
 'LogReg__estimator__solver': 'newton-cg'

In [16]:
##  4

results = pd.DataFrame(clf.cv_results_)
results[results['rank_test_score'] < 10]
top_results = results[results['rank_test_score'] < 10].sort_values(ascending = True, by = 'rank_test_score')
top_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_LogReg__estimator__C,param_LogReg__estimator__class_weight,param_LogReg__estimator__dual,param_LogReg__estimator__fit_intercept,param_LogReg__estimator__l1_ratio,param_LogReg__estimator__max_iter,...,param_LogReg__estimator__solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
785,1.129433,0.186796,0.007666,0.000924,1,,False,True,0.0,100,...,newton-cg,"{'LogReg__estimator__C': 1.0, 'LogReg__estimat...",0.670245,0.67349,0.675487,0.663005,0.668248,0.670095,0.004343,1
786,0.470965,0.077908,0.013243,0.004776,1,,False,True,0.0,100,...,lbfgs,"{'LogReg__estimator__C': 1.0, 'LogReg__estimat...",0.670245,0.67349,0.675487,0.663005,0.668248,0.670095,0.004343,1
788,3.63975,0.044643,0.007923,0.001649,1,,False,True,0.0,100,...,sag,"{'LogReg__estimator__C': 1.0, 'LogReg__estimat...",0.670245,0.67349,0.675487,0.663005,0.668248,0.670095,0.004343,1
805,1.546109,0.207408,0.011382,0.00195,1,,False,True,0.5,100,...,newton-cg,"{'LogReg__estimator__C': 1.0, 'LogReg__estimat...",0.670245,0.67349,0.675487,0.663005,0.668248,0.670095,0.004343,1
806,0.516892,0.057213,0.011842,0.00245,1,,False,True,0.5,100,...,lbfgs,"{'LogReg__estimator__C': 1.0, 'LogReg__estimat...",0.670245,0.67349,0.675487,0.663005,0.668248,0.670095,0.004343,1
808,3.324943,0.470234,0.007615,0.000739,1,,False,True,0.5,100,...,sag,"{'LogReg__estimator__C': 1.0, 'LogReg__estimat...",0.670245,0.67349,0.675487,0.663005,0.668248,0.670095,0.004343,1
825,1.183195,0.095977,0.009121,0.00287,1,,False,True,1.0,100,...,newton-cg,"{'LogReg__estimator__C': 1.0, 'LogReg__estimat...",0.670245,0.67349,0.675487,0.663005,0.668248,0.670095,0.004343,1
826,0.500316,0.028767,0.011938,0.00323,1,,False,True,1.0,100,...,lbfgs,"{'LogReg__estimator__C': 1.0, 'LogReg__estimat...",0.670245,0.67349,0.675487,0.663005,0.668248,0.670095,0.004343,1
828,3.682659,0.29006,0.007614,0.000414,1,,False,True,1.0,100,...,sag,"{'LogReg__estimator__C': 1.0, 'LogReg__estimat...",0.670245,0.67349,0.675487,0.663005,0.668248,0.670095,0.004343,1


In [18]:
## 5 apply logistic Regr on complete train dataset with parameters from GridSearch

logisticRegr = MultiOutputClassifier(LogisticRegression(
                        C = 1.0,
                        class_weight = None,
                        dual = False,
                        fit_intercept = True,
                        l1_ratio = 0,
                        max_iter = 100,
                        penalty = 'l2', 
                        solver = 'newton-cg'))

logisticRegr.fit(processed_df, labels_df)
test_probability = logisticRegr.predict_proba(test_df_processed)

In [19]:
## 6 load submission dataset, sort probabilities accordingly and save submission dataset 

predictions = pd.DataFrame(
    {
        "h1n1_vaccine": test_probability[0][:, 1],
        "seasonal_vaccine": test_probability[1][:, 1],
    },
    index = test_df_processed.index
)


In [20]:
## 6
submission_df = pd.read_csv(data_path_raw / "submission_format.csv", 
                            index_col="respondent_id")

In [21]:
## 6
np.testing.assert_array_equal(test_df_processed.index.values, 
                              submission_df.index.values)


In [22]:
## 6
# Save predictions to submission data frame
submission_df["h1n1_vaccine"] = predictions["h1n1_vaccine"]
submission_df["seasonal_vaccine"] = predictions["seasonal_vaccine"]

In [23]:
## 6

output_path = Path.cwd().parent / "models" / "submissions"


submission_df.to_csv(output_path /'logreg_GridSearch.csv', index=True)

