1. Feature selection based on F-value -> sort the columns accordingly
2. Determine which features optimize ROC AUC on the train/validation split
3. Apply the resulting feature list to the test data
4. Submit


In [1]:
import re
from pathlib import Path
import numpy as np
import pandas as pd

In [2]:
# Data directories, resolved two levels above the current working directory.
data_path_raw = Path.cwd().parents[1].joinpath("data", "raw")
data_path_preprocessed = Path.cwd().parents[1].joinpath("data", "processed")

In [3]:
# Read the three preprocessed CSVs in order (training features, training
# labels, test features), each indexed by its "respondent_id" column.
raw_df_processed_balanced, labels_df_balanced, test_df_processed = (
    pd.read_csv(data_path_preprocessed / csv_name, index_col="respondent_id")
    for csv_name in (
        "training_set_features__Preprocessing_nominal_ordinal_WOE_Impute_balanced_dropped_stand.csv",
        "training_set_labels__balanced.csv",
        "test_set_features__Preprocessing_nominal_ordinal_WOE_Impute_balanced_dropped_stand.csv",
    )
)


In [4]:
import time 

from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [25]:
# Hold out 25 % of the balanced training data to estimate the ROC AUC.
X_train, X_test, y_train, y_test = train_test_split(
    raw_df_processed_balanced,
    labels_df_balanced,
    shuffle=True,
    test_size=0.25,
    random_state=10,
)

# One L1-penalised logistic regression is fitted per label column.
logisticRegr = MultiOutputClassifier(
    LogisticRegression(
        penalty="l1",
        C=0.5,
        # The None object (not the string 'None') means "no class re-weighting";
        # the string is not an accepted value.
        class_weight=None,
        dual=False,
        fit_intercept=True,
        # l1_ratio is intentionally not passed: it only applies to
        # penalty="elasticnet" and merely triggers a warning otherwise.
        max_iter=1000,
        solver="saga",
        # saga shuffles the data, so pin the seed to make the score reproducible.
        random_state=10,
    )
)

# Time the fit plus the probability prediction on the hold-out set.
start = time.time()
logisticRegr.fit(X_train, y_train)
test_probability = logisticRegr.predict_proba(X_test)
end = time.time()
print("fit + predict_proba took {:.1f} s".format(end - start))

# predict_proba yields one array per label column; column 1 is taken as the
# positive-class probability.
# NOTE(review): assumes the label file's columns are ordered
# h1n1_vaccine, seasonal_vaccine and encoded 0/1 -- confirm.
y_preds = pd.DataFrame(
    {
        "h1n1_vaccine": test_probability[0][:, 1],
        "seasonal_vaccine": test_probability[1][:, 1],
    },
    index=y_test.index,
)

# Last expression of the cell: ROC AUC on the hold-out set.
roc_auc_score(y_test, y_preds)

0.8321844987450644

In [20]:
# Refit the same model configuration on the full balanced training set
# and score the competition test set.
logisticRegr = MultiOutputClassifier(
    LogisticRegression(
        penalty="l1",
        C=0.5,
        # The None object (not the string 'None') means "no class re-weighting";
        # the string is not an accepted value.
        class_weight=None,
        dual=False,
        fit_intercept=True,
        # l1_ratio is intentionally not passed: it only applies to
        # penalty="elasticnet" and merely triggers a warning otherwise.
        max_iter=1000,
        solver="saga",
        # saga shuffles the data, so pin the seed to make the submission reproducible.
        random_state=10,
    )
)

logisticRegr.fit(raw_df_processed_balanced, labels_df_balanced)
test_probability = logisticRegr.predict_proba(test_df_processed)

# predict_proba yields one array per label column; column 1 is taken as the
# positive-class probability.
# NOTE(review): assumes the label file's columns are ordered
# h1n1_vaccine, seasonal_vaccine and encoded 0/1 -- confirm.
y_preds = pd.DataFrame(
    {
        "h1n1_vaccine": test_probability[0][:, 1],
        "seasonal_vaccine": test_probability[1][:, 1],
    },
    index=test_df_processed.index,
)
    

  "(penalty={})".format(self.penalty))
  "(penalty={})".format(self.penalty))


In [21]:
# Load the submission template, indexed by its "respondent_id" column.
submission_df = pd.read_csv(
    data_path_raw.joinpath("submission_format.csv"),
    index_col="respondent_id",
)

In [22]:
# Guard: the test features and the submission template must list the same
# respondent ids in the same order before predictions are copied over.
test_ids = test_df_processed.index.values
submission_ids = submission_df.index.values
np.testing.assert_array_equal(test_ids, submission_ids)




In [23]:
# Copy the predicted probabilities for both targets into the submission frame.
for target in ("h1n1_vaccine", "seasonal_vaccine"):
    submission_df[target] = y_preds[target]

In [24]:
# Destination directory for submission files, resolved two levels above the
# current working directory. Adjust this path if your layout differs.
output_path = Path.cwd().parent.parent / "models" / "submissions"

# Create the directory if it is missing so that to_csv does not fail on a
# fresh checkout.
output_path.mkdir(parents=True, exist_ok=True)

# Write the submission, keeping the respondent_id index as the first column.
submission_df.to_csv(output_path /'logreg_JFR_200805_ordinal_nominal_meanImpute_BalancedData.csv', index=True)

