In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

# Let wide frames show up to 100 columns when displayed.
pd.options.display.max_columns = 100

In [None]:
# Input and output folders, both resolved relative to the parent of the
# current working directory.
DATA_PATH = Path.cwd().parent.joinpath("data", "raw")
OUTPUT_PATH = Path.cwd().parent.joinpath("models", "submissions")

# Echo both locations, one per line, so a wrong working directory is obvious.
print(DATA_PATH, OUTPUT_PATH, sep="\n")

In [None]:
# Load the training features and labels, both indexed by respondent_id.
features_df = pd.read_csv(
    DATA_PATH / "training_set_features.csv",
    index_col="respondent_id"
)

labels_df = pd.read_csv(
    DATA_PATH / "training_set_labels.csv",
    index_col="respondent_id"
)

# Features and labels are paired row-by-row (by position) when they are split
# and fitted, so fail loudly if the two files list respondents differently.
np.testing.assert_array_equal(features_df.index.values, labels_df.index.values)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier

from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_curve, roc_auc_score

RANDOM_SEED = 6    # Passed as random_state to train_test_split so the train/eval split is reproducible

In [None]:
# One-hot encode only the 'sex' and 'race' columns; every other column is
# carried over unchanged.
newfeatures_df = pd.get_dummies(data=features_df, columns=["sex", "race"])

In [None]:
# Columns the numeric preprocessing can handle: numeric dtypes plus the
# boolean dummy columns that get_dummies may produce. Selecting these
# explicitly (rather than "anything that is not object") keeps category,
# datetime or dedicated string dtypes from slipping into the scaler.
numeric_cols = newfeatures_df.select_dtypes(include=["number", "bool"]).columns.values
print(numeric_cols)

In [None]:
# Numeric preprocessing expressed as a Pipeline of (step name, transformer)
# pairs, applied in the order listed: scale, then fill missing values with
# the column median.
numeric_preprocessing_steps = Pipeline(
    steps=[
        ("standard_scaler", StandardScaler()),
        ("simple_imputer", SimpleImputer(strategy="median")),
    ]
)

# Preprocessor stage of the final pipeline. Each transformer entry is
# (name, transformer, columns it applies to); columns not listed in any
# entry are dropped.
preprocessor = ColumnTransformer(
    [("numeric", numeric_preprocessing_steps, numeric_cols)],
    remainder="drop",
)

In [None]:
# Wrap an L2-regularised logistic regression in MultiOutputClassifier so it
# can be fitted against more than one label column.
estimators = MultiOutputClassifier(
    estimator=LogisticRegression(C=1, penalty="l2"),
)

In [None]:
# End-to-end model: preprocessing followed by the multi-output classifier.
full_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("estimators", estimators),
    ]
)

In [None]:
# Bare expression: the notebook renders the assembled pipeline as this cell's output.
full_pipeline

In [None]:
# Hold out 33% of the rows for evaluation. The split is shuffled, seeded
# with RANDOM_SEED, and stratified on the label frame.
X_train, X_eval, y_train, y_eval = train_test_split(
    newfeatures_df,
    labels_df,
    test_size=0.33,
    random_state=RANDOM_SEED,
    shuffle=True,
    stratify=labels_df,
)

In [None]:
%%time

# Fit on the training split, then score the evaluation split in one chain
# (fit returns the fitted pipeline). Probabilities are requested because
# this competition wants probabilities, not labels.
preds = full_pipeline.fit(X_train, y_train).predict_proba(X_eval)
preds

In [None]:
# preds is indexed per label (0 and 1 are used for the two vaccine labels);
# label the output with the variable actually being inspected.
print("preds[0].shape", preds[0].shape)
print("preds[1].shape", preds[1].shape)

In [None]:
# Keep only column 1 of each per-label probability array and line the
# results up with the evaluation rows.
y_preds = pd.DataFrame(
    {
        label: label_probas[:, 1]
        for label, label_probas in zip(("h1n1_vaccine", "seasonal_vaccine"), preds)
    },
    index=y_eval.index,
)
print("y_preds.shape:", y_preds.shape)
y_preds.head()

In [None]:
def plot_roc(y_true, y_score, label_name, ax):
    """Draw a ROC curve for one label on ``ax`` and show its AUC in the title.

    Parameters
    ----------
    y_true : true labels, passed straight to ``roc_curve`` / ``roc_auc_score``.
    y_score : predicted scores for the same rows.
    label_name : text used as the prefix of the axes title.
    ax : axes-like object to draw on.

    Returns
    -------
    None; the plot is drawn on ``ax`` as a side effect.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_score)
    auc = roc_auc_score(y_true, y_score)

    ax.plot(false_pos_rate, true_pos_rate)
    # Dashed grey diagonal from (0, 0) to (1, 1) as a visual reference line.
    ax.plot([0, 1], [0, 1], color='grey', linestyle='--')

    ax.set_ylabel('TPR')
    ax.set_xlabel('FPR')
    ax.set_title(f"{label_name}: AUC = {auc:.4f}")

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# One ROC panel per label, side by side.
fig, ax = plt.subplots(1, 2, figsize=(7, 3.5))

for _panel, _label in zip(ax, ('h1n1_vaccine', 'seasonal_vaccine')):
    plot_roc(y_eval[_label], y_preds[_label], _label, ax=_panel)

fig.tight_layout()

In [None]:
# Single summary score computed over both label columns at once, using
# roc_auc_score's default averaging (presumably macro — confirm in sklearn docs).
roc_auc_score(y_eval, y_preds)

In [None]:
%%time 

# Refit the same pipeline object on the entire labelled dataset (no rows
# held out) so the test-set predictions use all available training data.
full_pipeline.fit(newfeatures_df, labels_df)

None   # So we don't print out the whole pipeline representation

In [None]:
test_features_df = pd.read_csv(DATA_PATH / "test_set_features.csv", index_col="respondent_id")

# get_dummies only creates columns for the categories present in the frame it
# is given, so the test frame can end up with fewer or more dummy columns than
# training. Align to the training columns: missing dummies become all-zero
# columns, and columns unseen during training are discarded.
new_test_features_df = pd.get_dummies(
    test_features_df, columns=['sex', 'race']
).reindex(columns=newfeatures_df.columns, fill_value=0)

test_probas = full_pipeline.predict_proba(new_test_features_df)
test_probas

In [None]:
# Load the submission template, indexed by respondent_id, and preview it.
submission_df = pd.read_csv(
    DATA_PATH / "submission_format.csv",
    index_col="respondent_id",
)
submission_df.head()

In [None]:
# The probabilities are written positionally, so the test features and the
# submission template must list respondents in exactly the same order.
np.testing.assert_array_equal(
    new_test_features_df.index.values,
    submission_df.index.values,
)

# Fill each label column with column 1 of its probability array.
for _column, _probas in zip(("h1n1_vaccine", "seasonal_vaccine"), test_probas):
    submission_df[_column] = _probas[:, 1]

submission_df.head()

In [None]:
# to_csv does not create missing parent folders, so make sure the output
# directory exists (a no-op when it is already there).
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
submission_df.to_csv(OUTPUT_PATH / 'submission_1.csv', index=True)