1.  split the preprocessed data into train and validation sets
2.  fit a decision tree on the train data for different tree depths, predict probabilities and measure model performance with the ROC AUC score
3.  fit a decision tree model with the optimal depth on the complete processed dataset and apply it to the test set
4.  load the submission dataset, align the predicted probabilities with it and save the submission dataset






In [1]:
import re
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
import time
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split


In [2]:
# Both data folders live under "<parent of the working directory>/data".
data_path_raw = Path.cwd().parent.joinpath("data", "raw")
data_path_preprocessed = Path.cwd().parent.joinpath("data", "processed")


In [3]:
# Load the three preprocessed tables; each one is indexed by respondent_id.
processed_df = pd.read_csv(
    data_path_preprocessed.joinpath(
        "training_set_features__nominal_ordinal_WOE_Impute_Dropped_balanced.csv"
    ),
    index_col="respondent_id",
)
labels_df_balanced = pd.read_csv(
    data_path_preprocessed.joinpath("training_set_labels__balanced.csv"),
    index_col="respondent_id",
)
test_df_processed = pd.read_csv(
    data_path_preprocessed.joinpath(
        "test_set_features_nominal_ordinal_WOE_Impute_Dropped.csv"
    ),
    index_col="respondent_id",
)


In [4]:
## 1 split preprocessed data into train and validation sets

# 25 % of the rows are held out for validation; the fixed seed keeps the
# shuffled split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    processed_df, labels_df_balanced,
    test_size=0.25, shuffle=True, random_state=10,
)

In [5]:
## 2 fit train data with decision tree, predict probabilities and calculate model performance with ROC with different tree depths

# One record per tree depth. The records are gathered in a plain list and the
# DataFrame is built once after the loop: DataFrame.append was removed in
# pandas 2.0, and appending inside the loop re-copied the frame every time.
depth_records = []

for depth in range(1, 40):
    start = time.time()
    # NOTE(review): no random_state is passed, so repeated runs may produce
    # slightly different trees and scores.
    dec_tree = DecisionTreeClassifier(max_depth=depth)

    dec_tree.fit(X_train, y_train)
    test_probability = dec_tree.predict_proba(X_test)

    # predict_proba is indexed per target here: entry 0 is used for
    # h1n1_vaccine and entry 1 for seasonal_vaccine. Column 1 of each entry is
    # taken as the probability of the positive class (assumes 0/1 labels —
    # TODO confirm).
    y_preds = pd.DataFrame(
        {
            "h1n1_vaccine": test_probability[0][:, 1],
            "seasonal_vaccine": test_probability[1][:, 1],
        },
        index=y_test.index,
    )

    # The measured time covers fitting, predicting and assembling y_preds,
    # but not the ROC AUC computation below.
    end = time.time()

    depth_records.append(
        {
            'depth': depth,
            'time': end - start,
            'ROC': roc_auc_score(y_test, y_preds),
        }
    )

# Passing the column list keeps the column order (and the columns themselves,
# should the list ever be empty) identical to the original empty frame.
ROC_Depths = pd.DataFrame(depth_records, columns=['depth', 'time', 'ROC'])

In [6]:
## 2 display the depth / runtime / ROC table collected in the loop above
ROC_Depths

Unnamed: 0,depth,time,ROC
0,1.0,0.071182,0.618232
1,2.0,0.10282,0.721392
2,3.0,0.145293,0.766274
3,4.0,0.192578,0.799659
4,5.0,0.235199,0.821589
5,6.0,0.283308,0.839198
6,7.0,0.332391,0.85156
7,8.0,0.363702,0.861872
8,9.0,0.415894,0.867903
9,10.0,0.446055,0.870821


In [7]:
## 3 fit decision tree model with optimal depth on complete processed dataset and apply on test set

# Train on every available row this time (no hold-out) with the chosen depth.
dec_tree = DecisionTreeClassifier(max_depth=10).fit(processed_df, labels_df_balanced)
test_probability = dec_tree.predict_proba(test_df_processed)

# Entry 0 / 1 of test_probability feed the h1n1 / seasonal columns; column 1
# of each entry is used as the predicted probability.
y_preds = pd.DataFrame(
    {
        target: test_probability[position][:, 1]
        for position, target in enumerate(("h1n1_vaccine", "seasonal_vaccine"))
    },
    index=test_df_processed.index,
)

In [8]:
## 4 load submission dataset, sort probabilities accordingly and save submission dataset

# The submission template is indexed by respondent_id, like the feature tables.
submission_df = pd.read_csv(
    data_path_raw.joinpath("submission_format.csv"),
    index_col="respondent_id",
)

In [9]:
## 4 make sure the test features and the submission template list the same respondents in the same order

# Raises AssertionError if the two indexes differ.
np.testing.assert_array_equal(
    test_df_processed.index.values,
    submission_df.index.values,
)

In [10]:
## 4 copy the predicted probabilities into the submission template

# Column-wise assignment from y_preds; pandas aligns the values on the index.
for target in ("h1n1_vaccine", "seasonal_vaccine"):
    submission_df[target] = y_preds[target]

In [11]:
## 4 save the submission dataset
output_path = Path.cwd().parent / "models" / "submissions"

# Create the target folder (and any missing parents) first: to_csv does not
# create directories and would fail on a fresh checkout without this folder.
output_path.mkdir(parents=True, exist_ok=True)

# index=True keeps respondent_id as the first column of the written file.
submission_df.to_csv(output_path / 'DecTree_balanced_data.csv', index=True)

In [None]:
# Submission score on DrivenData: 0.8094, clearly below the 0.8708 validation ROC at depth 10 -> the decision tree overfitted