In [9]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
import numpy as np
import scipy.sparse
import xgboost as xgb

In [11]:
import pandas as pd
import numpy as np
import sys
from sklearn.metrics import roc_auc_score, precision_recall_fscore_support
from sklearn.pipeline import Pipeline, FeatureUnion
from time import time
import pickle
import os
from sys import argv
import itertools

In [44]:
import EncoderFactory
import ClassifierFactory
from DatasetManager import DatasetManager

In [45]:
dataset_name = "production"

In [46]:
methods = ["static", "last"]

In [47]:
train_ratio = 0.8
val_ratio = 0.2
random_state = 22
fillna = True
n_min_cases_in_bucket = 30

In [48]:
cls_encoder_args = {'case_id_col':dataset_manager.case_id_col, 
                    'static_cat_cols':dataset_manager.static_cat_cols,
                    'static_num_cols':dataset_manager.static_num_cols, 
                    'dynamic_cat_cols':dataset_manager.dynamic_cat_cols,
                    'dynamic_num_cols':dataset_manager.dynamic_num_cols, 
                    'fillna':fillna}

In [49]:
dataset_manager = DatasetManager(dataset_name)
data = dataset_manager.read_dataset()
train, test = dataset_manager.split_data_strict(data, train_ratio)
train, val = dataset_manager.split_val(train, val_ratio)
max_prefix_length = min(40, dataset_manager.get_pos_case_length_quantile(data, 0.90))
min_prefix_length = 1

In [50]:
dt_train_prefixes = dataset_manager.generate_prefix_data(train, min_prefix_length, max_prefix_length)
dt_val_prefixes = dataset_manager.generate_prefix_data(val, min_prefix_length, max_prefix_length)
dt_test_prefixes = dataset_manager.generate_prefix_data(test, min_prefix_length, max_prefix_length)

In [51]:
max_depth_values = [3, 4, 5, 6, 7, 8, 9]
min_child_weight_values = [1, 2, 3]

In [56]:
for i in range(16):
    n_estimators = np.random.randint(150, 1000)
    learning_rate = np.random.uniform(0.01, 0.07)
    subsample = np.random.uniform(0.3, 0.7)
    max_depth = max_depth_values[np.random.randint(0, len(max_depth_values))]
    colsample_bytree = np.random.uniform(0.5, 0.45)
    min_child_weight = min_child_weight_values[np.random.randint(0, len(min_child_weight_values))]

    params = {'n_estimators': n_estimators,
             'learning_rate': learning_rate,
             'subsample': subsample,
             'max_depth': max_depth,
             'colsample_bytree': colsample_bytree,
             'min_child_weight': min_child_weight}

    cls = xgb.XGBClassifier(objective = 'binary:logistic', **params)

    train_y = dataset_manager.get_label_numeric(dt_train_prefixes)

    feature_combiner = FeatureUnion([(method, EncoderFactory.get_encoder(method, **cls_encoder_args)) for method in methods])
    pipeline = Pipeline([('encoder', feature_combiner), ('cls', cls)])
    pipeline.fit(dt_train_prefixes, train_y)

    preds = pipeline.predict_proba(dt_val_prefixes)
    val_y = dataset_manager.get_label_numeric(dt_val_prefixes)
    auc = roc_auc_score(val_y, preds[:,1])
    print("AUC = %s, n_estimators = %s, learning_rate = %s, subsample = %s, max_depth = %s, colsample_bytree = %s, min_child_weight = %s" % (auc, n_estimators, learning_rate, subsample, max_depth, colsample_bytree, min_child_weight))

AUC = 0.690806878307, n_estimators = 309, learning_rate = 0.031053916150034826, subsample = 0.5624783425918776, max_depth = 7, colsample_bytree = 0.4639186783251802, min_child_weight = 2
AUC = 0.668113425926, n_estimators = 647, learning_rate = 0.03328721463087977, subsample = 0.5654068561074812, max_depth = 6, colsample_bytree = 0.46120041241958887, min_child_weight = 1
AUC = 0.675925925926, n_estimators = 822, learning_rate = 0.03785990558578823, subsample = 0.3900011653438886, max_depth = 6, colsample_bytree = 0.4775153061223786, min_child_weight = 3
AUC = 0.687045304233, n_estimators = 603, learning_rate = 0.02617238508763179, subsample = 0.553917856720791, max_depth = 6, colsample_bytree = 0.45095667550283736, min_child_weight = 3
AUC = 0.688616071429, n_estimators = 531, learning_rate = 0.0454157059761846, subsample = 0.6156043071490778, max_depth = 3, colsample_bytree = 0.4537116779207621, min_child_weight = 1
AUC = 0.752149470899, n_estimators = 214, learning_rate = 0.063506847