In [12]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm

from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

In [13]:
df = pd.read_csv("data_itc.csv", delimiter=";")
df.head()

Unnamed: 0,bulbasaur,ivysaur,venusaur,charmander,charmeleon,charizard,squirtle,wartortle,blastoise,caterpie,...,sandslash,nidoran-f,nidorina,nidoqueen,nidoran-m,nidorino,nidoking,clefairy,label,date
0,0.004301,0.030108,0.027957,0.032258,0.0,0.11828,0.0,0,0.004301,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.019355,0,9/13/2018
1,0.000762,0.108994,0.012195,0.012957,0.000762,0.03125,0.000762,0,0.0,0.000762,...,0.0,0.0,0.000762,0.0,0.0,0.0,0.619048,0.025915,1,9/13/2018
2,0.0,0.108046,0.011494,0.011494,0.000766,0.036782,0.000766,0,0.0,0.000766,...,0.0,0.0,0.000766,0.0,0.0,0.0,0.5,0.022989,1,9/13/2018
3,0.001815,0.041742,0.00726,0.016334,0.0,0.072595,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00363,0.0,0.55,0.059891,0,9/13/2018
4,0.014362,0.040476,0.01886,0.033367,0.0,0.060061,0.003192,0,0.003772,0.0,...,0.0,0.0,0.011026,0.032258,0.013782,0.001451,0.129032,0.026984,0,9/13/2018


In [14]:
df = df.set_index(pd.DatetimeIndex(df["date"])).drop(columns=["date"])

In [15]:
df["label"].unique()

array([0, 1])

In [16]:
max_month = df.index.month.max()

In [17]:
df.index.month

Int64Index([ 9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
            ...
            12, 12, 12, 12, 12, 12, 12, 12, 12, 12],
           dtype='int64', name='date', length=39240)

# Modelling

In [18]:
train = df.loc[df.index.month < max_month]
test = df.loc[df.index.month >= max_month]

In [19]:
assert len(train) + len(test) == len(df)

In [20]:
X_train, y_train = train.drop(columns=["label"]), train["label"]
X_test, y_test = test.drop(columns=["label"]), test["label"]

In [21]:
X_train.head()

Unnamed: 0_level_0,bulbasaur,ivysaur,venusaur,charmander,charmeleon,charizard,squirtle,wartortle,blastoise,caterpie,...,raichu,sandshrew,sandslash,nidoran-f,nidorina,nidoqueen,nidoran-m,nidorino,nidoking,clefairy
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-09-13,0.004301,0.030108,0.027957,0.032258,0.0,0.11828,0.0,0,0.004301,0.0,...,0.008602,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.019355
2018-09-13,0.000762,0.108994,0.012195,0.012957,0.000762,0.03125,0.000762,0,0.0,0.000762,...,0.004573,0.004573,0.0,0.0,0.000762,0.0,0.0,0.0,0.619048,0.025915
2018-09-13,0.0,0.108046,0.011494,0.011494,0.000766,0.036782,0.000766,0,0.0,0.000766,...,0.004598,0.000766,0.0,0.0,0.000766,0.0,0.0,0.0,0.5,0.022989
2018-09-13,0.001815,0.041742,0.00726,0.016334,0.0,0.072595,0.0,0,0.0,0.0,...,0.001815,0.00363,0.0,0.0,0.0,0.0,0.00363,0.0,0.55,0.059891
2018-09-13,0.014362,0.040476,0.01886,0.033367,0.0,0.060061,0.003192,0,0.003772,0.0,...,0.007109,0.010155,0.0,0.0,0.011026,0.032258,0.013782,0.001451,0.129032,0.026984


In [22]:
xgb_train = xgb.DMatrix(X_train.values, y_train.values)
xgb_test = xgb.DMatrix(X_test.values, y_test.values)

In [23]:
params = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}

In [24]:
xgb_clf = xgb.XGBClassifier(max_depth=2, learning_rate=1, objetive="binary:logistic")
xgb_clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=1, max_delta_step=0,
       max_depth=2, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       objetive='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)

In [25]:
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [26]:
def calculate_metrics(clf, X, y, scoring_func, threshold=0.5):
    probas = clf.predict_proba(X)
    
    y_pred = [0 if proba < threshold else 1 for proba in probas[:, 1]]
    
    assert len(y) == len(y_pred)
    
    conf_mat = confusion_matrix(y, y_pred, labels=clf.classes_)
    f1 = f1_score(y, y_pred)
    precision = precision_score(y, y_pred)
    recall = recall_score(y, y_pred)
    
    tp = conf_mat[0, 0]
    fp = conf_mat[0, 1]
    scoring_func_res = scoring_func(tp, fp)
    
    return {"total_samples": len(X), "f1_score": f1, "precision": precision, "recall": recall, "score": scoring_func_res}

In [27]:
def scoring_func(tp, fp):
    return tp / (6 * fp + tp)

In [28]:
calculate_metrics(xgb_clf, X_test, y_test, scoring_func)

{'total_samples': 4796,
 'f1_score': 0.9948431456811345,
 'precision': 0.9931359931359931,
 'recall': 0.9965561773568661,
 'score': 0.9623971797884842}

In [29]:
calculate_metrics(rf_clf, X_test, y_test, scoring_func)

{'total_samples': 4796,
 'f1_score': 0.9982773471145563,
 'precision': 0.998707453683757,
 'recall': 0.9978476108480413,
 'score': 0.9927652733118971}

In [30]:
metrics_gb = {thresh: calculate_metrics(xgb_clf, X_test, y_test, scoring_func, thresh) for thresh in np.arange(0, 1, 0.1)}

In [31]:
metrics_rf = {thresh: calculate_metrics(rf_clf, X_test, y_test, scoring_func, thresh) for thresh in np.arange(0, 1, 0.1)}

In [32]:
max_thresh_gb = max(metrics_gb, key = lambda x: metrics_gb[x]["score"])
maximized_gb = metrics_gb[max_thresh_gb]
maximized_gb

{'total_samples': 4796,
 'f1_score': 0.994606256742179,
 'precision': 0.9969723183391004,
 'recall': 0.9922513990529488,
 'score': 0.9832535885167464}

In [33]:
max_thresh_rf = max(metrics_rf, key = lambda x: metrics_rf[x]["score"])
maximized_rf = metrics_rf[max_thresh_rf]
maximized_rf

{'total_samples': 4796,
 'f1_score': 0.9963275005400735,
 'precision': 1.0,
 'recall': 0.9926818768833405,
 'score': 1.0}

In [34]:
class ThresholdClassifier:
    
    def __init__(self, clf, threshold):
        self.clf = clf
        self.threshold = threshold
        
    def predict(self, X):
        probas = self.clf.predict_proba(X)
        preds = [0 if proba < self.threshold else 1 for proba in probas[:, 1]]
        return (preds, probas)
    
    def precision(self, X, y):
        predictions, _ = self.predict(X)
        return precision_score(y, predictions)

In [23]:
test_thr_clf = ThresholdClassifier(xgb_clf, max_thresh_gb)

In [40]:
X_test.index.value_counts()

2018-12-04    436
2018-12-08    436
2018-12-03    436
2018-12-07    436
2018-12-11    436
2018-12-02    436
2018-12-06    436
2018-12-10    436
2018-12-01    436
2018-12-05    436
2018-12-09    436
Name: date, dtype: int64

# Bokeh

In [25]:
from bokeh.io import output_notebook, show
from bokeh.plotting import figure

from bokeh.io import curdoc
from bokeh.layouts import row, widgetbox
from bokeh.models import ColumnDataSource
from bokeh.models.widgets import DateRangeSlider, TextInput

In [26]:
output_notebook()

In [27]:
plot = figure(plot_height=400, plot_width=400, title="Precision and recall over time", x_axis_type="datetime")

In [44]:
vis_x_axis = X_test
min_date, max_date = vis_x_axis.index.min(), vis_x_axis.index.max()

In [45]:
time_period_slider = DateRangeSlider(title="Time Period", value=(min_date, max_date), start=min_date, end=max_date, step=1)
inputs = widgetbox(time_period_slider)

In [46]:
curdoc().add_root(time_period_slider)
curdoc().add_root(plot)

In [48]:
source = ColumnDataSource(data=dict(x=vis_x_axis, y=precisions))

In [71]:
def update(attr, old, new):
    start, end = time_period_slider.value
    print(start, end)

In [72]:
time_period_slider.on_change('value', update)

In [73]:
plot.line('x', 'y', source=source)
plot.xaxis.axis_label="Date"
plot.yaxis.axis_label="Precision"
show(inputs)
show(plot)

You are generating standalone HTML/JS output, but trying to use real Python
callbacks (i.e. with on_change or on_event). This combination cannot work.

Only JavaScript callbacks may be used with standalone output. For more
information on JavaScript callbacks with Bokeh, see:

    http://bokeh.pydata.org/en/latest/docs/user_guide/interaction/callbacks.html

Alternatively, to use real Python callbacks, a Bokeh server application may
be used. For more information on building and running Bokeh applications, see:

    http://bokeh.pydata.org/en/latest/docs/user_guide/server.html

