In [1]:
import os
import unittest

import numpy as np
import pandas as pd

from sklearn.metrics import precision_score, recall_score, roc_auc_score, accuracy_score, mean_absolute_error, \
    mean_squared_error
from sklearn.ensemble import BaggingClassifier, BaggingRegressor, RandomForestClassifier, RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.utils import indices_to_mask

from mlfinlab.util.utils import get_daily_vol
from mlfinlab.filters.filters import cusum_filter
from mlfinlab.ensemble.sb_bagging import SequentiallyBootstrappedBaggingClassifier, \
    SequentiallyBootstrappedBaggingRegressor
from mlfinlab.labeling.labeling import get_events, add_vertical_barrier, get_bins
from mlfinlab.sampling.bootstrapping import seq_bootstrap, get_ind_matrix, get_ind_mat_average_uniqueness, get_ind_mat_label_uniqueness

In [2]:
# Load the sample bars; the 'date_time' column becomes the index and column 0 is parsed as dates.
data = pd.read_csv(
    'mlfinlab/tests/test_data/dollar_bar_sample.csv',
    index_col='date_time',
    parse_dates=[0],
)

In [3]:
# Rolling-window lengths (in rows) for the fast and slow moving averages of the close.
fast_window = 20
slow_window = 50

In [4]:
# Rolling means of the close; min_periods equals the window, so a value only
# appears once a full window of rows is available.
for column, window in (('fast_mavg', fast_window), ('slow_mavg', slow_window)):
    data[column] = data['close'].rolling(window=window, min_periods=window, center=False).mean()

In [5]:
# Compute sides: +1 where the fast average is at or above the slow one, -1 where it is below.
# Rows where either average is still NaN match neither mask and keep side = NaN.
data['side'] = np.nan

long_signals = data['fast_mavg'] >= data['slow_mavg']
short_signals = data['fast_mavg'] < data['slow_mavg']
data.loc[long_signals, 'side'] = 1
data.loc[short_signals, 'side'] = -1

# Remove look-ahead bias by lagging the signal one row
data['side'] = data['side'].shift(1)

# Target passed to get_events: half of the estimate returned by get_daily_vol.
daily_vol = get_daily_vol(close=data['close'], lookback=50) * 0.5
# Event timestamps come from the CUSUM filter applied to the close series.
cusum_events = cusum_filter(data['close'], threshold=0.001)
# NOTE(review): num_hours=2 presumably places the vertical barrier two hours after each event -- confirm against add_vertical_barrier.
vertical_barriers = add_vertical_barrier(t_events=cusum_events, close=data['close'],
                                         num_hours=2)
# The lagged side series is supplied as side_prediction.
# NOTE(review): presumably this is what makes the events meta-labeled -- confirm against get_events.
meta_labeled_events = get_events(close=data['close'],
                                      t_events=cusum_events,
                                      pt_sl=[1, 4],
                                      target=daily_vol,
                                      min_ret=0.0000005,
                                      num_threads=3,
                                      vertical_barrier_times=vertical_barriers,
                                      side_prediction=data['side'])

# Drop events with any missing field (in place) before deriving labels from them.
meta_labeled_events.dropna(inplace=True)
labels = get_bins(meta_labeled_events, data['close'])

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  target = target.loc[t_events]
2019-08-28 20:53:42.901578 100.0% apply_pt_sl_on_t1 done after 0.01 minutes. Remaining 0.0 minutes..


In [6]:
# Indicator matrix built from the events' end times (t1) and the close series.
# NOTE(review): the later ind_mat[:, unique_samples] slice suggests columns correspond to labels -- confirm against get_ind_matrix.
ind_mat = get_ind_matrix(meta_labeled_events.t1, data.close)

In [7]:
# Keep the positions of labels whose mean positive uniqueness is either above
# uniqueness_thresh or below bad_unq; labels in between are discarded.
# Note that despite the name, unique_samples therefore also contains the
# lowest-uniqueness labels.
uniqueness_thresh = 0.4
bad_unq = 0.1
unique_samples = []
for label_idx, label in enumerate(get_ind_mat_label_uniqueness(ind_mat)):
    # Average only over the strictly positive entries; computed once per label.
    mean_uniqueness = np.mean(label[label > 0])
    if mean_uniqueness > uniqueness_thresh or mean_uniqueness < bad_unq:
        unique_samples.append(label_idx)

In [8]:
# Number of labels kept by the uniqueness filter.
len(unique_samples)

119

In [9]:
# Average uniqueness computed on the kept columns of the indicator matrix only.
get_ind_mat_average_uniqueness(ind_mat[:, unique_samples])

  uniqueness = ind_mat.T / concurrency
  uniqueness = ind_mat.T / concurrency
  avg_uniqueness = uniqueness[uniqueness > 0].mean()


0.3161925601750547

In [10]:
# Rows of `data` at the label timestamps, narrowed to the kept positions, minus any row containing NaN.
X = data.loc[labels.index, ].iloc[unique_samples].dropna()
# Re-align labels to the surviving rows. This rebinds `labels`, so re-running the
# cell operates on the already-filtered labels rather than the originals.
labels = labels.loc[X.index, :]

In [11]:
# Sanity check: X and labels should have the same number of rows.
X.shape, labels.shape

((119, 10), (119, 4))

In [12]:
# Attach labels.bin to X as column 'y', aligned on the index.
X.loc[labels.index, 'y'] = labels.bin

In [13]:
# Class balance of the labels.
labels.bin.value_counts()

0    72
1    47
Name: bin, dtype: int64

In [14]:
# Shape of X after adding the 'y' column.
X.shape

(119, 11)

In [15]:
# generate superfeatures
def _generate_label_with_prob(x, prob):
    rs = np.random.RandomState(1)
    random_labels = []   
    choice = np.random.choice([0, 1],  p = [1-prob, prob])
    if choice == 1:
        return x
    else:
        return int(not x)

In [16]:
# Noisy copies of the label: column 'label_prob_<p>' holds y with probability p
# and the flipped label otherwise.
# Series.items() is used because Series.iteritems() was deprecated in pandas 1.5
# and removed in pandas 2.0. Draws happen in the same per-row order as before.
for index, value in X.y.items():
    for prob in (0.5, 0.3, 0.2, 0.1):
        X.loc[index, 'label_prob_{}'.format(prob)] = _generate_label_with_prob(value, prob)

In [17]:
# Value distribution of the p = 0.5 noisy label column.
X['label_prob_0.5'].value_counts()

0.0    60
1.0    59
Name: label_prob_0.5, dtype: int64

In [18]:
# Value distribution of the true label, for comparison with the noisy column.
X.y.value_counts()

0    72
1    47
Name: y, dtype: int64

In [19]:
# Model inputs: rolling means over 2, 5 and 10 rows of the 0.5 / 0.3 / 0.2 noisy
# label columns. The 0.1 column is not used as a feature source.
# (The previous initial list assignment was overwritten immediately and has been removed.)
features = []
for prob in [0.5, 0.3, 0.2]:
    source_col = 'label_prob_{}'.format(prob)
    for window in [2, 5, 10]:
        feature_name = '{}_sma_{}'.format(source_col, window)
        X[feature_name] = X[source_col].rolling(window=window).mean()
        features.append(feature_name)

In [20]:
# Drop rows with NaN in any column, in place (the rolling means leave NaN in their first rows).
X.dropna(inplace=True)

In [21]:
# Split the target off X. pop() removes 'y' from X in place, so this cell
# cannot be re-run without rebuilding X first.
y = X.pop('y')

In [22]:
# Number of rows left after dropping NaNs.
X.shape[0]

110

In [23]:
# Positional split: the first 70 rows train, the rest test; only the `features` columns are model inputs.
X_train, X_test = X.iloc[:70][features], X.iloc[70:][features]
y_train, y_test = y.iloc[:70], y.iloc[70:]

In [24]:
# Inputs for the sequentially bootstrapped bagger: the end time (t1) of every
# training event and the close prices spanning the training index range.
end_times = meta_labeled_events.loc[X_train.index, 't1']
train_start, train_end = X_train.index.min(), X_train.index.max()
price_bars_trim = data.loc[(data.index >= train_start) & (data.index <= train_end), 'close']

# Baseline: a standard 100-tree random forest with out-of-bag scoring.
clf = RandomForestClassifier(n_estimators=100, oob_score=True, class_weight='balanced_subsample')

# Base learner for the bagger: a one-tree forest using entropy, fit without bootstrap resampling.
clf_base = RandomForestClassifier(n_estimators=1, criterion='entropy', bootstrap=False,
                                  class_weight='balanced_subsample')

clf_2 = SequentiallyBootstrappedBaggingClassifier(
    base_estimator=clf_base,
    n_estimators=100,
    oob_score=True,
    events_end_times=end_times,
    price_bars=price_bars_trim,
)

# Fit order is kept (baseline first) so both models consume random state in the same sequence.
clf.fit(X_train, y_train)
clf_2.fit(X_train, y_train)

SequentiallyBootstrappedBaggingClassifier(base_estimator=RandomForestClassifier(bootstrap=False,
                                                                                class_weight='balanced_subsample',
                                                                                criterion='entropy',
                                                                                max_depth=None,
                                                                                max_features='auto',
                                                                                max_leaf_nodes=None,
                                                                                min_impurity_decrease=0.0,
                                                                                min_impurity_split=None,
                                                                                min_samples_leaf=1,
                                                                             

In [25]:
# Out-of-bag score of the baseline random forest.
clf.oob_score_

0.6

In [26]:
# Out-of-bag score of the sequentially bootstrapped bagging classifier.
clf_2.oob_score_

0.9857142857142858

In [27]:
from sklearn.metrics import accuracy_score, f1_score

In [28]:
# Test-set (accuracy, F1) for the baseline random forest.
accuracy_score(y_test, clf.predict(X_test)), f1_score(y_test, clf.predict(X_test))

(0.575, 0.41379310344827586)

In [29]:
# Test-set (accuracy, F1) for the sequentially bootstrapped bagging classifier.
accuracy_score(y_test, clf_2.predict(X_test)), f1_score(y_test, clf_2.predict(X_test))

(0.675, 0.6060606060606061)