# Ensemble methods

In data science, _bootstrap aggregation (bagging)_ is one of the most useful techniques for getting better performance out of limited data. The advantage of bagging lies in its ability to reduce forecast variance and thus help prevent overfitting. However, financial observations cannot simply be assumed to be IID. If we don't address that issue carefully, we cannot take full advantage of the benefits of bagging. With the mlfinlab package, we can handle the bagging process in finance by combining [sklearn](https://scikit-learn.org)'s `BaggingClassifier/Regressor` with [seq_bootstrap](https://mlfinlab.readthedocs.io/en/latest/implementations/sampling.html#mlfinlab.sampling.bootstrapping.seq_bootstrap).


Let's see how to use the _random forest_ algorithm and _sequential bootstrapping_ in _mlfinlab_.

In [1]:
import mlfinlab as ml

import numpy as np
import pandas as pd



## Data preparation

In [2]:
# Load the first 40,000 dollar bars and index them by their bar timestamp
data = pd.read_csv('dollar_bars.csv', nrows=40000)
timestamps = pd.to_datetime(data.pop('date_time'))
data = data.set_index(timestamps)

# Restrict the sample to the first half of 2016
data = data.loc['2016-01-01':'2016-06-30']
data

Unnamed: 0_level_0,open,high,low,close,cum_vol,cum_dollar,cum_ticks
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-01-04 01:17:36.863,2036.50,2043.50,2034.75,2040.50,34337,70002505.75,5587
2016-01-04 03:26:50.833,2040.50,2041.75,2025.75,2031.25,34443,70003995.50,8520
2016-01-04 06:17:16.453,2031.25,2032.75,2016.00,2016.00,34654,70077289.00,9943
2016-01-04 08:07:12.320,2016.25,2019.25,2007.50,2007.75,34800,70063062.25,7853
2016-01-04 08:47:15.283,2007.75,2007.75,2001.00,2005.25,34939,70026656.00,7407
...,...,...,...,...,...,...,...
2016-06-30 20:00:03.168,2090.50,2090.75,2087.25,2089.00,33527,70036244.50,1495
2016-06-30 20:00:54.844,2089.00,2089.25,2087.25,2087.75,33593,70148391.25,1935
2016-06-30 20:05:54.542,2087.50,2087.75,2086.25,2087.00,33725,70377246.25,2538
2016-06-30 20:13:58.349,2087.00,2089.25,2086.25,2088.50,33544,70033735.00,2612


### Make meta-labels

Let's calculate the RSI and Bollinger bands for side prediction.

In [3]:
# Compute RSI
def relative_strength_index(df, n):
    """Calculate a Relative Strength Index (RSI) style oscillator for given data.

    Adapted from
    https://github.com/Crypto-toolbox/pandas-technical-indicators/blob/master/technical_indicators.py

    This variant is driven by bar-to-bar moves of the ``high`` and ``low``
    columns rather than by close-to-close changes: for every bar the rise in
    ``high`` and the fall in ``low`` are compared, the larger one (if
    positive) is kept and the other is set to zero. Both move series are
    smoothed with an exponentially weighted mean and combined as
    ``100 * up / (up + down)``, rounded to the nearest integer value.

    Rows are processed by position, so the index labels of ``df`` are
    irrelevant (no 0..n-1 integer index is required) and an empty frame
    yields an empty result instead of raising.

    :param df: pandas.DataFrame with ``high`` and ``low`` columns.
    :param n: span and minimum number of observations of the exponentially
        weighted means; the first ``n - 1`` results are NaN.
    :return: pandas.Series named ``RSI_<n>`` on a default integer index, one
        value per row of ``df``. A value is NaN wherever both smoothed moves
        are zero.
    """
    name = 'RSI_' + str(n)
    high = df['high'].to_numpy(dtype=float)
    low = df['low'].to_numpy(dtype=float)
    if high.size == 0:
        return pd.Series([], dtype=float, name=name)

    up_move = np.diff(high)      # high[i + 1] - high[i]
    down_move = -np.diff(low)    # low[i] - low[i + 1]

    # Keep only the dominant, strictly positive move of each bar
    up = np.where((up_move > down_move) & (up_move > 0), up_move, 0.0)
    down = np.where((down_move > up_move) & (down_move > 0), down_move, 0.0)

    # The first bar has no predecessor, so both of its moves are zero
    up = pd.Series(np.concatenate(([0.0], up)))
    down = pd.Series(np.concatenate(([0.0], down)))

    pos_di = up.ewm(span=n, min_periods=n).mean()
    neg_di = down.ewm(span=n, min_periods=n).mean()
    return (pos_di * 100. / (pos_di + neg_di)).round().rename(name)

def get_rsi(data, window=14):
    """Return the RSI of ``data`` aligned to the original index of ``data``.

    The frame is copied and re-indexed to 0..n-1 before being handed to
    ``relative_strength_index``; the resulting values are then re-attached
    to ``data.index``.

    :param data: pandas.DataFrame with ``high`` and ``low`` columns.
    :param window: span forwarded to ``relative_strength_index``.
    :return: pandas.Series of RSI values indexed like ``data``.
    """
    positional = data.copy(deep=True).reset_index()
    rsi_values = relative_strength_index(positional, window).values
    return pd.Series(data=rsi_values, index=data.index)


def bbands(close_prices, window, no_of_stdev):
    """Build exponentially weighted Bollinger-style bands around a price series.

    :param close_prices: pandas.Series of prices.
    :param window: span used for both the exponentially weighted mean and
        the exponentially weighted standard deviation.
    :param no_of_stdev: half-width of the band, in multiples of the
        exponentially weighted standard deviation.
    :return: tuple ``(mean, upper_band, lower_band)`` of pandas.Series.
    """
    weighted = close_prices.ewm(span=window)
    centre = weighted.mean()
    half_width = weighted.std() * no_of_stdev
    return centre, centre + half_width, centre - half_width

In [4]:
# Bollinger bands on the close price: 50-bar EWM span, 1.5 standard deviations wide
window = 50
avg_band, upper_band, lower_band = bbands(data['close'], window, no_of_stdev=1.5)
data['avg'] = avg_band
data['upper'] = upper_band
data['lower'] = lower_band
data.sample(10)

Unnamed: 0_level_0,open,high,low,close,cum_vol,cum_dollar,cum_ticks,avg,upper,lower
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2016-06-06 19:55:54.770,2109.75,2109.75,2107.75,2108.75,33385,70394214.0,2749,2103.598324,2112.991362,2094.205286
2016-06-02 13:38:48.661,2090.75,2093.0,2090.25,2092.5,33468,70002268.75,3664,2093.50503,2098.691578,2088.318482
2016-04-29 13:47:47.706,2063.75,2065.75,2061.0,2062.0,33989,70115080.75,4038,2073.318252,2086.512625,2060.123878
2016-01-25 18:10:00.893,1889.5,1895.0,1888.0,1894.0,37005,70000155.0,5526,1890.166744,1900.89209,1879.441397
2016-02-24 20:23:40.438,1926.75,1928.0,1925.25,1925.75,36331,70000420.0,4198,1909.042127,1927.300922,1890.783331
2016-02-08 14:41:13.169,1842.0,1844.75,1841.0,1844.75,38020,70054715.25,5135,1868.609889,1895.789249,1841.430529
2016-01-28 12:07:52.417,1878.25,1878.5,1872.0,1874.0,37346,70035667.5,7917,1883.486716,1899.585584,1867.387847
2016-02-11 16:24:08.067,1814.75,1818.75,1812.25,1812.5,38549,70005751.25,5426,1827.744991,1852.732962,1802.757021
2016-05-20 00:37:06.745,2038.25,2042.0,2037.25,2042.0,34343,70026738.75,4510,2033.63747,2042.817151,2024.45779
2016-05-04 15:22:31.659,2045.5,2046.0,2042.0,2042.75,34267,70039705.25,4328,2050.794783,2060.959481,2040.630086


In [5]:
# 14-period RSI; get_rsi already returns values in the row order of `data`
rsi_df = get_rsi(data, window=14)
data['rsi'] = rsi_df.values

# Discard every row that still contains a NaN (indicator warm-up period)
data.dropna(inplace=True)

In [6]:
# Compute sides: go long at or below the lower band, short at or above the upper band
long_signals = data['close'] <= data['lower']
short_signals = data['close'] >= data['upper']

# The short condition is listed first so that it wins if both conditions hold;
# rows matching neither condition stay NaN
data['side'] = np.select([short_signals, long_signals], [-1, 1], default=np.nan)

print(data['side'].value_counts())

# Remove look-ahead bias by lagging the signal one bar
data['side'] = data['side'].shift(1)

 1.0    349
-1.0    287
Name: side, dtype: int64


In [7]:
# Keep an independent copy of every bar (including bars without a side)
raw_data = data.copy()

# From here on `data` only holds fully populated rows, i.e. bars with a side
data.dropna(inplace=True)
print(data['side'].value_counts())

 1.0    349
-1.0    287
Name: side, dtype: int64


In [8]:
# Compute daily volatility
# NOTE(review): `lookback=50` is presumably the span of the volatility estimate — confirm in mlfinlab docs
daily_vol = ml.util.get_daily_vol(close=data['close'], lookback=50)

# Apply Symmetric CUSUM Filter and get timestamps for events
# Note: Only the CUSUM filter needs a point estimate for volatility
# (here: 10% of the mean daily volatility)
cusum_events = ml.filters.cusum_filter(data['close'], threshold=daily_vol.mean() * 0.1)

# Compute vertical barrier
# NOTE(review): presumably caps each event's holding period at one day — confirm in mlfinlab docs
vertical_barriers = ml.labeling.add_vertical_barrier(t_events=cusum_events, close=data['close'], num_days=1)

# NOTE(review): presumably [profit-taking, stop-loss] multiples of `target`, with 0 disabling
# that barrier — confirm against mlfinlab's get_events documentation
pt_sl = [0, 2]
# NOTE(review): presumably the minimum target return for an event to be kept — confirm
min_ret = 0.0005

# Passing `side_prediction` supplies the band-based side computed above to the labelling step
triple_barrier_events = ml.labeling.get_events(close=data['close'],
                                               t_events=cusum_events,
                                               pt_sl=pt_sl,
                                               target=daily_vol,
                                               min_ret=min_ret,
                                               num_threads=2,
                                               vertical_barrier_times=vertical_barriers,
                                               side_prediction=data['side'])
# `labels['bin']` is used below as the classification target
labels = ml.labeling.get_bins(triple_barrier_events, data['close'])

2020-04-13 12:52:15.124852 100.0% apply_pt_sl_on_t1 done after 0.0 minutes. Remaining 0.0 minutes.


### Make Features

In [9]:
# Log returns of the close price
raw_data['log_ret'] = np.log(raw_data['close']).diff()

# Momentum: simple returns over the last 1 to 5 bars
for lag in range(1, 6):
    raw_data['mom{}'.format(lag)] = raw_data['close'].pct_change(periods=lag)

# Volatility: rolling standard deviation of log returns over full 50-bar windows
window_stdev = 50
raw_data['volatility'] = raw_data['log_ret'].rolling(window=window_stdev, min_periods=window_stdev, center=False).std()

# Serial correlation at lags 1 to 5 over full 50-bar windows (takes about 4 minutes)
window_autocorr = 50
rolling_log_ret = raw_data['log_ret'].rolling(window=window_autocorr, min_periods=window_autocorr, center=False)
for lag in range(1, 6):
    raw_data['autocorr_{}'.format(lag)] = rolling_log_ret.apply(lambda x, lag=lag: x.autocorr(lag=lag), raw=False)

# Lagged log returns (t-1 ... t-5)
for lag in range(1, 6):
    raw_data['log_t{}'.format(lag)] = raw_data['log_ret'].shift(lag)

# Fast and slow simple moving averages of the close price
fast_window = 7
slow_window = 15

close_prices = raw_data['close']
raw_data['fast_mavg'] = close_prices.rolling(window=fast_window, min_periods=fast_window, center=False).mean()
raw_data['slow_mavg'] = close_prices.rolling(window=slow_window, min_periods=slow_window, center=False).mean()


In [10]:
# Trend signal: +1 while the fast average is at or above the slow one, -1 while below,
# NaN while either average is still undefined
long_signals = raw_data['fast_mavg'] >= raw_data['slow_mavg']
short_signals = raw_data['fast_mavg'] < raw_data['slow_mavg']
raw_data['sma'] = np.select([long_signals, short_signals], [1, -1], default=np.nan)

In [11]:
# Recompute the (un-lagged) band-based side on the full bar set
long_signals = raw_data['close'] <= raw_data['lower']
short_signals = raw_data['close'] >= raw_data['upper']

# The short condition is listed first so that it wins if both conditions hold;
# rows matching neither condition stay NaN
raw_data['side'] = np.select([short_signals, long_signals], [-1, 1], default=np.nan)

In [12]:
# Remove look ahead bias: lag every column by one bar, so the row stamped t
# only holds values that were available at the previous bar (first row becomes NaN)
raw_data = raw_data.shift(1)
raw_data.head()

Unnamed: 0_level_0,open,high,low,close,cum_vol,cum_dollar,cum_ticks,avg,upper,lower,...,autocorr_4,autocorr_5,log_t1,log_t2,log_t3,log_t4,log_t5,fast_mavg,slow_mavg,sma
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-04 14:40:05.559,,,,,,,,,,,...,,,,,,,,,,
2016-01-04 14:43:34.220,1997.75,1999.25,1995.75,1996.5,35065.0,70034780.25,4243.0,2005.024831,2023.794262,1986.2554,...,,,,,,,,,,
2016-01-04 14:48:09.087,1996.5,1997.0,1991.5,1993.0,35200.0,70188370.25,4463.0,2003.979777,2022.629063,1985.330492,...,,,,,,,,,,
2016-01-04 14:52:18.754,1993.0,1993.25,1990.5,1992.75,35144.0,70002843.25,4464.0,2003.048247,2021.50482,1984.591674,...,,,-0.001755,,,,,,,
2016-01-04 14:57:30.085,1992.75,1999.0,1992.75,1998.0,35221.0,70295761.75,4348.0,2002.647033,2020.447477,1984.846589,...,,,-0.000125,-0.001755,,,,,,


### Split train and test

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Get features at event dates and drop the raw price/volume columns and the
# intermediate bands/averages. `drop` returns a new frame here instead of
# mutating the `.loc` selection in place, which avoids pandas'
# SettingWithCopyWarning.
unwanted_columns = ['avg', 'upper', 'lower', 'open', 'high', 'low', 'close',
                    'cum_vol', 'cum_dollar', 'cum_ticks', 'fast_mavg', 'slow_mavg']
X = raw_data.loc[labels.index, :].drop(columns=unwanted_columns)

# Classification target
y = labels['bin']

In [15]:
# Chronological split into training (first 80%) and test (last 20%) sets;
# shuffle=False keeps the time order intact
X_training_test, y_training_test = X, y
X_train, X_test, y_train, y_test = train_test_split(
    X_training_test,
    y_training_test,
    test_size=0.2,
    shuffle=False,
)

In [16]:
# Put target and features side by side, keep only complete rows,
# then show the class balance of the remaining training samples
train_df = pd.concat([y_train, X_train], axis=1, join='inner').dropna()
train_df['bin'].value_counts()

1    185
0    106
Name: bin, dtype: int64

In [17]:
# Split the cleaned training frame back into target and features
y_train = train_df['bin']
X_train = train_df.drop(columns='bin')

## SequentiallyBootstrappedBaggingClassifier

Here we build classifiers. If your task is a regression, see _SequentiallyBootstrappedBaggingRegressor_ instead.

In [18]:
from mlfinlab.ensemble import SequentiallyBootstrappedBaggingClassifier
from mlfinlab.sampling.concurrent import get_av_uniqueness_from_triple_barrier

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier

In [19]:
# Average uniqueness of the labelled events; its mean is used below as `max_samples`.
# NOTE(review): `raw_data` was lagged by one bar earlier in the notebook, so
# `raw_data.close` holds shifted prices here, while the classifiers below receive
# `data.close` — presumably only the index of this series matters; confirm against mlfinlab.
av_unique = get_av_uniqueness_from_triple_barrier(triple_barrier_events, raw_data.close, num_threads=3)
avgU = av_unique['tW'].mean()
avgU

2020-04-13 12:52:27.364022 100.0% num_concurrent_events done after 0.0 minutes. Remaining 0.0 minutes.
2020-04-13 12:52:27.463654 100.0% _get_average_uniqueness done after 0.0 minutes. Remaining 0.0 minutes.


0.21987054594884525

### _BaggingClassifier_ with _DecisionTreeClassifier_

* _max_samples_: set to the average uniqueness (_avgU_) of the samples.

In [20]:
# 'sqrt' is what 'auto' meant for classification trees; 'auto' itself was
# deprecated and later removed from scikit-learn, so spell it out.
base_trees = DecisionTreeClassifier(criterion='entropy', max_features='sqrt', class_weight='balanced')
# The wrapped estimator is passed positionally because its keyword was renamed
# (`base_estimator` -> `estimator`) across scikit-learn versions.
bagging_ensemble = BaggingClassifier(base_trees, n_estimators=1000, max_samples=avgU)

In [21]:

# Sequentially bootstrapped bagging on top of `bagging_ensemble`.
# NOTE(review): `bagging_ensemble` is itself a BaggingClassifier of 1000 trees, so this
# nests one ensemble inside another — confirm that this is intended.
# `samples_info_sets` is restricted to the events that have a feature row in X;
# `max_samples` reuses the average uniqueness computed above.
clf = SequentiallyBootstrappedBaggingClassifier(base_estimator=bagging_ensemble,
                                                samples_info_sets=triple_barrier_events.loc[X.index, :],
                                                price_bars=data.close,
                                                max_samples=avgU
                                               )

In [22]:
# Fit on the NaN-free training rows; the target is passed as a flat NumPy array
clf.fit(X_train, y_train.values.ravel())


SequentiallyBootstrappedBaggingClassifier(base_estimator=BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight='balanced',
                                                                                                                 criterion='entropy',
                                                                                                                 max_depth=None,
                                                                                                                 max_features='auto',
                                                                                                                 max_leaf_nodes=None,
                                                                                                                 min_impurity_decrease=0.0,
                                                                                                                 min_impurity_split=None,
                                                     

### BaggingClassifier on RandomForestClassifier

* _max_samples_: set to the average uniqueness (_avgU_) of the samples.

In [23]:
# A "forest" of a single tree with its own bootstrapping switched off (bootstrap=False),
# so the resampling is left to the sequentially bootstrapped wrapper below
rf_1_tree = RandomForestClassifier(n_estimators=1, criterion='entropy', bootstrap=False, class_weight='balanced_subsample')
# `samples_info_sets` is restricted to the events that have a feature row in X;
# `max_samples` reuses the average uniqueness computed above
clf2 = SequentiallyBootstrappedBaggingClassifier(base_estimator=rf_1_tree,
                                                 samples_info_sets=triple_barrier_events.loc[X.index, :],
                                                 price_bars=data.close,
                                                 max_samples=avgU
                                                )

In [24]:
# Fit on the NaN-free training rows; the target is passed as a flat NumPy array
clf2.fit(X_train, y_train.values.ravel())

SequentiallyBootstrappedBaggingClassifier(base_estimator=RandomForestClassifier(bootstrap=False,
                                                                                class_weight='balanced_subsample',
                                                                                criterion='entropy',
                                                                                max_depth=None,
                                                                                max_features='auto',
                                                                                max_leaf_nodes=None,
                                                                                min_impurity_decrease=0.0,
                                                                                min_impurity_split=None,
                                                                                min_samples_leaf=1,
                                                                             