Download data from [kaggle](https://www.kaggle.com/shivinder/googlestockpricing/data).

In [2]:
import pandas as pd

df = pd.read_csv("../datasets/Google.csv")
df.index = pd.DatetimeIndex(df['Date'].values)
close = df["Close"]

In [3]:
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Ex-Dividend,Split Ratio,Adj. Open,Adj. High,Adj. Low,Adj. Close,Adj. Volume
2004-08-19,2004-08-19,100.01,104.06,95.96,100.335,44659000.0,0.0,1.0,50.159839,52.191109,48.128568,50.322842,44659000.0
2004-08-20,2004-08-20,101.01,109.08,100.5,108.31,22834300.0,0.0,1.0,50.661387,54.708881,50.405597,54.322689,22834300.0
2004-08-23,2004-08-23,110.76,113.48,109.05,109.4,18256100.0,0.0,1.0,55.551482,56.915693,54.693835,54.869377,18256100.0
2004-08-24,2004-08-24,111.24,111.6,103.57,104.87,15247300.0,0.0,1.0,55.792225,55.972783,51.94535,52.597363,15247300.0
2004-08-25,2004-08-25,104.76,108.0,103.88,106.0,9188600.0,0.0,1.0,52.542193,54.167209,52.10083,53.164113,9188600.0


In [4]:
import numpy as np
import pandas as pd

def get_daily_vol(close, span=100):
    use_idx = close.index.searchsorted(close.index - pd.Timedelta(days=1))
    use_idx = use_idx[use_idx > 0]
    # Get rid of duplications in index
    use_idx = np.unique(use_idx)
    prev_idx = pd.Series(close.index[use_idx - 1], index=close.index[use_idx])
    ret = close.loc[prev_idx.index] / close.loc[prev_idx.values].values - 1
    vol = ret.ewm(span=span).std()
    return vol

In [5]:
vol = get_daily_vol(df["Close"])
vol.head()

2004-08-23         NaN
2004-08-24    0.036396
2004-08-25    0.029930
2004-08-26    0.027366
2004-08-30    0.029365
Name: Close, dtype: float64

# 3.1

In [6]:
import numbers


def cusum_filter(close, h):
    # asssum that E y_t = y_{t-1}
    t_events = []
    s_pos, s_neg = 0, 0
    ret = close.pct_change().dropna()
    diff = ret.diff().dropna()
    # time variant threshold
    if isinstance(h, numbers.Number):
        h = pd.Series(h, index=diff.index)
    h = h.reindex(diff.index, method='bfill')
    h = h.dropna()
    for t in h.index:
        s_pos = max(0, s_pos + diff.loc[t])
        s_neg = min(0, s_neg + diff.loc[t])
        if s_pos > h.loc[t]:
            s_pos = 0
            t_events.append(t)
        elif s_neg < -h.loc[t]:
            s_neg = 0
            t_events.append(t)
    return pd.DatetimeIndex(t_events)

In [7]:
cusum_filter(df["Close"], 0.1)

DatetimeIndex(['2004-08-24', '2004-09-28', '2004-10-20', '2004-10-21',
               '2004-10-26', '2004-11-05', '2004-11-08', '2004-11-11',
               '2004-11-16', '2004-11-24', '2005-01-26', '2005-02-04',
               '2005-06-08', '2005-10-21', '2005-10-25', '2006-01-20',
               '2006-01-23', '2006-02-01', '2006-02-16', '2006-02-28',
               '2006-03-02', '2006-03-24', '2006-04-25', '2006-10-20',
               '2006-11-27', '2008-01-23', '2008-01-24', '2008-02-01',
               '2008-02-05', '2008-03-11', '2008-03-17', '2008-03-24',
               '2008-04-18', '2008-04-21', '2008-07-18', '2008-07-22',
               '2008-09-11', '2008-09-17', '2008-09-18', '2008-09-22',
               '2008-09-29', '2008-09-30', '2008-10-02', '2008-10-13',
               '2008-10-14', '2008-10-16', '2008-10-28', '2008-10-29',
               '2008-11-05', '2008-11-13', '2008-11-17', '2008-11-25',
               '2008-12-01', '2008-12-02', '2008-12-22', '2009-01-20',
      

In [8]:
vol = get_daily_vol(close)
sampled_idx = cusum_filter(close, vol)
sampled_idx

DatetimeIndex(['2004-08-24', '2004-08-25', '2004-08-27', '2004-08-31',
               '2004-09-02', '2004-09-03', '2004-09-07', '2004-09-10',
               '2004-09-15', '2004-09-17',
               ...
               '2016-12-07', '2016-12-08', '2016-12-09', '2016-12-12',
               '2016-12-15', '2016-12-30', '2017-01-03', '2017-01-04',
               '2017-01-06', '2017-01-09'],
              dtype='datetime64[ns]', length=1513, freq=None)

In [9]:
sampled_idx.shape

(1513,)

In [10]:
df.shape

(3125, 13)

In [11]:
def get_t1(close, t_events, num_days):
    t1 = close.index.searchsorted(t_events + pd.Timedelta(days=num_days))
    t1 = t1[t1 < close.shape[0]]
    t1 = pd.Series(close.index[t1], index=t_events[:t1.shape[0]])
    return t1

In [12]:
t1 = get_t1(close, sampled_idx, num_days=1)
print(t1.shape)
t1.head()

(1513,)


2004-08-24   2004-08-25
2004-08-25   2004-08-26
2004-08-27   2004-08-30
2004-08-31   2004-09-01
2004-09-02   2004-09-03
dtype: datetime64[ns]

In [13]:
def apply_ptslt1(close, events, ptsl, molecule):
    """Return datafram about if price touches the boundary"""
    # Sample a subset with specific indices
    _events = events.loc[molecule]
    # Time limit
    
    out = pd.DataFrame(index=_events.index)
    # Set Profit Taking and Stop Loss
    if ptsl[0] > 0:
        pt = ptsl[0] *  _events["trgt"]
    else:
        # Switch off profit taking
        pt = pd.Series(index=_events.index)
    if ptsl[1] > 0:
        sl = -ptsl[1] * _events["trgt"]
    else:
        # Switch off stop loss
        sl = pd.Series(index=_events.index)
    # Replace undifined value with the last time index
    time_limits = _events["t1"].fillna(close.index[-1])
    for loc, t1 in time_limits.iteritems():
        df = close[loc:t1]
        # Change the direction depending on the side
        df = (df / close[loc] - 1) * _events.at[loc, 'side']
        # print(df)
        # print(loc, t1, df[df < sl[loc]].index.min(), df[df > pt[loc]].index.min())
        out.at[loc, 'sl'] = df[df < sl[loc]].index.min()
        out.at[loc, 'pt'] = df[df > pt[loc]].index.min()
    out['t1'] = _events['t1'].copy(deep=True)
    return out


def get_3barriers(close, t_events, ptsl, trgt, min_ret=0, num_threads=1,
                  t1=False, side=None):
    # Get sampled target values
    trgt = trgt.loc[t_events]
    trgt = trgt[trgt > min_ret]
    # Get time boundary t1
    if t1 is False:
        t1 = pd.Series(pd.NaT, index=t_events)
    # Define the side
    if side is None:
        _side = pd.Series(1., index=trgt.index)
        _ptsl = [ptsl, ptsl]
    else:
        _side = side.loc[trgt.index]
        _ptsl = ptsl[:2]
    events = pd.concat({'t1': t1, 'trgt': trgt, 'side': _side}, axis=1)
    events = events.dropna(subset=['trgt'])
    time_idx = apply_ptslt1(close, events, _ptsl, events.index)
    # Skip when all of barrier are not touched
    events['t1'] = time_idx.dropna(how='all').min(axis=1)
    events = events.drop('side', axis=1)
    return events

In [14]:
trgt = vol
events = get_3barriers(close, t_events=sampled_idx, trgt=trgt,
                       ptsl=1, t1=t1)
events.head()

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike


Unnamed: 0,t1,trgt
2004-08-24,2004-08-25,0.036396
2004-08-25,2004-08-26,0.02993
2004-08-31,2004-09-01,0.026605
2004-09-02,2004-09-03,0.024097
2004-09-07,2004-09-08,0.02361


In [15]:
def get_bins(events, close):
    # Prices algined with events
    events = events.dropna(subset=['t1'])
    px = events.index.union(events['t1'].values).drop_duplicates()
    px = close.reindex(px, method='bfill')
    # Create out object
    out = pd.DataFrame(index=events.index)
    out['ret'] = px.loc[events['t1'].values].values / px.loc[events.index] - 1.
    if 'side' in events:
        out['ret'] *= events['side']
    out['bin'] = np.sign(out['ret'])
    if 'side' in events:
        out.loc[out['ret'] <= 0, 'bin'] = 0
    return out

In [16]:
bins = get_bins(events, close)
bins.head()

Unnamed: 0,ret,bin
2004-08-24,0.010775,1.0
2004-08-25,0.018019,1.0
2004-08-31,-0.020709,-1.0
2004-09-02,-0.014777,-1.0
2004-09-07,0.007088,1.0


# 3.2

In [17]:
def drop_labels(events, min_pct=0.05):
    while True:
        df = events['bin'].value_counts(normalize=True)
        if df.min() > min_pct or df.shape[0] < 3:
            break
        print('dropped label', df.argmin(), df.min())
        events = events[events['bin'] != df.argmin()]
    return events

In [18]:
dropped_bins = drop_labels(bins)
print(bins.shape)
print(dropped_bins.shape)

dropped label 0.0 0.0008216926869350862
(1217, 2)
(1216, 2)


  
  import sys


In [19]:
bins = dropped_bins

# 3.3

In [20]:
def get_3barriers(close, t_events, ptsl, trgt, min_ret=0, num_threads=1,
                  t1=False, side=None):
    # Get sampled target values
    trgt = trgt.loc[t_events]
    trgt = trgt[trgt > min_ret]
    # Get time boundary t1
    if t1 is False:
        t1 = pd.Series(pd.NaT, index=t_events)
    # Define the side
    if side is None:
        _side = pd.Series(1., index=trgt.index)
        _ptsl = [ptsl, ptsl]
    else:
        _side = side.loc[trgt.index]
        _ptsl = ptsl[:2]
    events = pd.concat({'t1': t1, 'trgt': trgt, 'side': _side}, axis=1)
    events = events.dropna(subset=['trgt'])
    time_idx = apply_ptslt1(close, events, _ptsl, events.index)
    # Skip when all of barrier are not touched
    time_idx = time_idx.dropna(how='all')
    events['t1_type'] = time_idx.idxmin(axis=1)
    events['t1'] = time_idx.min(axis=1)
    if side is None:
        events = events.drop('side', axis=1)
    return events

def get_bins(events, close):
    # Prices algined with events
    events = events.dropna(subset=['t1'])
    px = events.index.union(events['t1'].values).drop_duplicates()
    px = close.reindex(px, method='bfill')
    # Create out object
    out = pd.DataFrame(index=events.index)
    out['ret'] = px.loc[events['t1'].values].values / px.loc[events.index] - 1.
    if 'side' in events:
        out['ret'] *= events['side']
    out['bin'] = np.sign(out['ret'])
    # 0 when touching vertical line
    out['bin'].loc[events['t1_type'] == 't1'] = 0
    if 'side' in events:
        out.loc[out['ret'] <= 0, 'bin'] = 0
    return out

In [21]:
t1 = get_t1(close, sampled_idx, num_days=1)
events = get_3barriers(close, t_events=sampled_idx, trgt=trgt,
                       ptsl=1, t1=t1)
events.head()

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  after removing the cwd from sys.path.


Unnamed: 0,t1,trgt,t1_type
2004-08-24,2004-08-25,0.036396,t1
2004-08-25,2004-08-26,0.02993,t1
2004-08-31,2004-09-01,0.026605,t1
2004-09-02,2004-09-03,0.024097,t1
2004-09-07,2004-09-08,0.02361,t1


In [22]:
print(events['t1_type'].unique())
print(events['t1_type'].describe())

['t1' 'pt' 'sl']
count     1217
unique       3
top         t1
freq       906
Name: t1_type, dtype: object


In [23]:
bins = get_bins(events, close)
bins.head()

Unnamed: 0,ret,bin
2004-08-24,0.010775,0.0
2004-08-25,0.018019,0.0
2004-08-31,-0.020709,0.0
2004-09-02,-0.014777,0.0
2004-09-07,0.007088,0.0


In [24]:
bins['bin'].value_counts()

 0.0    906
 1.0    180
-1.0    131
Name: bin, dtype: int64

# 3.4

In [70]:
import talib
import numpy as np


def macd_side(close):
    macd, signal, hist = talib.MACD(close.values)
    hist = pd.Series(hist).fillna(1).values
    return pd.Series(2 * ((hist > 0).astype(float) - 0.5), index=close.index[-len(hist):])

In [71]:
import numpy as np

vol = get_daily_vol(close)
sampled_idx = cusum_filter(close, vol)
t1 = get_t1(close, sampled_idx, num_days=1)
side =  macd_side(close)
events = get_3barriers(close, t_events=sampled_idx, trgt=vol,
                       ptsl=[1, 2], t1=t1, side=side)
events.head()

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  after removing the cwd from sys.path.


Unnamed: 0,side,t1,trgt,t1_type
2004-08-24,1.0,2004-08-25,0.036396,t1
2004-08-25,1.0,2004-08-26,0.02993,t1
2004-08-31,1.0,2004-09-01,0.026605,t1
2004-09-02,1.0,2004-09-03,0.024097,t1
2004-09-07,1.0,2004-09-08,0.02361,t1


In [72]:
bins = get_bins(events, close)
bins.head()

Unnamed: 0,ret,bin
2004-08-24,0.010775,0.0
2004-08-25,0.018019,0.0
2004-08-31,-0.020709,0.0
2004-09-02,-0.014777,0.0
2004-09-07,0.007088,0.0


In [27]:
bins['bin'].unique()

array([0., 1.])

In [93]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
x = np.hstack([events['side'].values[:, np.newaxis], close.loc[events.index].values[:, np.newaxis]])
y = bins['bin'].values
clf.fit(x, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [94]:
clf.predict(x)

array([0., 0., 0., ..., 0., 0., 0.])

In [91]:
x.shape

(1217, 2)

In [76]:
events['side'].values

array([ 1.,  1.,  1., ..., -1., -1.,  1.])

In [37]:
help(talib.MACD)

Help on built-in function MACD in module talib.func:

MACD(...)
    MACD(real[, fastperiod=?, slowperiod=?, signalperiod=?])
    
    Moving Average Convergence/Divergence (Momentum Indicators)
    
    Inputs:
        real: (any ndarray)
    Parameters:
        fastperiod: 12
        slowperiod: 26
        signalperiod: 9
    Outputs:
        macd
        macdsignal
        macdhist



In [42]:
macd, signal, hist = talib.MACD(close.values)

In [45]:
np.max(macd[100:] - signal[100:]  - hist[100:] )

0.0

In [49]:
macd[np.isfinite(macd)].shape

(3092,)

In [51]:
signal = signal[np.isfinite(signal)]

In [55]:
2 * ((signal > 0).astype(float) - 0.5)

array([1., 1., 1., ..., 1., 1., 1.])

In [68]:
macd.fill(1)

In [69]:
macd

array([1., 1., 1., ..., 1., 1., 1.])