In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

In [2]:
sp500_futures = pd.read_csv("../data/sp500_futures_tick_data.csv")
sp500_futures.head()

Unnamed: 0,date,time,price,volume
0,01/03/2000,08:30:34.000,1496.4,0
1,01/03/2000,08:30:36.000,1496.0,0
2,01/03/2000,08:30:37.000,1495.5,0
3,01/03/2000,08:30:46.000,1495.0,0
4,01/03/2000,08:30:53.000,1495.5,0


## 0. Add SP500 spot price

In [3]:
sp500_original = pd.read_csv('../data/sp500.csv')
# The time is always "00:00:00"
sp500 = pd.DataFrame()
# Remove the time, which is always 00:00:00
sp500['date'] = sp500_original.Date.str.replace("00:00:00", "").str.strip()
# Change date format to match with dates from sp500_futures
sp500['date'] = pd.to_datetime(sp500['date']).dt.strftime('%m/%d/%Y')
# Set the price using the adjusted close
sp500['price'] = sp500_original['Adj Close']

del sp500_original

sp500.head()

Unnamed: 0,date,price
0,01/03/1950,16.66
1,01/04/1950,16.85
2,01/05/1950,16.93
3,01/06/1950,16.98
4,01/09/1950,17.08


In [4]:
sp500_futures = pd.merge(sp500_futures, sp500, on='date') # inner join by default
sp500_futures.rename(columns={'price_x': 'price', 'price_y': 'spot_price'}, inplace=True)
sp500_futures.head()

Unnamed: 0,date,time,price,volume,spot_price
0,01/03/2000,08:30:34.000,1496.4,0,1455.22
1,01/03/2000,08:30:36.000,1496.0,0,1455.22
2,01/03/2000,08:30:37.000,1495.5,0,1455.22
3,01/03/2000,08:30:46.000,1495.0,0,1455.22
4,01/03/2000,08:30:53.000,1495.5,0,1455.22


## 1. Add the vertical barrier (Chapter 3.5)

#### Convert time bars to dollar bars

In [5]:
dollar_size = 10000
sp500_futures['dollar'] = sp500_futures.price * sp500_futures.volume
sp500_futures['dollar_group'] = sp500_futures['dollar'].cumsum().astype(int) // dollar_size
sp500_futures.head()

Unnamed: 0,date,time,price,volume,spot_price,dollar,dollar_group
0,01/03/2000,08:30:34.000,1496.4,0,1455.22,0.0,0
1,01/03/2000,08:30:36.000,1496.0,0,1455.22,0.0,0
2,01/03/2000,08:30:37.000,1495.5,0,1455.22,0.0,0
3,01/03/2000,08:30:46.000,1495.0,0,1455.22,0.0,0
4,01/03/2000,08:30:53.000,1495.5,0,1455.22,0.0,0


In [6]:
def aggregate_dollar_bars(x):
    d = {}
    d['date'] = x['date'].iloc[-1]
    d['time'] = x['time'].iloc[-1]
    d['open'] = x['price'].iloc[0]
    d['close'] = x['price'].iloc[-1]
    d['low'] = x['price'].min()
    d['high'] = x['price'].max()
    d['spot_price'] = x['spot_price'].iloc[-1]
    d['total_volume'] = x['volume'].sum()
    # In edge case, d['total_volume'] can be 0 if all volume in the df is 0
    d['vol_weighted_avg_price'] = x['price'].dot(x['volume']) / d['total_volume'] if d['total_volume'] else 0
    d['dollar'] = x['dollar'].sum()
    d['count'] = x['date'].count()
    return pd.Series(d)

In [7]:
sp500_futures_dollar = sp500_futures.groupby('dollar_group', as_index=False).apply(aggregate_dollar_bars)
sp500_futures_dollar.head()

Unnamed: 0,dollar_group,date,time,open,close,low,high,spot_price,total_volume,vol_weighted_avg_price,dollar,count
0,0,06/30/2003,23:04:16.000,1496.4,972.0,767.5,1574.0,974.5,3,971.966667,2915.9,2812947
1,1,06/30/2003,23:04:34.000,972.0,972.0,972.0,972.0,974.5,9,972.0,8748.0,1
2,10,06/30/2003,23:16:38.000,972.0,971.5,971.5,972.0,974.5,100,971.976,97197.6,8
3,11,06/30/2003,23:21:20.000,971.8,971.8,971.8,971.8,974.5,9,971.8,8746.2,2
4,12,06/30/2003,23:45:14.000,971.8,972.3,971.8,972.3,974.5,10,971.91,9719.1,6


#### Apply CUSUM filter on intraday returns

In [8]:
sp500_futures_dollar['returns'] = np.log(sp500_futures_dollar.close / sp500_futures_dollar.open)
sp500_futures_dollar['return_diff'] = sp500_futures_dollar['returns'].diff().fillna(0)

cusum_threshold = sp500_futures_dollar['returns'].std()
print(cusum_threshold)

0.0007371667092802313


In [9]:
# Use E_{t-1}(y_t) = y_t
def cusum_filter_pos(row, threshold=0.05):
    if row.name == 0:
        cusum_filter_pos.cumsum = 0
        cusum_filter_pos.count = 0
    
    cusum_filter_pos.cumsum = max(0, cusum_filter_pos.cumsum + row.return_diff)
    if cusum_filter_pos.cumsum > threshold:
        cusum_filter_pos.cumsum = 0
        cusum_filter_pos.count =+ 1
    
    return cusum_filter_pos.cumsum, cusum_filter_pos.count


def cusum_filter_neg(row, threshold=0.05):
    if row.name == 0:
        cusum_filter_neg.cumsum = 0
        cusum_filter_neg.count = 0
    
    cusum_filter_neg.cumsum = min(0, cusum_filter_neg.cumsum + row.return_diff)
    if cusum_filter_neg.cumsum < -threshold:
        cusum_filter_neg.cumsum = 0
        cusum_filter_neg.count += 1
    
    return cusum_filter_neg.cumsum, cusum_filter_neg.count

In [10]:
sp500_futures_dollar['cusum_filter_pos'] = sp500_futures_dollar.apply(cusum_filter_pos, args=(cusum_threshold, ), axis=1)
sp500_futures_dollar['cusum_filter_neg'] = sp500_futures_dollar.apply(cusum_filter_neg, args=(cusum_threshold, ), axis=1)

print(sp500_futures_dollar['cusum_filter_pos'].tail())
print(sp500_futures_dollar['cusum_filter_neg'].tail())

1254776    (0, 1)
1254777    (0, 1)
1254778    (0, 1)
1254779    (0, 1)
1254780    (0, 1)
Name: cusum_filter_pos, dtype: object
1254776    (0, 69173)
1254777    (0, 69174)
1254778    (0, 69175)
1254779    (0, 69175)
1254780    (0, 69176)
Name: cusum_filter_neg, dtype: object


#### Add the vertical barrier

In [11]:
# Compute tEvents, which is the timestamp Series selected by CUSUM
# Let's use Negative cause it samples more than Positive
sp500_futures_dollar['prev_cusum_neg'] = sp500_futures_dollar['cusum_filter_neg'].shift(1)
sp500_futures_dollar.dropna(inplace=True)
sp500_futures_dollar.reset_index(inplace=True, drop=True)
sp500_futures_dollar.head()

Unnamed: 0,dollar_group,date,time,open,close,low,high,spot_price,total_volume,vol_weighted_avg_price,dollar,count,returns,return_diff,cusum_filter_pos,cusum_filter_neg,prev_cusum_neg
0,1,06/30/2003,23:04:34.000,972.0,972.0,972.0,972.0,974.5,9,972.0,8748.0,1,0.0,0.431462,"(0, 1)","(0, 0)","(0, 0)"
1,10,06/30/2003,23:16:38.000,972.0,971.5,971.5,972.0,974.5,100,971.976,97197.6,8,-0.000515,-0.000515,"(0, 1)","(-0.0005145356429443377, 0)","(0, 0)"
2,11,06/30/2003,23:21:20.000,971.8,971.8,971.8,971.8,974.5,9,971.8,8746.2,2,0.0,0.000515,"(0.0005145356429443377, 1)","(0, 0)","(-0.0005145356429443377, 0)"
3,12,06/30/2003,23:45:14.000,971.8,972.3,971.8,972.3,974.5,10,971.91,9719.1,6,0.000514,0.000514,"(0, 1)","(0, 0)","(0, 0)"
4,13,06/30/2003,23:45:27.000,972.3,972.3,972.3,972.3,974.5,9,972.3,8750.7,2,0.0,-0.000514,"(0, 1)","(-0.0005143768438087622, 0)","(0, 0)"


In [12]:
sp500_futures_dollar['datetime'] = pd.to_datetime(sp500_futures_dollar.date + " " + sp500_futures_dollar.time)
tEvents = sp500_futures_dollar[sp500_futures_dollar.cusum_filter_neg.str[1] == sp500_futures_dollar.prev_cusum_neg.str[1] + 1].datetime
tEvents.head()

6    2003-07-01 00:03:43
28   2003-07-01 01:41:26
39   2003-07-01 02:10:41
60   2003-07-01 02:40:21
73   2003-07-01 03:10:08
Name: datetime, dtype: datetime64[ns]

In [35]:
days = 1
t1 = sp500_futures_dollar.datetime.searchsorted(tEvents + pd.Timedelta(days=days))
t1 = t1[t1 < sp500_futures_dollar.shape[0]]  # Remove those that are inserted at the end
sp500_futures_dollar['barrier'] = sp500_futures_dollar.index.to_series().apply(lambda x: x in t1)