# Intraday Strategy, Part 1: Feature Engineering

### Loading Libraries

In [1]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd
import pandas_datareader.data as web

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

# Collections, paths, time
from collections import defaultdict
from pathlib import Path
from time import time

# Warnings
import warnings

# Notebook Optimizer
from tqdm import tqdm

# SciPy
from scipy.stats import spearmanr

# Technical Analysis
import talib

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
# Global notebook configuration.

# Plot styling.
sns.set_style('whitegrid')

# Fixed seed for numpy's global RNG -- presumably so the .sample() calls
# further down are repeatable on a fresh run.
np.random.seed(42)

# Shorthand for MultiIndex slicing.
idx = pd.IndexSlice

# 10%, 20%, ..., 90% -- the percentile grid passed to describe().
deciles = np.arange(0.1, 1.0, 0.1)

# NOTE(review): blanket suppression also hides deprecation warnings.
warnings.filterwarnings('ignore')

### Algoseek Trade & Quote Minute Bar Data

#### Data Preparation

In [4]:
# Timestamp columns of the raw minute-bar files; they are dropped on load.
tcols = [
    # bar open / first trade
    'openbartime',
    'firsttradetime',
    # highs
    'highbidtime',
    'highasktime',
    'hightradetime',
    # lows
    'lowbidtime',
    'lowasktime',
    'lowtradetime',
    # bar close / last trade
    'closebartime',
    'lasttradetime',
]

In [5]:
# Additional raw columns discarded on load (together with the timestamp columns).
drop_cols = ['unknowntickvolume', 'cancelsize', 'tradeatcrossorlocked']

In [6]:
# Raw column names of interest.
# NOTE(review): `keep` is not referenced by any cell visible in this notebook.
keep = [
    # trade prices
    'firsttradeprice', 'hightradeprice', 'lowtradeprice', 'lasttradeprice',
    # spread, VWAP and quote count
    'minspread', 'maxspread', 'volumeweightprice', 'nbboquotecount',
    # volume by price point relative to the quote
    'tradeatbid', 'tradeatbidmid', 'tradeatmid', 'tradeatmidask', 'tradeatask',
    # volume and trade counts
    'volume', 'totaltrades', 'finravolume', 'finravolumeweightprice',
    # tick-direction volume
    'uptickvolume', 'downtickvolume', 'repeatuptickvolume', 'repeatdowntickvolume',
    # trade-to-mid weights
    'tradetomidvolweight', 'tradetomidvolweightrelative',
]

In [7]:
# Short names for the (lower-cased) raw column names, applied on load.
columns = {
    # volume-weighted prices and volumes
    'volumeweightprice': 'price',
    'finravolume': 'fvolume',
    'finravolumeweightprice': 'fprice',
    # tick-direction volume
    'uptickvolume': 'up',
    'downtickvolume': 'down',
    'repeatuptickvolume': 'rup',
    'repeatdowntickvolume': 'rdown',
    # trade prices
    'firsttradeprice': 'first',
    'hightradeprice': 'high',
    'lowtradeprice': 'low',
    'lasttradeprice': 'last',
    # counts
    'nbboquotecount': 'nbbo',
    'totaltrades': 'ntrades',
    # quotes at the bar open
    'openbidprice': 'obprice',
    'openbidsize': 'obsize',
    'openaskprice': 'oaprice',
    'openasksize': 'oasize',
    # quotes at the high
    'highbidprice': 'hbprice',
    'highbidsize': 'hbsize',
    'highaskprice': 'haprice',
    'highasksize': 'hasize',
    # quotes at the low
    'lowbidprice': 'lbprice',
    'lowbidsize': 'lbsize',
    'lowaskprice': 'laprice',
    'lowasksize': 'lasize',
    # quotes at the bar close
    'closebidprice': 'cbprice',
    'closebidsize': 'cbsize',
    'closeaskprice': 'caprice',
    'closeasksize': 'casize',
    # trade sizes
    'firsttradesize': 'firstsize',
    'hightradesize': 'highsize',
    'lowtradesize': 'lowsize',
    'lasttradesize': 'lastsize',
    # trade-to-mid weights
    'tradetomidvolweight': 'volweight',
    'tradetomidvolweightrelative': 'volweightrel',
}

In [8]:
# Root folder of the NASDAQ-100 data set, relative to the notebook's working directory.
nasdaq_path = Path('..') / 'data' / 'nasdaq100'

In [9]:
def _read_minute_bars(csv_file):
    """Load one gzipped minute-bar CSV into a frame indexed by (ticker, timestamp).

    Drops the timestamp columns in `tcols` and the columns in `drop_cols`,
    applies the short names in `columns`, keeps bars between 9:30 and 16:00,
    and shortens the remaining 'tradeat*' column names to 'at*'.
    """
    # The nested-list `parse_dates` merges 'Date' and 'TimeBarStart' into one
    # column, which becomes 'date_timebarstart' after lower-casing.
    # NOTE(review): newer pandas releases deprecate this form of `parse_dates`
    # -- confirm against the installed version before upgrading.
    return (pd.read_csv(csv_file, parse_dates=[['Date', 'TimeBarStart']])
            .rename(columns=str.lower)
            .drop(tcols + drop_cols, axis=1)
            .rename(columns=columns)
            .set_index('date_timebarstart')
            .sort_index()
            .between_time('9:30', '16:00')
            .set_index('ticker', append=True)
            .swaplevel()
            .rename(columns=lambda x: x.replace('tradeat', 'at')))


def extract_and_combine_data():
    """Combine all raw minute-bar CSVs into one frame and store it as HDF5.

    Reads every '*.csv.gz' file below `nasdaq_path / '1min_taq'`, concatenates
    them, downcasts numeric columns, names the index levels
    ('ticker', 'date_time'), prints a summary, and writes the result to
    `nasdaq_path / 'algoseek.h5'` under the key 'min_taq'.

    Raises:
        FileNotFoundError: if no minute-bar files are found.
    """
    path = nasdaq_path / '1min_taq'

    # ~80K files to process
    files = list(path.glob('*/**/*.csv.gz'))
    if not files:
        # Fail with an actionable message instead of pd.concat's generic error.
        raise FileNotFoundError(f'No *.csv.gz minute-bar files found under {path}')

    data = pd.concat([_read_minute_bars(f) for f in tqdm(files)])
    data = data.apply(pd.to_numeric, downcast='integer')
    # Index.rename returns a new index, so the result has to be assigned back;
    # later cells look the timestamp level up by the name 'date_time'.
    data.index = data.index.rename(['ticker', 'date_time'])
    # info() prints its report itself and returns None, so no print() wrapper.
    data.info(show_counts=True)
    data.to_hdf(nasdaq_path / 'algoseek.h5', key='min_taq')

In [10]:
# One-off, expensive step: parses every raw minute-bar CSV (~80K files) and
# (re)writes the combined frame to the HDF5 store under `nasdaq_path`.
extract_and_combine_data()

### Loading Algoseek Data

In [11]:
# Trade-price and volume columns (short names) used to build the features.
ohlcv_cols = [
    'first',
    'high',
    'low',
    'last',
    'price',
    'volume',
]

In [12]:
# Columns loaded from the store: prices/volume plus tick-direction and at-ask/at-bid volume.
data_cols = [*ohlcv_cols, 'up', 'down', 'rup', 'rdown', 'atask', 'atbid']

In [14]:
# Load the combined minute bars written by extract_and_combine_data(), keeping
# only the columns needed for feature engineering. The store lives under
# `nasdaq_path`; the previously used `as_path` is not defined anywhere in this
# notebook and would raise a NameError on a fresh kernel.
with pd.HDFStore(nasdaq_path / 'algoseek.h5') as store:
    df = store['min_taq'].loc[:, data_cols].sort_index()

In [15]:
# Calendar day of each bar (timestamp truncated to midnight); used below as a
# grouping key. normalize() is vectorized, whereas `.date` would first build
# one Python date object per row.
df['date'] = df.index.get_level_values('date_time').normalize()

In [16]:
# Cache the reduced frame locally. `key` is passed by keyword because newer
# pandas releases make it keyword-only.
df.to_hdf('data/algoseek.h5', key='data')

In [17]:
# Reload the cached frame from the local store.
df = pd.read_hdf('data/algoseek.h5', key='data')

In [18]:
# Column dtypes and non-null counts. `show_counts` replaces the `null_counts`
# argument, which newer pandas releases no longer accept.
df.info(show_counts=True)

### Feature Engineering

In [19]:
# Sort once and share the sorted copy between both groupers, so the frame is
# not sorted and copied twice.
df_sorted = df.sort_index()

# Per-ticker groups; group_keys=False keeps the original index on apply() results.
by_ticker = df_sorted.groupby('ticker', group_keys=False)

# Per-ticker-and-day groups, so shifts never cross a day boundary.
by_ticker_date = df_sorted.groupby(['ticker', 'date'])

# The groupers keep their own reference; drop the extra name.
del df_sorted

In [20]:
# Empty feature frame sharing df's index; feature columns are added cell by cell below.
data = pd.DataFrame(index=df.index)

In [21]:
# Encode each trading day as an integer code; sort=True makes the codes follow date order.
date_codes = pd.factorize(df['date'], sort=True)[0]
data['date'] = date_codes

In [22]:
# Minutes elapsed since the earliest bar time of day found in the data.
# Computed with integer arithmetic on the timestamp components instead of
# converting every timestamp to a string and parsing it back as a timedelta.
bar_times = data.index.get_level_values('date_time')
seconds_of_day = bar_times.hour * 3600 + bar_times.minute * 60 + bar_times.second

data['minute'] = ((seconds_of_day - seconds_of_day.min()) / 60).astype(int)

### Lagged Returns

In [23]:
# Return within each bar: last trade price relative to first trade price.
data['ret1min'] = df['last'] / df['first'] - 1

In [24]:
# Density estimate of the 1-minute return on a random sample of 100,000 bars.
ret1min_sample = data.ret1min.sample(n=100000)
sns.kdeplot(ret1min_sample);
plt.show()

In [25]:
# Summary statistics of the 1-minute return at the deciles, shown as
# percentages (the leading `count` row is skipped).
(data.ret1min
 .describe(percentiles=deciles)
 .iloc[1:]
 .apply('{:.3%}'.format))

In [26]:
# Higher moments of the 1-minute return distribution, rounded to two decimals.
print(f'Skew: {data.ret1min.skew():.2f} | Kurtosis: {data.ret1min.kurtosis():.2f}')

In [27]:
# The ten bars with the largest 1-minute return, shown next to the first/last
# trade prices they were computed from.
data.join(df[['first', 'last']]).nlargest(10, columns=['ret1min'])

In [28]:
# Multi-bar returns: current last price relative to the first price t-1 bars
# earlier. The shift runs inside each (ticker, date) group.
first_by_day = by_ticker_date['first']

for t in tqdm(range(2, 11)):
    period_start = first_by_day.shift(t - 1)
    data[f'ret{t}min'] = df['last'] / period_start - 1

### Forward Returns

In [29]:
# Prediction target: the next bar's 1-minute return within the same ticker and day.
next_bar_ret = (data.sort_index()
                .groupby(['ticker', 'date'])['ret1min']
                .shift(-1))
data['fwd1min'] = next_bar_ret

In [30]:
# Keep only rows with a forward return. The last bar of each (ticker, date)
# group has none because of the shift(-1) above.
data = data.dropna(subset=['fwd1min'])

In [31]:
# Column dtypes and non-null counts. `show_counts` replaces the `null_counts`
# argument, which newer pandas releases no longer accept.
data.info(show_counts=True)

### Normalized Up/Downtick Volume

In [32]:
# Express each tick-direction volume as a share of the bar's total volume;
# infinite ratios are turned into missing values.
for f in ['up', 'down', 'rup', 'rdown']:
    share_of_volume = df[f] / df['volume']
    data[f] = share_of_volume.replace(np.inf, np.nan)

In [33]:
# Distribution of the normalized tick-volume features at the deciles.
data.loc[:, ['rup', 'up', 'rdown', 'down']].describe(deciles)

### Balance of Power

In [34]:
# Balance of Power per ticker, from each bar's first/high/low/last trade prices.
data['BOP'] = by_ticker.apply(
    lambda bars: talib.BOP(bars['first'], bars['high'], bars['low'], bars['last'])
)

### Commodity Channel Index

In [35]:
# 14-period Commodity Channel Index per ticker.
# NOTE(review): unlike the STOCH/NATR cells, the inputs are not forward-filled
# here -- confirm this is intended.
data['CCI'] = by_ticker.apply(
    lambda bars: talib.CCI(bars['high'], bars['low'], bars['last'], timeperiod=14)
)

### Money Flow Index

In [36]:
# 14-period Money Flow Index per ticker, from high/low/last prices and volume.
# NOTE(review): unlike the STOCH/NATR cells, the inputs are not forward-filled
# here -- confirm this is intended.
data['MFI'] = by_ticker.apply(
    lambda bars: talib.MFI(bars['high'], bars['low'], bars['last'], bars['volume'],
                           timeperiod=14)
)

In [37]:
# Distribution of the three indicators at the deciles.
data[['BOP', 'CCI', 'MFI']].describe(deciles)

### Stochastic RSI

In [38]:
# Stochastic RSI per ticker on the forward-filled last price. Only the first
# element of talib's result is kept (presumably the fast %K line -- confirm
# against the TA-Lib documentation).
data['STOCHRSI'] = by_ticker.apply(
    lambda bars: talib.STOCHRSI(bars['last'].ffill(),
                                timeperiod=14,
                                fastk_period=14,
                                fastd_period=3,
                                fastd_matype=0)[0]
)

### Stochastic Oscillator

In [39]:
def compute_stoch(x, fastk_period=14, slowk_period=3, slowk_matype=0,
                  slowd_period=3, slowd_matype=0):
    """Compute the slow stochastic oscillator for one group of bars.

    Args:
        x: frame with 'high', 'low' and 'last' columns; gaps are forward-filled
            before the indicator is computed.
        fastk_period, slowk_period, slowk_matype, slowd_period, slowd_matype:
            passed through unchanged to `talib.STOCH`.

    Returns:
        DataFrame indexed like `x` with the columns 'slowd' and 'slowk'.
    """
    high, low, close = (x[col].ffill() for col in ('high', 'low', 'last'))
    stoch_params = dict(fastk_period=fastk_period,
                        slowk_period=slowk_period,
                        slowk_matype=slowk_matype,
                        slowd_period=slowd_period,
                        slowd_matype=slowd_matype)
    slowk, slowd = talib.STOCH(high, low, close, **stoch_params)
    return pd.DataFrame({'slowd': slowd, 'slowk': slowk}, index=x.index)

In [40]:
# Add the per-ticker 'slowd' / 'slowk' columns to the feature frame.
# NOTE: not idempotent -- running this cell twice fails because the joined
# columns would overlap with the ones already present.
data = data.join(by_ticker.apply(compute_stoch))

### Normalized Average True Range (NATR)

In [41]:
# Normalized Average True Range per ticker on forward-filled high/low/last
# prices, using talib's default period.
# NOTE(review): 'NATR' is not in the `features` list evaluated later in this
# notebook -- confirm whether that is intended.
data['NATR'] = by_ticker.apply(
    lambda bars: talib.NATR(bars['high'].ffill(),
                            bars['low'].ffill(),
                            bars['last'].ffill())
)

### Transaction Volume by Price Point

In [42]:
# Volume traded at the ask minus volume traded at the bid, as a share of total
# volume; infinite ratios are turned into missing values.
net_ask_share = (df['atask'] - df['atbid']) / df['volume']
data['trades_bid_ask'] = net_ask_share.replace((np.inf, -np.inf), np.nan)

In [43]:
# Drop the raw minute-bar frame; all features now live in `data`.
# NOTE: `by_ticker` / `by_ticker_date` still reference sorted copies of the
# bars, so that memory is only freed once they are deleted as well.
del df

In [44]:
# Column dtypes and non-null counts of the assembled feature frame.
data.info(show_counts=True)

### Evaluating Features

In [45]:
# Feature columns evaluated below (21 names, matching the 3x7 plot grid).
features = [
    # lagged returns
    'ret1min', 'ret2min', 'ret3min', 'ret4min', 'ret5min',
    'ret6min', 'ret7min', 'ret8min', 'ret9min', 'ret10min',
    # normalized tick-direction volume
    'rup', 'up', 'down', 'rdown',
    # technical indicators
    'BOP', 'CCI', 'MFI', 'STOCHRSI', 'slowk', 'slowd',
    # net volume at ask vs. bid
    'trades_bid_ask',
]

In [46]:
# Random subset of 100,000 rows to keep the plots below fast.
sample = data.sample(n=100000)

In [47]:
# One distribution plot per feature on a fixed 3x7 grid.
# NOTE(review): the grid has exactly 21 panels; adding a feature requires
# resizing it. `sns.distplot` is deprecated in recent seaborn releases.
fig, axes = plt.subplots(nrows=3, ncols=7, figsize=(30, 12))
axes = axes.flatten()

for i, feature in enumerate(features):
    ax = axes[i]
    sns.distplot(sample[feature], ax=ax)
    ax.set_title(feature.upper())

sns.despine()
fig.tight_layout()
plt.show()

In [48]:
# Scatter of the 1-minute forward return against every feature, on the sampled rows.
sns.pairplot(sample, y_vars=['fwd1min'], x_vars=features);
plt.show()

In [49]:
# Pairwise feature correlations on the sampled rows, shown as a clustered heatmap.
corr = sample.loc[:, features].corr()

palette = sns.diverging_palette(20, 230, as_cmap=True)
sns.clustermap(corr, cmap=palette, center=0, vmin=-.25);

plt.show()

In [50]:
# Information coefficient per feature: the Spearman rank correlation with the
# 1-minute forward return, computed per trading day and then averaged.
# (Computing it per minute is very time-consuming.)
ic_by_feature = {}

for feature in tqdm(features):
    # Separate name on purpose: `df` was deleted above and should not come
    # back holding something else.
    pair = data[['fwd1min', feature]].dropna()
    # Vectorized day key; avoids building one Python date object per row.
    day = pair.index.get_level_values('date_time').normalize()
    daily_ic = pair.groupby(day).apply(lambda x: spearmanr(x.fwd1min, x[feature])[0])
    ic_by_feature[feature] = daily_ic.mean()

ic = pd.Series(ic_by_feature)

In [51]:
# Features ordered from lowest to highest information coefficient.
ic.sort_values()

In [52]:
# Bar chart of the information coefficients, highest first.
# Fixed the misspelling "Coeficient" in the figure title.
title = 'Information Coefficient for Intraday Features (1-min forward returns)'

# Upper-case the feature names for the axis labels (this relabels `ic` itself).
ic.index = ic.index.map(str.upper)

ax = ic.sort_values(ascending=False).plot.bar(figsize=(14, 4), title=title, rot=35)
ax.set_ylabel('Information Coefficient')
# Show the y axis as percentages with one decimal.
ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.1%}'.format(y)))
sns.despine()
plt.tight_layout();
plt.show()

### Storing Results

In [53]:
# Final column dtypes and non-null counts. `show_counts` replaces the
# `null_counts` argument, which newer pandas releases no longer accept.
data.info(show_counts=True)

In [54]:
# Persist the model inputs without the 'date', 'up' and 'down' columns.
# `key` is passed by keyword because newer pandas releases make it keyword-only.
data.drop(['date', 'up', 'down'], axis=1).to_hdf('data/algoseek.h5', key='model_data')