In [1]:
import sys
import os
from os import sep
from os.path import dirname, realpath
from pathlib import Path
import logging

def get_cwd(fname, subdir, crunch_dir=realpath(Path.home()) +sep +'crunch' +sep):
    """
    Convenience function to make a directory string for the current file based on inputs.
    Jupyter Notebook in Anaconda invokes the Python interpreter in Anaconda's subdirectory
    which is why changing sys.argv[0] is necessary. In the future a better way to do this
    should be preferred..
    """
    return crunch_dir +subdir +fname
    
def fix_path(cwd):
    """
    Convenience function to fix argv and python path so that jupyter notebook can run the same as
    any script in crunch.
    """
    sys.argv[0] = cwd
    module_path = os.path.abspath(os.path.join('..'))
    if module_path not in sys.path:
        sys.path.append(module_path)

fix_path(get_cwd('test.ipynb', 'recon' +sep))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numba import jit, vectorize, float64

from ipywidgets import interact, interactive, fixed
from IPython.display import display

pd.set_option("display.max_rows", 100)
pd.set_option('display.max_columns', 50)

from common_util import DT_HOURLY_FREQ, DT_CAL_DAILY_FREQ, DT_BIZ_DAILY_FREQ, get_custom_biz_freq, query_df, search_df, chained_filter, benchmark
from data.data_api import DataAPI
from data.access_util import col_subsetters as cs
from mutate.common import dum, count_nonnan, count_nonzero, count_both

In [2]:
from mutate.thresh import get_thresh_fth


In [11]:
	# LOAD DATA
	search_terms = {
		'stage': 'raw',
		'raw_cat': 'us_equity_index'
	}
	dfs, recs = {}, {}
	for rec, df in DataAPI.generate(search_terms):
		recs[rec.name] = rec
		dfs[rec.name] = df


	# THRESH SPECIFICATION
	# price vol based threshold
	fs_pairs = {
		'oc': ['close', 'open'],
		'lh': ['high', 'low'],
		'oa': ['avgPrice', 'open'],
		'ac': ['close', 'avgPrice']#,
		# 'oo': ['open', 'open'],
		# 'hh': ['high', 'high'],
		# 'll': ['low', 'low'],
		# 'cc': ['close', 'close'],
		# 'aa': ['avgPrice', 'avgPrice'],
	}

	# trmi based threshold
	trmi = [cs['#trmi']['all']]

	# constant percentage threshold
	start = 1e-8
	end = .5
	step = float(start/2)
	pct_threshes = np.arange(start, end+step, step=step)


	# FTH THRESHIZE LOOP
	date_range = {
		'id': ('lt', 2018)
	}
	for name, whole_df in dfs.items():
		print(name)
		original = recs[name]
		df = whole_df.loc[search_df(df, date_range)]

		pba_cols = chained_filter(df.columns, [cs['#pba']['ohlc']])
		vol_cols = chained_filter(df.columns, [cs['#vol']['ohlc']])
		trmi_cols = chained_filter(df.columns, [cs['#trmi']['all']])
		pba_df = df[pba_cols]
		vol_df = df[vol_cols]
		trmi_df = df[trmi_cols]

		# price based
		src = 'pba'
		src_fs_pairs = {key: ['_'.join([src, it]) for it in pair] for key, pair in fs_pairs.items()}
		for key, fs_cols in src_fs_pairs.items():
			fs = pba_df.loc[:, fs_cols].dropna()
			print(fs_cols)
			custom = get_custom_biz_freq(fs)
			thresh_df = get_thresh_fth(fs, thresh_type='spread', shift=False, pfx='_'.join([src, key]),
				org_freq=DT_HOURLY_FREQ, agg_freq=custom, shift_freq=DT_HOURLY_FREQ).drop(columns=['fast', 'slow', 'thresh'])


dow_jones_raw_0
['pba_close', 'pba_open']
['pba_high', 'pba_low']
['pba_avgPrice', 'pba_open']
['pba_close', 'pba_avgPrice']
sp_500_raw_1
['pba_close', 'pba_open']


KeyboardInterrupt: 

In [None]:
thresh_df

In [28]:
start = 1e-7
end = .1
step = start
pct_threshes = np.arange(start, end, step=step)

In [29]:
len(pct_threshes)

1000000

In [30]:
pct_threshes

array([1.00000e-07, 2.00000e-07, 3.00000e-07, ..., 9.99998e-02,
       9.99999e-02, 1.00000e-01])

### Load Raw Data

Get all raw data, filter out data before 2018 (leave 2018 for final validation)

In [34]:
search_terms = {
    'stage': 'raw'
}
date_range = {
    'id': ('lt', 2018)
}
dfs = {}
for rec, df in DataAPI.generate(search_terms):
    dfs[rec.name] = df.loc[search_df(df, date_range)]

In [35]:
list(dfs.keys())

['dow_jones_raw_0',
 'sp_500_raw_1',
 'nasdaq_100_raw_2',
 'russell_2000_raw_3',
 'oil_raw_4',
 'gold_raw_5']

In [36]:
pba = chained_filter(dfs['sp_500_raw_1'].columns, [cs['#pba']['ohlc']])
vol = chained_filter(dfs['sp_500_raw_1'].columns, [cs['#vol']['ohlc']])
pba_vol = pba + vol

In [37]:
pba_vol

['pba_open',
 'pba_high',
 'pba_low',
 'pba_close',
 'pba_avgPrice',
 'vol_open',
 'vol_high',
 'vol_low',
 'vol_close',
 'vol_avgPrice']

In [38]:
eq_df = dfs['sp_500_raw_1'][pba_vol]
em_df = dfs['oil_raw_4'][pba_vol]

In [7]:
from pandas.tseries.offsets import CustomBusinessDay, CustomBusinessHour
def missing_biz_days(ser):
    biz_days = pd.date_range(ser.index.min(), ser.index.max(), freq=DT_BIZ_DAILY_FREQ).date
    df_biz_days = ser.resample(DT_BIZ_DAILY_FREQ).mean().dropna().index.date

    biz_days = pd.DatetimeIndex(biz_days)
    df_biz_days = pd.DatetimeIndex(df_biz_days)

    return biz_days.difference(df_biz_days)

In [39]:
_spread_thresh = lambda f, s: abs(f - s)			# spread: abs arithmetic spread -> |fast - slow|
_ansr_thresh = lambda f, s: abs((f / s) - 1)		# ansr: abs net simple return 	-> |(fast / slow) - 1|
_alog_thresh = lambda f, s: abs(np.log(f / s))		# alog: abs log gross return 	-> |ln(fast / slow)|

In [9]:
def get_thresh_fth(intraday_df, thresh_type='ansr', org_freq=DT_HOURLY_FREQ, agg_freq=DT_BIZ_DAILY_FREQ, shift_freq=DT_BIZ_DAILY_FREQ, pfx=''):
	"""
	Return thresh estimates.

	Args:
		intraday_df (pd.DataFrame): intraday price dataframe with two columns (slow, fast)
		thresh_type (String): threshold type
		org_freq: freq of the original data
		agg_freq: freq to use for groupby aggregation
		shift_freq: freq to use for shift ∈ {org_freq, agg_freq}
		pfx (String, optional): prefix to all column names
	
	Returns:
		Return pd.DataFrame with derived columns
	"""
	th =  'fth'
	_cname = lambda s: '_'.join([pfx, th, thresh_type, s])

	if (thresh_type == 'spread'):
		thresh_fun = _spread_thresh
	elif (thresh_type == 'ansr'):
		thresh_fun = _ansr_thresh
	elif (thresh_type == 'alog'):
		thresh_fun = _alog_thresh

	derived = pd.DataFrame(index=intraday_df.index)
	derived['slow'] = intraday_df.iloc[:, 0]
	derived['fast'] = intraday_df.iloc[:, 1]
	derived['thresh'] = thresh_fun(derived['fast'], derived['slow'])
	gb = derived.groupby(pd.Grouper(freq=agg_freq))

	if (shift_freq == agg_freq):
		# static average
		derived[_cname('avg')] = gb['thresh'].transform(pd.Series.mean)#.shift(freq=shift_freq)

		# static standard deviation
		derived[_cname('std')] = gb['thresh'].transform(pd.Series.std)#.shift(freq=shift_freq)

		# static median
		derived[_cname('med')] = gb['thresh'].transform(pd.Series.median)#.shift(freq=shift_freq)
		
		# static largest
		derived[_cname('max')] = gb['thresh'].transform(pd.Series.max)#.shift(freq=shift_freq)

		# static smallest
		derived[_cname('min')] = gb['thresh'].transform(pd.Series.min)#.shift(freq=shift_freq)

		# second-to-last of previous day
		derived[_cname('sec')] = gb['thresh'].transform(lambda x: x.iat[len(x)-2])#.shift(freq=shift_freq)
		
		# final of previous day
		derived[_cname('fin')] = gb['thresh'].transform(pd.Series.last, org_freq)#.shift(freq=shift_freq)

		# whole of previous day
		last_fast = gb['fast'].transform(pd.Series.last, org_freq)
		first_slow = gb['slow'].transform(pd.Series.first, org_freq)
		derived[_cname('whl')] = thresh_fun(last_fast, first_slow)#.shift(freq=shift_freq)

		# # For days where previous day has less hours than current
		# derived = derived.groupby(pd.Grouper(freq=agg_freq)).fillna(method='ffill')

	elif (shift_freq == org_freq):

		# final of previous period
		derived[_cname('prev')] = gb['thresh'].transform(lambda ser: ser)

		# expanding average
		derived[_cname('x_avg')] = gb['thresh'].transform(lambda ser: ser.expanding().mean())

		# expanding standard deviation
		derived[_cname('x_std')] = gb['thresh'].transform(lambda ser: ser.expanding().std())

		# expanding max
		derived[_cname('x_max')] = gb['thresh'].transform(lambda ser: ser.expanding().max())
		
		# expanding min
		derived[_cname('x_min')] = gb['thresh'].transform(lambda ser: ser.expanding().min())

	return derived

In [23]:
def get_thresh_vth(intraday_df, thresh_type='ansr', org_freq=DT_HOURLY_FREQ, agg_freq=DT_BIZ_DAILY_FREQ, shift_freq=DT_BIZ_DAILY_FREQ, per=None, pfx=''):
	"""
	Return thresh estimates.
	Variable time horizon allows thresholds to go beyond the specified aggregation frequency.

	Args:
		intraday_df (pd.DataFrame): intraday price dataframe with two columns (slow, fast)
		thresh_type (String): threshold type
		org_freq: freq of the original data
		agg_freq: freq to use for groupby aggregation
		shift_freq: freq to use for shift ∈ {org_freq, agg_freq}
		pfx (String, optional): prefix to all column names
	
	Returns:
		Return pd.DataFrame with derived columns
	"""
	time_hor = str(per)+str(shift_freq)
	th =  'vth(' +time_hor +')'
	_cname = lambda s: '_'.join([pfx, th, s, thresh_type])

	if (thresh_type == 'spread'):
		thresh_fun = _spread_thresh
	elif (thresh_type == 'ansr'):
		thresh_fun = _ansr_thresh
	elif (thresh_type == 'alog'):
		thresh_fun = _alog_thresh

	derived = pd.DataFrame(index=intraday_df.index)
	derived['slow'] = intraday_df.iloc[:, 0]
	derived['fast'] = intraday_df.iloc[:, 1]
	derived['thresh'] = thresh_fun(derived['fast'], derived['slow'])
	gb = derived.groupby(pd.Grouper(freq=agg_freq))
	roll = derived.rolling(time_hor) # Rolling object of size 'per' 'agg_freq's
	expand = derived.expanding()

	# ROLLING
	# simple moving average
	derived[_cname('s_ma')] = roll.mean()

	# simple moving standard deviation
	derived[_cname('s_std')] = roll.std()

	# simple moving median
	derived[_cname('s_med')] = roll.median()
	
	# simple moving largest
	derived[_cname('s_max')] = roll.max()

	# simple moving smallest
	derived[_cname('s_min')] = roll.min()

	# EXPANDING
	# expanding average
	derived[_cname('x_ma')] = expand.mean()

	# expanding standard deviation
	derived[_cname('x_std')] = expand.std()

	# expanding median
	derived[_cname('x_med')] = expand.median()
	
	# expanding largest
	derived[_cname('x_max')] = expand.max()

	# expanding smallest
	derived[_cname('x_min')] = expand.min()
	
	if (shift_freq == agg_freq):
		pass# # whole of previous day
		# last_fast = gb['fast'].transform(pd.Series.last, org_freq)
		# first_slow = gb['slow'].transform(pd.Series.first, org_freq)
		# whole = thresh_fun(last_fast, first_slow)
		# derived[_cname('whl')] = whole.shift(freq=shift_freq)

		# # For days where previous day has less hours than current
		# derived = derived.groupby(pd.Grouper(freq=agg_freq)).fillna(method='ffill')

	elif (shift_freq == org_freq):
		ew = derived.ewm(span=per)

		# EXPONENTIAL
		# simple moving average
		derived[_cname('e_ma')] = ew.mean()

		# simple moving standard deviation
		derived[_cname('e_std')] = ew.std()

		# median
		derived[_cname('e_med')] = ew.median()
		
		# largest
		derived[_cname('e_max')] = ew.max()

		# smallest
		derived[_cname('e_min')] = ew.min()


	return derived

In [44]:
dtype='pba'
oc = eq_df.loc[:, [dtype +'_open', dtype +'_close']].dropna()

In [57]:
_alog_thresh = lambda f, s: np.log((f / s)+1)

In [58]:
val = _alog_thresh(oc[dtype +'_close'], oc[dtype +'_open'])

In [59]:
val

id
1998-01-02 14:00:00+00:00    0.693348
1998-01-02 15:00:00+00:00    0.692544
1998-01-02 16:00:00+00:00    0.692807
1998-01-02 17:00:00+00:00    0.694240
1998-01-02 18:00:00+00:00    0.692158
1998-01-02 19:00:00+00:00    0.692027
1998-01-02 20:00:00+00:00    0.696420
1998-01-02 21:00:00+00:00    0.693902
1998-01-05 14:00:00+00:00    0.693532
1998-01-05 15:00:00+00:00    0.694709
1998-01-05 16:00:00+00:00    0.694765
1998-01-05 17:00:00+00:00    0.691807
1998-01-05 18:00:00+00:00    0.690596
1998-01-05 19:00:00+00:00    0.693055
1998-01-05 20:00:00+00:00    0.694081
1998-01-05 21:00:00+00:00    0.693726
1998-01-06 14:00:00+00:00    0.689738
1998-01-06 15:00:00+00:00    0.692766
1998-01-06 16:00:00+00:00    0.690958
1998-01-06 17:00:00+00:00    0.693877
1998-01-06 18:00:00+00:00    0.694506
1998-01-06 19:00:00+00:00    0.691821
1998-01-06 20:00:00+00:00    0.692842
1998-01-06 21:00:00+00:00    0.693235
1998-01-07 14:00:00+00:00    0.691066
1998-01-07 15:00:00+00:00    0.690333
1998-01-0

In [13]:
ansr_a_fth = get_thresh_fth(oc, thresh_type='ansr', org_freq=DT_HOURLY_FREQ, agg_freq=cust, shift_freq=cust, pfx=dtype)
ansr_o_fth = get_thresh_fth(oc, thresh_type='ansr', org_freq=DT_HOURLY_FREQ, agg_freq=cust, shift_freq=DT_HOURLY_FREQ, pfx=dtype)

In [24]:
ansr_a_vth = get_thresh_vth(oc, thresh_type='ansr', org_freq=DT_HOURLY_FREQ, agg_freq=DT_CAL_DAILY_FREQ, shift_freq=DT_CAL_DAILY_FREQ, per=3, pfx=dtype)
ansr_o_vth = get_thresh_vth(oc, thresh_type='ansr', org_freq=DT_HOURLY_FREQ, agg_freq=DT_CAL_DAILY_FREQ, shift_freq=DT_HOURLY_FREQ, per=3, pfx=dtype)

ValueError: Wrong number of items passed 3, placement implies 1

In [22]:
ansr_a_vth

Unnamed: 0_level_0,slow,fast,thresh,pba_vth(3D)_s_ma_ansr,pba_vth(3D)_s_std_ansr,pba_vth(3D)_s_med_ansr,pba_vth(3D)_s_max_ansr,pba_vth(3D)_s_min_ansr,pba_vth(3D)_x_ma_ansr,pba_vth(3D)_x_std_ansr,pba_vth(3D)_x_med_ansr,pba_vth(3D)_x_max_ansr,pba_vth(3D)_x_min_ansr
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1998-01-02 14:00:00+00:00,970.43,970.82,0.000402,"DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c..."
1998-01-02 15:00:00+00:00,970.72,969.55,0.001205,"DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c..."
1998-01-02 16:00:00+00:00,969.63,968.97,0.000681,"DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c..."
1998-01-02 17:00:00+00:00,969.00,971.12,0.002188,"DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c..."
1998-01-02 18:00:00+00:00,971.10,969.18,0.001977,"DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c..."
1998-01-02 19:00:00+00:00,969.16,966.99,0.002239,"DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c..."
1998-01-02 20:00:00+00:00,967.09,973.43,0.006556,"DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c..."
1998-01-02 21:00:00+00:00,973.53,975.00,0.001510,"DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c..."
1998-01-05 14:00:00+00:00,975.04,975.79,0.000769,"DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c..."
1998-01-05 15:00:00+00:00,975.89,978.94,0.003125,"DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c...","DatetimeIndexResampler [freq=<Hour>, axis=0, c..."


## Volatility Estimators

In [7]:
def day_spread(group_df):
    slow = group_df.iloc[:, 0]
    fast = group_df.iloc[:, 1]
    
    day_diff = fast.last(DT_HOURLY_FREQ) - slow.first(DT_HOURLY_FREQ)
    
    return abs(day_dff)

In [8]:
def get_spreads(intraday_df, pfx='', cal=DT_BIZ_DAILY_FREQ):
    """
    Return spreads.
    
    Args:
        intraday_df (pd.DataFrame): intraday price dataframe with two columns (slow, fast)
        pfx (String, optional): prefix to all column names
        cal (pd.Calender): calender to use for groupby and shift
    
    Returns:
        Return pd.DataFrame with derived columns
    """
    spreads = intraday_df.copy()
    spreads['diff'] = abs(intraday_df.iloc[:, 1] - intraday_df.iloc[:, 0])
    spreads['nsr'] = abs((intraday_df.iloc[:, 1] / intraday_df.iloc[:, 0]) - 1)

    gb = spreads.iloc[:, :2].groupby(pd.Grouper(freq=cal))
    diffs_gb = spreads.loc[:, 'nsr'].groupby(pd.Grouper(freq=cal))
    
    # PREV DAY BASED
    # average spread
    spreads[pfx +'avg_spread'] = diffs_gb.transform(pd.Series.mean).shift(freq=cal)
    
    # largest spread
    spreads[pfx +'big_spread'] = diffs_gb.transform(pd.Series.max).shift(freq=cal)

    # second-to-last spread of previous day
    spreads[pfx +'sec_spread'] = diffs_gb.transform(lambda x: x.iat[len(x)-2]).shift(freq=cal)
    
    # final spread of previous day
    spreads[pfx +'fin_spread'] = diffs_gb.transform(pd.Series.last, DT_HOURLY_FREQ).shift(freq=cal)
    
    # Deal with days where previous day has less hours than current
    spreads = spreads.groupby(pd.Grouper(freq=cal)).fillna(method='ffill')

    # CUR DAY BASED
    # TODO
#     # whole spread of previous day.
#     spreads[pfx +'day_spread'] = gb.transform(day_spread)
    
#     # latest spread of current day
#     spreads[pfx +'lat_spread'] = intraday_df.shift(1)

#     # moving average spread of current day
#     spreads[pfx +'mav_spread'] = 
    return spreads
    

In [9]:
def run_spreads(df, str_pfx, cal=DT_BIZ_DAILY_FREQ):
    pfx = str_pfx +'_'
    # OR get every combination of two...
    return get_spreads(eq_df.loc[:, [pfx +'open', pfx +'close']].dropna(), pfx=pfx +'oc_', cal=cal)
#     get_spreads(eq_df.loc[:, [pfx +'open', pfx +'high']].dropna(), pfx=pfx +'oh_')
#     get_spreads(eq_df.loc[:, [pfx +'open', pfx +'low']].dropna(), pfx=pfx +'ol_')
#     get_spreads(eq_df.loc[:, [pfx +'open', pfx +'avgPrice']].dropna(), pfx=pfx +'oa_')

#     get_spreads(eq_df.loc[:, [pfx +'low', pfx +'high']].dropna(), pfx=pfx +'lc_')
#     get_spreads(eq_df.loc[:, [pfx +'low', pfx +'close']].dropna(), pfx=pfx +'lc_')
#     get_spreads(eq_df.loc[:, [pfx +'high', pfx +'close']].dropna(), pfx=pfx +'hc_')
#     get_spreads(eq_df.loc[:, [pfx +'avgPrice', pfx +'close']].dropna(), pfx=pfx +'ac_')

#     return joined

# spd = run_spreads(eq_df, str_pfx='pba')
# run_spreads(eq_df, pfx='vol

In [10]:
from pandas.tseries.offsets import CustomBusinessDay, CustomBusinessHour
def missing_biz_days(ser):
    biz_days = pd.date_range(ser.index.min(), ser.index.max(), freq=DT_BIZ_DAILY_FREQ).date
    df_biz_days = ser.resample(DT_BIZ_DAILY_FREQ).mean().dropna().index.date

    biz_days = pd.DatetimeIndex(biz_days)
    df_biz_days = pd.DatetimeIndex(df_biz_days)

    return biz_days.difference(df_biz_days)

miss = [None] * 5
miss[0] = missing_biz_days(eq_df['pba_open'])
miss[1] = missing_biz_days(eq_df['pba_high'])
miss[2] = missing_biz_days(eq_df['pba_low'])
miss[3] = missing_biz_days(eq_df['pba_close'])
miss[4] = missing_biz_days(eq_df['pba_avgPrice'])

assert(all(miss[0]==miss[1]) and all(miss[0]==miss[2]) and all(miss[0]==miss[3]) and all(miss[0]==miss[4]))

cust = CustomBusinessDay(holidays=miss[2])

## Intraday Triple Barrier Price Velocity

In [247]:
@vectorize([float64(float64, float64, float64)], nopython=True)
def thresh_break(start, end, thresh):
    change = (end / start) - 1

    return change if (change > thresh or change < -thresh) else 0

In [248]:
def find_touch(group_df, per_shift=1):
    """
    Return touch found
    """
    group_df = group_df.dropna()
    if (group_df.empty):
        return np.NaN

    start_arr = np.array(group_df.loc[:, 'start'].first(DT_HOURLY_FREQ))
    end_arr = np.array(group_df['end'].values)
    thresh_arr = np.array(group_df['thresh'].values)

    stats = {
        "dir": 0,
        "mag": 0,
        "brk": 0,
        "day": end_arr.size
    }

    breaks = thresh_break(start_arr, end_arr, thresh_arr)
    break_ids = np.flatnonzero(breaks)
    
    if (break_ids.size != 0):
        # Change to first threshold break
        change = breaks[break_ids[0]]
        stats['brk'] = break_ids[0] + per_shift
    else:
        # End of day change, no threshold
        change = (end_arr[-1] / start_arr[0]) - 1

    stats['dir'] = np.sign(change)
    stats['mag'] = abs(change)

    return stats['dir'], stats['mag'], stats['brk'], stats['day']

In [260]:
def id_triple_barrier(intraday_df, thresh, scalar=1.0, agg_freq=DT_BIZ_DAILY_FREQ):
    """
    Return intraday triple barrier label series.
    
    Args:
        intraday_df (pd.DataFrame): intraday price dataframe
        thresh (String): name of threshold column
        scalar (float, ℝ≥0): threshold multiplier
    
    Returns:
        Return pd.DataFrame with four columns:
            - 'dir': price direction
            - 'spd': change speed
            - 'brk': period of break (zero if none)
            - 'day': number of trading periods
    """
    # DF Preprocessing
    col_renames = {
        intraday_df.columns[0]: "start",
        thresh: "thresh"
    }
    num_cols = len(intraday_df.columns)
    if (num_cols == 2):
        intraday_df['end'] = intraday_df[intraday_df.columns[0]]
    elif (num_cols > 2):
        col_renames[intraday_df.columns[1]] = 'end'
    intraday_df.rename(columns=col_renames, inplace=True)
    
    # Scale
    intraday_df['thresh'] = scalar * intraday_df['thresh']

    # Apply
    labels = intraday_df.groupby(pd.Grouper(freq=agg_freq)).apply(find_touch)
    labels = labels.apply(pd.Series)
    labels.columns=['dir','mag', 'brk', 'day']
    return labels

In [276]:
new_df = eq_df[['pba_open', 'pba_close']].copy()
new_df = new_df.join(spd['pba_oc_sec_spread'])
new_df['const_thresh'] = .005

labs = id_triple_barrier(new_df.dropna(), thresh='pba_oc_sec_spread', scalar=1.0, agg_freq=cust)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [277]:
labs

Unnamed: 0_level_0,dir,mag,brk,day
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1998-01-05 00:00:00+00:00,1.0,0.007313,3.0,8.0
1998-01-06 00:00:00+00:00,-1.0,0.006806,1.0,8.0
1998-01-07 00:00:00+00:00,-1.0,0.004159,1.0,5.0
1998-01-08 00:00:00+00:00,-1.0,0.004098,1.0,8.0
1998-01-09 00:00:00+00:00,-1.0,0.007751,1.0,8.0
1998-01-12 00:00:00+00:00,1.0,0.012159,7.0,8.0
1998-01-13 00:00:00+00:00,1.0,0.013330,7.0,8.0
1998-01-14 00:00:00+00:00,-1.0,0.003802,3.0,8.0
1998-01-15 00:00:00+00:00,-1.0,0.004207,1.0,8.0
1998-01-16 00:00:00+00:00,1.0,0.007741,1.0,8.0


In [286]:
labs[1==labs['brk']]['dir'].value_counts()

 1.0    1641
-1.0    1490
Name: dir, dtype: int64

### Label EDA

In [16]:
daily_f = dfs['sp_500_raw_1'][pba].groupby(pd.Grouper(freq='B'))['pba_open'].first()
daily_l = dfs['sp_500_raw_1'][pba].groupby(pd.Grouper(freq='B'))['pba_close'].last()

In [17]:
daily_f.head()

id
1998-01-01 00:00:00+00:00       NaN
1998-01-02 00:00:00+00:00    970.43
1998-01-05 00:00:00+00:00    975.04
1998-01-06 00:00:00+00:00    977.07
1998-01-07 00:00:00+00:00    966.58
Freq: B, Name: pba_open, dtype: float64

In [18]:
daily_l.head()

id
1998-01-01 00:00:00+00:00       NaN
1998-01-02 00:00:00+00:00    975.00
1998-01-05 00:00:00+00:00    977.07
1998-01-06 00:00:00+00:00    966.58
1998-01-07 00:00:00+00:00    954.78
Freq: B, Name: pba_close, dtype: float64

In [27]:
pct_changes = (daily_l / daily_f) - 1

In [38]:
#Show sentiment characteristics at sizable (at least greater than 1%) moves
move_thresh = .01
large_moves_pos = pct_changes[pct_changes >= move_thresh]
large_moves_neg = pct_changes[pct_changes <= -move_thresh]
print('all rows:', len(pct_changes))
print('all rows, no weekends:', len(pct_changes.dropna()))

print('\ntotal large moves:', len(large_moves_pos) + len(large_moves_neg))
print('large up moves [greater than', str(move_thresh) +']:', len(large_moves_pos))
print('large down moves [less than', str(-move_thresh) +']:', len(large_moves_neg))

all rows: 5217
all rows, no weekends: 5013

total large moves: 1345
large up moves [greater than 0.01]: 678
large down moves [less than -0.01]: 667


In [29]:
fins = dfs['sp_500_raw_1'][pba].groupby(pd.Grouper(freq='Y'))['pba_close'].describe()

In [118]:
jan2['diff'] = jan2['pba_close'] - jan2['pba_open']
jan2['pct_diff'] = jan2['diff'] / jan2['pba_close']

# Prev price diff
jan2['pv_pct_diff'] = jan2['pct_diff'].shift(1)

# Prev price spread
jan2['pv_spread'] = (jan2['pba_high'] - jan2['pba_low']).shift(1)

# Prev IV
jan2['pv_vol_diff'] = (jan2['vol_close'] - jan2['vol_open']).shift(1)

# Prev IV pct_change
jan2['pv_vol_pct_diff'] = jan2['pv_vol_diff'] / jan2['vol_close'].shift(1)

# Prev IV spread
jan2['pv_vol_spread'] = (jan2['vol_high'] - jan2['vol_low']).shift(1)

In [119]:
fcs = ['pv_pct_diff', 'pv_spread', 'pv_vol_diff', 'pv_vol_pct_diff', 'pv_vol_spread']
lcs = ['pct_diff']
jan2 = jan2.dropna(axis=0, how='any', subset=fcs)
cm = corr_mat(jan2, feat_col_name=fcs, lab_col_name=lcs)
cm

Unnamed: 0,pct_diff
pv_pct_diff,-0.000515
pv_spread,0.008056
pv_vol_diff,0.003881
pv_vol_pct_diff,0.000885
pv_vol_spread,0.010523


In [None]:
frame['pba_numBids'].hist()

In [None]:
interact(dist_probs, col_name=full_list, value=(0.01, .1, .001), showboth=False, suppress_print=False);

In [None]:
tenth_inner_prob = {} #probability value will land within .1 of the mean
for sent in sent_list:
    tenth_inner_prob[sent] = dist_probs(sent, .1, suppress_print=True)

import operator
sorted_tenth_prob = sorted(tenth_inner_prob.items(), key=operator.itemgetter(1))
less_than_85_percent = list(filter(lambda x: x[1] < .85, sorted_tenth_prob))
greater_than_85_percent = list(filter(lambda x: x[1] >= .85, sorted_tenth_prob))
print(less_than_85_percent)

less_85_stats = pd.DataFrame(columns=['sent', 'mean', 'variance'])
lt85 = [tup[0] for tup in less_than_85_percent]
for sent in lt85:
    less_85_stats = less_85_stats.append({'sent':sent, 'mean':data[sent].mean(), 'variance':data[sent].var()}, ignore_index=True)

greater_85_stats = pd.DataFrame(columns=['sent', 'mean', 'variance'])
gt85 = [tup[0] for tup in greater_than_85_percent]
for sent in gt85:
    greater_85_stats = greater_85_stats.append({'sent':sent, 'mean':data[sent].mean(), 'variance':data[sent].var()}, ignore_index=True)

less_85_stats

TODO: Compare sentiment vectors at large movement thresholds ($\pm 6$% , $\pm 4$% , $\pm 2$% , $\pm 1.5$%) to sentiment vectors of the other price movements.
Compare things such as min/max, mean, median, variance, average probability (based on normal dist)