In [1]:
import sys
import os
from os import sep
from os.path import dirname, realpath
from pathlib import Path
import logging

def get_cwd(fname, subdir, crunch_dir=realpath(Path.home()) +sep +'crunch' +sep):
    """
    Build the path string of a file that lives inside the crunch directory tree.

    The three pieces are concatenated verbatim, so `crunch_dir` and `subdir` are expected to
    already end with a path separator. The default `crunch_dir` is computed once, when the
    function is defined.

    Args:
        fname (str): file name, e.g. 'test.ipynb'
        subdir (str): subdirectory below `crunch_dir`, including its trailing separator
        crunch_dir (str, optional): root directory string, defaults to the 'crunch' directory
            inside the user's home directory

    Returns:
        str: crunch_dir, subdir and fname joined without any extra separator
    """
    path_parts = (crunch_dir, subdir, fname)
    return ''.join(path_parts)
    
def fix_path(cwd):
    """
    Make the notebook's interpreter state resemble that of a script run from within crunch.

    Overwrites sys.argv[0] with `cwd` and makes sure the absolute path of the parent of the
    current working directory is on sys.path (appended at the end, at most once).

    Args:
        cwd (str): value to store in sys.argv[0]
    """
    sys.argv[0] = cwd
    parent_dir = os.path.abspath(os.pardir)
    if parent_dir in sys.path:
        return
    sys.path.append(parent_dir)

fix_path(get_cwd('test.ipynb', 'recon' +sep))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numba import jit, vectorize, float64

from ipywidgets import interact, interactive, fixed
from IPython.display import display

pd.set_option("display.max_rows", 100)
pd.set_option('display.max_columns', 50)

from common_util import DT_HOURLY_FREQ, DT_BIZ_DAILY_FREQ, query_df, search_df, chained_filter, benchmark
from data.data_api import DataAPI
from data.access_util import col_subsetters as cs
from mutate.common import dum#, count_nonnan, count_nonzero, count_both
# from recon.transform import *
# from recon.filter import *
# from recon.viz import *
# from recon.corr import corr_mat

### Load Raw Data

Load all raw data and keep only the rows dated before 2018 (data from 2018 onward is held out for final validation).

In [167]:
# Search terms handed to DataAPI.generate: only records whose 'stage' is 'raw'.
search_terms = {
    'stage': 'raw'
}
# Row filter handed to search_df.
# NOTE(review): ('lt', 2018) presumably means "id less than 2018" — confirm against common_util.search_df.
date_range = {
    'id': ('lt', 2018)
}
# Filtered dataframes keyed by record name.
dfs = {}
for rec, df in DataAPI.generate(search_terms):
    dfs[rec.name] = df.loc[search_df(df, date_range)]

In [168]:
list(dfs.keys())

['dow_jones_raw_0',
 'sp_500_raw_1',
 'nasdaq_100_raw_2',
 'russell_2000_raw_3',
 'oil_raw_4',
 'gold_raw_5']

In [169]:
# Column names of the S&P 500 frame selected by the '#pba' and '#vol' ohlc subsetters.
pba = chained_filter(dfs['sp_500_raw_1'].columns, [cs['#pba']['ohlc']])
vol = chained_filter(dfs['sp_500_raw_1'].columns, [cs['#vol']['ohlc']])
# Both column groups combined: pba columns first, then vol columns.
pba_vol = pba + vol

In [170]:
pba_vol

['pba_open',
 'pba_high',
 'pba_low',
 'pba_close',
 'pba_avgPrice',
 'vol_open',
 'vol_high',
 'vol_low',
 'vol_close',
 'vol_avgPrice']

In [171]:
# Restrict the S&P 500 and oil frames to the pba/vol columns selected above.
eq_df = dfs['sp_500_raw_1'][pba_vol]
em_df = dfs['oil_raw_4'][pba_vol]

## Volatility Estimators

In [172]:
# Fractional change from 'pba_open' to 'pba_close' for every row of eq_df.
diffs = (eq_df['pba_close'] / eq_df['pba_open']) -1

In [8]:
# @vectorize([float64(float64, float64)], nopython=True)
# def avg_spread(first, second):
#     return first - second
#     change = (end / start) - 1

#     return change if (change > thresh or change < -thresh) else 0

In [96]:
def day_spread(group_df):
    """
    Return the absolute spread between the end of the fast column and the start of the slow column.

    Args:
        group_df (pd.DataFrame): one group of intraday prices; column 0 is the slow series
            and column 1 is the fast series

    Returns:
        pd.Series: absolute value of (trailing DT_HOURLY_FREQ window of the fast series minus
            leading DT_HOURLY_FREQ window of the slow series)
    """
    slow = group_df.iloc[:, 0]
    fast = group_df.iloc[:, 1]

    # NOTE(review): Series subtraction aligns on the index, and the two windows generally
    # carry different timestamps, so this may produce NaN — confirm the intended alignment.
    day_diff = fast.last(DT_HOURLY_FREQ) - slow.first(DT_HOURLY_FREQ)

    # Fixed: the original returned the undefined name `day_dff`, raising NameError.
    return abs(day_diff)

In [118]:
def get_spreads(intraday_df, pfx=''):
    """
    Return a copy of `intraday_df` extended with per-row and per-business-day spread columns.

    Args:
        intraday_df (pd.DataFrame): intraday price dataframe with two columns (slow, fast)
        pfx (String, optional): prefix of the derived daily columns (the 'diff' column is
            not prefixed)

    Returns:
        pd.DataFrame: copy of the input with these columns added:
            - 'diff': absolute difference between column 1 and column 0 of each row
            - pfx +'avg_spread': daily mean of 'diff', index-shifted by one DT_BIZ_DAILY_FREQ period
            - pfx +'big_spread': daily maximum of 'diff'
            - pfx +'fin_spread': 'diff' in the trailing DT_HOURLY_FREQ window of each day
    """
    spreads = intraday_df.copy()
    spreads['diff'] = abs(intraday_df.iloc[:, 1] - intraday_df.iloc[:, 0])

    # Group the per-row spread by business day (the unused groupby over the raw price
    # columns that the original also built has been removed).
    diffs_gb = spreads.loc[:, 'diff'].groupby(pd.Grouper(freq=DT_BIZ_DAILY_FREQ))

    # PREV DAY BASED
    # average spread: the daily mean is broadcast to every row, forward-filled, then its
    # index is shifted one period so the assignment aligns it with the following day's rows.
    # .ffill() replaces fillna(axis=0, method='ffill'), whose `method` argument is deprecated.
    spreads[pfx +'avg_spread'] = diffs_gb.transform(pd.Series.mean).ffill().shift(freq=DT_BIZ_DAILY_FREQ)

    # largest spread
    # NOTE(review): unlike avg_spread, no shift is applied here or to fin_spread, so both
    # are computed from the same day's rows although this section is headed "PREV DAY
    # BASED" — confirm whether a shift is intended.
    spreads[pfx +'big_spread'] = diffs_gb.transform(pd.Series.max).ffill()

    # final spread of previous day (see the review note above: currently not shifted)
    spreads[pfx +'fin_spread'] = diffs_gb.transform(pd.Series.last, DT_HOURLY_FREQ).ffill()

    # TODO (planned columns, not implemented yet):
    #   - pfx +'day_spread': whole spread of previous day, via day_spread on the
    #     business-day groups of the two price columns
    #   - pfx +'lat_spread': latest spread of current day (intraday_df.shift(1))
    #   - pfx +'mav_spread': moving average spread of current day
    return spreads
    

In [119]:
def run_spreads(df, str_pfx):
    """
    Compute the open/close spread columns for one column family of `df`.

    Args:
        df (pd.DataFrame): intraday dataframe containing the columns str_pfx +'_open' and
            str_pfx +'_close'
        str_pfx (str): column family prefix without the trailing underscore, e.g. 'pba'

    Returns:
        pd.DataFrame: result of get_spreads for the (open, close) pair after dropping rows
            with missing values; derived columns are prefixed with str_pfx +'_oc_'
    """
    pfx = str_pfx +'_'

    # TODO: OR get every combination of two columns and join the results, e.g.
    #   (open, high) -> 'oh_', (open, low) -> 'ol_', (open, avgPrice) -> 'oa_',
    #   (low, high) -> 'lh_', (low, close) -> 'lc_', (high, close) -> 'hc_',
    #   (avgPrice, close) -> 'ac_'

    # Fixed: read from the `df` argument; the original ignored it and always used the
    # notebook-global eq_df.
    return get_spreads(df.loc[:, [pfx +'open', pfx +'close']].dropna(), pfx=pfx +'oc_')

spd = run_spreads(eq_df, str_pfx='pba')
# run_spreads(eq_df, pfx='vol

In [184]:
def missing_biz_days(ser):
    """
    Return the business days within the span of `ser` that have no data.

    Args:
        ser (pd.Series): series on a DatetimeIndex

    Returns:
        pd.DatetimeIndex: calendar dates between the first and last index value, at
            DT_BIZ_DAILY_FREQ frequency, for which the resampled daily mean of `ser` is missing
    """
    first_ts, last_ts = ser.index.min(), ser.index.max()

    # Every expected business day in the covered span, reduced to plain calendar dates.
    expected_days = pd.DatetimeIndex(
        pd.date_range(first_ts, last_ts, freq=DT_BIZ_DAILY_FREQ).date
    )

    # Business days that still hold a value after resampling to a daily mean.
    daily_mean = ser.resample(DT_BIZ_DAILY_FREQ).mean().dropna()
    observed_days = pd.DatetimeIndex(daily_mean.index.date)

    return expected_days.difference(observed_days)

miss = missing_biz_days(eq_df['pba_close'])
print(miss)

DatetimeIndex(['1998-01-01', '1998-01-19', '1998-02-16', '1998-03-26',
               '1998-04-10', '1998-04-14', '1998-04-21', '1998-05-25',
               '1998-07-01', '1998-07-03',
               ...
               '2016-12-26', '2017-01-02', '2017-01-16', '2017-02-20',
               '2017-04-14', '2017-05-29', '2017-07-04', '2017-09-04',
               '2017-11-23', '2017-12-25'],
              dtype='datetime64[ns]', length=204, freq=None)


In [None]:
eq_df

In [120]:
spd

Unnamed: 0_level_0,pba_open,pba_close,diff,pba_oc_avg_spread,pba_oc_big_spread,pba_oc_fin_spread
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1998-01-02 14:00:00+00:00,970.43,970.82,0.39,,6.34,1.47
1998-01-02 15:00:00+00:00,970.72,969.55,1.17,,6.34,1.47
1998-01-02 16:00:00+00:00,969.63,968.97,0.66,,6.34,1.47
1998-01-02 17:00:00+00:00,969.00,971.12,2.12,,6.34,1.47
1998-01-02 18:00:00+00:00,971.10,969.18,1.92,,6.34,1.47
1998-01-02 19:00:00+00:00,969.16,966.99,2.17,,6.34,1.47
1998-01-02 20:00:00+00:00,967.09,973.43,6.34,,6.34,1.47
1998-01-02 21:00:00+00:00,973.53,975.00,1.47,,6.34,1.47
1998-01-05 14:00:00+00:00,975.04,975.79,0.75,2.03000,4.99,1.13
1998-01-05 15:00:00+00:00,975.89,978.94,3.05,2.03000,4.99,1.13


## Intraday Triple Barrier Price Velocity

In [29]:
@vectorize([float64(float64, float64, float64)], nopython=True)
def thresh_break(start, end, thresh):
    """
    Return the fractional change from `start` to `end` if it lies strictly outside
    the band [-thresh, thresh], otherwise zero.
    """
    change = (end / start) - 1

    # Strict comparisons: a change exactly equal to +/-thresh does not count as a break.
    if change > thresh or change < -thresh:
        return change
    return 0.0

In [88]:
def find_touch(group_df, thresh, per_shift=1):
    """
    Return the first threshold break of one group, or its end-of-group change if none occurs.

    Each 'end_ser' value is compared, through thresh_break, against the 'start_ser' value(s)
    in the leading DT_HOURLY_FREQ window of the group.

    Args:
        group_df (pd.DataFrame): rows of one group with columns 'start_ser' and 'end_ser'
        thresh (float or np.ndarray): fractional change that counts as a break
        per_shift (int, optional): offset added to the zero-based position of the first break

    Returns:
        np.nan if the group has no complete rows, otherwise the tuple (dir, mag, brk, day):
            - dir: sign of the change
            - mag: absolute value of the change
            - brk: position of the first break plus per_shift (0 if there was no break)
            - day: number of 'end_ser' values in the group
    """
    group_df = group_df.dropna()
    if group_df.empty:
        # np.nan rather than the np.NaN alias, which was removed in NumPy 2.0.
        return np.nan

    start_arr = np.array(group_df.loc[:, 'start_ser'].first(DT_HOURLY_FREQ))
    end_arr = np.array(group_df['end_ser'].values)

    # Non-zero entries mark the periods whose change from the start exceeded the threshold.
    breaks = thresh_break(start_arr, end_arr, thresh)
    break_ids = np.flatnonzero(breaks)

    if break_ids.size != 0:
        # Change at the first threshold break
        first_break = break_ids[0]
        change = breaks[first_break]
        brk = first_break + per_shift
    else:
        # End of day change, no threshold was broken
        change = (end_arr[-1] / start_arr[0]) - 1
        brk = 0

    return np.sign(change), abs(change), brk, end_arr.size

In [89]:
def id_triple_barrier(intraday_df, thresh, scalar=1.0, agg_freq=DT_BIZ_DAILY_FREQ):
    """
    Return intraday triple barrier labels, one row per `agg_freq` period.

    Args:
        intraday_df (pd.DataFrame): intraday price dataframe with columns 'pba_open' and
            'pba_close'; it is not modified
        thresh (pd.Series or float, ℝ≥0): fractional change that counts as a barrier break
        scalar (float, ℝ≥0): threshold multiplier
        agg_freq (optional): grouping frequency passed to pd.Grouper

    Returns:
        Return pd.DataFrame with four columns:
            - 'dir': price direction
            - 'mag': magnitude of the change
            - 'brk': period of break (zero if none)
            - 'day': number of trading periods
    """
    # Rename into a new frame; the original renamed in place and so altered the caller's dataframe.
    col_renames = {
        "pba_open": "start_ser",
        "pba_close": "end_ser"
    }
    renamed_df = intraday_df.rename(columns=col_renames)

    # Convert threshold to numpy and apply the multiplier. The original accepted `scalar`
    # but never used it; the default of 1.0 leaves existing results unchanged.
    if isinstance(thresh, pd.Series):
        thresh = thresh.values
    thresh = np.array(thresh) * scalar

    # find_touch yields one tuple (or NaN for an empty period) per group; expand to columns.
    labels = renamed_df.groupby(pd.Grouper(freq=agg_freq)).apply(find_touch, thresh)
    labels = labels.apply(pd.Series)
    labels.columns = ['dir', 'mag', 'brk', 'day']
    return labels

In [90]:
labs = id_triple_barrier(eq_df.loc[:, ['pba_open', 'pba_close']].dropna(), thresh=.005)

In [91]:
labs

Unnamed: 0_level_0,dir,mag,brk,day
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1998-01-02 00:00:00+00:00,1.0,0.004709,0.0,8.0
1998-01-05 00:00:00+00:00,1.0,0.007313,3.0,8.0
1998-01-06 00:00:00+00:00,-1.0,0.006806,1.0,8.0
1998-01-07 00:00:00+00:00,-1.0,0.009766,2.0,5.0
1998-01-08 00:00:00+00:00,-1.0,0.005477,2.0,8.0
1998-01-09 00:00:00+00:00,-1.0,0.007751,1.0,8.0
1998-01-12 00:00:00+00:00,1.0,0.005659,4.0,8.0
1998-01-13 00:00:00+00:00,1.0,0.006207,1.0,8.0
1998-01-14 00:00:00+00:00,1.0,0.005693,7.0,8.0
1998-01-15 00:00:00+00:00,-1.0,0.005783,4.0,8.0


### Label EDA

In [16]:
# First 'pba_open' and last 'pba_close' of every business day ('B') in the S&P 500 frame.
daily_f = dfs['sp_500_raw_1'][pba].groupby(pd.Grouper(freq='B'))['pba_open'].first()
daily_l = dfs['sp_500_raw_1'][pba].groupby(pd.Grouper(freq='B'))['pba_close'].last()

In [17]:
daily_f.head()

id
1998-01-01 00:00:00+00:00       NaN
1998-01-02 00:00:00+00:00    970.43
1998-01-05 00:00:00+00:00    975.04
1998-01-06 00:00:00+00:00    977.07
1998-01-07 00:00:00+00:00    966.58
Freq: B, Name: pba_open, dtype: float64

In [18]:
daily_l.head()

id
1998-01-01 00:00:00+00:00       NaN
1998-01-02 00:00:00+00:00    975.00
1998-01-05 00:00:00+00:00    977.07
1998-01-06 00:00:00+00:00    966.58
1998-01-07 00:00:00+00:00    954.78
Freq: B, Name: pba_close, dtype: float64

In [27]:
pct_changes = (daily_l / daily_f) - 1

In [38]:
# Count the daily open-to-close moves whose magnitude is at least move_thresh (1%).
move_thresh = .01
large_moves_pos = pct_changes[pct_changes >= move_thresh]
large_moves_neg = pct_changes[pct_changes <= -move_thresh]
print('all rows:', len(pct_changes))
# dropna() removes the entries of pct_changes that are NaN.
print('all rows, no weekends:', len(pct_changes.dropna()))

print('\ntotal large moves:', len(large_moves_pos) + len(large_moves_neg))
# The selections above are inclusive (>= / <=) although the labels read "greater/less than".
print('large up moves [greater than', str(move_thresh) +']:', len(large_moves_pos))
print('large down moves [less than', str(-move_thresh) +']:', len(large_moves_neg))

all rows: 5217
all rows, no weekends: 5013

total large moves: 1345
large up moves [greater than 0.01]: 678
large down moves [less than -0.01]: 667


In [29]:
fins = dfs['sp_500_raw_1'][pba].groupby(pd.Grouper(freq='Y'))['pba_close'].describe()

In [118]:
# NOTE(review): `jan2` is not defined anywhere in the visible notebook; this cell relies on
# kernel state from a cell that is not shown.
# Current-row price change (absolute, then relative to the close).
# NOTE(review): the relative change divides by 'pba_close', whereas pct_changes earlier
# divides by the open — confirm which base is intended.
jan2['diff'] = jan2['pba_close'] - jan2['pba_open']
jan2['pct_diff'] = jan2['diff'] / jan2['pba_close']

# Prev price diff
jan2['pv_pct_diff'] = jan2['pct_diff'].shift(1)

# Prev price spread
jan2['pv_spread'] = (jan2['pba_high'] - jan2['pba_low']).shift(1)

# Prev IV
jan2['pv_vol_diff'] = (jan2['vol_close'] - jan2['vol_open']).shift(1)

# Prev IV pct_change
jan2['pv_vol_pct_diff'] = jan2['pv_vol_diff'] / jan2['vol_close'].shift(1)

# Prev IV spread
jan2['pv_vol_spread'] = (jan2['vol_high'] - jan2['vol_low']).shift(1)

In [119]:
# Feature columns (previous-row values) and the label column (current-row change).
fcs = ['pv_pct_diff', 'pv_spread', 'pv_vol_diff', 'pv_vol_pct_diff', 'pv_vol_spread']
lcs = ['pct_diff']
# Drop rows in which any feature is missing (e.g. the first row produced by shift(1)).
jan2 = jan2.dropna(axis=0, how='any', subset=fcs)
# NOTE(review): the `from recon.corr import corr_mat` line in the first cell is commented
# out, so corr_mat is undefined on a fresh kernel — re-enable that import before running.
cm = corr_mat(jan2, feat_col_name=fcs, lab_col_name=lcs)
cm

Unnamed: 0,pct_diff
pv_pct_diff,-0.000515
pv_spread,0.008056
pv_vol_diff,0.003881
pv_vol_pct_diff,0.000885
pv_vol_spread,0.010523


In [None]:
frame['pba_numBids'].hist()

In [None]:
interact(dist_probs, col_name=full_list, value=(0.01, .1, .001), showboth=False, suppress_print=False);

In [None]:
import operator

# NOTE(review): sent_list, dist_probs and data are not defined in the visible notebook;
# this cell relies on kernel state from cells that are not shown.
tenth_inner_prob = {} #probability value will land within .1 of the mean
for sent in sent_list:
    tenth_inner_prob[sent] = dist_probs(sent, .1, suppress_print=True)

# Split the (name, probability) pairs, sorted by ascending probability, at the 85% mark.
sorted_tenth_prob = sorted(tenth_inner_prob.items(), key=operator.itemgetter(1))
less_than_85_percent = [pair for pair in sorted_tenth_prob if pair[1] < .85]
greater_than_85_percent = [pair for pair in sorted_tenth_prob if pair[1] >= .85]
print(less_than_85_percent)


def _sent_stats(sent_names):
    """Return one row of mean and variance of data[name] for every name in sent_names."""
    # Build the frame once from records: DataFrame.append was removed in pandas 2.0 and
    # appending row by row inside a loop is quadratic.
    records = [
        {'sent': name, 'mean': data[name].mean(), 'variance': data[name].var()}
        for name in sent_names
    ]
    return pd.DataFrame(records, columns=['sent', 'mean', 'variance'])


lt85 = [tup[0] for tup in less_than_85_percent]
less_85_stats = _sent_stats(lt85)

gt85 = [tup[0] for tup in greater_than_85_percent]
greater_85_stats = _sent_stats(gt85)

less_85_stats

TODO: Compare the sentiment vectors at large movement thresholds ($\pm 6\%$, $\pm 4\%$, $\pm 2\%$, $\pm 1.5\%$) to the sentiment vectors of the other price movements.
Compare statistics such as min/max, mean, median, variance, and average probability (based on a normal distribution).