In [1]:
import sys
import os
from os import sep
from os.path import dirname, realpath
from pathlib import Path
from itertools import product
import logging

def get_cwd(fname, subdir, crunch_dir=realpath(Path.home()) +sep +'crunch' +sep):
    """
    Convenience function to make a directory string for the current file based on inputs.
    Jupyter Notebook in Anaconda invokes the Python interpreter in Anaconda's subdirectory
    which sis why changing sys.argv[0] is necessary. In the future a better way to do this
    should be preferred..
    """
    return crunch_dir +subdir +fname

def fix_path(cwd):
    """
    Convenience function to fix argv and python path so that jupyter notebook can run the same as
    any script in crunch.
    """
    sys.argv[0] = cwd
    module_path = os.path.abspath(os.path.join('..'))
    if module_path not in sys.path:
        sys.path.append(module_path)

fix_path(get_cwd('pattern_eda.ipynb', 'mutate' +sep))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numba import jit, vectorize, float64, uint
# from tslearn.preprocessing import TimeSeriesScalerMeanVariance
# from tslearn.piecewise import PiecewiseAggregateApproximation
# from tslearn.piecewise import SymbolicAggregateApproximation, OneD_SymbolicAggregateApproximation
from scipy.stats import zscore
from sortedcontainers import SortedList, SortedSet 

from ipywidgets import interact, interactive, fixed
from IPython.display import display

pd.set_option("display.max_rows", 100)
pd.set_option('display.max_columns', 50)

from common_util import DT_HOURLY_FREQ, DT_CAL_DAILY_FREQ, DT_BIZ_DAILY_FREQ, get_custom_biz_freq, get_custom_biz_freq_df, query_df, search_df, chained_filter, benchmark
from common_util import MUTATE_DIR, load_json, outer_join, left_join, count_nn_df, count_nz_df, count_nn_nz_df, pairwise, cust_count, list_get_dict, get_time_mask
from data.data_api import DataAPI
from data.access_util import col_subsetters as cs, df_getters as dg, col_subsetters2 as cs2
from mutate.common import dum
from recon.viz import *

## Load Data

In [2]:
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

date_range = {
    'id': ('lt', 2018)
}

search_terms = {
    'stage': 'raw',
    'raw_cat': 'us_equity_index'
}
price_dfs = {}
for rec, price_df in DataAPI.generate(search_terms):
    price_dfs[rec.root] = price_df.loc[search_df(price_df, date_range)]
logging.info('price data loaded')

INFO:root:price data loaded


In [3]:
col = 'pba_gmtOffset'
sp = price_dfs['sp_500']
per = ["08:00", "16:00"]

In [5]:
mask2 = get_time_mask(sp, offset_col_name=col, offset_tz='US/Eastern', time_range=per)

In [56]:
def indexes_within(dti_ser, lt_start, lt_end, lt_period=None):
    lt_offsets = pd.TimedeltaIndex(dti_ser.iloc[:, 0], 'H')
    lt = dti_ser.index + lt_offsets
    iloc_within = lt.indexer_between_time(lt_start, lt_end)

    return dti_ser.iloc[iloc_within].index

sampled = price_dfs['sp_500'].loc[:, ['pba_gmtOffset', 'pba_close', 'pba_avgPrice']].dropna(axis=0, how='all')
shifted = indexes_within(sampled, "08:00", "16:00")
shifted.hour.value_counts(sort=True)

16    5000
17    4997
14    4996
15    4993
18    4988
19    4950
20    4943
13    3100
21    1864
22      13
Name: id, dtype: int64

In [75]:
time_offset_df = price_dfs['sp_500'].loc[:, ['pba_gmtOffset', 'pba_close', 'pba_avgPrice']].dropna(axis=0, how='all')
lt_offsets = pd.TimedeltaIndex(dti_ser['pba_gmtOffset'], 'H')
print(times)
time_offset_df['local_time'] = times.shift(n=dti_ser['pba_gmtOffset'], freq='H') if ('pba_gmtOffset' in time_offset_df.columns) else times

[datetime.time(14, 0) datetime.time(15, 0) datetime.time(16, 0) ...
 datetime.time(19, 0) datetime.time(20, 0) datetime.time(21, 0)]


AttributeError: 'numpy.ndarray' object has no attribute 'shift'

In [82]:
lt_offsets.value_counts()

-1 days +19:00:00    24779
-1 days +18:00:00    15071
dtype: int64

In [80]:
time_offset_df.index

DatetimeIndex(['1998-01-02 14:00:00+00:00', '1998-01-02 15:00:00+00:00',
               '1998-01-02 16:00:00+00:00', '1998-01-02 17:00:00+00:00',
               '1998-01-02 18:00:00+00:00', '1998-01-02 19:00:00+00:00',
               '1998-01-02 20:00:00+00:00', '1998-01-02 21:00:00+00:00',
               '1998-01-05 14:00:00+00:00', '1998-01-05 15:00:00+00:00',
               ...
               '2017-12-28 20:00:00+00:00', '2017-12-28 21:00:00+00:00',
               '2017-12-29 14:00:00+00:00', '2017-12-29 15:00:00+00:00',
               '2017-12-29 16:00:00+00:00', '2017-12-29 17:00:00+00:00',
               '2017-12-29 18:00:00+00:00', '2017-12-29 19:00:00+00:00',
               '2017-12-29 20:00:00+00:00', '2017-12-29 21:00:00+00:00'],
              dtype='datetime64[ns, UTC]', name='id', length=39850, freq=None)

In [77]:
time_offset_df.index+lt_offsets

DatetimeIndex(['1998-01-02 08:00:00+00:00', '1998-01-02 09:00:00+00:00',
               '1998-01-02 10:00:00+00:00', '1998-01-02 11:00:00+00:00',
               '1998-01-02 12:00:00+00:00', '1998-01-02 13:00:00+00:00',
               '1998-01-02 14:00:00+00:00', '1998-01-02 15:00:00+00:00',
               '1998-01-05 08:00:00+00:00', '1998-01-05 09:00:00+00:00',
               ...
               '2017-12-28 14:00:00+00:00', '2017-12-28 15:00:00+00:00',
               '2017-12-29 08:00:00+00:00', '2017-12-29 09:00:00+00:00',
               '2017-12-29 10:00:00+00:00', '2017-12-29 11:00:00+00:00',
               '2017-12-29 12:00:00+00:00', '2017-12-29 13:00:00+00:00',
               '2017-12-29 14:00:00+00:00', '2017-12-29 15:00:00+00:00'],
              dtype='datetime64[ns, UTC]', length=39850, freq=None)

In [76]:
lt_offsets

TimedeltaIndex(['-1 days +18:00:00', '-1 days +18:00:00', '-1 days +18:00:00',
                '-1 days +18:00:00', '-1 days +18:00:00', '-1 days +18:00:00',
                '-1 days +18:00:00', '-1 days +18:00:00', '-1 days +18:00:00',
                '-1 days +18:00:00',
                ...
                '-1 days +18:00:00', '-1 days +18:00:00', '-1 days +18:00:00',
                '-1 days +18:00:00', '-1 days +18:00:00', '-1 days +18:00:00',
                '-1 days +18:00:00', '-1 days +18:00:00', '-1 days +18:00:00',
                '-1 days +18:00:00'],
               dtype='timedelta64[ns]', length=39850, freq=None)

In [67]:
time_offset_df[time_offset_df['pba_gmtOffset']==-5.0]['local_time'].value_counts()

11.0    3102
12.0    3102
9.0     3100
8.0     3100
10.0    3096
13.0    3092
14.0    3089
15.0    3085
16.0       9
17.0       3
7.0        1
Name: local_time, dtype: int64

In [66]:
time_offset_df[time_offset_df['pba_gmtOffset']==-6.0]['local_time'].value_counts()

10.0    1898
9.0     1897
12.0    1896
8.0     1896
11.0    1895
13.0    1861
14.0    1858
15.0    1855
16.0      13
17.0       1
3.0        1
Name: local_time, dtype: int64

In [64]:
time_offset_df[['pba_gmtOffset', 'local_time']].apply(pd.Series.value_counts)

Unnamed: 0,pba_gmtOffset,local_time
-6.0,15071.0,
-5.0,24779.0,
3.0,,1.0
7.0,,1.0
8.0,,4996.0
9.0,,4997.0
10.0,,4994.0
11.0,,4997.0
12.0,,4998.0
13.0,,4953.0


In [54]:
time_offset_df[time_offset_df['local_time']==16]

Unnamed: 0_level_0,pba_gmtOffset,pba_close,pba_avgPrice,local_time
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1998-02-03 22:00:00+00:00,-6.0,1005.99,1005.9967,16.0
1998-02-06 22:00:00+00:00,-6.0,1012.46,1012.452,16.0
2001-06-29 21:00:00+00:00,-5.0,1224.38,1224.362,16.0
2002-07-05 21:00:00+00:00,-5.0,972.14,966.871,16.0
2003-03-21 22:00:00+00:00,-6.0,895.9,895.8967,16.0
2007-01-03 22:00:00+00:00,-6.0,1416.63,1416.6142,16.0
2007-05-07 21:00:00+00:00,-5.0,1509.48,1509.4802,16.0
2008-10-08 21:00:00+00:00,-5.0,984.94,988.3926,16.0
2008-10-09 21:00:00+00:00,-5.0,909.92,912.6596,16.0
2008-10-10 21:00:00+00:00,-5.0,899.22,901.0482,16.0


## row masks

In [15]:
all_raw = ['raw', 'all']
raw_dg, raw_cs = list_get_dict(dg, all_raw), list_get_dict(cs2, all_raw)
raw_paths, raw_recs, raw_dfs = DataAPI.load_from_dg(raw_dg, raw_cs, subset=['raw_pba', 'raw_vol'])

In [2]:
raw_id = ['raw', 'id_rm']
rm_dg, rm_cs = list_get_dict(dg, raw_id), list_get_dict(cs2, raw_id)
rm_paths, rm_recs, rm_dfs = DataAPI.load_from_dg(rm_dg, rm_cs, subset=['raw_pba', 'raw_vol'])

In [26]:
for key_chain in rm_paths:
    print(key_chain)
    rm_df = list_get_dict(rm_dfs, key_chain)
    print(rm_df.dropna().info())

['dow_jones', 'id_rm', 'raw_pba']
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 39857 entries, 1998-01-02 14:00:00+00:00 to 2018-01-08 21:00:00+00:00
Data columns (total 1 columns):
times    39857 non-null datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1)
memory usage: 622.8 KB
None
['sp_500', 'id_rm', 'raw_pba']
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 39884 entries, 1998-01-02 14:00:00+00:00 to 2018-01-08 21:00:00+00:00
Data columns (total 1 columns):
times    39884 non-null datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1)
memory usage: 623.2 KB
None
['nasdaq_100', 'id_rm', 'raw_pba']
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 39417 entries, 1998-01-02 14:00:00+00:00 to 2018-01-08 21:00:00+00:00
Data columns (total 1 columns):
times    39417 non-null datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1)
memory usage: 615.9 KB
None
['russell_2000', 'id_rm', 'raw_pba']
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 39840 entries, 1998-01-02 14:00:00+0

In [27]:
for key_chain in raw_paths:
    print(key_chain)
    raw_df = list_get_dict(raw_dfs, key_chain)
    gmt_col = 'pba_gmtOffset' if (key_chain[-1] == 'raw_pba') else 'vol_gmtOffset'
    print(raw_df[[gmt_col]].dropna().info())

['dow_jones', 'all', 'raw_pba']
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 39863 entries, 1998-01-02 14:00:00+00:00 to 2018-01-08 21:00:00+00:00
Data columns (total 1 columns):
pba_gmtOffset    39863 non-null float64
dtypes: float64(1)
memory usage: 622.9 KB
None
['sp_500', 'all', 'raw_pba']
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 39890 entries, 1998-01-02 14:00:00+00:00 to 2018-01-08 21:00:00+00:00
Data columns (total 1 columns):
pba_gmtOffset    39890 non-null float64
dtypes: float64(1)
memory usage: 623.3 KB
None
['nasdaq_100', 'all', 'raw_pba']
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 39464 entries, 1998-01-02 14:00:00+00:00 to 2018-01-08 21:00:00+00:00
Data columns (total 1 columns):
pba_gmtOffset    39464 non-null float64
dtypes: float64(1)
memory usage: 616.6 KB
None
['russell_2000', 'all', 'raw_pba']
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 39851 entries, 1998-01-02 14:00:00+00:00 to 2018-01-08 21:00:00+00:00
Data columns (total 1 

In [82]:
sp_500_raw_vol = list_get_dict(raw_dfs, ['sp_500', 'all', 'raw_vol'])
tr = ["08:00", "16:00"]
res = get_time_mask(sp_500_raw_vol, offset_col_name='vol_gmtOffset', offset_tz='US/Eastern', time_range=tr)

In [83]:
res.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 38479 entries, 1998-01-02 15:00:00+00:00 to 2018-01-08 21:00:00+00:00
Data columns (total 1 columns):
times    38479 non-null datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1)
memory usage: 601.2 KB


In [71]:
res

Unnamed: 0_level_0,times
id,Unnamed: 1_level_1
1998-01-02 15:00:00+00:00,1998-01-02 09:00:00+00:00
1998-01-02 16:00:00+00:00,1998-01-02 10:00:00+00:00
1998-01-02 17:00:00+00:00,1998-01-02 11:00:00+00:00
1998-01-02 18:00:00+00:00,1998-01-02 12:00:00+00:00
1998-01-02 19:00:00+00:00,1998-01-02 13:00:00+00:00
1998-01-02 20:00:00+00:00,1998-01-02 14:00:00+00:00
1998-01-05 14:00:00+00:00,1998-01-05 08:00:00+00:00
1998-01-05 16:00:00+00:00,1998-01-05 10:00:00+00:00
1998-01-05 17:00:00+00:00,1998-01-05 11:00:00+00:00
1998-01-05 18:00:00+00:00,1998-01-05 12:00:00+00:00


# Sandbox

## Function Defs

In [16]:
thresh_df = thresh_dfs[assets[1]].loc[:, thresh_cols]
pba_df = price_dfs[assets[1]].loc[:, pba_cols]
vol_df = price_dfs[assets[1]].loc[:, vol_cols]
pba_vol_df = price_dfs[assets[1]].loc[:, pba_vol_cols]

In [17]:
pba_vol_cust, pba_vol_count_df = cust_count(pba_vol_df)
pba_vol_count_df.apply(pd.Series.value_counts)

Unnamed: 0,pba_open,pba_high,pba_low,pba_close,pba_avgPrice,vol_open,vol_high,vol_low,vol_close,vol_avgPrice
0,204.0,204.0,204.0,204.0,204.0,205,205,205,205,205
1,1.0,1.0,1.0,1.0,1.0,2,2,2,2,2
2,7.0,7.0,7.0,7.0,7.0,7,7,7,7,7
3,5.0,5.0,5.0,5.0,5.0,7,7,7,7,7
4,5.0,5.0,5.0,5.0,5.0,6,6,6,6,6
5,46.0,46.0,46.0,46.0,46.0,44,44,44,44,44
6,15.0,15.0,15.0,15.0,15.0,33,33,33,33,33
7,18.0,18.0,18.0,18.0,18.0,1353,1353,1353,1353,1353
8,4890.0,4890.0,4890.0,4890.0,4890.0,3106,3106,3106,3106,3106
9,26.0,26.0,26.0,26.0,26.0,22,22,22,22,22


In [18]:
pba_tdf = thresh_df['pba_oc_spread_fth_of_xact']
pba_tdf_cust, pba_tdf_count_df = cust_count(pba_tdf)
pba_tdf_count_df.value_counts()

8    4890
0     203
5      46
9      26
7      18
6      15
2       7
3       5
4       5
1       1
Name: pba_oc_spread_fth_of_xact, dtype: int64

In [19]:
vol_tdf = thresh_df['vol_oc_spread_fth_of_xact']
vol_tdf_cust, vol_tdf_count_df = cust_count(vol_tdf)
vol_tdf_count_df.value_counts()

8     3106
7     1353
14     414
0      204
5       44
6       33
9       22
13      14
3        7
2        7
4        6
11       4
1        2
Name: vol_oc_spread_fth_of_xact, dtype: int64

In [30]:
cust, dzn_price_count_df = cust_count(price_vol_cols_df)
price_vol_cols_df.groupby(pd.Grouper(freq=cust)).transform(zscore)

Unnamed: 0_level_0,pba_open,pba_high,pba_low,pba_close,pba_avgPrice,vol_open,vol_high,vol_low,vol_close,vol_avgPrice
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1998-01-01 01:00:00+00:00,,,,,,,,,,
1998-01-01 02:00:00+00:00,,,,,,,,,,
1998-01-01 03:00:00+00:00,,,,,,,,,,
1998-01-01 04:00:00+00:00,,,,,,,,,,
1998-01-01 05:00:00+00:00,,,,,,,,,,
1998-01-01 06:00:00+00:00,,,,,,,,,,
1998-01-01 07:00:00+00:00,,,,,,,,,,
1998-01-01 08:00:00+00:00,,,,,,,,,,
1998-01-01 09:00:00+00:00,,,,,,,,,,
1998-01-01 10:00:00+00:00,,,,,,,,,,


In [19]:
cust, dzn_price_count_df = cust_count(dzn_price_vol_cols_df)

In [29]:
ex_df = dzn_price_count_df[['pba_avgPrice']]
cust = get_custom_biz_freq(ex_df)
count_df = ex_df.groupby(pd.Grouper(freq=cust)).count()
count_df.index = count_df.index.strftime("%Y-%m-%d")

TypeError: Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex, but got an instance of 'Index'

In [26]:
dzn_price_count_df['pba_avgPrice'].value_counts()

8    3536
7    1354
5      49
6      31
9      20
3       7
2       7
4       6
Name: pba_avgPrice, dtype: int64

In [27]:
dzn_price_count_df['vol_avgPrice'].value_counts()

8    3536
7    1354
5      49
6      31
9      20
3       7
2       7
4       6
Name: vol_avgPrice, dtype: int64