In [1]:
import sys
import os
from os import sep
from os.path import dirname, realpath
from pathlib import Path
from itertools import product
import logging

def get_cwd(fname, subdir, crunch_dir=realpath(Path.home()) +sep +'crunch' +sep):
    """
    Build the path string of a file inside the crunch project tree.

    The three pieces are concatenated verbatim, in the order
    crunch_dir, subdir, fname; no separator is inserted between them, so
    `crunch_dir` and `subdir` must already end with a path separator.

    Args:
        fname: file name, e.g. the notebook's own file name.
        subdir: sub-directory of the project, including its trailing separator.
        crunch_dir: project root, including its trailing separator. Defaults to
            the 'crunch' directory directly under the user's home directory
            (evaluated once, when the function is defined).

    Returns:
        The concatenated path as a str.
    """
    pieces = (crunch_dir, subdir, fname)
    return ''.join(pieces)

def fix_path(cwd):
    """
    Make the notebook's interpreter state look like that of a script run from crunch.

    Two pieces of global state are modified:
      * sys.argv[0] is overwritten with `cwd`;
      * the absolute path of the parent of the current working directory ('..')
        is appended to sys.path, unless it is already listed there.

    Args:
        cwd: path string to store in sys.argv[0].
    """
    sys.argv[0] = cwd
    parent_dir = os.path.abspath('..')
    if parent_dir in sys.path:
        # Already importable from the parent directory; avoid duplicate entries.
        return
    sys.path.append(parent_dir)

# Set sys.argv[0] to this notebook's path under <crunch>/mutate/ and append the
# parent of the working directory to sys.path.
# NOTE(review): presumably this has to run before the project-local imports
# further down (common_util, data, raw, recon) so they can be resolved - confirm.
fix_path(get_cwd('pattern_eda.ipynb', 'mutate' +sep))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numba import jit, vectorize, float64, uint
# from tslearn.preprocessing import TimeSeriesScalerMeanVariance
# from tslearn.piecewise import PiecewiseAggregateApproximation
# from tslearn.piecewise import SymbolicAggregateApproximation, OneD_SymbolicAggregateApproximation
from scipy.stats import zscore
from sortedcontainers import SortedList, SortedSet 

from ipywidgets import interact, interactive, fixed
from IPython.display import display

# Raise pandas' display limits to 100 rows and 50 columns for the frames
# displayed in this notebook.
pd.set_option("display.max_rows", 100)
pd.set_option('display.max_columns', 50)

from common_util import RAW_DIR, DT_HOURLY_FREQ, DT_CAL_DAILY_FREQ, DT_BIZ_DAILY_FREQ, get_custom_biz_freq, get_custom_biz_freq_df, query_df, search_df, chained_filter, benchmark
from common_util import MUTATE_DIR, load_json, outer_join, left_join, count_nn_df, count_nz_df, count_nn_nz_df, pairwise, cust_count, list_get_dict, get_time_mask
from data.data_api import DataAPI
from raw.common import default_row_masksfile
from recon.viz import *
# NOTE(review): __init__ is called on the DataAPI class itself, without an
# instance; presumably DataAPI is used as a static/class-level API that needs a
# one-time setup - confirm against data.data_api.
DataAPI.__init__()
# Ask for root-logger output on stdout at DEBUG level.
# NOTE(review): logging.basicConfig does nothing when the root logger already
# has handlers. The "CRITICAL:root:..." lines in this cell's output use the
# default basicConfig format, which suggests the root logger was implicitly
# configured by an earlier logging call (during the imports or
# DataAPI.__init__). If so, this call has no effect - verify that DEBUG
# messages actually appear.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

CRITICAL:root:script location: /home/kev/crunch/mutate/pattern_eda.ipynb
CRITICAL:root:using project dir: /home/kev/crunch/


## Load Data

In [2]:
# Eagerly (lazy=False) load everything stored under the
# ['root', 'root_split_gmtoffset'] keys; the call returns a pair.
# NOTE(review): presumably raw_rcs holds the record metadata and raw_dfs the
# dataframes addressed by keychain - confirm against DataAPI.axe_load.
raw_rcs, raw_dfs = DataAPI.axe_load(['root', 'root_split_gmtoffset'], lazy=False)

  labels = getattr(columns, 'labels', None) or [
  return pd.MultiIndex(levels=new_levels, labels=labels, names=columns.names)
  labels, = index.labels


In [62]:
# Show every keychain available in raw_dfs; each key is a list of the form
# [asset, 'root', 'root_split_gmtoffset', 'join', data_subset].
list(raw_dfs.keys())

[['dow_jones', 'root', 'root_split_gmtoffset', 'join', 'pba'],
 ['dow_jones', 'root', 'root_split_gmtoffset', 'join', 'trmi2'],
 ['dow_jones', 'root', 'root_split_gmtoffset', 'join', 'trmi3'],
 ['dow_jones', 'root', 'root_split_gmtoffset', 'join', 'vol'],
 ['nasdaq_100', 'root', 'root_split_gmtoffset', 'join', 'pba'],
 ['nasdaq_100', 'root', 'root_split_gmtoffset', 'join', 'trmi2'],
 ['nasdaq_100', 'root', 'root_split_gmtoffset', 'join', 'trmi3'],
 ['nasdaq_100', 'root', 'root_split_gmtoffset', 'join', 'vol'],
 ['russell_2000', 'root', 'root_split_gmtoffset', 'join', 'pba'],
 ['russell_2000', 'root', 'root_split_gmtoffset', 'join', 'trmi2'],
 ['russell_2000', 'root', 'root_split_gmtoffset', 'join', 'trmi3'],
 ['russell_2000', 'root', 'root_split_gmtoffset', 'join', 'vol'],
 ['sp_500', 'root', 'root_split_gmtoffset', 'join', 'pba'],
 ['sp_500', 'root', 'root_split_gmtoffset', 'join', 'trmi2'],
 ['sp_500', 'root', 'root_split_gmtoffset', 'join', 'trmi3'],
 ['sp_500', 'root', 'root_split_

In [63]:
# Eagerly (lazy=False) load everything stored under the ['hrm', 'hrm'] keys.
# NOTE(review): presumably these are the hourly row-mask frames produced from
# the raw data loaded above - confirm against the stage that dumps them.
hrm_rcs, hrm_dfs = DataAPI.axe_load(['hrm', 'hrm'], lazy=False)

  labels = getattr(columns, 'labels', None) or [
  return pd.MultiIndex(levels=new_levels, labels=labels, names=columns.names)
  labels, = index.labels


In [4]:
# Read the default row-masks JSON file from RAW_DIR. The result is indexed
# below as row_masks[asset_name][data_subset][mask_name].
row_masksfile = default_row_masksfile
row_masks = load_json(row_masksfile, dir_path=RAW_DIR)

In [28]:
# Filter specification handed to search_df in the value-count loop of this notebook.
# NOTE(review): 'id' matches the name of the frames' index, and the tuple looks
# like (interval kind, start year, end year) - confirm the meaning of 'ine'
# against search_df.
date_range = dict(id=('ine', 2009, 2018))

In [91]:
# Pick a single asset / data subset to investigate.
asset_name = 'russell_2000'
data_subset = 'trmi3'
rvol_df = raw_dfs[[asset_name, 'root', 'root_split_gmtoffset', 'join', data_subset]]
#print(rvol_df.dropna().iloc[:, 0].value_counts())
mask = row_masks[asset_name][data_subset]['intraday_hourly']

# Recompute the intraday-hourly time mask from a BACK-filled copy of the raw
# frame. The first column is used as the GMT offset column.
# DataFrame.bfill() replaces fillna(method='bfill'), which is deprecated in
# recent pandas; the result is identical.
res = get_time_mask(
    rvol_df.bfill(axis=0),
    offset_col_name=rvol_df.columns[0],
    offset_tz=mask['target_tz'],
    time_range=mask['time_range'],
)

# Previously generated mask for the same asset / subset, without all-null rows,
# for comparison against `res`.
hrm_df = hrm_dfs[[asset_name, 'hrm', 'hrm', data_subset, data_subset]].dropna(how='all')

In [96]:
# Display the raw frame with nulls back-filled along the rows.
# DataFrame.bfill() is the non-deprecated spelling of fillna(method='bfill').
rvol_df.bfill(axis=0)

Unnamed: 0_level_0,pba_gmtOffset,mkt3_ver,etf3_ver
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1998-01-01 01:00:00+00:00,-6.0,3.0.9,3.0.9
1998-01-01 02:00:00+00:00,-6.0,3.0.9,3.0.9
1998-01-01 03:00:00+00:00,-6.0,3.0.9,3.0.9
1998-01-01 04:00:00+00:00,-6.0,3.0.9,3.0.9
1998-01-01 05:00:00+00:00,-6.0,3.0.9,3.0.9
1998-01-01 06:00:00+00:00,-6.0,3.0.9,3.0.9
1998-01-01 07:00:00+00:00,-6.0,3.0.9,3.0.9
1998-01-01 08:00:00+00:00,-6.0,3.0.9,3.0.9
1998-01-01 09:00:00+00:00,-6.0,3.0.9,3.0.9
1998-01-01 10:00:00+00:00,-6.0,3.0.9,3.0.9


In [86]:
# One date per year, 2009-2017, on which the generated masks are inspected below.
# NOTE(review): these look like the Mondays right after US daylight saving time
# ends each November - confirm.
strange_times = [
    '2009-11-02',
    '2010-11-08',
    '2011-11-07',
    '2012-11-05',
    '2013-11-04',
    '2014-11-03',
    '2015-11-02',
    '2016-11-07',
    '2017-11-06',
]

In [95]:
# Back-filled copy of the raw frame (note: despite the name `ff`, this is a
# BACK fill). DataFrame.bfill() replaces the deprecated fillna(method='bfill').
ff = rvol_df.bfill()

# For every date of interest print that day's rows from the raw and from the
# back-filled frame. Rows are selected with .loc: selecting rows of a
# DatetimeIndex via df['YYYY-MM-DD'] is deprecated and raises KeyError in
# pandas >= 2.0, while .loc partial-string indexing works in old and new versions.
for t in strange_times:
    print(rvol_df.loc[t], ff.loc[t])

                           pba_gmtOffset mkt3_ver etf3_ver
id                                                        
2009-11-02 00:00:00+00:00            NaN    3.0.9    3.0.9
2009-11-02 01:00:00+00:00            NaN    3.0.9    3.0.9
2009-11-02 02:00:00+00:00            NaN    3.0.9    3.0.9
2009-11-02 03:00:00+00:00            NaN    3.0.9    3.0.9
2009-11-02 04:00:00+00:00            NaN    3.0.9    3.0.9
2009-11-02 05:00:00+00:00            NaN    3.0.9    3.0.9
2009-11-02 06:00:00+00:00            NaN    3.0.9    3.0.9
2009-11-02 07:00:00+00:00            NaN    3.0.9    3.0.9
2009-11-02 08:00:00+00:00            NaN    3.0.9    3.0.9
2009-11-02 09:00:00+00:00            NaN    3.0.9    3.0.9
2009-11-02 10:00:00+00:00            NaN    3.0.9    3.0.9
2009-11-02 11:00:00+00:00            NaN    3.0.9    3.0.9
2009-11-02 12:00:00+00:00            NaN    3.0.9    3.0.9
2009-11-02 13:00:00+00:00            NaN    3.0.9    3.0.9
2009-11-02 14:00:00+00:00           -6.0    3.0.9    3.0

In [97]:
# For every date of interest print that day's rows of the freshly computed mask
# (`res`) next to the previously generated one (`hrm_df`). Rows are selected
# with .loc: df['YYYY-MM-DD'] row selection is deprecated and raises KeyError in
# pandas >= 2.0, while .loc partial-string indexing works in old and new versions.
for t in strange_times:
    print(res.loc[t], hrm_df.loc[t])

                                              times
id                                                 
2009-11-02 14:00:00+00:00 2009-11-02 08:00:00+00:00
2009-11-02 15:00:00+00:00 2009-11-02 09:00:00+00:00
2009-11-02 16:00:00+00:00 2009-11-02 10:00:00+00:00
2009-11-02 17:00:00+00:00 2009-11-02 11:00:00+00:00
2009-11-02 18:00:00+00:00 2009-11-02 12:00:00+00:00
2009-11-02 19:00:00+00:00 2009-11-02 13:00:00+00:00
2009-11-02 20:00:00+00:00 2009-11-02 14:00:00+00:00
2009-11-02 21:00:00+00:00 2009-11-02 15:00:00+00:00
2009-11-02 22:00:00+00:00 2009-11-02 16:00:00+00:00                                               times
id                                                 
2009-11-02 13:00:00+00:00 2009-11-02 08:00:00+00:00
2009-11-02 14:00:00+00:00 2009-11-02 08:00:00+00:00
2009-11-02 15:00:00+00:00 2009-11-02 09:00:00+00:00
2009-11-02 16:00:00+00:00 2009-11-02 10:00:00+00:00
2009-11-02 17:00:00+00:00 2009-11-02 11:00:00+00:00
2009-11-02 18:00:00+00:00 2009-11-02 12:00:00+00:00
2009-11-02 1

In [35]:
# For every raw frame, print how often each value of its first column (the GMT
# offset column) occurs among the fully non-null rows selected by `date_range`.
for keychain, raw_df in raw_dfs.items():
    print(keychain)
    print(raw_df.loc[search_df(raw_df, date_range)].dropna().iloc[:, 0].value_counts())
    # Inspection only: skip the mask generation below. This line used to read
    # `continue2`, an undefined name that raises NameError on the first
    # iteration; the recorded multi-keychain output of this cell is only
    # possible with a plain `continue`. Remove it to run the mask generation.
    continue
    asset_name, data_subset = keychain[0], keychain[-1]
    gmt_col = raw_df.columns[0]
    #assert(gmt_col.endswith(GMT_OFFSET_COL_SFX))
    if (raw_df.shape[1] > 1):
        # XXX - Forward filling nulls may lead to error if the time_range includes the times when Daylight savings is switched on/off
        # DataFrame.ffill() replaces the deprecated fillna(method='ffill').
        raw_df = raw_df.ffill(axis=0)
        logging.info('Found more than one column, ffilled null values')
        logging.debug(raw_df)

    for mask_type, mask in row_masks[asset_name][data_subset].items():
        logging.info('mask name: {}'.format(mask_type))
        # Only consumed by the commented-out DataAPI.dump call below.
        mask_freq = DT_HOURLY_FREQ if (mask['type'].startswith('h')) else None
        mask_df = get_time_mask(raw_df, offset_col_name=gmt_col, offset_tz=mask['target_tz'], time_range=mask['time_range'])
        #logging.debug(mask_df)
        #DataAPI.dump(make_entry('raw', mask['type'], data_subset, mask_freq, base_rec=raw_rec), mask_df)
        logging.info('dumped {} {}...'.format(mask['type'], data_subset))

['dow_jones', 'root', 'root_split_gmtoffset', 'join', 'pba']
-4.0    11922
-5.0     6143
Name: pba_gmtOffset, dtype: int64
['dow_jones', 'root', 'root_split_gmtoffset', 'join', 'trmi2']
-4.0    11922
-5.0     6143
Name: pba_gmtOffset, dtype: int64
['dow_jones', 'root', 'root_split_gmtoffset', 'join', 'trmi3']
-4.0    11922
-5.0     6143
Name: pba_gmtOffset, dtype: int64
['dow_jones', 'root', 'root_split_gmtoffset', 'join', 'vol']
-5.0    11762
-6.0     6090
Name: vol_gmtOffset, dtype: int64
['nasdaq_100', 'root', 'root_split_gmtoffset', 'join', 'pba']
-4.0    11825
-5.0     6100
Name: pba_gmtOffset, dtype: int64
['nasdaq_100', 'root', 'root_split_gmtoffset', 'join', 'trmi2']
-4.0    11825
-5.0     6100
Name: pba_gmtOffset, dtype: int64
['nasdaq_100', 'root', 'root_split_gmtoffset', 'join', 'trmi3']
-4.0    11825
-5.0     6100
Name: pba_gmtOffset, dtype: int64
['nasdaq_100', 'root', 'root_split_gmtoffset', 'join', 'vol']
-5.0    11919
-6.0     6095
Name: vol_gmtOffset, dtype: int64
['ru