In [1]:
import sys
import os
from os import sep
from os.path import dirname, realpath
from pathlib import Path
from itertools import product
import logging

def get_cwd(fname, subdir, crunch_dir=realpath(Path.home()) +sep +'crunch' +sep):
    """
    Build the path string of a file that lives inside the crunch project directory.

    Jupyter Notebook in Anaconda invokes the Python interpreter in Anaconda's subdirectory,
    which is why sys.argv[0] has to be overwritten with the string built here (see fix_path).
    In the future a better way to do this should be preferred.

    Args:
        fname (str): file name, e.g. 'pattern_eda.ipynb'
        subdir (str): sub-directory of crunch_dir that holds the file; may be empty
        crunch_dir (str): project root, defaults to the 'crunch' directory under the user's home

    Returns:
        str: crunch_dir, subdir and fname concatenated into one path string

    A directory component that does not already end in a path separator gets one appended,
    so get_cwd('f.py', 'mutate') and get_cwd('f.py', 'mutate' +sep) give the same result.
    Components that already carry a trailing separator are used untouched.
    """
    # '/' is accepted next to os.sep so forward-slash strings are not doubled up on Windows.
    trailing_seps = (sep, '/')

    def _as_dir(part):
        # Empty components stay empty so no stray separator is introduced.
        if part and not part.endswith(trailing_seps):
            return part +sep
        return part

    return _as_dir(crunch_dir) +_as_dir(subdir) +fname

def fix_path(cwd):
    """
    Make a notebook session look like a regular crunch script to the rest of the code base.

    Overwrites sys.argv[0] with `cwd` and makes sure the absolute path of the parent of the
    process's current working directory is on sys.path. The entry is appended at most once,
    so calling this repeatedly does not grow sys.path.
    """
    sys.argv[0] = cwd
    parent_dir = os.path.abspath(os.pardir)
    if parent_dir in sys.path:
        return
    sys.path.append(parent_dir)

# Point sys.argv[0] at this notebook's location under crunch/mutate/ and put the parent of the
# working directory on sys.path. Kept ahead of the project imports further down in this cell
# (common_util, data, raw, recon) -- presumably those only resolve once the path is patched;
# confirm before reordering.
fix_path(get_cwd('pattern_eda.ipynb', 'mutate' +sep))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numba import jit, vectorize, float64, uint
# from tslearn.preprocessing import TimeSeriesScalerMeanVariance
# from tslearn.piecewise import PiecewiseAggregateApproximation
# from tslearn.piecewise import SymbolicAggregateApproximation, OneD_SymbolicAggregateApproximation
from scipy.stats import zscore
from sortedcontainers import SortedList, SortedSet 

from ipywidgets import interact, interactive, fixed
from IPython.display import display

# Raise the display limits so frames of up to 200 rows / 50 columns render untruncated.
pd.set_option("display.max_rows", 200, 'display.max_columns', 50)

from common_util import RAW_DIR, DT_HOURLY_FREQ, DT_CAL_DAILY_FREQ, DT_BIZ_DAILY_FREQ, get_custom_biz_freq, get_custom_biz_freq_df, query_df, search_df, chained_filter, benchmark
from common_util import MUTATE_DIR, load_df, load_json, outer_join, left_join, count_nn_df, count_nz_df, count_nn_nz_df, pairwise, cust_count, list_get_dict, get_time_mask
from data.data_api import DataAPI
from raw.common import default_row_masksfile
from recon.viz import *
# NOTE(review): __init__ is called on the class itself, with no instance -- presumably DataAPI
# is used as a static registry and this performs its one-time setup; confirm in data.data_api.
DataAPI.__init__()
# Route root-logger output to stdout at DEBUG level.
# NOTE(review): the 'CRITICAL:root:...' lines in this cell's output are in logging's default
# format, which suggests the root logger already had a handler before this line ran. If so,
# basicConfig() is a no-op here and should move ahead of the project imports -- TODO confirm.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

CRITICAL:root:script location: /home/kev/crunch/mutate/pattern_eda.ipynb
CRITICAL:root:using project dir: /home/kev/crunch/


## Load Data

### RAW

In [15]:
# Sub-directories of RAW_DIR holding the raw parquet files.
PRICE_DIR = RAW_DIR +'price/'
VOL_DIR = RAW_DIR +'vol/'

# Price frames, loaded in the order SPX, RUT, NDX, DJI.
SPX, RUT, NDX, DJI = (
    load_df(PRICE_DIR +price_file)
    for price_file in ('SPX.parquet', 'RUT.parquet', 'NDX.parquet', 'DJI.parquet')
)

# Volatility frames, loaded in the order VIX, RVX, VXN, VXD.
# Note the last one: the variable is named DVX while the file on disk is VXD.parquet.
VIX, RVX, VXN, DVX = (
    load_df(VOL_DIR +vol_file)
    for vol_file in ('VIX.parquet', 'RVX.parquet', 'VXN.parquet', 'VXD.parquet')
)

  labels = getattr(columns, 'labels', None) or [
  return pd.MultiIndex(levels=new_levels, labels=labels, names=columns.names)
  labels, = index.labels


In [16]:
# Lookup tables keyed by asset name: `pba` holds the frames loaded from PRICE_DIR and `vol`
# the frames loaded from VOL_DIR, so both can be indexed with the same key.
pba = dict(sp_500=SPX, russell_2000=RUT, dow_jones=DJI, nasdaq_100=NDX)
vol = dict(sp_500=VIX, russell_2000=RVX, dow_jones=DVX, nasdaq_100=VXN)

### ROOT

In [3]:
# Each axe_load call returns a pair, bound here as (<name>_rcs, <name>_dfs); the *_dfs objects
# are indexed with list keys in the VIEW cells below.
# NOTE(review): the list argument looks like a [stage, name] selector and `lazy=False` like a
# request for eager loading (the root call relies on axe_load's default) -- confirm both
# against DataAPI.axe_load.
root_rcs, root_dfs = DataAPI.axe_load(['root', 'root_split_ohlca'])
hrm_rcs, hrm_dfs = DataAPI.axe_load(['hrm', 'hrm'], lazy=False)

## VIEW

In [66]:
# Asset key (shared by `pba` and `vol`) and calendar day inspected by the cells below.
ass = 'dow_jones'
day = '2010-04-24'
# (start, end) labels bounding that day; used as .loc slice endpoints.
dayl = tuple(bound.format(day) for bound in ('{} 00:00', '{} 23:00'))
raw_pba, raw_vol = pba[ass], vol[ass]

In [67]:
# Raw price rows of the selected asset between the two `dayl` labels, after dropna().
raw_pba.loc[slice(*dayl)].dropna()

Unnamed: 0_level_0,pba_gmtOffset,pba_open,pba_high,pba_low,pba_close,pba_volume,pba_avgPrice,pba_numTrades,pba_openBid,pba_highBid,pba_lowBid,pba_closeBid,pba_numBids,pba_openAsk,pba_highAsk,pba_lowAsk,pba_closeAsk,pba_numAsks
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2010-04-24 18:00,-4,11145.33,11146.01,11144.87,11144.95,63066612.0,11145.456,10.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0
2010-04-24 19:00,-4,11145.03,11146.84,11141.93,11145.1,63793084.0,11143.9832,53.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0


In [68]:
# Same day window, taken from the 'root_split_ohlca' entry of root_dfs for the selected asset.
(
    root_dfs[[ass, 'root', 'root_split_ohlca', 'join', 'pba']]
    .loc[slice(*dayl)]
    .dropna()
)

Unnamed: 0_level_0,pba_open,pba_high,pba_low,pba_close,pba_avgPrice
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-04-24 18:00:00+00:00,11145.33,11146.01,11144.87,11144.95,11145.456
2010-04-24 19:00:00+00:00,11145.03,11146.84,11141.93,11145.1,11143.9832


In [69]:
# Same day window, taken from the 'hrm' entry of hrm_dfs for the selected asset.
(
    hrm_dfs[[ass, 'hrm', 'hrm', 'pba', 'pba']]
    .loc[slice(*dayl)]
    .dropna()
)

Unnamed: 0_level_0,times
id,Unnamed: 1_level_1
2010-04-24 18:00:00+00:00,2010-04-24 14:00:00+00:00
2010-04-24 19:00:00+00:00,2010-04-24 15:00:00+00:00
