# nb-model

In [105]:
import sys
import os
from os import sep
from os.path import dirname, realpath
from pathlib import Path
from functools import partial, reduce
import logging

def get_cwd(fname, subdir, crunch_dir=realpath(Path.home()) +sep +'crunch' +sep):
    """
    Build the path string of a file that lives under the crunch directory.

    The three pieces are glued together by plain string concatenation, so
    `crunch_dir` and `subdir` are expected to already end with a path separator.

    Args:
        fname (str): File name placed at the end of the path
        subdir (str): Subdirectory below crunch_dir, including its trailing separator
        crunch_dir (str): Root directory; the default is '<home>/crunch/' and is
            computed once, when this function is defined

    Returns:
        str: crunch_dir, subdir and fname concatenated in that order
    """
    parent_dir = crunch_dir +subdir
    return parent_dir +fname

def fix_path(cwd):
    """
    Adjust interpreter state so the notebook can run like a script in crunch.

    Overwrites sys.argv[0] with `cwd` and makes sure the absolute path of the
    parent of the current working directory is on sys.path (appended at most once).

    Args:
        cwd (str): Value stored in sys.argv[0]
    """
    sys.argv[0] = cwd
    parent_dir = os.path.abspath('..')
    if parent_dir in sys.path:
        # Already importable from the parent directory; nothing to add.
        return
    sys.path.append(parent_dir)

# Per-notebook settings: edit the two FILL values when copying this cell to another notebook.
fname = 'nb-model_xg=test-prep-transpose.ipynb'   # FILL: file name of this notebook
dir_name = 'model'             # FILL: subdirectory of crunch that holds this notebook
# Point sys.argv[0] at '<crunch_dir>/<dir_name>/<fname>' and put the parent directory on sys.path.
fix_path(get_cwd(fname, dir_name +sep))

import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
from dask import delayed, compute
from torch.utils.data import TensorDataset, DataLoader
import torch

from ipywidgets import interact, interactive, fixed
from IPython.display import display

# Raise pandas' display limits to 100 rows and 50 columns for rendered DataFrames.
pd.set_option("display.max_rows", 100)
pd.set_option('display.max_columns', 50)

from common_util import RECON_DIR, JSON_SFX_LEN, DT_CAL_DAILY_FREQ, is_type, pd_common_idx_rows, remove_dups_list, set_loglevel, search_df, chained_filter, get_variants, dump_df, load_json, gb_transpose, pd_common_index_rows, filter_cols_below, inner_join, outer_join, ser_shift, list_get_dict, window_iter, benchmark
from common_util import midx_get_level, pd_rows, midx_intersect, pd_common_idx_rows, midx_split, pd_midx_to_arr, window_iter, np_is_ndim, identity_fn
from model.common import DATASET_DIR, XG_PROCESS_DIR, XG_DATA_DIR, XG_DIR, EXPECTED_NUM_HOURS, default_dataset
from model.dataprep_util import align_first_last_cols, prune_nulls
from model.datagen_util import xgdg, process_group
from data.data_api import DataAPI
from recon.dataset_util import prep_dataset, gen_group
from model.dataprep_util import COMMON_PREP_MAPPING, DATA_PREP_MAPPING, prep_transpose_data, align_first_last_cols
#from recon.split_util import get_train_test_split, gen_time_series_split, index_three_split, pd_binary_clip
# NOTE(review): __init__ is invoked on the class itself with no instance — presumably
# DataAPI performs class-level setup here; confirm against data.data_api.
DataAPI.__init__()
# Route root-logger output to stdout at DEBUG level so it appears in the notebook cell output.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

## Load

In [10]:
# Each axe_load call is unpacked into a pair (*_rcs, *_dfs); only the *_dfs halves are used below.
# NOTE(review): the variables are named 'dohlca' but the key passed is 'hohlca' — confirm the naming is intended.
dohlca_rcs, dohlca_dfs = DataAPI.axe_load(['hohlca', 'hohlca_pba_vol'])
hrm_rcs, hrm_dfs = DataAPI.axe_load(['hrm', 'hrm'])

In [158]:
# Asset used as the first key component in the selection cell and checked by the poi cell below.
asset_name = 'sp_500'

In [159]:
# Pick one entry from each loaded collection using a five-part key that starts with the asset name.
# d is passed to prep_transpose_data as feature_df and r as row_masks_df.
# NOTE(review): presumably d is the intraday feature frame and r its time/row mask — confirm.
d = dohlca_dfs[[asset_name, 'hohlca', 'hohlca_pba_vol', 'pba_hohlca', 'pba_hohlca']]
r = hrm_dfs[[asset_name, 'hrm', 'hrm', 'pba', 'pba']]

## Test FN

In [160]:
from common_util import is_type, compose, dcompose, pd_idx_rename, pd_idx_to_midx, pd_dti_idx_date_only, filter_cols_below, reindex_on_time_mask, df_downsample_transpose, pd_single_ser, ser_shift, pd_common_idx_rows, df_midx_restack
from model.common import EXPECTED_NUM_HOURS

In [161]:
def prep_transpose_data(feature_df, row_masks_df, delayed=False):
    """
    Run an intraday DataFrame through the transpose preprocessing pipeline.

    Turns a single indexed intraday DataFrame into a MultiIndexed daily DataFrame
    by composing the steps listed below and applying the composition to the inputs.

    Args:
        feature_df (pd.DataFrame): Intraday DataFrame
        row_masks_df (pd.DataFrame): DataFrame of row masks / time mask
        delayed (boolean): If True the steps are combined with dcompose
            (delayed function composition), otherwise with compose

    Returns:
        pd.DataFrame or dask Delayed object
    """
    # Step descriptions are carried over from the original author's notes.
    steps = (
        reindex_on_time_mask,       # Converts the UTC time index to local time
        df_downsample_transpose,    # Performs the groupby downsample to daily frequency and intraday transpose
        filter_cols_below,          # Filters out columns with 90% or less of their data missing (relative to the most populated column)
        align_first_last_cols,      # Removes an extra column due to misalignment if it exists
        # Disabled step: partial(prune_nulls, limit=6) -- removes or fills any last null data
        pd_dti_idx_date_only,       # Removes the time component of the DatetimeIndex index
        df_midx_restack,            # Restacks to fix https://github.com/pandas-dev/pandas/issues/2770
    )
    if delayed:
        pipeline = dcompose(*steps)
    else:
        pipeline = compose(*steps)
    return pipeline(feature_df, row_masks_df)

In [162]:
# Apply the pipeline defined in this notebook (not the delayed variant) to the selected frames.
res = prep_transpose_data(d, r)

In [163]:
# Points of interest: dates to inspect for the selected asset.
# poi ends up as either an empty list or a list holding ONE list of date strings,
# so the display loop below looks up all of an asset's dates in a single .loc call.
poi = [
    dates
    for known_asset, dates in (
        ('russell_2000', ['2011-11-08', '2012-10-09', '2012-11-01', '2014-07-03', '2015-09-02', '2017-04-21']),
        ('nasdaq_100', ['2009-06-01', '2011-01-28', '2012-11-01']),
        ('dow_jones', ['2010-04-24']),
    )
    if known_asset == asset_name
]

In [164]:
# Each element of poi is a list of date strings; show the rows of res selected by that list.
for point in poi:
    display(res.loc[point])

Unnamed: 0_level_0,Unnamed: 1_level_0,9,10,11,12,13,14,15,16
id0,id1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2010-04-24,pba_open,,,,,,11145.33,11145.03,
2010-04-24,pba_high,,,,,,11146.01,11146.84,
2010-04-24,pba_low,,,,,,11144.87,11141.93,
2010-04-24,pba_close,,,,,,11144.95,11145.1,
2010-04-24,pba_avgPrice,,,,,,11145.456,11143.9832,


In [168]:
# Show every row of res whose first column is null.
display(res.loc[res.iloc[:, 0].isnull()])

Unnamed: 0_level_0,Unnamed: 1_level_0,9,10,11,12,13,14,15,16
id0,id1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2010-04-24,pba_open,,,,,,11145.33,11145.03,
2010-04-24,pba_high,,,,,,11146.01,11146.84,
2010-04-24,pba_low,,,,,,11144.87,11141.93,
2010-04-24,pba_close,,,,,,11144.95,11145.1,
2010-04-24,pba_avgPrice,,,,,,11145.456,11143.9832,
2016-03-28,pba_open,,,,,17562.71,17561.86,17529.67,17535.63
2016-03-28,pba_high,,,,,17568.74,17583.81,17562.51,17535.94
2016-03-28,pba_low,,,,,17545.52,17519.98,17518.03,17535.25
2016-03-28,pba_close,,,,,17561.89,17529.47,17534.91,17535.39
2016-03-28,pba_avgPrice,,,,,17556.4437,17555.091,17542.9374,17535.4041


In [166]:
# Bare expression: the notebook renders the full result frame as the cell output.
res

Unnamed: 0_level_0,Unnamed: 1_level_0,9,10,11,12,13,14,15,16
id0,id1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2009-01-02,pba_open,8771.7700,8783.2400,8905.9800,8918.4100,8931.4700,8968.9000,9001.8000,9026.8900
2009-01-02,pba_high,8817.6500,8907.9700,8934.6500,8951.5400,8981.5700,9011.9900,9065.2800,9034.6900
2009-01-02,pba_low,8761.1000,8768.9900,8889.8900,8913.8700,8925.9700,8948.9100,9001.8000,9024.8200
2009-01-02,pba_close,8783.0800,8905.8200,8918.1700,8931.0700,8968.6600,9001.4800,9023.4600,9034.6900
2009-01-02,pba_avgPrice,8795.6230,8839.0401,8912.6615,8930.6138,8956.4848,8982.1202,9030.1114,9029.4574
...,...,...,...,...,...,...,...,...,...
2017-12-29,pba_open,24849.6300,24838.6700,24806.3700,24826.0100,24815.3500,24798.3800,24805.3100,24744.4500
2017-12-29,pba_high,24871.6600,24838.6700,24833.0700,24828.5800,24817.7000,24808.6700,24822.2200,24749.9900
2017-12-29,pba_low,24829.0200,24793.5500,24791.7400,24810.1700,24789.7600,24795.1900,24735.6000,24719.2200
2017-12-29,pba_close,24837.7600,24806.6300,24826.0900,24815.4700,24798.3900,24805.5100,24743.7300,24719.2200
