# Model Debug

In [1]:
import sys
import os
from os import sep
from os.path import dirname, realpath
from pathlib import Path
import logging

def get_cwd(fname, subdir, crunch_dir=realpath(Path.home()) + sep + 'crunch' + sep):
    """
    Build the path string of a file that lives under the crunch directory.

    The pieces are joined by plain string concatenation (crunch_dir + subdir + fname),
    so crunch_dir and subdir must already end with a path separator.

    The default crunch_dir is '<home directory>/crunch/' and is evaluated once, when
    this function is defined.

    Background (from the original author): Jupyter Notebook under Anaconda invokes the
    Python interpreter from Anaconda's own subdirectory, which is why sys.argv[0] has to
    be overwritten with a path like the one built here. A better mechanism should
    replace this in the future.
    """
    directory = crunch_dir + subdir
    return directory + fname
    
def fix_path(cwd):
    """
    Make a notebook session look like a script run from inside crunch.

    Two things happen:
      * sys.argv[0] is overwritten with `cwd`.
      * The absolute path of the parent of the process working directory ('..') is
        appended to sys.path, unless it is already listed there.
    """
    sys.argv[0] = cwd
    parent_dir = os.path.abspath(os.pardir)
    if parent_dir in sys.path:
        return
    sys.path.append(parent_dir)

fname = 'model_debug.ipynb'   # FILL: file name of this notebook (presumably to be edited per notebook)
dir_name = 'model'   # FILL: subdirectory below the crunch directory that contains this notebook
# Point sys.argv[0] at '<crunch_dir>/model/model_debug.ipynb' and extend sys.path so the project imports below resolve.
fix_path(get_cwd(fname, dir_name +sep))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from dask import delayed, compute

from ipywidgets import interact, interactive, fixed
from IPython.display import display

# Notebook-wide pandas display limits: show at most 100 rows and 50 columns before truncating.
pd.set_option("display.max_rows", 100)
pd.set_option('display.max_columns', 50)

from common_util import RECON_DIR, JSON_SFX_LEN, DT_CAL_DAILY_FREQ, set_loglevel, get_variants, dump_df, load_json, gb_transpose, reindex_on_time_mask, filter_cols_below, inner_join, outer_join, list_get_dict, benchmark
from model.common import DATASET_DIR, FILTERSET_DIR, default_dataset, default_filterset, default_nt_filter
from data.data_api import DataAPI
from data.access_util import df_getters as dg, col_subsetters2 as cs2
from recon.dataset_util import prep_dataset, prep_labels, gen_fl
from recon.model_util import get_train_test_split, gen_time_series_split



In [2]:
# With level 'info' the logging.info(...) messages in the cells below are shown while the
# logging.debug(...) ones are not (set_loglevel comes from common_util — presumably it sets the root logger level).
set_loglevel('info')

In [3]:
dataset_name = default_dataset
filterset_name = default_filterset
filter_idxs = ["1"]
assets_str = 'dow_jones,sp_500,nasdaq_100, russell_2000'
# Strip each entry so a space after a comma does not end up inside an asset name.
assets = list(map(str.strip, assets_str.split(',')))

dataset_dict = load_json(dataset_name, dir_path=DATASET_DIR)
filter_dict = load_json(filterset_name, dir_path=FILTERSET_DIR)

# Merge the selected filter groups into one list, keeping first-seen order and dropping
# repeats -- both across groups and within a single group. List membership is used
# (rather than a set) so that unhashable filter entries are still supported.
filterset = []
for filter_idx in filter_idxs:
    for flt in filter_dict[filter_idx]:
        if flt not in filterset:
            filterset.append(flt)
dataset = prep_dataset(dataset_dict, assets=assets, filters_map={'features': filterset})

# Summary of what was selected; the debug lines only show when the log level is 'debug'.
logging.info('assets: ' + ('all' if (assets is None) else ', '.join(assets)))
logging.info('dataset: {} {} df(s)'.format(len(dataset['features']['paths']), dataset_name[:-JSON_SFX_LEN]))
logging.info('filter: {} [{}]'.format(filterset_name[:-JSON_SFX_LEN], ', '.join(filter_idxs)))
logging.debug('filterset: ' + str(filterset))
logging.debug('fpaths: ' + str(dataset['features']['paths']))
logging.debug('lpaths: ' + str(dataset['labels']['paths']))

INFO:root:assets: dow_jones, sp_500, nasdaq_100, russell_2000
INFO:root:dataset: 4 dnorm_sym df(s)
INFO:root:filter: default_dnorm_sym [1]


In [4]:
# Select the ['raw', 'id_rm'] entries of the getter and column-subsetter tables and hand
# them to the data API, which returns paths, records and (lazy) frames.
ax = ['raw', 'id_rm']
au_dg = list_get_dict(dg, ax)
au_cs = list_get_dict(cs2, ax)
lt_paths, lt_recs, lt_dfs = DataAPI.lazy_load(au_dg, au_cs)

# results is filled by a later cell; times_table maps each asset to its 'id_rm'/'raw_pba' entry.
results = {}
times_table = {asst: list_get_dict(lt_dfs, [asst, 'id_rm', 'raw_pba']) for asst in assets}

In [6]:
logging.info('executing...')
for paths, dfs in gen_fl(dataset):
    fpaths, lpaths = paths
    features, labels = dfs
    # The first path component is the asset name; it is also the key used in times_table.
    asset = fpaths[0]
    print(fpaths, lpaths)

    # Build the per-asset pipeline lazily with dask.delayed: reindex the features on the
    # asset's time mask, keep only the 'pba_avgPrice' column, then transpose by group.
    # Nothing is evaluated here; a later cell calls .compute() on results[asset].
    norm_df = delayed(reindex_on_time_mask)(features, times_table[asset])
    transposed = delayed(gb_transpose)(norm_df.loc[:, ['pba_avgPrice']])
    results[asset] = transposed

INFO:root:executing...


['dow_jones', 'dzn', 'raw_pba'] ['dow_jones', 'itb_fth_of_xwhole', 'pba_oc_return']
['sp_500', 'dzn', 'raw_pba'] ['sp_500', 'itb_fth_of_xwhole', 'pba_oc_return']
['nasdaq_100', 'dzn', 'raw_pba'] ['nasdaq_100', 'itb_fth_of_xwhole', 'pba_oc_return']
['russell_2000', 'dzn', 'raw_pba'] ['russell_2000', 'itb_fth_of_xwhole', 'pba_oc_return']


In [8]:
for asset in assets:
    # Evaluate the lazy pipeline for this asset and pass the frame through filter_cols_below.
    transposed = filter_cols_below(results[asset].compute())
    numcols = transposed.columns.size
    if (numcols <= 8):
        continue

    # Flag frames whose first and last columns differ in non-null count by more than
    # a quarter of the fullest column's count.
    cnt = transposed.count()
    if (abs(cnt.iloc[0] - cnt.iloc[-1]) <= .25*cnt.max()):
        continue

    # Show the counts and keep the frame in `tofill` (the last flagged asset wins).
    display(cnt)
    tofill = transposed
    display(tofill)

index
8     3165
9     4990
10    4992
11    4996
12    4997
13    4969
14    4946
15    4936
16    1808
dtype: int64

index,8,9,10,11,12,13,14,15,16
1998-01-02,1.767220,-0.621473,-0.801030,-0.762599,-0.378077,-0.561798,0.012850,1.344907,
1998-01-03,,,,,,,,,
1998-01-04,,,,,,,,,
1998-01-05,-0.365924,0.230259,0.942520,1.476313,0.385029,-1.611172,-0.998866,-0.058159,
1998-01-06,1.744715,1.120044,0.375041,-0.793998,-0.489436,-0.149741,-1.038334,-0.768292,
1998-01-07,1.339411,0.650810,-0.202215,-0.626559,-1.161447,,,,
1998-01-08,1.100531,0.090101,0.769356,0.408035,-0.134298,0.439166,-0.549288,-2.123603,
1998-01-09,1.587021,0.971661,0.590207,-0.027603,-0.089460,-0.747691,-1.063173,-1.220963,
1998-01-10,,,,,,,,,
1998-01-11,,,,,,,,,


In [9]:
def align_first_last_columns(df):
    """
    Align rows that are missing either the first or the last column onto one layout.

    Two row subsets are considered:
      * "firstnull": rows with no value in the first column but a value in the last;
      * "lastnull":  rows with a value in the first column but none in the last.

    The subset whose final row label is smaller (the older layout, assuming the index
    is sorted ascending) is shifted by one column so that it lines up with the other
    subset. If either subset is empty there is nothing to align and no shift is done.

    The aligned frame is finally passed through filter_cols_below, which is expected
    to drop the column emptied by the shift (TODO confirm against common_util).

    Args:
        df: DataFrame whose columns are ordered from first to last slot.

    Returns:
        The result of filter_cols_below on the aligned copy. `df` itself is not modified.
    """
    # Work on a copy so the caller's frame is left untouched.
    aligned = df.copy()
    if aligned.columns.size == 0:
        return filter_cols_below(aligned)

    first_hr, last_hr = aligned.columns[0], aligned.columns[-1]
    first_missing = aligned[first_hr].isnull()
    last_missing = aligned[last_hr].isnull()
    firstnull_rows = first_missing & ~last_missing
    lastnull_rows = ~first_missing & last_missing

    # Both layouts must be present, otherwise there is no "other" layout to align to.
    if firstnull_rows.any() and lastnull_rows.any():
        newest_firstnull = aligned.index[firstnull_rows.to_numpy()][-1]
        newest_lastnull = aligned.index[lastnull_rows.to_numpy()][-1]

        if newest_firstnull > newest_lastnull:
            # firstnull is the latest layout: move lastnull rows one column to the right.
            aligned.loc[lastnull_rows, :] = aligned.loc[lastnull_rows, :].shift(periods=1, axis=1)
        elif newest_firstnull < newest_lastnull:
            # lastnull is the latest layout: move firstnull rows one column to the left.
            aligned.loc[firstnull_rows, :] = aligned.loc[firstnull_rows, :].shift(periods=-1, axis=1)

    return filter_cols_below(aligned)

In [11]:
# Work on a copy so the alignment experiments in the following cells leave `tofill` untouched.
tofillcopy = tofill.copy()

In [18]:
# Labels of the first and last columns, taken from the per-column count `cnt`
# (the columns look like hours of the day, 8 through 16, in the output above).
first_hr = cnt.index[0]
last_hr = cnt.index[-1]

In [207]:
# lastnull: rows with a value in the first column but none in the last.
lastnull = tofillcopy.loc[tofillcopy[first_hr].notnull() & tofillcopy[last_hr].isnull()]
# firstnull: rows with no value in the first column but one in the last.
firstnull = tofillcopy.loc[tofillcopy[first_hr].isnull() & tofillcopy[last_hr].notnull()]

In [210]:
# Whichever subset ends later is treated as the current layout; the other subset is
# shifted by one column to match it.
if (firstnull.index[-1] > lastnull.index[-1]):
    # firstnull ends later: move the lastnull rows one column to the right.
    tofillcopy.loc[tofillcopy[first_hr].notnull() & tofillcopy[last_hr].isnull(), :] = lastnull.shift(1, axis=1)
elif (firstnull.index[-1] < lastnull.index[-1]):
    # lastnull ends later: move the firstnull rows one column to the left.
    tofillcopy.loc[tofillcopy[first_hr].isnull() & tofillcopy[last_hr].notnull(), :] = firstnull.shift(-1, axis=1)

In [12]:
# Run the alignment helper defined above on the working copy and keep its return value.
fx = align_first_last_columns(tofillcopy)

In [14]:
# Inspect the non-null counts of the aligned result (per column, assuming fx is a DataFrame).
fx.count()

8

In [103]:
# Rows that have a value in column 8 but none in column 16.
lastnull_rows = ~tofillcopy[8].isnull() & tofillcopy[16].isnull()
# Overwrite those rows with their values moved one column to the right
# (the right-hand side is aligned on the row index of the selected rows).
tofillcopy[lastnull_rows] = tofillcopy.shift(periods=1, axis=1)
display(tofillcopy.count())

index
8        2
9     4993
10    5000
11    4997
12    4999
13    4997
14    4953
15    4946
16    4912
dtype: int64

In [105]:
# Rows of the unmodified frame with no value in column 8, leaving out rows that are
# empty in every column (such as the all-NaN dates seen in the earlier output).
tofill[tofill[8].isnull()].dropna(how='all')

index,8,9,10,11,12,13,14,15,16
1998-10-29,,-1.643792,-1.027894,0.084634,0.406074,0.419761,0.466979,1.294239,
1999-04-06,,0.622315,0.867931,0.614056,0.502141,-0.006925,-1.952221,-0.647298,
1999-08-26,,,1.029110,1.000261,0.327112,-0.065712,-0.825014,-1.465757,
2000-08-23,,-2.267000,0.352326,0.350146,0.363710,0.378770,0.397934,0.424114,
2000-12-19,,0.273660,0.448425,0.690596,0.823906,0.614975,-1.176399,-1.675164,
2001-01-02,,2.160707,-0.189538,-0.076502,0.021759,-0.433779,-0.771404,-0.711243,
2001-02-27,,,0.854926,1.008949,0.621285,-0.175221,-0.866425,-1.443514,
2001-06-18,,1.075310,1.145978,0.418093,0.147402,-0.260277,-1.070175,-1.456330,
2001-08-23,,1.248464,0.650121,0.454951,0.399100,-0.328185,-0.661894,-1.762558,
2001-09-10,,,,,0.305202,-0.379414,-1.137659,1.211871,
