# Model Debug

In [1]:
import sys
import os
from os import sep
from os.path import dirname, realpath
from pathlib import Path
import logging

def get_cwd(fname, subdir, crunch_dir=realpath(Path.home()) +sep +'crunch' +sep):
    """
    Build the path string of a file that lives inside the crunch directory tree.

    The pieces are concatenated verbatim and no separator is inserted between them,
    so `crunch_dir` and `subdir` must already end with a path separator.

    Args:
        fname: name of the file, e.g. 'model_debug.ipynb'
        subdir: sub-directory of `crunch_dir` that holds the file (with trailing separator)
        crunch_dir: root directory; defaults to '<home>/crunch/' (evaluated once, at definition time)

    Returns:
        crunch_dir + subdir + fname
    """
    containing_dir = crunch_dir +subdir
    return containing_dir +fname
    
def fix_path(cwd):
    """
    Patch interpreter state so the notebook can import and run like a script in crunch.

    Two side effects:
      * sys.argv[0] is overwritten with `cwd`
      * the absolute path of the parent of the current working directory is appended
        to sys.path, unless it is already listed there

    Args:
        cwd: path string to store in sys.argv[0]
    """
    sys.argv[0] = cwd
    parent_dir = os.path.abspath(os.pardir)
    already_importable = parent_dir in sys.path
    if not already_importable:
        sys.path.append(parent_dir)

# Location of this notebook inside the crunch tree; both values must be edited by hand
# when the notebook is copied or moved.
fname = 'model_debug.ipynb'   # FILL
dir_name = 'model'   # FILL
# Point sys.argv[0] at this notebook's path and add the parent of the working directory
# to sys.path before the project-local imports below are executed.
fix_path(get_cwd(fname, dir_name +sep))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from dask import delayed, compute

from ipywidgets import interact, interactive, fixed
from IPython.display import display

# Raise pandas' display limits (rows / columns) for the frames inspected in this notebook.
pd.set_option("display.max_rows", 100)
pd.set_option('display.max_columns', 50)

from common_util import RECON_DIR, JSON_SFX_LEN, DT_CAL_DAILY_FREQ, set_loglevel, chained_filter, get_variants, dump_df, load_json, gb_transpose, reindex_on_time_mask, filter_cols_below, inner_join, outer_join, list_get_dict, benchmark
from model.common import DATASET_DIR, FILTERSET_DIR, EXPECTED_NUM_HOURS, default_dataset, default_filterset, default_nt_filter
from data.data_api import DataAPI
from data.access_util import df_getters as dg, col_subsetters2 as cs2
from recon.dataset_util import prep_dataset, prep_labels, gen_group
from recon.model_util import get_train_test_split, gen_time_series_split
from recon.label_util import shift_label



In [2]:
# Project helper from common_util; with 'info' the logging.info() calls in later cells
# are emitted (see the INFO:root lines in their outputs).
set_loglevel('info')

In [3]:
# Experiment selection: which dataset / filterset JSON files to load and which assets to use.
dataset_name = default_dataset
filterset_name = default_filterset
filter_idxs = ["1"]     # keys into the filterset JSON whose filter lists are applied
assets_str = 'sp_500'   # comma separated asset names
assets = list(map(str.strip, assets_str.split(',')))

dataset_dict = load_json(dataset_name, dir_path=DATASET_DIR)
filter_dict = load_json(filterset_name, dir_path=FILTERSET_DIR)

# Concatenate the filters of every selected index, skipping entries that an
# earlier index already contributed.
filterset = []
for filter_idx in filter_idxs:
    selected = [flt for flt in filter_dict[filter_idx] if (flt not in filterset)]
    filterset.extend(selected)
dataset = prep_dataset(dataset_dict, assets=assets, filters_map={'features': filterset})

# `is None` (identity) rather than `== None`: equality can be overloaded and is not
# the correct test for the None singleton.
logging.info('assets: ' +str('all' if (assets is None) else ', '.join(assets)))
logging.info('dataset: {} {} df(s)'.format(len(dataset['features']['paths']), dataset_name[:-JSON_SFX_LEN]))
logging.info('filter: {} [{}]'.format(filterset_name[:-JSON_SFX_LEN], str(', '.join(filter_idxs))))
logging.debug('filterset: ' +str(filterset))
logging.debug('fpaths: ' +str(dataset['features']['paths']))
logging.debug('lpaths: ' +str(dataset['labels']['paths']))

INFO:root:assets: sp_500
INFO:root:dataset: 1 dnorm_sym df(s)
INFO:root:filter: default_dnorm_sym [1]


In [4]:
def is_alignment_needed(df, ratio_max=.25, expected_num_hours=None):
	"""
	Tell whether df has surplus columns whose first and last column are unevenly populated.

	True only when both conditions hold:
	  * df has more columns than `expected_num_hours`
	  * the non-null counts of the first and the last column differ by more than
	    `ratio_max` times the largest per-column non-null count

	Args:
		df: DataFrame to inspect (one column per intraday slot)
		ratio_max: tolerated first/last count gap, as a fraction of the max column count
		expected_num_hours: number of columns a well-formed frame has; None (default)
			falls back to the project constant EXPECTED_NUM_HOURS, which keeps existing
			callers unchanged

	Returns:
		bool: whether the first/last columns should be aligned
	"""
	if (expected_num_hours is None):
		expected_num_hours = EXPECTED_NUM_HOURS
	count_df = df.count()
	# No surplus column, nothing to align (also avoids indexing an empty count Series)
	if (count_df.size <= expected_num_hours):
		return False
	edge_gap = abs(count_df.iloc[0] - count_df.iloc[-1])
	# bool() so callers always receive a plain Python bool rather than a numpy scalar
	return bool(edge_gap > ratio_max*count_df.max())

def align_first_last(df):
	"""
	Align rows that use two different column layouts and drop the resulting redundant column.

	Two row subsets are considered:
	  * firstnull: first column is null, last column is not
	  * lastnull:  last column is null, first column is not
	The subset whose final index label is smaller (the older format, assuming the index
	is sorted in time) is shifted by one column so that it matches the other subset.

	NOTE: df is modified in place; pass a copy if the original must be preserved.

	Args:
		df: DataFrame with one column per intraday slot

	Returns:
		The result of filter_cols_below applied to the aligned df.
	"""
	cnt_df = df.count()
	first_hr, last_hr = cnt_df.index[0], cnt_df.index[-1]
	first_missing, last_missing = df[first_hr].isnull(), df[last_hr].isnull()
	firstnull_rows = first_missing & ~last_missing
	lastnull_rows = ~first_missing & last_missing
	firstnull, lastnull = df[firstnull_rows], df[lastnull_rows]

	# When only one layout is present there is nothing to align; without this guard
	# .index[-1] on the empty subset raises IndexError.
	if (len(firstnull) > 0 and len(lastnull) > 0):
		# The older format is changed to match the latest one
		if (firstnull.index[-1] > lastnull.index[-1]):		# Change lastnull subset to firstnull layout
			df.loc[lastnull_rows, :] = lastnull.shift(periods=1, axis=1)
		elif (firstnull.index[-1] < lastnull.index[-1]):	# Change firstnull subset to lastnull layout
			df.loc[firstnull_rows, :] = firstnull.shift(periods=-1, axis=1)

	return filter_cols_below(df)


In [5]:
# Column-name filter specs handed to chained_filter() when selecting label columns.
# First spec: names starting with "pba_"; second spec: names ending with "_eod", "_fb" or "_fbeod".
# NOTE(review): the specs appear to be applied one after the other (all label columns in
# the output satisfy both) -- confirm against chained_filter in common_util.
labs_filter = [
{
    "exact": [],
    "startswith": ["pba_"],
    "endswith": [],
    "regex": [],
    "exclude": None
},
{
    "exact": [],
    "startswith": [],
    "endswith": ["_eod", "_fb", "_fbeod"],
    "regex": [],
    "exclude": None
}]

final_dfs = {}
logging.info('executing...')
# Build the lazy feature / label pipelines for the first group yielded by gen_group and
# materialise them into `feats` and `labs` for inspection in the cells below.
for paths, dfs in gen_group(dataset):
    fpaths, lpaths, rpaths = paths
    features, labels, row_masks = dfs
    asset = fpaths[0]
    logging.info('fpaths: ' +str(fpaths))
    logging.info('lpaths: ' +str(lpaths))
    logging.info('rpaths: ' +str(rpaths))

    # Features: reindex on the row mask, transpose the 'pba_avgPrice' column into one
    # column per slot, drop sparse columns, then align the first/last columns if needed.
    reindexed = delayed(reindex_on_time_mask)(features, row_masks)
    transposed = delayed(gb_transpose)(reindexed.loc[:, ['pba_avgPrice']])
    cleaned = delayed(filter_cols_below)(transposed)
    to_align = delayed(is_alignment_needed)(cleaned)
    # Align the computed frame passed in as `df`. The previous version referenced the
    # outer `transposed`, which inside the task is still an uncomputed Delayed object
    # (and is the frame from before filter_cols_below), so the alignment branch failed.
    final_feats = delayed(lambda df, align: df if (not align) else align_first_last(df))(cleaned, to_align)

    final_labs = prep_labels(labels, types=['bool'])
    final_labs = delayed(lambda df: df.loc[:, chained_filter(df.columns, labs_filter)])(final_labs) # EOD, FBEOD, FB

#     sc = delayed(feedforward_test)(final_feats, final_labs)
    feats = final_feats.compute()
    labs = final_labs.compute()
    print('done')
    break   # only the first group is needed for debugging

INFO:root:executing...
INFO:root:fpaths: ['sp_500', 'dzn', 'raw_pba']
INFO:root:lpaths: ['sp_500', 'itb_fth_of_xwhole', 'pba_oc_return']
INFO:root:rpaths: ['sp_500', 'id_rm', 'raw_pba']


done


In [6]:
# Show the feature frame without the rows in which every column is null.
feats.dropna(axis=0, how='all')

index,8,9,10,11,12,13,14,15
1998-01-02,0.764363,-0.919535,-0.324742,0.030738,-0.005801,-1.244423,-0.261876,1.961275
1998-01-05,-0.294921,0.534975,1.138490,1.260201,-0.048909,-1.661387,-0.988127,0.059679
1998-01-06,1.823835,0.871321,0.084899,-0.882164,-0.038934,0.048638,-1.344733,-0.562864
1998-01-07,1.402632,0.546009,-0.192620,-0.599092,-1.156929,,,
1998-01-08,1.356156,-0.439922,1.085739,0.028506,-0.422662,0.602441,-0.471305,-1.738954
1998-01-09,1.339278,0.893651,0.586687,0.160021,0.229886,-0.488798,-1.210417,-1.510309
1998-01-12,-1.876759,-0.888824,0.015159,0.110451,0.616472,-0.051101,0.772059,1.302542
1998-01-13,-0.866760,-0.804940,-0.979722,-0.158095,-0.394555,0.437965,0.912527,1.853580
1998-01-14,-0.046663,-0.556156,-0.985557,-1.049424,-0.346577,0.025488,1.236935,1.721954
1998-01-15,0.953404,0.427092,1.533024,-0.215255,-0.283984,-0.196835,-0.425278,-1.792168


In [9]:
# Show the label frame selected through labs_filter.
labs

Unnamed: 0,pba_oa_return_fth_of_xwhole_eod,pba_oa_return_fth_af_abs_avg_shf_0.5_dir_fbeod,pba_oa_return_fth_af_abs_avg_shf_1_dir_fbeod,pba_oa_return_fth_af_abs_avg_shf_2_dir_fbeod,pba_oa_return_fth_af_abs_std_shf_0.5_dir_fbeod,pba_oa_return_fth_af_abs_std_shf_1_dir_fbeod,pba_oa_return_fth_af_abs_std_shf_2_dir_fbeod,pba_oa_return_fth_af_abs_max_shf_0.5_dir_fbeod,pba_oa_return_fth_af_abs_max_shf_1_dir_fbeod,pba_oa_return_fth_af_abs_max_shf_2_dir_fbeod,pba_oa_return_fth_of_abs_xavg_shf_0.5_dir_fbeod,pba_oa_return_fth_of_abs_xavg_shf_1_dir_fbeod,pba_oa_return_fth_of_abs_xavg_shf_2_dir_fbeod,pba_oa_return_fth_of_abs_xstd_shf_0.5_dir_fbeod,pba_oa_return_fth_of_abs_xstd_shf_1_dir_fbeod,pba_oa_return_fth_of_abs_xstd_shf_2_dir_fbeod,pba_oa_return_fth_of_abs_xmax_shf_0.5_dir_fbeod,pba_oa_return_fth_of_abs_xmax_shf_1_dir_fbeod,pba_oa_return_fth_of_abs_xmax_shf_2_dir_fbeod,pba_oa_return_fth_af_abs_avg_shf_0.5_dir_fb,pba_oa_return_fth_af_abs_avg_shf_1_dir_fb,pba_oa_return_fth_af_abs_avg_shf_2_dir_fb,pba_oa_return_fth_af_abs_std_shf_0.5_dir_fb,pba_oa_return_fth_af_abs_std_shf_1_dir_fb,pba_oa_return_fth_af_abs_std_shf_2_dir_fb,pba_oa_return_fth_af_abs_max_shf_0.5_dir_fb,pba_oa_return_fth_af_abs_max_shf_1_dir_fb,pba_oa_return_fth_af_abs_max_shf_2_dir_fb,pba_oa_return_fth_of_abs_xavg_shf_0.5_dir_fb,pba_oa_return_fth_of_abs_xavg_shf_1_dir_fb,pba_oa_return_fth_of_abs_xavg_shf_2_dir_fb,pba_oa_return_fth_of_abs_xstd_shf_0.5_dir_fb,pba_oa_return_fth_of_abs_xstd_shf_1_dir_fb,pba_oa_return_fth_of_abs_xstd_shf_2_dir_fb,pba_oa_return_fth_of_abs_xmax_shf_0.5_dir_fb,pba_oa_return_fth_of_abs_xmax_shf_1_dir_fb,pba_oa_return_fth_of_abs_xmax_shf_2_dir_fb
1998-01-02,1.0,,,,,,,,,,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,,,,,,,,,,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0
1998-01-03,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1998-01-04,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1998-01-05,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1998-01-06,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0
1998-01-07,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1998-01-08,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1998-01-09,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1998-01-10,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1998-01-11,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [10]:
# Use the first label column as the target series.
lab_name = labs.columns[0]
# shift_label is a project helper (recon.label_util).
# NOTE(review): judging from the joined output below it moves each label onto the
# previous populated row -- confirm against its implementation.
lab_ser = shift_label(labs.loc[:, lab_name])

In [12]:
# Join the non-empty feature rows with the shifted label series (inner_join is a
# project helper from common_util).
inner_join(feats.dropna(axis=0, how='all'), lab_ser)

Unnamed: 0,8,9,10,11,12,13,14,15,pba_oa_return_fth_of_xwhole_eod
1998-01-02,0.764363,-0.919535,-0.324742,0.030738,-0.005801,-1.244423,-0.261876,1.961275,1
1998-01-05,-0.294921,0.534975,1.138490,1.260201,-0.048909,-1.661387,-0.988127,0.059679,-1
1998-01-06,1.823835,0.871321,0.084899,-0.882164,-0.038934,0.048638,-1.344733,-0.562864,-1
1998-01-07,1.402632,0.546009,-0.192620,-0.599092,-1.156929,,,,-1
1998-01-08,1.356156,-0.439922,1.085739,0.028506,-0.422662,0.602441,-0.471305,-1.738954,-1
1998-01-09,1.339278,0.893651,0.586687,0.160021,0.229886,-0.488798,-1.210417,-1.510309,1
1998-01-12,-1.876759,-0.888824,0.015159,0.110451,0.616472,-0.051101,0.772059,1.302542,1
1998-01-13,-0.866760,-0.804940,-0.979722,-0.158095,-0.394555,0.437965,0.912527,1.853580,1
1998-01-14,-0.046663,-0.556156,-0.985557,-1.049424,-0.346577,0.025488,1.236935,1.721954,-1
1998-01-15,0.953404,0.427092,1.533024,-0.215255,-0.283984,-0.196835,-0.425278,-1.792168,1


In [8]:
# Scan every asset for frames with more than 8 columns whose first and last column
# counts differ by more than a quarter of the largest column count; show the counts
# and the frame, and keep the last such frame in `tofill` for the cells below.
# NOTE(review): `results` is not defined in any cell visible in this notebook, so this
# cell depends on leftover kernel state.
for asset in assets:
    transposed = filter_cols_below(results[asset].compute())
    numcols = transposed.columns.size
    if (numcols <= 8):
        continue    # no surplus column
    cnt = transposed.count()
    if (abs(cnt.iloc[0] - cnt.iloc[-1]) <= .25*cnt.max()):
        continue    # first and last columns are populated similarly
    display(cnt)
    tofill = transposed
    display(tofill)

index
8     3165
9     4990
10    4992
11    4996
12    4997
13    4969
14    4946
15    4936
16    1808
dtype: int64

index,8,9,10,11,12,13,14,15,16
1998-01-02,1.767220,-0.621473,-0.801030,-0.762599,-0.378077,-0.561798,0.012850,1.344907,
1998-01-03,,,,,,,,,
1998-01-04,,,,,,,,,
1998-01-05,-0.365924,0.230259,0.942520,1.476313,0.385029,-1.611172,-0.998866,-0.058159,
1998-01-06,1.744715,1.120044,0.375041,-0.793998,-0.489436,-0.149741,-1.038334,-0.768292,
1998-01-07,1.339411,0.650810,-0.202215,-0.626559,-1.161447,,,,
1998-01-08,1.100531,0.090101,0.769356,0.408035,-0.134298,0.439166,-0.549288,-2.123603,
1998-01-09,1.587021,0.971661,0.590207,-0.027603,-0.089460,-0.747691,-1.063173,-1.220963,
1998-01-10,,,,,,,,,
1998-01-11,,,,,,,,,


In [9]:
def align_first_last_columns(df):
	"""
	Align rows that use two different column layouts and drop the resulting redundant column.

	Same logic as align_first_last defined in an earlier cell of this notebook.
	NOTE(review): consider keeping a single definition.

	Two row subsets are considered:
	  * firstnull: first column is null, last column is not
	  * lastnull:  last column is null, first column is not
	The subset whose final index label is smaller (the older format, assuming the index
	is sorted in time) is shifted by one column so that it matches the other subset.

	NOTE: df is modified in place; pass a copy if the original must be preserved.

	Args:
		df: DataFrame with one column per intraday slot

	Returns:
		The result of filter_cols_below applied to the aligned df.
	"""
	cnt_df = df.count()
	first_hr, last_hr = cnt_df.index[0], cnt_df.index[-1]
	first_missing, last_missing = df[first_hr].isnull(), df[last_hr].isnull()
	firstnull_rows = first_missing & ~last_missing
	lastnull_rows = ~first_missing & last_missing
	firstnull, lastnull = df[firstnull_rows], df[lastnull_rows]

	# When only one layout is present there is nothing to align; without this guard
	# .index[-1] on the empty subset raises IndexError.
	if (len(firstnull) > 0 and len(lastnull) > 0):
		# The older format is changed to match the latest one
		if (firstnull.index[-1] > lastnull.index[-1]):		# Change lastnull subset to firstnull layout
			df.loc[lastnull_rows, :] = lastnull.shift(periods=1, axis=1)
		elif (firstnull.index[-1] < lastnull.index[-1]):	# Change firstnull subset to lastnull layout
			df.loc[firstnull_rows, :] = firstnull.shift(periods=-1, axis=1)

	return filter_cols_below(df)

In [11]:
# Work on a copy: the alignment experiments below assign into the frame in place.
tofillcopy = tofill.copy()

In [18]:
# Labels of the first and last column, taken from the count Series `cnt` of the scan cell.
first_hr, last_hr = cnt.index[0], cnt.index[-1]

In [207]:
# Rows with data in the first column but none in the last ...
lastnull = tofillcopy[~tofillcopy[first_hr].isnull() & tofillcopy[last_hr].isnull()]
# ... and rows with data in the last column but none in the first.
firstnull = tofillcopy[tofillcopy[first_hr].isnull() & ~tofillcopy[last_hr].isnull()]

In [210]:
# Manual, inline version of the branch logic in align_first_last_columns: the subset
# whose final index label is smaller is shifted by one column to match the other one.
# Assigns into tofillcopy in place.
if (lastnull.index[-1] < firstnull.index[-1]):
    # lastnull rows are the older layout: shift their values one column to the right
    tofillcopy.loc[~tofillcopy[first_hr].isnull() & tofillcopy[last_hr].isnull(), :] = lastnull.shift(periods=1, axis=1)
elif (lastnull.index[-1] > firstnull.index[-1]):
    # firstnull rows are the older layout: shift their values one column to the left
    tofillcopy.loc[tofillcopy[first_hr].isnull() & ~tofillcopy[last_hr].isnull(), :] = firstnull.shift(periods=-1, axis=1)

In [12]:
# Run the function version on the copy; note that it also assigns into tofillcopy in place.
fx = align_first_last_columns(tofillcopy)

In [14]:
# Per-column non-null counts of the aligned frame.
fx.count()

8

In [103]:
# Experiment with hard-coded column labels 8 (first) and 16 (last).
# The next expression is not the last statement of the cell, so its result is neither
# shown nor stored.
tofillcopy[~tofillcopy[8].isnull() & tofillcopy[16].isnull()]
# Alternative direction (shift the firstnull rows left), kept for reference:
# tofillcopy[tofillcopy[8].isnull() & ~tofillcopy[16].isnull()] = tofillcopy.shift(periods=-1, axis=1)
# Shift rows that have column 8 but not column 16 one column to the right, in place.
tofillcopy[~tofillcopy[8].isnull() & tofillcopy[16].isnull()] = tofillcopy.shift(periods=1,axis=1)
# Per-column non-null counts after the shift.
display(tofillcopy.count())

index
8        2
9     4993
10    5000
11    4997
12    4999
13    4997
14    4953
15    4946
16    4912
dtype: int64

In [105]:
# Rows of the original (unshifted) frame where column 8 is null, excluding rows that
# are null in every column.
tofill[tofill[8].isnull()].dropna(how='all')

index,8,9,10,11,12,13,14,15,16
1998-10-29,,-1.643792,-1.027894,0.084634,0.406074,0.419761,0.466979,1.294239,
1999-04-06,,0.622315,0.867931,0.614056,0.502141,-0.006925,-1.952221,-0.647298,
1999-08-26,,,1.029110,1.000261,0.327112,-0.065712,-0.825014,-1.465757,
2000-08-23,,-2.267000,0.352326,0.350146,0.363710,0.378770,0.397934,0.424114,
2000-12-19,,0.273660,0.448425,0.690596,0.823906,0.614975,-1.176399,-1.675164,
2001-01-02,,2.160707,-0.189538,-0.076502,0.021759,-0.433779,-0.771404,-0.711243,
2001-02-27,,,0.854926,1.008949,0.621285,-0.175221,-0.866425,-1.443514,
2001-06-18,,1.075310,1.145978,0.418093,0.147402,-0.260277,-1.070175,-1.456330,
2001-08-23,,1.248464,0.650121,0.454951,0.399100,-0.328185,-0.661894,-1.762558,
2001-09-10,,,,,0.305202,-0.379414,-1.137659,1.211871,
