# Model Debug

In [1]:
import sys
import os
from os import sep
from os.path import dirname, realpath
from pathlib import Path
from itertools import product
import logging

def get_cwd(fname, subdir, crunch_dir=realpath(Path.home()) + sep + 'crunch' + sep):
    """
    Build the path string of a file that lives somewhere under the crunch directory.

    The pieces are joined by plain string concatenation, no separator is inserted
    between them, so crunch_dir and subdir are expected to already end with a path
    separator (the default crunch_dir does).

    fname:      file name to append last
    subdir:     subdirectory of crunch_dir that holds the file
    crunch_dir: root directory string; the default is '<home>/crunch/' and is
                evaluated once, when the function is defined

    Returns crunch_dir + subdir + fname.
    """
    base_dir = crunch_dir + subdir
    return base_dir + fname
    
def fix_path(cwd):
    """
    Make a jupyter notebook session look like a regular crunch script run.

    Overwrites sys.argv[0] with cwd, then makes sure the absolute path of the parent
    of the current working directory is on sys.path (it is appended only if it is
    not already there).

    cwd: string to store in sys.argv[0]
    """
    sys.argv[0] = cwd
    parent_dir = os.path.abspath(os.pardir)
    if parent_dir in sys.path:
        # Already importable from the parent directory; avoid duplicate entries.
        return
    sys.path.append(parent_dir)

# Identity of this notebook inside the crunch tree; edit both values when copying the notebook.
fname = 'model_debug_2.ipynb'   # FILL: file name of this notebook
dir_name = 'model'   # FILL: crunch subdirectory that contains this notebook
# Store the notebook's would-be path in sys.argv[0] and put the parent directory on sys.path
# (presumably so the project imports further down resolve -- confirm).
fix_path(get_cwd(fname, dir_name +sep))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from dask import delayed, compute

from ipywidgets import interact, interactive, fixed
from IPython.display import display

# Raise the pandas display limits so longer/wider frames are shown in cell output.
pd.set_option("display.max_rows", 100)
pd.set_option('display.max_columns', 50)

from common_util import RECON_DIR, JSON_SFX_LEN, DT_CAL_DAILY_FREQ, set_loglevel, str_to_list, get_cmd_args, in_debug_mode, pd_common_index_rows, load_json, filter_cols_below, inner_join, outer_join, list_get_dict, benchmark
from model.common import DATASET_DIR, FILTERSET_DIR, default_dataset, default_opt_filter, default_target_idx
from model.model_util import prepare_transpose_data, prepare_masked_labels
from recon.dataset_util import prep_dataset, prep_labels, gen_group
from recon.split_util import get_train_test_split, gen_time_series_split
from recon.label_util import shift_label



In [2]:
# Use the 'info' log level for this session (set_loglevel is imported from common_util;
# presumably it configures the root logger -- confirm).
set_loglevel('info')

In [3]:
# Emulate the options the script version of this notebook would receive.
# Every option starts out as None, meaning "not supplied"; the defaults below then apply.
cmd_arg_list = ['dataset=', 'filterset=', 'idxfilters=', 'assets=', 'target_idx=', 'strategy=', 'visualize']
cmd_input = dict.fromkeys(cmd_arg_list)
cmd_input['idxfilters='] = '1,2,3,4'   # debug override of the filter indices

# Resolve each option, falling back to the project defaults when it was not supplied.
dataset_name = cmd_input['dataset='] if cmd_input['dataset='] is not None else default_dataset
filterset_name = cmd_input['filterset='] if cmd_input['filterset='] is not None else '_'.join(['default', dataset_name])
filter_idxs = str_to_list(cmd_input['idxfilters=']) if cmd_input['idxfilters='] is not None else default_opt_filter
assets = str_to_list(cmd_input['assets=']) if cmd_input['assets='] is not None else None
target_idx = str_to_list(cmd_input['target_idx='], cast_to=int) if cmd_input['target_idx='] is not None else default_target_idx
# Computation runs unless the 'visualize' flag was supplied.
run_compute = cmd_input['visualize'] is None

dataset_dict = load_json(dataset_name, dir_path=DATASET_DIR)
filter_dict = load_json(filterset_name, dir_path=FILTERSET_DIR)

# Concatenate the selected filter groups in order, skipping entries that an
# earlier group already contributed.
filterset = []
for filter_idx in filter_idxs:
    selected = [flt for flt in filter_dict[filter_idx] if flt not in filterset]
    filterset.extend(selected)
dataset = prep_dataset(dataset_dict, assets=assets, filters_map={'features': filterset})

# Lazy %-style arguments: the emitted text is unchanged, but the strings are only
# built when the corresponding log level is enabled.
logging.info('assets: %s', 'all' if assets is None else ', '.join(assets))
logging.info('dataset: %s %s df(s)', len(dataset['features']['paths']), dataset_name[:-JSON_SFX_LEN])
logging.info('filter: %s [%s]', filterset_name[:-JSON_SFX_LEN], ', '.join(filter_idxs))
logging.debug('filterset: %s', filterset)
logging.debug('fpaths: %s', dataset['features']['paths'])
logging.debug('lpaths: %s', dataset['labels']['paths'])
logging.debug('rmpaths: %s', dataset['row_masks']['paths'])

INFO:root:assets: all
INFO:root:dataset: 16 dnorm_raw df(s)
INFO:root:filter: default_dnorm_raw [1, 2, 3, 4]


In [4]:
# Column filter specifications. Each entry carries the same five keys
# (exact / startswith / endswith / regex / exclude).
# NOTE(review): feats_filter is not referenced by the other visible cells -- confirm it is still needed.
feats_filter = [
    dict(exact=["pba_avgPrice"], startswith=[], endswith=[], regex=[], exclude=None),
]

labs_filter = [ # EOD, FBEOD, FB
    dict(exact=[], startswith=["pba_"], endswith=[], regex=[], exclude=None),
    dict(
        exact=[],
        startswith=[],
        endswith=["_eod(0%)", "_eod(1%)", "_eod(2%)", "_fb", "_fbeod"],
        regex=[],
        exclude=None,
    ),
]

# Grid of positional column indices to pair up; key order matters because the
# values are later expanded positionally as (feat_idx, label_idx).
dataset_grid = dict(
    feat_idx=list(range(5)),
    label_idx=list(range(3)),
)

# Disabled search-space sketches below (hp and losses are not imported in the visible cells).
# dataset_space = {
#     'feat': hp.choice('feat', ['pba_avgPrice']),
#     'label_idx': hp.choice('label_idx', ['0', '1', '2'])
# }

# lstm_space = {
#     'units1': hp.choice('units1', [64, 512]),
#     'units2': hp.choice('units2', [64, 512]),
#     'units3': hp.choice('units3', [64, 512]),
#     'lr': hp.choice('lr',[0.01, 0.001, 0.0001]),
#     'activation': hp.choice('activation', ['relu', 'sigmoid', 'tanh', 'linear']),
#     'loss': hp.choice('loss', [losses.logcosh, losses.mse, losses.mae, losses.mape])
# }

# space = {**dataset_space, **lstm_space}

In [5]:
def pd_binary_clip(pd_obj, thresh=0):
    """
    Split a pandas object into two clipped copies around a threshold.

    pd_obj: pandas object that supports clip (e.g. a Series or DataFrame)
    thresh: clipping threshold, 0 by default

    Returns a 2-tuple:
      - pd_obj with every value below thresh raised to thresh
      - the absolute value of pd_obj with every value above thresh lowered to thresh
    """
    return pd_obj.clip(lower=thresh), pd_obj.clip(upper=thresh).abs()

In [23]:
# Debug run: build the feature/label pairing for the FIRST dataset group and the
# FIRST (feat_idx, label_idx) grid point only -- both loops exit through `break`.
# feat_lab is only defined when run_compute is true and both iterables yield at least one item.
if (run_compute):
    logging.info('executing...')
    for paths, dfs in gen_group(dataset):
        # Each group yields parallel triples: (feature, label, row-mask) paths and their data.
        fpaths, lpaths, rpaths = paths
        features, labels, row_masks = dfs
        asset = fpaths[0]  # first path component; not used further in this cell
        logging.info('fpaths: ' +str(fpaths))
        logging.info('lpaths: ' +str(lpaths))
        logging.info('rpaths: ' +str(rpaths))

        masked_labels = prepare_masked_labels(labels, ['bool'], labs_filter)
        # product() over the grid values yields (feat_idx, label_idx) pairs in dict order.
        for feat_idx, label_idx in product(*dataset_grid.values()):
            # Select one feature column by position (kept two-dimensional by the list index),
            # then drop rows that are entirely NaN.
            final_feature = prepare_transpose_data(features.iloc[:, [feat_idx]], row_masks).dropna(axis=0, how='all')
            # shift_label is wrapped in dask `delayed`, so this call and the chained dropna()
            # are recorded lazily rather than executed here.
            final_label = delayed(shift_label)(masked_labels.iloc[:, label_idx]).dropna()
            # Lazy pairing of feature and label; evaluated later via feat_lab.compute().
            feat_lab = delayed(pd_common_index_rows)(final_feature, final_label)
            break  # debug: first grid point only
        break  # debug: first dataset group only

INFO:root:executing...
INFO:root:fpaths: ['dow_jones', 'dzn', 'raw_pba']
INFO:root:lpaths: ['dow_jones', 'itb_fth_of_xwhole', 'pba_oc_return']
INFO:root:rpaths: ['dow_jones', 'id_rm', 'raw_pba']


In [24]:
# Execute the delayed graph and unpack its two results
# (presumably feature first, label second, matching the argument order -- confirm).
f, l = tuple(feat_lab.compute())

In [26]:
# Display the label object.
# NOTE(review): the saved output under this cell looks like value counts rather than
# the object itself -- possibly stale; re-run to confirm.
l

 1    2651
-1    2329
 0       3
Name: pba_oc_return_fth_of_xwhole_eod(0%), dtype: int64

In [9]:
# Split the labels around the default threshold of 0 and display the resulting
# (above, abs(below)) tuple.
pd_binary_clip(l)

(1998-01-02    0
 1998-01-05    0
 1998-01-06    0
 1998-01-07    0
 1998-01-08    0
 1998-01-09    0
 1998-01-12    1
 1998-01-13    0
 1998-01-14    0
 1998-01-15    0
 1998-01-16    1
 1998-01-20    0
 1998-01-21    0
 1998-01-22    0
 1998-01-23    0
 1998-01-26    1
 1998-01-27    1
 1998-01-28    0
 1998-01-29    0
 1998-01-30    1
 1998-02-02    0
 1998-02-03    0
 1998-02-04    0
 1998-02-05    0
 1998-02-06    0
 1998-02-09    1
 1998-02-10    0
 1998-02-11    0
 1998-02-12    0
 1998-02-13    0
 1998-02-17    0
 1998-02-18    0
 1998-02-19    0
 1998-02-20    0
 1998-02-23    0
 1998-02-24    1
 1998-02-25    0
 1998-02-26    0
 1998-02-27    0
 1998-03-02    0
 1998-03-03    0
 1998-03-04    0
 1998-03-05    1
 1998-03-06    0
 1998-03-09    0
 1998-03-10    0
 1998-03-11    0
 1998-03-12    0
 1998-03-13    1
 1998-03-16    0
              ..
 2017-10-18    0
 2017-10-19    0
 2017-10-20    0
 2017-10-23    0
 2017-10-24    0
 2017-10-25    0
 2017-10-26    0
 2017-10-27   

In [40]:
# NOTE(review): pd_binary_cleave is not defined or imported in the visible part of this
# notebook (only pd_binary_clip is); on a fresh kernel this raises NameError unless it is
# defined in a cell not shown here -- confirm and restore the definition.
above, below = pd_binary_cleave(l, f)

In [41]:
# Display the second result of the cleave call.
# NOTE(review): the saved output starts with '(' like a tuple repr -- may be stale; re-run to confirm.
below

(1998-01-02    0
 1998-01-05    0
 1998-01-06    1
 1998-01-07    1
 1998-01-08    1
 1998-01-09    0
 1998-01-13    0
 1998-01-14    1
 1998-01-15    0
 1998-01-20    0
 1998-01-21    0
 1998-01-22    0
 1998-01-23    0
 1998-01-28    0
 1998-01-29    0
 1998-02-02    0
 1998-02-03    0
 1998-02-04    0
 1998-02-05    0
 1998-02-06    0
 1998-02-10    0
 1998-02-11    0
 1998-02-12    0
 1998-02-13    0
 1998-02-17    0
 1998-02-18    0
 1998-02-19    0
 1998-02-20    0
 1998-02-23    0
 1998-02-25    0
 1998-02-26    0
 1998-02-27    0
 1998-03-02    0
 1998-03-03    0
 1998-03-04    1
 1998-03-06    0
 1998-03-09    0
 1998-03-10    0
 1998-03-11    0
 1998-03-12    0
 1998-03-16    0
 1998-03-17    0
 1998-03-18    0
 1998-03-20    0
 1998-03-23    0
 1998-03-24    0
 1998-03-25    0
 1998-03-27    0
 1998-03-30    0
 1998-03-31    0
              ..
 2017-10-17    0
 2017-10-18    0
 2017-10-19    0
 2017-10-20    0
 2017-10-23    0
 2017-10-24    0
 2017-10-25    0
 2017-10-26   