# Model Debug

In [1]:
import sys
import os
from os import sep
from os.path import dirname, realpath
from pathlib import Path
from functools import partial
from itertools import product
import logging

def get_cwd(fname, subdir, crunch_dir=realpath(Path.home()) +sep +'crunch' +sep):
    """
    Build the path string of a file located under the crunch directory.

    The notebook passes the result to fix_path, which stores it in sys.argv[0].

    Args:
        fname (str): file name, e.g. the name of the notebook itself
        subdir (str): directory below crunch_dir that holds the file; expected to be
            relative, a trailing separator is optional
        crunch_dir (str): root directory; the default is '<home>/crunch/' where <home> is
            resolved once, at the time this function is defined

    Returns:
        str: crunch_dir, subdir and fname combined into a single path
    """
    # os.path.join only inserts a separator where one is missing, so arguments that already
    # end in a separator give the same result as plain concatenation, while arguments
    # without one no longer produce a mangled path such as '.../crunchmodelfile'.
    return os.path.join(crunch_dir, subdir, fname)
    
def fix_path(cwd):
    """
    Adjust interpreter state so the notebook can run like any script in crunch.

    Overwrites sys.argv[0] with `cwd` and makes sure the absolute path of the parent of
    the current working directory is on sys.path, appending it only when it is missing.
    """
    sys.argv[0] = cwd
    parent_dir = os.path.abspath(os.pardir)
    if parent_dir in sys.path:
        # Already importable from there; avoid adding a duplicate entry.
        return
    sys.path.append(parent_dir)

# Identify this notebook inside the crunch tree; update both values when copying the notebook.
fname = 'model_debug_2.ipynb'   # FILL: file name of this notebook
dir_name = 'model'              # FILL: sub directory of the crunch dir that holds this notebook
# fix_path sets sys.argv[0] and adds the parent directory to sys.path -- presumably this is
# what lets the project imports further down in this cell resolve, so keep it ahead of them.
fix_path(get_cwd(fname, dir_name +sep))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from dask import delayed, compute

from ipywidgets import interact, interactive, fixed
from IPython.display import display

# Cap DataFrame display at 100 rows and 50 columns for every output in this notebook.
pd.set_option("display.max_rows", 100)
pd.set_option('display.max_columns', 50)

from common_util import RECON_DIR, JSON_SFX_LEN, DT_CAL_DAILY_FREQ, set_loglevel, str_to_list, ser_shift, get_cmd_args, in_debug_mode, pd_common_index_rows, load_json, filter_cols_below, inner_join, outer_join, list_get_dict, benchmark
from common_util import remove_dups_list, ser_shift, list_get_dict, list_set_dict, compose, pd_dti_index_to_date
from model.common import DATASET_DIR, FILTERSET_DIR, default_dataset, default_opt_filter, default_target_idx
from model.model_util import datagen, prepare_transpose_data, prepare_label_data
from recon.dataset_util import prep_dataset, gen_group
from recon.split_util import get_train_test_split, gen_time_series_split, pd_binary_clip
from data.access_util import df_getters as dg, col_subsetters2 as cs2
from data.data_api import DataAPI



In [2]:
# Configure logging through the project's set_loglevel helper (imported from common_util).
set_loglevel('info')

In [3]:
# Emulate command line options inside the notebook: every option starts out as None
# (i.e. "not supplied") and individual entries are then overridden by hand.
cmd_arg_list = ['dataset=', 'filterset=', 'idxfilters=', 'assets=', 'target_idx=', 'strategy=', 'visualize']
cmd_input = dict.fromkeys(cmd_arg_list)
cmd_input['idxfilters='] = '1,2,3,4'

# Any option left as None falls back to the project default (or to None for assets).
dataset_name = cmd_input['dataset='] if (cmd_input['dataset='] is not None) else default_dataset
assets = str_to_list(cmd_input['assets=']) if (cmd_input['assets='] is not None) else None
target_idx = str_to_list(cmd_input['target_idx='], cast_to=int) if (cmd_input['target_idx='] is not None) else default_target_idx
# The comparison already yields a bool; the compute cell runs unless 'visualize' was supplied.
run_compute = cmd_input['visualize'] is None

# Load the dataset description from DATASET_DIR and prepare it for the selected assets.
dataset_dict = load_json(dataset_name, dir_path=DATASET_DIR)
dataset = prep_dataset(dataset_dict, assets=assets, filters_map=None)

# Identity check against None (rather than ==) so the test cannot be affected by a custom __eq__.
logging.info('assets: ' +('all' if (assets is None) else ', '.join(assets)))
# logging.info('dataset: {} {} df(s)'.format(len(dataset['features']['paths']), dataset_name[:-JSON_SFX_LEN]))
logging.debug('fpaths: {}'.format(str(dataset['features'])))
logging.debug('lpaths: {}'.format((dataset['labels'])))
logging.debug('rmpaths: {}'.format((dataset['row_masks'])))

INFO:root:assets: all


In [4]:
# Print the key of every label entry; in the recorded output each key is a list of path components.
# Only the keys are shown, the values (v) are not used here.
for k, v in dataset['labels']['dfs'].items():
    print(k)

['dow_jones', 'direod', 'raw_pba_oa_retxeod_direod']
['dow_jones', 'direod', 'raw_pba_oa_retxeod_direod', 'raw_pba_oa_retxeod(0.25%)_direod']
['dow_jones', 'direod', 'raw_pba_oa_retxeod_direod', 'raw_pba_oa_retxeod(0.50%)_direod']
['dow_jones', 'direod', 'raw_pba_oa_retxeod_direod', 'raw_pba_oa_retxeod(1.00%)_direod']
['dow_jones', 'direod', 'raw_pba_oa_retxeod_direod', 'raw_pba_oa_retxeod(1.50%)_direod']
['dow_jones', 'direod', 'raw_pba_oa_retxeod_direod', 'raw_pba_oa_retxeod(2.00%)_direod']
['dow_jones', 'direod', 'raw_pba_oc_retxeod_direod']
['dow_jones', 'direod', 'raw_pba_oc_retxeod_direod', 'raw_pba_oc_retxeod(0.25%)_direod']
['dow_jones', 'direod', 'raw_pba_oc_retxeod_direod', 'raw_pba_oc_retxeod(0.50%)_direod']
['dow_jones', 'direod', 'raw_pba_oc_retxeod_direod', 'raw_pba_oc_retxeod(1.00%)_direod']
['dow_jones', 'direod', 'raw_pba_oc_retxeod_direod', 'raw_pba_oc_retxeod(1.50%)_direod']
['dow_jones', 'direod', 'raw_pba_oc_retxeod_direod', 'raw_pba_oc_retxeod(2.00%)_direod']
['na

In [5]:
# Select a single label entry by its full key path: the four-component key ending in the
# '(2.00%)' entry of 'raw_vol_oa_retxeod_direod'.
kc = ['dow_jones', 'direod', 'raw_vol_oa_retxeod_direod', 'raw_vol_oa_retxeod(2.00%)_direod']
d = dataset['labels']['dfs'][kc]

In [6]:
# Echo the key path currently bound to kc.
kc

['dow_jones',
 'direod',
 'raw_vol_oa_retxeod_direod',
 'raw_vol_oa_retxeod(2.00%)_direod']

In [7]:
# Evaluate d via .compute() and display the result with rows containing NaN removed.
d.compute().dropna()

Unnamed: 0_level_0,vol_oa
id,Unnamed: 1_level_1
2005-03-18 00:00:00+00:00,-1.0
2005-03-21 00:00:00+00:00,1.0
2005-03-22 00:00:00+00:00,1.0
2005-03-23 00:00:00+00:00,0.0
2005-03-24 00:00:00+00:00,-1.0
2005-03-28 00:00:00+00:00,-1.0
2005-03-29 00:00:00+00:00,1.0
2005-03-30 00:00:00+00:00,-1.0
2005-03-31 00:00:00+00:00,0.0
2005-04-01 00:00:00+00:00,1.0


In [8]:
# Rebind kc and d to the three-component key 'raw_vol_oa_retxeod_direod' (no percentage suffix).
# NOTE: this overwrites the kc / d values set by the earlier selection cell.
kc = ['dow_jones', 'direod', 'raw_vol_oa_retxeod_direod']
d = dataset['labels']['dfs'][kc]

In [9]:
# Evaluate d via .compute() and display it as-is; NaN rows are kept (no dropna here).
d.compute()

Unnamed: 0_level_0,vol_oa
id,Unnamed: 1_level_1
2005-03-18 00:00:00+00:00,-1.0
2005-03-19 00:00:00+00:00,
2005-03-20 00:00:00+00:00,
2005-03-21 00:00:00+00:00,1.0
2005-03-22 00:00:00+00:00,1.0
2005-03-23 00:00:00+00:00,1.0
2005-03-24 00:00:00+00:00,-1.0
2005-03-25 00:00:00+00:00,
2005-03-26 00:00:00+00:00,
2005-03-27 00:00:00+00:00,


In [10]:
# Index grid for the gen_group cell, which iterates product(*dataset_grid.values()):
# every feature index is combined with every label index. Key order matters because the
# consumer unpacks each combination positionally as (feat_idx, label_idx).
dataset_grid = dict(
    feat_idx=list(range(5)),
    label_idx=target_idx,
)

In [8]:
logging.info('executing...')
# Iterate over the (feature, label) pairs produced by datagen, using the prep helpers
# imported from model.model_util for the feature and label sides.
for feature, label in datagen(dataset, feat_prep_fn=prepare_transpose_data, label_prep_fn=prepare_label_data):
    # pd_binary_clip returns two label objects (presumably the positive-only and negative-only
    # directions -- TODO confirm); all three are then passed through pd_common_index_rows.
    pos_label, neg_label = pd_binary_clip(label)
    f, lpos, lneg = pd_common_index_rows(feature, pos_label, neg_label)
    # Debug output: shows the label on every iteration; f, lpos and lneg are only consumed by
    # the disabled experiment calls below.
    display(label)
# Disabled model experiments, kept for reference:
#     logging.info('pos dir model experiment')
#     run_trials(ThreeLayerBinaryFFN, f, lpos)

#     logging.info('neg dir model experiment')
#     run_trials(ThreeLayerBinaryFFN, f, lneg)


INFO:root:executing...


id
1998-01-02    1
1998-01-05    1
1998-01-06   -1
1998-01-07   -1
1998-01-08   -1
1998-01-09    1
1998-01-12    1
1998-01-13    1
1998-01-14   -1
1998-01-15    1
1998-01-16    1
1998-01-20   -1
1998-01-21    1
1998-01-22   -1
1998-01-23    1
1998-01-26    1
1998-01-27    1
1998-01-28    1
1998-01-29   -1
1998-01-30    1
1998-02-02    1
1998-02-03    1
1998-02-04   -1
1998-02-05    1
1998-02-06   -1
1998-02-09    1
1998-02-10    1
1998-02-11    1
1998-02-12    1
1998-02-13    1
1998-02-17    1
1998-02-18   -1
1998-02-19    1
1998-02-20   -1
1998-02-23   -1
1998-02-24    1
1998-02-25    1
1998-02-26    1
1998-02-27    1
1998-03-02    1
1998-03-03   -1
1998-03-04   -1
1998-03-05    1
1998-03-06    1
1998-03-09    1
1998-03-10    1
1998-03-11    1
1998-03-12   -1
1998-03-13    1
1998-03-16    1
             ..
2017-10-18    1
2017-10-19    1
2017-10-20   -1
2017-10-23    1
2017-10-24   -1
2017-10-25    1
2017-10-26    1
2017-10-27    1
2017-10-30    1
2017-10-31   -1
2017-11-01    1
2017-

id
1998-01-02    1
1998-01-05    1
1998-01-06   -1
1998-01-07   -1
1998-01-08   -1
1998-01-09    1
1998-01-12    1
1998-01-13    1
1998-01-14   -1
1998-01-15    1
1998-01-16    1
1998-01-20   -1
1998-01-21    1
1998-01-22   -1
1998-01-23    1
1998-01-26    1
1998-01-27    1
1998-01-28    1
1998-01-29   -1
1998-01-30    1
1998-02-02    1
1998-02-03    1
1998-02-04   -1
1998-02-05    1
1998-02-06   -1
1998-02-09    1
1998-02-10    1
1998-02-11    1
1998-02-12    1
1998-02-13    1
1998-02-17    1
1998-02-18   -1
1998-02-19    1
1998-02-20   -1
1998-02-23   -1
1998-02-24    1
1998-02-25    1
1998-02-26    1
1998-02-27    1
1998-03-02    1
1998-03-03   -1
1998-03-04   -1
1998-03-05    1
1998-03-06    1
1998-03-09    1
1998-03-10    1
1998-03-11    1
1998-03-12   -1
1998-03-13    1
1998-03-16    1
             ..
2017-10-18    1
2017-10-19    1
2017-10-20   -1
2017-10-23    1
2017-10-24   -1
2017-10-25    1
2017-10-26    1
2017-10-27    1
2017-10-30    1
2017-10-31   -1
2017-11-01    1
2017-

id
1998-01-02    1
1998-01-05    1
1998-01-06   -1
1998-01-07   -1
1998-01-08   -1
1998-01-09    1
1998-01-12    1
1998-01-13    1
1998-01-14   -1
1998-01-15    1
1998-01-16    1
1998-01-20   -1
1998-01-21    1
1998-01-22   -1
1998-01-23    1
1998-01-26    1
1998-01-27    1
1998-01-28    1
1998-01-29   -1
1998-01-30    1
1998-02-02    1
1998-02-03    1
1998-02-04   -1
1998-02-05    1
1998-02-06   -1
1998-02-09    1
1998-02-10    1
1998-02-11    1
1998-02-12    1
1998-02-13    1
1998-02-17    1
1998-02-18   -1
1998-02-19    1
1998-02-20   -1
1998-02-23   -1
1998-02-24    1
1998-02-25    1
1998-02-26    1
1998-02-27    1
1998-03-02    1
1998-03-03   -1
1998-03-04   -1
1998-03-05    1
1998-03-06    1
1998-03-09    1
1998-03-10    1
1998-03-11    1
1998-03-12   -1
1998-03-13    1
1998-03-16    1
             ..
2017-10-18    1
2017-10-19    1
2017-10-20   -1
2017-10-23    1
2017-10-24   -1
2017-10-25    1
2017-10-26    1
2017-10-27    1
2017-10-30    1
2017-10-31   -1
2017-11-01    1
2017-

KeyboardInterrupt: 

In [8]:
# Runs only when run_compute is set (it is True unless the 'visualize' option was supplied).
if (run_compute):
    logging.info('executing...')
    # gen_group yields (paths, dfs, recs) per group; each of the three unpacks into a
    # features / labels / row_masks triple.
    for paths, dfs, recs in gen_group(dataset, out=['dfs', 'recs']):
        fpaths, lpaths, rpaths = paths
        features, labels, row_masks = dfs
        features_rec, labels_rec, row_masks_rec = recs

        asset = fpaths[0]   # first path component ('dow_jones' in the recorded log); unused below
        logging.info('fpaths: ' +str(fpaths))
        logging.info('lpaths: ' +str(lpaths))
        logging.info('rpaths: ' +str(rpaths))
        print(features_rec)

        # NOTE(review): dataset_grid must already exist in the kernel when this cell runs; the
        # recorded run stopped here with "NameError: name 'dataset_grid' is not defined".
        for feat_idx, label_idx in product(*dataset_grid.values()):
            # One feature column (kept two-dimensional via the list index), with all-NaN rows dropped.
            final_feature = prepare_transpose_data(features.iloc[:, [feat_idx]], row_masks).dropna(axis=0, how='all')
            # NOTE(review): prepare_labels is not among the names imported in the first cell
            # (which imports prepare_label_data) -- confirm where it is defined.
            final_label = prepare_labels(labels.iloc[:, label_idx]).dropna()
            # Both calls are wrapped in dask's delayed, so feat_lab is only evaluated when
            # .compute() is called on it in a later cell.
            final_label = delayed(ser_shift)(final_label)
            feat_lab = delayed(pd_common_index_rows)(final_feature, final_label)
            break   # debug: stop after the first grid combination
        break       # debug: stop after the first group

INFO:root:executing...
INFO:root:fpaths: ['dow_jones', 'dmx', 'raw_pba']
INFO:root:lpaths: ['dow_jones', 'direod', 'raw_pba_oa_retxeod_direod']
INFO:root:rpaths: ['dow_jones', 'id_rm', 'raw_pba']


Pandas(Index=995, id=995, name='dow_jones_mutate_995', dir='dow_jones/dow_jones_raw_0/H/dmx/', freq='H', root='dow_jones', basis='dow_jones_raw_0', stage='mutate', recon_type=nan, mutate_type='dmx', raw_cat='us_equity_index', hist='->mutate_dmx', desc='raw_pba_dmx', size=2, dumptime=0.03, hash='367136082725478097496128', created='2018-08-12 02:13:23', modified='2018-10-29 01:00:24')


NameError: name 'dataset_grid' is not defined

In [42]:
# Evaluate the delayed pd_common_index_rows call built in the gen_group cell and unpack its
# two results into f (feature) and l (label). Requires feat_lab to exist in the kernel.
f, l = tuple(feat_lab.compute())

In [43]:
# Display the computed label values.
l

1998-01-02    1
1998-01-05    1
1998-01-06   -1
1998-01-07   -1
1998-01-08   -1
1998-01-09    1
1998-01-12    1
1998-01-13    1
1998-01-14   -1
1998-01-15    1
1998-01-16    1
1998-01-20   -1
1998-01-21    1
1998-01-22   -1
1998-01-23    1
1998-01-26    1
1998-01-27    1
1998-01-28    1
1998-01-29   -1
1998-01-30    1
1998-02-02    1
1998-02-03    1
1998-02-04   -1
1998-02-05    1
1998-02-06   -1
1998-02-09    1
1998-02-10    1
1998-02-11    1
1998-02-12    1
1998-02-13    1
1998-02-17    1
1998-02-18   -1
1998-02-19    1
1998-02-20   -1
1998-02-23   -1
1998-02-24    1
1998-02-25    1
1998-02-26    1
1998-02-27    1
1998-03-02    1
1998-03-03   -1
1998-03-04   -1
1998-03-05    1
1998-03-06    1
1998-03-09    1
1998-03-10    1
1998-03-11    1
1998-03-12   -1
1998-03-13    1
1998-03-16    1
             ..
2017-10-18    1
2017-10-19    1
2017-10-20   -1
2017-10-23    1
2017-10-24   -1
2017-10-25    1
2017-10-26    1
2017-10-27    1
2017-10-30    1
2017-10-31   -1
2017-11-01    1
2017-11-