# nb-model_xg-mdl

In [1]:
import sys
import os
from os import sep
from os.path import dirname, realpath
from pathlib import Path
from functools import partial, reduce
import logging

def get_cwd(fname, subdir, crunch_dir=realpath(Path.home()) +sep +'crunch' +sep):
    """
    Build the path string of a file located inside the crunch project directory.

    The components are combined with os.path.join, so trailing separators on
    `crunch_dir` and `subdir` are optional: 'model' and 'model' + sep give the
    same result.

    Args:
        fname (str): file name, e.g. the notebook's file name
        subdir (str): subdirectory of the project directory containing the file
        crunch_dir (str): project root directory; the default is
            '<resolved home directory>/crunch/' and is evaluated once, when the
            function is defined

    Returns:
        str: crunch_dir, subdir and fname joined into a single path
    """
    return os.path.join(crunch_dir, subdir, fname)

def fix_path(cwd):
    """
    Adjust interpreter state so a notebook can behave like a script in crunch.

    Overwrites sys.argv[0] with `cwd`, then makes sure the absolute path of the
    parent of the current working directory is present on sys.path (it is
    appended only if it is not already there).
    """
    sys.argv[0] = cwd
    parent_dir = os.path.abspath(os.pardir)
    if parent_dir in sys.path:
        return
    sys.path.append(parent_dir)

# Identify this notebook and patch sys.argv / sys.path via fix_path.
# This runs before the project imports below (common_util, model, recon).
fname = 'nb-model_xg-mdl.ipynb'
dir_name = 'model'
fix_path(get_cwd(fname, dir_name +sep))

import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
from dask import delayed, compute
from torch.utils.data import TensorDataset, DataLoader
import torch

from ipywidgets import interact, interactive, fixed
from IPython.display import display

# Upper bounds on the rows / columns pandas renders in notebook output.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 50

from common_util import MODEL_DIR, RECON_DIR, JSON_SFX_LEN, DT_CAL_DAILY_FREQ, is_type, pd_common_idx_rows, remove_dups_list, NestedDefaultDict, set_loglevel, search_df, chained_filter, get_variants, load_df, dump_df, load_json, gb_transpose, pd_common_index_rows, filter_cols_below, inner_join, outer_join, ser_shift, list_get_dict, window_iter, benchmark
from common_util import midx_get_level, pd_rows, midx_intersect, pd_common_idx_rows, midx_split, pd_midx_to_arr, window_iter, np_is_ndim, identity_fn
from model.common import DATASET_DIR, XG_PROCESS_DIR, XG_DATA_DIR, XG_DIR, PYTORCH_MODELS_DIR, ERROR_CODE, TEST_RATIO, VAL_RATIO, EXPECTED_NUM_HOURS, default_dataset
from model.xg_util import xgload
from model.model_util import CLF_MAP
from recon.dataset_util import GEN_GROUP_CONSTRAINTS, gen_group
# Configure the root logger to write to stdout at DEBUG level.
# NOTE(review): the recorded cell output shows "CRITICAL:root:..." records emitted
# by the imports above, already in the default basicConfig format. That suggests
# the root logger has a handler before this line runs; if so this call is a no-op
# (basicConfig does nothing once handlers exist unless force=True, Python 3.8+).
# Confirm, and move this call above the project imports if DEBUG output is wanted.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

CRITICAL:root:script location: /home/kev/crunch/model/nb-model_xg-mdl.ipynb
CRITICAL:root:using project dir: /home/kev/crunch/


## Load Data

In [2]:
# Asset names used as top-level keys below; the first one is selected.
assets = [
    'sp_500',
    'russell_2000',
    'nasdaq_100',
    'dow_jones',
]
chosen_asset = assets[0]

In [3]:
# Load features (f), labels (l) and targets (t) from their XG_DATA_DIR subdirectories.
f, l, t = (
    xgload(XG_DATA_DIR +subdir +sep)
    for subdir in ('features', 'labels', 'targets')
)

In [4]:
# Report how many items each loaded object yields when iterated.
print('num f:', len(list(f)))
print('num l:', len(list(l)))
print('num t:', len(list(t)))

num f: 2520
num l: 1008
num t: 1504


### ddir / dret

In [5]:
# Per-asset child key lists under <asset>/ddir/ddir in the labels, for the
# pba- and vol-prefixed series.
ddir_pba_hoc = {
    asset: list(l.childkeys([asset, 'ddir', 'ddir', 'pba_hoc_hdxret_ddir']))
    for asset in assets
}
ddir_vol_hoc = {
    asset: list(l.childkeys([asset, 'ddir', 'ddir', 'vol_hoc_hdxret_ddir']))
    for asset in assets
}

In [6]:
# Per-asset child key lists under <asset>/dret/dret in the targets, for the
# pba- and vol-prefixed series.
dret_pba_hoc = {
    asset: list(t.childkeys([asset, 'dret', 'dret', 'pba_hoc_hdxret_dret']))
    for asset in assets
}
dret_vol_hoc = {
    asset: list(t.childkeys([asset, 'dret', 'dret', 'vol_hoc_hdxret_dret']))
    for asset in assets
}

### ddir1 / dret1

In [7]:
# Per-asset child key lists under <asset>/ddir1 in the labels, for the
# 'ddir1_lin' and 'ddir1_log' groups of the pba- and vol-prefixed series.
ddir1_pba_hoc_lin = {
    asset: list(l.childkeys([asset, 'ddir1', 'ddir1_lin', 'pba_hoc_hdxret1_ddir1']))
    for asset in assets
}
ddir1_pba_hoc_log = {
    asset: list(l.childkeys([asset, 'ddir1', 'ddir1_log', 'pba_hoc_hdxret1_ddir1']))
    for asset in assets
}
ddir1_vol_hoc_lin = {
    asset: list(l.childkeys([asset, 'ddir1', 'ddir1_lin', 'vol_hoc_hdxret1_ddir1']))
    for asset in assets
}
ddir1_vol_hoc_log = {
    asset: list(l.childkeys([asset, 'ddir1', 'ddir1_log', 'vol_hoc_hdxret1_ddir1']))
    for asset in assets
}

In [8]:
# Per-asset child key lists under <asset>/dret1 in the targets, for the
# 'dret1_lin' and 'dret1_log' groups of the pba- and vol-prefixed series.
dret1_pba_hoc_lin = {
    asset: list(t.childkeys([asset, 'dret1', 'dret1_lin', 'pba_hoc_hdxret1_dret1']))
    for asset in assets
}
dret1_pba_hoc_log = {
    asset: list(t.childkeys([asset, 'dret1', 'dret1_log', 'pba_hoc_hdxret1_dret1']))
    for asset in assets
}
dret1_vol_hoc_lin = {
    asset: list(t.childkeys([asset, 'dret1', 'dret1_lin', 'vol_hoc_hdxret1_dret1']))
    for asset in assets
}
dret1_vol_hoc_log = {
    asset: list(t.childkeys([asset, 'dret1', 'dret1_log', 'vol_hoc_hdxret1_dret1']))
    for asset in assets
}

### Features

In [9]:
# Distinct values at position 1 of the feature child keys of the first asset.
list({key_chain[1] for key_chain in f.childkeys([assets[0]])})

['hdzn',
 'dc',
 'dwrod',
 'dwrxmx',
 'hduni',
 'hohlca',
 'dlogret',
 'hdpt',
 'dwrzn',
 'dwrpt',
 'dffd',
 'hdod',
 'ddiff',
 'dwrmx',
 'dohlca',
 'hdmx',
 'hdgau']

In [15]:
# Collect, per asset, the feature child keys below <asset>/ddiff/ddiff_pba_vol,
# then keep the ones belonging to the chosen asset.
kc_end = ['ddiff', 'ddiff_pba_vol']
ft_all = {
    asset: list(f.childkeys([asset] + kc_end))
    for asset in assets
}
feat = ft_all[chosen_asset]

In [28]:
# Show the feature key chains selected for the chosen asset.
feat

[['sp_500',
  'ddiff',
  'ddiff_pba_vol',
  'pba_dohlca_ddiff',
  'pba_dohlca_ddiff(1)'],
 ['sp_500',
  'ddiff',
  'ddiff_pba_vol',
  'vol_dohlca_ddiff',
  'vol_dohlca_ddiff(1)']]

## Select Data

In [34]:
# Inner-join the two selected feature frames, take the first pba label and
# target entries for the chosen asset, and pass all three through
# pd_common_idx_rows so they end up with the same number of rows.
feature_df, label_df, target_df = pd_common_idx_rows(
    inner_join(f[feat[0]], f[feat[1]]),
    l[ddir_pba_hoc[chosen_asset][0]],
    t[dret_pba_hoc[chosen_asset][0]],
)
assert feature_df.shape[0] == label_df.shape[0] == target_df.shape[0]

## Model

In [38]:
# Display the selected feature frame.
feature_df

Unnamed: 0,pba_open,pba_high,pba_low,pba_close,pba_avgPrice,vol_open,vol_high,vol_low,vol_close,vol_avgPrice
2009-01-05,26.18,1.90,20.18,-4.35,8.173325,-0.34,0.40,1.42,-0.11,1.061950
2009-01-06,2.00,7.22,7.75,7.25,6.726387,-1.18,-0.89,-0.96,-0.52,-0.712587
2009-01-07,-3.72,-16.40,-24.91,-28.05,-21.802400,0.50,4.49,1.22,4.83,3.290450
2009-01-08,-21.72,-17.45,-5.56,3.08,-9.761875,4.82,0.78,4.00,-0.83,1.858525
2009-01-09,4.18,1.93,-8.50,-19.38,-7.565663,-2.20,-1.47,-1.51,0.26,-1.249475
...,...,...,...,...,...,...,...,...,...,...
2017-12-21,-5.16,1.63,6.29,5.32,6.206425,-0.21,0.01,0.46,-0.10,0.076250
2017-12-22,1.20,-7.29,-4.27,-1.23,-5.591513,0.10,0.32,0.02,0.28,0.192300
2017-12-26,-5.13,-2.61,-0.17,-2.84,-1.804388,0.89,0.20,0.78,0.35,0.434037
2017-12-27,3.01,2.90,0.95,2.12,1.634150,-0.21,0.41,-0.45,0.22,-0.008987


In [23]:
# Instantiate the classifier registered under CLF_MAP['pytorch']['TCN'] with no arguments.
clf = CLF_MAP['pytorch']['TCN']()

In [24]:
# Show the classifier's `space` attribute (the recorded output is a dict of
# hyperopt expressions plus two fixed values).
clf.space

{'epochs': <hyperopt.pyll.base.Apply at 0x7fce701afd90>,
 'batch_size': <hyperopt.pyll.base.Apply at 0x7fce701afed0>,
 'loss': <hyperopt.pyll.base.Apply at 0x7fce701af590>,
 'opt': <hyperopt.pyll.base.Apply at 0x7fce701afad0>,
 'input_windows': <hyperopt.pyll.base.Apply at 0x7fce701a5310>,
 'topology': <hyperopt.pyll.base.Apply at 0x7fce701a5090>,
 'kernel_size': <hyperopt.pyll.base.Apply at 0x7fce701af2d0>,
 'dropout': <hyperopt.pyll.base.Apply at 0x7fce701af450>,
 'attention': False,
 'max_attn_len': 120}

In [37]:
# Fixed hyperparameters for a single run; the keys mirror those of clf.space.
params = dict(
    epochs=300,
    batch_size=256,
    loss='nll',
    opt=dict(name='Adam', lr=1e-3),
    input_windows=5,
    topology=[5],
    kernel_size=3,
    dropout=0,
    attention=False,
    max_attn_len=120,
)

In [39]:
# Build the objective function over the fixed feature/label data selected above.
obj_fn = clf.make_const_data_objective(
    feature_df,
    label_df,
    exp_logdir=None,
    exp_meta=None,
    clf_type='binary',
    meta_obj='val_loss',
    obj_agg='last',
    obj_mode='min',
    meta_var=None,
    val_ratio=VAL_RATIO,
    test_ratio=TEST_RATIO,
    shuffle=False,
)

In [40]:
# Evaluate the objective once with the fixed `params`.
# NOTE(review): the recorded run of this cell failed with
# "RuntimeError: CUDA error: device-side assert triggered". Presumably an index
# is out of range on the GPU (e.g. label values outside [0, n_classes) for the
# 'nll' loss) -- confirm by inspecting the values in label_df and re-running on
# CPU or with CUDA_LAUNCH_BLOCKING=1 to locate the failing op.
res = obj_fn(params)

RuntimeError: CUDA error: device-side assert triggered