# Model Debug

In [1]:
import sys
import os
from os import sep
from os.path import dirname, realpath
from pathlib import Path
from functools import partial
import logging

def get_cwd(fname, subdir, crunch_dir=realpath(Path.home()) +sep +'crunch' +sep):
    """
    Build the path string of a notebook file located inside the crunch directory.

    Jupyter Notebook in Anaconda invokes the Python interpreter in Anaconda's subdirectory,
    which is why sys.argv[0] has to be replaced with a path like the one built here.
    In the future a better way to do this should be preferred.

    Args:
        fname (str): file name of the notebook, e.g. 'dataload.ipynb'
        subdir (str): relative subdirectory of crunch_dir that holds the file; a trailing
            separator is accepted but not required
        crunch_dir (str): root directory, defaults to '<home>/crunch/' (the default is
            evaluated once, when the function is defined)

    Returns:
        str: crunch_dir, subdir and fname joined into one path
    """
    # os.path.join inserts a separator only where one is missing, so callers that already
    # pass 'model' +sep get the same string as before while 'model' alone no longer
    # produces a glued-together name such as '.../crunchmodeldataload.ipynb'.
    return os.path.join(crunch_dir, subdir, fname)
    
def fix_path(cwd):
    """
    Make a Jupyter kernel look like a script that was started from inside crunch.

    Overwrites sys.argv[0] with cwd, then makes sure the absolute path of the parent of the
    current working directory is on sys.path (appended at the end, never duplicated).
    """
    sys.argv[0] = cwd
    parent_dir = os.path.abspath('..')
    if parent_dir in sys.path:
        return
    sys.path.append(parent_dir)

# Notebook-specific settings: adjust these two values when this cell is copied to another notebook.
fname = 'dataload.ipynb'   # FILL: file name of this notebook
dir_name = 'model'         # FILL: subdirectory (under the crunch directory) that contains this notebook
# Rewrites sys.argv[0] and appends the parent directory to sys.path; presumably this has to
# run before the project imports further down in this cell can resolve — TODO confirm.
fix_path(get_cwd(fname, dir_name +sep))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from dask import delayed, compute
from torch.utils.data import TensorDataset, DataLoader
import torch

from ipywidgets import interact, interactive, fixed
from IPython.display import display

# Cap how many rows / columns pandas renders when a frame is displayed in cell output.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 50

from common_util import RECON_DIR, JSON_SFX_LEN, DT_CAL_DAILY_FREQ, set_loglevel, chained_filter, get_variants, dump_df, load_json, gb_transpose, reindex_on_time_mask, pd_common_index_rows, filter_cols_below, inner_join, outer_join, ser_shift, pd_to_pytorch, list_get_dict, window_iter, benchmark
from model.common import DATASET_DIR, EXPECTED_NUM_HOURS, default_dataset
from model.data_util import datagen, prepare_transpose_data, prepare_label_data, prepare_target_data
from data.data_api import DataAPI
from data.access_util import df_getters as dg, col_subsetters2 as cs2
from recon.dataset_util import prep_dataset, gen_group
from recon.split_util import get_train_test_split, gen_time_series_split, index_three_split

Using TensorFlow backend.


In [2]:
# Select the logging verbosity through the common_util helper; presumably this configures the
# root logger so that logging.info calls in later cells are shown and logging.debug calls are
# not — TODO confirm against set_loglevel.
set_loglevel('info')

In [3]:
# Dataset selection: which dataset spec to load and which assets to restrict it to.
dataset_name = default_dataset
assets_str = 'sp_500'
# Comma separated asset names -> list of names with surrounding whitespace removed.
assets = [asset.strip() for asset in assets_str.split(',')]

dataset_dict = load_json(dataset_name, dir_path=DATASET_DIR)
dataset = prep_dataset(dataset_dict, assets=assets)

# NOTE(review): len(dataset['features']) counts the entries of dataset['features'] itself, while
# the debug line below reads the frames from dataset['features']['dfs'] — confirm which count
# the "df(s)" message is meant to report.
logging.info('dataset: {} {} df(s)'.format(len(dataset['features']), dataset_name))
# Identity comparison is the correct way to test for None (an == check can be overridden).
logging.info('assets: {}'.format(str('all' if (assets is None) else ', '.join(assets))))
logging.debug('fpaths: {}'.format(str(list(dataset['features']['dfs'].keys()))))
logging.debug('lpaths: {}'.format(str(list(dataset['labels']['dfs'].keys()))))

INFO:root:dataset: 2 mvp_dnorm_raw_pba_avgprice.json df(s)
INFO:root:assets: sp_500


In [4]:
# Pull only the first (feature, label, target) combination produced by datagen for debugging.
sample_gen = datagen(
    dataset,
    feat_prep_fn=prepare_transpose_data,
    label_prep_fn=prepare_label_data,
    target_prep_fn=prepare_target_data,
    how='ser_to_ser',
)
first_sample = next(iter(sample_gen), None)
if first_sample is None:
    # Fail loudly: otherwise f, l and t would stay undefined, or silently keep the values
    # of an earlier run of this cell in the same kernel.
    raise ValueError('datagen yielded no (feature, label, target) combination for this dataset')

# Same names the original loop header bound, so later cells can keep using them.
(fpath, lpath, tpath, frec, lrec, trec, fcol, lcol, tcol, feature, label, target) = first_sample
logging.info('(X, y, z) -> ({fdesc}, {ldesc}, {tdesc})'.format(fdesc=frec.desc, ldesc=lrec.desc, tdesc=trec.desc))
logging.info('(X, y, z) -> ({fcol}, {lcol}, {tcol})'.format(fcol=fcol, lcol=lcol, tcol=tcol))

# Short aliases used by the following cells.
f = feature
l = label
t = target

INFO:root:(X, y, z) -> (raw_pba_dmx, raw_pba_oa_retxeod_direod, raw_pba_oa_retxeod_reteod)
INFO:root:(X, y, z) -> (pba_avgPrice, pba_oa, pba_oa)


In [5]:
# Unshuffled three-way split of the shared index: val_ratio=.2, test_ratio=.2, remainder for training.
train_idx, val_idx, test_idx = index_three_split(f.index, l.index, t.index, val_ratio=.2, test_ratio=.2, shuffle=False)


def _three_way(frame):
    """Return fresh (train, val, test) .loc selections of frame, in that order."""
    return tuple(frame.loc[idx] for idx in (train_idx, val_idx, test_idx))


# pandas objects
feat_train, feat_val, feat_test = _three_way(f)
lab_train, lab_val, lab_test = _three_way(l)
tar_train, tar_val, tar_test = _three_way(t)

# numpy arrays (taken from new selections, independent of the pandas objects above)
feat_train_np, feat_val_np, feat_test_np = (part.values for part in _three_way(f))
lab_train_np, lab_val_np, lab_test_np = (part.values for part in _three_way(l))
tar_train_np, tar_val_np, tar_test_np = (part.values for part in _three_way(t))

# results of pd_to_pytorch (again from new selections)
feat_train_tor, feat_val_tor, feat_test_tor = (pd_to_pytorch(part) for part in _three_way(f))
lab_train_tor, lab_val_tor, lab_test_tor = (pd_to_pytorch(part) for part in _three_way(l))
tar_train_tor, tar_val_tor, tar_test_tor = (pd_to_pytorch(part) for part in _three_way(t))

In [12]:
def to_ds(*data):
    """Convert every array in data with torch.tensor and bundle the results in one TensorDataset."""
    return TensorDataset(*(torch.tensor(arr) for arr in data))


# One (features, labels) pair per split, in train / val / test order; the target arrays are not included.
data_tuples = zip([feat_train_np, feat_val_np, feat_test_np], [lab_train_np, lab_val_np, lab_test_np])
train_ds, val_ds, test_ds = (to_ds(X, y) for X, y in data_tuples)

# Unshuffled loaders with a batch size of 10 for each split.
train_dl, val_dl, test_dl = (
    DataLoader(split_ds, batch_size=10, shuffle=False)
    for split_ds in (train_ds, val_ds, test_ds)
)