# Experiment Group Model Test

In [67]:
import sys
import os
from os import sep
from os.path import dirname, realpath
from pathlib import Path
from functools import partial, reduce
import logging

def get_cwd(fname, subdir, crunch_dir=realpath(Path.home()) +sep +'crunch' +sep):
    """
    Build the path string of a file inside the crunch project tree.

    The result is the plain concatenation of `crunch_dir`, `subdir` and `fname`, in that order.
    No separators are inserted, so `crunch_dir` and `subdir` must already end with one.
    The default `crunch_dir` is '<resolved home directory>/crunch/' and is computed once, when
    this function is defined.
    """
    return ''.join((crunch_dir, subdir, fname))

def fix_path(cwd):
    """
    Make a jupyter notebook look like a regular crunch script to the interpreter.

    Overwrites sys.argv[0] with `cwd` and makes sure the absolute path of the parent of the
    current working directory is on sys.path (appended once, never duplicated).
    """
    sys.argv[0] = cwd
    parent_dir = os.path.abspath(os.pardir)
    if parent_dir in sys.path:
        return
    sys.path.append(parent_dir)

# Identify this notebook (file name and crunch subdirectory), then patch argv / sys.path.
# NOTE(review): confirm fname matches this notebook's actual file name.
dir_name = 'model'              # FILL
fname = 'dataload_test.ipynb'   # FILL
notebook_path = get_cwd(fname, dir_name +sep)
fix_path(notebook_path)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from dask import delayed, compute
from torch.utils.data import TensorDataset, DataLoader
import torch

from ipywidgets import interact, interactive, fixed
from IPython.display import display

pd.set_option("display.max_rows", 100)
pd.set_option('display.max_columns', 50)

from common_util import RECON_DIR, JSON_SFX_LEN, DT_CAL_DAILY_FREQ, is_type, pd_common_idx_rows, remove_dups_list, set_loglevel, chained_filter, get_variants, dump_df, load_json, gb_transpose, pd_common_index_rows, filter_cols_below, inner_join, outer_join, ser_shift, list_get_dict, window_iter, benchmark
from common_util import str_to_list, isnt, np_inner, get_class_name, get0, midx_get_level, midx_intersect, pd_common_idx_rows, midx_split, pd_midx_to_arr, window_iter, np_is_ndim
from model.common import XG_DIR, EXPECTED_NUM_HOURS, default_xg
from model.data_util import xgdg, align_first_last_cols, prune_nulls
from model.model_util import CLF_MAP
from data.data_api import DataAPI
from data.access_util import df_getters as dg, col_subsetters2 as cs2
from recon.dataset_util import prep_dataset, gen_group
from recon.split_util import get_train_test_split, gen_time_series_split, index_three_split, pd_binary_clip

Use this script to test whether a particular xgdg works with a particular model
* Use xgdg_dataload_test notebook before using this one

## Set Experiment Parameters

In [23]:
# Set the logging level to 'info' through the common_util helper.
set_loglevel('info')

In [24]:
# Show the entries of XG_DIR in sorted order (the values usable for the 'xg=' option below).
sorted(os.listdir(XG_DIR))

['test_xg0_single_channel_daily.json',
 'test_xg1_multi_channel_daily.json',
 'test_xg2_single_channel_intraday.json',
 'test_xg3_multi_channel_intraday.json',
 'xg0_reteod_direod.json']

In [122]:
# No log directory: the writer in the fit cell is only created when this is not None.
logdir = None
# Hand-written stand-in for the experiment options; keys keep a trailing '='.
# NOTE(review): presumably mirrors the command-line option names of the matching script -- confirm.
cmd_input = {
    'model=': 'TCN',                                # key into CLF_MAP[backend]
    'backend=': 'pytorch',                          # key into CLF_MAP
    'xg=': 'test_xg0_single_channel_daily.json',    # one of the files listed from XG_DIR
    'assets=': 'sp_500',                            # parsed with str_to_list
    'trials_count=': 100,                           # converted with int()
}

In [123]:
# Resolve every option, falling back to a default only when the option is None.
# NOTE(review): default_model, default_backend and default_trials_count are not among the names
# imported in the visible import cell (only default_xg is); those fallbacks would raise NameError
# if ever reached -- confirm where they should come from.
model_code = default_model if (cmd_input['model='] is None) else cmd_input['model=']
backend_name = default_backend if (cmd_input['backend='] is None) else cmd_input['backend=']
xg_fname = default_xg if (cmd_input['xg='] is None) else cmd_input['xg=']
assets = None if (cmd_input['assets='] is None) else str_to_list(cmd_input['assets='])
trials_count = default_trials_count if (cmd_input['trials_count='] is None) else int(cmd_input['trials_count='])

In [124]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Make Model

In [125]:
# Instantiate the model wrapper registered for this backend / model code.
mod_obj = CLF_MAP[backend_name][model_code]()
mod_name = get_class_name(mod_obj)
# Drop only a literal '.json' suffix. str.rstrip('.json') would strip any trailing run of the
# characters '.', 'j', 's', 'o', 'n' and so eat into names that end in those letters.
xg_name = xg_fname[:-len('.json')] if (xg_fname.endswith('.json')) else xg_fname

logging.info('model: {}'.format(mod_name))
logging.info('backend: {}'.format(backend_name))
logging.info('xg: {}'.format(xg_name))
logging.info('assets: {}'.format('all' if (assets is None) else ', '.join(assets)))

INFO:root:model: TCN_CLF
INFO:root:backend: pytorch
INFO:root:xg: test_xg0_single_channel_daily
INFO:root:assets: sp_500


## Data Gen

In [126]:
# Take only the first (paths, recs, dfs) triple yielded by the experiment group data generator.
xg_iter = xgdg(xg_fname, delayed=True, assets=assets, filters_map=None)
for i, (paths, recs, dfs) in enumerate(xg_iter):
    (fpath, lpath, tpath), (frec, lrec, trec) = paths, recs
    xyz_msg = '(X, y, z) -> ({fdesc}[:], {ldesc}[:], {tdesc}[:])'.format(fdesc=frec.desc, ldesc=lrec.desc, tdesc=trec.desc)
    logging.info(xyz_msg)
    # dfs is computed here to materialize the three frames (it was requested with delayed=True).
    f, l, t = dfs.compute()
    break

INFO:root:(X, y, z) -> (raw_pba_oc_retxeod_reteod[:], raw_pba_oc_retxeod_direod[:], raw_pba_oc_retxeod_reteod[:])


In [127]:
# Split the labels with pd_binary_clip and keep only the first returned part as the label set.
# NOTE(review): presumably pos_l / neg_l are the positive- and negative-class clips of l -- confirm
# against pd_binary_clip.
pos_l, neg_l = pd_binary_clip(l)
l = pos_l

In [128]:
# Train / validation / test proportions; train takes whatever val and test leave over.
val_ratio = test_ratio = .2
train_ratio = 1-(val_ratio+test_ratio)
split_ratios = (train_ratio, val_ratio, test_ratio)

# Split the index of each frame (features, labels, targets) with the same ratios.
f_train_idx, f_val_idx, f_test_idx = midx_split(f.index, *split_ratios)
l_train_idx, l_val_idx, l_test_idx = midx_split(l.index, *split_ratios)
t_train_idx, t_val_idx, t_test_idx = midx_split(t.index, *split_ratios)

In [129]:
# Select the train / val / test rows of each frame by the index splits computed above.
f_train_pd, f_val_pd, f_test_pd = (f.loc[idx] for idx in (f_train_idx, f_val_idx, f_test_idx))
l_train_pd, l_val_pd, l_test_pd = (l.loc[idx] for idx in (l_train_idx, l_val_idx, l_test_idx))
t_train_pd, t_val_pd, t_test_pd = (t.loc[idx] for idx in (t_train_idx, t_val_idx, t_test_idx))

In [130]:
# Convert the feature splits to numpy. Multi-indexed features are stacked and converted with
# pd_midx_to_arr; flat-indexed features use their values directly.
# pd.MultiIndex is the public name of the class; the private pd.core.index path is deprecated
# and absent from newer pandas releases.
if (is_type(f.index, pd.MultiIndex)):
    f_train_np, f_val_np, f_test_np = map(pd_midx_to_arr, [f_train_pd.stack(), f_val_pd.stack(), f_test_pd.stack()])
else:
    f_train_np, f_val_np, f_test_np = f_train_pd.values, f_val_pd.values, f_test_pd.values
# Labels and targets are always taken as plain value arrays.
l_train_np, l_val_np, l_test_np = l_train_pd.values, l_val_pd.values, l_test_pd.values
t_train_np, t_val_np, t_test_np = t_train_pd.values, t_val_pd.values, t_test_pd.values

In [131]:
# Validation targets as a float32 tensor on the chosen device, without gradient tracking,
# with size-1 dimensions squeezed out.
t_val_tc = torch.tensor(t_val_np, dtype=torch.float32, device=dev, requires_grad=False)
t_val_tc = t_val_tc.squeeze()

## Set Observation Shape

In [132]:
# Ask the model wrapper for the observation shape that corresponds to the training feature array,
# then print both shapes for comparison.
obs_shape = mod_obj.get_obs_shape(f_train_np.shape)
print(f_train_np.shape)
print(obs_shape)

(3007, 1)
(1, 1)


## Set Hyperparameters

In [133]:
# Display the model's hyperparameter space, for reference when filling in `params` by hand.
mod_obj.get_space()

{'epochs': <hyperopt.pyll.base.Apply at 0x7f6463759860>,
 'batch_size': <hyperopt.pyll.base.Apply at 0x7f6463759908>,
 'loss': <hyperopt.pyll.base.Apply at 0x7f6463759358>,
 'opt': <hyperopt.pyll.base.Apply at 0x7f64637595f8>,
 'input_windows': <hyperopt.pyll.base.Apply at 0x7f64637353c8>,
 'topology': <hyperopt.pyll.base.Apply at 0x7f6463735b00>,
 'kernel_size': <hyperopt.pyll.base.Apply at 0x7f6463759160>,
 'dropout': <hyperopt.pyll.base.Apply at 0x7f64637592b0>,
 'attention': False,
 'max_attn_len': 120}

In [134]:
# Normalized frequencies of the training labels, taken in reversed order; used below as the 'cw'
# entry of params.
# NOTE(review): presumably meant to give the rarer class the larger weight. The resulting order
# depends on how value_counts() sorts and how reversed() walks the result -- confirm the weights
# line up with the class indices the loss expects.
pmf = list(reversed(l_train_pd.value_counts(normalize=True)))

In [135]:
# Fixed hyperparameters for this test run. The keys follow the space shown by get_space() above,
# plus 'cw', which is turned into a tensor and handed to make_loss_fn in the model cell.
params = {
    'epochs': 500,          # number of passes of the fit loop
    'batch_size': 128, #256
    'loss': 'nll',
    'cw': pmf,              # reversed label frequencies computed in the previous cell
    'opt': {
        'name': 'Adam',
        'lr': .0001
    },
    'input_windows': 20,
    'topology': [100],
    'kernel_size': 8,
    'dropout': .1,
    'attention': False,
    'max_attn_len': 80,
}

## Verify Preproc

In [137]:
# (features, labels) pairs in the form passed to mod_obj.preproc.
train_data = (f_train_np, l_train_np)
val_data = (f_val_np, l_val_np)

In [143]:
# Sanity check: shape of the training target array.
t_train_np.shape

(3007,)

In [146]:
# Sanity check: shape of the training feature values before any model preprocessing.
f_train_pd.values.shape

(3007, 1)

In [120]:
# Run the model's preprocessing on the training pair and print the resulting feature and label
# shapes, to verify them before fitting.
X, y = mod_obj.preproc(params, train_data)
print(X.shape)
print(y.shape)

(2988, 1, 620)
(2988, 1)


## Define Batch Loss Compute Function

In [58]:
def bloss(params, model, loss_function, feat_batch, lab_batch, optimizer=None, ret_train_pred=False, metrics_fns=mod_obj.metrics_fns):
    """
    Evaluate one batch: forward pass, loss, metrics and (when an optimizer is given) one update step.

    `params` is accepted but not used here. Predictions are the argmax over dim 1 of the model
    outputs; every function in `metrics_fns` is called as fn(labels, predictions) on CPU copies.

    Returns (loss value, batch length, metrics dict). Unless an optimizer was passed with
    `ret_train_pred` left False, a fourth element is appended: (exp of the max output,
    predictions as float).
    NOTE(review): exp() presumably turns log-probabilities into confidences -- confirm the model
    ends in a log-softmax.
    """
    net_out = model(feat_batch)
    loss = loss_function(net_out, lab_batch)
    # Per-row maximum of the outputs and the index where it occurs (the predicted class).
    max_batch, pred_batch = torch.max(net_out, dim=1)
    labels_cpu, preds_cpu = lab_batch.cpu(), pred_batch.cpu()
    metrics = {}
    for name, fn in metrics_fns.items():
        metrics[name] = fn(labels_cpu, preds_cpu)

    training = optimizer is not None
    if (training):
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    summary = (loss.item(), len(feat_batch), metrics)
    if (training and not ret_train_pred):
        return summary
    return summary + ((max_batch.exp(), pred_batch.float()),)

## Make and Fit Model

In [62]:
# Make Model, Loss Fn, and Optimizer
mdl = mod_obj.get_model(params, obs_shape).to(device=dev)
loss_fn = mod_obj.make_loss_fn(params, None if (isnt(params['cw'])) else torch.tensor(params['cw']).to(dev)).to(dev)
opt = mod_obj.make_optimizer(params, mdl.parameters())
display(mdl)

TCN(
  (tcn): TemporalConvNet(
    (network): Sequential(
      (0): TemporalBlock(
        (conv1): Conv1d(1, 100, kernel_size=(8,), stride=(1,), padding=(7,))
        (chomp1): Chomp1d()
        (relu1): ReLU()
        (dropout1): Dropout(p=0.1)
        (conv2): Conv1d(100, 100, kernel_size=(8,), stride=(1,), padding=(7,))
        (chomp2): Chomp1d()
        (relu2): ReLU()
        (dropout2): Dropout(p=0.1)
        (net): Sequential(
          (0): Conv1d(1, 100, kernel_size=(8,), stride=(1,), padding=(7,))
          (1): Chomp1d()
          (2): ReLU()
          (3): Dropout(p=0.1)
          (4): Conv1d(100, 100, kernel_size=(8,), stride=(1,), padding=(7,))
          (5): Chomp1d()
          (6): ReLU()
          (7): Dropout(p=0.1)
        )
        (downsample): Conv1d(1, 100, kernel_size=(1,), stride=(1,))
        (relu): ReLU()
      )
    )
  )
  (out): Linear(in_features=100, out_features=2, bias=True)
  (logprob): LogSoftmax()
)

In [68]:
# Optional writer, only created when a log directory is set. `self` does not exist at notebook
# scope; every other model-wrapper call in this cell goes through mod_obj, so this one does too.
writer = mod_obj.tbx(params, logdir) if (logdir is not None) else None
mdl.zero_grad()
opt.zero_grad()

# Per-epoch history: 'loss' / 'val_loss' plus a train and a 'val_'-prefixed series per metric.
history = {
    'loss': [],
    'val_loss': []
}
for name in mod_obj.metrics_fns:
    history[name] = []
    history['val_{}'.format(name)] = []

# Fit Model
try:
    for epoch in range(params['epochs']):
        epoch_str = str(epoch).zfill(3)

        # Training pass: one optimizer step per shuffled batch.
        mdl.train()
        train_batches = mod_obj.batchify(params, mod_obj.preproc(params, train_data), dev, shuffle_batches=True)
        losses, nums, metrics = zip(*[bloss(params, mdl, loss_fn, Xb, yb, optimizer=opt) for Xb, yb in train_batches])
        # Combine per-batch values with the batch sizes via np_inner.
        # NOTE(review): presumably a size-weighted aggregate -- confirm against np_inner.
        loss = np_inner(losses, nums)
        # Every batch's metrics dict has the same keys, so the first one names them all.
        metric = {name: np_inner(tuple(m[name] for m in metrics), nums) for name in metrics[0]}
        history['loss'].append(loss)
        for name, val in metric.items():
            history[name].append(val)

        # Validation pass: the whole validation set as a single unshuffled batch, no gradients.
        mdl.eval()
        with torch.no_grad():
            Xe, ye = get0(*mod_obj.batchify(params, mod_obj.preproc(params, val_data), dev, override_batch_size=val_data[-1].size, shuffle_batches=False))
            loss, num, metric, pred = bloss(params, mdl, loss_fn, Xe, ye)
            pred_conf, pred_dir = pred
        history['val_loss'].append(loss)
        for name, val in metric.items():
            history['val_{}'.format(name)].append(val)

    # Summary over all epochs: mean and final value of every tracked series.
    results = {
        'mean': {name: np.mean(vals) for name, vals in history.items()},
        'last': {name: vals[-1] for name, vals in history.items()}
    }

except Exception as e:
    logging.error('Error during model fitting: {}'.format(str(e)))
    # Bare raise re-raises the active exception with its original traceback.
    raise

## Results