# Model Testing

In [1]:
import sys
import os
from os import sep
from os.path import dirname, realpath
from pathlib import Path
from functools import partial, reduce
import logging

def get_cwd(fname, subdir, crunch_dir=realpath(Path.home()) + sep + 'crunch' + sep):
    """
    Build the path string of a file that lives in a subdirectory of the crunch directory.

    The pieces are concatenated as given, so crunch_dir and subdir are expected to already
    end with a path separator. The default crunch_dir is the 'crunch' directory under the
    user's home directory; it is evaluated once, when the function is defined.
    """
    parent = crunch_dir + subdir
    return parent + fname

def fix_path(cwd):
    """
    Make a notebook session look like a script run from inside crunch.

    Overwrites sys.argv[0] with cwd, then ensures that the absolute path of the parent of
    the current working directory is on sys.path (appended only when it is missing).
    """
    sys.argv[0] = cwd
    parent_dir = os.path.abspath(os.pardir)
    if parent_dir in sys.path:
        return
    sys.path.append(parent_dir)

# Notebook file name and the crunch subdirectory holding it; edit both when reusing this cell.
fname, dir_name = 'model_test.ipynb', 'model'      # FILL
fix_path(get_cwd(subdir=dir_name + sep, fname=fname))

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch
from torch.utils.data import TensorDataset, DataLoader
from dask import delayed, compute
import matplotlib.pyplot as plt

from ipywidgets import interact, interact_manual, interactive, fixed
from IPython.display import display

pd.set_option("display.max_rows", 100)
pd.set_option('display.max_columns', 50)

from common_util import RECON_DIR, JSON_SFX_LEN, DT_CAL_DAILY_FREQ, is_type, pd_common_idx_rows, remove_dups_list, set_loglevel, chained_filter, get_variants, dump_df, load_json, gb_transpose, np_inner, pd_common_index_rows, filter_cols_below, inner_join, outer_join, ser_shift, list_get_dict, window_iter, benchmark
from common_util import midx_get_level, midx_intersect, str_to_list, pd_common_idx_rows, midx_split, pd_midx_to_arr, window_iter, np_is_ndim, get_class_name
from model.common import DATASET_DIR, HOPT_WORKER_BIN, default_model, default_backend, default_dataset, default_trials_count
from model.data_util import datagen, align_first_last_cols, prune_nulls, prepare_transpose_data, prepare_label_data, prepare_target_data
from model.model_util import BINARY_CLF_MAP
from recon.dataset_util import prep_dataset, gen_group
from recon.split_util import get_train_test_split, gen_time_series_split, index_three_split, pd_binary_clip

Using TensorFlow backend.


In [2]:
# Logging verbosity for this session (set_loglevel is a project helper imported from common_util).
set_loglevel('info')

### Set Fixed Experiment Parameters ("Command-line" Arguments)

In [3]:
# No log directory by default; the loss/optimizer cell only builds a writer when this is not None.
logdir = None
# Stand-in for the arguments a script run would receive. A value of None makes the next cell
# fall back to the corresponding default.
# NOTE(review): the trailing '=' in the keys presumably mirrors getopt-style long options - confirm.
cmd_input = {
    'model=': 'BinTCN',
    'backend=': 'pytorch',
    'dataset=': 'raw_pba_ohlca.json', # 'dnorm_raw_pba_ohlca.json'
    'assets=': 'sp_500', # 'russell_2000'
    'trials_count=': 50,
}

In [4]:
def _cmd_arg(key, fallback=None, cast=None):
    """
    Look up key in cmd_input; return fallback when the stored value is None, otherwise the
    value itself (passed through cast when one is given).
    """
    raw = cmd_input[key]
    if raw is None:
        return fallback
    return raw if (cast is None) else cast(raw)

model_code = _cmd_arg('model=', default_model)
backend_name = _cmd_arg('backend=', default_backend)
dataset_fname = _cmd_arg('dataset=', default_dataset)
assets = _cmd_arg('assets=', cast=str_to_list)
trials_count = _cmd_arg('trials_count=', default_trials_count, cast=int)

### Load Dataset

In [5]:
# Instantiate the model wrapper registered under the chosen backend and model code.
mod_obj = BINARY_CLF_MAP[backend_name][model_code]()
mod_name = get_class_name(mod_obj)
# Dataset name is the file name minus its last JSON_SFX_LEN characters
# (NOTE(review): presumably the '.json' suffix - confirm against common_util).
dataset_name = dataset_fname[:-JSON_SFX_LEN]
dataset_dict = load_json(dataset_fname, dir_path=DATASET_DIR)
# No filters are passed; assets may be None when no asset list was supplied.
dataset = prep_dataset(dataset_dict, assets=assets, filters_map=None)

In [6]:
# Summarize the experiment configuration in the log.
logging.info('model: {}'.format(mod_name))
logging.info('backend: {}'.format(backend_name))
logging.info('dataset: {} {} df(s)'.format(len(dataset['features']['dfs']), dataset_name))
# Identity test against None: '==' would dispatch to the operand's __eq__, which is not
# guaranteed to return a plain bool.
logging.info('assets: {}'.format('all' if (assets is None) else ', '.join(assets)))

INFO:root:model: BinaryTCN
INFO:root:backend: pytorch
INFO:root:dataset: 1 raw_pba_ohlca df(s)
INFO:root:assets: sp_500


### Show Data Options

In [7]:
# Enumerate every (feature, label, target) combination the dataset offers.
flts_data = []      # one entry per data option; evaluated later with .compute()
flts_choices = {}   # readable identifier -> index into flts_data
for i, (fpath, lpath, tpath, frec, lrec, trec, fcol, lcol, tcol, flt) in enumerate(datagen(
        dataset, feat_prep_fn=prepare_transpose_data, label_prep_fn=prepare_label_data,
        target_prep_fn=prepare_target_data, how='df_to_df', delayed=True)):
    # No closing parenthesis in the identifier itself: the log format below wraps it in
    # parentheses, and the identifier is also used as a lookup key.
    ident = '{fdesc}[{fcol}], {ldesc}[{lcol}], {tdesc}[{tcol}]'.format(fdesc=frec.desc, fcol=fcol, ldesc=lrec.desc, lcol=lcol, tdesc=trec.desc, tcol=tcol)
    logging.info('{data_idx} (X, y, z) -> ({data_id})'.format(data_idx=i, data_id=ident))
    flts_data.append(flt)
    flts_choices[ident] = i

INFO:root:0 (X, y, z) -> (raw[:], raw_pba_oc_retxeod_direod[:], raw_pba_oc_retxeod_reteod[:]))
INFO:root:1 (X, y, z) -> (raw[:], raw_pba_oc_retxeod(0.25%)_direod[:], raw_pba_oc_retxeod(0.25%)_reteod[:]))
INFO:root:2 (X, y, z) -> (raw[:], raw_pba_oc_retxeod(0.50%)_direod[:], raw_pba_oc_retxeod(0.50%)_reteod[:]))
INFO:root:3 (X, y, z) -> (raw[:], raw_pba_oc_retxeod(1.00%)_direod[:], raw_pba_oc_retxeod(1.00%)_reteod[:]))
INFO:root:4 (X, y, z) -> (raw[:], raw_pba_oc_retxeod(1.50%)_direod[:], raw_pba_oc_retxeod(1.50%)_reteod[:]))
INFO:root:5 (X, y, z) -> (raw[:], raw_pba_oc_retxeod(2.00%)_direod[:], raw_pba_oc_retxeod(2.00%)_reteod[:]))


### Select Feature and Label/Target

In [8]:
# Use data option 0 from the listing; .compute() evaluates the entry stored by the options loop.
feature, label, target = flts_data[0].compute()

  labels = getattr(columns, 'labels', None) or [
  labels, = index.labels
  return pd.MultiIndex(levels=new_levels, labels=labels, names=columns.names)


In [9]:
# Clip the label by side; only the positive side serves as the label below.
pos_l, neg_l = pd_binary_clip(label)
# Short aliases used by the splitting cells.
f, l, t = feature, pos_l, target

### Split Data into Ndarrays

In [10]:
# Validation and test each take a fifth of the data; training gets the remainder.
val_ratio = test_ratio = .2
train_ratio = 1-(val_ratio+test_ratio)
# Split the index of each frame (feature, label, target) with the same three ratios.
(f_train_idx, f_val_idx, f_test_idx), (l_train_idx, l_val_idx, l_test_idx), (t_train_idx, t_val_idx, t_test_idx) = (
    midx_split(frame.index, train_ratio, val_ratio, test_ratio) for frame in (f, l, t)
)

In [11]:
# Select the train/val/test rows of each frame with the indexes computed in the previous cell.
f_train_pd, f_val_pd, f_test_pd = [f.loc[idx] for idx in (f_train_idx, f_val_idx, f_test_idx)]
l_train_pd, l_val_pd, l_test_pd = [l.loc[idx] for idx in (l_train_idx, l_val_idx, l_test_idx)]
t_train_pd, t_val_pd, t_test_pd = [t.loc[idx] for idx in (t_train_idx, t_val_idx, t_test_idx)]

In [12]:
# Convert the split frames to ndarrays. Features with a MultiIndex are stacked and passed
# through pd_midx_to_arr; flat-indexed features use their values directly.
# pd.MultiIndex is the public name of the class: the pandas.core.index module path was
# deprecated in pandas 1.0 and later removed.
if (is_type(f.index, pd.MultiIndex)):
    f_train_np, f_val_np, f_test_np = map(pd_midx_to_arr, [f_train_pd.stack(), f_val_pd.stack(), f_test_pd.stack()])
else:
    f_train_np, f_val_np, f_test_np = f_train_pd.values, f_val_pd.values, f_test_pd.values
l_train_np, l_val_np, l_test_np = l_train_pd.values, l_val_pd.values, l_test_pd.values
t_train_np, t_val_np, t_test_np = t_train_pd.values, t_val_pd.values, t_test_pd.values

### Set Input Shape

In [13]:
# Model input shape: the last two axes when the features have more than two dimensions,
# otherwise a single row of the last axis length.
if (len(f_train_np.shape) > 2):
    input_shape = tuple(f_train_np.shape[-2:])
else:
    input_shape = (1, f_train_np.shape[-1])

### Hyperparameter List

In [14]:
# Display the hyperparameter space exposed by the model wrapper (one entry per tunable key).
mod_obj.get_space()

{'epochs': <hyperopt.pyll.base.Apply at 0x7f8f2d336438>,
 'batch_size': <hyperopt.pyll.base.Apply at 0x7f8f2d336588>,
 'loss': <hyperopt.pyll.base.Apply at 0x7f8eb1c05c88>,
 'opt': <hyperopt.pyll.base.Apply at 0x7f8f2d3362b0>,
 'input_windows': <hyperopt.pyll.base.Apply at 0x7f8f2d33a630>,
 'topology': <hyperopt.pyll.base.Apply at 0x7f8f2d33a208>,
 'kernel_size': <hyperopt.pyll.base.Apply at 0x7f8f2d33aa20>,
 'dropout': <hyperopt.pyll.base.Apply at 0x7f8f2d33ac50>,
 'attention': <hyperopt.pyll.base.Apply at 0x7f8f2d33add8>,
 'max_attn_len': <hyperopt.pyll.base.Apply at 0x7f8f2d33ae80>}

### Set Hyperparameters

In [15]:
# Fixed hyperparameter values for this run; the keys are the same ones listed by
# mod_obj.get_space() in the previous cell.
params = {
    'epochs': 100,
    'batch_size': 256,
    'loss': 'ce',
    'opt': {
        'name': 'Adam',
        'lr': .001
    },
    'input_windows': 5,
    'topology': [5, 3, 1],
    'kernel_size': 4,
    'dropout': .2,
    'attention': False,
    'max_attn_len': 80,
}

### Make Model

In [16]:
# Prefer the GPU when one is available, then build the model and move it to that device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = mod_obj.get_model(params, input_shape).to(device=device)
# Short aliases for the same two objects.
dev, mdl = device, model

### Make Loss Function and Optimizer

In [17]:
# Loss function (moved to the model's device) and optimizer over the model parameters.
loss_fn = mod_obj.make_loss_fn(params).to(device)
opt = mod_obj.make_optimizer(params, model.parameters())
# The writer is only built when a log directory is configured. The call goes through the
# model wrapper because there is no `self` at notebook top level.
# NOTE(review): assumes mod_obj exposes tbx(params, logdir) - confirm against the model class.
writer = mod_obj.tbx(params, logdir) if (logdir is not None) else None
# Start from clean gradients on both the model and the optimizer.
model.zero_grad()
opt.zero_grad()

logging.debug('w[-2:][-2:]: {}'.format(list(model.parameters())[-2:][-2:]))

### Set Data

In [18]:
# (features, labels) pairs consumed by the fit cells.
train_data, val_data = (f_train_np, l_train_np), (f_val_np, l_val_np)

### Metrics

In [19]:
# Per-epoch series: loss plus one series per metric, each with a 'val_' counterpart.
history = {'loss': [], 'val_loss': []}
for name in mod_obj.metrics_fns.keys():
    history.update({name: [], 'val_{}'.format(name): []})

### Define Batch Loss Compute Function

In [20]:
def bloss(params, model, loss_function, feat_batch, lab_batch, optimizer=None, metrics_fns=mod_obj.metrics_fns):
    """
    Run the model on one batch and score it.

    The loss is computed from the raw model outputs. Predictions are the indices of the
    largest output along dim 1; each function in metrics_fns is called as
    fn(labels, predictions) on CPU copies of the tensors. When an optimizer is given, one
    zero_grad/backward/step update is applied after scoring.

    Returns a (loss value, batch length, {metric name: value}) tuple. params is accepted
    but not used here.
    """
    net_out = model(feat_batch)
    loss = loss_function(net_out, lab_batch)
    _, predicted = torch.max(net_out, dim=1) # Convert network outputs into predictions
    labels_cpu, predicted_cpu = lab_batch.cpu(), predicted.cpu()

    metrics = {}
    for name, fn in metrics_fns.items():
        metrics[name] = fn(labels_cpu, predicted_cpu)

    if (optimizer is not None):
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    batch_loss = loss.item()
    logging.debug('batch loss:   {}'.format(batch_loss))
    return batch_loss, len(feat_batch), metrics

### Fit Model

In [22]:
# Train for params['epochs'] epochs. Every epoch runs one optimizing pass over the training
# data and one gradient-free pass over the validation data; per-batch losses and metrics are
# averaged with batch sizes as weights and appended to history.
try:
    for epoch in range(params['epochs']):
        epoch_str = str(epoch).zfill(3)

        # Training pass: shuffled batches, optimizer applied on every batch
        model.train()
        losses, nums, metrics = zip(*[bloss(params, model, loss_fn, Xb, yb, optimizer=opt) for Xb, yb in mod_obj.batchify(params, mod_obj.preproc(params, train_data), device, shuffle_batches=True)])
        loss = np.sum(np.multiply(losses, nums)) / np.sum(nums)
        # Turn the tuple of per-batch metric dicts into one tuple of values per metric name
        soa = {name: tuple(d[name] for d in metrics) for name in metrics[0]}
        metric = {name: np.sum(np.multiply(vals, nums)) / np.sum(nums) for name, vals in soa.items()}
        logging.debug('{} train loss: {}'.format(epoch_str, loss))
        history['loss'].append(loss)
        # Record every metric, not only accuracy, so all history series grow in step
        for name, val in metric.items():
            history[name].append(val)
        logging.debug('{} w[-2:][-2:]: {}'.format(epoch_str, list(model.parameters())[-2:][-2:]))

        # Validation pass: fixed batch order, no optimizer, no gradient tracking
        model.eval()
        with torch.no_grad():
            losses, nums, metrics = zip(*[bloss(params, model, loss_fn, Xb, yb) for Xb, yb in mod_obj.batchify(params, mod_obj.preproc(params, val_data), device, shuffle_batches=False)])
        loss = np.sum(np.multiply(losses, nums)) / np.sum(nums)
        soa = {name: tuple(d[name] for d in metrics) for name in metrics[0]}
        metric = {name: np.sum(np.multiply(vals, nums)) / np.sum(nums) for name, vals in soa.items()}
        logging.debug('{} val loss: {}'.format(epoch_str, loss))
        history['val_loss'].append(loss)
        for name, val in metric.items():
            history['val_{}'.format(name)].append(val)
        logging.debug('{} w[-2:][-2:]: {}'.format(epoch_str, list(model.parameters())[-2:][-2:]))

    # Summary restricted to loss and accuracy: mean over all recorded epochs and final epoch
    results = {
        # 'history': history
        'mean': {
            'loss': np.mean(history['loss']),
            'val_loss': np.mean(history['val_loss']),
            'acc': np.mean(history['acc']),
            'val_acc': np.mean(history['val_acc'])
        },
        'last': {
            'loss': history['loss'][-1],
            'val_loss': history['val_loss'][-1],
            'acc': history['acc'][-1],
            'val_acc': history['val_acc'][-1]
        }
    }

except Exception as e:
    logging.error('Error during model fitting: {}'.format(str(e)))
    # Bare raise re-raises the active exception with its original traceback
    raise

  'precision', 'predicted', average, warn_for)


In [23]:
# Display the mean/last summary assembled by the fit cell.
results

{'mean': {'loss': 3.0495534735315006,
  'val_loss': 1.8445394916111426,
  'acc': 0.5011155778894473,
  'val_acc': 0.5046068548387097},
 'last': {'loss': 1.4121486174201645,
  'val_loss': 1.2944406309435446,
  'acc': 0.5018425460636516,
  'val_acc': 0.4465725806451613}}

In [24]:
# Same training loop as the previous fit cell, generalized to every metric in history:
# per-batch values are combined with np_inner and the summary covers all history series.
# NOTE(review): np_inner is a project helper; it is used here where the previous fit cell
# computes a batch-size weighted mean - confirm it normalizes by the weights.
try:
    for epoch in range(params['epochs']):
        epoch_str = str(epoch).zfill(3)

        # Training pass: shuffled batches, optimizer applied on every batch
        model.train()
        losses, nums, metrics = zip(*[bloss(params, model, loss_fn, Xb, yb, optimizer=opt) for Xb, yb in mod_obj.batchify(params, mod_obj.preproc(params, train_data), device, shuffle_batches=True)])
        loss = np_inner(losses, nums)
        # Turn the tuple of per-batch metric dicts into one tuple of values per metric name
        soa = {name: tuple(d[name] for d in metrics) for name in metrics[0]}
        metric = {name: np_inner(vals, nums) for name, vals in soa.items()}
        # Logged for parity with the validation pass below
        logging.debug('{} train loss: {}'.format(epoch_str, loss))
        history['loss'].append(loss)
        for name, val in metric.items():
            history[name].append(val)

        # Validation pass: fixed batch order, no optimizer, no gradient tracking
        model.eval()
        with torch.no_grad():
            losses, nums, metrics = zip(*[bloss(params, model, loss_fn, Xb, yb) for Xb, yb in mod_obj.batchify(params, mod_obj.preproc(params, val_data), device, shuffle_batches=False)])
        loss = np_inner(losses, nums)
        soa = {name: tuple(d[name] for d in metrics) for name in metrics[0]}
        metric = {name: np_inner(vals, nums) for name, vals in soa.items()}
        logging.debug('{} val loss: {}'.format(epoch_str, loss))
        history['val_loss'].append(loss)
        for name, val in metric.items():
            history['val_{}'.format(name)].append(val)

    # Full history plus, for every series, its mean and its final value
    results = {
        'history': history,
        'mean': {name: np.mean(vals) for name, vals in history.items()},
        'last': {name: vals[-1] for name, vals in history.items()}
    }

except Exception as e:
    logging.error('Error during model fitting: {}'.format(str(e)))
    # Bare raise re-raises the active exception with its original traceback
    raise


  'precision', 'predicted', average, warn_for)


In [28]:
# Mean of each recorded history series, as computed in the fit cell.
results['mean']

{'loss': 1.9973970999752655,
 'val_loss': 1.4383501185044165,
 'acc': 0.501318257956449,
 'val_acc': 0.4990221774193549,
 'f1': 0.38219632835718587,
 'val_f1': 0.3129545060022222}

In [27]:
# Final recorded value of each history series, as computed in the fit cell.
results['last']

{'loss': 1.2067930588570472,
 'val_loss': 0.9496929222537626,
 'acc': 0.5055276381909548,
 'val_acc': 0.4465725806451613,
 'f1': 0.3592858176627701,
 'val_f1': 0.0}