# Model Testing

In [1]:
import sys
import os
from os import sep
from os.path import dirname, realpath
from pathlib import Path
from functools import partial, reduce
import logging

def get_cwd(fname, subdir, crunch_dir=realpath(Path.home()) +sep +'crunch' +sep):
    """
    Build the path string of a file inside the crunch directory.

    The result is the plain concatenation crunch_dir + subdir + fname. No separators are
    inserted, so crunch_dir and subdir must already end with one. The default crunch_dir
    ('<home>/crunch/') is computed once, when the function is defined.

    The string is meant to be written to sys.argv[0] (see fix_path), because the Jupyter
    kernel does not set it to the notebook's own location.
    """
    return ''.join((crunch_dir, subdir, fname))

def fix_path(cwd):
    """
    Make the interpreter state of the notebook resemble that of a script run from crunch.

    Overwrites sys.argv[0] with `cwd` and appends the absolute path of the parent of the
    current working directory to sys.path, unless it is already listed there.
    """
    sys.argv[0] = cwd
    parent_dir = os.path.abspath(os.pardir)
    if parent_dir in sys.path:
        return
    sys.path.append(parent_dir)

# Location of this notebook inside crunch; passed through get_cwd/fix_path to set sys.argv[0] and sys.path.
fname = 'model_test.ipynb'      # FILL: file name of this notebook
dir_name = 'model'              # FILL: subdirectory of crunch that contains it (separator is appended below)
fix_path(get_cwd(fname, dir_name +sep))

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch
from torch.autograd import Variable
from torch.distributions import Normal, Categorical, constraints
from torch.utils.data import TensorDataset, DataLoader
import pyro
from pyro.infer import SVI, Trace_ELBO
from pyro.optim import Adam
import pyro.distributions as dist
from dask import delayed, compute
import matplotlib.pyplot as plt

from ipywidgets import interact, interact_manual, interactive, fixed
from IPython.display import display

pd.set_option("display.max_rows", 100)
pd.set_option('display.max_columns', 50)

from common_util import RECON_DIR, JSON_SFX_LEN, DT_CAL_DAILY_FREQ, is_type, pd_common_idx_rows, remove_dups_list, set_loglevel, chained_filter, get_variants, dump_df, load_json, gb_transpose, np_inner, pd_common_index_rows, filter_cols_below, inner_join, outer_join, ser_shift, list_get_dict, window_iter, benchmark
from common_util import isnt, midx_get_level, midx_intersect, str_to_list, pd_common_idx_rows, midx_split, pd_midx_to_arr, window_iter, np_is_ndim, get_class_name, get0
from model.common import DATASET_DIR, HOPT_WORKER_BIN, default_model, default_backend, default_dataset, default_trials_count
from model.data_util import datagen, align_first_last_cols, prune_nulls, prepare_transpose_data, prepare_label_data, prepare_target_data
from model.model_util import BINARY_CLF_MAP
from recon.dataset_util import prep_dataset, gen_group
from recon.split_util import get_train_test_split, gen_time_series_split, index_three_split, pd_binary_clip

Using TensorFlow backend.


In [2]:
set_loglevel('info')

### List Datasets

In [3]:
os.listdir(DATASET_DIR)

['labels_eod.json',
 'mvp_dnorm_raw.json',
 'drl.json',
 'mvp_labels_eod.json',
 'dnorm_raw_pba_ohlca.json',
 'mvp_labels_fbxeod.json',
 'mvp_dnorm_raw_pba_avgprice.json',
 'dma.json',
 'dnorm_raw.json',
 'raw_pba_ohlca.json',
 'targets_eod.json',
 'mvp_targets_eod.json',
 'row_masks.json',
 'ddiff.json',
 'dnorm_sym.json',
 'mvp_targets_fbxeod.json',
 'sym_raw.json',
 'dnorm_dmx_raw_pba_ohlca.json']

### Set Fixed Experiment Parameters ("Commandline" Arguments)

In [4]:
# When not None, the fit cell passes logdir to the tbx(...) call that builds `writer`.
logdir = None
# Stand-in for commandline arguments. A value of None makes the parsing cell fall back to the
# matching default_* value (or to None for 'assets=').
cmd_input = {
    'model=': 'BinTCN',
    'backend=': 'pytorch',
    'dataset=': 'dnorm_dmx_raw_pba_ohlca.json',
    'assets=': 'sp_500', # 'russell_2000'
    'trials_count=': 50,
}

In [5]:
# Resolve every "commandline" argument; a value of None selects the project default instead.
model_code = default_model if (cmd_input['model='] is None) else cmd_input['model=']
backend_name = default_backend if (cmd_input['backend='] is None) else cmd_input['backend=']
dataset_fname = default_dataset if (cmd_input['dataset='] is None) else cmd_input['dataset=']
# 'assets=' has no default: None is kept as None, anything else goes through str_to_list.
assets = None if (cmd_input['assets='] is None) else str_to_list(cmd_input['assets='])
trials_count = default_trials_count if (cmd_input['trials_count='] is None) else int(cmd_input['trials_count='])

### Load Dataset

In [6]:
# Instantiate the model wrapper registered under this backend and model code.
mod_obj = BINARY_CLF_MAP[backend_name][model_code]()
mod_name = get_class_name(mod_obj)
# Dataset name is the file name with its last JSON_SFX_LEN characters removed.
dataset_name = dataset_fname[:-JSON_SFX_LEN]
# The dataset file is a JSON spec read from DATASET_DIR.
dataset_dict = load_json(dataset_fname, dir_path=DATASET_DIR)

In [7]:
dataset_dict

{'features': [['norm', 'dnorm_dmx_raw_pba_ohlca']],
 'labels': 'mvp_labels_eod.json',
 'targets': 'mvp_targets_eod.json',
 'row_masks': 'row_masks.json'}

In [8]:
# Turn the dataset spec into the prepared dataset; later cells read dataset['features']['dfs'] and pass it to datagen.
dataset = prep_dataset(dataset_dict, assets=assets, filters_map=None)

In [9]:
# Log the resolved experiment configuration.
logging.info('model: {}'.format(mod_name))
logging.info('backend: {}'.format(backend_name))
logging.info('dataset: {} {} df(s)'.format(len(dataset['features']['dfs']), dataset_name))
# Identity test for None (PEP 8): `==` could be dispatched to a custom __eq__.
logging.info('assets: {}'.format('all' if (assets is None) else ', '.join(assets)))

INFO:root:model: BinaryTCN
INFO:root:backend: pytorch
INFO:root:dataset: 1 dnorm_dmx_raw_pba_ohlca df(s)
INFO:root:assets: sp_500


### Show Data Options

In [10]:
flts_data = []      # delayed entries in datagen order; .compute() on one yields (feature, label, target)
flts_choices = {}   # human-readable identifier -> index into flts_data
for i, (fpath, lpath, tpath, frec, lrec, trec, fcol, lcol, tcol, flt) in enumerate(
        datagen(dataset, feat_prep_fn=prepare_transpose_data, label_prep_fn=prepare_label_data,
                target_prep_fn=prepare_target_data, how='df_to_df', delayed=True)):
    # No trailing ')' in the identifier: the log message below supplies the enclosing parentheses,
    # so the previous format string produced an unbalanced '))'.
    ident = '{fdesc}[{fcol}], {ldesc}[{lcol}], {tdesc}[{tcol}]'.format(fdesc=frec.desc, fcol=fcol, ldesc=lrec.desc, lcol=lcol, tdesc=trec.desc, tcol=tcol)
    logging.info('{data_idx} (X, y, z) -> ({data_id})'.format(data_idx=i, data_id=ident))
    flts_data.append(flt)
    flts_choices[ident] = i

INFO:root:0 (X, y, z) -> (raw_pba_dmx[:], raw_pba_oc_retxeod_direod[:], raw_pba_oc_retxeod_reteod[:]))
INFO:root:1 (X, y, z) -> (raw_pba_dmx[:], raw_pba_oc_retxeod(0.25%)_direod[:], raw_pba_oc_retxeod(0.25%)_reteod[:]))
INFO:root:2 (X, y, z) -> (raw_pba_dmx[:], raw_pba_oc_retxeod(0.5*avg,1)_direod[:], raw_pba_oc_retxeod(0.5*avg,1)_reteod[:]))
INFO:root:3 (X, y, z) -> (raw_pba_dmx[:], raw_pba_oc_retxeod(0.5*max,1)_direod[:], raw_pba_oc_retxeod(0.5*max,1)_reteod[:]))
INFO:root:4 (X, y, z) -> (raw_pba_dmx[:], raw_pba_oc_retxeod(0.5*min,1)_direod[:], raw_pba_oc_retxeod(0.5*min,1)_reteod[:]))
INFO:root:5 (X, y, z) -> (raw_pba_dmx[:], raw_pba_oc_retxeod(0.5*std,1)_direod[:], raw_pba_oc_retxeod(0.5*std,1)_reteod[:]))
INFO:root:6 (X, y, z) -> (raw_pba_dmx[:], raw_pba_oc_retxeod(0.50%)_direod[:], raw_pba_oc_retxeod(0.50%)_reteod[:]))
INFO:root:7 (X, y, z) -> (raw_pba_dmx[:], raw_pba_oc_retxeod(1*avg,1)_direod[:], raw_pba_oc_retxeod(1*avg,1)_reteod[:]))
INFO:root:8 (X, y, z) -> (raw_pba_dmx[:], 

### Select Feature and Label/Target

In [11]:
# Materialize the delayed entry at index 1 of the options logged above.
feature, label, target = flts_data[1].compute()

  return pd.MultiIndex(levels=new_levels, labels=labels, names=columns.names)
  labels, = index.labels
  labels = getattr(columns, 'labels', None) or [


In [12]:
# Short aliases (f, l, t) used by the split cells below.
f = feature
pos_l, neg_l = pd_binary_clip(label) # Clip Label by Side
# Only the first clipped label series is used; neg_l is not referenced again in this notebook.
l = pos_l
t = target

### Set Device

In [13]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Split Data into Ndarrays

In [14]:
# Fractions of the index given to validation and test; training takes the remainder.
val_ratio = .2
test_ratio = .2
train_ratio = 1-(val_ratio+test_ratio)
# Each index is split separately with the same three ratios.
# NOTE(review): assumes f, l and t share aligned indices so the three splits line up — confirm.
f_train_idx, f_val_idx, f_test_idx = midx_split(f.index, train_ratio, val_ratio, test_ratio)
l_train_idx, l_val_idx, l_test_idx = midx_split(l.index, train_ratio, val_ratio, test_ratio)
t_train_idx, t_val_idx, t_test_idx = midx_split(t.index, train_ratio, val_ratio, test_ratio)

In [15]:
# Select the train / validation / test rows of features, labels and targets by their split indices.
f_train_pd, f_val_pd, f_test_pd = f.loc[f_train_idx], f.loc[f_val_idx], f.loc[f_test_idx]
l_train_pd, l_val_pd, l_test_pd = l.loc[l_train_idx], l.loc[l_val_idx], l.loc[l_test_idx]
t_train_pd, t_val_pd, t_test_pd = t.loc[t_train_idx], t.loc[t_val_idx], t.loc[t_test_idx]

In [16]:
# Convert the split frames to ndarrays. MultiIndex-ed features are stacked and passed through
# pd_midx_to_arr; flat-indexed features, labels and targets use .values directly.
# pd.MultiIndex is the public name of the class; the private pd.core.index path is deprecated
# and no longer exists from pandas 1.0 on.
if (is_type(f.index, pd.MultiIndex)):
    f_train_np, f_val_np, f_test_np = map(pd_midx_to_arr, [f_train_pd.stack(), f_val_pd.stack(), f_test_pd.stack()])
else:
    f_train_np, f_val_np, f_test_np = f_train_pd.values, f_val_pd.values, f_test_pd.values
l_train_np, l_val_np, l_test_np = l_train_pd.values, l_val_pd.values, l_test_pd.values
t_train_np, t_val_np, t_test_np = t_train_pd.values, t_val_pd.values, t_test_pd.values

In [17]:
# Validation targets as a float32 tensor on `dev` (no gradient tracking), with size-1 dimensions squeezed out.
val_tar = torch.tensor(t_val_np, dtype=torch.float32, device=dev, requires_grad=False).squeeze()

### Set Input Shape

In [18]:
# Features with more than two axes contribute their last two axis lengths;
# otherwise the shape is a single row of the last axis length.
if (len(f_train_np.shape) > 2):
    input_shape = tuple(f_train_np.shape[-2:])
else:
    input_shape = (1, f_train_np.shape[-1])

### Set Data

In [19]:
# (features, labels) pairs; the training cells hand these to mod_obj.preproc.
train_data = (f_train_np, l_train_np)
val_data = (f_val_np, l_val_np)

### Hyperparameter List

In [20]:
mod_obj.get_space()

{'epochs': <hyperopt.pyll.base.Apply at 0x7f2605373f28>,
 'batch_size': <hyperopt.pyll.base.Apply at 0x7f260531c0b8>,
 'loss': <hyperopt.pyll.base.Apply at 0x7f26987408d0>,
 'opt': <hyperopt.pyll.base.Apply at 0x7f260912ae48>,
 'input_windows': <hyperopt.pyll.base.Apply at 0x7f26987158d0>,
 'topology': <hyperopt.pyll.base.Apply at 0x7f2698715ba8>,
 'kernel_size': <hyperopt.pyll.base.Apply at 0x7f2698740278>,
 'dropout': <hyperopt.pyll.base.Apply at 0x7f2698740470>,
 'attention': <hyperopt.pyll.base.Apply at 0x7f2698740630>,
 'max_attn_len': <hyperopt.pyll.base.Apply at 0x7f26987407b8>}

### Set Hyperparameters

In [21]:
# Normalized value counts of the training labels, traversed with reversed().
# NOTE(review): presumably meant as inverse-frequency class weights for params['cw'] — confirm that
# the resulting order matches the class order expected by the loss.
pmf = list(reversed(l_train_pd.value_counts(normalize=True)))

In [22]:
# Fixed hyperparameters for this run (keys follow mod_obj.get_space(), plus 'cw').
params = {
    'epochs': 100,
    'batch_size': 128, #256
    'loss': 'nll',
    # Class weights. The dict used to list 'cw' twice (pmf, then None); a repeated key silently
    # keeps only the last value, so the effective setting was always None. Set to `pmf` to enable.
    'cw': None,
    'opt': {
        'name': 'Adam',
        'lr': .0001
    },
    'input_windows': 20,
    'topology': [30],
    'kernel_size': 8,
    'dropout': 0,
    'attention': False,
    'max_attn_len': 80,
}

### Set Data

In [23]:
# NOTE(review): repeats the earlier "Set Data" cell verbatim; train_data/val_data get the same values again.
train_data = (f_train_np, l_train_np)
val_data = (f_val_np, l_val_np)

### Define Batch Loss Compute Function

In [24]:
def bloss(params, model, loss_function, feat_batch, lab_batch, optimizer=None, ret_train_pred=False, metrics_fns=mod_obj.metrics_fns):
    """
    Run one batch through the model and score it; take an optimizer step when one is given.

    Returns (loss value, batch length, metrics dict). Unless an optimizer was passed with
    ret_train_pred left False, a fourth element is appended: the pair
    (exp of the per-row maximum output, per-row argmax as float).

    Metrics are evaluated on CPU copies of the labels and predictions, before any optimizer step.
    `params` is accepted but not read here.
    """
    outputs = model(feat_batch)
    batch_loss = loss_function(outputs, lab_batch)
    # Per-row maximum output and its column index; the index serves as the predicted class
    top_val, top_idx = torch.max(outputs, dim=1)
    labels_cpu, preds_cpu = lab_batch.cpu(), top_idx.cpu()
    metrics = {}
    for name, fn in metrics_fns.items():
        metrics[name] = fn(labels_cpu, preds_cpu)

    training = optimizer is not None
    if training:
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

    summary = (batch_loss.item(), len(feat_batch), metrics)
    if training and not ret_train_pred:
        return summary
    return summary + ((top_val.exp(), top_idx.float()),)

### Make Classifier

In [25]:
# Make Model, Loss Fn, and Optimizer
clf_mdl = mod_obj.get_model(params, input_shape).to(device=dev)
loss_fn = mod_obj.make_loss_fn(params, None if (isnt(params['cw'])) else torch.tensor(params['cw']).to(dev)).to(dev)
opt = mod_obj.make_optimizer(params, clf_mdl.parameters())
display(clf_mdl)

TCN_Classifier(
  (tcn): TemporalConvNet(
    (network): Sequential(
      (0): TemporalBlock(
        (conv1): Conv1d(5, 240, kernel_size=(8,), stride=(1,), padding=(7,))
        (chomp1): Chomp1d()
        (relu1): ReLU()
        (dropout1): Dropout(p=0)
        (conv2): Conv1d(240, 240, kernel_size=(8,), stride=(1,), padding=(7,))
        (chomp2): Chomp1d()
        (relu2): ReLU()
        (dropout2): Dropout(p=0)
        (net): Sequential(
          (0): Conv1d(5, 240, kernel_size=(8,), stride=(1,), padding=(7,))
          (1): Chomp1d()
          (2): ReLU()
          (3): Dropout(p=0)
          (4): Conv1d(240, 240, kernel_size=(8,), stride=(1,), padding=(7,))
          (5): Chomp1d()
          (6): ReLU()
          (7): Dropout(p=0)
        )
        (downsample): Conv1d(5, 240, kernel_size=(1,), stride=(1,))
        (relu): ReLU()
      )
    )
  )
  (out): Linear(in_features=240, out_features=2, bias=True)
  (logprob): LogSoftmax()
)

### Make Pyro Model

In [38]:
# Empty Pyro's global parameter store.
pyro.get_param_store().clear()
# Index stride used by pyro_model/pyro_guide to reach the two conv layers of a block
# (j*tcn_cnn_offset for j in 0, 1 -> positions 0 and 4).
tcn_cnn_offset = 4

In [39]:
def pyro_model(data, mdl, dev=None):
    """
    Generative Model - model for how observations are generated

    Places a standard normal prior on every parameter of the two convolutional layers of each
    TCN block and of the output layer, samples a classifier from those priors and scores the
    observed labels under a Categorical over the classifier's outputs.

    Args:
        data: (features, labels) pair for one batch.
        mdl: classifier exposing `tcn.network` (iterable of blocks whose `net` container holds
            the conv layers at positions 0 and tcn_cnn_offset) and an `out` layer.
        dev: unused; kept (now optional) so existing three-argument calls keep working.

    Returns:
        The value of the observed "obs" sample site (the labels).
    """
    feats, labels = data

    # random_module matches priors against the names reported by named_parameters(), so each
    # parameter's registered name is looked up by identity rather than made up.
    registered = {id(p): name for name, p in mdl.named_parameters()}

    def std_normal(p):
        # pyro.distributions (not torch.distributions) objects are required at Pyro sample sites;
        # to_event marks every dimension of the parameter as part of a single event.
        return dist.Normal(loc=torch.zeros_like(p), scale=torch.ones_like(p)).to_event(p.dim())

    priors = {}
    # Go through all convolutional layers and create distribution priors
    for block in mdl.tcn.network:
        for j in range(2):
            # The printed model shows the convs at positions 0 and 4 of `block.net`
            conv = block.net[j*tcn_cnn_offset]
            for p in conv.parameters():
                priors[registered[id(p)]] = std_normal(p)

    # Distribution priors of output layer
    for p in mdl.out.parameters():
        priors[registered[id(p)]] = std_normal(p)

    # lift module parameters to random variables sampled from the priors
    lifted_module = pyro.random_module("module", mdl, priors)

    # sample a classifier (which also samples w and b)
    lifted_clf = lifted_module()

    # One conditionally independent observation per label in the batch; the sampled classifier
    # is run forward on the features and its outputs are used as logits.
    with pyro.plate("data_loop", len(labels)):
        return pyro.sample("obs", dist.Categorical(logits=lifted_clf(feats)), obs=labels)

In [40]:
def pyro_guide(data, mdl, dev=None):
    """
    Variational Distribution - Approximation of the Posterior Distribution

    For every parameter of the two convolutional layers of each TCN block and of the output
    layer, registers a learnable mean and (softplus-transformed) scale in the Pyro parameter
    store and uses Normal(mean, scale) as that parameter's variational distribution.

    Args:
        data: batch passed by SVI; not read here.
        mdl: classifier exposing `tcn.network` (iterable of blocks whose `net` container holds
            the conv layers at positions 0 and tcn_cnn_offset) and an `out` layer.
        dev: unused; kept (now optional) so existing three-argument calls keep working.

    Returns:
        A classifier sampled from the variational distributions.
    """
    # random_module matches distributions against the names reported by named_parameters(),
    # so each parameter's registered name is looked up by identity.
    registered = {id(p): name for name, p in mdl.named_parameters()}
    posteriors = {}

    def add_site(tag, p):
        # Mean is unconstrained; softplus keeps the scale strictly positive.
        mu = pyro.param('{}_mu'.format(tag), torch.randn_like(p))
        sigma = torch.nn.functional.softplus(pyro.param('{}_sigma'.format(tag), torch.randn_like(p)))
        # pyro.distributions (not torch.distributions) objects are required at Pyro sample sites.
        posteriors[registered[id(p)]] = dist.Normal(loc=mu, scale=sigma).to_event(p.dim())

    for i, block in enumerate(mdl.tcn.network):
        for j in range(2):
            # The printed model shows the convs at positions 0 and 4 of `block.net`
            for pname, p in block.net[j*tcn_cnn_offset].named_parameters():
                add_site('blk{}-{}_{}'.format(i, j, pname), p)

    # Output layer (the old code read these from an undefined `net`)
    for pname, p in mdl.out.named_parameters():
        add_site('out_{}'.format(pname), p)

    lifted_module = pyro.random_module("module", mdl, posteriors)

    return lifted_module()

In [41]:
# Stochastic variational inference over pyro_model / pyro_guide with the Trace_ELBO loss.
# Adam here is pyro.optim.Adam, configured through a dict of optimizer arguments.
pyro_optim = Adam({"lr": 0.001})
svi = SVI(pyro_model, pyro_guide, pyro_optim, loss=Trace_ELBO())
# Per-epoch loss history; the SVI loop in this notebook only appends to 'loss'.
svi_history = {
    'loss': [],
    'val_loss': []
}

In [42]:
for epoch in range(params['epochs']):
    # One pass over the shuffled training batches, pairing each svi.step result with its batch length.
    # svi.step forwards its arguments to pyro_model/pyro_guide, whose signature is (data, mdl, dev),
    # so `dev` is passed along with the batch and the classifier.
    losses, nums = zip(*[(svi.step((Xb, yb), clf_mdl, dev), len(Xb)) for Xb, yb in mod_obj.batchify(params, mod_obj.preproc(params, train_data), dev, shuffle_batches=True)])
    loss = np_inner(losses, nums)
    svi_history['loss'].append(loss)

TypeError: 'str' object is not callable

In [None]:
# NOTE(review): scratch cell that was never executed (In [None]); it looks like a leftover
# regression example. X_train, data and get_batch_indices are not defined anywhere in this
# notebook, and svi.step is called without the classifier that pyro_model/pyro_guide expect,
# so the cell cannot run as written.
pyro_optim = Adam({"lr": 0.001})
# SVI needs an ELBO object (or a callable) as its loss; the string "ELBO" is not callable and
# fails with "TypeError: 'str' object is not callable" on the first step.
svi = SVI(pyro_model, pyro_guide, pyro_optim, loss=Trace_ELBO())

N = len(X_train)

for j in range(3000):
    epoch_loss = 0.0
    perm = torch.randperm(N)
    # shuffle data
    data = data[perm]
    # get indices of each batch
    all_batches = get_batch_indices(N, 64)
    for ix, batch_start in enumerate(all_batches[:-1]):
        batch_end = all_batches[ix + 1]
        batch_data = data[batch_start: batch_end]        
        epoch_loss += svi.step(batch_data)
    if j % 100 == 0:
        print(j, "avg loss {}".format(epoch_loss/float(N)))

In [32]:
# `self` does not exist at notebook level; mod_obj is the notebook's model-wrapper instance.
# NOTE(review): assumes mod_obj exposes tbx(params, logdir) — confirm. Not reached while logdir is None.
writer = mod_obj.tbx(params, logdir) if (logdir is not None) else None
# `mdl` is never defined in this notebook; the network built in the "Make Classifier" cell is
# clf_mdl, and `opt` was created from clf_mdl.parameters().
clf_mdl.zero_grad()
opt.zero_grad()

# Metrics
history = {
    'loss': [],
    'val_loss': []
}
for name in mod_obj.metrics_fns.keys():
    history[name] = []
    history['val_{}'.format(name)] = []

# Fit Model
try:
    for epoch in range(params['epochs']):
        # Training pass: one optimizer step per shuffled batch
        clf_mdl.train()
        losses, nums, metrics = zip(*[bloss(params, clf_mdl, loss_fn, Xb, yb, optimizer=opt) for Xb, yb in mod_obj.batchify(params, mod_obj.preproc(params, train_data), dev, shuffle_batches=True)])
        loss = np_inner(losses, nums)
        # Regroup the per-batch metric dicts into one tuple of values per metric name
        soa = {name: tuple(d[name] for d in metrics) for name in metrics[0]}
        metric = {name: np_inner(vals, nums) for name, vals in soa.items()}
        history['loss'].append(loss)
        for name, val in metric.items():
            history[name].append(val)

        # Validation pass: the whole validation set as a single batch, without gradients
        clf_mdl.eval()
        with torch.no_grad():
            Xe, ye = get0(*mod_obj.batchify(params, mod_obj.preproc(params, val_data), dev, override_batch_size=val_data[-1].size, shuffle_batches=False))
            loss, num, metric, pred = bloss(params, clf_mdl, loss_fn, Xe, ye)
            # Kept from the last epoch for the result cells below
            pred_conf, pred_dir = pred
        history['val_loss'].append(loss)
        for name, val in metric.items():
            history['val_{}'.format(name)].append(val)

    results = {
#         'history': history,
        'mean': {name: np.mean(vals) for name, vals in history.items()},
        'last': {name: vals[-1] for name, vals in history.items()}
    }

except Exception as e:
    logging.error('Error during model fitting: {}'.format(str(e)))
    # Bare raise re-raises the active exception with its original traceback
    raise

### Results

In [33]:
# Normalized label frequencies of the training and validation labels, sorted ascending, as arrays;
# displayed in the next cell.
train_range = l_train_pd.value_counts(normalize=True, sort=True, ascending=True).values
val_range = l_val_pd.value_counts(normalize=True, sort=True, ascending=True).values

In [34]:
display('      #0         #1')
display('train {}'.format(train_range))
display('val {}'.format(val_range))

'      #0         #1'

'train [0.46604215 0.53395785]'

'val [0.47188755 0.52811245]'

In [35]:
results['last']

{'loss': 0.0029391172548962965,
 'val_loss': 2.336294651031494,
 'acc': 1.0,
 'val_acc': 0.5015353121801432}

In [36]:
# Keep only the trailing validation targets so that vt has as many entries as pred_dir.
# NOTE(review): presumably the leading rows are dropped by the input windowing in preproc — confirm.
vt = val_tar[val_tar.size()[0]-pred_dir.size()[0]:]

In [37]:
pred_dir @ vt

tensor(0.3285, device='cuda:0')

In [38]:
(pred_conf * pred_dir) @ vt

tensor(0.3652, device='cuda:0')

In [39]:
vt.sum()

tensor(0.5334, device='cuda:0')