# Experiment Group Dataload Test

In [4]:
import sys
import os
from os import sep
from os.path import dirname, realpath
from pathlib import Path
from functools import partial, reduce
import logging

def get_cwd(fname, subdir, crunch_dir=realpath(Path.home()) + sep + 'crunch' + sep):
    """
    Build the path string of a file that lives under the crunch directory.

    The three pieces are concatenated as-is (crunch_dir, then subdir, then fname);
    no separator is inserted, so crunch_dir and subdir are expected to already end
    with one.

    Background: Jupyter Notebook in Anaconda invokes the Python interpreter in
    Anaconda's subdirectory, which is why changing sys.argv[0] is necessary. In the
    future a better way to do this should be preferred.
    """
    target_dir = crunch_dir + subdir
    return target_dir + fname

def fix_path(cwd):
    """
    Make a jupyter notebook behave like any other script in crunch.

    Two pieces of interpreter state are adjusted:
    - sys.argv[0] is overwritten with cwd.
    - The absolute path of the parent of the current working directory is
      appended to sys.path, unless it is already listed there.
    """
    sys.argv[0] = cwd
    parent_dir = os.path.abspath('..')
    if parent_dir in sys.path:
        # Already importable; avoid adding a duplicate entry.
        return
    sys.path.append(parent_dir)

# Identify this notebook: get_cwd joins the crunch directory, the subdirectory and the file name.
fname = 'dataload_test.ipynb'   # FILL: file name of this notebook
dir_name = 'model'              # FILL: subdirectory of crunch that holds this notebook
# Point sys.argv[0] at the notebook and append the parent directory to sys.path.
# NOTE(review): presumably this has to run before the project imports below so they resolve - confirm.
fix_path(get_cwd(fname, dir_name +sep))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from dask import delayed, compute
from torch.utils.data import TensorDataset, DataLoader
import torch

from ipywidgets import interact, interactive, fixed
from IPython.display import display

# Display limits for rendered frames: at most 100 rows and 50 columns.
pd.set_option(
    "display.max_rows", 100,
    "display.max_columns", 50,
)

from common_util import RECON_DIR, JSON_SFX_LEN, DT_CAL_DAILY_FREQ, is_type, pd_common_idx_rows, remove_dups_list, set_loglevel, chained_filter, get_variants, dump_df, load_json, gb_transpose, pd_common_index_rows, filter_cols_below, inner_join, outer_join, ser_shift, list_get_dict, window_iter, benchmark
from common_util import midx_get_level, midx_intersect, pd_common_idx_rows, midx_split, pd_midx_to_arr, window_iter, np_is_ndim
from model.common import XG_DIR, EXPECTED_NUM_HOURS, default_dataset
from model.data_util import xgdg, align_first_last_cols, prune_nulls
from data.data_api import DataAPI
from data.access_util import df_getters as dg, col_subsetters2 as cs2
from recon.dataset_util import prep_dataset, gen_group
from recon.split_util import get_train_test_split, gen_time_series_split, index_three_split, pd_binary_clip

Using TensorFlow backend.


Use this notebook to test whether a given experiment group's data generator works.

### Settings

In [5]:
# Set the log level to 'info' through the project helper; the test loop reports progress via logging.info.
set_loglevel('info')

In [6]:
# Experiment groups to test: the first four entries of XG_DIR, taken in sorted name order.
test_xgs = sorted(os.listdir(XG_DIR))[:4]

In [7]:
# Comma-separated asset names; surrounding whitespace on each name is stripped.
assets_str = 'sp_500'
assets = [asset_name.strip() for asset_name in assets_str.split(',')]

In [8]:
# When True, the test loop stops after the first item yielded for each experiment group.
first_only = True

### Test Loop

In [7]:
# Train/validation/test proportions are identical for every experiment group,
# so they are defined once instead of on every iteration.
val_ratio = .2
test_ratio = .2
train_ratio = 1-(val_ratio+test_ratio)

# For every experiment group file: load the (feature, label, target) frames, split each
# three ways, convert the splits to numpy arrays and display the training shapes.
for xg_fname in test_xgs:
    logging.info('xg: {}'.format(xg_fname))

    # Each item yielded by xgdg is unpacked as (paths, recs, dfs); only recs and dfs are used here.
    for paths, recs, dfs in xgdg(xg_fname, delayed=True, assets=assets, filters_map=None):
        frec, lrec, trec = recs
        logging.info('(X, y, z) -> ({fdesc}[:], {ldesc}[:], {tdesc}[:])'.format(fdesc=frec.desc, ldesc=lrec.desc, tdesc=trec.desc))
        # The frames are only materialized here (delayed=True above), so this is the timed step.
        with benchmark('time to load data'):
            f, l, t = dfs.compute()

        # NOTE(review): midx_split presumably returns (train, val, test) index objects - confirm.
        f_train_idx, f_val_idx, f_test_idx = midx_split(f.index, train_ratio, val_ratio, test_ratio)
        l_train_idx, l_val_idx, l_test_idx = midx_split(l.index, train_ratio, val_ratio, test_ratio)
        t_train_idx, t_val_idx, t_test_idx = midx_split(t.index, train_ratio, val_ratio, test_ratio)

        f_train_pd, f_val_pd, f_test_pd = f.loc[f_train_idx], f.loc[f_val_idx], f.loc[f_test_idx]
        l_train_pd, l_val_pd, l_test_pd = l.loc[l_train_idx], l.loc[l_val_idx], l.loc[l_test_idx]
        t_train_pd, t_val_pd, t_test_pd = t.loc[t_train_idx], t.loc[t_val_idx], t.loc[t_test_idx]

        # Use the public pd.MultiIndex: the internal pandas.core.index module was deprecated
        # in pandas 1.0 and removed in a later release.
        if (is_type(f.index, pd.MultiIndex)):
            # Multi-indexed features are stacked and converted with the project helper.
            f_train_np, f_val_np, f_test_np = map(pd_midx_to_arr, [f_train_pd.stack(), f_val_pd.stack(), f_test_pd.stack()])
        else:
            f_train_np, f_val_np, f_test_np = f_train_pd.values, f_val_pd.values, f_test_pd.values
        l_train_np, l_val_np, l_test_np = l_train_pd.values, l_val_pd.values, l_test_pd.values
        t_train_np, t_val_np, t_test_np = t_train_pd.values, t_val_pd.values, t_test_pd.values

        display('f.shape: {}, l.shape:{}, t.shape:{}'.format(f_train_np.shape, l_train_np.shape, t_train_np.shape))
        display('f:', f_train_np)

        # Optionally check only the first item of each experiment group.
        if (first_only):
            break

INFO:root:xg: test_xg0_single_channel_daily.json
INFO:root:(X, y, z) -> (raw_pba_oc_retxeod_reteod[:], raw_pba_oc_retxeod_direod[:], raw_pba_oc_retxeod_reteod[:])


'f.shape: (3007, 1), l.shape:(3007,), t.shape:(3007,)'

'f:'

array([[ 0.00470925],
       [ 0.00208197],
       [-0.01073618],
       ...,
       [ 0.0039105 ],
       [ 0.00089008],
       [-0.00663572]])

INFO:root:xg: test_xg1_multi_channel_daily.json
INFO:root:(X, y, z) -> (raw_trmi_v2_drl[:], raw_pba_oc_retxeod_direod[:], raw_pba_oc_retxeod_reteod[:])


'f.shape: (3007, 31), l.shape:(3007,), t.shape:(3007,)'

'f:'

array([[-9.92680e-02, -8.99600e-03,  7.75500e-03, ...,  7.76000e-04,
         2.32700e-03,  1.47350e-02],
       [-9.74290e-02, -2.22300e-03,  4.44600e-03, ...,  9.67000e-04,
        -9.70000e-05,  1.15990e-02],
       [-1.10931e-01, -2.24380e-02,  7.02900e-03, ...,  3.60000e-04,
         9.91000e-04,  1.38780e-02],
       ...,
       [-5.45210e-02, -7.15100e-03,  7.10500e-03, ...,  7.35000e-04,
         5.00000e-06,  4.81000e-03],
       [-3.58700e-03,  1.24880e-02,  5.20500e-03, ...,  4.13000e-04,
         1.09000e-03,  2.57200e-03],
       [-1.59867e-01, -1.26830e-02,  7.02500e-03, ...,  9.56000e-04,
         1.80000e-05,  3.85800e-03]])

INFO:root:xg: test_xg2_single_channel_intraday.json
INFO:root:(X, y, z) -> (raw_pba_dmx[:], raw_pba_oc_retxeod_direod[:], raw_pba_oc_retxeod_reteod[:])


ValueError: Inferred frequency None from passed values does not conform to passed frequency D