# Debug Runt Transform 'direod'

In [10]:
import sys
import os
from os import sep
from os.path import dirname, realpath
from pathlib import Path
import logging
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

def get_cwd(fname, subdir, crunch_dir=realpath(Path.home()) +sep +'crunch' +sep):
    """
    Build the path string of a file inside the crunch tree.

    The result is crunch_dir, subdir and fname joined by plain string concatenation, so
    crunch_dir and subdir must already end with a path separator. crunch_dir defaults to
    the 'crunch' directory in the user's home directory (evaluated once, at definition time).
    """
    parent = crunch_dir +subdir
    return parent +fname
    
def fix_path(cwd):
    """
    Make the notebook's interpreter state resemble that of a script run from crunch.

    Overwrites sys.argv[0] with cwd and appends the absolute path of the parent of the
    current working directory to sys.path, unless it is already listed there.
    """
    sys.argv[0] = cwd
    parent_dir = os.path.abspath(os.pardir)
    if parent_dir in sys.path:
        return
    sys.path.append(parent_dir)

# Pretend this notebook is crunch/recon/test.ipynb so project imports resolve.
fix_path(cwd=get_cwd(fname='test.ipynb', subdir='recon' +sep))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numba import jit, vectorize
from dask import delayed, dataframe as dd

from ipywidgets import interact, interactive, fixed
from IPython.display import display

# Show up to 100 rows and 50 columns when pandas renders a DataFrame.
pd.set_option("display.max_rows", 100)
pd.set_option('display.max_columns', 50)

from common_util import MUTATE_DIR, DT_HOURLY_FREQ, DT_CAL_DAILY_FREQ
from common_util import load_json, dump_json, remove_dups_list, list_get_dict, is_empty_df, search_df, benchmark
from data.data_api import DataAPI
from data.access_util import df_getters as dg, col_subsetters2 as cs2
from mutate.common import default_runt_dir_name, default_trfs_dir_name
from mutate.runt_util import *
from mutate.runt import fill_defaults, get_variants, get_row_mask_keychain, get_desc_pfx, make_runt_entry

NameError: name 'default_fct' is not defined

## General

### Set Logging Level and Date Range

In [None]:
# basicConfig() does nothing once the root logger already has a handler (which is
# the case after the setup cell ran), so the level is also set explicitly.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logging.getLogger().setLevel(logging.DEBUG)

# Row filter handed to test_step, which applies it to every source frame via search_df.
# NOTE(review): presumably keeps rows whose index is before 2018 -- confirm against search_df.
date_range = {
    'id': ('lt', 2018)
}

### Set Runt and Transform Directories

In [None]:
# Directory names default to the values exported by mutate.common.
runt_dir_name, trfs_dir_name = default_runt_dir_name, default_trfs_dir_name

# Both paths are built by plain string concatenation onto MUTATE_DIR.
runt_dir = MUTATE_DIR +runt_dir_name
trfs_dir = runt_dir +trfs_dir_name

### Load all transforms

In [None]:
# Shared settings stored next to the transforms directory.
trf_defaults = load_json('trf_defaults.json', dir_path=runt_dir)
graph = load_json('graph.json', dir_path=runt_dir)

logging.info('loading step settings...')

# Index every transform file in trfs_dir by the name recorded in its 'meta' section.
trfs = {}
for fname in os.listdir(trfs_dir):
    trf = load_json(fname, dir_path=trfs_dir)
    trf_key = trf['meta']['name']
    trfs[trf_key] = trf

### Define Test Step Function

In [None]:
def test_step(step_info, date_range, overwrites=None):
    """
    Generator that runs a single runt transform step in memory so its output can be inspected.

    step_info must provide the keys 'meta', 'fn', 'var', 'rm', 'src' and 'dst'.
    date_range is handed to search_df to select the rows of every source frame.
    overwrites may supply 'ser_fn', 'df_fn' and/or 'freq' to bypass the corresponding
    RUNT_*_TRANSLATOR lookup.

    Yields an (entry, runted_df) pair for every source frame and every parameter variant.
    Raises ValueError if meta['mtype_from'] is neither 'name' nor 'rec_fmt'.
    """
    # A None default avoids sharing one mutable dict between calls.
    overwrites = {} if overwrites is None else overwrites

    # 'dst' is still read so a step without it fails early, but it is not used here:
    # results are yielded to the caller.
    meta, fn, var, rm, src, dst = step_info['meta'], step_info['fn'], step_info['var'], step_info['rm'], step_info['src'], step_info['dst']

    # Loading transform, apply, and frequency settings
    ser_fn = overwrites['ser_fn'] if ('ser_fn' in overwrites) else RUNT_FN_TRANSLATOR[fn['ser_fn']]
    rtype_fn = overwrites['df_fn'] if ('df_fn' in overwrites) else RUNT_TYPE_TRANSLATOR[fn['df_fn']]
    freq = overwrites['freq'] if ('freq' in overwrites) else RUNT_FREQ_TRANSLATOR[fn['freq']]
    res_freq = RUNT_FREQ_TRANSLATOR[meta['res_freq']]

    # Making all possible parameter combinations
    variants = get_variants(var, meta['var_fmt'])

    # Loading row mask, if any
    if (rm is not None):
        rm_dg, rm_cs = list_get_dict(dg, rm), list_get_dict(cs2, rm)
        rm_paths, rm_recs, rm_dfs = DataAPI.load_from_dg(rm_dg, rm_cs)
        # One de-duplicated list of keys per key-chain position
        rm_keys = [remove_dups_list([key_chain[i] for key_chain in rm_paths]) for i in range(len(rm_paths[0]))]

    # Loading input data
    src_dg, src_cs = list_get_dict(dg, src), list_get_dict(cs2, src)
    src_paths, src_recs, src_dfs = DataAPI.load_from_dg(src_dg, src_cs)
    if (not src_paths):
        # Nothing was loaded: report it instead of failing on src_paths[0] below
        logging.warning('no source data found for: %s', src)
        return
    logging.debug('src_paths[0] ' +str(src_paths[0]))
    logging.debug('src_paths[-1] ' +str(src_paths[-1]))

    # Run transforms on inputs
    for key_chain in src_paths:
        logging.info('data: ' +str('_'.join(key_chain)))
        src_rec, src_df = list_get_dict(src_recs, key_chain), list_get_dict(src_dfs, key_chain)
        src_df = src_df.loc[search_df(src_df, date_range), :].dropna(axis=0, how='all')

        # Masking rows in src from row mask
        if (rm is not None):
            rm_key_chain = get_row_mask_keychain(key_chain, rm_keys)
            rm_df = list_get_dict(rm_dfs, rm_key_chain).dropna()
            not_in_src = rm_df.index.difference(src_df.index)
            logging.debug('row mask: ' +str('_'.join(rm_key_chain)))
            if (len(not_in_src)>0):
                logging.debug('rm_idx - src_idx: %s', not_in_src)
                # Index.intersection replaces `index & index`, which newer pandas
                # no longer treats as a set operation.
                common_idx = src_df.index.intersection(rm_df.index)
                src_df = src_df.loc[common_idx, :].dropna(axis=0, how='all')
            else:
                src_df = src_df.loc[rm_df.index, :].dropna(axis=0, how='all')

        # Lazy %-args: the frame is only rendered to text when DEBUG is enabled
        logging.debug('pre_transform: %s', src_df)

        # Running variants of the transform
        for variant in variants:
            # Build the series function once; a separate name keeps the `fn` settings dict intact
            variant_fn = ser_fn(**variant)
            runted_df = rtype_fn(src_df, variant_fn, freq)
            desc_sfx = meta['rec_fmt'].format(**variant)
            desc_pfx = get_desc_pfx(key_chain, src_rec)
            desc = '_'.join([desc_pfx, desc_sfx])

            mtype_from = meta['mtype_from']
            if (mtype_from=='name'):
                mutate_type = meta['name']
            elif (mtype_from=='rec_fmt'):
                mutate_type = desc_sfx
            else:
                # Without this, mutate_type would be unbound or stale from a previous variant
                raise ValueError('unsupported mtype_from: ' +repr(mtype_from))

            assert not is_empty_df(runted_df)
            entry = make_runt_entry(desc, res_freq, mutate_type, src_rec)
            logging.info('dumping ' +desc +'...')
            logging.debug('post_transform: %s', runted_df)
            yield entry, runted_df

## Transform Debug

### Set Transform
Fill in the name of the transform to debug (the template default is day moving average; this notebook uses `direod`).

In [None]:
# Key into `trfs` (the transform's meta name) selecting which transform to debug.
trf_name = 'direod'

### Imports

### Test Section

In [None]:
# basicConfig() is ignored once the root logger already has a handler, so the
# level is also set on the root logger directly to make the switch to INFO take effect.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().setLevel(logging.INFO)

In [None]:
# Look up the chosen transform and complete it with the shared defaults
# before handing it to test_step.
trf = trfs[trf_name]
step = fill_defaults(trf, trf_defaults)

In [None]:
# Drain the generator; the yielded results are discarded here.
for runt_rec, runt_df in test_step(step, date_range):
    pass

## Test Disk Version

In [None]:
# Key path used to select the df getter and column subsetter for the stored data.
au = ['dir', 'direod']
actual_dg = list_get_dict(dg, au)
actual_cs = list_get_dict(cs2, au)

# Load the version that is on disk and log the first and last key chains found.
actual_paths, actual_recs, actual_dfs = DataAPI.load_from_dg(actual_dg, actual_cs)
logging.debug('actual_paths[0] ' +str(actual_paths[0]))
logging.debug('actual_paths[-1] ' +str(actual_paths[-1]))

In [None]:
# Log every key chain loaded from disk together with its DataFrame.
for key_chain in actual_paths:
    actual_rec = list_get_dict(actual_recs, key_chain)
    actual_df = list_get_dict(actual_dfs, key_chain)
    logging.debug(str(key_chain))
    logging.debug(actual_df)