In [1]:
import sys
import os
from os import sep
from os.path import dirname, realpath
from pathlib import Path
from itertools import product
import logging

def get_cwd(fname, subdir, crunch_dir=realpath(Path.home()) + sep + 'crunch' + sep):
    """
    Build the path string of a file that lives inside the crunch directory tree.

    The three pieces are glued together verbatim; no separator is inserted between
    them, so `crunch_dir` and `subdir` are expected to already end with one.

    Args:
        fname: name of the file, e.g. 'test.ipynb'.
        subdir: sub-directory below `crunch_dir`, including its trailing separator.
        crunch_dir: root directory string. The default is the real path of the
            user's home directory followed by 'crunch' and a separator; it is
            computed once, when the function is defined.

    Returns:
        The string crunch_dir + subdir + fname.
    """
    return ''.join((crunch_dir, subdir, fname))

def fix_path(cwd):
    """
    Adjust interpreter state so a notebook behaves like a script run from crunch.

    Overwrites sys.argv[0] with `cwd`, then makes sure the absolute path of the
    parent of the current working directory is on sys.path (it is appended only
    when it is not already present).

    Args:
        cwd: value to store in sys.argv[0].
    """
    sys.argv[0] = cwd
    parent_dir = os.path.abspath('..')
    if parent_dir in sys.path:
        return
    sys.path.append(parent_dir)

# Set sys.argv[0] to this notebook's path under crunch and extend sys.path.
# NOTE(review): this runs before the project imports further down, presumably so
# that they resolve -- confirm before moving it.
fix_path(get_cwd('test.ipynb', 'recon' +sep))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numba import jit, vectorize, float64

from ipywidgets import interact, interactive, fixed
from IPython.display import display

# Raise pandas' display limits (rows/columns) for the tables rendered in this notebook.
pd.set_option("display.max_rows", 100)
pd.set_option('display.max_columns', 50)

from common_util import DT_HOURLY_FREQ, DT_CAL_DAILY_FREQ, DT_BIZ_DAILY_FREQ, get_custom_biz_freq, query_df, search_df, chained_filter, benchmark
from common_util import outer_join, left_join
from data.data_api import DataAPI
from data.access_util import col_subsetters as cs
from mutate.common import dum, count_nonnan, count_nonzero, count_both
from mutate.label import *

# Code Dump Below

In [2]:
# Send log records (DEBUG and above) to stdout so they show up in the cell output.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

# Row filter handed to search_df below.
# NOTE(review): presumably keeps rows whose 'id' index is earlier than 2018 -- confirm
# against search_df's implementation.
date_range = {
    'id': ('lt', 2018)
}

# Criteria passed to DataAPI.generate to choose which records are loaded.
search_terms = {
    'stage': 'mutate',
    'mutate_type': 'thresh',
    'raw_cat': 'us_equity_index'
}
# One filtered frame per record, keyed by rec.root.
thresh_dfs = {}
for rec, thresh_df in DataAPI.generate(search_terms):
    # search_df's result is used as a .loc row selector on the same frame.
    thresh_dfs[rec.root] = thresh_df.loc[search_df(thresh_df, date_range)]
logging.info('thresh data loaded')

# NOTE(review): price_dfs is not defined in the visible cells, so this check stays disabled.
# assert(set(price_dfs.keys()) == set(thresh_dfs.keys()))

INFO:root:thresh data loaded


In [3]:
# Column-name lists for the return/threshold experiments.
# NOTE(review): presumably these name columns of the thresh frames -- confirm.
ret_cols = [
    'pba_oa_return_fth_of_xwhole',
    'pba_oa_return_fth_af_whole',
]
thresh_cols = [
    'pba_oa_return_fth_of_abs_xmin',
    'pba_oa_return_fth_af_abs_min',
]

In [4]:
# Walk every loaded frame and echo its key. thresh_df is rebound on each pass, so
# after the loop it holds the two spread columns of the last frame only.
for name, data_df in thresh_dfs.items():
    print(name)

    thresh_df = data_df.loc[
        :,
        ['pba_oc_spread_fth_af_whole', 'pba_oc_spread_fth_af_abs_avg'],
    ]

# Disabled scratch code from an earlier experiment, kept as-is:
#     ret_cols = ['pba_oa_return_fth_af_whole']
#     thresh_cols = ['pba_oa_return_fth_of_xstd', 'pba_oa_return_fth_af_abs_min']
#     combos = list(product(ret_cols, thresh_cols))

#     ret_col, thresh_col = combos[0]
#     ret_df = data_df.loc[:, [ret_col]]
#     original_thresh_df = data_df.loc[:, ['pba_oc_spread_fth_af_whole']]
#     thresh_df = original_thresh_df.copy()
#     print(ret_col, thresh_col)

#     procedure = list(filter(lambda item: item in ['af', 'of'], thresh_col.split('_')))[0]
#     print(procedure)
#     ret_thresh_df = shift_time_series_df(procedure, thresh_df, thresh_col, ret_df)

#     result_df = left_join(ret_thresh_df, original_thresh_df.rename(columns={thresh_col: 'original'}))
#     break

dow_jones
sp_500
nasdaq_100
russell_2000


In [5]:
# Display the frame left bound by the loop in the previous cell (the last key iterated).
thresh_df

Unnamed: 0_level_0,pba_oc_spread_fth_af_whole,pba_oc_spread_fth_af_abs_avg
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1998-01-02 14:00:00+00:00,-0.5000,0.690000
1998-01-02 15:00:00+00:00,-0.5000,0.690000
1998-01-02 16:00:00+00:00,-0.5000,0.690000
1998-01-02 17:00:00+00:00,-0.5000,0.690000
1998-01-02 18:00:00+00:00,-0.5000,0.690000
1998-01-02 19:00:00+00:00,-0.5000,0.690000
1998-01-02 20:00:00+00:00,-0.5000,0.690000
1998-01-02 21:00:00+00:00,-0.5000,0.690000
1998-01-05 14:00:00+00:00,0.5400,0.517500
1998-01-05 15:00:00+00:00,0.5400,0.517500


## Unrelated Test

In [24]:
def delimit_suffix(s, cut=3):
    """
    Insert an underscore in front of the last `cut` characters of `s`.

    The slice must be `s[:-cut]`; indexing with `s[:, -3]` passes a tuple to
    str.__getitem__ and raises "TypeError: string indices must be integers".

    Args:
        s: string whose trailing suffix should be split off.
        cut: length of the suffix (defaults to 3).

    Returns:
        s[:-cut] + '_' + s[-cut:], e.g. 'A_B_C123dir' -> 'A_B_C123_dir'.
    """
    return s[:-cut] + '_' + s[-cut:]

delimit_suffix('A_B_C123dir')

'A_B_C123_dir'

In [25]:
def fix_label_df_column_names(label_df):
    """
    Return a copy of `label_df` whose label columns have an underscore inserted
    before their three-character suffix (e.g. 'A_B_Cdir' -> 'A_B_C_dir').

    The columns to rename are whatever chained_filter selects from
    `label_df.columns` given an 'endswith' qualifier listing the label suffixes;
    all other columns keep their names.

    Args:
        label_df: DataFrame whose columns should be renamed.

    Returns:
        The renamed DataFrame produced by DataFrame.rename.
    """
    suffix_len = 3
    label_suffixes = ['dir', 'mag', 'brk', 'nmb', 'nmt']
    suffix_qualifier = {
        "exact": [],
        "startswith": [],
        "endswith": label_suffixes,
        "regex": [],
        "exclude": None
    }
    label_columns = chained_filter(label_df.columns, [suffix_qualifier])

    # Old name -> same name with '_' placed before the trailing suffix.
    rename_map = {}
    for column in label_columns:
        rename_map[column] = column[:-suffix_len] + '_' + column[-suffix_len:]

    return label_df.rename(columns=rename_map)

In [26]:
# Fixture: two plain columns surrounding every label suffix, once attached to
# 'A_B_C' and once to 'A_B_C123'.
d = {
    'col1': [1, 2],
    'A_B_Cdir': [3, 4],
    'A_B_Cmag': [3, 4],
    'A_B_Cbrk': [3, 4],
    'A_B_Cnmb': [3, 4],
    'A_B_Cnmt': [3, 4],
    'A_B_C123dir': [3, 4],
    'A_B_C123mag': [3, 4],
    'A_B_C123brk': [3, 4],
    'A_B_C123nmb': [3, 4],
    'A_B_C123nmt': [3, 4],
    'col2': [1, 2],
}
test_df = pd.DataFrame(data=d)

In [27]:
# Show the fixture before renaming, for comparison with the renamed result.
test_df

Unnamed: 0,A_B_C123brk,A_B_C123dir,A_B_C123mag,A_B_C123nmb,A_B_C123nmt,A_B_Cbrk,A_B_Cdir,A_B_Cmag,A_B_Cnmb,A_B_Cnmt,col1,col2
0,3,3,3,3,3,3,3,3,3,3,1,1
1,4,4,4,4,4,4,4,4,4,4,2,2


In [28]:
# Apply the renaming to the fixture; only the suffixed label columns should change.
fix_label_df_column_names(test_df)

Unnamed: 0,A_B_C123_brk,A_B_C123_dir,A_B_C123_mag,A_B_C123_nmb,A_B_C123_nmt,A_B_C_brk,A_B_C_dir,A_B_C_mag,A_B_C_nmb,A_B_C_nmt,col1,col2
0,3,3,3,3,3,3,3,3,3,3,1,1
1,4,4,4,4,4,4,4,4,4,4,2,2
