# decomp_test

In [1]:
import sys
import os
from os import sep
from os.path import dirname, realpath
from pathlib import Path
from functools import reduce
import logging
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

def get_cwd(fname, subdir, crunch_dir=realpath(Path.home()) +sep +'crunch' +sep):
    """
    Build the path string for a file that lives inside the crunch project.

    The three pieces are glued together verbatim as crunch_dir + subdir + fname;
    no separators are inserted, so crunch_dir and subdir must already end with one.
    The default crunch_dir is '<home>/crunch/' and is computed once, when this
    function is defined.

    The result is meant to be passed to fix_path, which stores it in sys.argv[0]:
    Jupyter Notebook in Anaconda invokes the Python interpreter from Anaconda's own
    subdirectory, so the notebook's real location has to be supplied by hand.
    A better way to do this should be preferred in the future.
    """
    pieces = (crunch_dir, subdir, fname)
    return ''.join(pieces)
    
def fix_path(cwd):
    """
    Make a notebook session look like a regular crunch script to the rest of the code.

    Two process-wide changes are made:
      * sys.argv[0] is overwritten with cwd (the script location to report).
      * The absolute path of the parent of the current working directory is
        appended to sys.path, unless it is already listed there.

    Note that the sys.path entry is derived from the process working directory,
    not from the cwd argument.
    """
    sys.argv[0] = cwd
    parent_dir = os.path.abspath('..')
    if parent_dir in sys.path:
        return
    sys.path.append(parent_dir)

# Report this notebook's location as <crunch_dir>/recon/decomp_test.ipynb via sys.argv[0]
# and put the parent of the working directory on sys.path before the project imports below.
fix_path(get_cwd('decomp_test.ipynb', 'recon' +sep))

import numpy as np
import pandas as pd
from dask import delayed
from sklearn.decomposition import FactorAnalysis, PCA, KernelPCA

from ipywidgets import interact, interactive, fixed
from IPython.display import display

# Notebook-wide pandas display limits: show up to 100 rows and 50 columns per frame.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 50

from common_util import MUTATE_DIR, DT_HOURLY_FREQ, DT_CAL_DAILY_FREQ, is_valid, is_type, null_fn, identity_fn, pd_abs, pd_is_empty, ser_range_center_clip, pd_common_idx_rows, midx_split
from common_util import load_json, dump_json, ser_shift, df_sk_mw_transform, arr_nonzero, df_rows_in_year, get_variants, remove_dups_list, list_get_dict, is_empty_df, search_df, benchmark
from data.data_api import DataAPI
from recon.common import DATASET_DIR

DEBUG:matplotlib:$HOME=/home/kev
DEBUG:matplotlib:CONFIGDIR=/home/kev/.config/matplotlib
DEBUG:matplotlib:matplotlib data path: /home/kev/miniconda3/lib/python3.7/site-packages/matplotlib/mpl-data
DEBUG:matplotlib:loaded rc file /home/kev/miniconda3/lib/python3.7/site-packages/matplotlib/mpl-data/matplotlibrc
DEBUG:matplotlib:matplotlib version 3.1.0
DEBUG:matplotlib:interactive is False
DEBUG:matplotlib:platform is linux


CRITICAL:root:script location: /home/kev/crunch/recon/decomp_test.ipynb
CRITICAL:root:using project dir: /home/kev/crunch/


## Init DataAPI

In [2]:
# Initialise the project's DataAPI; __init__ is called on the class itself, no instance is created.
DataAPI.__init__()
# NOTE(review): logging.basicConfig does nothing when the root logger already has handlers
# (it was configured in the first cell) — presumably repeated in case DataAPI setup touched
# the logging configuration; confirm whether this line is still needed.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

## Load data

In [3]:
# Load the 'hduni' and 'hdgau' data sets. Each axe_load call returns a pair; the second
# element (*_dfs) is used further down as a mapping of frames (it is indexed via .keys()).
# NOTE(review): meaning of the two list entries passed to axe_load and of *_rcs is not visible here — confirm in DataAPI.
hduni_rcs, hduni_dfs = DataAPI.axe_load(['hduni', 'hduni'])
hdgau_rcs, hdgau_dfs = DataAPI.axe_load(['hdgau', 'hdgau'])

  labels = getattr(columns, 'labels', None) or [
  return pd.MultiIndex(levels=new_levels, labels=labels, names=columns.names)
  labels, = index.labels


In [13]:
# Spot-check one loaded frame: take the key at position 20 of hdgau_dfs (an arbitrary pick)
# and display its frame after dropna().
hdgau_dfs[list(hdgau_dfs.keys())[20]].dropna()

Unnamed: 0_level_0,pba_oc
id,Unnamed: 1_level_1
2009-01-02 14:00:00+00:00,1
2009-01-02 15:00:00+00:00,2
2009-01-02 16:00:00+00:00,1
2009-01-02 17:00:00+00:00,1
2009-01-02 18:00:00+00:00,2
2009-01-02 19:00:00+00:00,2
2009-01-02 20:00:00+00:00,1
2009-01-02 21:00:00+00:00,1
2009-01-05 14:00:00+00:00,1
2009-01-05 15:00:00+00:00,2


### Features

In [3]:
# Feature data sets. dc_dfs is the one selected as the feature source in "Select Data";
# hdxret_dfs and dohlca_dfs are loaded but not referenced by the later cells visible here.
dc_rcs, dc_dfs = DataAPI.axe_load(['dc', 'dc'])
hdxret_rcs, hdxret_dfs = DataAPI.axe_load(['hdxret', 'hdxret'])
dohlca_rcs, dohlca_dfs = DataAPI.axe_load(['dohlca', 'dohlca'])

  labels = getattr(columns, 'labels', None) or [
  return pd.MultiIndex(levels=new_levels, labels=labels, names=columns.names)
  labels, = index.labels


### Returns

In [4]:
# Return data sets. dret_dfs and dret1_dfs become target_dfs / target1_dfs in "Select Data";
# dret2_dfs is loaded but not referenced by the later cells visible here.
dret_rcs,  dret_dfs = DataAPI.axe_load(['dret', 'dret'])
dret1_rcs, dret1_dfs = DataAPI.axe_load(['dret1', 'dret1_log'])
dret2_rcs, dret2_dfs = DataAPI.axe_load(['dret2', 'dret2'])

In [5]:
# 'dxfbret1' / 'dxfbret2' return data sets; not referenced by the later cells visible here.
dxfbret1_rcs, dxfbret1_dfs = DataAPI.axe_load(['dxfbret1', 'dxfbret1_log'])
dxfbret2_rcs, dxfbret2_dfs = DataAPI.axe_load(['dxfbret2', 'dxfbret2'])

### Labels

In [6]:
# Label data sets. ddir_dfs and ddir1_dfs become label_dfs / label1_dfs in "Select Data";
# ddir2_dfs is loaded but not referenced by the later cells visible here.
ddir_rcs,  ddir_dfs = DataAPI.axe_load(['ddir', 'ddir'])
ddir1_rcs, ddir1_dfs = DataAPI.axe_load(['ddir1', 'ddir1_log'])
ddir2_rcs, ddir2_dfs = DataAPI.axe_load(['ddir2', 'ddir2'])

In [7]:
# 'dxfbdir1' / 'dxfbdir2' label data sets; not referenced by the later cells visible here.
dxfbdir1_rcs, dxfbdir1_dfs = DataAPI.axe_load(['dxfbdir1', 'dxfbdir1_log'])
dxfbdir2_rcs, dxfbdir2_dfs = DataAPI.axe_load(['dxfbdir2', 'dxfbdir2'])

### Scores

In [8]:
# 'dxfbval1' / 'dxfbval2' score data sets; not referenced by the later cells visible here.
dxfbval1_rcs, dxfbval1_dfs = DataAPI.axe_load(['dxfbval1', 'dxfbval1_log'])
dxfbval2_rcs, dxfbval2_dfs = DataAPI.axe_load(['dxfbval2', 'dxfbval2'])

### Select Data

In [145]:
# What to select: the asset, and the return series identified by source and type.
asset_name = 'sp_500'
ret_src = 'vol'
ret_type = 'oc'
ret_htype = 'h{}'.format(ret_type)
ret_column = '_'.join([ret_src, ret_type])
ret_axeroot = '_'.join([ret_src, ret_htype, 'hdxret'])

# Features: every key whose first field is the asset.
feat_dfs = dc_dfs
feat_keys = [key for key in feat_dfs.keys() if key[0] == asset_name]

# Labels / targets: keys for the asset whose fourth field starts with ret_axeroot.
label_dfs = ddir_dfs
target_dfs = dret_dfs
label_keys = [
    key for key in label_dfs.keys()
    if key[0] == asset_name and key[3].startswith(ret_axeroot)
]
target_keys = [
    key for key in target_dfs.keys()
    if key[0] == asset_name and key[3].startswith(ret_axeroot)
]

# Same selection applied to the 'ddir1' / 'dret1' data sets.
label1_dfs = ddir1_dfs
target1_dfs = dret1_dfs
label1_keys = [
    key for key in label1_dfs.keys()
    if key[0] == asset_name and key[3].startswith(ret_axeroot)
]
target1_keys = [
    key for key in target1_dfs.keys()
    if key[0] == asset_name and key[3].startswith(ret_axeroot)
]

In [146]:
# One column per selected key: take ret_column from each frame and name it after the key's last field.
label_df = pd.concat(
    [label_dfs[key].loc[:, ret_column].rename(key[-1]) for key in label_keys],
    axis=1,
)
target_df = pd.concat(
    [target_dfs[key].loc[:, ret_column].rename(key[-1]) for key in target_keys],
    axis=1,
)

In [147]:
# Same construction as label_df / target_df, but for the 'ddir1' / 'dret1' selections.
# Fix: this cell previously read from label_dfs/label_keys and target_dfs/target_keys, so
# label1_df and target1_df were exact copies of label_df and target_df.
# NOTE(review): if label1_keys / target1_keys turn out to be empty, pd.concat will now raise
# instead of silently reusing the other data set — check the key filter in that case.
label1_df = pd.concat(
    [label1_dfs[lk].loc[:, ret_column].rename(lk[-1]) for lk in label1_keys],
    axis=1,
)
target1_df = pd.concat(
    [target1_dfs[tk].loc[:, ret_column].rename(tk[-1]) for tk in target1_keys],
    axis=1,
)

### Join Data

In [148]:
# Place all feature frames selected for the asset side by side (column-wise).
feat_df = pd.concat(
    [feat_dfs[key] for key in feat_keys],
    axis=1,
)

In [149]:
# Validation and test shares; the training share passed below is whatever remains (1 - their sum).
val_ratio, tst_ratio = .2, .2
# Align NaN-free features with NaN-free, shifted targets.
# NOTE(review): presumably pd_common_idx_rows keeps only index rows shared by both inputs, and
# ser_shift offsets the targets relative to the features — confirm the shift direction to rule out look-ahead.
ddf, rdf = pd_common_idx_rows(feat_df.dropna(), ser_shift(target_df.dropna()))
# Split the aligned index into train / validation / test parts.
# NOTE(review): `tst` is not used by any later cell visible here.
trn, val, tst = midx_split(ddf.index, 1-(val_ratio+tst_ratio), val_ratio, tst_ratio)

In [150]:
# Slice features (ddf) and targets (rdf) by the train and validation index parts.
trn_data, trn_targ = ddf.loc[trn, :], rdf.loc[trn, :]
val_data, val_targ = ddf.loc[val, :], rdf.loc[val, :]

## Transformers

In [194]:
# Decomposition settings used by the cells below.
num_comp, win_size = 10, 10   # n_components for KernelPCA / win_size for df_sk_mw_transform
kernel = 'rbf'                # other options tried: 'linear', 'cosine', 'sigmoid'
assert win_size >= num_comp, "window size must be greater than or equal to number of components"

In [195]:
# Kernel PCA transformer configured from num_comp and kernel; it is not fitted here.
trf_kpca = KernelPCA(n_components=num_comp, kernel=kernel)

### Expanding PCA

In [196]:
#for i in range(num_comp+1, len(ddf)):
#    display(trf_kpca.fit_transform(ddf.iloc[0:i, :]))

### Moving Window PCA

In [197]:
# Apply the kernel PCA transformer over moving windows of win_size to the train and validation
# features, requesting num_comp output columns.
# NOTE(review): the same trf_kpca object is passed for both sets; whether df_sk_mw_transform
# refits it per window (and so leaves it in a different fitted state) is not visible here — confirm.
trn_data_pca = df_sk_mw_transform(trn_data, trf_kpca, num_cols=num_comp, win_size=win_size)
val_data_pca = df_sk_mw_transform(val_data, trf_kpca, num_cols=num_comp, win_size=win_size)

In [198]:
# Re-align transformed features with their targets after dropping NaN rows from each.
# NOTE(review): presumably the moving-window transform leaves NaNs in its first rows, which is why
# a second alignment is needed — confirm against df_sk_mw_transform.
trn_data_fin, trn_targ_fin = pd_common_idx_rows(trn_data_pca.dropna(), trn_targ.dropna())
val_data_fin, val_targ_fin = pd_common_idx_rows(val_data_pca.dropna(), val_targ.dropna())

### Regression Models (Ridge, ElasticNet, Random Forest)

In [199]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor

In [200]:
# Baseline: fit the three regressors on the untransformed features and print each model's
# .score() on the validation split.
# NOTE(review): several keyword arguments below (normalize, criterion='mae', max_features='auto',
# min_impurity_split) are deprecated or removed in newer scikit-learn releases — confirm the
# pinned version before upgrading.
rr_mdl_0 = Ridge(alpha=1.0, fit_intercept=True, normalize=False)
en_mdl_0 = ElasticNet(
    alpha=1.0, l1_ratio=0.5, fit_intercept=True, normalize=False,
    precompute=False, max_iter=1000,
)
rf_mdl_0 = RandomForestRegressor(
    n_estimators=10, criterion='mae', max_depth=None,
    min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
    max_features='auto', max_leaf_nodes=None,
    min_impurity_decrease=0.0, min_impurity_split=None,
    random_state=0,  # fixed seed so the printed forest score is reproducible across runs
)
rr_mdl_0.fit(trn_data, trn_targ)
en_mdl_0.fit(trn_data, trn_targ)
rf_mdl_0.fit(trn_data, trn_targ)
print('Ridge Regr: {}'.format(rr_mdl_0.score(val_data, val_targ)))
print('ElasticNet: {}'.format(en_mdl_0.score(val_data, val_targ)))
print('RandForest: {}'.format(rf_mdl_0.score(val_data, val_targ)))

  "source": [


Ridge Regr: -0.010512206296257265
ElasticNet: -4.626390846307338e-05
RandForest: -0.06937468277464887


In [201]:
# Same three regressors as the baseline cell, fitted on the moving-window kernel-PCA features
# (trn_data_fin / val_data_fin) so the printed validation scores can be compared.
# NOTE(review): several keyword arguments below (normalize, criterion='mae', max_features='auto',
# min_impurity_split) are deprecated or removed in newer scikit-learn releases — confirm the
# pinned version before upgrading.
rr_mdl_1 = Ridge(alpha=1.0, fit_intercept=True, normalize=False)
en_mdl_1 = ElasticNet(
    alpha=1.0, l1_ratio=0.5, fit_intercept=True, normalize=False,
    precompute=False, max_iter=1000,
)
rf_mdl_1 = RandomForestRegressor(
    n_estimators=10, criterion='mae', max_depth=None,
    min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
    max_features='auto', max_leaf_nodes=None,
    min_impurity_decrease=0.0, min_impurity_split=None,
    random_state=0,  # fixed seed so the printed forest score is reproducible across runs
)
rr_mdl_1.fit(trn_data_fin, trn_targ_fin)
en_mdl_1.fit(trn_data_fin, trn_targ_fin)
rf_mdl_1.fit(trn_data_fin, trn_targ_fin)
print('Ridge Regr: {}'.format(rr_mdl_1.score(val_data_fin, val_targ_fin)))
print('ElasticNet: {}'.format(en_mdl_1.score(val_data_fin, val_targ_fin)))
print('RandForest: {}'.format(rf_mdl_1.score(val_data_fin, val_targ_fin)))

  "source": [


Ridge Regr: 0.001990848545060042
ElasticNet: -0.00014404484015861918
RandForest: -0.07570621442740921
