# nb-model_feature-rank_mutual-importance

## Analyze Feature Ranking: Mutual Information Test Results.

In [1]:
import sys
import os
from os import sep
from os.path import dirname, realpath
from pathlib import Path
from collections import OrderedDict
from functools import partial, reduce
from itertools import product
import logging

def get_cwd(fname, subdir, crunch_dir=realpath(Path.home()) +sep +'crunch' +sep):
    """
    Convenience function to make a directory string for the current file based on inputs.
    Jupyter Notebook in Anaconda invokes the Python interpreter in Anaconda's subdirectory
    which is why changing sys.argv[0] is necessary. In the future a better way to do this
    should be preferred..
    """
    return crunch_dir +subdir +fname

def fix_path(cwd):
    """
    Convenience function to fix argv and python path so that jupyter notebook can run the same as
    any script in crunch.
    """
    sys.argv[0] = cwd
    module_path = os.path.abspath(os.path.join('..'))
    if module_path not in sys.path:
        sys.path.append(module_path)

fname = 'nb-model_xg-model-fr-mi.ipynb'
dir_name = 'model'
fix_path(get_cwd(fname, dir_name +sep))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from ipywidgets import interact, interactive, fixed
from IPython.display import display

pd.set_option("display.max_rows", 100)
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_colwidth', 500)

from common_util import MODEL_DIR, load_df, compose, isnt, is_type, remove_dups_list, NestedDefaultDict, benchmark
from model.common import FR_DIR
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

CRITICAL:root:script location: /home/kev/crunch/model/nb-model_xg-model-fr-mi.ipynb
CRITICAL:root:using project dir: /home/kev/crunch/


## Load Reports

In [2]:
#pd.set_option('display.float_format', lambda x: '%8.2f' % x)
assets = ['sp_500', 'russell_2000', 'nasdaq_100', 'dow_jones']
freqs = ['d', 'h']
srcs = ['pba', 'vol', 'buzz', 'nonbuzz']

def df_scale(df, scalar=10**4):
    df[df.select_dtypes(include=['number']).columns] *= scalar
    return df

In [3]:
mi = NestedDefaultDict()
for path_root_dirs in product(assets, freqs, srcs):
    path_root = FR_DIR +sep.join(path_root_dirs) +sep
    for report_file in os.listdir(path_root):
        path = list(path_root_dirs)
        path.extend(report_file.split('.')[:-1])
        mi[path] = df_scale(load_df(report_file, dir_path=path_root))

In [4]:
mi_agg = NestedDefaultDict()
for kc, v in mi.items():
    #mi_agg[kc] = v.groupby(['sub', 'win', 'knn'])[['clf', 'reg']].agg(['mean', 'max', 'mad', 'std'])*10**4
    mi_agg[kc] = v.groupby(['sub', 'win', 'knn'])[['clf', 'reg']].agg(['max'])

In [5]:
mi_agg_dfs = NestedDefaultDict()
for asset in assets:
    for freq, src in product(freqs, srcs):
        kcs = mi_agg.childkeys([asset, freq, src])
        if (len(list(kcs)) > 0):
            mi_agg_dfs[[asset, freq, src]] = pd.concat([mi_agg[kc] for kc in mi_agg.childkeys([asset, freq, src])])

## Analyze

In [42]:
def ntop_common(asset, freq, src, use_clf_reg=False, ntop=.2, show=True):
    freq_src_df = mi_agg_dfs[[asset, freq, src]]
    ntop = int(len(freq_src_df) * ntop) if (is_type(ntop, float)) else ntop
    print('{} {} {} unique subsets: {}'.format(asset, freq, src, len(freq_src_df.reset_index()['sub'].unique())))
    print('ntop: {}'.format(ntop))

    ntop_reg_df = freq_src_df['reg'].nlargest(columns='max', n=ntop).reset_index()
    if (use_clf_reg):
        ntop_clf_df = freq_src_df['clf'].nlargest(columns='max', n=ntop).reset_index()
        assert(len(ntop_clf_df)==len(ntop_reg_df))

        if (show):
            display(ntop_clf_df, ntop_reg_df)
        return np.intersect1d(ntop_clf_df['sub'], ntop_reg_df['sub'])
    else:
        if (show):
            display(ntop_reg_df)
        return ntop_reg_df['sub']

In [43]:
def ntop_common_subset(asset, freq, src, sub, use_clf_reg=False, ntop=.2, show=False):
    freq_src_sub_df = pd.concat([mi[kc] for kc in list(mi.childkeys([asset, freq, src, sub, 'mi']))])
    ntop = int(len(freq_src_sub_df) * ntop) if (is_type(ntop, float)) else ntop
    #print('{} {} {} unique subsets: {}'.format(asset, freq, src, len(freq_src_df.reset_index()['sub'].unique())))
    print('ntop: {}'.format(ntop))

    ntop_reg_df = freq_src_sub_df.nlargest(columns='reg', n=ntop).reset_index()
    if (use_clf_reg):
        ntop_clf_df = freq_src_sub_df.nlargest(columns='clf', n=ntop).reset_index()
        assert(len(ntop_clf_df)==len(ntop_reg_df))

        if (show):
            display(ntop_clf_df, ntop_reg_df)
        return np.intersect1d(ntop_clf_df['ser'], ntop_reg_df['ser'])
    else:
        if (show):
            display(ntop_reg_df)
        return ntop_reg_df['ser']

In [44]:
def extract_axe_name(string):
    try:
        end = string.index('(')
    except ValueError as e:
        end = len(string)
    finally:
        return string[:end].split('_')[-1]

## Daily

In [109]:
freq = 'd'
ntop_common_dfs = NestedDefaultDict()
for asset, src in product(assets, srcs):
    ntop_common_dfs[[asset, freq, src]] = ntop_common(asset, freq, src, use_clf_reg=False, ntop=.05)

sp_500 d pba unique subsets: 83
ntop: 41


Unnamed: 0,sub,win,knn,max
0,pba_dohlca_dwrzn(-1)_dwrzn(20)_dwrzn(240)_dwrzn(5),10000,30000,1648.8624
1,pba_dohlca_dwrzn(-1)_dwrzn(20)_dwrzn(240)_dwrzn(5),30000,30000,1383.673699
2,"pba_hlh_hdxcret2(0.5*std,1)_dxfbcret2_hdxcret2(1*std,1)_hdxcret2(2*std,1)",10000,30000,1361.54803
3,pba_dohlca_dwrzn(-1)_dwrzn(20)_dwrzn(240)_dwrzn(5),10000,70000,1252.564331
4,pba_dohlca_dwrod(-1)_dwrod(20)_dwrod(240)_dwrod(5),10000,30000,1230.856728
5,pba_dohlca_dwrod(-1)_dwrod(20)_dwrod(240)_dwrod(5),30000,30000,1177.153521
6,pba_dohlca_dwrod(-1)_dwrod(20)_dwrod(240)_dwrod(5),10000,70000,1158.231545
7,pba_dohlca_dwrmx(-1)_dwrmx(20)_dwrmx(240)_dwrmx(5),10000,70000,1150.518257
8,pba_dohlca_dwrpt(-1)_dwrpt(20)_dwrpt(240)_dwrpt(5),10000,70000,1137.992473
9,pba_dohlca_dwrpt(-1)_dwrpt(20)_dwrpt(240)_dwrpt(5),10000,30000,1105.095895


sp_500 d vol unique subsets: 83
ntop: 41


Unnamed: 0,sub,win,knn,max
0,"vol_dohlca_dffd(0.200000,0.010000)_dffd(0.400000,0.010000)_dffd(0.600000,0.010000)_dffd(0.800000,0.010000)",10000,30000,1385.415514
1,"vol_dohlca_dffd(0.200000,0.010000)_dffd(0.400000,0.010000)_dffd(0.600000,0.010000)_dffd(0.800000,0.010000)",50000,30000,1311.895091
2,vol_dohlca,10000,30000,1277.103774
3,vol_dohlca_dwrmx(-1)_dwrmx(20)_dwrmx(240)_dwrmx(5),10000,30000,1274.50307
4,vol_dohlca_dwrpt(-1)_dwrpt(20)_dwrpt(240)_dwrpt(5),10000,30000,1271.518603
5,vol_dohlca_dwrod(-1)_dwrod(20)_dwrod(240)_dwrod(5),50000,70000,1246.850749
6,vol_dohlca_dwrod(-1)_dwrod(20)_dwrod(240)_dwrod(5),50000,30000,1191.619884
7,vol_dohlca,50000,30000,1182.275293
8,vol_dohlca,50000,70000,1178.661405
9,vol_dohlca_dwrmx(-1)_dwrmx(20)_dwrmx(240)_dwrmx(5),50000,70000,1178.661405


sp_500 d buzz unique subsets: 14
ntop: 7


Unnamed: 0,sub,win,knn,max
0,trmi3_etf3_social_buzz_dohlca_ddiff(1)_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,30000,30000,818.355469
1,"trmi3_etf3_news_buzz_dohlca_dffd(0.200000,0.010000)_dffd(0.400000,0.010000)_dffd(0.600000,0.010000)_dffd(0.800000,0.010000)_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz",30000,30000,756.000202
2,trmi3_etf3_news_buzz_dohlca_dwrzn(-1)_dwrzn(20)_dwrzn(240)_dwrzn(5)_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,200000,30000,650.070896
3,trmi3_etf3_news_buzz_dohlca_ddiff(1)_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,50000,30000,632.830227
4,trmi3_etf3_news_buzz_dohlca_dwrmx(-1)_dwrmx(20)_dwrmx(240)_dwrmx(5)_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,200000,30000,627.898248
5,trmi3_etf3_social_buzz_dohlca_dwrod(-1)_dwrod(20)_dwrod(240)_dwrod(5)_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,200000,30000,627.408046
6,trmi3_etf3_news_buzz_dohlca_dwrpt(-1)_dwrpt(20)_dwrpt(240)_dwrpt(5)_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,200000,30000,627.271865


sp_500 d nonbuzz unique subsets: 10
ntop: 5


Unnamed: 0,sub,win,knn,max
0,trmi3_etf3_social_forecast_dc_fundamental_sentiment_mkt3_bank_bond_stock,30000,30000,945.204591
1,trmi3_etf3_social_forecast_dc_fundamental_sentiment_mkt3_bank_bond_stock,10000,70000,940.99426
2,trmi3_etf3_social_forecast_dc_fundamental_sentiment_mkt3_bank_bond_stock,200000,30000,915.116848
3,trmi3_etf3_social_forecast_dc_fundamental_sentiment_mkt3_bank_bond_stock,10000,30000,908.058114
4,"trmi3_etf3_social_forecast_dc_dffd(0.200000,0.010000)_dwrxmx(-1)_dffd(0.400000,0.010000)_dffd(0.600000,0.010000)_dffd(0.800000,0.010000)_fundamental_sentiment_mkt3_bank_bond_stock",30000,30000,866.64564


russell_2000 d pba unique subsets: 83
ntop: 41


Unnamed: 0,sub,win,knn,max
0,pba_dohlca_dwrmx(-1)_dwrmx(20)_dwrmx(240)_dwrmx(5),10000,30000,1491.831754
1,pba_dohlca_dwrpt(-1)_dwrpt(20)_dwrpt(240)_dwrpt(5),10000,30000,1491.415986
2,"pba_dohlca_dffd(0.200000,0.010000)_dffd(0.400000,0.010000)_dffd(0.600000,0.010000)_dffd(0.800000,0.010000)",10000,30000,1490.837943
3,pba_dohlca_dwrod(-1)_dwrod(20)_dwrod(240)_dwrod(5),10000,30000,1388.052117
4,pba_dohlca,10000,30000,1378.605926
5,pba_dohlca_dwrmx(-1)_dwrmx(20)_dwrmx(240)_dwrmx(5),100000,30000,1368.987712
6,pba_dohlca_dwrpt(-1)_dwrpt(20)_dwrpt(240)_dwrpt(5),100000,30000,1366.190134
7,pba_dohlca_dwrpt(-1)_dwrpt(20)_dwrpt(240)_dwrpt(5),50000,30000,1358.820888
8,pba_dohlca_dwrmx(-1)_dwrmx(20)_dwrmx(240)_dwrmx(5),50000,30000,1356.744691
9,pba_dohlca_dwrpt(-1)_dwrpt(20)_dwrpt(240)_dwrpt(5),30000,30000,1336.865797


russell_2000 d vol unique subsets: 83
ntop: 41


Unnamed: 0,sub,win,knn,max
0,vol_dohlca_dwrod(-1)_dwrod(20)_dwrod(240)_dwrod(5),30000,30000,1737.219671
1,"vol_dohlca_dffd(0.200000,0.010000)_dffd(0.400000,0.010000)_dffd(0.600000,0.010000)_dffd(0.800000,0.010000)",10000,30000,1721.054484
2,vol_dohlca_dwrod(-1)_dwrod(20)_dwrod(240)_dwrod(5),10000,30000,1662.347458
3,vol_dohlca,10000,30000,1556.072136
4,vol_dohlca_dwrmx(-1)_dwrmx(20)_dwrmx(240)_dwrmx(5),10000,30000,1549.288327
5,vol_dohlca_dwrpt(-1)_dwrpt(20)_dwrpt(240)_dwrpt(5),10000,30000,1549.176755
6,"vol_dohlca_dffd(0.200000,0.010000)_dffd(0.400000,0.010000)_dffd(0.600000,0.010000)_dffd(0.800000,0.010000)",30000,30000,1547.028487
7,"vol_dohlca_dffd(0.200000,0.010000)_dffd(0.400000,0.010000)_dffd(0.600000,0.010000)_dffd(0.800000,0.010000)",10000,70000,1528.912428
8,vol_dohlca,10000,70000,1519.538237
9,vol_dohlca_dwrod(-1)_dwrod(20)_dwrod(240)_dwrod(5),10000,70000,1510.131117


russell_2000 d buzz unique subsets: 14
ntop: 7


Unnamed: 0,sub,win,knn,max
0,trmi3_etf3_news_buzz_dohlca_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,30000,30000,890.595957
1,"trmi3_etf3_social_buzz_dohlca_dffd(0.200000,0.010000)_dffd(0.400000,0.010000)_dffd(0.600000,0.010000)_dffd(0.800000,0.010000)_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz",30000,30000,890.026029
2,trmi3_etf3_social_buzz_dohlca_dwrpt(-1)_dwrpt(20)_dwrpt(240)_dwrpt(5)_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,50000,30000,879.096936
3,trmi3_etf3_social_buzz_dohlca_dwrmx(-1)_dwrmx(20)_dwrmx(240)_dwrmx(5)_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,50000,30000,876.614579
4,trmi3_etf3_news_buzz_dohlca_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,50000,30000,807.9584
5,trmi3_etf3_news_buzz_dohlca_dwrmx(-1)_dwrmx(20)_dwrmx(240)_dwrmx(5)_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,30000,30000,803.483462
6,trmi3_etf3_news_buzz_dohlca_dwrpt(-1)_dwrpt(20)_dwrpt(240)_dwrpt(5)_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,30000,30000,802.803839


russell_2000 d nonbuzz unique subsets: 10
ntop: 5


Unnamed: 0,sub,win,knn,max
0,"trmi3_etf3_news_forecast_dc_dffd(0.200000,0.010000)_dwrxmx(-1)_dffd(0.400000,0.010000)_dffd(0.600000,0.010000)_dffd(0.800000,0.010000)_fundamental_sentiment_mkt3_bank_bond_stock",100000,30000,899.417719
1,"trmi3_etf3_social_forecast_dc_dffd(0.200000,0.010000)_dwrxmx(-1)_dffd(0.400000,0.010000)_dffd(0.600000,0.010000)_dffd(0.800000,0.010000)_fundamental_sentiment_mkt3_bank_bond_stock",100000,70000,896.84038
2,"trmi3_etf3_social_forecast_dc_dffd(0.200000,0.010000)_dwrxmx(-1)_dffd(0.400000,0.010000)_dffd(0.600000,0.010000)_dffd(0.800000,0.010000)_fundamental_sentiment_mkt3_bank_bond_stock",100000,30000,867.134436
3,trmi3_etf3_news_forecast_dc_fundamental_sentiment_mkt3_bank_bond_stock,10000,30000,852.607198
4,"trmi3_etf3_social_forecast_dc_dffd(0.200000,0.010000)_dwrxmx(-1)_dffd(0.400000,0.010000)_dffd(0.600000,0.010000)_dffd(0.800000,0.010000)_fundamental_sentiment_mkt3_bank_bond_stock",200000,70000,819.985275


nasdaq_100 d pba unique subsets: 83
ntop: 41


Unnamed: 0,sub,win,knn,max
0,pba_dohlca_dwrzn(-1)_dwrzn(20)_dwrzn(240)_dwrzn(5),10000,30000,1277.238129
1,pba_dohlca_dwrpt(-1)_dwrpt(20)_dwrpt(240)_dwrpt(5),30000,30000,1098.538203
2,pba_dohlca_dwrmx(-1)_dwrmx(20)_dwrmx(240)_dwrmx(5),30000,30000,1088.782178
3,"pba_dohlca_dffd(0.200000,0.010000)_dffd(0.400000,0.010000)_dffd(0.600000,0.010000)_dffd(0.800000,0.010000)",10000,30000,1022.483934
4,pba_dohlca_dwrpt(-1)_dwrpt(20)_dwrpt(240)_dwrpt(5),10000,30000,1013.6196
5,pba_dohlca_dwrmx(-1)_dwrmx(20)_dwrmx(240)_dwrmx(5),10000,30000,1009.339854
6,pba_dohlca_dwrod(-1)_dwrod(20)_dwrod(240)_dwrod(5),10000,30000,965.820156
7,pba_dohlca_dwrpt(-1)_dwrpt(20)_dwrpt(240)_dwrpt(5),50000,30000,919.084118
8,pba_dohlca_dwrpt(-1)_dwrpt(20)_dwrpt(240)_dwrpt(5),10000,70000,913.331504
9,pba_dohlca_dwrmx(-1)_dwrmx(20)_dwrmx(240)_dwrmx(5),10000,70000,910.721043


nasdaq_100 d vol unique subsets: 83
ntop: 41


Unnamed: 0,sub,win,knn,max
0,vol_dohlca,10000,30000,1258.627425
1,"vol_dohlca_dffd(0.200000,0.010000)_dffd(0.400000,0.010000)_dffd(0.600000,0.010000)_dffd(0.800000,0.010000)",10000,30000,1222.054476
2,vol_dohlca_dwrzn(-1)_dwrzn(20)_dwrzn(240)_dwrzn(5),10000,30000,1209.869204
3,vol_dohlca_dwrmx(-1)_dwrmx(20)_dwrmx(240)_dwrmx(5),10000,30000,1180.483461
4,vol_dohlca_dwrpt(-1)_dwrpt(20)_dwrpt(240)_dwrpt(5),10000,30000,1177.350573
5,vol_dohlca_dwrzn(-1)_dwrzn(20)_dwrzn(240)_dwrzn(5),10000,70000,1113.907073
6,vol_dohlca_dwrmx(-1)_dwrmx(20)_dwrmx(240)_dwrmx(5),10000,70000,1106.160921
7,vol_dohlca_dwrpt(-1)_dwrpt(20)_dwrpt(240)_dwrpt(5),10000,70000,1105.190634
8,vol_dohlca_dwrod(-1)_dwrod(20)_dwrod(240)_dwrod(5),10000,30000,1078.974949
9,vol_dohlca,10000,70000,1060.111326


nasdaq_100 d buzz unique subsets: 14
ntop: 7


Unnamed: 0,sub,win,knn,max
0,"trmi3_etf3_social_buzz_dohlca_dffd(0.200000,0.010000)_dffd(0.400000,0.010000)_dffd(0.600000,0.010000)_dffd(0.800000,0.010000)_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz",200000,30000,983.506872
1,trmi3_etf3_social_buzz_dohlca_dwrzn(-1)_dwrzn(20)_dwrzn(240)_dwrzn(5)_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,100000,30000,767.580271
2,trmi3_etf3_social_buzz_dohlca_dwrod(-1)_dwrod(20)_dwrod(240)_dwrod(5)_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,100000,30000,756.069926
3,trmi3_etf3_news_buzz_dohlca_dwrzn(-1)_dwrzn(20)_dwrzn(240)_dwrzn(5)_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,10000,30000,743.665164
4,trmi3_etf3_social_buzz_dohlca_dwrmx(-1)_dwrmx(20)_dwrmx(240)_dwrmx(5)_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,30000,30000,729.892047
5,trmi3_etf3_social_buzz_dohlca_dwrpt(-1)_dwrpt(20)_dwrpt(240)_dwrpt(5)_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,30000,30000,728.076317
6,"trmi3_etf3_news_buzz_dohlca_dffd(0.200000,0.010000)_dffd(0.400000,0.010000)_dffd(0.600000,0.010000)_dffd(0.800000,0.010000)_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz",10000,30000,724.000553


nasdaq_100 d nonbuzz unique subsets: 10
ntop: 5


Unnamed: 0,sub,win,knn,max
0,trmi3_etf3_social_forecast_dc_fundamental_sentiment_mkt3_bank_bond_stock,10000,30000,786.913738
1,trmi3_etf3_social_forecast_dc_fundamental_sentiment_mkt3_bank_bond_stock,200000,30000,717.919783
2,trmi3_etf3_news_forecast_dc_fundamental_sentiment_mkt3_bank_bond_stock,10000,30000,716.26035
3,"trmi3_etf3_social_forecast_dc_dffd(0.200000,0.010000)_dwrxmx(-1)_dffd(0.400000,0.010000)_dffd(0.600000,0.010000)_dffd(0.800000,0.010000)_fundamental_sentiment_mkt3_bank_bond_stock",30000,30000,707.795643
4,trmi3_etf3_social_forecast_dc_fundamental_sentiment_mkt3_bank_bond_stock,100000,30000,679.561534


dow_jones d pba unique subsets: 83
ntop: 41


Unnamed: 0,sub,win,knn,max
0,pba_dohlca_dwrod(-1)_dwrod(20)_dwrod(240)_dwrod(5),10000,30000,1269.785067
1,pba_dohlca_dwrpt(-1)_dwrpt(20)_dwrpt(240)_dwrpt(5),10000,30000,1242.561843
2,pba_dohlca_dwrmx(-1)_dwrmx(20)_dwrmx(240)_dwrmx(5),10000,30000,1234.707134
3,pba_dohlca_dwrpt(-1)_dwrpt(20)_dwrpt(240)_dwrpt(5),30000,30000,1155.110227
4,pba_dohlca_dwrmx(-1)_dwrmx(20)_dwrmx(240)_dwrmx(5),30000,30000,1144.052697
5,pba_dohlca_dwrod(-1)_dwrod(20)_dwrod(240)_dwrod(5),10000,70000,1123.587772
6,pba_dohlca_dwrzn(-1)_dwrzn(20)_dwrzn(240)_dwrzn(5),30000,30000,1078.153771
7,pba_dohlca_dwrzn(-1)_dwrzn(20)_dwrzn(240)_dwrzn(5),10000,30000,1073.294718
8,pba_dohlca_dwrzn(-1)_dwrzn(20)_dwrzn(240)_dwrzn(5),30000,70000,1057.074169
9,pba_dohlca_dwrod(-1)_dwrod(20)_dwrod(240)_dwrod(5),30000,30000,1045.492365


dow_jones d vol unique subsets: 83
ntop: 41


Unnamed: 0,sub,win,knn,max
0,vol_dohlca_dwrod(-1)_dwrod(20)_dwrod(240)_dwrod(5),10000,70000,1409.231532
1,vol_dohlca_dwrzn(-1)_dwrzn(20)_dwrzn(240)_dwrzn(5),10000,70000,1271.415152
2,vol_dohlca_dwrod(-1)_dwrod(20)_dwrod(240)_dwrod(5),30000,70000,1195.294718
3,vol_dohlca,30000,30000,1189.558809
4,vol_dohlca_dwrpt(-1)_dwrpt(20)_dwrpt(240)_dwrpt(5),30000,30000,1189.558809
5,vol_dohlca_dwrmx(-1)_dwrmx(20)_dwrmx(240)_dwrmx(5),30000,30000,1189.464332
6,vol_dohlca_dwrzn(-1)_dwrzn(20)_dwrzn(240)_dwrzn(5),30000,30000,1182.345367
7,"vol_dohlca_dffd(0.200000,0.010000)_dffd(0.400000,0.010000)_dffd(0.600000,0.010000)_dffd(0.800000,0.010000)",30000,30000,1179.730823
8,"vol_dohlca_dffd(0.200000,0.010000)_dffd(0.400000,0.010000)_dffd(0.600000,0.010000)_dffd(0.800000,0.010000)",10000,70000,1157.591462
9,vol_dohlca_dwrod(-1)_dwrod(20)_dwrod(240)_dwrod(5),10000,30000,1123.550717


dow_jones d buzz unique subsets: 14
ntop: 7


Unnamed: 0,sub,win,knn,max
0,trmi3_etf3_news_buzz_dohlca_ddiff(1)_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,10000,30000,864.920199
1,"trmi3_etf3_social_buzz_dohlca_dffd(0.200000,0.010000)_dffd(0.400000,0.010000)_dffd(0.600000,0.010000)_dffd(0.800000,0.010000)_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz",200000,30000,820.123678
2,trmi3_etf3_social_buzz_dohlca_dwrzn(-1)_dwrzn(20)_dwrzn(240)_dwrzn(5)_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,10000,30000,705.844329
3,trmi3_etf3_social_buzz_dohlca_dwrod(-1)_dwrod(20)_dwrod(240)_dwrod(5)_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,30000,30000,699.103613
4,"trmi3_etf3_social_buzz_dohlca_dffd(0.200000,0.010000)_dffd(0.400000,0.010000)_dffd(0.600000,0.010000)_dffd(0.800000,0.010000)_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz",50000,30000,675.690337
5,trmi3_etf3_news_buzz_dohlca_dwrzn(-1)_dwrzn(20)_dwrzn(240)_dwrzn(5)_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,30000,30000,648.226021
6,trmi3_etf3_social_buzz_dohlca_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,30000,30000,647.447499


dow_jones d nonbuzz unique subsets: 10
ntop: 5


Unnamed: 0,sub,win,knn,max
0,trmi3_etf3_social_forecast_dc_fundamental_sentiment_mkt3_bank_bond_stock,10000,30000,971.700853
1,trmi3_etf3_social_forecast_dc_fundamental_sentiment_mkt3_bank_bond_stock,30000,30000,962.708192
2,trmi3_etf3_social_forecast_dc_fundamental_sentiment_mkt3_bank_bond_stock,50000,30000,869.243918
3,"trmi3_etf3_news_forecast_dc_dffd(0.200000,0.010000)_dffd(0.400000,0.010000)_dffd(0.600000,0.010000)_dffd(0.800000,0.010000)_fundamental_sentiment_mkt3_bank_bond_stock",50000,30000,734.430939
4,"trmi3_etf3_news_forecast_dc_dffd(0.200000,0.010000)_dwrxmx(-1)_dffd(0.400000,0.010000)_dffd(0.600000,0.010000)_dffd(0.800000,0.010000)_fundamental_sentiment_mkt3_bank_bond_stock",50000,30000,734.430939


### NTop Common Across Assets

In [110]:
d_pba = reduce(np.intersect1d, (ntop_common_dfs[[asset, 'd', 'pba']] for asset in assets))
display(len(d_pba), d_pba)

6

array(['pba_dlh_dlogret',
       'pba_dohlca_dffd(0.200000,0.010000)_dffd(0.400000,0.010000)_dffd(0.600000,0.010000)_dffd(0.800000,0.010000)',
       'pba_dohlca_dwrmx(-1)_dwrmx(20)_dwrmx(240)_dwrmx(5)',
       'pba_dohlca_dwrod(-1)_dwrod(20)_dwrod(240)_dwrod(5)',
       'pba_dohlca_dwrpt(-1)_dwrpt(20)_dwrpt(240)_dwrpt(5)',
       'pba_dohlca_dwrzn(-1)_dwrzn(20)_dwrzn(240)_dwrzn(5)'], dtype=object)

In [111]:
d_pba_common_axe = list(set(map(extract_axe_name, d_pba)))
display(d_pba_common_axe)

d_pba_minimal = []
for axe in d_pba_common_axe:
    d_pba_minimal.extend(reduce(np.intersect1d, (ntop_common_subset(asset, 'd', 'pba', axe, use_clf_reg=False, ntop=.05) for asset in assets)))
display(d_pba_minimal)

['dwrmx', 'dlogret', 'dffd', 'dwrzn', 'dwrod', 'dwrpt']

ntop: 10
ntop: 10
ntop: 10
ntop: 10
ntop: 1
ntop: 1
ntop: 1
ntop: 1
ntop: 10
ntop: 10
ntop: 10
ntop: 10
ntop: 10
ntop: 10
ntop: 10
ntop: 10
ntop: 10
ntop: 10
ntop: 10
ntop: 10
ntop: 10
ntop: 10
ntop: 10
ntop: 10


['pba_avgPrice_pba_dohlca_dwrmx(240)',
 'pba_close_pba_dohlca_dwrmx(240)',
 'pba_high_pba_dohlca_dwrmx(240)',
 'pba_low_pba_dohlca_dwrmx(240)',
 'pba_lh_pba_dlh_dlogret',
 'pba_avgPrice_pba_dohlca_dwrzn(240)',
 'pba_close_pba_dohlca_dwrzn(240)',
 'pba_avgPrice_pba_dohlca_dwrod(240)',
 'pba_high_pba_dohlca_dwrod(240)',
 'pba_open_pba_dohlca_dwrod(240)',
 'pba_avgPrice_pba_dohlca_dwrpt(240)',
 'pba_close_pba_dohlca_dwrpt(240)',
 'pba_high_pba_dohlca_dwrpt(240)',
 'pba_low_pba_dohlca_dwrpt(240)']

In [112]:
d_vol = reduce(np.intersect1d, (ntop_common_dfs[[asset, 'd', 'vol']] for asset in assets))
display(len(d_vol), d_vol)

6

array(['vol_dohlca',
       'vol_dohlca_dffd(0.200000,0.010000)_dffd(0.400000,0.010000)_dffd(0.600000,0.010000)_dffd(0.800000,0.010000)',
       'vol_dohlca_dwrmx(-1)_dwrmx(20)_dwrmx(240)_dwrmx(5)',
       'vol_dohlca_dwrod(-1)_dwrod(20)_dwrod(240)_dwrod(5)',
       'vol_dohlca_dwrpt(-1)_dwrpt(20)_dwrpt(240)_dwrpt(5)',
       'vol_dohlca_dwrzn(-1)_dwrzn(20)_dwrzn(240)_dwrzn(5)'], dtype=object)

In [113]:
d_vol_common_axe = list(set(map(extract_axe_name, d_vol)))
display(d_vol_common_axe)

d_vol_minimal = []
for axe in d_vol_common_axe:
    d_vol_minimal.extend(reduce(np.intersect1d, (ntop_common_subset(asset, 'd', 'vol', axe, ntop=.2) for asset in assets)))
display(d_vol_minimal)

['dwrmx', 'dffd', 'dwrzn', 'dwrod', 'dwrpt', 'dohlca']

ntop: 40
ntop: 40
ntop: 40
ntop: 40
ntop: 40
ntop: 40
ntop: 40
ntop: 40
ntop: 40
ntop: 40
ntop: 40
ntop: 40
ntop: 40
ntop: 40
ntop: 40
ntop: 40
ntop: 40
ntop: 40
ntop: 40
ntop: 40
ntop: 10
ntop: 10
ntop: 10
ntop: 10


['vol_avgPrice_vol_dohlca_dwrmx(-1)',
 'vol_close_vol_dohlca_dwrmx(-1)',
 'vol_high_vol_dohlca_dwrmx(-1)',
 'vol_low_vol_dohlca_dwrmx(-1)',
 'vol_open_vol_dohlca_dwrmx(-1)',
 'vol_open_vol_dohlca_dwrmx(240)',
 'vol_avgPrice_vol_dohlca_dffd(0.200000,0.010000)',
 'vol_avgPrice_vol_dohlca_dffd(0.400000,0.010000)',
 'vol_close_vol_dohlca_dffd(0.200000,0.010000)',
 'vol_close_vol_dohlca_dffd(0.400000,0.010000)',
 'vol_high_vol_dohlca_dffd(0.200000,0.010000)',
 'vol_low_vol_dohlca_dffd(0.200000,0.010000)',
 'vol_low_vol_dohlca_dffd(0.400000,0.010000)',
 'vol_open_vol_dohlca_dffd(0.200000,0.010000)',
 'vol_avgPrice_vol_dohlca_dwrzn(-1)',
 'vol_close_vol_dohlca_dwrzn(-1)',
 'vol_high_vol_dohlca_dwrzn(-1)',
 'vol_high_vol_dohlca_dwrzn(240)',
 'vol_low_vol_dohlca_dwrzn(-1)',
 'vol_open_vol_dohlca_dwrzn(-1)',
 'vol_avgPrice_vol_dohlca_dwrod(-1)',
 'vol_close_vol_dohlca_dwrod(-1)',
 'vol_high_vol_dohlca_dwrod(-1)',
 'vol_low_vol_dohlca_dwrod(-1)',
 'vol_open_vol_dohlca_dwrod(-1)',
 'vol_avgPrice_v

In [None]:
d_buzz = reduce(np.intersect1d, (ntop_common_dfs[[asset, 'd', 'buzz']] for asset in assets))
display(len(d_buzz), d_buzz)

In [90]:
d_buzz_common_axe = list(set(map(extract_axe_name, d_buzz)))
display(d_buzz_common_axe)

d_buzz_minimal = []
for axe in d_buzz_common_axe:
    d_buzz_minimal.extend(reduce(np.intersect1d, (ntop_common_subset(asset, 'd', 'buzz', axe, ntop=.05) for asset in assets)))
display(d_buzz_minimal)

['dwrmx', 'dffd', 'dwrzn', 'dwrod', 'ddiff', 'dwrpt']

ntop: 80
ntop: 80
ntop: 80
ntop: 80
ntop: 80
ntop: 80
ntop: 80
ntop: 80
ntop: 80
ntop: 80
ntop: 80
ntop: 80
ntop: 80
ntop: 80
ntop: 80
ntop: 80
ntop: 20
ntop: 20
ntop: 20
ntop: 20
ntop: 80
ntop: 80
ntop: 80
ntop: 80


['etf3_buzz_N_open_trmi3_etf3_news_buzz_dohlca_dwrmx(-1)',
 'etf3_buzz_S_avgPrice_trmi3_etf3_social_buzz_dohlca_dwrmx(240)',
 'etf3_buzz_S_high_trmi3_etf3_social_buzz_dohlca_dwrmx(-1)',
 'mkt3_bondBuzz_N_close_trmi3_mkt3_news_bondBuzz_dohlca_dwrmx(-1)',
 'mkt3_stockIndexBuzz_S_close_trmi3_mkt3_social_stockIndexBuzz_dohlca_dwrmx(-1)',
 'etf3_buzz_N_avgPrice_trmi3_etf3_news_buzz_dohlca_dffd(0.800000,0.010000)',
 'etf3_buzz_N_close_trmi3_etf3_news_buzz_dohlca_dffd(0.200000,0.010000)',
 'etf3_buzz_N_high_trmi3_etf3_news_buzz_dohlca_dffd(0.200000,0.010000)',
 'etf3_buzz_N_low_trmi3_etf3_news_buzz_dohlca_dffd(0.600000,0.010000)',
 'etf3_buzz_N_open_trmi3_etf3_news_buzz_dohlca_dffd(0.800000,0.010000)',
 'mkt3_bondBuzz_S_avgPrice_trmi3_mkt3_social_bondBuzz_dohlca_dffd(0.400000,0.010000)',
 'mkt3_bondBuzz_S_close_trmi3_mkt3_social_bondBuzz_dohlca_dffd(0.600000,0.010000)',
 'mkt3_ratesBuzz_S_high_trmi3_mkt3_social_ratesBuzz_dohlca_dffd(0.800000,0.010000)',
 'etf3_buzz_N_close_trmi3_etf3_news_buz

In [91]:
d_nonbuzz = reduce(np.intersect1d, (ntop_common_dfs[[asset, 'd', 'nonbuzz']] for asset in assets))
display(len(d_nonbuzz), d_nonbuzz)

6

array(['trmi3_etf3_news_forecast_dc_dffd(0.200000,0.010000)_dffd(0.400000,0.010000)_dffd(0.600000,0.010000)_dffd(0.800000,0.010000)_fundamental_sentiment_mkt3_bank_bond_stock',
       'trmi3_etf3_news_forecast_dc_dffd(0.200000,0.010000)_dwrxmx(-1)_dffd(0.400000,0.010000)_dffd(0.600000,0.010000)_dffd(0.800000,0.010000)_fundamental_sentiment_mkt3_bank_bond_stock',
       'trmi3_etf3_news_forecast_dc_fundamental_sentiment_mkt3_bank_bond_stock',
       'trmi3_etf3_social_forecast_dc_dffd(0.200000,0.010000)_dffd(0.400000,0.010000)_dffd(0.600000,0.010000)_dffd(0.800000,0.010000)_fundamental_sentiment_mkt3_bank_bond_stock',
       'trmi3_etf3_social_forecast_dc_dffd(0.200000,0.010000)_dwrxmx(-1)_dffd(0.400000,0.010000)_dffd(0.600000,0.010000)_dffd(0.800000,0.010000)_fundamental_sentiment_mkt3_bank_bond_stock',
       'trmi3_etf3_social_forecast_dc_fundamental_sentiment_mkt3_bank_bond_stock'],
      dtype=object)

In [92]:
d_nonbuzz_common_axe = list(set(map(extract_axe_name, d_nonbuzz)))
display(d_nonbuzz_common_axe)

d_nonbuzz_minimal = []
for axe in d_nonbuzz_common_axe:
    d_nonbuzz_minimal.extend(reduce(np.intersect1d, (ntop_common_subset(asset, 'd', 'nonbuzz', axe, ntop=.55) for asset in assets)))
display(d_nonbuzz_minimal)

['dffd', 'stock']

ntop: 2596
ntop: 2596
ntop: 2596
ntop: 2596


ValueError: No objects to concatenate

best ntop common for pba, vol, buzz: dffd, dwrmx, dwrod, dwrpt, dwrzn

best ntop commo for nonbuzz: dffd

In general regardless of the source or asset the following transforms rank highest for mutual information with daily data:
* dffd
* dwrmx
* dwrod
* dwrpt
* dwrzn

The fact that these performed the best isn't too surprising. The following transforms did not peform consistently well over all assets and sources, but might peform well with a more powerful model / test for a particular source:

* dohlca (vol, pba)
* dlh_logret (vol, pba)
* ddiff(1) (vol, pba)
* dc (nonbuzz)

In [21]:
d_minimal_set = d_pba_minimal + d_vol_minimal + d_buzz_minimal + d_nonbuzz_minimal

In [22]:
common_axefiles_set = ['dffd', 'dwrmx', 'dwrod', 'dwrpt', 'dwrzn']

In [23]:
extra_axefiles_set = ['dohlca', 'dlogret', 'ddiff', 'dc']
common_extra_axefiles_set = common_axefiles_set + extra_axefiles_set

## Hourly

In [258]:
freq = 'h'
ntop_common_dfs = NestedDefaultDict()
for asset, src in product(assets, srcs):
    if (src == 'nonbuzz'): continue
    ntop_common_dfs[[asset, freq, src]] = ntop_common(asset, freq, src)

sp_500 h pba unique subsets: 19
ntop: 19


Unnamed: 0,sub,win,knn,max
0,pba_hohlca,10000,30000,882.71109
1,pba_hohlca,30000,70000,853.507221
2,pba_hohlca,10000,70000,806.086291
3,pba_hohlca_hdzn,10000,30000,694.282727
4,pba_hohlca,50000,70000,689.934422
5,pba_hohlca,200000,70000,662.315487
6,pba_hohlca_hdmx_hduni(8),50000,30000,622.20477
7,pba_hohlca,30000,30000,618.992181
8,pba_hohlca_hdzn_hdgau(8),200000,30000,605.138457
9,pba_hohlca,200000,30000,603.134296


sp_500 h vol unique subsets: 19
ntop: 19


Unnamed: 0,sub,win,knn,max
0,vol_hohlca,10000,30000,1372.839991
1,vol_hohlca,50000,30000,1224.022436
2,vol_hohlca,50000,70000,1213.051313
3,vol_hohlca,10000,70000,1206.210251
4,vol_hohlca,30000,30000,1140.67153
5,vol_hohlca,100000,30000,1017.106089
6,vol_hohlca,30000,70000,940.733767
7,vol_hohlca,200000,30000,862.48617
8,vol_hohlca,200000,70000,789.705835
9,vol_hohlca,100000,70000,789.420277


sp_500 h buzz unique subsets: 14
ntop: 14


Unnamed: 0,sub,win,knn,max
0,trmi3_etf3_news_buzz_hohlca_hdzn_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,200000,30000,614.978553
1,trmi3_etf3_social_buzz_hohlca_hdod_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,30000,30000,609.355249
2,trmi3_etf3_news_buzz_hohlca_hdzn_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,50000,30000,585.782128
3,trmi3_etf3_social_buzz_hohlca_hdzn_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,30000,30000,557.666496
4,trmi3_etf3_social_buzz_hohlca_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,200000,30000,542.145758
5,trmi3_etf3_social_buzz_hohlca_hdpt_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,200000,30000,538.138824
6,trmi3_etf3_social_buzz_hohlca_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,30000,30000,534.743376
7,trmi3_etf3_social_buzz_hohlca_hdmx_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,200000,30000,529.428671
8,trmi3_etf3_social_buzz_hohlca_hdod_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,200000,30000,528.192746
9,trmi3_etf3_social_buzz_hohlca_hdzn_hdgau(8)_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,200000,30000,524.821318


russell_2000 h pba unique subsets: 19
ntop: 19


Unnamed: 0,sub,win,knn,max
0,pba_hohlca,10000,70000,1338.798845
1,pba_hohlca,50000,30000,1322.981738
2,pba_hohlca,10000,30000,1284.51103
3,pba_hohlca,50000,70000,1221.640131
4,pba_hohlca,200000,70000,1165.960403
5,pba_hohlca,30000,70000,1100.217796
6,pba_hohlca,30000,30000,1094.465045
7,pba_hohlca,200000,30000,1060.699964
8,pba_hohlca,100000,30000,927.54855
9,pba_hohlca,100000,70000,850.813623


russell_2000 h vol unique subsets: 19
ntop: 19


Unnamed: 0,sub,win,knn,max
0,vol_hohlca,30000,30000,1779.06805
1,vol_hohlca,10000,30000,1693.814985
2,vol_hohlca,10000,70000,1557.376411
3,vol_hohlca,30000,70000,1380.793478
4,vol_hohlca,50000,30000,1351.772805
5,vol_hohlca,100000,30000,1260.713167
6,vol_hohlca,100000,70000,1205.580185
7,vol_hohlca,50000,70000,1177.567835
8,vol_hohlca,200000,30000,1008.627206
9,vol_hohlca,200000,70000,920.274028


russell_2000 h buzz unique subsets: 14
ntop: 14


Unnamed: 0,sub,win,knn,max
0,trmi3_etf3_news_buzz_hohlca_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,50000,30000,1129.461228
1,trmi3_etf3_news_buzz_hohlca_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,30000,30000,952.144922
2,trmi3_etf3_news_buzz_hohlca_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,10000,30000,839.539643
3,trmi3_etf3_social_buzz_hohlca_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,10000,30000,791.011014
4,trmi3_etf3_social_buzz_hohlca_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,200000,30000,755.023118
5,trmi3_etf3_social_buzz_hohlca_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,50000,30000,731.942167
6,trmi3_etf3_news_buzz_hohlca_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,50000,70000,727.864173
7,trmi3_etf3_news_buzz_hohlca_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,30000,70000,695.729177
8,trmi3_etf3_social_buzz_hohlca_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,30000,70000,641.019966
9,trmi3_etf3_social_buzz_hohlca_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,10000,70000,626.83985


nasdaq_100 h pba unique subsets: 19
ntop: 19


Unnamed: 0,sub,win,knn,max
0,pba_hohlca_hdmx_hduni(8),30000,30000,743.854252
1,pba_hohlca_hdzn,50000,30000,605.306208
2,pba_hohlca_hdpt,100000,30000,601.83817
3,pba_hohlca_hdmx,100000,30000,584.803072
4,pba_hoc_hlogret_hdzn_hret_hspread,10000,30000,552.540777
5,pba_hohlca_hdmx_hduni(8),100000,30000,548.27578
6,pba_hohlca_hdzn_hdgau(8),200000,30000,545.719236
7,pba_hohlca_hdzn,30000,30000,538.391371
8,pba_hohlca_hdmx,50000,30000,528.944866
9,pba_hoc_hlogret_hdzn_hdgau(8)_hret_hspread,30000,30000,527.742791


nasdaq_100 h vol unique subsets: 19
ntop: 19


Unnamed: 0,sub,win,knn,max
0,vol_hohlca,50000,30000,1535.497154
1,vol_hohlca,10000,30000,1452.099734
2,vol_hohlca,30000,30000,1308.109248
3,vol_hohlca,10000,70000,1123.377805
4,vol_hohlca,100000,30000,1013.009873
5,vol_hohlca,30000,70000,989.392634
6,vol_hohlca_hdmx,10000,30000,901.42111
7,vol_hohlca_hdpt,10000,30000,882.205841
8,vol_hohlca,200000,30000,854.987263
9,vol_hohlca,50000,70000,811.930637


nasdaq_100 h buzz unique subsets: 14
ntop: 14


Unnamed: 0,sub,win,knn,max
0,trmi3_etf3_news_buzz_hohlca_hdzn_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,10000,30000,644.07823
1,trmi3_etf3_social_buzz_hohlca_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,50000,30000,629.81737
2,trmi3_etf3_news_buzz_hohlca_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,30000,30000,622.56237
3,trmi3_etf3_news_buzz_hohlca_hdpt_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,30000,30000,567.925301
4,trmi3_etf3_news_buzz_hohlca_hdmx_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,30000,30000,559.317782
5,trmi3_etf3_social_buzz_hohlca_hdod_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,10000,30000,557.521221
6,trmi3_etf3_social_buzz_hohlca_hdpt_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,50000,30000,556.182888
7,trmi3_etf3_social_buzz_hohlca_hdmx_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,50000,30000,547.725258
8,trmi3_etf3_social_buzz_hohlca_hdmx_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,100000,30000,542.327867
9,trmi3_etf3_social_buzz_hohlca_hdpt_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,100000,30000,541.41583


dow_jones h pba unique subsets: 19
ntop: 19


Unnamed: 0,sub,win,knn,max
0,pba_hohlca,10000,30000,1049.255013
1,pba_hohlca,10000,70000,869.560063
2,pba_hohlca,30000,30000,837.219388
3,pba_hohlca_hdmx,50000,30000,777.844865
4,pba_hohlca,30000,70000,774.141322
5,pba_hohlca_hdpt,50000,30000,762.6151
6,pba_hohlca,50000,30000,723.221916
7,pba_hoc_hlogret_hdzn_hdgau(8)_hret_hspread,200000,30000,663.401609
8,pba_hohlca,50000,70000,655.249854
9,pba_hohlca_hdzn,10000,30000,627.13915


dow_jones h vol unique subsets: 19
ntop: 19


Unnamed: 0,sub,win,knn,max
0,vol_hohlca,30000,30000,1402.938813
1,vol_hohlca,10000,30000,1163.016126
2,vol_hohlca,30000,70000,1151.158427
3,vol_hohlca,10000,70000,1118.116141
4,vol_hohlca,50000,30000,997.471273
5,vol_hohlca,50000,70000,897.003107
6,vol_hohlca,100000,30000,750.983049
7,vol_hohlca,100000,70000,737.437219
8,vol_hohlca_hdod,30000,30000,721.444536
9,vol_hohlca,200000,30000,618.818765


dow_jones h buzz unique subsets: 14
ntop: 14


Unnamed: 0,sub,win,knn,max
0,trmi3_etf3_social_buzz_hohlca_hdzn_hdgau(8)_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,100000,30000,728.930797
1,trmi3_etf3_social_buzz_hohlca_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,200000,30000,704.940471
2,trmi3_etf3_social_buzz_hohlca_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,100000,30000,690.62915
3,trmi3_etf3_social_buzz_hohlca_hdzn_hdgau(8)_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,10000,30000,676.266578
4,trmi3_etf3_social_buzz_hohlca_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,30000,30000,647.032484
5,trmi3_etf3_news_buzz_hohlca_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,30000,30000,618.521139
6,trmi3_etf3_social_buzz_hohlca_hdzn_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,200000,30000,581.741274
7,trmi3_etf3_news_buzz_hohlca_hdmx_hduni(8)_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,100000,30000,565.058937
8,trmi3_etf3_social_buzz_hohlca_hdmx_hduni(8)_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,30000,30000,561.240531
9,trmi3_etf3_news_buzz_hohlca_hdod_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz,30000,30000,558.795627


### NTop Common Across Assets

In [259]:
h_pba = reduce(np.intersect1d, (ntop_common_dfs[[asset, 'h', 'pba']] for asset in assets))
display(len(h_pba), h_pba)

3

array(['pba_hohlca', 'pba_hohlca_hdmx', 'pba_hohlca_hdzn'], dtype=object)

In [260]:
h_pba_common_axe = list(set(map(extract_axe_name, h_pba)))
display(h_pba_common_axe)

h_pba_minimal = []
for axe in h_pba_common_axe:
    h_pba_minimal.extend(reduce(np.intersect1d, (ntop_common_subset(asset, 'h', 'pba', axe) for asset in assets)))
display(h_pba_minimal)

['hdzn', 'hdmx', 'hohlca']

ntop: 88
ntop: 88
ntop: 88
ntop: 88
ntop: 88
ntop: 88
ntop: 88
ntop: 88
ntop: 40
ntop: 40
ntop: 40
ntop: 40


['pba_avgPrice_pba_hohlca_hdzn',
 'pba_close_pba_hohlca_hdzn',
 'pba_high_pba_hohlca_hdzn',
 'pba_lh_pba_hlh_hlogret_hdzn',
 'pba_lh_pba_hlh_hret_hdzn',
 'pba_lh_pba_hlh_hspread_hdzn',
 'pba_low_pba_hohlca_hdzn',
 'pba_oc_pba_hoc_hlogret_hdzn',
 'pba_oc_pba_hoc_hret_hdzn',
 'pba_oc_pba_hoc_hspread_hdzn',
 'pba_open_pba_hohlca_hdzn',
 'pba_avgPrice_pba_hohlca_hdmx',
 'pba_close_pba_hohlca_hdmx',
 'pba_high_pba_hohlca_hdmx',
 'pba_lh_pba_hlh_hlogret_hdmx',
 'pba_lh_pba_hlh_hret_hdmx',
 'pba_lh_pba_hlh_hspread_hdmx',
 'pba_low_pba_hohlca_hdmx',
 'pba_oc_pba_hoc_hlogret_hdmx',
 'pba_oc_pba_hoc_hret_hdmx',
 'pba_oc_pba_hoc_hspread_hdmx',
 'pba_open_pba_hohlca_hdmx',
 'pba_avgPrice_pba_hohlca',
 'pba_close_pba_hohlca',
 'pba_high_pba_hohlca',
 'pba_low_pba_hohlca',
 'pba_open_pba_hohlca']

In [261]:
h_vol = reduce(np.intersect1d, (ntop_common_dfs[[asset, 'h', 'vol']] for asset in assets))
display(len(h_vol), h_vol)

1

array(['vol_hohlca'], dtype=object)

In [262]:
h_vol_common_axe = list(set(map(extract_axe_name, h_vol)))
display(h_vol_common_axe)

h_vol_minimal = []
for axe in h_vol_common_axe:
    h_vol_minimal.extend(reduce(np.intersect1d, (ntop_common_subset(asset, 'h', 'vol', axe) for asset in assets)))
display(h_vol_minimal)

['hohlca']

ntop: 40
ntop: 40
ntop: 40
ntop: 40


['vol_avgPrice_vol_hohlca',
 'vol_close_vol_hohlca',
 'vol_high_vol_hohlca',
 'vol_low_vol_hohlca',
 'vol_open_vol_hohlca']

In [263]:
h_buzz = reduce(np.intersect1d, (ntop_common_dfs[[asset, 'h', 'buzz']] for asset in assets))
display(len(h_buzz), h_buzz)

3

array(['trmi3_etf3_news_buzz_hohlca_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz',
       'trmi3_etf3_social_buzz_hohlca_hdzn_hdgau(8)_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz',
       'trmi3_etf3_social_buzz_hohlca_mkt3_bondBuzz_ratesBuzz_stockIndexBuzz'],
      dtype=object)

In [264]:
h_buzz_common_axe = ['hdzn', 'hduni', 'hdgau', 'hdmx', 'hdpt', 'hdod', 'hohlca'] # list(set(map(extract_axe_name, h_buzz)))
display(h_buzz_common_axe)

h_buzz_minimal = []
for axe in h_buzz_common_axe:
    h_buzz_minimal.extend(reduce(np.intersect1d, (ntop_common_subset(asset, 'h', 'buzz', axe) for asset in assets)))
display(h_buzz_minimal)

['hdzn', 'hduni', 'hdgau', 'hdmx', 'hdpt', 'hdod', 'hohlca']

ntop: 128
ntop: 128
ntop: 128
ntop: 128
ntop: 128
ntop: 128
ntop: 128
ntop: 128
ntop: 128
ntop: 128
ntop: 128
ntop: 128
ntop: 128
ntop: 128
ntop: 128
ntop: 128
ntop: 128
ntop: 128
ntop: 128
ntop: 128
ntop: 128
ntop: 128
ntop: 128
ntop: 128
ntop: 64
ntop: 64
ntop: 64
ntop: 64


['etf3_buzz_N_avgPrice_trmi3_etf3_news_buzz_hohlca_hdzn',
 'etf3_buzz_N_close_trmi3_etf3_news_buzz_hohlca_hdzn',
 'etf3_buzz_N_high_trmi3_etf3_news_buzz_hohlca_hdzn',
 'etf3_buzz_N_low_trmi3_etf3_news_buzz_hohlca_hdzn',
 'etf3_buzz_N_open_trmi3_etf3_news_buzz_hohlca_hdzn',
 'etf3_buzz_S_open_trmi3_etf3_social_buzz_hohlca_hdzn',
 'mkt3_bondBuzz_N_open_trmi3_mkt3_news_bondBuzz_hohlca_hdzn',
 'mkt3_bondBuzz_S_open_trmi3_mkt3_social_bondBuzz_hohlca_hdzn',
 'mkt3_ratesBuzz_N_open_trmi3_mkt3_news_ratesBuzz_hohlca_hdzn',
 'mkt3_ratesBuzz_S_open_trmi3_mkt3_social_ratesBuzz_hohlca_hdzn',
 'mkt3_stockIndexBuzz_N_avgPrice_trmi3_mkt3_news_stockIndexBuzz_hohlca_hdzn',
 'mkt3_stockIndexBuzz_N_close_trmi3_mkt3_news_stockIndexBuzz_hohlca_hdzn',
 'mkt3_stockIndexBuzz_N_high_trmi3_mkt3_news_stockIndexBuzz_hohlca_hdzn',
 'mkt3_stockIndexBuzz_N_low_trmi3_mkt3_news_stockIndexBuzz_hohlca_hdzn',
 'mkt3_stockIndexBuzz_N_open_trmi3_mkt3_news_stockIndexBuzz_hohlca_hdzn',
 'mkt3_stockIndexBuzz_S_open_trmi3_mkt3_

In [265]:
h_minimal_set = h_pba_minimal + h_vol_minimal + h_buzz_minimal

In [266]:
h_minimal_set

['pba_avgPrice_pba_hohlca_hdzn',
 'pba_close_pba_hohlca_hdzn',
 'pba_high_pba_hohlca_hdzn',
 'pba_lh_pba_hlh_hlogret_hdzn',
 'pba_lh_pba_hlh_hret_hdzn',
 'pba_lh_pba_hlh_hspread_hdzn',
 'pba_low_pba_hohlca_hdzn',
 'pba_oc_pba_hoc_hlogret_hdzn',
 'pba_oc_pba_hoc_hret_hdzn',
 'pba_oc_pba_hoc_hspread_hdzn',
 'pba_open_pba_hohlca_hdzn',
 'pba_avgPrice_pba_hohlca_hdmx',
 'pba_close_pba_hohlca_hdmx',
 'pba_high_pba_hohlca_hdmx',
 'pba_lh_pba_hlh_hlogret_hdmx',
 'pba_lh_pba_hlh_hret_hdmx',
 'pba_lh_pba_hlh_hspread_hdmx',
 'pba_low_pba_hohlca_hdmx',
 'pba_oc_pba_hoc_hlogret_hdmx',
 'pba_oc_pba_hoc_hret_hdmx',
 'pba_oc_pba_hoc_hspread_hdmx',
 'pba_open_pba_hohlca_hdmx',
 'pba_avgPrice_pba_hohlca',
 'pba_close_pba_hohlca',
 'pba_high_pba_hohlca',
 'pba_low_pba_hohlca',
 'pba_open_pba_hohlca',
 'vol_avgPrice_vol_hohlca',
 'vol_close_vol_hohlca',
 'vol_high_vol_hohlca',
 'vol_low_vol_hohlca',
 'vol_open_vol_hohlca',
 'etf3_buzz_N_avgPrice_trmi3_etf3_news_buzz_hohlca_hdzn',
 'etf3_buzz_N_close_trmi