In [1]:
import sys
import os
from os import sep
from os.path import dirname, realpath
from pathlib import Path
from collections import defaultdict
from itertools import product
import logging

def get_cwd(fname, subdir, crunch_dir=realpath(Path.home()) +sep +'crunch' +sep):
    """
    Convenience function to make a directory string for the current file based on inputs.
    Jupyter Notebook in Anaconda invokes the Python interpreter in Anaconda's subdirectory
    which sis why changing sys.argv[0] is necessary. In the future a better way to do this
    should be preferred..
    """
    return crunch_dir +subdir +fname

def fix_path(cwd):
    """
    Convenience function to fix argv and python path so that jupyter notebook can run the same as
    any script in crunch.
    """
    sys.argv[0] = cwd
    module_path = os.path.abspath(os.path.join('..'))
    if module_path not in sys.path:
        sys.path.append(module_path)

fix_path(get_cwd('pattern_eda.ipynb', 'mutate' +sep))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numba import jit, vectorize, float64, uint
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from tslearn.piecewise import PiecewiseAggregateApproximation
from tslearn.piecewise import SymbolicAggregateApproximation, OneD_SymbolicAggregateApproximation
from scipy.stats import zscore
from sortedcontainers import SortedList, SortedSet 

from ipywidgets import interact, interactive, fixed
from IPython.display import display

pd.set_option("display.max_rows", 100)
pd.set_option('display.max_columns', 50)

from common_util import DT_HOURLY_FREQ, DT_CAL_DAILY_FREQ, DT_BIZ_DAILY_FREQ, get_custom_biz_freq, get_custom_biz_freq_df, query_df, search_df, chained_filter, benchmark
from common_util import MUTATE_DIR, load_json, outer_join, left_join, count_nn_df, count_nz_df, count_nn_nz_df, pairwise, cust_count
from data.data_api import DataAPI
from data.access_util import col_subsetters as cs
from mutate.common import dum
from recon.viz import *

## Load Data

In [2]:
date_range = {
    'id': ('lt', 2018)
}
search_terms = {
    'stage': 'mutate',
    'mutate_type': 'thresh',
    'raw_cat': 'us_equity_index'
}
normalize_dfs = defaultdict(dict)
for rec, norm_df in DataAPI.generate(search_terms):
    normalize_dfs[rec.root][rec.desc] = norm_df.loc[search_df(norm_df, date_range)]
logging.info('normalize data loaded')

In [3]:
normed_dfs = normalize_dfs['sp_500']

In [14]:
def get_nmost_nulled_cols_df(df, n=5, keep_counts=False):
	nsmall = count_nn_df(df).nsmallest(n=n, keep='first')

	return nsmall if (keep_counts) else list(nsmall.index)

In [15]:
get_nmost_nulled_cols_df(normed_dfs['fth thresh'])

['pba_oc_spread_fth_af_estd',
 'pba_oc_spread_fth_af_abs_estd',
 'pba_oc_spread_fth_of_xstd',
 'pba_oc_spread_fth_of_abs_xstd',
 'pba_oc_return_fth_af_estd']

In [74]:
def get_seq_value(ser, capture=.80):
    tups = list(ser.iteritems())
    proportion = 0.0

    for idx, val in sorted(tups, key=lambda tup: tup[0], reverse=True):
        print(idx)
        if (proportion > capture):
            return {ser.max():ser.idxmax(), proportion: idx}
        else:
            proportion += val

In [75]:
for key, norm_df in normed_dfs.items():
    if (key[:6]!='thresh'):
        print(key)
        cust, count_df = cust_count(norm_df)
        vc = count_df.apply(pd.Series.value_counts, normalize=True)
        most_common = vc.idxmax(axis=0)
#         print(vc)
        seq = get_seq_value(vc.iloc[:, 0])
        print(seq)

raw_pba_dzn
9
8
7
{0.9375: 8, 0.9424846625766872: 7}
raw_pba_dmx
9
8
7
{0.9375: 8, 0.9424846625766872: 7}
raw_vol_dzn
14
13
11
9
8
7
6
{0.5954754601226994: 8, 0.9419095092024539: 6}
raw_vol_dmx
14
13
11
9
8
7
6
{0.5954754601226994: 8, 0.9419095092024539: 6}
