In [1]:
import sys
import os
from os import sep
from os.path import dirname, realpath
from pathlib import Path
import logging

def get_cwd(fname, subdir, crunch_dir=realpath(Path.home()) +sep +'crunch' +sep):
    """
    Build the path string ``<crunch_dir><subdir><fname>`` for the current file.

    The notebook passes the result to ``fix_path``, which stores it in ``sys.argv[0]``.
    Per the original author's note, this is needed because Jupyter Notebook in Anaconda
    invokes the Python interpreter from Anaconda's own subdirectory; a better mechanism
    should be preferred in the future.

    Args:
        fname: File name placed at the end of the path (str or os.PathLike).
        subdir: Directory below ``crunch_dir`` (str or os.PathLike). A trailing
            separator is optional; an empty string means "directly in crunch_dir".
        crunch_dir: Root directory (str or os.PathLike). Defaults to ``<home>/crunch/``;
            the default is computed once, when the function is defined.

    Returns:
        str: The concatenated path. Inputs that already end in a separator give
        exactly the same result as plain concatenation.
    """
    # os.altsep is None on POSIX; on Windows '/' also counts as a separator.
    separators = tuple(s for s in (sep, os.altsep) if s)

    def _as_dir(part):
        # Coerce to str and guarantee a trailing separator so that the
        # concatenation below can never fuse two components together.
        part = os.fspath(part)
        if part and not part.endswith(separators):
            part += sep
        return part

    return _as_dir(crunch_dir) + _as_dir(subdir) + os.fspath(fname)
    
def fix_path(cwd):
    """
    Adjust interpreter state so the notebook behaves like a script run from crunch.

    Overwrites ``sys.argv[0]`` with ``cwd`` and ensures that the absolute path of the
    parent of the current working directory is on ``sys.path`` (appended at most once).
    """
    sys.argv[0] = cwd

    # os.pardir is the '..' component, resolved against the current working directory.
    parent_dir = os.path.abspath(os.pardir)
    if parent_dir in sys.path:
        return
    sys.path.append(parent_dir)

# Register this notebook as <crunch_dir>/recon/test.ipynb in sys.argv[0] and make
# the parent of the working directory importable before the project imports below.
fix_path(cwd=get_cwd(fname='test.ipynb', subdir='recon' +sep))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numba import jit, vectorize

from ipywidgets import interact, interactive, fixed
from IPython.display import display

# Limits on how many rows / columns pandas renders in cell output.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 50

from common_util import query_df, search_df, chained_filter, benchmark
from data.data_api import DataAPI
from data.access_util import col_subsetters as cs
from recon.common import dum, count_nonnan, count_nonzero, count_both
from recon.transform import *
from recon.filter import *
from recon.viz import *
from recon.corr import corr_mat

### Load Raw Data

Get all raw data and keep only the rows dated before 2018 (2018 is held out for final validation).

In [11]:
# Criteria passed to DataAPI.generate: only records whose 'stage' is 'raw'.
search_terms = {
    'stage': 'raw'
}
# Row filter passed to search_df for the 'id' column/index.
# ('lt', 2018) presumably means "less than 2018": the index printed below this
# cell ends at 2017-12-31 23:00 UTC.
date_range = {
    'id': ('lt', 2018)
}
# Filtered frames keyed by record name (e.g. 'sp_500_raw_1').
dfs = {}
for rec, df in DataAPI.generate(search_terms):
    # The result of search_df is used directly as a .loc row selector.
    # NOTE(review): the DatetimeIndex dumps in this cell's output appear to be
    # printed during this call - confirm and silence if unintended.
    dfs[rec.name] = df.loc[search_df(df, date_range)]

DatetimeIndex(['1998-01-01 01:00:00+00:00', '1998-01-01 02:00:00+00:00',
               '1998-01-01 03:00:00+00:00', '1998-01-01 04:00:00+00:00',
               '1998-01-01 05:00:00+00:00', '1998-01-01 06:00:00+00:00',
               '1998-01-01 07:00:00+00:00', '1998-01-01 08:00:00+00:00',
               '1998-01-01 09:00:00+00:00', '1998-01-01 10:00:00+00:00',
               ...
               '2017-12-31 14:00:00+00:00', '2017-12-31 15:00:00+00:00',
               '2017-12-31 16:00:00+00:00', '2017-12-31 17:00:00+00:00',
               '2017-12-31 18:00:00+00:00', '2017-12-31 19:00:00+00:00',
               '2017-12-31 20:00:00+00:00', '2017-12-31 21:00:00+00:00',
               '2017-12-31 22:00:00+00:00', '2017-12-31 23:00:00+00:00'],
              dtype='datetime64[ns, UTC]', name='id', length=175289, freq='H')
DatetimeIndex(['1998-01-01 01:00:00+00:00', '1998-01-01 02:00:00+00:00',
               '1998-01-01 03:00:00+00:00', '1998-01-01 04:00:00+00:00',
               '1998-01-0

In [12]:
# Names of the loaded records (iterating a dict yields its keys).
list(dfs)

['dow_jones_raw_0',
 'sp_500_raw_1',
 'nasdaq_100_raw_2',
 'russell_2000_raw_3',
 'oil_raw_4',
 'gold_raw_5']

In [13]:
# OHLC column names of the S&P 500 frame for the '#pba' and '#vol' column groups,
# selected with the same chained_filter call for each group.
pba, vol = (
    chained_filter(dfs['sp_500_raw_1'].columns, [cs[group]['ohlc']])
    for group in ('#pba', '#vol')
)
# Price columns followed by vol columns.
pba_vol = pba + vol

In [14]:
# Display the combined pba + vol column-name list.
pba_vol

['pba_open',
 'pba_high',
 'pba_low',
 'pba_close',
 'vol_open',
 'vol_high',
 'vol_low',
 'vol_close']

### Label EDA

In [29]:
# Summary statistics of S&P 500 'pba_close', grouped by the 'Y' (yearly) frequency
# of the frame's index.
fins = (
    dfs['sp_500_raw_1'][pba]
    .groupby(pd.Grouper(freq='Y'))['pba_close']
    .describe()
)

In [118]:
# NOTE(review): jan2 is not defined in any cell shown here; this cell relies on
# kernel state created elsewhere.

# Label: close minus open, and that move expressed relative to the close.
jan2['diff'] = jan2['pba_close'].sub(jan2['pba_open'])
jan2['pct_diff'] = jan2['diff'].div(jan2['pba_close'])

# Every feature below is shifted by one row, so a row only sees the previous row's values.

# Previous relative price move
jan2['pv_pct_diff'] = jan2['pct_diff'].shift(periods=1)

# Previous high-low price spread
jan2['pv_spread'] = jan2['pba_high'].sub(jan2['pba_low']).shift(periods=1)

# Previous close-open change of the vol_* columns ("IV" in the original notes)
jan2['pv_vol_diff'] = jan2['vol_close'].sub(jan2['vol_open']).shift(periods=1)

# Previous vol change relative to the previous vol close
jan2['pv_vol_pct_diff'] = jan2['pv_vol_diff'].div(jan2['vol_close'].shift(periods=1))

# Previous high-low spread of the vol_* columns
jan2['pv_vol_spread'] = jan2['vol_high'].sub(jan2['vol_low']).shift(periods=1)

In [119]:
# Feature columns (all lagged by one row) and the label column.
fcs = [
    'pv_pct_diff',
    'pv_spread',
    'pv_vol_diff',
    'pv_vol_pct_diff',
    'pv_vol_spread',
]
lcs = ['pct_diff']

# Drop every row that is missing at least one feature (dropna defaults: rows, how='any').
jan2 = jan2.dropna(subset=fcs)

# Feature-vs-label correlation table from the project's corr_mat helper.
cm = corr_mat(jan2, feat_col_name=fcs, lab_col_name=lcs)
cm

Unnamed: 0,pct_diff
pv_pct_diff,-0.000515
pv_spread,0.008056
pv_vol_diff,0.003881
pv_vol_pct_diff,0.000885
pv_vol_spread,0.010523


In [None]:
# Histogram of the 'pba_numBids' column.
# NOTE(review): `frame` is not defined in any cell shown here - confirm where it comes from.
frame['pba_numBids'].hist()

In [None]:
# Interactive explorer for dist_probs: col_name is chosen from full_list and
# value comes from a (min, max, step) slider. The trailing ';' hides the return value.
interact(
    dist_probs,
    col_name=full_list,
    value=(0.01, 0.1, 0.001),
    showboth=False,
    suppress_print=False,
);

In [None]:
import operator


def _sent_stats(sent_names):
    """Return one row per name in sent_names: the name plus mean and variance of data[name]."""
    rows = [
        {'sent': name, 'mean': data[name].mean(), 'variance': data[name].var()}
        for name in sent_names
    ]
    # Build the frame in one go: growing it row by row with DataFrame.append is
    # quadratic, and DataFrame.append was removed in pandas 2.0.
    # Passing columns= keeps the three columns even when sent_names is empty.
    return pd.DataFrame(rows, columns=['sent', 'mean', 'variance'])


# dist_probs result per column in sent_list, evaluated at .1
# (original note: probability value will land within .1 of the mean)
tenth_inner_prob = {}
for sent in sent_list:
    tenth_inner_prob[sent] = dist_probs(sent, .1, suppress_print=True)

# (name, probability) pairs in ascending order of probability, split at .85.
sorted_tenth_prob = sorted(tenth_inner_prob.items(), key=operator.itemgetter(1))
less_than_85_percent = [pair for pair in sorted_tenth_prob if pair[1] < .85]
greater_than_85_percent = [pair for pair in sorted_tenth_prob if pair[1] >= .85]
print(less_than_85_percent)

# Mean / variance tables for the columns on each side of the split.
lt85 = [tup[0] for tup in less_than_85_percent]
less_85_stats = _sent_stats(lt85)

gt85 = [tup[0] for tup in greater_than_85_percent]
greater_85_stats = _sent_stats(gt85)

less_85_stats

In [None]:
# Count the rows whose Z_dailypct is at least move_thresh above zero or at least
# move_thresh below zero, next to the overall row counts.
# NOTE(review): the original note called these moves "at least greater than 1%",
# which does not obviously match move_thresh = .06 - confirm the units of Z_dailypct.
move_thresh = .06
large_moves_pos = data.loc[data['Z_dailypct'] >= move_thresh]
large_moves_neg = data.loc[data['Z_dailypct'] <= -move_thresh]

print('all rows:', len(data))
# Rows with a non-null Z_dailydir (labelled "no weekends" in the output).
print('all rows, no weekends:', len(data.dropna(subset=['Z_dailydir'])))

print('\ntotal large moves:', len(large_moves_pos) + len(large_moves_neg))
print('large up moves [greater than', str(move_thresh) +']:', len(large_moves_pos))
print('large down moves [less than', str(-move_thresh) +']:', len(large_moves_neg))

**TODO:** Compare sentiment vectors at large movement thresholds ($\pm 6\%$, $\pm 4\%$, $\pm 2\%$, $\pm 1.5\%$) to the sentiment vectors of the other price movements.
Compare statistics such as min/max, mean, median, variance, and average probability (based on a normal distribution).