In [1]:
import sys
import math

import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import missingno as msno
import seaborn as sns
import scipy.stats as stats

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit
from sklearn.metrics import roc_curve, classification_report

from multiprocessing import cpu_count
from tqdm import tqdm, tqdm_notebook

from mpengine import mpPandasObj
from util import cprintf

from zig_zag import zig_zag_df
from statsmodels.tsa.stattools import adfuller
from statsmodels.distributions.empirical_distribution import ECDF

from entropy_features import plugIn, lempelZiv_lib, konto
from structural_breaks import get_bsadf, get_bsadf0
from sample_weights import mpNumCoEvents, mpSampleW, mpSampleTW, getAvgUniqueness, getIndMatrix
from financial_data_structures import dollar_bar_df
from cross_validation_in_finance import PurgedKFold, cvScore
from labeling import getDailyVol, getTEvents, addVerticalBarrier, getEvents, getBins, getBinsOld, df_returns, df_rolling_autocorr
from fractionally_differentiated_features import fracDiff, fracDiff_FFD, plotMinFFD

  from pandas.core import datetools


In [2]:
# Location of the pre-computed feature table (CSV with a textual 'dates' column).
filepath = '~/Dev/notebook/lopez/features.csv'

# Timestamp layout of the 'dates' column: date, time and fractional seconds.
date_format = '%Y-%m-%d %H:%M:%S.%f'

# Read the table, parse 'dates' into a datetime column named 'dates0',
# discard the raw text column and index the frame by the parsed timestamps.
features = (
    pd.read_csv(filepath)
    .assign(dates0 = lambda df: pd.to_datetime(df['dates'], format=date_format))
    .drop(labels=['dates'], axis=1)
    .set_index('dates0')
)

In [3]:
# Pick event timestamps from the bsadf feature, using one standard deviation
# of that series as the threshold `h`. Only the first of the three values
# returned by getTEvents is kept.
# NOTE(review): getTEvents is presumably a CUSUM-style filter -- confirm in labeling.py.
bsadf_threshold = features['bsadf'].std()
tEvents, _, _ = getTEvents(features['bsadf'], h=bsadf_threshold)

  0%|          | 0/691 [00:00<?, ?it/s]100%|██████████| 691/691 [00:00<00:00, 10277.39it/s]


In [4]:
# Inputs for getEvents.
# NOTE(review): the meaning of each argument is defined in labeling.getEvents /
# labeling.getDailyVol; the remarks below are assumptions to confirm there.
t1 = False          # passed through as the t1 keyword (presumably "no vertical barrier")
ptSl = [1,1]        # presumably the profit-taking / stop-loss multipliers
target = getDailyVol(features.price, 100)   # volatility-based target computed from the price series
minRet = 0.01       # presumably the minimum target required to keep an event

# Leave one core free for the rest of the system, but never ask for fewer than
# one worker: on a single-core host cpu_count() - 1 would be 0, which is not a
# valid process count for a worker pool.
cpus = max(1, cpu_count() - 1)

events = getEvents(features.price, tEvents, ptSl, target, minRet, cpus, t1=t1)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  trgt = trgt.loc[tEvents]
2018-11-05 13:52:58.279336 66.67% applyPtSlOnT1 done after 0.0 minutes. Remaining 0.0 minutes.2018-11-05 13:52:58.283916 100.0% applyPtSlOnT1 done after 0.0 minutes. Remaining 0.0 minutes.


In [5]:
# Label every event from its outcome on the price series.
labels = getBins(events, features.price)
# Class balance of the labels (not rendered: it is not the cell's last expression).
labels.bin.value_counts()

# Attach the labels as an 'lbl' column and keep only fully populated rows.
# `assign` works on a copy, so the original `features` frame is left untouched
# (a plain `features0 = features` would only alias it and add 'lbl' to both).
# The Series is aligned on the index, so rows without a label get NaN and are
# removed by dropna().
features0 = features.assign(lbl=labels.bin).dropna()



In [6]:
%matplotlib

lbuy = labels[labels.bin > 0]
lsell = labels[labels.bin < 0]

price_b = features0.price.loc[lbuy.index]
price_s = features0.price.loc[lsell.index]

f,ax = plt.subplots()
features.price.plot(ax=ax)

price_b.plot(ax=ax, ls = '', marker = '^', markersize = 7, color = 'g')
price_s.plot(ax=ax, ls = '', marker = 'v', markersize = 7, color = 'r')

Using matplotlib backend: TkAgg


<matplotlib.axes._subplots.AxesSubplot at 0x7f3325cbe350>

In [7]:
# Events restricted to the timestamps that actually received a label.
events0 = events.loc[labels.index]

# Concurrent-event counts, computed in parallel over the event index.
numCoEvents = mpPandasObj(
    mpNumCoEvents,
    ('molecule', events.index),
    cpu_count(),
    closeIdx = features.index,
    t1 = events['t1'],
)
# Keep the last entry for any duplicated timestamp, then align the counts to
# the feature index, using 0 where no count is available.
is_last_occurrence = ~numCoEvents.index.duplicated(keep='last')
numCoEvents = numCoEvents.loc[is_last_occurrence].reindex(features.index).fillna(0)

# Per-event sample weights, computed in parallel from the price series.
sample_weights = mpPandasObj(
    mpSampleW,
    ('molecule', events.index),
    cpu_count(),
    t1 = events['t1'],
    numCoEvents = numCoEvents,
    close = features.price,
)
# Rescale so that the weights sum to the number of samples.
sample_weights = sample_weights * (sample_weights.shape[0] / sample_weights.sum())

# Weights for the labelled events only.
sample_weights0 = sample_weights[labels.index]

2018-11-04 13:48:03.338955 100.0% mpNumCoEvents done after 0.0 minutes. Remaining 0.0 minutes.018-11-04 13:48:03.337241 75.0% mpNumCoEvents done after 0.0 minutes. Remaining 0.0 minutes.
2018-11-04 13:48:03.467002 100.0% mpSampleW done after 0.0 minutes. Remaining 0.0 minutes.018-11-04 13:48:03.464128 75.0% mpSampleW done after 0.0 minutes. Remaining 0.0 minutes.


In [8]:
# Display the column names of the labelled feature frame (features plus 'lbl').
features0.columns

Index([u'price', u'vpin', u'amihuds_lambda', u'kyles_lambda',
       u'buys_volume_entropy', u'bsadf', u'returns_entropy',
       u'aggressor_side_entropy1', u'lbl'],
      dtype='object')

In [9]:
# NOTE(review): Axes3D is not referenced by name; the import is presumably kept
# so that the '3d' projection is registered on older matplotlib -- confirm.
from mpl_toolkits.mplot3d import Axes3D

# Feature rows split by label: +1 -> buy, -1 -> sell.
b = features0[features0.lbl == 1].drop('lbl',axis=1)
s = features0[features0.lbl == -1].drop('lbl',axis=1)

# Three features used as the x / y / z coordinates of each class.
b0 = b.vpin.values
b1 = b.amihuds_lambda.values
b2 = b.aggressor_side_entropy1.values

s0 = s.vpin.values
s1 = s.amihuds_lambda.values
s2 = s.aggressor_side_entropy1.values

fig = plt.figure()
ax = fig.add_subplot(111, projection = '3d')
# Same colour convention as the price plot: buys in green, sells in red.
ax.scatter(b0, b1, b2, c='g', marker='o', label='buy')
ax.scatter(s0, s1, s2, c='r', marker='o', label='sell')

# Label the axes and classes so the figure can be read on its own.
ax.set_xlabel('vpin')
ax.set_ylabel('amihuds_lambda')
ax.set_zlabel('aggressor_side_entropy1')
ax.legend();

<mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x7f331ca70610>

In [10]:
# Target vector: the label column; design matrix: every other column.
y = features0['lbl']
X = features0.drop(labels=['lbl'], axis=1)

In [11]:
from sklearn.model_selection import KFold, RepeatedKFold, LeaveOneOut, LeavePOut, StratifiedKFold, StratifiedShuffleSplit, TimeSeriesSplit

# Seed and ensemble size for the classifier experiments. In the visible part of
# this notebook they are referenced only by commented-out model configurations.
RANDOM_STATE = 777
n_estimator = 600

In [12]:
# `numCoEvents` is reused from the sample-weight cell (In [7]), which builds it
# from exactly the same inputs (events, features.index); this cell already
# depends on that cell for `events0`, so recomputing it here was redundant work.

# Per-event weights from mpSampleTW for the labelled events, computed in parallel.
# NOTE(review): presumably the average uniqueness of each event -- confirm in sample_weights.py.
out_tW = mpPandasObj(
    mpSampleTW,
    ('molecule', events0.index),
    cpu_count(),
    t1 = events0['t1'],
    numCoEvents = numCoEvents,
)
# Mean over all labelled events; only the commented-out BaggingClassifier
# configurations reference it (as max_samples).
avgU = out_tW.mean()

2018-11-04 13:48:10.774756 100.0% mpNumCoEvents done after 0.0 minutes. Remaining 0.0 minutes.018-11-04 13:48:10.773754 75.0% mpNumCoEvents done after 0.0 minutes. Remaining 0.0 minutes.
2018-11-04 13:48:10.877833 100.0% mpSampleTW done after 0.0 minutes. Remaining 0.0 minutes.018-11-04 13:48:10.875987 75.0% mpSampleTW done after 0.0 minutes. Remaining 0.0 minutes.


In [13]:
from sklearn.ensemble import GradientBoostingClassifier

# Classifier selection. Exactly one `clf = ...` line is live (the polynomial-kernel
# SVC near the bottom); every other configuration is a disabled alternative.

# Alternative: gradient boosting.
#clf = GradientBoostingClassifier(n_estimators = 6000, learning_rate = 1.0, max_depth = 4, random_state = RANDOM_STATE)


# Alternative: bagged decision trees, each bag sized by avgU.
#dtc = DecisionTreeClassifier(criterion='entropy', max_features = 'auto', class_weight = 'balanced')
#clf = BaggingClassifier(base_estimator = dtc, n_estimators = 60, max_samples = avgU, max_features = 1.)

# Alternative: bagged single-tree random forests without bootstrap, each bag sized by avgU.
#rnf = RandomForestClassifier(n_estimators = 1, criterion = 'entropy', bootstrap = False, class_weight = 'balanced_subsample')
#clf = BaggingClassifier(base_estimator = rnf, n_estimators = 1000, max_samples = avgU, max_features = 1.)

# Alternative: bagged polynomial-kernel SVCs, each bag sized by avgU.
#clf = SVC(kernel = 'poly', C = 1)
#clf = BaggingClassifier(base_estimator = clf, n_estimators = 10, max_samples = avgU, max_features = 1.)

# Active model: a single polynomial-kernel SVC.
clf = SVC(kernel = 'poly', C = 1)
# Alternative: plain random forest.
#clf = RandomForestClassifier(max_depth=16, n_estimators=n_estimator, criterion='entropy', random_state=RANDOM_STATE)

In [None]:
# Score the classifier with the project's cvScore helper: 4 folds, accuracy,
# per-event sample weights, event end times (t1) and a 1% embargo.
# An unweighted run would pass
# pd.Series(index = X.index, data = np.ones(X.shape[0])) as sample_weight instead.
scores = cvScore(
    clf = clf,
    X = X,
    y = y,
    sample_weight = sample_weights0,
    scoring = 'accuracy',
    t1 = events0['t1'],
    cv = 4,
    pctEmbargo = 0.01,
)

# Report the mean and the spread of the per-fold scores.
mean_score, std_score = scores.mean(), scores.std()
print('Accuracy: %0.2f (+/- %0.2f)' % (mean_score, std_score))

In [None]:
# Same classifier scored with scikit-learn's own cross_val_score (4 folds,
# default scoring, no sample weights).
cv = 4

scores = cross_val_score(estimator=clf, X=X, y=y, cv=cv)

# Report the mean and the spread of the per-fold scores.
mean_score, std_score = scores.mean(), scores.std()
print('Accuracy: %0.2f (+/- %0.2f)' % (mean_score, std_score))