In [1]:
import sys
import math

import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import missingno as msno
import seaborn as sns
import scipy.stats as stats

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit
from sklearn.metrics import roc_curve, classification_report

from multiprocessing import cpu_count
from tqdm import tqdm, tqdm_notebook

from mpengine import mpPandasObj
from util import cprintf

from zig_zag import zig_zag_df
from statsmodels.tsa.stattools import adfuller
from sample_weights import mpNumCoEvents, mpSampleW, mpSampleTW, getAvgUniqueness, getIndMatrix
from financial_data_structures import dollar_bar_df0
from cross_validation_in_finance import PurgedKFold, cvScore
from labeling import getDailyVol, getTEvents, addVerticalBarrier, getEvents, getBins, getBinsOld, df_returns, df_rolling_autocorr
from fractionally_differentiated_features import fracDiff, fracDiff_FFD, plotMinFFD

%matplotlib

Using matplotlib backend: TkAgg


  from pandas.core import datetools


In [2]:
filepath = '~/Dev/notebook/lopez/data/bitfinex_BTCUSD_trades.csv'
#filepath = '~/Dev/notebook/lopez/data/btcusd_trades.csv'
cols = list(map(str.lower, ['Datetime','Amount','Price','<Unknown>']))
columns = dict(zip(range(len(cols)), cols))

df = pd.read_csv(filepath, header = None).rename(columns = columns).assign(dates = lambda df: (pd.to_datetime(df['datetime'], format='%Y-%m-%d %H:%M:%S.%f'))).assign(dollar_volume=lambda df: df['amount'] * df['price']).assign(volume=lambda df: df['amount']).drop(['datetime', '<unknown>'], axis = 1).set_index('dates').drop_duplicates()
df = df.iloc[::-1]

In [3]:
dollar_M = 1000000
dollar_df = dollar_bar_df0(df, 'dollar_volume', dollar_M)

100%|██████████| 971615/971615 [00:00<00:00, 2953639.52it/s]83427/971615 [00:00<00:00, 2833644.40it/s]
100%|██████████| 3039/3039 [00:06<00:00, 485.94it/s]


In [4]:
#dollar_df

In [5]:
close = dollar_df.price.copy()
close = close[~close.index.duplicated(keep='first')]

vwap = dollar_df.vwap.copy()
vwap = vwap.loc[close.index]

high = dollar_df.high.copy()
high = high.loc[close.index]

low = dollar_df.low.copy()
low = low.loc[close.index]

volPos = dollar_df.vol_pos.copy()
volPos = volPos.loc[close.index]

volNeg = dollar_df.vol_neg.copy()
volNeg = volNeg.loc[close.index]

In [6]:
%matplotlib
f,ax = plt.subplots()

vwap.plot(ax=ax)
close.plot(ax=ax)

Using matplotlib backend: TkAgg


<matplotlib.axes._subplots.AxesSubplot at 0x7f1552268ed0>

In [7]:
dailyVol = getDailyVol(close, 100)
h0 = dailyVol.mean()# + (dailyVol.max() - dailyVol.mean())

print (dailyVol.std())

0.0081788553329


In [8]:
%matplotlib

f,ax = plt.subplots()
dailyVol.plot(ax=ax)
ax.axhline(h0, color='r')
ax.axhline(h0 + dailyVol.std(), ls='--', color='g')
ax.axhline(h0 - dailyVol.std(), ls='--', color='g')

Using matplotlib backend: TkAgg


<matplotlib.lines.Line2D at 0x7f1551596f10>

In [9]:
df0 = close.index.searchsorted(close.index - pd.Timedelta(days=1))
df0 = df0[df0>0]
df0 = pd.Series(close.index[df0-1], index = close.index[close.shape[0]-df0.shape[0]:])
df0 = close.loc[df0.index] - close.loc[df0.values].values
print df0.std()

%matplotlib

f,ax = plt.subplots()
ax.plot(df0)


314.732804043
Using matplotlib backend: TkAgg


[<matplotlib.lines.Line2D at 0x7f155066f650>]

In [10]:
#p0 = close[:-1]
#p1 = close[1:]

#r = np.divide(p1,p0) - 1


tEvents, bEvents, sEvents = getTEvents(close, h = df0.std())



100%|██████████| 3039/3039 [00:00<00:00, 13907.50it/s] | 1436/3039 [00:00<00:00, 14350.09it/s]


In [11]:
%matplotlib

close_t = close.loc[tEvents]
close_s = close.loc[sEvents]
close_b = close.loc[bEvents]

f,ax = plt.subplots()

close.plot(ax = ax)
close_t.plot(ax = ax, ls = '', marker = 'o', markersize = 8, color = 'r')

#close_s.plot(ax = ax, ls = '', marker = 'o', markersize = 8, color = 'r')
#close_b.plot(ax = ax, ls = '', marker = '^', markersize = 8, color = 'g')

Using matplotlib backend: TkAgg


<matplotlib.axes._subplots.AxesSubplot at 0x7f1550640550>

In [12]:
#%matplotlib

#close_s   = close.loc[sEvents]
#close_p   = close.loc[pEvents]
#close_vol = close.loc[dailyVol.index]

#f,ax = plt.subplots(2, sharex=True, figsize=(10,7))
#dailyVol.plot(ax=ax[0])
#ax[0].axhline(h0, ls='--', color='r')

#close_vol.plot(ax = ax[1])
#close_s.plot(ax = ax[1], ls = '', marker = '^', markersize = 8, color = 'r')
#close_p.plot(ax = ax[1], ls = '', marker = 'o', markersize = 6, color = 'g')


In [13]:
t1 = False#addVerticalBarrier(tEvents, close)

In [14]:
ptSl = [1,1]
target = dailyVol
minRet = 0.01
cpus = cpu_count() - 1

events = getEvents(close, tEvents, ptSl, target, minRet, cpus, t1=t1)

2018-10-28 08:12:10.666285 33.33% applyPtSlOnT1 done after 0.0 minutes. Remaining 0.0 minutes.2018-10-28 08:12:10.667859 66.67% applyPtSlOnT1 done after 0.0 minutes. Remaining 0.0 minutes.2018-10-28 08:12:10.667961 100.0% applyPtSlOnT1 done after 0.0 minutes. Remaining 0.0 minutes.


In [15]:
labels = getBins(events, close)
labels.bin.value_counts()

 1.0    5
-1.0    4
Name: bin, dtype: int64

In [16]:
lbuy = labels[labels['bin'] > 0]
lsell = labels[labels['bin'] < 0]

price_buy = close.loc[lbuy.index]
price_sell = close.loc[lsell.index]

f,ax = plt.subplots()
close.plot(ax=ax)

price_buy.plot(ax=ax, ls = '', marker = '^', markersize = 7, color = 'g')
price_sell.plot(ax=ax, ls = '', marker = 'v', markersize = 7, color = 'r')

<matplotlib.axes._subplots.AxesSubplot at 0x7f155049d7d0>

In [17]:
events0 = events.loc[labels.index]

numCoEvents = mpPandasObj(mpNumCoEvents, ('molecule', events.index), cpu_count(), closeIdx = close.index, t1 = events['t1'])
numCoEvents = numCoEvents.loc[~numCoEvents.index.duplicated(keep='last')]
numCoEvents = numCoEvents.reindex(close.index).fillna(0)

sample_weights = mpPandasObj(mpSampleW, ('molecule', events.index), cpu_count(), t1 = events['t1'], numCoEvents = numCoEvents, close = close)
sample_weights *= sample_weights.shape[0] / sample_weights.sum()

sample_weights0 = sample_weights[labels.index]

2018-10-28 08:12:18.769366 100.0% mpNumCoEvents done after 0.0 minutes. Remaining 0.0 minutes.018-10-28 08:12:18.768147 75.0% mpNumCoEvents done after 0.0 minutes. Remaining 0.0 minutes.
2018-10-28 08:12:18.902847 100.0% mpSampleW done after 0.0 minutes. Remaining 0.0 minutes.018-10-28 08:12:18.901517 75.0% mpSampleW done after 0.0 minutes. Remaining 0.0 minutes.


In [18]:
corr = df_rolling_autocorr(close, 6)
corr = corr[corr <= 1.]
volt = dailyVol

features = pd.DataFrame().assign(volt = volt).assign(corr = corr).assign(vwap = vwap)


In [19]:
f0 = vwap - close
f1 = high - close
f2 = close - low
f3 = volPos
f4 = volNeg

corr = np.corrcoef(f4, f2)[0,1]
print corr

-0.369311163393


In [20]:
%matplotlib

f,ax = plt.subplots()
#f0.plot(ax=ax)
#f1.plot(ax=ax)
#f2.plot(ax=ax)
f3.plot(ax=ax)
f4.plot(ax=ax)



Using matplotlib backend: TkAgg


<matplotlib.axes._subplots.AxesSubplot at 0x7f1550447290>

In [23]:
f0_df = pd.DataFrame(index = f0.index).assign(price = f0)
df0 = fracDiff_FFD(f0_df, 0., 1e-6)

#corr = np.corrcoef(f0_df.loc[df0.index, 'price'], df0['price'])[0,1]
#print corr

#adf = adfuller(df0.price, maxlag = 1, regression = 'c', autolag = None)
#print adf

width: 1; d: 0.000000


In [21]:
f1_df = pd.DataFrame(index = f1.index).assign(price = f1)
df1 = fracDiff_FFD(f1_df, 0., 1e-6)

corr = np.corrcoef(f1_df.loc[df1.index, 'price'], df1['price'])[0,1]
print corr

adf = adfuller(df1.price, maxlag = 1, regression = 'c', autolag = None)
print adf

width: 1; d: 0.000000
1.0
(-35.166227078736597, 0.0, 1, 3037, {'5%': -2.862492156233158, '1%': -3.4325050331831117, '10%': -2.5672768570714237})


In [22]:
f2_df = pd.DataFrame(index = f2.index).assign(price = f2)
df2 = fracDiff_FFD(f2_df, 0., 1e-6)

corr = np.corrcoef(f2_df.loc[df2.index, 'price'], df2['price'])[0,1]
print corr

adf = adfuller(df2.price, maxlag = 1, regression = 'c', autolag = None)
print adf

width: 1; d: 0.000000
1.0
(-32.333488734739966, 0.0, 1, 3037, {'5%': -2.862492156233158, '1%': -3.4325050331831117, '10%': -2.5672768570714237})


In [23]:
f3_df = pd.DataFrame(index = f3.index).assign(price = f3)
df3 = fracDiff_FFD(f3_df, 0., 1e-6)

corr = np.corrcoef(f3_df.loc[df3.index, 'price'], df3['price'])[0,1]
print corr

adf = adfuller(df3.price, maxlag = 1, regression = 'c', autolag = None)
print adf

width: 1; d: 0.000000
1.0
(-28.305034251220469, 0.0, 1, 3037, {'5%': -2.862492156233158, '1%': -3.4325050331831117, '10%': -2.5672768570714237})


In [24]:
f4_df = pd.DataFrame(index = f4.index).assign(price = f4)
df4 = fracDiff_FFD(f4_df, 0., 1e-6)

corr = np.corrcoef(f4_df.loc[df4.index, 'price'], df4['price'])[0,1]
print corr

adf = adfuller(df4.price, maxlag = 1, regression = 'c', autolag = None)
print adf

width: 1; d: 0.000000
1.0
(-27.91643015522946, 0.0, 1, 3037, {'5%': -2.862492156233158, '1%': -3.4325050331831117, '10%': -2.5672768570714237})


In [25]:
#%matplotlib

#f,ax = plt.subplots()
#df2.plot(ax=ax)



In [26]:
features = pd.DataFrame().assign(f0 = df0.loc[labels.index].price).assign(f1 = df1.loc[labels.index].price).assign(f2 = df2.loc[labels.index].price).assign(f3 = df3.loc[labels.index].price).assign(f4 = df4.loc[labels.index].price).assign(lbl = labels.bin)

X = features.drop('lbl',axis=1)
y = features['lbl']

X

Unnamed: 0,f0,f1,f2,f3,f4
2018-08-18 13:06:33.367,1.460629,8.800000,0.000000,5.900637,151.641747
2018-08-18 13:11:25.588,10.735159,12.500000,0.000000,14.215115,146.086130
2018-08-18 13:12:44.462,7.896993,26.500000,0.700000,57.357052,100.437261
2018-08-18 13:13:44.502,-4.707923,8.300000,15.700000,75.457641,84.063412
2018-08-18 13:15:45.992,-5.040718,17.600000,19.431339,42.618608,118.201316
2018-08-18 13:18:10.443,-10.968470,4.500000,24.100000,122.609463,36.900129
2018-08-18 13:24:56.012,19.140555,35.300000,0.200000,63.334386,105.577138
2018-08-18 13:34:22.178,2.116191,17.200000,17.900000,76.543239,84.926098
2018-08-18 13:46:04.003,11.831573,29.400000,0.113181,59.329697,101.677209
2018-08-18 13:47:16.459,5.747064,18.575455,4.100000,13.602720,154.335247


In [27]:
from mpl_toolkits.mplot3d import Axes3D

b = features[features['lbl'] == 1].drop('lbl',axis=1)
s = features[features['lbl'] == -1].drop('lbl',axis=1)

b0 = b['f0'].values
b1 = b['f3'].values
b2 = b['f4'].values

s0 = s['f0'].values
s1 = s['f3'].values
s2 = s['f4'].values

fig = plt.figure()
ax = fig.add_subplot(111, projection = '3d')
ax.scatter(b0, b1, b2, c='r', marker='o')
ax.scatter(s0, s1, s2, c='g', marker='o')




<mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x7f2e5c129050>

In [28]:
from sklearn.model_selection import KFold, RepeatedKFold, LeaveOneOut, LeavePOut, StratifiedKFold, StratifiedShuffleSplit, TimeSeriesSplit

RANDOM_STATE = 777
n_estimator = 600

#cv = 6
#cv = ShuffleSplit(n_splits=6, test_size = 0.3, random_state = 0)
#cv = RepeatedKFold(n_splits = 6, n_repeats = 4, random_state = RANDOM_STATE)
#cv = LeaveOneOut()
#cv = LeavePOut(p = 2)
#cv = StratifiedKFold(n_splits=6)
#cv = StratifiedShuffleSplit(n_splits=6, test_size = 0.3, random_state = RANDOM_STATE)
#cv = TimeSeriesSplit(max_train_size = None, n_splits = 6)

#clf = RandomForestClassifier(max_depth=10, n_estimators=n_estimator, criterion='entropy', random_state=RANDOM_STATE)
#clf = SVC(kernel = 'poly', C = 1)

#scores = cross_val_score(clf, X, y, cv = cv)
#print('Accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std()))

In [24]:
numCoEvents = mpPandasObj(mpNumCoEvents, ('molecule', events.index), cpu_count(), closeIdx = close.index, t1 = events['t1'])
numCoEvents = numCoEvents.loc[~numCoEvents.index.duplicated(keep='last')]
numCoEvents = numCoEvents.reindex(close.index).fillna(0)

out_tW = mpPandasObj(mpSampleTW, ('molecule', events0.index), cpu_count(), t1 = events0['t1'], numCoEvents = numCoEvents)
avgU = out_tW.mean()

###########################################################################################################3
#t1 = events0['t1']

#barIx = t1.index
#barIx = barIx.union(t1.values).drop_duplicates()

#indM = getIndMatrix(barIx, t1)

#avgU = getAvgUniqueness(indM).mean()

#########################################################################################################33

from sklearn.ensemble import GradientBoostingClassifier

#clf = GradientBoostingClassifier(n_estimators = 6000, learning_rate = 1.0, max_depth = 4, random_state = RANDOM_STATE)


#dtc = DecisionTreeClassifier(criterion='entropy', max_features = 'auto', class_weight = 'balanced')
#clf = BaggingClassifier(base_estimator = dtc, n_estimators = 60, max_samples = avgU, max_features = 1.)

#rnf = RandomForestClassifier(n_estimators = 1, criterion = 'entropy', bootstrap = False, class_weight = 'balanced_subsample')
#clf = BaggingClassifier(base_estimator = rnf, n_estimators = 1000, max_samples = avgU, max_features = 1.)

clf = SVC(kernel = 'poly', C = 1)
clf = BaggingClassifier(base_estimator = clf, n_estimators = 10, max_samples = avgU, max_features = 1.)

2018-11-04 13:13:52.484543 100.0% mpNumCoEvents done after 0.0 minutes. Remaining 0.0 minutes.018-11-04 13:13:52.484439 75.0% mpNumCoEvents done after 0.0 minutes. Remaining 0.0 minutes.
2018-11-04 13:13:52.590639 100.0% mpSampleTW done after 0.0 minutes. Remaining 0.0 minutes.018-11-04 13:13:52.589226 75.0% mpSampleTW done after 0.0 minutes. Remaining 0.0 minutes.


In [25]:
scores = cvScore(clf = clf, 
                 X = X, 
                 y = y, 
                 sample_weight = sample_weights0, #pd.Series(index = X.index, data = np.ones(X.shape[0]))
                 scoring = 'accuracy', 
                 t1 = events0['t1'], 
                 cv = 10, 
                 pctEmbargo = 0.01)

print('Accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std()))

NameError: name 'X' is not defined