In [1]:
import sys
import math

import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import missingno as msno
import seaborn as sns
import scipy.stats as stats

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit
from sklearn.metrics import roc_curve, classification_report

from multiprocessing import cpu_count
from tqdm import tqdm, tqdm_notebook

from mpengine import mpPandasObj
from util import cprintf

from zig_zag import zig_zag_df
from statsmodels.tsa.stattools import adfuller
from statsmodels.distributions.empirical_distribution import ECDF

from entropy_features import plugIn, lempelZiv_lib, konto
from structural_breaks import get_bsadf, get_bsadf0
from sample_weights import mpNumCoEvents, mpSampleW, mpSampleTW, getAvgUniqueness, getIndMatrix
from financial_data_structures import dollar_bar_df
from cross_validation_in_finance import PurgedKFold, cvScore
from labeling import getDailyVol, getTEvents, addVerticalBarrier, getEvents, getBins, getBinsOld, df_returns, df_rolling_autocorr
from fractionally_differentiated_features import fracDiff, fracDiff_FFD, plotMinFFD

  from pandas.core import datetools


In [2]:
#filepath = '~/Dev/notebook/lopez/data/bitfinex_BTCUSD_trades.csv'
filepath = '~/Dev/notebook/lopez/data/btcusd_trades.csv'
cols = list(map(str.lower, ['Datetime','Amount','Price','<Unknown>']))
columns = dict(zip(range(len(cols)), cols))

df = pd.read_csv(filepath, header = None).rename(columns = columns).assign(dates = lambda df: (pd.to_datetime(df['datetime'], format='%Y-%m-%d %H:%M:%S.%f'))).assign(dollar_volume=lambda df: df['amount'] * df['price']).assign(volume=lambda df: df['amount']).drop(['datetime', '<unknown>'], axis = 1).set_index('dates').drop_duplicates()
df = df.iloc[::-1]

In [6]:
dollar_M = 1000000
dollar_df = dollar_bar_df(df, 'dollar_volume', dollar_M)
#dollar_df = dollar_df.iloc[0:800]

100%|██████████| 9931276/9931276 [00:02<00:00, 3319201.51it/s]9931276 [00:00<323:26:01,  8.53it/s]


In [7]:
close = dollar_df.price.copy()
close = close[~close.index.duplicated(keep='first')]
len(close)

38294

In [8]:
features = pd.DataFrame(index = dollar_df.index, columns = ['price'])
features.price = close

In [9]:
def add_aggressor_side_entropy0(features_df, raw_df):
    features_df['aggressor_side_entropy0'] = pd.Series(index = features_df.index)
    
    for i in tqdm(range(len(features_df.index)-1)):
        p = raw_df[features_df.index[i]:features_df.index[i+1]].price
        p = p.diff()
        p = (p / np.abs(p)).fillna(0) + 1
    
        msg = ''
        for j in p:
            msg = msg + str(int(j))

        e = konto(msg) 
        features_df.aggressor_side_entropy0[features_df.index[i + 1]] = e['h']
    
    return features_df



In [10]:
def add_aggressor_side_entropy1(features_df, raw_df):
    features_df['aggressor_side_entropy1'] = pd.Series(index = features_df.index)
    
    for i in tqdm(range(len(features_df.index)-1)):
        vol = raw_df[features_df.index[i]:features_df.index[i+1]].volume
        vol = vol / np.abs(vol)
        vol = (vol + 1) / 2
    
        msg = ''
        for j in vol:
            msg = msg + str(int(j))

        e = konto(msg) 
        features_df.aggressor_side_entropy1[features_df.index[i + 1]] = e['h']
    
    return features_df

In [11]:
def add_bsadf(features_df):
    features_df['bsadf'] = pd.Series(index = features_df.index)
    
    price = features_df.price
    logp = np.log(price)
    
    for i in tqdm(range(1,logp.shape[0])):
        index = features_df.index[i]        
        features_df.bsadf[index] = get_bsadf0(logP = logp[0:i], minSL = 100, constant ='ctt', lags = 16)['gsadf']

    return features_df

In [12]:
def get_quantile_encoding_msg(series, quantile = 10):
    Q = np.linspace(.0, 1., quantile + 1)    
    
    msg = '';
    for x in tqdm(series):
        for i in range(len(Q)-1):
          
            q0 = series.quantile(Q[i])
            q1 = series.quantile(Q[i+1])
            
            if x >= q0 and x < q1:
                msg = msg + str(int(i))
                break
                
    return msg

def add_returns_entropy(features_df, q = 2):
    features_df['returns_entropy'] = pd.Series(index = features_df.index)
    
    s0 = features_df.price[0:-2]
    s1 = features_df.price[1:-1]

    r = np.log(np.divide(s1,s0))
    msg = get_quantile_encoding_msg(r, q)
    
    for i in tqdm(range(100, len(msg))):
        msg0 = msg[0:i]
        e = konto(msg0)
        
        index = features_df.index[i]  
        features_df.returns_entropy[index] = e['h']
      
    return features_df

In [13]:
def add_buys_volume_entropy(features_df, raw_df, q = 2):
    features_df['buys_volume_entropy'] = pd.Series(index = features_df.index)
    
    s = pd.Series(index = features_df.index)

    for i in tqdm(range(len(features_df.index)-1)):
        df0 = raw_df[features_df.index[i]:features_df.index[i + 1]]

        Vb = df0[df0.volume > 0].volume.sum()
        V = df0.volume.abs().sum()

        index = features_df.index[i+1] 
        s[index] = Vb/V
    
    
    s = s.dropna()
    msg = get_quantile_encoding_msg(s, 2)

    for i in tqdm(range(100, len(msg))):
            msg0 = msg[0:i]
            e = konto(msg0)

            index = features_df.index[i]
            features_df.buys_volume_entropy[index] = e['h']
    
    return features_df

In [14]:
def add_kyles_lambda(features_df, raw_df):    
    features_df['kyles_lambda'] = pd.Series(index = features_df.index)
    
    from sklearn import linear_model
    
    for i in tqdm(range(len(features_df.index)-1)):
        df0 = raw_df[features_df.index[i]:features_df.index[i + 1]]
        
        dP = df0.price.diff()
        dP = dP.dropna()

        bV = df0.volume[dP.index]

        dP = np.array(dP)
        bV = np.array(bV)

        bV = bV.reshape(-1, 1)

        regr = linear_model.LinearRegression()
        regr.fit(bV, dP)
        
        index = features_df.index[i]
        features_df.kyles_lambda[index] = regr.coef_
        
    return features_df

In [15]:
def add_amihuds_lambda(features_df, raw_df, start = 100):
    features_df['amihuds_lambda'] = pd.Series(index = features_df.index)

    dLogP = np.abs(np.log(features_df.price).diff())
    dLogP = dLogP.dropna()
    
    bV = pd.Series(index = dLogP.index)
    
    from sklearn import linear_model
    
    for i in tqdm(range(len(features_df.index)-1)):
        index0 = features_df.index[i]
        index1 = features_df.index[i + 1]
        
        df0 = raw_df[index0:index1]        
        bV[index1] = df0.volume.sum()
    
    for i in tqdm(range(start,len(dLogP.index))):        
        X = bV.iloc[:i]
        Y = dLogP.iloc[:i]
        
        X = np.array(X)
        Y = np.array(Y)
        
        X = X.reshape(-1, 1)
        
        regr = linear_model.LinearRegression()
        regr.fit(X, Y)
    
        index = dLogP.index[i]
        features_df.amihuds_lambda[index] = regr.coef_
        
    return features_df

In [16]:
def add_vpin(features_df, raw_df):
    features_df['vpin'] = pd.Series(index = features_df.index)
    
    Vb = pd.Series(index = features_df.index)
    Vs = pd.Series(index = features_df.index)
    V  = pd.Series(index = features_df.index)
    
    for i in tqdm(range(len(features_df.index)-1)):
        index0 = features_df.index[i]
        index1 = features_df.index[i + 1]
        
        df0 = raw_df[index0:index1]
        
        dfB = df0[df0.volume > 0]
        dfS = df0[df0.volume < 0]
        
        Vb[index1] = np.abs(dfB.volume).sum()
        Vs[index1] = np.abs(dfS.volume).sum()
        
        V[index1] = Vb[index1] + Vs[index1]
        
    V  = V.dropna()
    Vb = Vb.dropna()
    Vs = Vs.dropna()
        
    for i in tqdm(range(1, len(features_df.index))):    
        Vb0 = Vb.iloc[:i]
        Vs0 = Vs.iloc[:i]
        V0  = V.iloc[:i]
        
        index = features_df.index[i]
        features_df.vpin[index] = np.abs(np.subtract(Vb0, Vs0)).sum() / V0.sum()
        
    return features_df        

In [17]:
features = add_vpin(features, df)
features.vpin = features.vpin - features.vpin.mean()

100%|██████████| 38293/38293 [00:52<00:00, 728.24it/s]  | 1/38293 [00:00<6:40:56,  1.59it/s]
100%|██████████| 38293/38293 [00:25<00:00, 1477.60it/s]


In [18]:
features = add_amihuds_lambda(features, df)
features.amihuds_lambda = features.amihuds_lambda - features.amihuds_lambda.mean()

100%|██████████| 38293/38293 [00:11<00:00, 3388.98it/s] | 298/38293 [00:00<00:12, 2975.87it/s]
100%|██████████| 38193/38193 [00:35<00:00, 1072.42it/s]


In [20]:
features = add_kyles_lambda(features, df)
features.kyles_lambda = features.kyles_lambda - features.kyles_lambda.mean()

  0%|          | 0/38293 [00:00<?, ?it/s]Exception KeyError: KeyError(<weakref at 0x7fc9e29c4f70; to 'tqdm' at 0x7fca207030d0>,) in <bound method tqdm.__del__ of   0%|          | 0/38293 [00:00<?, ?it/s]> ignored


ValueError: cannot reindex from a duplicate axis

In [17]:
#%matplotlib
#f, ax = plt.subplots(2)

#features.price.plot(ax=ax[0])
#features.vpin.plot(ax=ax[1])
#features.kyles_lambda.plot(ax=ax[1])
#features.amihuds_lambda.plot(ax=ax[2])

#plt.scatter(bV, dP)
#plt.plot(bV, regr.predict(bV))



In [18]:
features = add_buys_volume_entropy(features, df, q = 4)
features.buys_volume_entropy = features.buys_volume_entropy - features.buys_volume_entropy.mean()


100%|██████████| 799/799 [00:00<00:00, 1017.28it/s]   | 93/799 [00:00<00:00, 929.41it/s]
100%|██████████| 799/799 [00:00<00:00, 1895.88it/s]
100%|██████████| 693/693 [05:13<00:00,  2.21it/s]


In [13]:
#%matplotlib
#f, ax = plt.subplots(2)

#features.buys_volume_entropy.plot(ax=ax[0])
#features.price.plot(ax=ax[1])

#corr = np.corrcoef(features.buys_volume_entropy, features.price)[0,1]
#print corr

Using matplotlib backend: TkAgg
nan


In [19]:
features = add_bsadf(features)
features.bsadf = features.bsadf - features.bsadf.mean()

100%|██████████| 799/799 [05:37<00:00,  2.37it/s]     | 108/799 [00:00<00:00, 1003.10it/s]


In [20]:
features = add_returns_entropy(features)
features.returns_entropy = features.returns_entropy - features.returns_entropy.mean()

100%|██████████| 798/798 [00:00<00:00, 1835.41it/s]   | 175/798 [00:00<00:00, 1737.82it/s]
100%|██████████| 697/697 [07:28<00:00,  1.56it/s]


In [21]:
#features = add_aggressor_side_entropy0(features, df)
#features.aggressor_side_entropy0 = features.aggressor_side_entropy0 - features.aggressor_side_entropy0.mean()

features = add_aggressor_side_entropy1(features, df)
features.aggressor_side_entropy1 = features.aggressor_side_entropy1 - features.aggressor_side_entropy1.mean()


100%|██████████| 799/799 [03:08<00:00,  4.25it/s]     | 3/799 [00:00<01:32,  8.63it/s]


In [23]:
features = features.dropna()
features.columns

#features.bsadf = features.bsadf - features.bsadf.mean()


Index([u'price', u'vpin', u'amihuds_lambda', u'kyles_lambda'], dtype='object')

In [22]:
%matplotlib
f, ax = plt.subplots(3)

features.price.plot(ax=ax[0], title = 'Price')
features.vpin.plot(ax=ax[1], title = 'vpin')
features.amihuds_lambda.plot(ax=ax[2], title = 'amihuds_lambda')
#features.kyles_lambda.plot(ax=ax[3], title = 'kyles_lambda')
#features.buys_volume_entropy.plot(ax=ax[4], title = 'buys_volume_entropy')
#features.bsadf.plot(ax=ax[5], title = 'bsadf')
#features.returns_entropy.plot(ax=ax[6], title = 'returns_entropy')
#features.aggressor_side_entropy1.plot(ax=ax[7], title = 'aggressor_side_entropy1')

Using matplotlib backend: TkAgg


TypeError: Empty 'DataFrame': no numeric data to plot

In [54]:
h0_0 = features.aggressor_side_entropy1.std()
h0_1 = features.bsadf.std()

tEvents0, _, _ = getTEvents(features.aggressor_side_entropy1, h = 3. * h0_0)
tEvents1, _, _ = getTEvents(features.bsadf, h = h0_1)


  0%|          | 0/691 [00:00<?, ?it/s]100%|██████████| 691/691 [00:00<00:00, 10320.39it/s]
100%|██████████| 691/691 [00:00<00:00, 9902.40it/s]


In [55]:
#corr = np.corrcoef(features.vpin, features.price)[0,1]
#print corr

In [56]:
%matplotlib

close_p = features.price
close_t0 = features.price.loc[tEvents0]
close_t1 = features.price.loc[tEvents1]

f,ax = plt.subplots()

close_p.plot(ax = ax)
#close_t0.plot(ax = ax, ls = '', marker = 'o', markersize = 8, color = 'r')
close_t1.plot(ax = ax, ls = '', marker = 'v', markersize = 8, color = 'r')



Using matplotlib backend: TkAgg


<matplotlib.axes._subplots.AxesSubplot at 0x7fc182cdc4d0>

In [69]:
t1 = False
ptSl = [1,1]
target = getDailyVol(features.price, 100)
minRet = 0.01
cpus = cpu_count() - 1

events = getEvents(features.price, tEvents, ptSl, target, minRet, cpus, t1=t1)

2018-11-04 12:42:01.017260 33.33% applyPtSlOnT1 done after 0.0 minutes. Remaining 0.0 minutes.2018-11-04 12:42:01.045902 66.67% applyPtSlOnT1 done after 0.0 minutes. Remaining 0.0 minutes.2018-11-04 12:42:01.047946 100.0% applyPtSlOnT1 done after 0.0 minutes. Remaining 0.0 minutes.


In [70]:
labels = getBins(events, features.price)
labels.bin.value_counts()

 1.0    58
-1.0    41
Name: bin, dtype: int64

In [71]:
%matplotlib

lbuy = labels[labels['bin'] > 0]
lsell = labels[labels['bin'] < 0]

price_buy = close.loc[lbuy.index]
price_sell = close.loc[lsell.index]

f,ax = plt.subplots()
close.plot(ax=ax)

price_buy.plot(ax=ax, ls = '', marker = '^', markersize = 7, color = 'g')
price_sell.plot(ax=ax, ls = '', marker = 'v', markersize = 7, color = 'r')

Using matplotlib backend: TkAgg


<matplotlib.axes._subplots.AxesSubplot at 0x7fc196602250>

In [74]:
features.index

DatetimeIndex(['2018-08-18 07:46:53.286000', '2018-08-18 07:53:46.468000',
               '2018-08-18 08:03:36.403000', '2018-08-18 08:37:58.223000',
               '2018-08-18 09:28:56.130000', '2018-08-18 10:10:45.035000',
               '2018-08-18 11:02:25.933000', '2018-08-18 11:30:02.515000',
               '2018-08-18 12:37:31.197000', '2018-08-18 12:53:31.956000',
               ...
               '2018-08-23 07:29:42.535000', '2018-08-23 07:40:32.223000',
               '2018-08-23 08:05:02.421000', '2018-08-23 08:45:07.964000',
               '2018-08-23 09:17:21.299000', '2018-08-23 09:24:43.107000',
               '2018-08-23 09:26:13.295000', '2018-08-23 09:42:24.280000',
               '2018-08-23 10:14:36.584000', '2018-08-23 10:28:30.312000'],
              dtype='datetime64[ns]', name=u'dates', length=692, freq=None)