# Testing high-frequency predictions



In [10]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides any deprecation warnings emitted by the
# libraries used in the cells below, so API breakage will surface as hard errors.
warnings.filterwarnings("ignore")

In [11]:
import pandas as pd
%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt 
from pandas import datetime
from pandas.plotting import autocorrelation_plot
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm
from numba import jit
from scipy import stats

In [12]:
# from plotly import __version__
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# import plotly.graph_objs as go
# import plotly.grid_objs as gd
# import seaborn as sns
# import cufflinks as cf
# init_notebook_mode(connected=True)
# cf.go_offline()
# cf.set_config_file(offline=False, world_readable=True, theme='polar')

In [13]:
# Display and formatting
# Let pandas render up to 200 columns and 1200 rows before truncating output.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 1200)
# palette = sns.color_palette('deep', 5)
# palette[1], palette[2] = palette[2], palette[1]
# sns.set_style('darkgrid')
# Render matplotlib figures inline (the same magic is also issued in the imports cell).
%matplotlib inline

In [14]:
# Read the merged CSV and key its rows by the "Date_Time" column.
merged_csv_path = "/Volumes/GoogleDrive/Shared drives/Data/gasoil + brent/merged.csv"
co_5_trades = pd.read_csv(merged_csv_path).set_index("Date_Time")

In [15]:
co_5_trades.head()

Unnamed: 0_level_0,Price,Volume,Bid Price,Bid Size,Ask Price,Ask Size
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-02-02 01:00:00.686,70.0,3,69.96,4.0,70.0,47.0
2018-02-02 01:00:00.686,70.0,3,69.96,4.0,70.0,47.0
2018-02-02 01:00:00.686,70.0,1,69.96,4.0,70.0,47.0
2018-02-02 01:00:00.686,70.0,1,69.96,4.0,70.0,47.0
2018-02-02 01:00:00.686,70.0,1,69.96,4.0,70.0,47.0


In [None]:
# Load trades
trade_dtypes = {
    "Date": str,
    "Time": str,
    "Price": np.float64,
    "Volume": np.float64,
    "Market Flag": str,
}
co_5_trades = pd.read_csv(
    "/Volumes/GoogleDrive/Shared drives/Data/gasoil + brent/merged.csv",
    dtype=trade_dtypes,
)

# Build a datetime index from the separate "Date" and "Time" text columns,
# discard those two columns, and sort the rows by that index.
co_5_trades = (
    co_5_trades
    .assign(
        timestamp=lambda frame: pd.to_datetime(
            frame["Date"] + " " + frame["Time"], format="%m/%d/%Y %H:%M:%S.%f"
        )
    )
    .set_index("timestamp")
    .drop(["Date", "Time"], axis=1)
    .sort_index()
)

In [None]:
# Load order books
quote_dtypes = {
    "Date": str,
    "Time": str,
    "Bid Price": np.float64,
    "Bid Size": np.float64,
    "Ask Price": np.float64,
    "Ask Size": np.float64,
    "Market Flag": str,
    "Quote Condition": str,
}
co_5_ob = pd.read_csv(
    "C:\\TickData\\TickWrite7\\DATA\\fivedays\\quotes_CO10-14.09.2018.csv",
    dtype=quote_dtypes,
)

# Build a datetime index from the separate "Date" and "Time" text columns,
# discard those two columns, and sort the rows by that index.
co_5_ob = (
    co_5_ob
    .assign(
        timestamp=lambda frame: pd.to_datetime(
            frame["Date"] + " " + frame["Time"], format="%m/%d/%Y %H:%M:%S.%f"
        )
    )
    .set_index("timestamp")
    .drop(["Date", "Time"], axis=1)
    .sort_index()
)

In [None]:
# Discard the flag/condition columns that are not used further down
co_5_trades = co_5_trades.drop(columns=["Market Flag"])
co_5_ob = co_5_ob.drop(columns=["Quote Condition", "Market Flag"])

In [None]:
# Rename columns
# These assignments are positional: they require the trades frame to have exactly
# two columns and the order-book frame exactly four, in the order listed here.
co_5_trades.columns = ["price", "volume"]
co_5_ob.columns = ["bid_price", "bid_size", "ask_price", "ask_size"]

In [None]:
# Review data
co_5_trades.head()

In [None]:
co_5_ob.head()

In [None]:
# NOTE: Convert size in terms of dollar amount

In [16]:
# Work on copies so the loaded frames stay untouched.
# Requires `co_5_trades` and `co_5_ob` to have been loaded by the cells above.
trades = co_5_trades.copy()
ob = co_5_ob.copy()

# Round every timestamp UP to the next 10 ms boundary; for the order book keep
# only the last quote observed inside each 10 ms bucket.
trades_idx = trades.index.copy()  # original (un-rounded) trade timestamps
trades.index = trades.index.ceil("10ms")
ob.index = ob.index.ceil("10ms")
ob = ob.resample("10ms").last()

# Regular 10 ms grid spanning the first to the last (rounded) trade timestamp
df_ts = pd.DataFrame(index=pd.date_range(trades.index.min(), trades.index.max(), freq="10ms"))
df_ts.index.name = "timestamp"
ob = df_ts.join(ob, how="left")

# Carry the last known book state forward into grid points without a quote.
# `ffill()` replaces `fillna(method="ffill", inplace=True)`, whose `method`
# argument is deprecated in recent pandas releases.
ob = ob.ffill()

NameError: name 'co_5_ob' is not defined

In [None]:
# Merge data frames
# Left join on the index: every trade row is kept and receives the order-book
# columns of the grid row whose timestamp equals the trade's rounded timestamp.
taq = trades.join(ob, how="left")

In [None]:
taq.info(verbose=True)

In [None]:
taq.head()

In [None]:
taq.loc["2018-09-10 14:00"].head(10)

In [None]:
taq.head()

In [None]:
# features engineering
bid_px, ask_px = taq['bid_price'], taq['ask_price']
bid_qty, ask_qty = taq['bid_size'], taq['ask_size']

# Plain midpoint of the bid and ask prices
taq['mid'] = (bid_px + ask_px) / 2.0

# Size-weighted mid: each price is weighted by the size quoted on the opposite side
cross_weighted_sum = bid_px * ask_qty + ask_px * bid_qty
taq['volume_weighted_mid'] = cross_weighted_sum / (bid_qty + ask_qty)

# Signed volume: negative when the trade price is below the mid, positive otherwise
traded_below_mid = taq['price'] < taq['mid']
taq['direction_volume'] = np.where(traded_below_mid, -taq['volume'], taq['volume'])


In [None]:
# Parameters
# Number of consecutive rows summed in the rolling `direction_volume` window
rolling_length = 10
# Threshold on that rolling sum: above +flow_amount gives a long signal,
# below -flow_amount gives a short signal
flow_amount = 10

In [None]:
# Prediction model using flow
# Net signed volume over the trailing `rolling_length` rows (computed once, used twice)
net_flow = taq['direction_volume'].rolling(rolling_length).sum()
taq['long_sig'] = np.where(net_flow > flow_amount, 1, 0)
taq['short_sig'] = np.where(net_flow < -flow_amount, -1, 0)

mid_returns = taq['mid'].pct_change()
taq['returns'] = mid_returns
taq['signal'] = taq['long_sig'] + taq['short_sig']

# Each row's P&L is the previous row's signal applied to the current row's return
taq['long_pnl'] = taq['long_sig'].shift(1) * mid_returns
taq['short_pnl'] = taq['short_sig'].shift(1) * mid_returns
taq['total_pnl'] = taq['long_pnl'] + taq['short_pnl']

# Compound the per-row P&L into a cumulative curve and plot it
growth_factors = 1 + taq['total_pnl']
taq['cum_pnl'] = growth_factors.cumprod()
taq['cum_pnl'].plot()
plt.show()

## Order book imbalance


In [None]:
def ofi(quotes, tick_size=0.01):
    """Compute the order flow imbalance (OFI) for each row of a quote table.

    Parameters
    ----------
    quotes : pandas.DataFrame
        Must contain the columns ``bid_price``, ``bid_size``, ``ask_price``
        and ``ask_size``. The frame is copied, never modified.
    tick_size : float, default 0.01
        Divisor used to express the mid-price change in ticks.

    Returns
    -------
    pandas.DataFrame
        A copy of ``quotes`` with these extra columns:

        * ``mid_change`` - row-to-row change of the mid price, divided by ``tick_size``
        * ``prev_bidprice``, ``prev_bidsize``, ``prev_askprice``, ``prev_asksize`` -
          the previous row's quote
        * ``ofi`` - sum of four contributions per row:
          ``+bid_size`` when the bid price did not fall, ``-prev_bidsize`` when it
          did not rise, ``+prev_asksize`` when the ask price did not fall and
          ``-ask_size`` when it did not rise.

    Notes
    -----
    Infinite values are turned into NaN and all NaNs are forward- then
    back-filled, so the first row takes its ``prev_*`` and ``mid_change`` values
    from the second row.
    """
    qdf = quotes.copy()

    qdf['mid_change'] = ((qdf['bid_price'] + qdf['ask_price']) / 2.0).diff().div(tick_size)

    qdf['prev_bidprice'] = qdf['bid_price'].shift()
    qdf['prev_bidsize'] = qdf['bid_size'].shift()
    qdf['prev_askprice'] = qdf['ask_price'].shift()
    qdf['prev_asksize'] = qdf['ask_size'].shift()

    # Fix any missing/invalid data. `-np.inf` is used because `np.NINF` no longer
    # exists in NumPy 2.x; `ffill`/`bfill` replace the deprecated `fillna(method=...)`.
    qdf = qdf.replace([np.inf, -np.inf], np.nan).ffill().bfill()

    bid_geq = qdf['bid_price'] >= qdf['prev_bidprice']
    bid_leq = qdf['bid_price'] <= qdf['prev_bidprice']
    ask_geq = qdf['ask_price'] >= qdf['prev_askprice']
    ask_leq = qdf['ask_price'] <= qdf['prev_askprice']

    # Build the column in one vectorised expression. The former chained assignment
    # (`qdf['ofi'].loc[mask] += ...`) writes to a temporary object and silently
    # leaves the column at zero when pandas Copy-on-Write is active.
    qdf['ofi'] = (
        qdf['bid_size'].where(bid_geq, 0.0)
        - qdf['prev_bidsize'].where(bid_leq, 0.0)
        + qdf['prev_asksize'].where(ask_geq, 0.0)
        - qdf['ask_size'].where(ask_leq, 0.0)
    ).astype(float)

    return qdf

In [None]:
# Review TAQ data
taq.head()

In [None]:
ofi_df = ofi(taq)

In [None]:
ofi_df.head()

## To test
What is the next trade going to be? Buy or sell? 

If we know the direction of the next trade, we can predict the likelihood of a fill.

# Logistic regression

http://www.chloe-hsu.com/html/logistic_regression

https://towardsdatascience.com/building-a-logistic-regression-in-python-step-by-step-becd4d56c9c8



In [None]:
autocorrelation_plot(taq['mid'])

In [None]:
# Fit an ARIMA model of order (5, 1, 5) to the mid price and print the fit summary.
# NOTE(review): `ARIMA` is imported from `statsmodels.tsa.arima_model` and `disp=0`
# belongs to that legacy interface; newer statsmodels releases no longer ship that
# module - confirm the installed version before re-running.
model = ARIMA(taq['mid'], order=(5,1,5))
model_fit = model.fit(disp=0)
print(model_fit.summary())

In [None]:
# Reference: https://www.machinelearningplus.com/time-series/arima-model-time-series-forecasting-python/

# GARCH 

In [None]:
from arch import arch_model

# GARCH(1,1) with normal innovations fitted to the mid-price returns.
# `taq['returns']` comes from `pct_change()`, whose first row is NaN, so the
# missing entries are dropped before the series is handed to the model.
model=arch_model(taq['returns'].dropna(), vol='Garch', p=1, o=0, q=1, dist='Normal')
results=model.fit()
print(results.summary())



In [None]:
# won't run, not enough memory

# Simulation-based forecast from the fitted model (horizon=5, simulations=100)
forecasts = results.forecast(horizon=5, method='simulation', simulations=100)

# Keep the simulations attached to the forecast object for the plots below
sims = forecasts.simulations


In [None]:
# Plot the transposed slice `sims.values[-1, :, :]`
# (presumably one line per simulated path for the last observation - TODO confirm the array layout).
# NOTE(review): a label is set on the first line but no legend is drawn in this cell.
lines = plt.plot(sims.values[-1,:,:].T, color='blue', alpha=0.01)
lines[0].set_label('Simulated paths')
plt.show()



In [None]:
# Print the 5th percentile of the slice `sims.values[-1, :, -1]`, then draw a
# 50-bin histogram of that same slice
# (presumably the simulated values at the final horizon step - TODO confirm).
print(np.percentile(sims.values[-1,:,-1].T,5))
plt.hist(sims.values[-1, :,-1],bins=50)
plt.title('Distribution of Returns')
plt.show()

# From here on, the notebook needs re-doing

In [None]:

# Add one rolling-sum column of `Direction_Volume` per look-back length
windows = [10, 20, 50]
direction_volume = co_merged_1m['Direction_Volume']
for window in windows:
    flow_column = "trade_flow_{}".format(window)
    co_merged_1m[flow_column] = direction_volume.rolling(window).sum()

In [None]:
# Mid price between 06:00 and 18:00 on 2018-02-02, with gaps filled from the
# previous observation. `.ffill()` returns a new Series, so the slice taken from
# `co_merged_1m` is never modified in place, and the deprecated
# `fillna(method="ffill", inplace=True)` call is avoided.
X = co_merged_1m['Mid_Price'].loc['2018-02-02 06:00:00':'2018-02-02 18:00:00'].ffill()

# X.replace([np.nan, np.inf], 0.0, inplace=True) 

# Arima

In [None]:
# Walk-forward evaluation: for every test observation, refit ARIMA(5,1,0) on the
# history seen so far, record element 0 of forecast(), then append the realised
# value to the history.
size = int(len(X) * 0.70)
# Explicit positional split: first 70% for training, the remainder for testing
train, test = X.iloc[:size], X.iloc[size:]
history = list(train)
predictions = []
# Iterate over the values directly. `test[t]` would be a label lookup on a
# non-integer index, and pandas' positional fallback for that is deprecated.
for obs in test:
    model = ARIMA(history, order=(5,1,0))
    model_fit = model.fit(disp=0)
    output = model_fit.forecast()
    yhat = output[0]
    predictions.append(yhat)
    history.append(obs)
    print('predicted=%f, expected=%f' % (yhat, obs))

In [None]:
plt.plot(predictions)
plt.show()

In [None]:
print(size)

In [None]:
# this is a separate rolling ARIMA

In [None]:
def StartARIMAForecasting(Actual, P, D, Q):
    """Fit an ARIMA model of order (P, D, Q) to `Actual` and return element 0 of its forecast() result."""
    fitted = ARIMA(Actual, order=(P, D, Q)).fit(disp=0)
    return fitted.forecast()[0]


In [None]:
    
ActualData = X
#Size
NumberOfElements = len(ActualData)


In [None]:

#Use 70% of data as training, rest 30% to Test model
TrainingSize = int(NumberOfElements * 0.7)
TrainingData = ActualData[0:TrainingSize]
TestData = ActualData[TrainingSize:NumberOfElements]



In [None]:
#new arrays to store actual and predictions
Actual = [x for x in TrainingData]
Predictions = list()



In [None]:

# Walk through the test set: forecast from the history so far, then reveal the
# actual value and add it to the history.
# Iterating over the values replaces `TestData[timepoint]`, which would be a label
# lookup on a non-integer index (pandas' positional fallback for it is deprecated).
for ActualValue in TestData:
    # forecast value
    Prediction = StartARIMAForecasting(Actual, 5, 1, 0)
    print('Actual=%f, Predicted=%f' % (ActualValue, Prediction))
    # add it to the list
    Predictions.append(Prediction)
    Actual.append(ActualValue)



In [None]:
#Print MSE to see how good the model is
# Error = mean_squared_error(TestData, Predictions)
# print('Test Mean Squared Error (smaller the better fit): %.3f' % Error)

plt.plot(Predictions)
plt.show()

In [None]:
# Reference: https://machinelearningmastery.com/arima-for-time-series-forecasting-with-python/

In [None]:
# Reference: https://towardsdatascience.com/forecasting-exchange-rates-using-arima-in-python-f032f313fc56

In [None]:
# Reference: https://machinelearningmastery.com/make-sample-forecasts-arima-python/