# Prediction of short-term mid-price movement direction based on LOB data

*** Disclaimer
The information and materails provided here are not intended to be and do not constitute financial advice, investment advice, trading advice or any other advice or recommendation of any sort.

********
Download the limit order book sample data from https://lobsterdata.com/info/DataSamples.php
********

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import os as os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

### change the data path according where you store them 
os.chdir("/content/drive/My Drive/Colab Notebooks/session 2")

orderbook = "GOOG_2012-06-21_34200000_57600000_orderbook_10.csv"
message = "GOOG_2012-06-21_34200000_57600000_message_10.csv"


In [0]:
num_levels = 10

header_list = []
for i in range(num_levels):
    header_list = header_list + ["Pa%d"%(i+1),"Va%d"%(i+1),"Pb%d"%(i+1),"Vb%d"%(i+1)]
df_orderbook = pd.read_csv(orderbook,header=None,names=header_list)

df_message = pd.read_csv(message,usecols = [0,1,3,4,5], names=['time', 'type','size','price','direction'])
df_message.index = pd.Timestamp(datetime.date.today()) + pd.TimedeltaIndex(df_message.time, unit='s')
df_orderbook.index = df_message.index

In [0]:
df_message.head()

Unnamed: 0,time,type,size,price,direction
2020-02-10 09:30:00.015105074,34200.015105,4,4,5794000,1
2020-02-10 09:30:00.059901970,34200.059902,4,300,5794000,1
2020-02-10 09:30:00.113246707,34200.113247,5,1,5795100,1
2020-02-10 09:30:00.113246707,34200.113247,5,1,5795000,1
2020-02-10 09:30:00.113246707,34200.113247,5,1,5794900,1


In [0]:
#binary function
def labelling(a):
    if a > 0:
        b = 1
    else:
        b= 0
    return b

In [0]:
# Spreads and mid-prices
def feature_v2(num_levels,df): # 20
    for i in range(1,num_levels+1):
        df["spread%d"%(i)] = df["Pa%d"%(i)] - df["Pb%d"%(i)]
        df["midprice%d"%(i)] = (df["Pa%d"%(i)] + df["Pb%d"%(i)])/2
    return df

def feature_v3(num_levels,df): # 20 - 2
    for i in range(1, num_levels):
        df["PA_diff%d"%(i)] = df["Pa%d"%(i+1)] - df["Pa%d"%(i)]
        df["PB_diff%d"%(i)] = df["Pb%d"%(i)] - df["Pb%d"%(i+1)]
    return df

def feature_v4(num_levels,df): # 4
    lst = ["Pa%d"%(i+1) for i in range(num_levels)]
    df["Pa_mean"] = df[df.columns.intersection(lst)].sum(axis=1)    
    
    lst = ["Pb%d"%(i+1) for i in range(num_levels)]
    df["Pb_mean"] = df[df.columns.intersection(lst)].sum(axis=1)
    
    lst = ["Va%d"%(i+1) for i in range(num_levels)]
    df["Va_mean"] = df[df.columns.intersection(lst)].sum(axis=1)
    
    lst = ["Vb%d"%(i+1) for i in range(num_levels)]
    df["Vb_mean"] = df[df.columns.intersection(lst)].sum(axis=1)
    return df

def feature_v5(num_levels,df): # 20
    for i in range(num_levels): #
        df["pri_accum_diff%d"%(i+1)] = 0
        df["vol_accum_diff%d"%(i+1)] = 0
        for k in range(i):
            df["pri_accum_diff%d"%(i+1)] += (df["Pa%d"%(k+1)] - df["Pb%d"%(k+1)])
            df["vol_accum_diff%d"%(i+1)] += (df["Va%d"%(i+1)] - df["Vb%d"%(i+1)])
    return df

def normalize_input(X_train, X_test):
    scaler = StandardScaler()
    # Fit on training set only.
    scaler.fit(X_train)
    # Apply transform to both the training set and the test set.
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test

In [0]:
df_orderbook = feature_v2(num_levels,df_orderbook)
df_orderbook = feature_v3(num_levels,df_orderbook)
df_orderbook = feature_v4(num_levels,df_orderbook)
df_orderbook = feature_v5(num_levels,df_orderbook)



from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score, confusion_matrix
def model_scoring(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print("======================================")
    print("======Model Performance===============")

    print("the accuracy is: ", accuracy)
    print("the precision is：",precision)
    print("the recall is: ", recall)
    print("the f1 score is: ", f1)
    print("confution matrix: \n", confusion_matrix(y_test, y_pred))
    
def split_sequence(X_sequence, y_sequence, n_steps):
    X, y = list(), list()
    for i in range(0, len(X_sequence)-n_steps+1):
        seq_x, seq_y = X_sequence[i: i+n_steps], y_sequence[i + n_steps - 1]
        X.append(seq_x)
        y.append(seq_y)
    return np.array(X), np.array(y)

####################################################################
####################################################################

df_orderbook1s = df_orderbook.resample('1S').first()
df_orderbook1s["price_10min"] = df_orderbook1s['midprice1'].shift(-600)
df_orderbook1s.dropna(inplace=True)
df_orderbook1s["price_change"] = df_orderbook1s["price_10min"] - df_orderbook1s['midprice1']

X = df_orderbook1s.drop(['price_10min', 'price_change'], axis=1).values
y = np.array(list(map(labelling, df_orderbook1s["price_change"])))
####################################################################
####################################################################

In [0]:
X.shape

(10743, 102)

In [0]:
y.shape

(10743,)

In [0]:
# logistic regression
from sklearn.linear_model import LogisticRegression
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test = normalize_input(X_train, X_test)
clf = LogisticRegression(random_state=0, 
                         penalty='l2',
                         solver='lbfgs',
                         max_iter = 100000,
                         multi_class='ovr').fit(X_train, y_train)
print("training score.", clf.score(X_train, y_train))
print("testing score.", clf.score(X_test, y_test))
y_pred = clf.predict(X_test)
model_scoring(y_test, y_pred)

training score. 0.686968085106383
testing score. 0.6928327645051194
the accuracy is:  0.6928327645051194
the precision is： 0.5833333333333334
the recall is:  0.10294117647058823
the f1 score is:  0.17500000000000002
confution matrix: 
 [[2128   75]
 [ 915  105]]


In [0]:
############################
 ######### XGBoost ########
############################
import xgboost as xgb
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test = normalize_input(X_train, X_test)
xgb_clf = xgb.XGBClassifier()
xgb_clf.fit(X_train, y_train)
y_pred = xgb_clf.predict(X_test)
model_scoring(y_test, y_pred)

the accuracy is:  0.8245695672405771
the precision is： 0.8057692307692308
the recall is:  0.6028776978417266
the f1 score is:  0.6897119341563785
confution matrix: 
 [[1353  101]
 [ 276  419]]
