In [1]:
from __future__ import print_function
import csv
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import svm
from sklearn.metrics import confusion_matrix
from datetime import datetime, date, time
%matplotlib inline

In [2]:
# Read data
AAPL = pd.read_csv("../Data/AAPL_05222012_0930_1300_LOB_2.csv")

In [3]:
# Dimension of the data frame
AAPL.shape

(332673, 62)

In [4]:
# Change column name
AAPL.columns = [col_name.split('.')[1] if len(col_name) > 5 else col_name for col_name in AAPL.columns]

# Note that the best ask and bid price are listed first
AAPL.iloc[0:3, 0:14]

Unnamed: 0,Index,Time,BID_PRICE1,BID_UPDATE_TIME1,BID_SIZE1,ASK_PRICE1,ASK_UPDATE_TIME1,ASK_SIZE1,BID_PRICE2,BID_UPDATE_TIME2,BID_SIZE2,ASK_PRICE2,ASK_UPDATE_TIME2,ASK_SIZE2
0,1,2012/05/22 09:30:00.000,569.02,2012/05/22 09:29:34.006,40,570,2012/05/22 09:29:43.573,400,568.8,2012/05/22 09:29:47.563,100,570.01,2012/05/22 09:29:37.762,100
1,2,2012/05/22 09:30:00.003,569.02,2012/05/22 09:29:34.006,40,570,2012/05/22 09:29:43.573,400,568.8,2012/05/22 09:29:47.563,100,570.01,2012/05/22 09:29:37.762,100
2,3,2012/05/22 09:30:00.003,569.02,2012/05/22 09:29:34.006,40,570,2012/05/22 09:29:43.573,400,568.8,2012/05/22 09:29:47.563,100,570.01,2012/05/22 09:29:37.762,100


In [5]:
AAPL.columns

Index([u'Index', u'Time', u'BID_PRICE1', u'BID_UPDATE_TIME1', u'BID_SIZE1',
       u'ASK_PRICE1', u'ASK_UPDATE_TIME1', u'ASK_SIZE1', u'BID_PRICE2',
       u'BID_UPDATE_TIME2', u'BID_SIZE2', u'ASK_PRICE2', u'ASK_UPDATE_TIME2',
       u'ASK_SIZE2', u'BID_PRICE3', u'BID_UPDATE_TIME3', u'BID_SIZE3',
       u'ASK_PRICE3', u'ASK_UPDATE_TIME3', u'ASK_SIZE3', u'BID_PRICE4',
       u'BID_UPDATE_TIME4', u'BID_SIZE4', u'ASK_PRICE4', u'ASK_UPDATE_TIME4',
       u'ASK_SIZE4', u'BID_PRICE5', u'BID_UPDATE_TIME5', u'BID_SIZE5',
       u'ASK_PRICE5', u'ASK_UPDATE_TIME5', u'ASK_SIZE5', u'BID_PRICE6',
       u'BID_UPDATE_TIME6', u'BID_SIZE6', u'ASK_PRICE6', u'ASK_UPDATE_TIME6',
       u'ASK_SIZE6', u'BID_PRICE7', u'BID_UPDATE_TIME7', u'BID_SIZE7',
       u'ASK_PRICE7', u'ASK_UPDATE_TIME7', u'ASK_SIZE7', u'BID_PRICE8',
       u'BID_UPDATE_TIME8', u'BID_SIZE8', u'ASK_PRICE8', u'ASK_UPDATE_TIME8',
       u'ASK_SIZE8', u'BID_PRICE9', u'BID_UPDATE_TIME9', u'BID_SIZE9',
       u'ASK_PRICE9', u'ASK_UPDATE_TIME9

We calculate mid-price by $$P^{mid}_{t} = \frac{1}{2}(P^{ask}_{t} + P^{bid}_{t})$$

In [6]:
# Create mid-price
AAPL['MID_PRICE'] = AAPL['BID_PRICE1'] + AAPL['ASK_PRICE1']

In [7]:
# Get the mid-price difference and movement
mid_diff = AAPL['MID_PRICE'].diff()[1:].ravel()
mid_diff = np.append(mid_diff, np.nan)

# Create group: -1, 0, 1
AAPL['MID_MOVEMENT'] = pd.Categorical((mid_diff > 0)*1 + (mid_diff < 0)*-1)

# Replace the first element by NaN
AAPL['MID_MOVEMENT'].iloc[-1] = np.nan

# Change label into down, stationary, up
AAPL['MID_MOVEMENT'].cat.categories = ['down', 'stationary', 'up']

For bid-ask spread crossing, we have to consider three scenarios:
1. An __upward__ price spread crossing occurs when the best bid price at $t+\Delta t$ exceeds the best ask price at time $t$ $\Rightarrow$ $P^{bid}_{t + \Delta t} > P^{ask}_{t}$ 
2. A __downward__ price spread crossing happens when the best ask price at $t+\Delta t$ is less than the best bid price at time $t$ $\Rightarrow$ $P^{ask}_{t + \Delta t} < P^{bid}_{t}$ 
3. No price spread crossing takes place if $P^{ask}_{t + \Delta t} \leq P^{bid}_{t}$ and $P^{bid}_{t + \Delta t} \geq P^{ask}_{t}$ 

In [8]:
def spread_f(bid, ask, delta_t):
    # shifted bid (append nan at the last)
    bid_shift = np.append(bid[delta_t:], np.array(np.nan).repeat(delta_t))
    ask_shift = np.append(ask[delta_t:], np.array(np.nan).repeat(delta_t))
    
    # Determine up or down
    spread = (bid_shift > ask)*1 + (ask_shift < bid)*-1
    spread = spread.astype('float')
    spread[-delta_t:] = np.nan
    
    # Change pandas series and transform to categorical data
    spread = pd.Series(pd.Categorical(spread, categories=[-1, 0, 1]))

    # Change label
    spread.cat.categories = ['down', 'stationary', 'up']
    
    return(spread)

In [9]:
# Create spread
spread = spread_f(AAPL['BID_PRICE1'], AAPL['ASK_PRICE1'], 1)

In [10]:
spread.value_counts()

stationary    332672
up                 0
down               0
dtype: int64

In [11]:
spread.tail()

332668    stationary
332669    stationary
332670    stationary
332671    stationary
332672           NaN
dtype: category
Categories (3, object): [down, stationary, up]

In [12]:
spread5 = spread_f(AAPL['BID_PRICE1'], AAPL['ASK_PRICE1'], 5)

In [13]:
spread5.value_counts()

stationary    332398
down             148
up               122
dtype: int64

In [14]:
spread5.tail(6)

332667    stationary
332668           NaN
332669           NaN
332670           NaN
332671           NaN
332672           NaN
dtype: category
Categories (3, object): [down, stationary, up]

In [15]:
AAPL['SPREAD'] = spread_f(AAPL['BID_PRICE1'], AAPL['ASK_PRICE1'], 5)

In [16]:
AAPL['MID_MOVEMENT'].cat.categories = ['down', 'stationary', 'up']

In [17]:
# Create time variable
time = np.array([datetime.strptime(time, "%Y/%m/%d %H:%M:%S.%f") for time in AAPL['Time']])

# For all the data before 11:00, they are all training set
train_index = time < datetime(2012, 5, 22, 11, 0)

In [18]:
# Separate training and testing set
train = AAPL.iloc[train_index]
test = AAPL.iloc[np.logical_not(train_index)]

In [19]:
train.shape

(203349, 65)

In [20]:
test.shape

(129324, 65)

In [21]:
np.random.seed(0)
# Sample
t_index = np.random.choice(np.arange(203349), 10000, replace=False)
v_index = np.random.choice(np.arange(203349), 10000, replace=False)

In [22]:
train.columns

Index([u'Index', u'Time', u'BID_PRICE1', u'BID_UPDATE_TIME1', u'BID_SIZE1',
       u'ASK_PRICE1', u'ASK_UPDATE_TIME1', u'ASK_SIZE1', u'BID_PRICE2',
       u'BID_UPDATE_TIME2', u'BID_SIZE2', u'ASK_PRICE2', u'ASK_UPDATE_TIME2',
       u'ASK_SIZE2', u'BID_PRICE3', u'BID_UPDATE_TIME3', u'BID_SIZE3',
       u'ASK_PRICE3', u'ASK_UPDATE_TIME3', u'ASK_SIZE3', u'BID_PRICE4',
       u'BID_UPDATE_TIME4', u'BID_SIZE4', u'ASK_PRICE4', u'ASK_UPDATE_TIME4',
       u'ASK_SIZE4', u'BID_PRICE5', u'BID_UPDATE_TIME5', u'BID_SIZE5',
       u'ASK_PRICE5', u'ASK_UPDATE_TIME5', u'ASK_SIZE5', u'BID_PRICE6',
       u'BID_UPDATE_TIME6', u'BID_SIZE6', u'ASK_PRICE6', u'ASK_UPDATE_TIME6',
       u'ASK_SIZE6', u'BID_PRICE7', u'BID_UPDATE_TIME7', u'BID_SIZE7',
       u'ASK_PRICE7', u'ASK_UPDATE_TIME7', u'ASK_SIZE7', u'BID_PRICE8',
       u'BID_UPDATE_TIME8', u'BID_SIZE8', u'ASK_PRICE8', u'ASK_UPDATE_TIME8',
       u'ASK_SIZE8', u'BID_PRICE9', u'BID_UPDATE_TIME9', u'BID_SIZE9',
       u'ASK_PRICE9', u'ASK_UPDATE_TIME9

In [23]:
clf = svm.SVC()
clf.fit(train[['BID_PRICE1', 'ASK_PRICE1', 'BID_PRICE2', 'ASK_PRICE2', 'BID_PRICE3', 'ASK_PRICE3',
               'BID_PRICE4', 'ASK_PRICE4', 'BID_PRICE5', 'ASK_PRICE5', 'BID_PRICE6', 'ASK_PRICE6',
               'BID_PRICE7', 'ASK_PRICE7', 'BID_PRICE8', 'ASK_PRICE8', 'BID_PRICE9', 'ASK_PRICE9',
               'BID_PRICE10', 'ASK_PRICE10',
               'BID_SIZE1', 'ASK_SIZE1', 'BID_SIZE2', 'ASK_SIZE2', 'BID_SIZE3', 'ASK_SIZE3',
               'BID_SIZE4', 'ASK_SIZE4', 'BID_SIZE5', 'ASK_SIZE5', 'BID_SIZE6', 'ASK_SIZE6',
               'BID_SIZE7', 'ASK_SIZE7', 'BID_SIZE8', 'ASK_SIZE8', 'BID_SIZE9', 'ASK_SIZE9',
               'BID_SIZE10', 'ASK_SIZE10', 'MID_PRICE']].iloc[t_index], train['MID_MOVEMENT'].iloc[t_index])

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [24]:
prediction = clf.predict(train[['BID_PRICE1', 'ASK_PRICE1', 'BID_PRICE2', 'ASK_PRICE2', 'BID_PRICE3', 'ASK_PRICE3',
               'BID_PRICE4', 'ASK_PRICE4', 'BID_PRICE5', 'ASK_PRICE5', 'BID_PRICE6', 'ASK_PRICE6',
               'BID_PRICE7', 'ASK_PRICE7', 'BID_PRICE8', 'ASK_PRICE8', 'BID_PRICE9', 'ASK_PRICE9',
               'BID_PRICE10', 'ASK_PRICE10',
               'BID_SIZE1', 'ASK_SIZE1', 'BID_SIZE2', 'ASK_SIZE2', 'BID_SIZE3', 'ASK_SIZE3',
               'BID_SIZE4', 'ASK_SIZE4', 'BID_SIZE5', 'ASK_SIZE5', 'BID_SIZE6', 'ASK_SIZE6',
               'BID_SIZE7', 'ASK_SIZE7', 'BID_SIZE8', 'ASK_SIZE8', 'BID_SIZE9', 'ASK_SIZE9',
               'BID_SIZE10', 'ASK_SIZE10', 'MID_PRICE']].iloc[v_index])

In [25]:
confusion_matrix(prediction, train.iloc[v_index]['MID_MOVEMENT'])

array([[  72,   13,    5],
       [ 860, 8097,  873],
       [   6,   14,   60]])

In [26]:
np.mean(prediction == train.iloc[v_index]['MID_MOVEMENT'])

0.82289999999999996

In [27]:
np.mean(prediction == 'stationary')

0.98299999999999998

* [sklearn: Ensemble methods](http://scikit-learn.org/stable/modules/ensemble.html)
* [sklearn: Random Forest](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)
* [sklearn: SVM](http://scikit-learn.org/stable/modules/svm.html)