In [1]:
from __future__ import print_function
import csv
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation
from sklearn.metrics import confusion_matrix
from datetime import datetime, date, time, timedelta
%matplotlib inline

In [2]:
# Read data
AAPL = pd.read_csv("../Data/AAPL_05222012_0930_1300_LOB_2.csv")

In [3]:
# Change column name
AAPL.columns = [col_name.split('.')[1] if len(col_name) > 5 else col_name for col_name in AAPL.columns]

# Dimension of the data frame
AAPL.shape

(332673, 62)

### Basic Set

In [4]:
# Price and Volume
AAPL.columns

Index([u'Index', u'Time', u'BID_PRICE1', u'BID_UPDATE_TIME1', u'BID_SIZE1',
       u'ASK_PRICE1', u'ASK_UPDATE_TIME1', u'ASK_SIZE1', u'BID_PRICE2',
       u'BID_UPDATE_TIME2', u'BID_SIZE2', u'ASK_PRICE2', u'ASK_UPDATE_TIME2',
       u'ASK_SIZE2', u'BID_PRICE3', u'BID_UPDATE_TIME3', u'BID_SIZE3',
       u'ASK_PRICE3', u'ASK_UPDATE_TIME3', u'ASK_SIZE3', u'BID_PRICE4',
       u'BID_UPDATE_TIME4', u'BID_SIZE4', u'ASK_PRICE4', u'ASK_UPDATE_TIME4',
       u'ASK_SIZE4', u'BID_PRICE5', u'BID_UPDATE_TIME5', u'BID_SIZE5',
       u'ASK_PRICE5', u'ASK_UPDATE_TIME5', u'ASK_SIZE5', u'BID_PRICE6',
       u'BID_UPDATE_TIME6', u'BID_SIZE6', u'ASK_PRICE6', u'ASK_UPDATE_TIME6',
       u'ASK_SIZE6', u'BID_PRICE7', u'BID_UPDATE_TIME7', u'BID_SIZE7',
       u'ASK_PRICE7', u'ASK_UPDATE_TIME7', u'ASK_SIZE7', u'BID_PRICE8',
       u'BID_UPDATE_TIME8', u'BID_SIZE8', u'ASK_PRICE8', u'ASK_UPDATE_TIME8',
       u'ASK_SIZE8', u'BID_PRICE9', u'BID_UPDATE_TIME9', u'BID_SIZE9',
       u'ASK_PRICE9', u'ASK_UPDATE_TIME9

### Time-insenstive Set
* bid-ask spreads    
We calculate mid-price by $$Spread_i = P^{ask}_{i} - P^{bid}_{i}$$

In [5]:
AAPL['SPREAD1'] = AAPL['ASK_PRICE1'] - AAPL['BID_PRICE1']
AAPL['SPREAD2'] = AAPL['ASK_PRICE2'] - AAPL['BID_PRICE2']
AAPL['SPREAD3'] = AAPL['ASK_PRICE3'] - AAPL['BID_PRICE3']
AAPL['SPREAD4'] = AAPL['ASK_PRICE4'] - AAPL['BID_PRICE4']
AAPL['SPREAD5'] = AAPL['ASK_PRICE5'] - AAPL['BID_PRICE5']
AAPL['SPREAD6'] = AAPL['ASK_PRICE6'] - AAPL['BID_PRICE6']
AAPL['SPREAD7'] = AAPL['ASK_PRICE7'] - AAPL['BID_PRICE7']
AAPL['SPREAD8'] = AAPL['ASK_PRICE8'] - AAPL['BID_PRICE8']
AAPL['SPREAD9'] = AAPL['ASK_PRICE9'] - AAPL['BID_PRICE9']
AAPL['SPREAD10'] = AAPL['ASK_PRICE10'] - AAPL['BID_PRICE10']

* mid-prices    
$$P^{mid}_{i} = \frac{1}{2}(P^{ask}_{i} + P^{bid}_{i})$$

In [6]:
AAPL['MID_PRICE1'] = (AAPL['BID_PRICE1'] + AAPL['ASK_PRICE1'])/2
AAPL['MID_PRICE2'] = (AAPL['BID_PRICE2'] + AAPL['ASK_PRICE2'])/2
AAPL['MID_PRICE3'] = (AAPL['BID_PRICE3'] + AAPL['ASK_PRICE3'])/2
AAPL['MID_PRICE4'] = (AAPL['BID_PRICE4'] + AAPL['ASK_PRICE4'])/2
AAPL['MID_PRICE5'] = (AAPL['BID_PRICE5'] + AAPL['ASK_PRICE5'])/2
AAPL['MID_PRICE6'] = (AAPL['BID_PRICE6'] + AAPL['ASK_PRICE6'])/2
AAPL['MID_PRICE7'] = (AAPL['BID_PRICE7'] + AAPL['ASK_PRICE7'])/2
AAPL['MID_PRICE8'] = (AAPL['BID_PRICE8'] + AAPL['ASK_PRICE8'])/2
AAPL['MID_PRICE9'] = (AAPL['BID_PRICE9'] + AAPL['ASK_PRICE9'])/2
AAPL['MID_PRICE10'] = (AAPL['BID_PRICE10'] + AAPL['ASK_PRICE10'])/2

* price differences
$$\text{Range(ask)}=P^{ask}_{n} - P^{ask}_{1}$$
$$\text{Range(bid)}=P^{bid}_{1} - P^{bid}_{n}$$

In [7]:
# Range
AAPL['ASK_RANGE'] = AAPL.ASK_PRICE10 - AAPL.ASK_PRICE1
AAPL['BID_RANGE'] = AAPL.BID_PRICE1 - AAPL.BID_PRICE10

# Absolute difference for ask
AAPL['ASK21'] = np.absolute(AAPL.ASK_PRICE2 - AAPL.ASK_PRICE1)
AAPL['ASK32'] = np.absolute(AAPL.ASK_PRICE3 - AAPL.ASK_PRICE2)
AAPL['ASK43'] = np.absolute(AAPL.ASK_PRICE4 - AAPL.ASK_PRICE3)
AAPL['ASK54'] = np.absolute(AAPL.ASK_PRICE5 - AAPL.ASK_PRICE4)
AAPL['ASK65'] = np.absolute(AAPL.ASK_PRICE6 - AAPL.ASK_PRICE5)
AAPL['ASK76'] = np.absolute(AAPL.ASK_PRICE7 - AAPL.ASK_PRICE6)
AAPL['ASK87'] = np.absolute(AAPL.ASK_PRICE8 - AAPL.ASK_PRICE7)
AAPL['ASK98'] = np.absolute(AAPL.ASK_PRICE9 - AAPL.ASK_PRICE8)
AAPL['ASK109'] = np.absolute(AAPL.ASK_PRICE10 - AAPL.ASK_PRICE9)

# Absolute difference for bid
AAPL['BID21'] = np.absolute(AAPL.BID_PRICE2 - AAPL.BID_PRICE1)
AAPL['BID32'] = np.absolute(AAPL.BID_PRICE3 - AAPL.BID_PRICE2)
AAPL['BID43'] = np.absolute(AAPL.BID_PRICE4 - AAPL.BID_PRICE3)
AAPL['BID54'] = np.absolute(AAPL.BID_PRICE5 - AAPL.BID_PRICE4)
AAPL['BID65'] = np.absolute(AAPL.BID_PRICE6 - AAPL.BID_PRICE5)
AAPL['BID76'] = np.absolute(AAPL.BID_PRICE7 - AAPL.BID_PRICE6)
AAPL['BID87'] = np.absolute(AAPL.BID_PRICE8 - AAPL.BID_PRICE7)
AAPL['BID98'] = np.absolute(AAPL.BID_PRICE9 - AAPL.BID_PRICE8)
AAPL['BID109'] = np.absolute(AAPL.BID_PRICE10 - AAPL.BID_PRICE9)

* mean prices

In [8]:
# mean bid price
AAPL['MEAN_BID_PRICE'] = 0.1*(AAPL['BID_PRICE1'] + AAPL['BID_PRICE2'] + AAPL['BID_PRICE3'] +
                              AAPL['BID_PRICE4'] + AAPL['BID_PRICE5'] + AAPL['BID_PRICE6'] +
                              AAPL['BID_PRICE7'] + AAPL['BID_PRICE8'] + AAPL['BID_PRICE9'] + AAPL['BID_PRICE10'])

# mean ask price
AAPL['MEAN_ASK_PRICE'] = 0.1*(AAPL['ASK_PRICE1'] + AAPL['ASK_PRICE2'] + AAPL['ASK_PRICE3'] +
                              AAPL['ASK_PRICE4'] + AAPL['ASK_PRICE5'] + AAPL['ASK_PRICE6'] +
                              AAPL['ASK_PRICE7'] + AAPL['ASK_PRICE8'] + AAPL['ASK_PRICE9'] + AAPL['ASK_PRICE10'])

* mean volumes

In [9]:
# mean bid volume
AAPL['MEAN_BID_SIZE'] = 0.1*(AAPL['BID_SIZE1'] + AAPL['BID_SIZE2'] + AAPL['BID_SIZE3'] +
                             AAPL['BID_SIZE4'] + AAPL['BID_SIZE5'] + AAPL['BID_SIZE6'] +
                             AAPL['BID_SIZE7'] + AAPL['BID_SIZE8'] + AAPL['BID_SIZE9'] + AAPL['BID_SIZE10'])

# mean ask volume
AAPL['MEAN_ASK_SIZE'] = 0.1*(AAPL['ASK_SIZE1'] + AAPL['ASK_SIZE2'] + AAPL['ASK_SIZE3'] + 
                             AAPL['ASK_SIZE4'] + AAPL['ASK_SIZE5'] + AAPL['ASK_SIZE6'] +
                             AAPL['ASK_SIZE7'] + AAPL['ASK_SIZE8'] + AAPL['ASK_SIZE9'] + AAPL['ASK_SIZE10'])

* accumulated differences

In [10]:
# Price
AAPL['PRICE_ASK_BID'] = (AAPL['SPREAD1'] + AAPL['SPREAD2'] + AAPL['SPREAD3'] + 
                         AAPL['SPREAD4'] + AAPL['SPREAD5'] + AAPL['SPREAD6'] +
                         AAPL['SPREAD7'] + AAPL['SPREAD8'] + AAPL['SPREAD9'] + AAPL['SPREAD10'])

# Volume
AAPL['SIZE_ASK_BID'] = sum((AAPL.ASK_SIZE1 - AAPL.BID_SIZE1, AAPL.ASK_SIZE2 - AAPL.BID_SIZE2,
                            AAPL.ASK_SIZE3 - AAPL.BID_SIZE3, AAPL.ASK_SIZE4 - AAPL.BID_SIZE4,
                            AAPL.ASK_SIZE5 - AAPL.BID_SIZE5, AAPL.ASK_SIZE6 - AAPL.BID_SIZE6,
                            AAPL.ASK_SIZE7 - AAPL.BID_SIZE7, AAPL.ASK_SIZE8 - AAPL.BID_SIZE8,
                            AAPL.ASK_SIZE9 - AAPL.BID_SIZE9, AAPL.ASK_SIZE10 - AAPL.BID_SIZE10))

In [11]:
AAPL.shape

(332673, 108)

### Time-sensitive Set

In [12]:
# Create time variable
time = np.array([datetime.strptime(time, "%Y/%m/%d %H:%M:%S.%f") for time in AAPL['Time']])

In [13]:
index = np.zeros(len(time))
for i in np.arange(len(time)):
    j = i
    t_standard = time[i] - timedelta(seconds=1)
    while t_standard < time[j] or time[j] == time[j - 1]:
        if j == 0: break
        j = j - 1
    index[i] = j

In [14]:
time[0:3]

array([datetime.datetime(2012, 5, 22, 9, 30),
       datetime.datetime(2012, 5, 22, 9, 30, 0, 3000),
       datetime.datetime(2012, 5, 22, 9, 30, 0, 3000)], dtype=object)

In [15]:
time[354]

datetime.datetime(2012, 5, 22, 9, 30, 1, 5000)

In [16]:
index[354]

1.0

In [17]:
time[353:356]

array([datetime.datetime(2012, 5, 22, 9, 30, 1, 1000),
       datetime.datetime(2012, 5, 22, 9, 30, 1, 5000),
       datetime.datetime(2012, 5, 22, 9, 30, 1, 38000)], dtype=object)

In [18]:
time[378]

datetime.datetime(2012, 5, 22, 9, 30, 2, 13000)

In [19]:
index[378]

354.0

In [20]:
def time_derivative(time, price, delta_t=1):
    derivative = np.zeros(len(price))
    
    # Find the index
    j = 0
    for i in np.arange(len(time)):
        j = i
        t_standard = time[i] - timedelta(seconds=delta_t)
        while t_standard < time[j] or time[j] == time[j - 1]:
            if j == 0: break
            j = j - 1
            
        # Once the while loop end the j will be the index of first most recent 1 second
        # We can use the index to calculate derivative
        derivative[i] = (price[i] - price[j])/(i - j + 1)
    
    return(derivative)

* Price Derivative

In [21]:
# Bid price derivative
AAPL['BID_PDERIV1'] = time_derivative(time, AAPL['BID_PRICE1'])
AAPL['BID_PDERIV2'] = time_derivative(time, AAPL['BID_PRICE2'])
AAPL['BID_PDERIV3'] = time_derivative(time, AAPL['BID_PRICE3'])
AAPL['BID_PDERIV4'] = time_derivative(time, AAPL['BID_PRICE4'])
AAPL['BID_PDERIV5'] = time_derivative(time, AAPL['BID_PRICE5'])
AAPL['BID_PDERIV6'] = time_derivative(time, AAPL['BID_PRICE6'])
AAPL['BID_PDERIV7'] = time_derivative(time, AAPL['BID_PRICE7'])
AAPL['BID_PDERIV8'] = time_derivative(time, AAPL['BID_PRICE8'])
AAPL['BID_PDERIV9'] = time_derivative(time, AAPL['BID_PRICE9'])
AAPL['BID_PDERIV10'] = time_derivative(time, AAPL['BID_PRICE10'])

# Ask price derivative
AAPL['ASK_PDERIV1'] = time_derivative(time, AAPL['ASK_PRICE1'])
AAPL['ASK_PDERIV2'] = time_derivative(time, AAPL['ASK_PRICE2'])
AAPL['ASK_PDERIV3'] = time_derivative(time, AAPL['ASK_PRICE3'])
AAPL['ASK_PDERIV4'] = time_derivative(time, AAPL['ASK_PRICE4'])
AAPL['ASK_PDERIV5'] = time_derivative(time, AAPL['ASK_PRICE5'])
AAPL['ASK_PDERIV6'] = time_derivative(time, AAPL['ASK_PRICE6'])
AAPL['ASK_PDERIV7'] = time_derivative(time, AAPL['ASK_PRICE7'])
AAPL['ASK_PDERIV8'] = time_derivative(time, AAPL['ASK_PRICE8'])
AAPL['ASK_PDERIV9'] = time_derivative(time, AAPL['ASK_PRICE9'])
AAPL['ASK_PDERIV10'] = time_derivative(time, AAPL['ASK_PRICE10'])

* Volume Derivative

In [22]:
# Bid volume derivative
AAPL['BID_SDERIV1'] = time_derivative(time, AAPL['BID_SIZE1'])
AAPL['BID_SDERIV2'] = time_derivative(time, AAPL['BID_SIZE2'])
AAPL['BID_SDERIV3'] = time_derivative(time, AAPL['BID_SIZE3'])
AAPL['BID_SDERIV4'] = time_derivative(time, AAPL['BID_SIZE4'])
AAPL['BID_SDERIV5'] = time_derivative(time, AAPL['BID_SIZE5'])
AAPL['BID_SDERIV6'] = time_derivative(time, AAPL['BID_SIZE6'])
AAPL['BID_SDERIV7'] = time_derivative(time, AAPL['BID_SIZE7'])
AAPL['BID_SDERIV8'] = time_derivative(time, AAPL['BID_SIZE8'])
AAPL['BID_SDERIV9'] = time_derivative(time, AAPL['BID_SIZE9'])
AAPL['BID_SDERIV10'] = time_derivative(time, AAPL['BID_SIZE10'])

# Ask volume derivative
AAPL['ASK_SDERIV1'] = time_derivative(time, AAPL['ASK_SIZE1'])
AAPL['ASK_SDERIV2'] = time_derivative(time, AAPL['ASK_SIZE2'])
AAPL['ASK_SDERIV3'] = time_derivative(time, AAPL['ASK_SIZE3'])
AAPL['ASK_SDERIV4'] = time_derivative(time, AAPL['ASK_SIZE4'])
AAPL['ASK_SDERIV5'] = time_derivative(time, AAPL['ASK_SIZE5'])
AAPL['ASK_SDERIV6'] = time_derivative(time, AAPL['ASK_SIZE6'])
AAPL['ASK_SDERIV7'] = time_derivative(time, AAPL['ASK_SIZE7'])
AAPL['ASK_SDERIV8'] = time_derivative(time, AAPL['ASK_SIZE8'])
AAPL['ASK_SDERIV9'] = time_derivative(time, AAPL['ASK_SIZE9'])
AAPL['ASK_SDERIV10'] = time_derivative(time, AAPL['ASK_SIZE10'])

### Independent variable:
* mid-price movement

In [23]:
def midprice_movement(mid, delta_t):
    mid_diff = mid.diff(periods=delta_t)[delta_t:].ravel()
    mid_diff = np.append(mid_diff, np.array(np.nan).repeat(delta_t))
    
    mid_movement = (mid_diff > 0)*1 + (mid_diff < 0)*-1
    mid_movement = mid_movement.astype('float')
    mid_movement[-delta_t:] = np.nan
    
    # Change to pandas series and transform to categorical data
    mid_movement = pd.Series(pd.Categorical(mid_movement, categories=[-1, 0, 1]))
    
    # Change label
    mid_movement.cat.categories = ['down', 'stationary', 'up']
    
    return(mid_movement)

In [24]:
# Create mid-price movement
AAPL['MID_MOVEMENT'] = midprice_movement(AAPL['MID_PRICE1'], 10)

In [25]:
AAPL['MID_MOVEMENT'].value_counts()

stationary    120244
down          110049
up            102370
dtype: int64

* bid-ask spread crossing    

For bid-ask spread crossing, we have to consider three scenarios:
1. An __upward__ price spread crossing occurs when the best bid price at $t+\Delta t$ exceeds the best ask price at time $t$ $\Rightarrow$ $P^{bid}_{t + \Delta t} > P^{ask}_{t}$ 
2. A __downward__ price spread crossing happens when the best ask price at $t+\Delta t$ is less than the best bid price at time $t$ $\Rightarrow$ $P^{ask}_{t + \Delta t} < P^{bid}_{t}$ 
3. No price spread crossing takes place if $P^{ask}_{t + \Delta t} \leq P^{bid}_{t}$ and $P^{bid}_{t + \Delta t} \geq P^{ask}_{t}$ 

In [26]:
def spread_crossing(bid, ask, delta_t):
    # shifted bid (append nan at the last)
    bid_shift = np.append(bid[delta_t:], np.array(np.nan).repeat(delta_t))
    ask_shift = np.append(ask[delta_t:], np.array(np.nan).repeat(delta_t))
    
    # Determine up or down
    spread = (bid_shift > ask)*1 + (ask_shift < bid)*-1
    spread = spread.astype('float')
    spread[-delta_t:] = np.nan
    
    # Change to pandas series and transform to categorical data
    spread = pd.Series(pd.Categorical(spread, categories=[-1, 0, 1]))

    # Change label
    spread.cat.categories = ['down', 'stationary', 'up']
    
    return(spread)

In [27]:
# Create spread crossing movement
AAPL['SPREAD_CROSSING'] = spread_crossing(AAPL['BID_PRICE1'], AAPL['ASK_PRICE1'], 5)

In [28]:
AAPL['SPREAD_CROSSING'].value_counts()

stationary    332398
down             148
up               122
dtype: int64

In [29]:
spread_crossing(AAPL['BID_PRICE1'], AAPL['ASK_PRICE1'], 1000).value_counts()

stationary    123000
down          112585
up             96088
dtype: int64

### Train vs. Test

In [30]:
# For all the data before 11:00, they are all training set
train_index = time < datetime(2012, 5, 22, 11, 0)

# Separate training and testing set
train = AAPL.iloc[train_index]
test = AAPL.iloc[np.logical_not(train_index)]

In [31]:
train.shape

(203349, 150)

In [32]:
test.shape

(129324, 150)

### Train vs. Validation

In [33]:
# Take 10% of the training set as validation
X_train, X_validation, y_train, y_validation = cross_validation.train_test_split(
    train, train['MID_MOVEMENT'], test_size=0.1, random_state=0)

In [34]:
X_train.shape

(183014, 150)

In [35]:
X_validation.shape

(20335, 150)

In [36]:
# Pull out the column name in x
r_not_x = re.compile('TIME|INDEX|MID_MOVEMENT|SPREAD_CROSSING')
col_in_x = [not bool(r_not_x.search(col.upper())) for col in AAPL.columns]
AAPL.columns[col_in_x]

Index([u'BID_PRICE1', u'BID_SIZE1', u'ASK_PRICE1', u'ASK_SIZE1', u'BID_PRICE2',
       u'BID_SIZE2', u'ASK_PRICE2', u'ASK_SIZE2', u'BID_PRICE3', u'BID_SIZE3',
       ...
       u'ASK_SDERIV1', u'ASK_SDERIV2', u'ASK_SDERIV3', u'ASK_SDERIV4',
       u'ASK_SDERIV5', u'ASK_SDERIV6', u'ASK_SDERIV7', u'ASK_SDERIV8',
       u'ASK_SDERIV9', u'ASK_SDERIV10'],
      dtype='object', length=126)

### Mid-price Movement

### SVM

In [37]:
# Sample 50000 observation for fitting model
np.random.seed(0)
t_index = np.random.choice(np.arange(183014), 30000, replace=False)

In [38]:
# Cross-validation to select C and gamma in SVM kernel
C_range = 10.0**np.arange(-3, 4, 1)
gamma_range = 10.0**np.arange(-5, 3, 1)

clf = svm.SVC()
clf.fit(X_train.iloc[:, col_in_x].iloc[t_index], y_train[t_index])
prediction = clf.predict(X_validation.iloc[:, col_in_x])

In [39]:
np.mean(prediction == y_validation)

0.37231374477501844

### Random Forest

In [40]:
rf_clf = RandomForestClassifier(n_estimators=100, max_features='sqrt')
rf_clf.fit(X_train.iloc[:, col_in_x].iloc[t_index], y_train[t_index])
prediction = rf_clf.predict(X_validation.iloc[:, col_in_x])

In [41]:
confusion_matrix(prediction, y_validation)

array([[4758, 1397, 1274],
       [1141, 4278, 1033],
       [1156, 1020, 4278]])

In [42]:
np.mean(prediction == y_validation)

0.65473321858864031

### Spread Crossing

### SVM

### Random Forest

* [sklearn: Ensemble methods](http://scikit-learn.org/stable/modules/ensemble.html)
* [sklearn: Random Forest](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)
* [sklearn: SVM](http://scikit-learn.org/stable/modules/svm.html)