In [3]:
from IPython.display import Image

import sys
import os
import matplotlib.pyplot as plt
import scipy.io as sio
import pandas as pd
import numpy as np
import seaborn as sns

from time import sleep
from datetime import datetime as dt
from natsort import natsorted

%matplotlib inline
sns.set_style("darkgrid")
plt.rcParams.update({'legend.fontsize': 'large',
                     'axes.labelsize': 'large',
                     'axes.titlesize': 'large',
                     'xtick.labelsize': 'large',
                     'ytick.labelsize': 'large'})


This notebook is about extracting and calculating features needed in my thesis. 

Originally data is provided in matlab form, each day of market data in separate files. At first I will read the data using Python pandas, extract bid size&price and ask price&size with timestamp into csv file ($v_1$ in the picture below). This is done file by file executing in batches, combining all the data into one big csv file.

Then I will utilise pyspark to calculate other features ($v_2$ to $v_{10}$). Finally there should be 139 features ready to analyze.

![title](features.png)

This is list of features is derived from Kercheval, A.N., Zhang, Y., 2015. Modelling high-frequency limit order book dynamics with support vector machines. Quantitative Finance 15, 1315–1329.

There is one corrupted file among the data. I was told to ignore it, so let's mark it away.

In [4]:
# CORRUPTED FILE: 'SE0000115446_SEK/8_SE0000115446_SEK_208.mat'
corrupted = ['8_SE0000115446_SEK_208.mat']
#sio.loadmat(corrupted[0])

Start with file management. Data is stored into different folders for each stock. Find the folders and Filter out non-data directories.

In [12]:
# Filter only needed directories
path = '../Data/'
dirs = [path + d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))]
del dirs[1]
del dirs[1]
dirs

['../Data/SE0000101032_SEK',
 '../Data/SE0000115446_SEK',
 '../Data/FI0009005318_EUR',
 '../Data/FI0009007835_EUR',
 '../Data/DK0010268606_DKK']

Function to extract necessary data.

In [110]:
import pytz # to handle timezones

# method to extract data from the order book
def extract_order_book(path, deep=10):
    # load data
    mat = sio.loadmat(path)
    order_book = mat['ob']

    # time strip and convert to correct timezone
    ts = pd.Series(order_book['ts'][0][0][:,0], name='ts')
    ts = pd.to_datetime(ts, unit='ms') 
    ts = ts.dt.tz_localize('UTC').dt.tz_convert('Europe/Helsinki')
    
    # filter out pre-session and after-session data
    f_ts = (ts.dt.hour > 8) & (ts.dt.hour < 18)
    
    # extract timestamp data to list of dataframes
    dfs = [pd.DataFrame(ts[f_ts]).reset_index(drop=True)]
    
    # extract order book data
    for c in ['ask_p', 'ask_q', 'bid_p', 'bid_q']:
        dfs.append(pd.DataFrame(order_book[c][0, 0][f_ts, :deep], columns=[c + str(i) for i in xrange(1, deep+1)]))
        
    df = pd.concat(dfs, axis=1)
    df = df.set_index('ts').tz_localize(None) # remove timezone information
    return df

In [81]:
# sort and save filenames to ensure chronologically correct data.
f_names = []
for d in dirs:
    files = os.listdir(d)
    f_names.append(natsorted(files))


In [122]:
## HIDAS ÄLÄ AJA
# filename: SE0000101032_SEK.csv
testfile = 'SE0000101032_SEK_test.csv'
'''
for i in xrange(len(dirs)):
    d = dirs[i]
    files = f_names[i]
    n = len(files)
    
    for j, f in enumerate(files):
        # update progress
        sys.stdout.write("%s (%i/%i)\t\r" % (d + '/' + f, j+1, n))
        sys.stdout.flush()
            
        # checks for bad files
        if not f.endswith('.mat'):
            continue
        if f in corrupted:
            continue
            
        # extract data
        df = extract_order_book(d+'/'+f)
        
        filename = f[2:18] + '.csv'
        
        file_exists = os.path.isfile(filename)
        
        with open(filename, 'a') as w:
            
            # if file does not exist, write headers into file
            if file_exists:
                df.to_csv(w, header=False)
            else:
                df.to_csv(w, header=True)  
'''

../Data/SE0000101032_SEK/8_SE0000101032_SEK_1.mat (1/765)	

# Spark

In [127]:
from pyspark.sql.functions import col

## $v_1$: Order book data

$v_1 = \{ P^{ask}_{i}, Q^{ask}_{i}, P^{bid}_{i}, Q^{bid}_{i}\}^n_{i=1}$ is read from the csv-files.

In [211]:
df = spark.read.format("csv").options(header="true", 
                                      inferSchema="true",
                                      dateFormat="yyyy-MM-dd HH:mm:ss")\
            .load(testfile)
    
df.select(df.columns[:10]).show(2, False)

print 'column names are: \'ts\' and '
cols = df.columns
print cols[1:11]
print cols[11:21]
print cols[21:31]
print cols[31:41]

+-----------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
|ts                     |ask_p1   |ask_p2   |ask_p3   |ask_p4   |ask_p5   |ask_p6   |ask_p7   |ask_p8   |ask_p9   |
+-----------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
|2010-06-01 09:00:00.142|1128000.0|1130000.0|1140000.0|1142000.0|1143000.0|1145000.0|1150000.0|1153000.0|1160000.0|
|2010-06-01 09:00:00.144|1128000.0|1130000.0|1140000.0|1142000.0|1143000.0|1145000.0|1150000.0|1153000.0|1160000.0|
+-----------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
only showing top 2 rows

column names are: 'ts' and 
['ask_p1', 'ask_p2', 'ask_p3', 'ask_p4', 'ask_p5', 'ask_p6', 'ask_p7', 'ask_p8', 'ask_p9', 'ask_p10']
['ask_q1', 'ask_q2', 'ask_q3', 'ask_q4', 'ask_q5', 'ask_q6', 'ask_q7', 'ask_q8', 'ask_q9', 'ask_q10']
['bid_p1', 'bid_p2', 'bid_p3', 'bid_p4', 'bid_p

## $v_2$: Spread and mid price

$v_2 = \{ (P^{ask}_i - P^{bid}_i), \frac{(P^{ask}_i + P^{bid}_i)}{2} \}^n_{i=1} $

In [212]:
# calculate spreads per price level
for i in xrange(1, 11):
    df = df.withColumn('spread_p' + str(i), col('ask_p' + str(i)) - col('bid_p' + str(i)))
    
df.select(df.columns[41:]).show(2)

# calculate Mid-Prices per price level
for i in xrange(1, 11):
    marksColumns = [col('ask_p' + str(i)), col('bid_p' + str(i))]
    averageFunc = sum(x for x in marksColumns)/len(marksColumns)
    
    df = df.withColumn('midprice' + str(i), averageFunc)
        
df.select(df.columns[51:]).show(2)

+---------+---------+---------+---------+---------+---------+---------+---------+---------+----------+
|spread_p1|spread_p2|spread_p3|spread_p4|spread_p5|spread_p6|spread_p7|spread_p8|spread_p9|spread_p10|
+---------+---------+---------+---------+---------+---------+---------+---------+---------+----------+
|  18000.0|  30000.0|  75000.0|  82000.0|  91000.0| 109000.0| 115000.0| 123000.0| 135000.0|  145000.0|
|  18000.0|  30000.0|  75000.0|  82000.0|  91000.0| 109000.0| 115000.0| 123000.0| 135000.0|  145000.0|
+---------+---------+---------+---------+---------+---------+---------+---------+---------+----------+
only showing top 2 rows

+---------+---------+---------+---------+---------+---------+---------+---------+---------+----------+
|midprice1|midprice2|midprice3|midprice4|midprice5|midprice6|midprice7|midprice8|midprice9|midprice10|
+---------+---------+---------+---------+---------+---------+---------+---------+---------+----------+
|1119000.0|1115000.0|1102500.0|1101000.0|1097500

## $v_3$: Price differences

$v_2 = \{ \big| P^{ask}_{i+1} - P^{ask}_i \big|, \big| P^{bid}_{i+1} - P^{bid}_i \big| \} ^{n-1}_{i=1}$


In [213]:
import pyspark.sql.functions as F

# calculate price differences for subsequent asking and bidding prices
# ask
for i in xrange(1, 10):
    df = df.withColumn('ask_diff' + str(i), F.abs(col('ask_p' + str(i+1)) - col('ask_p' + str(i))))

df.select(df.columns[61:]).show(2)
    
# bid
for i in xrange(1, 10):
    df = df.withColumn('bid_diff' + str(i), F.abs(col('bid_p' + str(i+1)) - col('bid_p' + str(i))))

df.select(df.columns[70:]).show(2)

+---------+---------+---------+---------+---------+---------+---------+---------+---------+
|ask_diff1|ask_diff2|ask_diff3|ask_diff4|ask_diff5|ask_diff6|ask_diff7|ask_diff8|ask_diff9|
+---------+---------+---------+---------+---------+---------+---------+---------+---------+
|   2000.0|  10000.0|   2000.0|   1000.0|   2000.0|   5000.0|   3000.0|   7000.0|   5000.0|
|   2000.0|  10000.0|   2000.0|   1000.0|   2000.0|   5000.0|   3000.0|   7000.0|   5000.0|
+---------+---------+---------+---------+---------+---------+---------+---------+---------+
only showing top 2 rows

+---------+---------+---------+---------+---------+---------+---------+---------+---------+
|bid_diff1|bid_diff2|bid_diff3|bid_diff4|bid_diff5|bid_diff6|bid_diff7|bid_diff8|bid_diff9|
+---------+---------+---------+---------+---------+---------+---------+---------+---------+
|  10000.0|  35000.0|   5000.0|   8000.0|  16000.0|   1000.0|   5000.0|   5000.0|   5000.0|
|  10000.0|  35000.0|   5000.0|   8000.0|  16000.0|   1

## $v_4$: Price and volume means

$v_4 = \{ \frac{1}{n}\sum_{i=1}^{n} P^{ask}_i,
          \frac{1}{n}\sum_{i=1}^{n} P^{bid}_i, 
          \frac{1}{n}\sum_{i=1}^{n} Q^{ask}_i, 
          \frac{1}{n}\sum_{i=1}^{n} Q^{bid}_i \}$

In [214]:
for c in ['ask_p', 'bid_p', 'ask_q', 'bid_q']:
    marksColumns = [col(c + str(i)) for i in xrange(1, 11)]
    averageFunc = sum(x for x in marksColumns)/len(marksColumns)
    df = df.withColumn('mean' + c, averageFunc)

df.select(df.columns[79:]).show(2)

+---------+---------+---------+---------+
|meanask_p|meanbid_p|meanask_q|meanbid_q|
+---------+---------+---------+---------+
|1145600.0|1053300.0|   1998.2|   1813.3|
|1145600.0|1053300.0|   2018.2|   1813.3|
+---------+---------+---------+---------+
only showing top 2 rows



## $v_5$: Accumulated differences

$v_4 = \{ \frac{1}{n}(\sum_{i=1}^{n} P^{ask}_i - P^{bid}_i), 
          \frac{1}{n}(\sum_{i=1}^{n} Q^{ask}_i - Q^{bid}_i) \}$

In [238]:
foo = df

marksColumns = [col('ask_p' + str(i)) for i in xrange(1, 11)] + [col('bid_p' + str(i)) for i in xrange(1, 11)]
bar = (sum(x for x in marksColumns[:10]) - sum(x for x in marksColumns[10:])) / (len(marksColumns) / 2)
foo = foo.withColumn('foo', bar)

marksColumns = [col('ask_q' + str(i)) for i in xrange(1, 11)] + [col('bid_q' + str(i)) for i in xrange(1, 11)]
bar = (sum(x for x in marksColumns[:10]) + sum(x for x in marksColumns[10:])) / (len(marksColumns) / 2)
foo = foo.withColumn('bar', bar)

foo.select(foo.columns[83:]).show(2)

+---------------+----------+----------+-------+-----+
|acc_diff_pbid_q|acc_diff_p|acc_diff_q|    foo|  bar|
+---------------+----------+----------+-------+-----+
|        92300.0|   92300.0|     184.9|92300.0|184.9|
|        92300.0|   92300.0|     204.9|92300.0|204.9|
+---------------+----------+----------+-------+-----+
only showing top 2 rows



In [232]:
marksColumns = [col('ask_p' + str(i)) for i in xrange(1, 11)] + [col('bid_p' + str(i)) for i in xrange(1, 11)]
meanOfDiffs = (sum(x for x in marksColumns[:10]) - sum(x for x in marksColumns[10:])) / (len(marksColumns) / 2)
df = df.withColumn('acc_diff_p', meanOfDiffs)

marksColumns = [col('ask_q' + str(i)) for i in xrange(1, 11)] + [col('bid_q' + str(i)) for i in xrange(1, 11)]
meanOfDiffs = (sum(x for x in marksColumns[:10]) - sum(x for x in marksColumns[10:])) / (len(marksColumns) / 2)
df = df.withColumn('acc_diff_q', meanOfDiffs)


df.select(df.columns[83:]).show(2)

+---------------+----------+----------+
|acc_diff_pbid_q|acc_diff_p|acc_diff_q|
+---------------+----------+----------+
|        92300.0|   92300.0|     184.9|
|        92300.0|   92300.0|     204.9|
+---------------+----------+----------+
only showing top 2 rows

