# Analiza zmiany wartości mid price na podstawie Limit Order Book

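W tym notatniku analizujemy migawki arkusza zleceń (Limit Order Book, LOB) i próbujemy na ich podstawie przewidzieć kierunek kolejnej zmiany mid price.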


In [2]:
from imbalance import prep_data
from parser import parse
from utils import confusion_matrix
from warnings import filterwarnings
filterwarnings('ignore')


# Order book snapshot files analysed in this notebook, one path per entry.
file_locs = [
    'data/OrderBookSnapshots.9061.csv',
    'data/OrderBookSnapshots.9062.csv',
    'data/OrderBookSnapshots.9063.csv',
    'data/OrderBookSnapshots.9064.csv',
    'data/OrderBookSnapshots.9065.csv',
]

## Imbalance

Punktem odniesienia, z którym będziemy się porównywać, jest klasyfikator oparty na imbalance, opisany w pracy (link).

In [3]:
from sklearn import linear_model


# Baseline: for every snapshot file, fit an SGD classifier on the features
# returned by prep_data and report its test score and confusion matrix.
# NOTE(review): newer scikit-learn releases spell this loss "log_loss";
# confirm the spelling against the installed version.
sgd_params = dict(loss="log", alpha=0.1, max_iter=3000, tol=0, shuffle=False)

for file_name in file_locs:
    parsed_book = parse(file_name)
    X_train, Y_train, X_test, Y_test = prep_data(parsed_book)

    classifier = linear_model.SGDClassifier(**sgd_params)
    classifier.fit(X_train, Y_train)

    Y_check = classifier.predict(X_test)
    test_score = classifier.score(X_test, Y_test)

    print(test_score)
    print(confusion_matrix(Y_check, Y_test))
    print()

0.564983888292159
[[266 199]
 [206 260]]

0.5240128068303095
[[181 153]
 [293 310]]

0.5776053215077606
[[233 139]
 [242 288]]

0.5417095777548918
[[170 140]
 [305 356]]

0.5695538057742782
[[184 134]
 [194 250]]



## Klasyfikator oparty na sieci neuronowej

In [20]:
import torch
from torch import nn
import numpy as np

Sieć implementujemy w PyTorch, a do przygotowania danych wejściowych używamy NumPy.

### Preprocessing

Zamiast brać słupki bezpośrednio z LOB, dzielimy zakres cen na kubełki, których szerokość jest ułamkiem wartości mid price (`bucket_size`). Następnie bierzemy tylko `n_buckets` kubełków z każdej strony mid price i normujemy wolumeny tak, aby sumowały się do jedynki.


In [18]:

def fits(index, arr):
    """Return True when `index` is a non-negative position inside `arr`."""
    return 0 <= index < len(arr)


def get_XY(data, n_buckets=5, bucket_size=0.05, omit_no_change=True):
    """Build normalized volume-bucket features and up/down labels from LOB snapshots.

    Parameters
    ----------
    data : dict
        Maps a sortable key to a snapshot indexed as ``(bids, asks, mid_price)``.
        ``bids`` and ``asks`` are sequences of ``(price, size)`` pairs; the best
        bid is read from the end of ``bids`` and the best ask from the start of
        ``asks``.
    n_buckets : int
        Number of price buckets kept on each side of the mid price.
    bucket_size : float
        Bucket width expressed as a fraction of the mid price.
    omit_no_change : bool
        When True, snapshots followed by an unchanged mid price are skipped.

    Returns
    -------
    X : np.ndarray, float32, shape (n_samples, 2 * n_buckets)
        Per-snapshot volumes; columns ``0 .. n_buckets-1`` are bid buckets
        (farthest from mid first), the remaining columns are ask buckets
        (nearest to mid first). Every row sums to one.
    Y : np.ndarray, int64, shape (n_samples,)
        1 when the next snapshot has a strictly higher mid price, else 0.

    Snapshots are skipped when either the current or the next book has an
    empty side or is crossed (best bid >= best ask), and when no volume falls
    inside the bucket range (normalizing would otherwise produce NaN).
    """
    keys = sorted(data.keys())

    X = []
    growths = []

    for curr_key, next_key in zip(keys, keys[1:]):
        curr, nxt = data[curr_key], data[next_key]
        mid_price = curr[2]

        if omit_no_change and nxt[2] == mid_price:
            continue

        bids, asks = curr[0], curr[1]
        next_bids, next_asks = nxt[0], nxt[1]

        # An empty side has no best price to validate against.
        if len(bids) == 0 or len(asks) == 0 or len(next_bids) == 0 or len(next_asks) == 0:
            continue

        # Valid data only: lowest ask must be above highest bid in both snapshots.
        if not (bids[-1][0] < asks[0][0] and next_bids[-1][0] < next_asks[0][0]):
            continue

        rows = np.zeros(2 * n_buckets)

        for bid_price, bid_size in reversed(bids):
            distance = (mid_price - bid_price) / mid_price
            # floor (not int truncation) so prices just outside the range get a
            # negative bucket and are dropped instead of landing in bucket 0.
            bucket = int(np.floor((n_buckets * bucket_size - distance) / bucket_size))
            if 0 <= bucket < n_buckets:
                rows[bucket] += bid_size

        for ask_price, ask_size in asks:
            distance = (ask_price - mid_price) / mid_price
            bucket = int(np.floor(distance / bucket_size))
            # Range-check the per-side bucket so ask volume never spills into
            # the bid half of the feature vector.
            if 0 <= bucket < n_buckets:
                rows[n_buckets + bucket] += ask_size

        total = rows.sum()
        if not total > 0:
            # No volume inside the bucket range: nothing to normalize.
            continue

        growths.append(mid_price < nxt[2])
        X.append(rows / total)

    features = np.array(X, dtype=np.float32).reshape(len(X), 2 * n_buckets)
    # np.int64 replaces the removed np.int alias and matches torch's long targets.
    return features, np.array(growths, dtype=np.int64)


### Model

Jako model stosujemy klasyczną sieć neuronową złożoną z warstw w pełni połączonych (fc), uzupełnioną o dropout.

In [35]:
# Bucket width as a fraction of the mid price (passed to get_XY).
bucket_size = 0.01

# The network input holds one value per bucket: 20 buckets on each side.
n_buckets_per_side = 20
input_size = 2 * n_buckets_per_side

# Width of every hidden fully connected layer.
hidden_size = 100

# Negative log-likelihood; expects log-probabilities from the model.
loss = nn.NLLLoss()

In [36]:
# Fully connected classifier: five Tanh hidden layers (dropout after the first
# four) followed by a 2-class log-probability output.
# The layer order is kept exactly as-is because a saved state dict addresses
# nn.Sequential entries by position.
model = nn.Sequential(
    nn.Linear(input_size, hidden_size),
    nn.Tanh(),
    nn.Dropout(p=0.6, inplace=False),
    nn.Linear(hidden_size, hidden_size),
    nn.Tanh(),
    nn.Dropout(p=0.6, inplace=False),
    nn.Linear(hidden_size, hidden_size),
    nn.Tanh(),
    nn.Dropout(p=0.6, inplace=False),
    nn.Linear(hidden_size, hidden_size),
    nn.Tanh(),
    nn.Dropout(p=0.5, inplace=False),
    nn.Linear(hidden_size, hidden_size),
    nn.Tanh(),
    nn.Linear(hidden_size, 2),
    # Explicit dim: normalize over the class axis instead of relying on the
    # deprecated implicit-dimension behaviour of LogSoftmax.
    nn.LogSoftmax(dim=-1)
)

## Wyniki

In [37]:
from utils import accuracy


# Load the saved weights once (they do not change between files) and switch
# the model to evaluation mode so dropout is disabled.
model.load_state_dict(torch.load("models/model", map_location="cpu"))
model.eval()

# Evaluate the network on every snapshot file: loss, accuracy, confusion matrix.
for file_name in file_locs:
    X, Y = get_XY(parse(file_name), n_buckets=input_size//2, bucket_size=bucket_size)
    X = torch.from_numpy(X)
    # NLLLoss requires integer class targets of dtype long.
    Y = torch.from_numpy(Y).long()

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        pred = model(X)
        loss_val = loss(pred, Y)

    pred_np = pred.numpy()
    Y_np = Y.numpy()

    print("Loss: {}".format(loss_val.item()))
    print("Acc: {}".format(accuracy(pred_np, Y_np)))
    print("Confusion matrix: ")
    print(confusion_matrix(np.argmax(pred_np, axis=1), Y_np))
    print()

Loss: 0.6931143999099731
Acc: 0.5016279574560452
Confusion matrix: 
[[2311 2296]
 [   0    0]]

Loss: 0.6932200193405151
Acc: 0.4922246220302376
Confusion matrix: 
[[2279 2351]
 [   0    0]]

Loss: 0.6931547522544861
Acc: 0.49809031678274546
Confusion matrix: 
[[2217 2234]
 [   0    0]]

Loss: 0.69325852394104
Acc: 0.48873591989987486
Confusion matrix: 
[[2343 2451]
 [   0    0]]

Loss: 0.6931905150413513
Acc: 0.4952051145444859
Confusion matrix: 
[[1859 1895]
 [   0    0]]



Dziękujemy za uwagę