# Analiza zmiany wartości mid price na podstawie Limit Order Book

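W tym notatniku analizujemy arkusze zleceń (Limit Order Book, LOB) i na ich podstawie próbujemy przewidzieć kierunek zmiany mid price.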


In [1]:
from imbalance import prep_data
from parser import parse
from utils import confusion_matrix
from warnings import filterwarnings
filterwarnings('ignore')


# Order-book snapshot CSV files, one per numbered data set; every cell below
# iterates over this list.
file_locs = [
    'data/OrderBookSnapshots.9061.csv',
    'data/OrderBookSnapshots.9062.csv',
    'data/OrderBookSnapshots.9063.csv',
    'data/OrderBookSnapshots.9064.csv',
    'data/OrderBookSnapshots.9065.csv',
]

## Imbalance

Punktem odniesienia, z którym będziemy się porównywać, jest metoda imbalance z pracy (link).

In [2]:
from sklearn import linear_model


# Baseline: for every snapshot file, fit a linear classifier with SGD on the
# features produced by prep_data, then report its test-set score and
# confusion matrix.
for file_name in file_locs:
    X_train, Y_train, X_test, Y_test = prep_data(parse(file_name))

    # NOTE(review): scikit-learn deprecated loss="log" in 1.1 and removed it
    # in 1.3 in favour of "log_loss" - confirm against the installed version.
    classifier = linear_model.SGDClassifier(
        loss="log",
        alpha=0.1,
        max_iter=3000,
        tol=0,
        shuffle=False,
    )
    classifier.fit(X_train, Y_train)

    Y_check = classifier.predict(X_test)
    # One print call emitting the score, the matrix and a blank separator line.
    print(classifier.score(X_test, Y_test), confusion_matrix(Y_check, Y_test), "", sep="\n")

0.5510204081632653
[[260 207]
 [211 253]]

0.5250800426894343
[[200 177]
 [268 292]]

0.5465631929046563
[[216 154]
 [255 277]]

0.5159629248197735
[[169 166]
 [304 332]]

0.5551181102362205
[[169 119]
 [220 254]]



## Klasyfikator oparty na sieci neuronowej

In [3]:
import torch
from torch import nn
import numpy as np

No nie wiem, czy coś tu pisać.

### Preprocessing

Zamiast brać pojedyncze słupki z LOB, dzielimy cały zakres cen na kubełki, których szerokość jest ułamkiem wartości mid price. Następnie bierzemy tylko kilka kubełków z każdej strony mid price i normujemy je tak, aby wartości sumowały się do jedynki.


In [4]:
### file: preprocess.py

def fits(index, arr):
    """Return True when ``index`` is a valid non-negative position in ``arr``."""
    return 0 <= index < len(arr)


def _spread_order(rows, centers, side, price, size, width):
    """Add one order's size to ``rows[side]``, smeared over that side's buckets.

    Every bucket of the side receives a share proportional to the inverse
    squared distance (measured in bucket widths) between the order price and
    the bucket center. The squared distance is floored at 0.25 so that a price
    sitting on a center cannot produce an unbounded weight. The shares sum to
    one, so exactly ``size`` is added in total.
    """
    dist_sq = (np.abs(centers[side] - price) / width) ** 2.
    weights = 1. / np.maximum(dist_sq, 0.25)
    rows[side] += size * (weights / weights.sum())


def get_XY(data, n_buckets=5, bucket_size=0.05, omit_no_change=True):
    """Turn consecutive order-book snapshots into features and growth labels.

    Args:
        data: mapping from a key of the form ``"<day> <HHMM...>"`` to a
            sequence whose item 0 is the list of ``(price, size)`` bids
            (highest bid last), item 1 the list of ``(price, size)`` asks
            (lowest ask first) and item 2 the mid price.
        n_buckets: number of price buckets kept on each side of the mid price.
        bucket_size: bucket width as a fraction of the mid price.
        omit_no_change: when true, snapshot pairs whose mid price does not
            change are skipped.

    Returns:
        ``((X_train, Y_train), (X_test, Y_test))``. Each row of ``X``
        (float32) holds ``2 * n_buckets`` bucket volumes, bids first,
        normalised to sum to one. ``Y`` (int64) is 1 when the mid price of
        the following snapshot is higher, else 0. The first ``len(Y) // 8``
        samples form the test part; the rest are for training/validation.
    """
    keys = sorted(data.keys())

    n_rows = 2 * n_buckets
    bid_side = slice(0, n_buckets)
    ask_side = slice(n_buckets, n_rows)

    growths = []
    X = []

    for curr_key, next_key in zip(keys, keys[1:]):
        _, curr_time = curr_key.split()
        _, next_time = next_key.split()
        curr_hhmm = int(curr_time[:4])
        next_hhmm = int(next_time[:4])

        # Keep only pairs inside 09:00-16:00 whose HHMM values differ by one.
        # NOTE(review): the plain "+ 1" also rejects pairs that cross an hour
        # boundary (0959 -> 1000) and the day part is never compared; kept
        # as-is so the samples stay the same as before - confirm intent.
        if curr_hhmm < 900 or curr_hhmm > 1600 or curr_hhmm + 1 != next_hhmm:
            continue

        curr_book = data[curr_key]
        next_book = data[next_key]
        mid_price = curr_book[2]
        next_mid = next_book[2]

        if omit_no_change and not (next_mid != mid_price):
            continue

        # Valid data only: in both snapshots the lowest ask must lie above
        # the highest bid.
        if not (curr_book[0][-1][0] < curr_book[1][0][0]
                and next_book[0][-1][0] < next_book[1][0][0]):
            continue

        width = mid_price * bucket_size
        centers = (np.arange(n_rows) - n_buckets + 0.5) * bucket_size * mid_price + mid_price
        rows = np.zeros(n_rows)

        # NOTE(review): int() truncates toward zero, so an order up to one
        # bucket outside the window edge still maps to index 0 of its side,
        # and only the combined 2 * n_buckets range is checked. Kept
        # unchanged for compatibility with previously trained models.
        for bid_price, bid_size in reversed(curr_book[0]):
            bucket = int((n_buckets * bucket_size - (mid_price - bid_price) / mid_price) / bucket_size)
            if 0 <= bucket < n_rows:
                _spread_order(rows, centers, bid_side, bid_price, bid_size, width)

        for ask_price, ask_size in curr_book[1]:
            bucket = int(((ask_price - mid_price) / mid_price) / bucket_size)
            if 0 <= bucket + n_buckets < n_rows:
                _spread_order(rows, centers, ask_side, ask_price, ask_size, width)

        growths.append(mid_price < next_mid)
        # NOTE(review): if no order fell inside the window the sum is zero and
        # the row becomes NaN, exactly as before.
        rows /= rows.sum()
        X.append(rows)

    X = np.array(X, dtype=np.float32)
    # np.int was removed in NumPy 1.24; int64 is also the label dtype that
    # torch's CrossEntropyLoss expects.
    Y = np.array(growths, dtype=np.int64)

    test_size = len(Y) // 8

    # Split into the training/validation part and the test part.
    return (X[test_size:], Y[test_size:]), (X[:test_size], Y[:test_size])


### Model

Do trenowania używamy klasycznej sieci neuronowej z warstwami w pełni połączonymi (fc) oraz dodatkowo z dropoutem.

In [5]:
# Settings shared by the preprocessing call and the network definition.
hidden_size = 1000   # width of the first hidden layer
input_size = 2 * 3   # get_XY is called with n_buckets=input_size//2, i.e. 3 per side
bucket_size = 0.04   # passed to get_XY as the bucket width (fraction of mid price)

# Loss used to score the network's two-class output.
loss = nn.CrossEntropyLoss()

In [6]:
# Fully connected classifier: input -> hidden (tanh, dropout) -> hidden/4
# (ReLU) -> 2 output scores.
# The layer order must stay as is: nn.Sequential names parameters by
# position, and the saved state dicts loaded later rely on those names.
_layers = [
    nn.Linear(input_size, hidden_size),
    nn.Tanh(),
    nn.Dropout(p=0.6),
    nn.Linear(hidden_size, hidden_size // 4),
    nn.ReLU(),
    nn.Linear(hidden_size // 4, 2),
]
model = nn.Sequential(*_layers)

## Wyniki

In [7]:
from utils import accuracy

from preprocess import get_XY

# Evaluate, for every snapshot file, the saved network of that data set on
# the test part returned by get_XY, and print loss, accuracy and the
# confusion matrix.
for file_name in file_locs:
    _, (X, Y) = get_XY(parse(file_name), n_buckets=input_size//2, bucket_size=bucket_size)

    # The set number is the second-to-last dot-separated field of the path
    # ("...Snapshots.9061.csv" -> "9061"). Splitting from the right keeps this
    # correct when a directory part contains dots, e.g. "../data/...".
    set_number = file_name.rsplit('.', 2)[-2]

    model.load_state_dict(torch.load("models/model"+set_number))
    model.eval()  # switch dropout off for evaluation
    X = torch.from_numpy(X)
    Y = torch.from_numpy(Y)

    # Inference only: no autograd graph is needed, and calling the module
    # (rather than .forward) goes through the regular nn.Module call path.
    with torch.no_grad():
        pred = model(X)
        loss_val = loss(pred, Y)

    pred_np = pred.numpy()
    Y_np = Y.numpy()
    print("Loss: {}".format(loss_val.item()))
    print("Acc: {}".format(accuracy(pred_np, Y_np)))
    print("Confusion matrix: ")
    print(confusion_matrix(np.argmax(pred_np, axis=1), Y_np))
    print()
    
## python3 main.py --data ../data/OrderBookSnapshots.9062.csv --debug --epochs 250 --buckets 3 --bucket_size 0.04 --print_every 100 --hidden 1000

Loss: 0.6941564679145813
Acc: 0.5064655172413793
Confusion matrix: 
[[  0   0]
 [229 235]]

Loss: 0.6927377581596375
Acc: 0.5161987041036717
Confusion matrix: 
[[  0   0]
 [224 239]]

Loss: 0.7168537974357605
Acc: 0.49774774774774777
Confusion matrix: 
[[  0   0]
 [223 221]]

Loss: 0.6917941570281982
Acc: 0.5269709543568465
Confusion matrix: 
[[211 196]
 [ 32  43]]

Loss: 0.6923332214355469
Acc: 0.5108108108108108
Confusion matrix: 
[[  0   0]
 [181 189]]



Dziękujemy za uwagę