In [1]:
import torch
import torch.nn.functional as F
from torch.utils import data
from torchinfo import summary
import torch.nn as nn
import torch.optim as optim
import sys
import pandas as pd
import pickle
import numpy as np
from tqdm import tqdm
from datetime import datetime
import copy
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

In [2]:
import platform; print(platform.platform())

macOS-14.6-arm64-arm-64bit


In [3]:
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print (x)
else:
    print ("MPS device not found.")

tensor([1.], device='mps:0')


In [4]:
def data_classification(X, Y, T):
    [N, D] = X.shape
    # print(X.shape,T,N,Y.shape)
    df = np.array(X)
    # print(df.shape)
    dY = np.array(Y)

    dataY = dY[T - 1:N]

    dataX = np.zeros((N - T + 1, T, D))
    for i in range(T, N + 1):
        dataX[i - T] = df[i - T:i, :]

    return dataX, dataY


In [5]:
device = mps_device

In [6]:
class deeplob(nn.Module):
    def __init__(self, y_len):
        super().__init__()
        self.y_len = y_len
        
        # convolution blocks
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=32, kernel_size=(1,2), stride=(1,2)),
            nn.LeakyReLU(negative_slope=0.01),
#             nn.Tanh(),
            nn.BatchNorm2d(32),
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4,1)),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(32),
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4,1)),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(32),
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(1,2), stride=(1,2)),
            nn.Tanh(),
            nn.BatchNorm2d(32),
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4,1)),
            nn.Tanh(),
            nn.BatchNorm2d(32),
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4,1)),
            nn.Tanh(),
            nn.BatchNorm2d(32),
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(1,10)),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(32),
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4,1)),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(32),
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4,1)),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(32),
        )
        
        # inception moduels
        self.inp1 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=(1,1), padding='same'),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(64),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=(3,1), padding='same'),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(64),
        )
        self.inp2 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=(1,1), padding='same'),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(64),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=(5,1), padding='same'),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(64),
        )
        self.inp3 = nn.Sequential(
            nn.MaxPool2d((3, 1), stride=(1, 1), padding=(1, 0)),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=(1,1), padding='same'),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(64),
        )
        
        # lstm layers
        self.lstm = nn.LSTM(input_size=192, hidden_size=64, num_layers=1, batch_first=True)
        self.fc1 = nn.Linear(64, self.y_len)

    def forward(self, x):
        # h0: (number of hidden layers, batch size, hidden size)
        h0 = torch.zeros(1, x.size(0), 64).to(device)
        c0 = torch.zeros(1, x.size(0), 64).to(device)
    
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        
        x_inp1 = self.inp1(x)
        x_inp2 = self.inp2(x)
        x_inp3 = self.inp3(x)  
        
        x = torch.cat((x_inp1, x_inp2, x_inp3), dim=1)
        
#         x = torch.transpose(x, 1, 2)
        x = x.permute(0, 2, 1, 3).contiguous()
        x = torch.reshape(x, (-1, x.shape[1], x.shape[2]))
        
        x, _ = self.lstm(x, (h0, c0))
        x = x[:, -1, :]
        x = self.fc1(x)
        forecast_y = torch.softmax(x, dim=1)
        
        return forecast_y

In [7]:
model = torch.load('/Users/jandh/Desktop/Old Desktop/od/1 quater/Project Lab/best_val_model_pytorch')
model.to(mps_device)

  model = torch.load('/Users/jandh/Desktop/Old Desktop/od/1 quater/Project Lab/best_val_model_pytorch')


deeplob(
  (conv1): Sequential(
    (0): Conv2d(1, 32, kernel_size=(1, 2), stride=(1, 2))
    (1): LeakyReLU(negative_slope=0.01)
    (2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Conv2d(32, 32, kernel_size=(4, 1), stride=(1, 1))
    (4): LeakyReLU(negative_slope=0.01)
    (5): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): Conv2d(32, 32, kernel_size=(4, 1), stride=(1, 1))
    (7): LeakyReLU(negative_slope=0.01)
    (8): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (conv2): Sequential(
    (0): Conv2d(32, 32, kernel_size=(1, 2), stride=(1, 2))
    (1): Tanh()
    (2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Conv2d(32, 32, kernel_size=(4, 1), stride=(1, 1))
    (4): Tanh()
    (5): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): Conv2d(32, 32, kernel_size=(4, 1), stride

In [8]:
class Dataset_LOBSTER(data.Dataset):
    """Characterizes a dataset for PyTorch"""
    def __init__(self, x,y, num_classes, T):
        """Initialization""" 
        # self.k = k
        self.num_classes = num_classes
        self.T = T
            
        # x = prepare_x(data)
        # y = get_label(data)
        x, y = data_classification(x, y, self.T)
        # y = y[:,self.k] - 1
        self.length = len(x)

        x = torch.from_numpy(x)
        self.x = torch.unsqueeze(x, 1)
        self.y = torch.from_numpy(y)

    def __len__(self):
        """Denotes the total number of samples"""
        return self.length

    def __getitem__(self, index):
        """Generates samples of data"""
        return self.x[index], self.y[index]

In [9]:
def prep_data_quantile(test_data,norm,q=0.995):
    '''quantile based threshold for labelling 999%'''
    test_data = test_data.astype(np.float64)
    for i in range(0,40,2):
        if i%2==0:
            test_data.loc[:,i] = test_data.loc[:,i]/10000
    # print(test_data.head())
    
    #Creating labels of the data
    mid_Price = (test_data.iloc[:,0]+test_data.iloc[:,2])/2
    foward_mean = mid_Price[::-1].rolling(window = 10,min_periods = 10).mean()[::-1].shift(-1)
    pc_mid = (foward_mean-mid_Price)/mid_Price

    up = pc_mid.quantile(q)
    down = pc_mid.quantile(1-q)
    print(f"upvalue= {up:.4%} , downvalue = {down:.4%} ")
    labels = pc_mid.copy(deep=True)
    labels.loc[(pc_mid<up) & (pc_mid>down)] = 1
    labels.loc[pc_mid>=up] = 0
    labels.loc[pc_mid<=down] = 2

    print(labels.unique(),labels.value_counts(),'labels distribution')
    #Normalizing using z-score
    if norm=='Z':
        scaler = StandardScaler()
        scaler.fit(test_data)
        test_data = scaler.transform(test_data)

    #Normalizing using DecPrec
    if norm=='DecPrec':
        k_len = np.ceil(np.log10(test_data.abs().max()))
        # print(k_len)
        test_data = test_data/(10**k_len)
    # test
    # print(pc_mid.iloc[-10],pc_mid.iloc[-11])
    return test_data[:-10].to_numpy(),labels[:-10].to_numpy()

In [10]:
def prep_data_lobster(test_data,norm):

    '''Fixed threshold for labelling 20bps'''
    
    test_data = test_data.astype(np.float64)
    for i in range(0,40,2):
        if i%2==0:
            test_data.loc[:,i] = test_data.loc[:,i]/10000
    # print(test_data.head())
    def label_thresholder(x):
        if x>=0.002:
            return 0
        elif x<=-0.002:
            return 2
        else:
            return 1
    #Creating labels of the data
    mid_Price = (test_data.iloc[:,0]+test_data.iloc[:,2])/2
    foward_mean = mid_Price[::-1].rolling(window = 10,min_periods = 10).mean()[::-1].shift(-1)
    pc_mid = (foward_mean-mid_Price)/mid_Price
    # print(pc_mid.mean(),pc_mid.min(),pc_mid.max(),mid_Price)
    labels = pc_mid.apply(label_thresholder)
    print(labels.unique(),labels.value_counts(),'labels distribution')
    #Normalizing using z-score
    if norm=='Z':
        scaler = StandardScaler()
        scaler.fit(test_data)
        test_data = scaler.transform(test_data)

    #Normalizing using DecPrec
    if norm=='DecPrec':
        k_len = np.ceil(np.log10(test_data.abs().max()))
        # print(k_len)
        test_data = test_data/(10**k_len)
    # test
    # print(pc_mid.iloc[-10],pc_mid.iloc[-11])
    return test_data[:-10].to_numpy(),labels[:-10].to_numpy()

In [11]:
def Generic_test(test_data,name,prep_data,q=0.995):
    '''process inputs and score inputs on targets'''
    if prep_data.__name__=='prep_data_lobster':
        inputs, targets = prep_data(test_data,norm='DecPrec')

    elif prep_data.__name__=='prep_data_quantile':
        inputs, targets = prep_data(test_data,norm='DecPrec',q=q)
        
    # test_data = pd.read_excel(location,header=None)
    
    # col = 21
    # print(inputs[:,col].mean(),np.median(inputs[:,col]),inputs[:,col].max(),inputs[:,col].min(),inputs.shape)
    batch_size = 32
    dataset_test = Dataset_LOBSTER(inputs,targets, num_classes=3, T=100)
    test_loader = torch.utils.data.DataLoader(dataset=dataset_test, batch_size=batch_size, shuffle=False)
    # tmp_loader = torch.utils.data.DataLoader(dataset=dataset_test, batch_size=1, shuffle=True)
    
    # for x, y in tmp_loader:
    #     print(x)
    #     print(y)
    #     print(x.shape, y.shape)
    #     break
    n_correct = 0.
    n_total = 0.
    all_targets = []
    all_predictions = []
    for inputs, targets in test_loader:
        # Move to GPU
        model.eval()
        inputs, targets = inputs.to(device, dtype=torch.float), targets.to(device, dtype=torch.int64)
    
        # Forward pass
        outputs = model(inputs)
        
        # Get prediction
        # torch.max returns both max and argmax
        _, predictions = torch.max(outputs, 1)
    
        # update counts
        n_correct += (predictions == targets).sum().item()
        n_total += targets.shape[0]

        all_targets.append(targets.cpu().numpy())
        all_predictions.append(predictions.cpu().numpy())
    
    test_acc = n_correct / n_total
    print(f"Test acc of {name}_dataset: {test_acc:.4f}\n")
    all_targets = np.concatenate(all_targets)    
    all_predictions = np.concatenate(all_predictions)
    print('accuracy_score:', accuracy_score(all_targets, all_predictions))
    print(classification_report(all_targets, all_predictions, digits=4))

In [20]:
location = '/Users/jandh/Desktop/Old Desktop/od/1 quater/Project Lab/AMZN_2012-06-21_34200000_57600000_orderbook_10.xlsx'

In [21]:
test_data = pd.read_excel(location,header=None)
# test_data = test_data.iloc[::10]  # use if u want to downsample for 10 ticks
# inputs, targets = prep_data(test_data,norm='DecPrec')

In [22]:
def quantile_stats(col,suffix=None,verbose=True):
    q = [0.0001,0.001,0.01,0.05,0.1,0.25,0.5,0.75,0.9,0.95,0.99,0.999,0.9999]
    # ret_df = pd.DataFrame({'quantiles':q})
    # ret_df.set_index('quantiles',inplace=True)
    # ret_df['value'] = 0
    # ret_df['count']
    qvs= []
    qcs= []
    for i in q:
        
        if i>=0.05:
            quantile_count = (1-i)*len(col)
        else:
            quantile_count = i*len(col)

        qvs += [col.quantile(i)]
        qcs += [int(quantile_count)]
        if verbose:
            print(f"qunatile= {i:.2%} , quantileValue = {col.quantile(i):.4%} ,  quantile_count = {int(quantile_count)}")
    ret_df = pd.DataFrame({'quantiles':q,'values':qvs,'counts':qcs})
    ret_df.set_index(['quantiles','counts'],inplace=True)
    if suffix:
        ret_df.columns = ret_df.columns.map(lambda x: str(x) + suffix)
    return ret_df

In [23]:
def get_stats(test_data):
    # test_data = pd.read_excel(location,header=None)
    # test_data = test_data.iloc[::10]
    mid_Price = (test_data.iloc[:,0]+test_data.iloc[:,2])/2
    foward_mean = mid_Price[::-1].rolling(window = 10,min_periods = 10).mean()[::-1].shift(-1)
    pc_mid = (foward_mean-mid_Price)/mid_Price
    pc_mid = pc_mid[pc_mid.notnull()]
    quantile_stats(pc_mid)
    
# location = '/Users/jandh/Desktop/Old Desktop/od/1 quater/Project Lab/AMZN_2012-06-21_34200000_57600000_orderbook_10.xlsx'
get_stats(test_data)

qunatile= 0.01% , quantileValue = -0.0614% ,  quantile_count = 26
qunatile= 0.10% , quantileValue = -0.0337% ,  quantile_count = 269
qunatile= 1.00% , quantileValue = -0.0155% ,  quantile_count = 2697
qunatile= 5.00% , quantileValue = -0.0067% ,  quantile_count = 256251
qunatile= 10.00% , quantileValue = -0.0036% ,  quantile_count = 242764
qunatile= 25.00% , quantileValue = -0.0007% ,  quantile_count = 202303
qunatile= 50.00% , quantileValue = 0.0000% ,  quantile_count = 134869
qunatile= 75.00% , quantileValue = 0.0004% ,  quantile_count = 67434
qunatile= 90.00% , quantileValue = 0.0036% ,  quantile_count = 26973
qunatile= 95.00% , quantileValue = 0.0067% ,  quantile_count = 13486
qunatile= 99.00% , quantileValue = 0.0158% ,  quantile_count = 2697
qunatile= 99.90% , quantileValue = 0.0334% ,  quantile_count = 269
qunatile= 99.99% , quantileValue = 0.0738% ,  quantile_count = 26


In [24]:
# len(test_data)

AMAZON LOBSTER TEST

In [25]:
Generic_test(test_data,'Amazon',prep_data_lobster)

[1] 1    269748
Name: count, dtype: int64 labels distribution
Test acc of Amazon_dataset: 0.6754

accuracy_score: 0.6753585349300361
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         0
           1     1.0000    0.6754    0.8062    269639
           2     0.0000    0.0000    0.0000         0

    accuracy                         0.6754    269639
   macro avg     0.3333    0.2251    0.2687    269639
weighted avg     1.0000    0.6754    0.8062    269639



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [26]:
Generic_test(test_data,'Amazon',prep_data_quantile)

upvalue= 0.0201% , downvalue = -0.0200% 
[ 0.  1.  2. nan] 1.0    267040
0.0      1349
2.0      1349
Name: count, dtype: int64 labels distribution
Test acc of Amazon_dataset: 0.6716

accuracy_score: 0.6716350379581588
              precision    recall  f1-score   support

           0     0.0064    0.1190    0.0122      1344
           1     0.9915    0.6763    0.8041    266946
           2     0.0062    0.2898    0.0122      1349

    accuracy                         0.6716    269639
   macro avg     0.3347    0.3617    0.2762    269639
weighted avg     0.9816    0.6716    0.7962    269639



APPLE TEST

In [27]:
location = '/Users/jandh/Desktop/Old Desktop/od/1 quater/Project Lab/AAPL_2012-06-21_34200000_57600000_orderbook_10.xlsx'
test_data = pd.read_excel(location,header=None)
# test_data = test_data.iloc[::10]  # use if u want to downsample for 10 ticks

In [28]:
get_stats(test_data)    

qunatile= 0.01% , quantileValue = -0.0260% ,  quantile_count = 40
qunatile= 0.10% , quantileValue = -0.0164% ,  quantile_count = 400
qunatile= 1.00% , quantileValue = -0.0087% ,  quantile_count = 4003
qunatile= 5.00% , quantileValue = -0.0042% ,  quantile_count = 380361
qunatile= 10.00% , quantileValue = -0.0026% ,  quantile_count = 360342
qunatile= 25.00% , quantileValue = -0.0007% ,  quantile_count = 300285
qunatile= 50.00% , quantileValue = 0.0000% ,  quantile_count = 200190
qunatile= 75.00% , quantileValue = 0.0007% ,  quantile_count = 100095
qunatile= 90.00% , quantileValue = 0.0025% ,  quantile_count = 40038
qunatile= 95.00% , quantileValue = 0.0042% ,  quantile_count = 20019
qunatile= 99.00% , quantileValue = 0.0086% ,  quantile_count = 4003
qunatile= 99.90% , quantileValue = 0.0160% ,  quantile_count = 400
qunatile= 99.99% , quantileValue = 0.0237% ,  quantile_count = 40


In [29]:
Generic_test(test_data,'Apple',prep_data_lobster)

[1] 1    400391
Name: count, dtype: int64 labels distribution
Test acc of Apple_dataset: 0.9654

accuracy_score: 0.9653544251302831


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         0
           1     1.0000    0.9654    0.9824    400282
           2     0.0000    0.0000    0.0000         0

    accuracy                         0.9654    400282
   macro avg     0.3333    0.3218    0.3275    400282
weighted avg     1.0000    0.9654    0.9824    400282



In [30]:
Generic_test(test_data,'Apple',prep_data_quantile)

upvalue= 0.0107% , downvalue = -0.0109% 
[ 1.  0.  2. nan] 1.0    396377
0.0      2002
2.0      2002
Name: count, dtype: int64 labels distribution
Test acc of Apple_dataset: 0.9566

accuracy_score: 0.9566105895343783
              precision    recall  f1-score   support

           0     0.0113    0.0787    0.0198      1995
           1     0.9905    0.9659    0.9780    396286
           2     0.1071    0.0015    0.0030      2001

    accuracy                         0.9566    400282
   macro avg     0.3697    0.3487    0.3336    400282
weighted avg     0.9812    0.9566    0.9684    400282



MSFT TEST

In [31]:
location = '/Users/jandh/Desktop/Old Desktop/od/1 quater/Project Lab/MSFT_2012-06-21_34200000_57600000_orderbook_10.xlsx'
test_data = pd.read_excel(location,header=None)
# test_data = test_data.iloc[::10] 
get_stats(test_data)

qunatile= 0.01% , quantileValue = -0.0283% ,  quantile_count = 66
qunatile= 0.10% , quantileValue = -0.0166% ,  quantile_count = 668
qunatile= 1.00% , quantileValue = -0.0115% ,  quantile_count = 6687
qunatile= 5.00% , quantileValue = 0.0000% ,  quantile_count = 635317
qunatile= 10.00% , quantileValue = 0.0000% ,  quantile_count = 601879
qunatile= 25.00% , quantileValue = 0.0000% ,  quantile_count = 501566
qunatile= 50.00% , quantileValue = 0.0000% ,  quantile_count = 334377
qunatile= 75.00% , quantileValue = 0.0000% ,  quantile_count = 167188
qunatile= 90.00% , quantileValue = 0.0000% ,  quantile_count = 66875
qunatile= 95.00% , quantileValue = 0.0000% ,  quantile_count = 33437
qunatile= 99.00% , quantileValue = 0.0114% ,  quantile_count = 6687
qunatile= 99.90% , quantileValue = 0.0166% ,  quantile_count = 668
qunatile= 99.99% , quantileValue = 0.0297% ,  quantile_count = 66


In [32]:

Generic_test(test_data,'MSFT',prep_data_lobster)

[1] 1    668765
Name: count, dtype: int64 labels distribution
Test acc of MSFT_dataset: 0.9933

accuracy_score: 0.993265595463138


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         0
           1     1.0000    0.9933    0.9966    668656
           2     0.0000    0.0000    0.0000         0

    accuracy                         0.9933    668656
   macro avg     0.3333    0.3311    0.3322    668656
weighted avg     1.0000    0.9933    0.9966    668656



In [33]:
Generic_test(test_data,'MSFT',prep_data_quantile)

upvalue= 0.0146% , downvalue = -0.0147% 
[ 1.  2.  0. nan] 1.0    662063
2.0      3347
0.0      3345
Name: count, dtype: int64 labels distribution
Test acc of MSFT_dataset: 0.9836

accuracy_score: 0.9836268574573472
              precision    recall  f1-score   support

           0     0.0202    0.0132    0.0160      3334
           1     0.9902    0.9934    0.9918    661981
           2     0.0146    0.0102    0.0120      3341

    accuracy                         0.9836    668656
   macro avg     0.3417    0.3389    0.3399    668656
weighted avg     0.9805    0.9836    0.9820    668656

