In [1]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataset import random_split

from sklearn.metrics import roc_auc_score

import matplotlib.pyplot as plt
%matplotlib inline

## Data Exploration and Augmentation

Use pandas to explore the training and test sets and to handle their missing (NaN) entries.

In [2]:
# Load the labelled training set and the unlabelled test set from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [3]:
# Column labels first (note the trailing space in 'opened_position_qty '),
# then a preview of the first five rows.
print(train.columns)
train.head(5)

Index(['id', 'last_price', 'mid', 'opened_position_qty ',
       'closed_position_qty', 'transacted_qty', 'd_open_interest', 'bid1',
       'bid2', 'bid3', 'bid4', 'bid5', 'ask1', 'ask2', 'ask3', 'ask4', 'ask5',
       'bid1vol', 'bid2vol', 'bid3vol', 'bid4vol', 'bid5vol', 'ask1vol',
       'ask2vol', 'ask3vol', 'ask4vol', 'ask5vol', 'y'],
      dtype='object')


Unnamed: 0,id,last_price,mid,opened_position_qty,closed_position_qty,transacted_qty,d_open_interest,bid1,bid2,bid3,...,bid2vol,bid3vol,bid4vol,bid5vol,ask1vol,ask2vol,ask3vol,ask4vol,ask5vol,y
0,0,3842.4,3842.6,,,103.0,0,3842.4,3842.0,3841.8,...,1,6,14,6,6,1,1,10,2,1
1,1,3842.8,3843.4,6.0,49.0,55.0,-43,3843.0,3842.8,3842.4,...,6,11,1,6,1,4,4,1,13,0
2,2,3844.0,3844.3,7.0,77.0,84.0,-69,3843.8,3843.6,3843.2,...,1,4,21,12,1,16,10,4,9,0
3,3,3843.8,3843.4,3.0,34.0,37.0,-30,3843.0,3842.8,3842.4,...,13,12,2,4,2,7,1,2,11,1
4,4,3843.2,3843.1,3.0,38.0,41.0,-35,3842.8,3842.4,3842.0,...,12,2,2,4,1,3,1,11,15,1


In [121]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) of the training set.
train.describe()

Unnamed: 0,id,last_price,mid,opened_position_qty,closed_position_qty,transacted_qty,d_open_interest,bid1,bid2,bid3,...,bid2vol,bid3vol,bid4vol,bid5vol,ask1vol,ask2vol,ask3vol,ask4vol,ask5vol,y
count,592380.0,592380.0,592380.0,419920.0,419920.0,592380.0,592380.0,592380.0,592380.0,592380.0,...,592380.0,592380.0,592380.0,592380.0,592380.0,592380.0,592380.0,592380.0,592380.0,592380.0
mean,296189.5,3965.639532,3965.639078,1.400583,1.964277,2.390106,-0.080745,3965.446863,3965.209031,3964.985168,...,4.906759,5.314979,5.601242,5.930859,3.887878,4.913441,5.309274,5.611319,5.92779,0.356958
std,171005.520569,81.801597,81.801148,2.262542,2.61252,3.781083,2.349781,81.795678,81.789472,81.783336,...,5.09852,5.411883,5.731617,6.256011,4.505598,5.432179,5.920962,6.505823,6.975328,0.479103
min,0.0,3812.0,3812.1,0.0,0.0,0.0,-171.0,3811.8,3811.6,3811.4,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
25%,148094.75,3896.4,3896.4,0.0,1.0,0.0,-1.0,3896.2,3896.0,3895.8,...,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,0.0
50%,296189.5,3983.8,3983.7,1.0,1.0,1.0,0.0,3983.6,3983.2,3983.0,...,3.0,4.0,4.0,4.0,2.0,3.0,4.0,4.0,4.0,0.0
75%,444284.25,4039.6,4039.5,2.0,2.0,3.0,1.0,4039.4,4039.0,4038.8,...,6.0,7.0,7.0,7.0,5.0,6.0,6.0,7.0,7.0,1.0
max,592379.0,4115.8,4116.0,80.0,212.0,311.0,50.0,4115.8,4115.6,4115.0,...,139.0,136.0,119.0,120.0,130.0,132.0,133.0,134.0,135.0,1.0


In [122]:
# (rows, columns) of the training set, followed by the number of missing values per column.
print(train.shape)
train.isnull().sum()

(592380, 28)


id                           0
last_price                   0
mid                          0
opened_position_qty     172460
closed_position_qty     172460
transacted_qty               0
d_open_interest              0
bid1                         0
bid2                         0
bid3                         0
bid4                         0
bid5                         0
ask1                         0
ask2                         0
ask3                         0
ask4                         0
ask5                         0
bid1vol                      0
bid2vol                      0
bid3vol                      0
bid4vol                      0
bid5vol                      0
ask1vol                      0
ask2vol                      0
ask3vol                      0
ask4vol                      0
ask5vol                      0
y                            0
dtype: int64

In [123]:
# (rows, columns) of the test set, followed by the number of missing values per column.
print(test.shape)
test.isnull().sum()

(191859, 27)


id                          0
last_price                  0
mid                         0
opened_position_qty     53656
closed_position_qty     53656
transacted_qty              0
d_open_interest             0
bid1                        0
bid2                        0
bid3                        0
bid4                        0
bid5                        0
ask1                        0
ask2                        0
ask3                        0
ask4                        0
ask5                        0
bid1vol                     0
bid2vol                     0
bid3vol                     0
bid4vol                     0
bid5vol                     0
ask1vol                     0
ask2vol                     0
ask3vol                     0
ask4vol                     0
ask5vol                     0
dtype: int64

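To maximize the amount of data used, we fill NaN values instead of dropping those rows altogether. Only `opened_position_qty ` and `closed_position_qty` contain NaNs, and always in the same rows. We impute them by splitting `transacted_qty` in half: the floor of the half goes to the opened quantity and the ceiling to the closed quantity, so the two filled values always sum to `transacted_qty`.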

In [124]:
def _fill_position_qty(frame):
    """Return a copy of `frame` with missing position quantities imputed.

    Missing 'opened_position_qty ' values (the column name really ends with a
    space) become floor(transacted_qty / 2) and missing 'closed_position_qty'
    values become ceil(transacted_qty / 2). Values that are already present
    are left untouched, so calling this twice gives the same result.
    """
    half_volume = frame['transacted_qty'] / 2
    # Assign the filled columns back instead of `frame[col].fillna(..., inplace=True)`:
    # the in-place form acts on a temporary Series and does not update the frame
    # once pandas Copy-on-Write is active.
    return frame.assign(**{
        'opened_position_qty ': frame['opened_position_qty '].fillna(np.floor(half_volume)),
        'closed_position_qty': frame['closed_position_qty'].fillna(np.ceil(half_volume)),
    })


train = _fill_position_qty(train)
test = _fill_position_qty(test)

In [216]:
# Preview the first rows again after the NaN fill of the previous cell.
train.head()
# Disabled scratch line for previewing min-max scaling.
# NOTE(review): as written, `.head()` binds only to the denominator, not to the
# whole quotient -- wrap the full expression in parentheses before re-enabling.
#(train - train.min()) / (train.max() - train.min()).head()

Unnamed: 0,id,last_price,mid,opened_position_qty,closed_position_qty,transacted_qty,d_open_interest,bid1,bid2,bid3,...,bid2vol,bid3vol,bid4vol,bid5vol,ask1vol,ask2vol,ask3vol,ask4vol,ask5vol,y
0,0,3842.4,3842.6,51.0,52.0,103.0,0,3842.4,3842.0,3841.8,...,1,6,14,6,6,1,1,10,2,1
1,1,3842.8,3843.4,6.0,49.0,55.0,-43,3843.0,3842.8,3842.4,...,6,11,1,6,1,4,4,1,13,0
2,2,3844.0,3844.3,7.0,77.0,84.0,-69,3843.8,3843.6,3843.2,...,1,4,21,12,1,16,10,4,9,0
3,3,3843.8,3843.4,3.0,34.0,37.0,-30,3843.0,3842.8,3842.4,...,13,12,2,4,2,7,1,2,11,1
4,4,3843.2,3843.1,3.0,38.0,41.0,-35,3842.8,3842.4,3842.0,...,12,2,2,4,1,3,1,11,15,1


In [238]:
# Look at one sample (id 0) the way the Dataset will see it: every column except
# the identifier and the label, as a (1, n_features) array.
# `.flatten()` on this array would give the 1-D feature vector.
entry = train[train['id'] == 0]
entry.drop(['id', 'y'], axis=1).values

array([[3.8424e+03, 3.8426e+03, 5.1000e+01, 5.2000e+01, 1.0300e+02,
        0.0000e+00, 3.8424e+03, 3.8420e+03, 3.8418e+03, 3.8410e+03,
        3.8404e+03, 3.8428e+03, 3.8434e+03, 3.8436e+03, 3.8438e+03,
        3.8440e+03, 8.0000e+00, 1.0000e+00, 6.0000e+00, 1.4000e+01,
        6.0000e+00, 6.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+01,
        2.0000e+00]])

## Data Loading

In [239]:
class HFTdataset(Dataset):
    """Map-style dataset over a CSV file of order-book snapshots.

    The CSV must contain the columns 'id', 'y', 'transacted_qty',
    'opened_position_qty ' (the name ends with a space) and
    'closed_position_qty'. Every column other than 'id' and 'y' is used as a
    feature.

    Preprocessing done once in the constructor:
      * missing opened/closed position quantities are filled with the floor /
        ceiling of half of 'transacted_qty';
      * rows are ordered by 'id', so that index ``i`` addresses the row with
        id ``i`` when the ids are 0..N-1 (as in train.csv);
      * every feature is min-max scaled to [0, 1] using this file's own
        column minima and maxima; constant columns are mapped to 0.

    Features and targets are materialised as float32 tensors up front, so
    ``__getitem__`` is a constant-time slice rather than a scan over the
    whole DataFrame for every sample.

    Attributes:
        data: the normalised DataFrame (with the original 'id' and 'y').
        datasize: number of rows.
    """

    def __init__(self, datafile):
        """Read `datafile` (path or buffer accepted by ``pd.read_csv``) and preprocess it."""
        data = pd.read_csv(datafile)

        # Feature manipulation
        # fill NaN -- assign the result back; `data[col].fillna(..., inplace=True)`
        # works on a temporary Series and is a no-op under pandas Copy-on-Write.
        half_volume = data['transacted_qty'] / 2
        data['opened_position_qty '] = data['opened_position_qty '].fillna(np.floor(half_volume))
        data['closed_position_qty'] = data['closed_position_qty'].fillna(np.ceil(half_volume))

        # Keep positional order aligned with the id order used for lookups.
        data = data.sort_values('id', kind='stable').reset_index(drop=True)

        # add/drop features

        # feature scaling (min-max); a zero range would give 0/0 = NaN, so
        # treat it as 1, which maps a constant column to all zeros.
        col_min = data.min()
        col_range = data.max() - col_min
        col_range = col_range.where(col_range != 0, 1)
        dataNorm = (data - col_min) / col_range
        # The identifier and the label must not be rescaled.
        dataNorm['id'] = data['id']
        dataNorm['y'] = data['y']

        self.data = dataNorm
        self.datasize = data.shape[0]

        # Pre-build the tensors handed out by __getitem__.
        feature_array = dataNorm.drop(['id', 'y'], axis=1).to_numpy()
        self._features = torch.tensor(feature_array, dtype=torch.float32)
        # Shape (N, 1) so that a single target has shape (1,), matching the
        # model's one-unit output after batching.
        self._targets = torch.tensor(data['y'].to_numpy(), dtype=torch.float32).reshape(-1, 1)

    def __len__(self):
        """Number of samples in the file."""
        return self.datasize

    def __getitem__(self, idx):
        """Return ``(features, target)`` for sample `idx`.

        ``features`` is a float32 tensor of shape (n_features,) and ``target``
        a float32 tensor of shape (1,).
        """
        return self._features[idx], self._targets[idx]

In [240]:
# train set / val set split
training_data = HFTdataset('train.csv')
train_size = int(0.8 * len(training_data))
val_size = len(training_data) - train_size

train_set, val_set = random_split(training_data, [train_size, val_size])
train_loader = DataLoader(train_set, batch_size=1000, shuffle=True)
val_loader = DataLoader(val_set, batch_size=1000, shuffle=False)

# to get our prediction results
#pred_set = pandas.read_csv('test.csv')
#pred_loader = DataLoader(pred_set, batch_size=128, shuffle=False)

## DNN

First, try a deep fully connected network: a stack of linear layers with ReLU activations and dropout in between.

In [241]:
def _hidden_block(n_in, n_out, p_drop=0.2):
    """Return the [Linear, ReLU, Dropout] triple that forms one hidden stage."""
    return [nn.Linear(n_in, n_out), nn.ReLU(), nn.Dropout(p_drop)]


# 26 input features -> 64 -> 128 -> 256 hidden units -> 1 output value.
# Additional 256 -> 256 stages (dropout 0.1) can be inserted before the output
# layer; they are currently disabled.
model_linear = nn.Sequential(
    *_hidden_block(26, 64),     # input to hidden
    *_hidden_block(64, 128),    # more hidden layers
    *_hidden_block(128, 256),
    nn.Linear(256, 1),          # hidden to output
)

## Training and Validation

In [242]:
# choose a loss function
# NOTE(review): the target is a 0/1 label; nn.BCEWithLogitsLoss is the usual
# choice for that. MSE is kept here so the training objective is unchanged.
loss_fn = nn.MSELoss()

def run_training(model, loss_fn, optimizer, N_epoch=25, device='cpu'):
    """Train `model` and report the validation ROC AUC after every epoch.

    Batches are drawn from the notebook-level `train_loader` and `val_loader`.
    Each batch is expected to be a ``(features, target)`` pair, and the model
    is expected to emit one value per sample; that value is used directly as
    the ranking score for the AUC.

    Args:
        model: the network to optimise (already moved to `device`).
        loss_fn: callable ``loss_fn(output, target)`` returning a scalar tensor.
        optimizer: optimiser built over ``model.parameters()``.
        N_epoch: number of passes over `train_loader`.
        device: device the batches are moved to.

    Returns:
        ``(train_losses, val_aucs)``: the mean training batch loss and the
        validation ROC AUC, one entry per epoch.

    Side effects:
        Overwrites 'model.pth' with the pickled model after every epoch and
        prints one training line and one validation line per epoch.
    """
    train_losses = []
    val_aucs = []

    for epoch in range(N_epoch):
        # train
        model.train()
        batch_losses = []

        for data, target in train_loader:
            data, target = data.to(device), target.to(device)

            optimizer.zero_grad()

            output = model(data)
            loss = loss_fn(output, target)
            loss.backward()

            optimizer.step()
            batch_losses.append(loss.item())

        # Track the mean loss over the epoch; the last batch alone is noisy
        # and would be undefined for an empty loader.
        epoch_loss = sum(batch_losses) / len(batch_losses) if batch_losses else float('nan')
        train_losses.append(epoch_loss)
        print('Train Epoch: %d  Loss: %.4f' % (epoch + 1, epoch_loss))

        # save the model (whole pickled module, same file every epoch)
        torch.save(model, 'model.pth')

        # validation
        model.eval()
        scores, labels = [], []
        with torch.no_grad():
            for data, target in val_loader:
                output = model(data.to(device))
                # Keep the raw outputs as scores: ROC AUC ranks continuous
                # values, and an argmax over a single output column is always 0.
                scores.append(output.reshape(-1).cpu())
                labels.append(target.reshape(-1).cpu())

        if scores:
            # One AUC over the whole validation set rather than per batch.
            roc = roc_auc_score(torch.cat(labels).numpy(), torch.cat(scores).numpy())
            val_aucs.append(roc)
            print('Epoch : %d;  ROC: %.3f' % (epoch + 1, roc))

    return train_losses, val_aucs

In [243]:
# Use the first CUDA GPU when one is visible, otherwise fall back to the CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(device)

# Move the network to the chosen device, then optimise it with Adam.
model_linear.to(device)
optimizer = optim.Adam(model_linear.parameters(), lr=1e-3)
run_training(model=model_linear, loss_fn=loss_fn, optimizer=optimizer, device=device)

KeyboardInterrupt: 

## Make Predictions