In [1]:
import numpy as np
import pandas as pd
import glob
from tqdm import tqdm
import sys, os
import warnings
warnings.filterwarnings("ignore")

PATH = "/home/leon-zzh/Leon/Kaggle/optiver/optiver-realized-volatility-prediction"


def load_data(mode, path=PATH):
    """Read ``<path>/<mode>.csv`` into a DataFrame.

    mode: file stem to load, e.g. "train" or "test".
    path: directory that holds the CSV; defaults to the module-level PATH.
    """
    return pd.read_csv(f'{path}/{mode}.csv')

# Training targets: one row per (stock_id, time_id) pair with its `target`.
df = load_data("train")
# Frame shape plus the largest stock_id; Model's nn.Embedding(127, 4) has to
# cover every stock_id, so the maximum printed here must stay below 127.
print(df.shape, df["stock_id"].max())
df.head()

(428932, 3) 126


Unnamed: 0,stock_id,time_id,target
0,0,5,0.004136
1,0,11,0.001445
2,0,16,0.002168
3,0,31,0.002195
4,0,62,0.001747


In [2]:
# Targets are multiplied by SCALE in OptiverDataset.__getitem__ and the
# out-of-fold predictions are divided by it again in kfold_train.
SCALE = 100
# NOTE(review): same value as the PATH defined in the first cell; this
# absolute, machine-specific path has to be edited to run elsewhere.
PATH = "/home/leon-zzh/Leon/Kaggle/optiver/optiver-realized-volatility-prediction"

# Every file two levels below book_train.parquet (the loading loop parses a
# stock id out of each path; presumably one file per stock -- confirm).
order_book_paths = glob.glob(f'{PATH}/book_train.parquet/*/*')
len(order_book_paths)

112

In [3]:
# Same directory layout as the order books, but for the trade files.
trade_paths = glob.glob(f'{PATH}/trade_train.parquet/*/*')
len(trade_paths)

112

In [4]:
# Nested lookup: order_books[stock_id][time_id] -> DataFrame holding the
# order-book rows of that bucket, re-indexed from 0.
order_books = dict()

for path in tqdm(order_book_paths):
    # The stock id is encoded in the parent directory as "<key>=<id>". Read it
    # from that directory only, so an "=" anywhere else in the path cannot
    # break the parse.
    partition_dir = os.path.basename(os.path.dirname(path))
    stock_id = int(partition_dir.split("=")[1])
    book_df = pd.read_parquet(path)

    # One groupby pass instead of a full boolean-mask scan per time_id.
    # sort=False keeps the first-appearance order that `.unique()` produced.
    order_books[stock_id] = {
        time_id: rows.reset_index(drop=True)
        for time_id, rows in book_df.groupby("time_id", sort=False)
    }

100%|██████████| 112/112 [04:19<00:00,  2.32s/it]


In [5]:
# Nested lookup: trades[stock_id][time_id] -> DataFrame holding the trade rows
# of that bucket, re-indexed from 0. Buckets without any trade row get no key.
trades = dict()

for path in tqdm(trade_paths):
    # The stock id is encoded in the parent directory as "<key>=<id>". Read it
    # from that directory only, so an "=" anywhere else in the path cannot
    # break the parse.
    partition_dir = os.path.basename(os.path.dirname(path))
    stock_id = int(partition_dir.split("=")[1])
    trade_df = pd.read_parquet(path)

    # One groupby pass instead of a full boolean-mask scan per time_id.
    # sort=False keeps the first-appearance order that `.unique()` produced.
    trades[stock_id] = {
        time_id: rows.reset_index(drop=True)
        for time_id, rows in trade_df.groupby("time_id", sort=False)
    }

100%|██████████| 112/112 [02:48<00:00,  1.50s/it]


In [6]:
# for st_id in trades.keys():
#     for t_id in tqdm(trades[st_id].keys()):
#         filldf = pd.DataFrame({"seconds_in_bucket": range(600)})
#         filldf = pd.merge(filldf, trades[st_id][t_id], on=["seconds_in_bucket"], how="left", suffixes=("_to_move", ""))
#         filldf.fillna(-1)
#         trades[st_id][t_id] = filldf

In [7]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset


# Earlier variant that carried a ninth entry (300 / 300) for the
# "seconds_in_bucket" feature, which is commented out of order_features below.
# means_order = torch.FloatTensor([  0.9997,   1.0003, 769.9902, 766.7346,   0.9995,   1.0005, 959.3417,
#         928.2203, 300])
# stds_order = torch.FloatTensor([3.6881e-03, 3.6871e-03, 5.3541e+03, 4.9549e+03, 3.7009e-03, 3.6991e-03,
#         6.6838e+03, 5.7353e+03, 300])

# Per-feature mean / std used to standardise the order-book features. The
# eight entries line up positionally with OptiverDataset.order_features
# (bid_price1, ask_price1, bid_size1, ask_size1, bid_price2, ask_price2,
# bid_size2, ask_size2).
# NOTE(review): how these statistics were computed is not shown here.
means_order = torch.FloatTensor([  0.9997,   1.0003, 769.9902, 766.7346,   0.9995,   1.0005, 959.3417,
        928.2203])
stds_order = torch.FloatTensor([3.6881e-03, 3.6871e-03, 5.3541e+03, 4.9549e+03, 3.7009e-03, 3.6991e-03,
        6.6838e+03, 5.7353e+03])

# Earlier variant with a leading 300 / 300 entry (presumably also for
# seconds_in_bucket -- confirm).
# means_trade = torch.FloatTensor([300, 1.0, 100, 3.0])
# stds_trade = torch.FloatTensor([300, 0.004, 153, 3.5])

# Per-feature mean / std for OptiverDataset.trade_features
# (price, size, order_count), in that order.
means_trade = torch.FloatTensor([1.0, 100, 3.0])
stds_trade = torch.FloatTensor([0.004, 153, 3.5])



class OptiverDataset(Dataset):
    """One sample per row of ``df`` (a stock_id / time_id / target triple).

    ``__getitem__`` returns ``(X1, X2, stock, target)``:

    * ``X1`` -- (seq_len, 8) standardised order-book features, one row per
      second of the bucket, gaps filled from the neighbouring snapshots.
    * ``X2`` -- (seq_len, 3) standardised trade features, one row per second,
      ``-1`` wherever no trade was recorded.
    * ``stock`` -- LongTensor ``[stock_id]``.
    * ``target`` -- FloatTensor ``[target * SCALE]``.

    The per-bucket frames are read from the module-level ``order_books`` and
    ``trades`` dictionaries.
    """

    def __init__(self, df, aug=False):
        super().__init__()
        self.df = df.reset_index(drop=True)
        self.aug = aug  # stored only; nothing in this class reads it
        self.seq_len = 600
        self.order_features = ['bid_price1', 'ask_price1', 'bid_size1', 'ask_size1', 'bid_price2',
                               'ask_price2', 'bid_size2', 'ask_size2']#, "seconds_in_bucket"]
        self.trade_features = ["price", "size", "order_count"]

    def _seconds_grid(self):
        """Frame with one row per second of the bucket, used as the left side of merges."""
        return pd.DataFrame({"seconds_in_bucket": range(self.seq_len)})

    def extract_features(self, data_dict, stock_id, time_id, features, means, stds):
        """Right-align the standardised rows of one bucket in a (seq_len, F) tensor.

        Rows not covered by data stay at -1. A missing stock_id/time_id or an
        empty frame yields the all -1 tensor; if the frame has more than
        seq_len rows only the last seq_len rows are used.
        """
        X = -torch.ones((self.seq_len, len(features)))
        try:
            df = data_dict[stock_id][time_id]
        except KeyError:
            # No data for this bucket: keep the padding value everywhere.
            return X

        feature_array = df[features].values[-self.seq_len:]
        n_rows = feature_array.shape[0]
        if n_rows == 0:
            # X[-0:] would address the whole tensor, so handle "no rows" explicitly.
            return X
        X[-n_rows:] = (torch.tensor(feature_array, dtype=torch.float32) - means) / stds
        return X

    def extract_book_features(self, data_dict, stock_id, time_id, features, means, stds):
        """Return the (seq_len, F) standardised order-book features of one bucket.

        The bucket is expanded to one row per second. Seconds without a
        snapshot take the most recent earlier snapshot; seconds before the
        first snapshot take the first one, so no NaN reaches the model.
        Raises KeyError when the bucket is absent from ``data_dict``.
        """
        X = torch.zeros((self.seq_len, len(features)))
        df = data_dict[stock_id][time_id]

        filldf = pd.merge(self._seconds_grid(), df, on=["seconds_in_bucket"], how="left")
        # ffill carries the last snapshot forward; bfill only affects a leading
        # gap, which ffill alone would leave as NaN.
        filled = filldf[features].ffill().bfill()
        # Assigning into the pre-sized tensor keeps the (seq_len, F) shape check.
        X[:] = (torch.tensor(filled.values, dtype=torch.float32) - means) / stds
        return X

    def extract_trade_features(self, data_dict, stock_id, time_id, features, means, stds):
        """Return the (seq_len, F) standardised trade features of one bucket.

        Seconds without a trade are set to -1 after standardisation.
        Raises KeyError when the bucket is absent from ``data_dict``.
        """
        X = -torch.ones((self.seq_len, len(features)))
        df = data_dict[stock_id][time_id]

        filldf = pd.merge(self._seconds_grid(), df, on=["seconds_in_bucket"], how="left")
        X[:] = (torch.tensor(filldf[features].values, dtype=torch.float32) - means) / stds
        # Seconds missing from the merge are NaN at this point.
        X[:] = torch.nan_to_num(X, nan=-1.0)
        return X

    def __getitem__(self, index):
        row = self.df.iloc[index]
        # A row pulled out of a mixed int/float frame may have its ids upcast
        # to float; cast them back before dict lookups and the LongTensor.
        stock_id = int(row.stock_id)
        time_id = int(row.time_id)

        X1 = self.extract_book_features(order_books, stock_id, time_id, self.order_features,
                                        means_order, stds_order)
        try:
            X2 = self.extract_trade_features(trades, stock_id, time_id, self.trade_features,
                                             means_trade, stds_trade)
        except KeyError:
            # No trade data stored for this bucket: use the all -1 padding.
            X2 = -torch.ones((self.seq_len, len(self.trade_features)))

        target = torch.FloatTensor([row.target*SCALE])
        stock = torch.LongTensor([stock_id])
        return X1, X2, stock, target

    def __len__(self):
        return self.df.shape[0]
    
# Smoke test: wrap the full training frame and materialise one sample
# (order-book tensor, trade tensor, stock id, scaled target).
ds = OptiverDataset(df)
ds[1]

(tensor([[-0.0615, -0.0337, -0.1055,  ..., -0.0334, -0.1405, -0.1566],
         [-0.0615, -0.0337, -0.1055,  ..., -0.0334, -0.1405, -0.1566],
         [-0.0615, -0.0337, -0.1055,  ..., -0.0334, -0.1405, -0.1566],
         ...,
         [ 0.1562,  0.1296, -0.1434,  ...,  0.1023, -0.1256, -0.1528],
         [ 0.1562,  0.1296, -0.1434,  ...,  0.1023, -0.1256, -0.1528],
         [ 0.1562,  0.1296, -0.1434,  ...,  0.1023, -0.1256, -0.1528]]),
 tensor([[-0.0439, -0.6405, -0.2857],
         [-1.0000, -1.0000, -1.0000],
         [-1.0000, -1.0000, -1.0000],
         ...,
         [-1.0000, -1.0000, -1.0000],
         [-1.0000, -1.0000, -1.0000],
         [-1.0000, -1.0000, -1.0000]]),
 tensor([0]),
 tensor([0.1445]))

In [12]:
# Shape of the order-book tensor of the first sample.
ds[0][0].shape

torch.Size([600, 8])

In [8]:
class ConvBlock(nn.Module):
    """Conv1d followed by BatchNorm1d and ReLU."""

    def __init__(self, in_dim, out_dim, kernel_size, stride=1):
        super().__init__()
        self.lin = nn.Conv1d(
            in_channels=in_dim,
            out_channels=out_dim,
            kernel_size=kernel_size,
            stride=stride,
        )
        self.bn = nn.BatchNorm1d(num_features=out_dim)
        self.activation = nn.ReLU()

    def forward(self, x):
        """Apply convolution, batch norm and ReLU, in that order."""
        return self.activation(self.bn(self.lin(x)))
        

class SubModel(nn.Module):
    """Convolutional encoder for one per-second feature sequence.

    ``forward(x, s)`` transposes ``x`` so that dim 1 becomes the channel axis,
    runs two convolutions, appends the stock vector ``s`` at every time step,
    downsamples, pools to 8 positions with both average and max pooling, and
    reduces the result with three stride-2 convolutions before squeezing the
    last axis.
    """

    def __init__(self, in_dim):
        super().__init__()
        self.convs1 = nn.Sequential(
            ConvBlock(in_dim, 16, 3),
            ConvBlock(16, 32, 3),
        )
        # 36 input channels = 32 from convs1 + the stock channels concatenated
        # in forward (Model builds them with nn.Embedding(127, 4)).
        self.stock_conv = ConvBlock(36, 64, 4, stride=4)
        self.avg_pool = nn.AdaptiveAvgPool1d(8)
        self.max_pool = nn.AdaptiveMaxPool1d(8)
        # 128 input channels = 64 average-pooled + 64 max-pooled.
        self.convs2 = nn.Sequential(
            *(ConvBlock(c_in, c_out, 2, stride=2)
              for c_in, c_out in ((128, 128), (128, 32), (32, 8)))
        )

    def forward(self, x, s):
        feats = self.convs1(x.transpose(2, 1))
        # Tile the stock vector along the time axis so it can be concatenated.
        stock = s.repeat(1, 1, feats.shape[2])
        feats = self.stock_conv(torch.cat([feats, stock], dim=1))
        pooled = torch.cat([self.avg_pool(feats), self.max_pool(feats)], dim=1)
        return self.convs2(pooled).squeeze(-1)
    
    
class Model(nn.Module):
    """Two SubModel branches (order book, trades) joined by one linear layer.

    ``forward`` takes ``(x1, x2, s)``: the order-book sequence, the trade
    sequence and the stock ids. Both branches receive the same stock embedding.
    """

    def __init__(self):
        super().__init__()
        self.order_model = SubModel(in_dim=8)
        self.trade_model = SubModel(in_dim=3)
        # 16 = the two branch outputs concatenated along dim 1.
        self.top = nn.Linear(16, 1)
        self.stock_emb = nn.Embedding(127, 4)

    def forward(self, inputs):
        book_seq, trade_seq, stock_ids = inputs
        # Swap the last two axes so the embedding dimension becomes dim 1.
        stock_vec = self.stock_emb(stock_ids).transpose(2, 1)

        branches = [
            self.order_model(book_seq, stock_vec),
            self.trade_model(trade_seq, stock_vec),
        ]
        return self.top(torch.cat(branches, dim=1))

In [9]:
def read_data(data, device=None):
    """Split a batch into ``(inputs, target)`` and move every tensor to ``device``.

    data: sequence of tensors whose last element is the target.
    device: anything accepted by ``Tensor.to``. When omitted, CUDA is used if
        available (the previous hard-coded behaviour), otherwise the CPU.

    Returns a tuple of the input tensors and the target tensor.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    *inputs, target = data
    return tuple(d.to(device) for d in inputs), target.to(device)

# Function to calculate the root mean squared percentage error
# def rmspe(y_true, y_pred):
#     return np.sqrt(np.mean(np.square((y_true - y_pred) / y_true)))

def adjust_lr(optimizer, epoch):
    """Set the step-wise learning rate for ``epoch`` on every param group.

    The rate is 5e-5 for epoch < 1, 1e-3 for epoch < 10, 1e-4 for epoch < 27
    and 1e-5 afterwards. Returns the rate that was applied.
    """
    # (exclusive upper epoch bound, learning rate), checked in order.
    schedule = ((1, 5e-5), (10, 1e-3), (27, 1e-4))
    lr = next((rate for bound, rate in schedule if epoch < bound), 1e-5)

    for group in optimizer.param_groups:
        group['lr'] = lr
    return lr
    
def get_optimizer(net):
    """Build an Adam optimizer over the parameters of ``net`` that require grad."""
    trainable = (param for param in net.parameters() if param.requires_grad)
    return torch.optim.Adam(trainable, lr=3e-4, betas=(0.9, 0.999), eps=1e-08)

def rmspe(y_true, y_pred):
    """Root mean squared percentage error; predictions are clipped at 0 first."""
    clipped = np.clip(y_pred, 0, None)
    relative_error = (y_true - clipped) / y_true
    return np.sqrt(np.mean(np.square(relative_error)))


def loss_func(y_pred, y_true):
    """Mean squared percentage error (the square of RMSPE) as a torch scalar."""
    relative_error = (y_true - y_pred) / y_true
    return torch.square(relative_error).mean()


def validate(model, val_loader):
    """Run ``model`` over ``val_loader`` in eval mode without gradients.

    Returns ``(labels, preds)`` as flat numpy arrays in loader order.
    """
    model.eval()

    all_preds, all_labels = [], []
    progress = tqdm(val_loader, file=sys.stdout)

    with torch.no_grad():
        for batch in progress:
            inputs, target = read_data(batch)
            output = model(inputs)

            all_preds.append(output.detach().cpu().numpy().ravel())
            all_labels.append(target.detach().cpu().numpy().ravel())

    return np.concatenate(all_labels), np.concatenate(all_preds)



def train(model, train_loader, val_loader, epochs):
    """Train ``model`` for ``epochs`` epochs, validating after each one.

    The learning rate is set per epoch by ``adjust_lr``; the loss is
    ``loss_func``. After every epoch the train and validation RMSPE are printed.

    Returns ``(model, val_preds)`` where ``val_preds`` holds the validation
    predictions of the last epoch, or ``None`` when ``epochs <= 0``.
    """
    optimizer = get_optimizer(model)
    # Bound up front so the return below cannot hit an unbound local when the
    # epoch loop never runs.
    val_preds = None

    for e in range(epochs):
        model.train()
        tbar = tqdm(train_loader, file=sys.stdout)

        lr = adjust_lr(optimizer, e)

        loss_list = []
        preds = []
        labels = []

        for data in tbar:
            inputs, target = read_data(data)

            optimizer.zero_grad()
            pred = model(inputs)

            loss = loss_func(pred, target)
            loss.backward()
            optimizer.step()

            loss_list.append(loss.detach().cpu().item())
            preds.append(pred.detach().cpu().numpy().ravel())
            labels.append(target.detach().cpu().numpy().ravel())

            avg_loss = np.round(np.mean(loss_list), 4)

            tbar.set_description(f"Epoch {e+1} Loss: {avg_loss} lr: {lr}")

        val_labels, val_preds = validate(model, val_loader)
        val_metric = np.round(rmspe(val_labels, val_preds), 4)

        if preds:
            train_metric = np.round(rmspe(np.concatenate(labels), np.concatenate(preds)), 4)
        else:
            # The loader yielded no batch (e.g. drop_last on a tiny set);
            # np.concatenate would raise on the empty lists.
            train_metric = float("nan")
        log_text = f"Epoch {e+1}\n Train metric: {train_metric}\nValidation metric: {val_metric}\n"

        print(log_text)
    return model, val_preds



def kfold_train(BS=512, NW=7, NUM_FOLDS=5, EPOCHS=30, OUT_DIR="./NN"):
    """Train one model per fold and write weights plus out-of-fold predictions.

    Rows of the module-level ``df`` are assigned to folds by
    ``time_id % NUM_FOLDS``.

    BS: batch size of both loaders.
    NW: number of DataLoader worker processes.
    NUM_FOLDS: number of folds.
    EPOCHS: epochs trained per fold.
    OUT_DIR: directory for the ``.pth`` weight files and the OOF CSV; created
        if it does not exist.

    Side effects: saves ``optiver_nn_v01_<fold>.pth`` per fold, adds an
    ``nn_pred`` column to ``df``, writes ``optiver_nn_v01_oof.csv`` and prints
    the out-of-fold RMSPE.
    """
    # Create the output directory before any training starts, so a missing
    # folder cannot discard a finished fold at save time.
    os.makedirs(OUT_DIR, exist_ok=True)

    oof_preds = np.zeros(df.shape[0])

    for fold in range(NUM_FOLDS):
        print(f"Fold {fold + 1}")
        fold_of_row = df["time_id"].values % NUM_FOLDS
        train_ind = np.where(fold_of_row != fold)[0]
        val_ind = np.where(fold_of_row == fold)[0]

        train_df, val_df = df.iloc[train_ind], df.iloc[val_ind]

        train_ds = OptiverDataset(train_df, aug=False)
        val_ds = OptiverDataset(val_df, aug=False)

        train_loader = DataLoader(train_ds, batch_size=BS, shuffle=True, num_workers=NW,
                                  pin_memory=False, drop_last=True)
        # shuffle=False / drop_last=False keep the predictions aligned with val_ind.
        val_loader = DataLoader(val_ds, batch_size=BS, shuffle=False, num_workers=NW,
                                pin_memory=False, drop_last=False)

        model = Model()
        model.cuda()
        print("...... Start Training ......")
        model, val_preds = train(model, train_loader, val_loader, epochs=EPOCHS)

        oof_preds[val_ind] = val_preds

        torch.save(model.state_dict(), f"{OUT_DIR}/optiver_nn_v01_{fold}.pth")

    # Predictions were produced on the SCALE-multiplied target; undo that here.
    df["nn_pred"] = oof_preds/SCALE
    df.to_csv(f"{OUT_DIR}/optiver_nn_v01_oof.csv", index=False, columns=["stock_id", "time_id", "nn_pred"])

    rmspe_score = rmspe(df["target"], oof_preds/SCALE)
    print(f"Our out of folds RMSPE is {rmspe_score}")

In [10]:
# Full 5-fold run with the default batch size and worker count.
kfold_train()

Fold 1
...... Start Training ......
Epoch 1 Loss: 0.8751 lr: 5e-05: 100%|██████████| 676/676 [04:42<00:00,  2.40it/s]
100%|██████████| 162/162 [01:08<00:00,  2.35it/s]
Epoch 1
 Train metric: 0.7748000025749207
Validation metric: 0.6177999973297119

Epoch 2 Loss: 0.333 lr: 0.001: 100%|██████████| 676/676 [04:43<00:00,  2.39it/s] 
100%|██████████| 162/162 [01:07<00:00,  2.39it/s]
Epoch 2
 Train metric: 0.5274999737739563
Validation metric: 0.3977999985218048

Epoch 3 Loss: 0.1694 lr: 0.001: 100%|██████████| 676/676 [04:42<00:00,  2.39it/s]
100%|██████████| 162/162 [01:08<00:00,  2.38it/s]
Epoch 3
 Train metric: 0.4004000127315521
Validation metric: 0.31700000166893005

Epoch 4 Loss: 0.2293 lr: 0.001: 100%|██████████| 676/676 [04:42<00:00,  2.39it/s]
100%|██████████| 162/162 [01:08<00:00,  2.38it/s]
Epoch 4
 Train metric: 0.4512999951839447
Validation metric: 0.3709000051021576

Epoch 5 Loss: 0.1173 lr: 0.001: 100%|██████████| 676/676 [04:42<00:00,  2.39it/s]
100%|██████████| 162/162 [01:

In [14]:
# Manual reproduction of one kfold_train split (time_id % 5 == 1 held out)
# for interactive inspection.
train_ind = np.where(df["time_id"].values % 5 != 1)[0]
val_ind = np.where(df["time_id"].values % 5 == 1)[0]

In [15]:
# Positional row selection for the two sides of the split.
train_df, val_df = df.iloc[train_ind], df.iloc[val_ind]

In [16]:
# Datasets over the split frames (OptiverDataset resets their index itself).
train_ds = OptiverDataset(train_df, aug=False)
val_ds = OptiverDataset(val_df, aug=False)

In [17]:
# Small-batch loader (12 samples, 4 workers) used only for the timing check below.
train_loader = DataLoader(train_ds, batch_size=12, shuffle=True, num_workers=4, pin_memory=False, drop_last=True)

In [18]:
# Inspect one training sample; in the recorded output its trade tensor is all -1.
train_ds[1]

(tensor([[-0.0514, -0.0498, -0.1296,  ..., -0.0784, -0.1383, -0.1599],
         [-0.0514, -0.0498, -0.1296,  ..., -0.0784, -0.1383, -0.1599],
         [-0.0514, -0.0498, -0.1296,  ..., -0.0784, -0.1383, -0.1599],
         ...,
         [ 0.0118, -0.0498, -0.1248,  ..., -0.0910, -0.1431, -0.1606],
         [ 0.0118, -0.0498, -0.1248,  ..., -0.0910, -0.1431, -0.1606],
         [ 0.0118, -0.0498, -0.1248,  ..., -0.0910, -0.1431, -0.1606]]),
 tensor([[-1., -1., -1.],
         [-1., -1., -1.],
         [-1., -1., -1.],
         ...,
         [-1., -1., -1.],
         [-1., -1., -1.],
         [-1., -1., -1.]]),
 tensor([0]),
 tensor([0.1747]))

In [21]:
%%time
# Time how long the loader takes to deliver its first batch and show the
# batched order-book shape.
for x in train_loader:
    print(x[0].shape)
    break

torch.Size([12, 600, 8])
CPU times: user 123 ms, sys: 362 ms, total: 485 ms
Wall time: 574 ms


In [84]:
# Second full run; the recorded output shows it was stopped by a KeyboardInterrupt.
kfold_train()

Fold 1
  0%|          | 0/676 [00:59<?, ?it/s]Error



KeyboardInterrupt: 

Error
Error
Error
Error
