In [1]:
import os
import pandas as pd
import numpy as np
from scipy import stats
from pathlib import Path
from dotenv import load_dotenv
from tqdm import tqdm
import wandb

import torch
from torch import nn
from torch.utils.data import DataLoader

from src.metrics import pearson_metric
from src.torch_models import EmbedMLP
from src.data import Dataset


# Load key/value pairs from a .env file into the process environment.
# NOTE(review): presumably this is where `dataset_dir` (read from os.environ
# in the data-preparation cell) is expected to come from — confirm.
load_dotenv()

True

In [2]:
# Device used for the model and for every batch.
# The preferred target is the third GPU ("cuda:2"). On machines that do not
# expose at least three CUDA devices we fall back to the default GPU, and then
# to the CPU, so the notebook still runs end-to-end instead of failing later
# at `.to(device)`.
if torch.cuda.is_available() and torch.cuda.device_count() > 2:
    device = "cuda:2"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Hyperparameters

In [3]:
# Number of rows per batch for both the train and the test DataLoader.
batch_size = 100_000
# Number of full passes over the training set.
epochs = 20
# Probability with which a training row has its investment id replaced by 0
# (the reserved "unseen investment" id) inside the training loop.
investment_id_dropout = 0.01

# Seed PyTorch's RNG so that weight initialisation and the investment-id
# dropout masks are reproducible across "Restart & Run All".
seed = 42
torch.manual_seed(seed)
# Re-bound only so the cell ends on an assignment and prints nothing.
seed = seed

# Data preparation

In [4]:
# Directory holding the competition files; taken from the `dataset_dir`
# environment variable (a missing variable raises KeyError here).
dataset_dir = Path(os.environ['dataset_dir'])

# Read the full training table into memory.
train_csv_path = dataset_dir.joinpath('train.csv')
data = pd.read_csv(train_csv_path)

In [5]:
# Use `row_id` as the row index.
# Guarded so the cell is idempotent: once `row_id` has been moved into the
# index it is no longer a column, and calling set_index again would raise
# KeyError when the cell is re-run.
if 'row_id' in data.columns:
    data = data.set_index('row_id')

In [6]:
# Raw investment ids start from 0, but id 0 is reserved for the "unseen"
# investment: it is substituted whenever the test set contains an investment
# that never appeared in training.
# Shift ids up by one only while id 0 is still occupied, so that re-running
# this cell does not shift the ids a second time.
if data["investment_id"].min() == 0:
    data["investment_id"] += 1

In [7]:
# Time-based hold-out: rows with time_id below 1000 are used for fitting,
# rows with time_id of 1000 or more are kept for evaluation.
in_train_period = data["time_id"] < 1000
in_test_period = data["time_id"] >= 1000
train = data.loc[in_train_period]
test = data.loc[in_test_period]

In [8]:
# Investment ids the model gets to see during training.
learned_investments = train.investment_id.unique()

# Rows of the test split whose investment never occurs in the training split.
unseen_mask = ~test.investment_id.isin(learned_investments)
new_investments_in_test = test.index[unseen_mask]

# Map those rows to the reserved "unseen" id 0.
# `test` is a slice of `data`, so take an explicit copy first, and assign with
# a single `.loc[rows, column]` call: the chained form
# `test.loc[rows].investment_id = 0` writes into a temporary object and leaves
# `test` unchanged.
test = test.copy()
test.loc[unseen_mask, "investment_id"] = 0

In [9]:
# `time_id` was only needed for the split; remove it before wrapping each
# split in the project's Dataset class.
train_frame = train.drop(columns="time_id")
test_frame = test.drop(columns="time_id")

train_dataset = Dataset(train_frame)
test_dataset = Dataset(test_frame)

# Configure Training

In [19]:
# Both loaders share the same settings; only the batch size is overridden.
# NOTE(review): `shuffle` is left at its default, so training batches are
# served in dataset order — confirm this is intended.
loader_kwargs = dict(batch_size=batch_size)
train_dataloader = DataLoader(train_dataset, **loader_kwargs)
test_dataloader = DataLoader(test_dataset, **loader_kwargs)

In [20]:
# Network: project-defined EmbedMLP, moved to the selected device.
# NOTE(review): input_dim=301 and num_embeddings=3775 are hard-coded —
# presumably the number of dataset columns and the number of distinct
# investment ids (including the reserved id 0); verify against the data.
model = EmbedMLP(input_dim=301, num_embeddings=3775)
model = model.to(device)

# Mean-squared-error objective, optimised with AdamW.
loss_function = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

In [21]:
# Start a Weights & Biases run; the training loop logs per-epoch losses to it.
wandb.init(
    entity="parmezano",
    project="market_prediction",
    name='mlp_emb',
)

0,1
test_loss,█▅▂▁▁▁
train_loss,█▂▂▁▁▁

0,1
test_loss,0.8212
train_loss,0.83358


# Training

In [None]:
def _epoch_mean(batch_losses, batch_sizes):
    """Return the per-sample mean of per-batch mean losses.

    Each batch loss is already a mean over its rows, so batches are weighted
    by their row count; otherwise a smaller final batch would count as much
    as a full one. Returns NaN when no batch was processed.
    """
    if not batch_losses:
        return float("nan")
    return float(np.average(batch_losses, weights=batch_sizes))


for i in range(epochs):
    # ---- training pass -------------------------------------------------
    model.train()
    train_losses = []
    train_batch_sizes = []
    for batch in tqdm(train_dataloader):
        optimizer.zero_grad()
        x, y_true = batch
        # Investment-id dropout: column 0 holds the investment id; with
        # probability `investment_id_dropout` it is multiplied by 0, i.e.
        # replaced by the reserved "unseen" id, so that id gets trained too.
        x[:, 0] *= (torch.rand(len(x)) > investment_id_dropout)

        x = x.to(device)
        y_true = y_true.to(device)
        y_pred = model(x)
        # nn.MSELoss expects (input, target): prediction first, label second.
        loss = loss_function(y_pred.view(-1), y_true)
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())
        train_batch_sizes.append(len(y_true))

    # ---- evaluation pass (no gradients, no id dropout) -----------------
    model.eval()
    test_losses = []
    test_batch_sizes = []
    with torch.no_grad():
        for batch in tqdm(test_dataloader):
            x, y_true = batch[0].to(device), batch[1].to(device)
            y_pred = model(x)
            loss = loss_function(y_pred.view(-1), y_true)
            test_losses.append(loss.item())
            test_batch_sizes.append(len(y_true))

    # One log entry per epoch with the sample-weighted mean losses.
    wandb.log({
        "train_loss": _epoch_mean(train_losses, train_batch_sizes),
        "test_loss": _epoch_mean(test_losses, test_batch_sizes),
    })

100%|██████████| 25/25 [00:19<00:00,  1.28it/s]
100%|██████████| 8/8 [00:04<00:00,  1.73it/s]
100%|██████████| 25/25 [00:19<00:00,  1.30it/s]
100%|██████████| 8/8 [00:04<00:00,  1.72it/s]
100%|██████████| 25/25 [00:19<00:00,  1.30it/s]
100%|██████████| 8/8 [00:04<00:00,  1.73it/s]
100%|██████████| 25/25 [00:19<00:00,  1.30it/s]
100%|██████████| 8/8 [00:04<00:00,  1.65it/s]
100%|██████████| 25/25 [00:19<00:00,  1.28it/s]
100%|██████████| 8/8 [00:04<00:00,  1.70it/s]
100%|██████████| 25/25 [00:19<00:00,  1.30it/s]
100%|██████████| 8/8 [00:04<00:00,  1.69it/s]
100%|██████████| 25/25 [00:19<00:00,  1.29it/s]
100%|██████████| 8/8 [00:04<00:00,  1.70it/s]
100%|██████████| 25/25 [00:19<00:00,  1.28it/s]
100%|██████████| 8/8 [00:04<00:00,  1.64it/s]
100%|██████████| 25/25 [00:19<00:00,  1.26it/s]
100%|██████████| 8/8 [00:04<00:00,  1.70it/s]
100%|██████████| 25/25 [00:19<00:00,  1.28it/s]
100%|██████████| 8/8 [00:05<00:00,  1.54it/s]
100%|██████████| 25/25 [00:21<00:00,  1.15it/s]
100%|███████