In [1]:
import gc
import os
import random
import warnings

import joblib
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

warnings.simplefilter('ignore')


In [2]:
def seed_everything(seed: int):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's legacy global RNG and PyTorch (CPU and
    all CUDA devices), exports ``PYTHONHASHSEED`` and puts cuDNN into
    deterministic mode.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker processes).
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run, which
    # defeats the deterministic flag above, so it has to stay off.
    torch.backends.cudnn.benchmark = False


seed_everything(42)

### Open questions: split by investment ID? (New investment IDs appear in the test set.) Use time ID as a feature?

## Settings

In [3]:
# Directory that holds the competition files (train.pkl / train.csv).
PATH = "./ubiquant-market-prediction"
# "half" selects the pickled training frame; any other value reads the raw CSV.
PRECISION = "half"
# Number of anonymised feature columns, named f_0 ... f_{n_features - 1}.
n_features = 300
FEATURES = list(map('f_{}'.format, range(n_features)))

## Read Data

In [4]:
# Load the training frame from the source selected by PRECISION.
if PRECISION != "half":
    # Raw CSV path: the row_id column is removed from the frame after loading.
    train = pd.read_csv(f'{PATH}/train.csv')
    _ = train.pop("row_id")
else:
    # NOTE: unpickling can execute arbitrary code - only load a train.pkl
    # that you produced yourself or otherwise trust.
    train = pd.read_pickle(f'{PATH}/train.pkl')

train.head()

Unnamed: 0,investment_id,time_id,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,...,f_291,f_292,f_293,f_294,f_295,f_296,f_297,f_298,f_299,target
0,1,0,0.932617,0.113708,-0.4021,0.378418,-0.203979,-0.413574,0.96582,1.230469,...,-1.095703,0.200073,0.819336,0.941406,-0.086792,-1.086914,-1.044922,-0.287598,0.321533,-0.300781
1,2,0,0.811035,-0.51416,0.742188,-0.616699,-0.194214,1.771484,1.427734,1.133789,...,0.912598,-0.734375,0.819336,0.941406,-0.387695,-1.086914,-0.929688,-0.974121,-0.343506,-0.231079
2,6,0,0.394043,0.615723,0.567871,-0.60791,0.068909,-1.083008,0.979492,-1.125977,...,0.912598,-0.551758,-1.220703,-1.060547,-0.219116,-1.086914,-0.612305,-0.113953,0.243652,0.568848
3,7,0,-2.34375,-0.011871,1.875,-0.606445,-0.586914,-0.815918,0.77832,0.299072,...,0.912598,-0.266357,-1.220703,0.941406,-0.608887,0.104919,-0.783203,1.151367,-0.773438,-1.064453
4,8,0,0.842285,-0.262939,2.330078,-0.583496,-0.618164,-0.742676,-0.946777,1.230469,...,0.912598,-0.741211,-1.220703,0.941406,-0.588379,0.104919,0.753418,1.345703,-0.737793,-0.531738


In [5]:
# scaler = MinMaxScaler(feature_range=(-1, 1))
# train[[f'f_{i}' for i in range(300)]] = scaler.fit_transform(train[[f'f_{i}' for i in range(300)]])

In [6]:
class UMPDataset(Dataset):
    """Map-style dataset that serves one DataFrame row per item.

    Args:
        df_data: Frame with ``time_id`` and ``investment_id`` columns, the
            feature columns, and a ``target`` column unless ``mode == 'test'``.
        mode: Any value other than ``'test'`` makes each item include the
            target as its last element.
        features: Optional sequence of feature column names. When omitted the
            notebook-level ``FEATURES`` list is used, as before.

    Items are ``(time_id, investment_id, values, target)``, or
    ``(time_id, investment_id, values)`` in test mode.
    """

    def __init__(self, df_data, mode='train', features=None):
        self.mode = mode
        # Resolve the columns once; the global is only consulted when the
        # caller did not pass an explicit list.
        feature_cols = FEATURES if features is None else list(features)

        self.time_id = df_data['time_id'].values.astype(np.int32)
        self.investment_id = df_data['investment_id'].values.astype(np.int32)
        # Feature matrix keeps whatever dtype the frame stores; callers cast later.
        self.values = df_data[feature_cols].values
        if self.mode != 'test':
            self.targets = df_data['target'].values

        self.len = df_data.shape[0]

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return self.len

    def __getitem__(self, idx):
        """Return the ids, feature vector and (outside test mode) target of row ``idx``."""
        time_id = self.time_id[idx]
        investment_id = self.investment_id[idx]
        values = self.values[idx]
        if self.mode != 'test':
            targets = self.targets[idx]
            return time_id, investment_id, values, targets
        else:
            return time_id, investment_id, values

In [7]:
def swish(x):
    """Swish activation: the input scaled element-wise by its own sigmoid."""
    gate = torch.sigmoid(x)
    return x * gate

# Residual block
class Residual1D(nn.Module):
    """Two-layer fully connected residual block with batch normalisation.

    Computes ``activate(bn2(fc2(activate(bn1(fc1(x))))) + shortcut(x))``.

    Args:
        in_dim: Width of the input features.
        out_dim: Width of the output features.
        activation: ``'relu'`` or ``'swish'``.

    Raises:
        NotImplementedError: If ``activation`` is not one of the supported names.

    When ``in_dim == out_dim`` the shortcut is a parameter-free identity, so
    the block's parameters and ``state_dict`` keys are the same as without a
    shortcut module. When the widths differ, a linear projection maps the
    input to ``out_dim`` so the residual sum is well defined instead of
    failing with a shape error.
    """

    def __init__(self, in_dim, out_dim, activation='relu'):
        super().__init__()
        self.fc1 = nn.Linear(in_dim, out_dim)
        self.bn1 = nn.BatchNorm1d(out_dim)

        if activation == 'relu':
            self.activate = nn.ReLU(inplace=True)
        elif activation == 'swish':
            self.activate = swish
        else:
            raise NotImplementedError(f"unsupported activation: {activation!r}")

        self.fc2 = nn.Linear(out_dim, out_dim)
        self.bn2 = nn.BatchNorm1d(out_dim)

        # Created last so the initialisation order of the layers above is untouched.
        if in_dim == out_dim:
            self.shortcut = nn.Identity()
        else:
            self.shortcut = nn.Linear(in_dim, out_dim)

    def forward(self, x):
        """Apply the block to ``x`` and return the activated residual sum."""
        residual = self.shortcut(x)
        x = self.fc1(x)
        x = self.bn1(x)
        x = self.activate(x)
        x = self.fc2(x)
        x = self.bn2(x)

        # Out-of-place add: never overwrites a tensor autograd may still need.
        x = x + residual
        x = self.activate(x)
        return x


class SimpleMLP(nn.Module):
    """Residual MLP that regresses one value from the numeric feature vector.

    Args:
        value_dim: Number of numeric input features.
        emb_size: Width of the two embedding tables.
        time_emb: Number of rows in the time-id embedding table.
        investment_emb: Number of rows in the investment-id embedding table.

    The embedding tables are created so that existing checkpoints keep the
    same ``state_dict`` keys, but they do not contribute to the output.
    """

    def __init__(self, value_dim=300, emb_size=16, time_emb=1220, investment_emb=3774):
        super().__init__()
        # Currently unused by forward(); kept for checkpoint compatibility.
        self.time_emb = nn.Embedding(time_emb, emb_size)
        self.investment_emb = nn.Embedding(investment_emb, emb_size)

        self.bn0 = nn.BatchNorm1d(value_dim)
        self.head = nn.Linear(value_dim, 256)

        self.res1 = Residual1D(256, 256, 'relu')
        self.drop1 = nn.Dropout(0.5)
        self.trans1 = nn.Linear(256, 128)
        self.res2 = Residual1D(128, 128, 'relu')
        self.drop2 = nn.Dropout(0.5)
        self.trans2 = nn.Linear(128, 32)
        self.res3 = Residual1D(32, 32, 'relu')
        self.drop3 = nn.Dropout(0.5)

        self.tail = nn.Linear(32, 1)

    def forward(self, x_time, x_investment, x_value):
        """Return predictions of shape ``(batch, 1)`` computed from ``x_value``.

        ``x_time`` and ``x_investment`` are accepted for interface
        compatibility and ignored. The embedding lookups were dead code whose
        results were discarded, yet they raised ``IndexError`` for any id
        outside the table (e.g. an id not covered by the table size), so they
        are no longer executed.
        """
        # To bring the ids back as features, look the embeddings up here and
        # concatenate them with x_value, e.g.
        #   torch.cat([x_value, self.investment_emb(x_investment)], 1)
        # and widen bn0/head to match.
        x = self.bn0(x_value)

        x = swish(self.head(x))
        x = self.drop1(self.res1(x))
        x = swish(self.trans1(x))
        x = self.drop2(self.res2(x))
        x = swish(self.trans2(x))
        x = self.drop3(self.res3(x))

        x = self.tail(x)

        return x

In [8]:
# Show the first rows of the training frame again (same call as in the data-loading cell).
train.head()

Unnamed: 0,investment_id,time_id,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,...,f_291,f_292,f_293,f_294,f_295,f_296,f_297,f_298,f_299,target
0,1,0,0.932617,0.113708,-0.4021,0.378418,-0.203979,-0.413574,0.96582,1.230469,...,-1.095703,0.200073,0.819336,0.941406,-0.086792,-1.086914,-1.044922,-0.287598,0.321533,-0.300781
1,2,0,0.811035,-0.51416,0.742188,-0.616699,-0.194214,1.771484,1.427734,1.133789,...,0.912598,-0.734375,0.819336,0.941406,-0.387695,-1.086914,-0.929688,-0.974121,-0.343506,-0.231079
2,6,0,0.394043,0.615723,0.567871,-0.60791,0.068909,-1.083008,0.979492,-1.125977,...,0.912598,-0.551758,-1.220703,-1.060547,-0.219116,-1.086914,-0.612305,-0.113953,0.243652,0.568848
3,7,0,-2.34375,-0.011871,1.875,-0.606445,-0.586914,-0.815918,0.77832,0.299072,...,0.912598,-0.266357,-1.220703,0.941406,-0.608887,0.104919,-0.783203,1.151367,-0.773438,-1.064453
4,8,0,0.842285,-0.262939,2.330078,-0.583496,-0.618164,-0.742676,-0.946777,1.230469,...,0.912598,-0.741211,-1.220703,0.941406,-0.588379,0.104919,0.753418,1.345703,-0.737793,-0.531738


In [9]:
def SetL2Regularization(model, weight=0.01):
    """Build per-parameter optimizer groups with selective weight decay.

    Every parameter yielded by ``model.named_parameters()`` gets its own
    group. Parameters whose name contains ``"bias"`` get a weight decay of
    0.0; all others get ``weight``.

    Args:
        model: Module whose named parameters are grouped.
        weight: Weight decay applied to the non-bias parameters.

    Returns:
        A list of ``{'params': ..., 'weight_decay': ...}`` dicts, in the order
        the parameters are yielded.
    """
    return [
        {'params': tensor, 'weight_decay': 0.0 if "bias" in name else weight}
        for name, tensor in model.named_parameters()
    ]

In [10]:
def train_one_fold(dataloaders, fold_id, split_m='time_id', epochs=15, valid_index=None):
    """Train a fresh ``SimpleMLP`` on one fold and return its loss history and OOF frame.

    Args:
        dataloaders: Dict with ``'train'`` and ``'valid'`` loaders, each
            yielding ``(time_id, investment_id, values, target)`` batches.
        fold_id: Fold number; only used in the checkpoint file name.
        split_m: Tag for the split strategy; only used in the checkpoint file name.
        epochs: Number of training epochs.
        valid_index: Optional index for the returned OOF frame (for example
            the index of the validation DataFrame). Must have one entry per
            validation row. Defaults to a fresh ``RangeIndex``.

    Returns:
        ``(losses, oof)``. ``losses`` is a list of
        ``(train_loss, valid_loss)`` per epoch, each the mean of the per-batch
        MSE values. ``oof`` has columns ``target``, ``pred`` (predictions of
        the last epoch) and ``best_pred`` (predictions of the epoch with the
        highest validation Pearson correlation); it is ``None`` when
        ``epochs`` is 0.

    Side effects:
        Writes ``mlp_{split_m}_{fold_id}.pth`` each time the validation loss
        improves. The checkpoint is chosen by loss while ``best_pred`` is
        chosen by Pearson score, so they may come from different epochs.

    The validation targets are collected from the validation loader itself,
    so the function does not depend on any notebook-level variable.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = SimpleMLP().to(device)
    loss_fn = nn.MSELoss()
    params = SetL2Regularization(model, 0.001)
    optimizer = optim.Adam(params, lr=5e-4)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     factor=0.25,
                                                     patience=3,
                                                     mode='min')

    # len() of a DataLoader is the number of batches, not of examples.
    num_train_batches = len(dataloaders['train'])
    num_valid_batches = len(dataloaders['valid'])

    losses = []
    oof = None
    best_loss = np.inf
    best_score = -np.inf
    best_preds = None
    print("... Start Training ...")
    for e in range(epochs):
        # train
        model.train()
        train_loss = 0.0
        for time_, investment_, value_, target_ in tqdm(dataloaders['train']):
            time_ = time_.to(device)
            investment_ = investment_.to(device)
            value_ = value_.to(device=device, dtype=torch.float)
            # (batch,) -> (batch, 1) so the target matches the model output.
            target_ = target_.unsqueeze(1).to(device, dtype=torch.float)

            y_pred = model(time_, investment_, value_)
            loss = loss_fn(y_pred, target_)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_epoch_loss = train_loss / num_train_batches

        # valid
        model.eval()
        valid_preds = []
        valid_targets = []
        valid_loss = 0.0
        with torch.no_grad():
            for time_, investment_, value_, target_ in tqdm(dataloaders['valid']):
                time_ = time_.to(device)
                investment_ = investment_.to(device)
                value_ = value_.to(device=device, dtype=torch.float)
                target_ = target_.unsqueeze(1).to(device, dtype=torch.float)

                y_pred = model(time_, investment_, value_)

                valid_loss += loss_fn(y_pred, target_).item()
                valid_preds.extend(y_pred.cpu().numpy().ravel())
                # Collected alongside the predictions so both stay row-aligned.
                valid_targets.extend(target_.cpu().numpy().ravel())

        valid_epoch_loss = valid_loss / num_valid_batches

        # change lr
        scheduler.step(valid_epoch_loss)

        # oof
        oof = pd.DataFrame({'target': valid_targets, 'pred': valid_preds},
                           index=valid_index)

        score = oof['pred'].corr(oof['target'])
        # The correlation is NaN when the predictions are constant; a NaN
        # must never count as an improvement.
        if not np.isnan(score) and score > best_score:
            print("... score ...")
            best_score = score
            best_preds = list(valid_preds)
        elif best_preds is None:
            # No valid score yet: fall back to the current predictions so
            # 'best_pred' is always defined.
            best_preds = list(valid_preds)
        oof['best_pred'] = best_preds

        # print score
        print(f"Epoch {e}, LR: {optimizer.param_groups[0]['lr']}")
        print(f"train loss: {train_epoch_loss:.8f}, valid loss {valid_epoch_loss:.8f}, pearson score: {score:.6f}")
        losses.append((train_epoch_loss, valid_epoch_loss))

        # save model
        if best_loss > valid_epoch_loss:
            torch.save(model.state_dict(), f'mlp_{split_m}_{fold_id}.pth')
            print(f'-- loss from {best_loss:.8f} to {valid_epoch_loss:.8f}, model saved')
            best_loss = valid_epoch_loss

    return losses, oof
    

In [None]:
# 4-fold cross-validation, stratified on time_id with shuffling.
# StratifiedKFold is imported in the notebook's import cell.
kfold = StratifiedKFold(4, shuffle=True, random_state=42)

# One out-of-fold frame per fold; pooled by a later cell.
oof_list = list()

for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(train, train['time_id'])):
    print(f'Training Fold: {fold_id}\n')

    # NOTE: keep the names df_train / df_valid - they are notebook-level
    # globals that other cells may read.
    df_train = train.iloc[trn_idx]
    df_valid = train.iloc[val_idx]

    train_set = UMPDataset(df_train, mode='train')
    valid_set = UMPDataset(df_valid, mode='valid')
    dataloaders = {
        'train': DataLoader(train_set, batch_size=1024, num_workers=4, pin_memory=True, shuffle=True),
        # Validation order must stay fixed so predictions line up with df_valid rows.
        'valid': DataLoader(valid_set, batch_size=1024, num_workers=4, pin_memory=True, shuffle=False)
    }

    # The per-epoch loss history is discarded; only the OOF frame is kept.
    _, oof = train_one_fold(dataloaders, fold_id, split_m="time")
    print(oof.head())
    oof_list.append(oof)

Training Fold: 0

... Start Training ...


  0%|          | 0/2301 [00:00<?, ?it/s]

  0%|          | 0/767 [00:00<?, ?it/s]

... score ...
Epoch 0, LR: 0.0005
train loss: 0.83940139, valid loss 0.83268581, pearson score: 0.137968
-- loss from inf to 0.83268581, model saved


  0%|          | 0/2301 [00:00<?, ?it/s]

  0%|          | 0/767 [00:00<?, ?it/s]

Epoch 1, LR: 0.0005
train loss: 0.83019450, valid loss 0.83300638, pearson score: 0.137417


  0%|          | 0/2301 [00:00<?, ?it/s]

  0%|          | 0/767 [00:00<?, ?it/s]

... score ...
Epoch 2, LR: 0.0005
train loss: 0.82963771, valid loss 0.83209256, pearson score: 0.140429
-- loss from 0.83268581 to 0.83209256, model saved


  0%|          | 0/2301 [00:00<?, ?it/s]

In [None]:
# 4-fold cross-validation grouped by time_id.
# Resets oof_list, discarding any frames collected by an earlier CV cell.
oof_list = list()

kfold = GroupKFold(n_splits=4)
for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(train, train['target'], train['time_id'])):
    
    print(f'Training Fold: {fold_id}\n')
    
    # Notebook-level globals; other cells may read df_valid, so keep the names.
    df_train = train.iloc[trn_idx]
    df_valid = train.iloc[val_idx]
    
    train_set = UMPDataset(df_train, mode='train')
    valid_set = UMPDataset(df_valid, mode='valid')
    dataloaders = {
        'train': DataLoader(train_set, batch_size=2048, num_workers=4, pin_memory=True, shuffle=True),
        'valid': DataLoader(valid_set, batch_size=2048, num_workers=4, pin_memory=True, shuffle=False)
    }
    
    # NOTE(review): split_m="time" is the same tag the StratifiedKFold cell uses,
    # so both cells write mlp_time_{fold_id}.pth and the later run overwrites
    # the earlier checkpoints - confirm this is intended.
    _, oof = train_one_fold(dataloaders, fold_id, split_m="time")
    print(oof.head())
    oof_list.append(oof)

Training Fold: 0

... Start Training ...


  0%|          | 0/1151 [00:00<?, ?it/s]

In [None]:
# Pool the out-of-fold frames of all folds and score them jointly.
oof = pd.concat(oof_list)
# 'pred' holds each fold's last-epoch predictions; 'best_pred' holds the
# predictions of the epoch with the highest validation Pearson score. The two
# lines used to print under the same label and could not be told apart.
print('oof pearson score (last epoch):', oof['pred'].corr(oof['target']))
print('oof pearson score (best epoch):', oof['best_pred'].corr(oof['target']))

In [None]:
# 4-fold cross-validation grouped by investment_id.
# Resets oof_list, discarding any frames collected by an earlier CV cell.
oof_list = list()

kfold = GroupKFold(n_splits=4)
for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(train, train['target'], train['investment_id'])):
    
    print(f'Training Fold: {fold_id}\n')
    
    # Notebook-level globals; other cells may read df_valid, so keep the names.
    df_train = train.iloc[trn_idx]
    df_valid = train.iloc[val_idx]
    
    train_set = UMPDataset(df_train, mode='train')
    valid_set = UMPDataset(df_valid, mode='valid')
    dataloaders = {
        'train': DataLoader(train_set, batch_size=1024, num_workers=4, pin_memory=True, shuffle=True),
        'valid': DataLoader(valid_set, batch_size=1024, num_workers=4, pin_memory=True, shuffle=False)
    }
    
    # Checkpoints of this run are written as mlp_inves_{fold_id}.pth.
    _, oof = train_one_fold(dataloaders, fold_id, split_m="inves")
    oof_list.append(oof)

In [None]:
# Pool the out-of-fold frames of all folds and score them jointly.
oof = pd.concat(oof_list)
# 'pred' holds each fold's last-epoch predictions; 'best_pred' holds the
# predictions of the epoch with the highest validation Pearson score. The two
# lines used to print under the same label and could not be told apart.
print('oof pearson score (last epoch):', oof['pred'].corr(oof['target']))
print('oof pearson score (best epoch):', oof['best_pred'].corr(oof['target']))

In [None]:
# Display the pooled out-of-fold frame (whatever `oof` was last assigned to).
oof

In [None]:
# joblib.dump(scaler, 'minmaxscaler.pkl')