In [None]:
# Colab bootstrap: mount Google Drive, move into the source checkout that
# lives on the drive, and pull the latest changes of that repository.
from google.colab import drive
drive.mount('/content/drive')
# This is the same directory that is later appended to sys.path to import `mycv`.
%cd "/content/drive/MyDrive/Kaggle/UbiquantMarketPredictionDrive/zume/ubiquant_src"
!git pull

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Report the GPU attached to this runtime; on a runtime without the NVIDIA
# tools the shell prints "command not found" instead.
!nvidia-smi

/bin/bash: nvidia-smi: command not found


In [None]:
class Config:
    """Experiment settings shared by every cell of this notebook.

    The class is used as a plain namespace: it is never instantiated, and
    ``setup`` later attaches the resolved directory paths to it.
    """

    # ---- identity / upload -------------------------------------------------
    author = "zume"  # Your name
    competition = "ubiquant-market-prediction"
    name = "MLP"  # The name of the Dataset
    upload_from_colab = True  # If True, the model uploads to the Kaggle Dataset

    # ---- locations ---------------------------------------------------------
    colab_dir = "/content/drive/MyDrive/Kaggle/UbiquantMarketPredictionDrive"  # Your own directory
    drive_path = "/".join((colab_dir, author))  # per-author working directory
    api_path = "/content/drive/MyDrive/Kaggle/kaggle.json"  # Your own api-path

    dataset_path = ['robikscube/ubiquant-parquet']  # The dataset you want to download

    # ---- cross-validation (forwarded to mycv.get_CPGKfold) -----------------
    n_fold = 5
    n_test = 2
    purge = 10
    embargo = 0.01

    # Imported in the class body, so ``comb`` also ends up as a class attribute.
    from scipy.special import comb
    # One fold id per way of choosing ``n_test`` groups out of ``n_fold``.
    trn_fold = list(range(comb(n_fold, n_test, exact=True)))

    # ---- training ----------------------------------------------------------
    seed = 42
    max_epochs = 100

In [None]:
import os
import gc
import sys
import json
import pickle
import shutil
import random
import joblib
import requests
import itertools

import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
from tqdm.auto import tqdm

from logging import StreamHandler, FileHandler, Formatter, getLogger, DEBUG, INFO

from sklearn.preprocessing import (
    StandardScaler,
    MinMaxScaler,
    RobustScaler,
)

import torch
import torch.nn as nn
import torch.optim as optim
import torch.functional as F
from torch.utils.data import Dataset, DataLoader
import tensorflow as tf

sys.path.append("/content/drive/MyDrive/Kaggle/UbiquantMarketPredictionDrive/zume/ubiquant_src")
import mycv


def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy, TensorFlow and PyTorch (CPU generator and
    the CUDA generator), exports ``PYTHONHASHSEED`` and asks cuDNN for
    deterministic kernels.

    Args:
        seed: integer seed applied to all libraries.
    """
    # Exported for completeness; the running interpreter has already fixed its
    # hash seed, so this only matters for processes started afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        tf.random.set_seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True

def MAE(y_true, y_pred):
    """Return the mean absolute error between ``y_true`` and ``y_pred``.

    Computed with NumPy so the helper needs nothing beyond the notebook's
    existing imports.

    Args:
        y_true: array-like of ground-truth values.
        y_pred: array-like of predictions, same shape as ``y_true``.

    Returns:
        The mean of ``|y_true - y_pred|`` over all elements, as a float.

    Raises:
        ValueError: if the two inputs do not have the same shape.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    # Guard against silent broadcasting, e.g. (n,) against (n, 1).
    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {actual.shape} and {predicted.shape}"
        )
    return float(np.mean(np.abs(actual - predicted)))

def MSE(y_true, y_pred):
    """Return the mean squared error between ``y_true`` and ``y_pred``.

    Computed with NumPy so the helper needs nothing beyond the notebook's
    existing imports.

    Args:
        y_true: array-like of ground-truth values.
        y_pred: array-like of predictions, same shape as ``y_true``.

    Returns:
        The mean of ``(y_true - y_pred) ** 2`` over all elements, as a float.

    Raises:
        ValueError: if the two inputs do not have the same shape.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    # Guard against silent broadcasting, e.g. (n,) against (n, 1).
    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {actual.shape} and {predicted.shape}"
        )
    return float(np.mean(np.square(actual - predicted)))

def RMSE(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``.

    Computed with NumPy so the helper needs nothing beyond the notebook's
    existing imports.

    Args:
        y_true: array-like of ground-truth values.
        y_pred: array-like of predictions, same shape as ``y_true``.

    Returns:
        The square root of the mean of ``(y_true - y_pred) ** 2``, as a float.

    Raises:
        ValueError: if the two inputs do not have the same shape.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    # Guard against silent broadcasting, e.g. (n,) against (n, 1).
    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {actual.shape} and {predicted.shape}"
        )
    return float(np.sqrt(np.mean(np.square(actual - predicted))))

def PearsonR(y_true, y_pred):
    """Return the Pearson correlation coefficient of the two sequences.

    Only the coefficient is returned; the p-value that
    ``scipy.stats.pearsonr`` also computes is discarded.
    """
    result = scipy.stats.pearsonr(y_true, y_pred)
    coefficient = result[0]
    return coefficient

def PearsonR_metric(y_true, y_pred):
    """Return the Pearson correlation as a ``(name, value, flag)`` triple.

    The triple is ``("pearsonr", coefficient, True)``.
    NOTE(review): this looks like the custom-eval-metric convention of a
    gradient-boosting library where the flag means "higher is better" -
    confirm against the caller.
    """
    metric_name = "pearsonr"
    coefficient = scipy.stats.pearsonr(y_true, y_pred)[0]
    higher_is_better = True
    return metric_name, coefficient, higher_is_better

In [None]:
class UbiquantDataset(Dataset):
    """Map-style dataset over the feature columns ``f_0 .. f_{n_features-1}``.

    Args:
        df: DataFrame holding the feature columns and, unless
            ``mode == "test"``, a ``target`` column.
        mode: ``"test"`` yields features only; any other value yields
            ``(features, target)`` pairs.
        n_features: number of ``f_i`` columns to read. Defaults to 300, the
            value previously hard-coded here.
    """

    def __init__(self, df, mode="train", n_features=300):
        self.mode = mode
        feature_cols = [f"f_{i}" for i in range(n_features)]
        # Cast to float32 so batches match the default dtype of the model's
        # parameters regardless of how the frame was loaded (float16/float64
        # inputs would otherwise fail inside nn.Linear or the loss). No copy is
        # made when the data already is float32.
        self.features = df[feature_cols].to_numpy(dtype=np.float32, copy=False)
        if self.mode != "test":
            self.targets = df["target"].to_numpy(dtype=np.float32, copy=False)
        self.len = df.shape[0]

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return self.len

    def __getitem__(self, index):
        """Return ``features[index]``, paired with its target outside test mode."""
        if self.mode != "test":
            return self.features[index], self.targets[index]
        else:
            return self.features[index]

In [None]:
def swish(x):
    """Swish activation: the input scaled element-wise by its own sigmoid."""
    gate = torch.sigmoid(x)
    return x * gate

class MLP(nn.Module):
    """Perceptron with three hidden layers (64, 64, 32 units) and one output.

    Every linear layer is followed by ``swish``; the hidden layers additionally
    apply batch normalisation *after* the activation.

    Args:
        n_features: width of the input vectors.
    """

    def __init__(self, n_features):
        super().__init__()

        width_1, width_2, width_3 = 64, 64, 32

        # Attribute names and creation order are kept as-is: they define the
        # state_dict keys used by the saved checkpoints.
        self.l1 = nn.Linear(n_features, width_1)
        self.bn1 = nn.BatchNorm1d(width_1)
        self.l2 = nn.Linear(width_1, width_2)
        self.bn2 = nn.BatchNorm1d(width_2)
        self.l3 = nn.Linear(width_2, width_3)
        self.bn3 = nn.BatchNorm1d(width_3)
        self.l4 = nn.Linear(width_3, 1)

    def forward(self, x):
        """Map a ``(batch, n_features)`` input to a ``(batch, 1)`` output."""
        hidden_stages = (
            (self.l1, self.bn1),
            (self.l2, self.bn2),
            (self.l3, self.bn3),
        )
        for linear, norm in hidden_stages:
            x = norm(swish(linear(x)))
        # NOTE(review): swish is applied to the output as well. x * sigmoid(x)
        # is bounded below (roughly -0.28), so the network cannot emit strongly
        # negative predictions - confirm this is intended for the target.
        return swish(self.l4(x))
    
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

In [None]:
def train_fn(cfg, train, fold, folds):
    """Train one MLP on a single CV fold and checkpoint its best epoch.

    Args:
        cfg: config object; only ``cfg.EXP_MODEL`` (checkpoint directory) is
            read here.
        train: DataFrame with the ``f_0 .. f_299`` feature columns and a
            ``target`` column.
        fold: id of the fold to hold out for validation.
        folds: fold assignment produced by ``mycv``; rows are grouped by index
            level 0, and a sample is used for validation when any of its
            entries equals ``fold`` (presumably a sample can belong to several
            folds - confirm against ``mycv.get_CPGKfold``).

    Returns:
        ``(losses, oof)`` where ``losses`` is a list of
        ``(train_loss, valid_loss)`` per epoch and ``oof`` is a DataFrame with
        ``target`` and ``pred`` columns for the validation rows. ``oof`` holds
        the predictions of the epoch with the lowest validation loss, i.e. the
        same weights that are written to ``model_{fold}.pth``.
    """
    model = MLP(300).to(device)
    loss_fn = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=1, mode="min")
    # NOTE(review): the epoch count is fixed here; cfg.max_epochs is not consulted.
    epochs = 8

    # Count, per sample, how many of its fold entries match the held-out fold.
    fold_hits = (folds == fold).groupby(level=0).sum()
    idx_train = (fold_hits == 0)
    idx_valid = (fold_hits >= 1)

    train_set = UbiquantDataset(train[idx_train], mode="train")
    # Slice the validation rows once; only the targets are needed afterwards.
    valid_df = train[idx_valid]
    valid_set = UbiquantDataset(valid_df, mode="valid")
    valid_targets = valid_df[['target']].copy()
    del valid_df

    # NOTE(review): the training loader does not shuffle, so batches follow the
    # row order of `train` - confirm this is intended.
    dataloaders = {
        "train": DataLoader(train_set, batch_size=512, num_workers=4, pin_memory=True),
        "valid": DataLoader(valid_set, batch_size=512, num_workers=4, pin_memory=True)
        }

    num_train = len(dataloaders["train"])
    num_valid = len(dataloaders["valid"])

    losses = []
    best_loss = np.inf
    oof = None
    best_oof = None

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for features, targets in dataloaders["train"]:
            features = features.to(device)
            targets = targets.unsqueeze(1).to(device)

            y = model(features)
            loss = loss_fn(y, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        # Average of the per-batch mean losses.
        train_epoch_loss = train_loss / num_train

        model.eval()
        valid_preds = []
        valid_loss = 0
        with torch.no_grad():
            for features, targets in dataloaders["valid"]:
                features = features.to(device)
                targets = targets.unsqueeze(1).to(device)

                y = model(features)
                loss = loss_fn(y, targets)

                valid_loss += loss.item()
                valid_preds.extend(y.detach().cpu().numpy().flatten())
            valid_epoch_loss = valid_loss / num_valid

        scheduler.step(valid_epoch_loss)
        oof = valid_targets.copy()
        oof['pred'] = valid_preds
        score = oof['pred'].corr(oof['target'])

        losses.append((train_epoch_loss, valid_epoch_loss))
        logger.info(f"EPOCH:{epoch}, LR:{optimizer.param_groups[0]['lr']}")
        logger.info(f"Train loss: {train_epoch_loss:.6f}")
        logger.info(f"Valid loss: {valid_epoch_loss:.6f}")
        logger.info(f"PearsonR score: {score:.6f}")

        if best_loss > valid_epoch_loss:
            torch.save(model.state_dict(), os.path.join(cfg.EXP_MODEL, f"model_{fold}.pth"))
            best_loss = valid_epoch_loss
            # Keep the predictions that belong to the checkpoint just written,
            # so the returned OOF frame describes the saved model.
            best_oof = oof

    # Fall back to the last epoch if no epoch ever improved (e.g. NaN losses).
    return losses, (oof if best_oof is None else best_oof)

In [None]:
def setup(cfg):
    cfg.COLAB = 'google.colab' in sys.modules
    if cfg.COLAB:
        global logger
        print('This environment is Google Colab')
        
        # mount
        from google.colab import drive
        if not os.path.isdir('/content/drive'):
            drive.mount('/content/drive') 
        
        # import library
        ! pip install --quiet tensorflow-addons

        # use kaggle api (need kaggle token)
        f = open(cfg.api_path, 'r')
        json_data = json.load(f) 
        os.environ['KAGGLE_USERNAME'] = json_data['username']
        os.environ['KAGGLE_KEY'] = json_data['key']

        # set dirs
        cfg.DRIVE = cfg.drive_path
        cfg.EXP = (cfg.name if cfg.name is not None 
            else requests.get('http://172.28.0.2:9000/api/sessions').json()[0]['name'][:-6]
        )
        cfg.INPUT = os.path.join(cfg.DRIVE, 'Input')
        cfg.OUTPUT = os.path.join(cfg.DRIVE, 'Output')
        cfg.SUBMISSION = os.path.join(cfg.DRIVE, 'Submission')
        cfg.DATASET = os.path.join(cfg.DRIVE, 'Dataset')

        cfg.OUTPUT_EXP = os.path.join(cfg.OUTPUT, cfg.EXP) 
        cfg.EXP_MODEL = os.path.join(cfg.OUTPUT_EXP, 'model')
        cfg.EXP_FIG = os.path.join(cfg.OUTPUT_EXP, 'fig')
        cfg.EXP_PREDS = os.path.join(cfg.OUTPUT_EXP, 'preds')

        # make dirs
        for d in [cfg.INPUT, cfg.SUBMISSION, cfg.EXP_MODEL, cfg.EXP_FIG, cfg.EXP_PREDS]:
            os.makedirs(d, exist_ok=True)
        
        if not os.path.isfile(os.path.join(cfg.INPUT, 'train.csv')):
            # load dataset
            ! pip install --upgrade --force-reinstall --no-deps kaggle
            ! kaggle competitions download -c $cfg.competition -p $cfg.INPUT
            filepath = os.path.join(cfg.INPUT,cfg.competition+'.zip')
            ! unzip -d $cfg.INPUT $filepath
            
        
        for path in cfg.dataset_path:
            datasetpath = os.path.join(cfg.DATASET,  path.split('/')[1])
            if not os.path.exists(datasetpath):
                os.makedirs(datasetpath, exist_ok=True)
                ! kaggle datasets download $path -p $datasetpath
                filepath = os.path.join(datasetpath, path.split("/")[1]+'.zip')
                ! unzip -d $datasetpath $filepath

        logger = getLogger("main")
        logger.setLevel(DEBUG)
        formatter = Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        h = FileHandler(os.path.join(cfg.OUTPUT_EXP ,"train.log"))
        h.setLevel(INFO)
        h.setFormatter(formatter)
        sh = StreamHandler()
        sh.setLevel(DEBUG)
        sh.setFormatter(formatter)
        logger.addHandler(h)
        logger.addHandler(sh)
    
    else:
        print('This environment is Kaggle Kernel')

        # set dirs
        cfg.INPUT = f'../input/{cfg.competition}'
        cfg.EXP = cfg.name
        cfg.OUTPUT_EXP = cfg.name
        cfg.SUBMISSION = './'
        cfg.DATASET = '../input/'
        
        cfg.EXP_MODEL = os.path.join(cfg.EXP, 'model')
        cfg.EXP_FIG = os.path.join(cfg.EXP, 'fig')
        cfg.EXP_PREDS = os.path.join(cfg.EXP, 'preds')

        # make dirs
        for d in [cfg.EXP_MODEL, cfg.EXP_FIG, cfg.EXP_PREDS]:
            os.makedirs(d, exist_ok=True)

    seed_everything(cfg.seed)
    return cfg


def dataset_create_new(dataset_name, upload_dir):
    """Publish ``upload_dir`` as a new Kaggle dataset named ``dataset_name``.

    Writes ``dataset-metadata.json`` into ``upload_dir`` (id
    ``<KAGGLE_USERNAME>/<dataset_name>``, CC0-1.0 licence) and then creates the
    dataset through the Kaggle API.

    Args:
        dataset_name: used both as the dataset slug and as its title.
        upload_dir: directory whose contents are uploaded.

    Raises:
        KeyError: if the ``KAGGLE_USERNAME`` environment variable is not set.
    """
    # Imported here so the function does not depend on another cell having
    # imported KaggleApi into the notebook's globals first.
    from kaggle.api.kaggle_api_extended import KaggleApi

    dataset_metadata = {}
    dataset_metadata['id'] = f'{os.environ["KAGGLE_USERNAME"]}/{dataset_name}'
    dataset_metadata['licenses'] = [{'name': 'CC0-1.0'}]
    dataset_metadata['title'] = dataset_name
    with open(os.path.join(upload_dir, 'dataset-metadata.json'), 'w') as f:
        json.dump(dataset_metadata, f, indent=4)
    api = KaggleApi()
    api.authenticate()
    api.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode='tar')

In [None]:
def fit_mlp(cfg, train, folds):
    """Train one MLP per fold in ``cfg.trn_fold`` and log the overall OOF score.

    Args:
        cfg: config object; ``cfg.trn_fold`` lists the fold ids to train, and
            the object is forwarded to ``train_fn``.
        train: training DataFrame, forwarded to ``train_fn``.
        folds: fold assignment, forwarded to ``train_fn``.

    Returns:
        The out-of-fold frames of all folds concatenated (``target`` and
        ``pred`` columns). Returned so callers can inspect or persist the
        predictions; callers that ignore the result are unaffected.
    """
    oof_list = []

    for fold in cfg.trn_fold:
        # Per-epoch losses are already logged inside train_fn; only the OOF
        # frame is needed here.
        _, fold_oof = train_fn(cfg, train, fold, folds)
        oof_list.append(fold_oof)
    oof = pd.concat(oof_list)
    logger.info(f"OOF PearsonR score: {oof['pred'].corr(oof['target'])}")
    return oof

In [None]:
# =========================
# SetUp
# =========================
# Resolve paths / logging for this environment, then record the CV settings.
Config = setup(Config)
logger.info("Parameters")
logger.info(f"seed: {Config.seed}")
logger.info(f"n_fold: {Config.n_fold}")
logger.info(f"n_test: {Config.n_test}")
logger.info(f"purge: {Config.purge}")
logger.info(f"embargo: {Config.embargo}")

# 2nd import
import tensorflow_addons as tfa

This environment is Google Colab


In [None]:
# =========================
# Pre-Processing
# =========================
# Load the training table from the downloaded parquet dataset.
train = pd.read_parquet(os.path.join(Config.DATASET, 'ubiquant-parquet/train_low_mem.parquet'))
# Build the fold assignment with the project's own CV helper, using the split
# settings from Config. NOTE(review): the exact structure of `folds` is defined
# in `mycv`, which is not part of this notebook - check there before relying on it.
folds = mycv.get_CPGKfold(train, 'target', 'time_id', Config.n_fold, Config.n_test, Config.purge, Config.embargo)

# Release temporaries left over from loading / splitting.
gc.collect()

0

In [None]:
# =========================
# Training & Upload
# =========================

# Train one model per fold listed in Config.trn_fold; checkpoints are written
# under Config.EXP_MODEL.
fit_mlp(cfg=Config, train=train, folds=folds)

# upload output folder to kaggle dataset
if Config.upload_from_colab:
    # This import also puts KaggleApi into the notebook globals, which is where
    # dataset_create_new looks it up.
    from kaggle.api.kaggle_api_extended import KaggleApi
    dataset_create_new(dataset_name=Config.EXP, upload_dir=Config.OUTPUT_EXP)