# Jane Street 2020: Multi-Layer Perceptron V

Using MLP to classify
 - Using EMA on features


## 0. Summary, and initial setup

In [1]:
# Imports, environment, and paths
import os, sys, gc, random
import numpy as np
import pandas as pd
import seaborn as sns
import datatable as dtable
from sklearn.metrics import roc_auc_score, roc_curve, log_loss
from sklearn.model_selection import GroupShuffleSplit
import matplotlib.pyplot as plt
from matplotlib import pyplot as plt
plt.style.use('dark_background') #plt.style.use('default')

from tqdm.notebook import tqdm
import janestreet

# PyTorch
import torch
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss, MSELoss
from torch.nn.modules.loss import _WeightedLoss
import torch.nn.functional as F
print(f'PyTorch version: {torch.__version__}; cuda available: {torch.cuda.is_available()}')

# auxiliary --------------
from importlib import reload
import time

# Print environment ------
pd.set_option('display.max_columns', 200) 
!conda info | grep 'active environment' # or use: !conda info --envs | grep '*'
print(f'working directory: {os.getcwd()}') 


PyTorch version: 1.7.1; cuda available: True
     active environment : base
working directory: /home/AWC/wang/learn/kaggle/k_JaneStreet20


In [2]:
# Reproducability
# globalSeed=67
# np.random.seed(globalSeed) # for reproducibility, does this work?

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything(seed=42)

In [3]:
# Suppressing warning before saving the presentation, only
import warnings
warnings.filterwarnings('ignore')


## 1. Load data

Variables for later sections:
 - data, features: 
 - nFeat, daySet

In [4]:
%%time
ddir='~/learn/kaggle/Data/JaneStreet20' # local
# ddir='../input/jane-street-market-prediction' # kaggle

# data = pd.read_csv(os.path.join(ddir,"train.csv"))
data = dtable.fread(os.path.join(ddir,"train.csv")).to_pandas() # using datatable for faster loading
features = pd.read_csv(os.path.join(ddir,"features.csv"))

nFeat=features.shape[0]
featName=[f'feature_{n}' for n in range(nFeat)]
xywCol=featName+['resp','weight']
daySet=data['date'].unique()

gc.collect();

CPU times: user 55.8 s, sys: 4.18 s, total: 59.9 s
Wall time: 5.16 s


0

## 2.  Preprecessing

The data may be used in later sections are
 - dataBlock
 - data_t, data_v, data_c for TVT data
 - nFeat, featName
 - norm, for normalization


### 2.1 Deal with nan

In [5]:
# Helper function
def fillNanWithinDay(df,dayCol,fillCol,spanFillNa=1):
    """fill NaN within date
    
    This function does (forward) fill without crossing dates, using EMA of a trailing
    window. Equal value in dayCol column indicates same date. "Date" here can
    be generalized to block with equal dayCol value
    Parameter:
      df: dataframe, original data
      dayCol: string, column name. Equal value indicates same date (block)
      fillCol: list of straings, names of columns to fill NaN
      spanFillNa: integer. Using a trailing ema of given span to fill NaN.
          spanFillNa=1 is equivalent to 'ffill' of df.fillna()
    return:
      list of pd.DataFrame of day, NaN replaced
    """
    dfList=[]
    dayList=df[dayCol].unique()
    for day in dayList:
        data_1=df.loc[df['date']==day]
        data_1_=data_1[fillCol].ewm(span=spanFillNa,adjust=False,ignore_na=True).mean()
        data_1_fill=data_1.copy()
        for cname in fillCol:
            toFill=data_1[cname].isna()
            data_1_fill.loc[toFill,cname]=data_1_.loc[toFill,cname]
        dfList.append(data_1_fill)
    
    return pd.concat(dfList)

In [6]:
%%time
# Fill NaN after first data points of day
spanFillNa=3
fillCol=[f'feature_{ii}' for ii in range(nFeat) if ii not in [0,64]] # features to fill nan
data=fillNanWithinDay(data,'date',fillCol,spanFillNa=spanFillNa)
print('#original: ',data.shape[0])

# Fill NaN on beginning of day
nPointStart=300 # num of samples to estimate day start
dayStart=[data.loc[data['date']==day,fillCol].iloc[:nPointStart] for day in daySet]
f_mean=pd.concat(dayStart).mean()
data[fillCol] = data[fillCol].fillna(f_mean)
f_mean.to_csv(os.path.join('fmean_JS20_MLP_01.csv')) # './model_sv'

data=data.loc[~data[xywCol].isna().any(axis=1)]
gc.collect()

#original:  2390491
CPU times: user 1min 15s, sys: 4.42 s, total: 1min 19s
Wall time: 1min 15s


0

In [7]:
# EMA of features
spanFeat=8
emaFeatName=[f'feature_{kk}' for kk in [54,50,61,62,63,66,67,68,122,123,125,125,126,127,128]]
emaCol={cn:cn+'_ema' for cn in emaFeatName}
# emaFeatName=[val for _,val in emaCol.items()]

dayList=data['date'].unique()
data_ema=[data.loc[data['date']==day,emaFeatName].ewm(span=spanFeat,adjust=False,ignore_na=True).mean() for day in
         dayList]
data_ema=pd.concat(data_ema).rename(columns=emaCol)
data=pd.concat([data,data_ema],axis=1)

# Drop 0 weight
data=data.loc[data['weight']>0].reset_index(drop=True)  # Dropping 0 weight
print('#After dropping NaN and 0 wieghts: ',data.shape[0])

print('#Nan in train: {:d}, {:d}, {:d}'.format(data.loc[:,featName].isna().to_numpy().sum(),
                                               data.loc[:,'resp'].isna().to_numpy().sum(),
                                               data.loc[:,'weight'].isna().to_numpy().sum()))

#After dropping NaN and 0 wieghts:  1981287
#Nan in train: 0, 0, 0


In [8]:
data_ema.shape

(2390491, 15)

## 3. Traing MLP

### 3.1 Model

In [9]:
class Model(torch.nn.Module):
    def __init__(self,nFeat):
        super(Model, self).__init__()
        self.batch_norm0 = torch.nn.BatchNorm1d(nFeat)
        self.dropout0 = torch.nn.Dropout(0.10143786981358652)

        self.dense1 = torch.nn.Linear(nFeat, 3001)
        self.batch_norm1 = torch.nn.BatchNorm1d(3001)
        self.dropout1 = torch.nn.Dropout(0.19720339053599725)

        self.dense2 = torch.nn.Linear(3001, 1324)
        self.batch_norm2 = torch.nn.BatchNorm1d(1324)
        self.dropout2 = torch.nn.Dropout(0.2703017847244654)

        self.dense3 = torch.nn.Linear(1324, 718)
        self.batch_norm3 = torch.nn.BatchNorm1d(718)
        self.dropout3 = torch.nn.Dropout(0.23148340929571917)
        
        self.dense4 = torch.nn.Linear(718, 110)
        self.batch_norm4 = torch.nn.BatchNorm1d(110)
        self.dropout4 = torch.nn.Dropout(0.2357768967777311)
        
        self.dense5 = torch.nn.Linear(110, 512)
        self.batch_norm5 = torch.nn.BatchNorm1d(512)
        self.dropout5 = torch.nn.Dropout(0.2357768967777311)

        self.dense6 = torch.nn.Linear(512, 394)
        self.batch_norm6 = torch.nn.BatchNorm1d(394)
        self.dropout6 = torch.nn.Dropout(0.2357768967777311)
        
        self.dense7 = torch.nn.Linear(394, 128)
        self.batch_norm7 = torch.nn.BatchNorm1d(128)
        self.dropout7 = torch.nn.Dropout(0.2357768967777311)

        self.dense_out = torch.nn.Linear(64, 1)

        self.Relu = torch.nn.ReLU(inplace=True)
        self.PReLU = torch.nn.PReLU()
        self.LeakyReLU = torch.nn.LeakyReLU(negative_slope=0.01, inplace=True)
        # self.GeLU = torch.nn.GELU()
        self.RReLU = torch.nn.RReLU()

    def forward(self, x):
        x = self.batch_norm0(x)
        x = self.dropout0(x)

        x = self.dense1(x)
        x = self.batch_norm1(x)
        #x = x * torch.sigmoid(x)
        x=self.Relu(x)
        x = self.dropout1(x)

        x = self.dense2(x)
        x = self.batch_norm2(x)
        #x = x * torch.sigmoid(x)
        x=self.Relu(x)
        x = self.dropout2(x)
        
        x = self.dense3(x)
        x = self.batch_norm3(x)
        x=self.Relu(x)
        x = self.dropout3(x)
        
        x = self.dense4(x)
        x = self.batch_norm4(x)
        #x = x * torch.sigmoid(x)
        x=self.Relu(x)
        x = self.dropout4(x)
        
        x = self.dense5(x)
        x = self.batch_norm5(x)
        x=self.Relu(x)
        x = self.dropout5(x)
        
        x = self.dense6(x)
        x = self.batch_norm6(x)
        x=self.Relu(x)
        x = self.dropout6(x)
        
        x = self.dense7(x)
        x = self.batch_norm7(x)
        x=self.Relu(x)
        x = self.dropout7(x)

        x = self.dense_out(x)

        return x

### 3.2 Trainging: helper functions

In [10]:
class MarketDataset:
    def __init__(self, data, featName):
        self.features = data[featName].values
        self.label = (data['resp']>0).astype('int').values.reshape(-1, 1)
    def __len__(self):
        return len(self.label)

    def __getitem__(self, idx):
        return {
            'features': torch.tensor(self.features[idx], dtype=torch.float),
            'label': torch.tensor(self.label[idx], dtype=torch.float)
        }

In [11]:
def train_fn(model, optimizer, scheduler, loss_fn, dataloader, device):
    model.train()
    final_loss = 0

    for data in dataloader:
        optimizer.zero_grad()
        features = data['features'].to(device)
        label = data['label'].to(device)
        outputs = model(features)
        loss = loss_fn(outputs, label)
        loss.backward()
        optimizer.step()
        if scheduler:
            scheduler.step()

        final_loss += loss.item()

    final_loss /= len(dataloader)

    return final_loss

def inference_fn(model, dataloader, device):
    model.eval()
    preds = []

    for data in dataloader:
        features = data['features'].to(device)

        with torch.no_grad():
            outputs = model(features)

        preds.append(outputs.sigmoid().detach().cpu().numpy())

    preds = np.concatenate(preds).reshape(-1)

    return preds

In [12]:
class SmoothBCEwLogits(_WeightedLoss):
    def __init__(self, weight=None, reduction='mean', smoothing=0.0):
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing
        self.weight = weight
        self.reduction = reduction

    @staticmethod
    def _smooth(targets:torch.Tensor, n_labels:int, smoothing=0.0):
        assert 0 <= smoothing < 1
        with torch.no_grad():
            targets = targets * (1.0 - smoothing) + 0.5 * smoothing
        return targets

    def forward(self, inputs, targets):
        targets = SmoothBCEwLogits._smooth(targets, inputs.size(-1),
            self.smoothing)
        loss = F.binary_cross_entropy_with_logits(inputs, targets,self.weight)

        if  self.reduction == 'sum':
            loss = loss.sum()
        elif  self.reduction == 'mean':
            loss = loss.mean()

        return loss

In [13]:
class EarlyStopping:
    def __init__(self, patience=7, mode="max", delta=0.):
        self.patience = patience
        self.counter = 0
        self.mode = mode
        self.best_score = None
        self.early_stop = False
        self.delta = delta
        if self.mode == "min":
            self.val_score = np.Inf
        else:
            self.val_score = -np.Inf

    def __call__(self, epoch_score, model, model_path):

        if self.mode == "min":
            score = -1.0 * epoch_score
        else:
            score = np.copy(epoch_score)

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(epoch_score, model, model_path)
        elif score < self.best_score: #  + self.delta
            self.counter += 1
            print('EarlyStopping counter: {} out of {}'.format(self.counter, self.patience))
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            # ema.apply_shadow()
            self.save_checkpoint(epoch_score, model, model_path)
            # ema.restore()
            self.counter = 0

    def save_checkpoint(self, epoch_score, model, model_path):
        if epoch_score not in [-np.inf, np.inf, -np.nan, np.nan]:
            print('Validation score improved ({} --> {}). Saving model!'.format(self.val_score, epoch_score))
            torch.save(model.state_dict(), model_path)
        self.val_score = epoch_score

In [14]:
def utility_score_bincount(date, weight, resp, action):
    count_i = len(np.unique(date))
    # print('weight: ', weight)
    # print('resp: ', resp)
    # print('action: ', action)
    # print('weight * resp * action: ', weight * resp * action)
    Pi = np.bincount(date, weight * resp * action)
    t = np.sum(Pi) / np.sqrt(np.sum(Pi ** 2)) * np.sqrt(250 / count_i)
    u = np.clip(t, 0, 6) * np.sum(Pi)
    return u

### 3.3 Training

In [15]:
batch_size = 4096
label_smoothing = 1e-2
learning_rate = 1e-3

start_time = time.time()

# ckp_path = os.path.join('JS20_MLP_01.pth') # './model_sv'

# gkf = GroupKFold(n_splits = 5)
gss = GroupShuffleSplit(n_splits=5, test_size=0.2)
for fold, (idx_t, idx_v) in enumerate(gss.split(data, groups=data['date'])):
    # print(f'fold {fold}: ',len(idx_t),len(idx_v),len(idx_t)+len(idx_v))
    
    train_set = MarketDataset(data.loc[idx_t],featName+emaFeatName)
    train_loader=DataLoader(train_set, batch_size=batch_size, shuffle=True)
    valid_set = MarketDataset(data.loc[idx_v],featName+emaFeatName)
    valid_loader=DataLoader(valid_set, batch_size=batch_size, shuffle=False) # Using True is bad, why??????????
    
    torch.cuda.empty_cache()
    device = torch.device("cuda:1")
    model = Model(len(featName+emaFeatName))
    model.to(device)
    
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    loss_fn = SmoothBCEwLogits(smoothing=label_smoothing)
    es = EarlyStopping(patience=3, mode="max")

    ckp_path = f'JS20_MLP_05_{fold}.pth'
    
    for epoch in range(10):
        train_loss = train_fn(model, optimizer, None, loss_fn, train_loader, device)
        valid_pred = inference_fn(model, valid_loader, device)
        auc_score = roc_auc_score((data.loc[idx_v,'resp']>0).astype(int).values.reshape(-1, 1), valid_pred)
        logloss_score = log_loss((data.loc[idx_v,'resp']>0).astype(int).values.reshape(-1, 1), valid_pred)
        valid_pred = np.where(valid_pred >= 0.5, 1, 0).astype(int)

        u_score = utility_score_bincount(date=data.loc[idx_v,'date'].values, weight=data.loc[idx_v,'weight'].values,
                                         resp=data.loc[idx_v,'resp'].values, action=valid_pred)
        print(f"EPOCH:{epoch:3}, train_loss:{train_loss:.5f}, u_score:{u_score:.5f}, auc:{auc_score:.5f}, logloss:{logloss_score:.5f}, "
              f"time: {(time.time() - start_time) / 60:.2f}min")

        es(auc_score, model, model_path=ckp_path)
        if es.early_stop:
            print("Early stop!")
            break
    break # only train 1 model for fast, you can remove it to train 5 folds

RuntimeError: mat1 dim 1 must match mat2 dim 0

In [None]:
# EPOCH:  0, train_loss:0.69311, u_score:1252.46847, auc:0.53161, logloss:0.69114, time: 1.13min
# Validation score improved (-inf --> 0.531611473976248). Saving model!
# EPOCH:  1, train_loss:0.69083, u_score:1494.60356, auc:0.52942, logloss:0.69150, time: 2.26min
# EarlyStopping counter: 1 out of 3
# EPOCH:  2, train_loss:0.69008, u_score:1475.69509, auc:0.53158, logloss:0.69124, time: 3.40min
# EarlyStopping counter: 2 out of 3
# EPOCH:  3, train_loss:0.68919, u_score:2119.23536, auc:0.53358, logloss:0.69088, time: 4.49min
# Validation score improved (0.531611473976248 --> 0.5335764266654076). Saving model!
# EPOCH:  4, train_loss:0.68825, u_score:1170.78415, auc:0.53083, logloss:0.69242, time: 5.56min
# EarlyStopping counter: 1 out of 3
# EPOCH:  5, train_loss:0.68696, u_score:1035.60073, auc:0.52705, logloss:0.69337, time: 6.67min
# EarlyStopping counter: 2 out of 3
# EPOCH:  6, train_loss:0.68568, u_score:1591.23117, auc:0.53299, logloss:0.69391, time: 7.80min
# EarlyStopping counter: 3 out of 3
# Early stop!

## 4. Submission

In [None]:
models = []
ckp_path_list=[]
for i in range(5): # for fast inference, you can change 1-->5 to get higher score
    torch.cuda.empty_cache()
    device = torch.device("cuda:0")
    model = Model(len(featName+emaFeatName))
    model.to(device)
    model.eval()
    
    ckp_path = f'JS20_MLP_05_{i}.pth'
    ckp_path_list.append(ckp_path)
    model.load_state_dict(torch.load(ckp_path))
    models.append(model)
print(f'Loaded')
print(ckp_path_list)

In [None]:
# Try not to use GPU here ******

# env = janestreet.make_env()
# env_iter = env.iter_test()

# alpha=2/(spanFillNa+1)
# prevDate=None
# opt_th=0.5
# nTest=0
# for (test_df_1, pred_df) in tqdm(env_iter):
#     test_df=test_df_1.iloc[0]
#     # Update fill value
#     if prevDate!=test_df['date']:
#         xx_fill=test_df[fillCol].fillna(f_mean)
#     else:
#         xx_fill=(((1-alpha)*xx_fill+alpha*test_df[fillCol].fillna(0))*(~test_df[fillCol].isna()) +
#                  xx_fill*test_df[fillCol].isna() )
#     if xx_fill.isna().any():
#         print('xx_fill contains NaN'); break
    
#     if test_df['weight'].item() > 0:
#         xx=test_df.loc[featName].copy()
#         if xx[fillCol].isna().any():
#             xx[fillCol]=test_df[fillCol].fillna(xx_fill)
#         for i, clf in enumerate(models):
#             if i == 0:
#                 pred=clf(torch.tensor(np.expand_dims(xx.values,axis=0),dtype=torch.float).to(device)).sigmoid().detach().cpu().numpy()
#             else:
#                 pred+=clf(torch.tensor(np.expand_dims(xx.values,axis=0),dtype=torch.float).to(device)).sigmoid().detach().cpu().numpy()
#         pred/=len(models)
#         pred_df.action=np.where(pred >= opt_th, 1, 0).astype(int)
#     else:
#         pred_df.action = 0
#     env.predict(pred_df)
#     prevDate=test_df['date']
#     nTest+=1
# print(f'nn={nTest}')

## 5. Testing

In [None]:
# data_c=pd.read_csv(os.path.join(ddir,'example_test.csv') )

In [None]:
# # Try not to use GPU here ******

# # env = janestreet.make_env()
# # env_iter = env.iter_test()

# alpha_na=2/(spanFillNa+1)
# alpha_feat=2/(spanFeat+1)
# prevDate=None
# opt_th=0.5
# nTest=0
# # for (test_df_1, pred_df) in tqdm(env_iter):
# #     test_df=test_df_1.iloc[0]
# for idx_c,test_df in data_c.iterrows():
#     # Update fill value
#     if prevDate!=test_df['date']:
#         xx_fill=test_df[fillCol].fillna(f_mean)
#     else:
#         xx_fill=(((1-alpha_na)*xx_fill+alpha_na*test_df[fillCol].fillna(0))*(~test_df[fillCol].isna()) +
#                  xx_fill*test_df[fillCol].isna() )
#     if xx_fill.isna().any():
#         print('xx_fill contains NaN'); break
    
#     xx=test_df.loc[featName].copy()
#     if xx[fillCol].isna().any():
#         xx[fillCol]=test_df[fillCol].fillna(xx_fill)
    
#     # EMA of features
#     if prevDate!=test_df['date']:
#         xx_ema=xx[emaFeatName]
#     else:
#         xx_ema=(1-alpha_feat)*xx_ema+alpha_feat*xx[emaFeatName]
#     xx=pd.concat([xx,xx_ema.rename(index=emaCol)])

#     if test_df['weight'].item() > 0:
#         for i, clf in enumerate(models):
#             if i == 0:
#                 pred=clf(torch.tensor(np.expand_dims(xx.values,axis=0),dtype=torch.float).to(device)).sigmoid().detach().cpu().numpy()
#             else:
#                 pred+=clf(torch.tensor(np.expand_dims(xx.values,axis=0),dtype=torch.float).to(device)).sigmoid().detach().cpu().numpy()
#         pred/=len(models)
#         #pred_df.action=np.where(pred >= opt_th, 1, 0).astype(int)
#     else:
#         #pred_df.action = 0
#         None
#     # env.predict(pred_df)
#     prevDate=test_df['date']
#     nTest+=1
#     if nTest>10:
#         break
# print(f'nn={nTest}')

In [None]:
xx.shape