# Libs

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from random import choices
import gc

from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, Concatenate, Lambda, GaussianNoise, Activation
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers.experimental.preprocessing import Normalization
import tensorflow as tf
import tensorflow_addons as tfa

# Data Import and Processing

In [2]:
# Load the competition training set with pandas.
# (Original author's note, kept for anyone porting this to cudf: cudf is
# sensitive about dtypes and wants .at/.iat instead of .loc/.iloc.)
train = pd.read_csv('/kaggle/input/jane-street-market-prediction/train.csv')

# Keep only rows with date > 85 and a non-zero weight.
# Both conditions are applied as one mask and the index is reset *after* the
# last filter, so the resulting frame has a contiguous 0..n-1 index (resetting
# before the weight filter, as was done previously, left gaps in the index and
# materialised an extra intermediate copy of the frame).
train = train.loc[(train['date'] > 85) & (train['weight'] != 0)].reset_index(drop=True)

In [3]:
# Missing-value handling: every feature column is imputed with its own mean,
# computed on the (already filtered) training frame.
#
# Alternatives that were tried and left here for reference:
#   train.fillna(train.mean(), inplace=True)                 # whole-frame mean fill
#   f_mean = np.mean(train[features[1:]].values, axis=0)     # means without feature_0
# On dropping feature_0 see:
#   https://www.kaggle.com/nanomathias/feature-0-beyond-feature-0
features = [col for col in train.columns if 'feature' in col]
for col in features:
    col_mean = train[col].mean()
    train[col] = train[col].fillna(col_mean)

# Single binary label: 1 when `resp` is positive, 0 otherwise.
train['action'] = (train['resp'].values > 0).astype(int)

# The model is trained on five targets, one per response column.
resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp', 'resp_4']

# Model inputs: all columns whose name contains 'feature'.
X_train = train.loc[:, train.columns.str.contains('feature')]

# Model targets, shape (n_rows, len(resp_cols)): column k is 1 where
# resp_cols[k] > 0 and 0 elsewhere.
y_train = np.stack(
    [(train[name] > 0).astype('int') for name in resp_cols],
    axis=1,
)

# Model

In [4]:
# Seed choice, taken from the comment thread of
# https://www.kaggle.com/tarlannazarov/own-jane-street-with-keras-nn :
#   - 1111 was reported to give the best result;
#   - other seeds were reported to score above 8000;
#   - the worst of a couple of random seeds tried still scored about 5000.
SEED = 1111
np.random.seed(seed=SEED)

# The initial hyperparameters below come from a Keras Tuner Bayesian optimization search.

# fit
def create_mlp(
    num_columns, num_labels, hidden_units, dropout_rates, label_smoothing, learning_rate
):
    """Build and compile a feed-forward multi-label classifier.

    Architecture::

        Input(num_columns) -> BatchNorm -> Dropout(dropout_rates[0])
          -> [Dense(u) -> BatchNorm -> swish -> Dropout(dropout_rates[k + 1])]
             for each u = hidden_units[k]
          -> Dense(num_labels) -> sigmoid

    Args:
        num_columns: Number of input features.
        num_labels: Number of independent binary outputs.
        hidden_units: Sequence with the width of each hidden Dense layer.
        dropout_rates: Sequence of dropout rates; entry 0 is applied right
            after the input BatchNormalization, entry k + 1 after hidden
            layer k.  Must therefore hold at least ``len(hidden_units) + 1``
            values (extra trailing values are ignored).
        label_smoothing: Passed to ``BinaryCrossentropy(label_smoothing=...)``.
        learning_rate: Learning rate for the RectifiedAdam optimizer.

    Returns:
        A compiled ``tf.keras.models.Model`` that reports AUC (named "AUC")
        as its metric.

    Raises:
        ValueError: If ``dropout_rates`` has fewer than
            ``len(hidden_units) + 1`` entries.
    """
    # Fail early with a clear message instead of an IndexError halfway
    # through graph construction.
    required_rates = len(hidden_units) + 1
    if len(dropout_rates) < required_rates:
        raise ValueError(
            f"dropout_rates needs at least {required_rates} values "
            f"(1 for the input + 1 per hidden layer), got {len(dropout_rates)}"
        )

    layers = tf.keras.layers

    inp = layers.Input(shape=(num_columns,))
    x = layers.BatchNormalization()(inp)
    x = layers.Dropout(dropout_rates[0])(x)

    # zip() stops at the shorter sequence, i.e. at len(hidden_units); the
    # length check above guarantees every hidden layer gets a rate.
    for units, rate in zip(hidden_units, dropout_rates[1:]):
        x = layers.Dense(units)(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation(tf.keras.activations.swish)(x)
        x = layers.Dropout(rate)(x)

    x = layers.Dense(num_labels)(x)
    out = layers.Activation("sigmoid")(x)

    model = tf.keras.models.Model(inputs=inp, outputs=out)
    model.compile(
        # RectifiedAdam: chosen by the author as being robust to the choice
        # of learning rate.
        optimizer=tfa.optimizers.RectifiedAdam(learning_rate=learning_rate),
        loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing),
        # `metrics` is documented as a list; pass one explicitly.
        metrics=[tf.keras.metrics.AUC(name="AUC")],
    )

    return model

# ---- Training hyper-parameters ----
epochs = 200
batch_size = 4096
hidden_units = [160, 160, 160]
# One rate for the dropout after the input BatchNorm, then one per hidden layer.
dropout_rates = [0.2, 0.2, 0.2, 0.2]

# Label smoothing is a regularization technique that adds noise to the labels.
# It accounts for the fact that a dataset may contain labelling mistakes, so
# maximizing the likelihood of the given labels directly can be harmful.
# Idea: for a small constant e, assume a training label is correct with
# probability 1 - e and incorrect otherwise.  For a softmax with k outputs the
# hard 0 / 1 targets are replaced by e / (k - 1) and 1 - e respectively.
# Keras' BinaryCrossentropy applies the binary variant: targets are squeezed
# towards 0.5, i.e. y_smooth = y * (1 - label_smoothing) + 0.5 * label_smoothing.
label_smoothing = 1e-2

# Author's note: with a batch size of 4096 and a learning rate of 1e-3 the
# model normally starts to overfit the training set after only ~10 epochs.
learning_rate = 1e-2

# Start from a clean Keras state and make weight initialisation reproducible.
tf.keras.backend.clear_session()
tf.random.set_seed(SEED)

# Take the input/output widths from the arrays actually fed to fit() rather
# than hard-coding them, so they cannot drift from the feature/target lists.
num_features = X_train.shape[1]
num_labels = y_train.shape[1]

clf = create_mlp(num_features, num_labels, hidden_units, dropout_rates, label_smoothing, learning_rate)
clf.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=2)

# Save the trained model.
clf.save('model_v12.h5')

Epoch 1/200
384/384 - 5s - loss: 0.6986 - AUC: 0.5266
Epoch 2/200
384/384 - 5s - loss: 0.6900 - AUC: 0.5412
Epoch 3/200
384/384 - 5s - loss: 0.6894 - AUC: 0.5443
Epoch 4/200
384/384 - 5s - loss: 0.6891 - AUC: 0.5464
Epoch 5/200
384/384 - 5s - loss: 0.6888 - AUC: 0.5478
Epoch 6/200
384/384 - 5s - loss: 0.6886 - AUC: 0.5486
Epoch 7/200
384/384 - 5s - loss: 0.6884 - AUC: 0.5495
Epoch 8/200
384/384 - 5s - loss: 0.6882 - AUC: 0.5501
Epoch 9/200
384/384 - 5s - loss: 0.6879 - AUC: 0.5517
Epoch 10/200
384/384 - 5s - loss: 0.6878 - AUC: 0.5522
Epoch 11/200
384/384 - 5s - loss: 0.6877 - AUC: 0.5528
Epoch 12/200
384/384 - 5s - loss: 0.6874 - AUC: 0.5539
Epoch 13/200
384/384 - 5s - loss: 0.6873 - AUC: 0.5543
Epoch 14/200
384/384 - 5s - loss: 0.6871 - AUC: 0.5552
Epoch 15/200
384/384 - 5s - loss: 0.6870 - AUC: 0.5555
Epoch 16/200
384/384 - 5s - loss: 0.6868 - AUC: 0.5563
Epoch 17/200
384/384 - 5s - loss: 0.6866 - AUC: 0.5571
Epoch 18/200
384/384 - 5s - loss: 0.6864 - AUC: 0.5574
Epoch 19/200
384/38

# Prediction

In [5]:
# Decision threshold, see
# https://www.kaggle.com/gkoundry/the-most-important-model-parameter
# It controls the ratio of ones to zeros in the predictions.  If the market is
# going up you want to predict more ones than zeros.  It is a little more
# complicated than that, because the weights matter and not just the direction
# of the whole market: in the training data the overall market goes up, but the
# weighted returns trend downwards, so for that period you would want to
# predict fewer ones.
# In practice: the model outputs probabilities between 0 and 1, their median is
# compared with `th`, and the action is set to 1 or 0 accordingly.
th = 0.503

# Training-set mean of every feature except the first one; used below to fill
# NaNs in the corresponding columns of each test row.
f_mean = np.mean(train[features[1:]].values, axis=0)

import janestreet
env = janestreet.make_env()

for (test_df, pred_df) in tqdm(env.iter_test()):
    # Default action is 0; it is only re-evaluated when the row has weight > 0.
    action = 0

    if test_df['weight'].item() > 0:
        x_tt = test_df.loc[:, features].values

        # Impute NaNs in columns 1..end with the training means:
        # nan_to_num turns NaN into 0 (and +/-inf into large finite numbers),
        # then the mean is added back only where the value was NaN.
        tail = x_tt[:, 1:]
        was_nan = np.isnan(tail)
        x_tt[:, 1:] = np.nan_to_num(tail) + was_nan * f_mean

        # Median over the model's outputs, then threshold.
        pred = np.median(clf(x_tt))
        action = np.where(pred >= th, 1, 0).astype(int)

    pred_df.action = action
    env.predict(pred_df)

15219it [04:26, 57.16it/s]
