In [None]:
!pip install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl > /dev/null 2>&1
import datatable as dt

# system
import warnings
warnings.filterwarnings('ignore')

from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, Concatenate, Lambda, GaussianNoise, Activation
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers.experimental.preprocessing import Normalization
import keras.backend as K
import tensorflow as tf
from tensorflow import keras

import numpy as np
from scipy.stats import norm
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score
from sklearn.utils import class_weight

from tqdm import tqdm

import janestreet

In [None]:
# Run-mode switches read by later cells
TRAINING = True   # True: fit the fold models; False: take the weight-loading branch
Display = False   # True: print a describe()/dtypes overview of the training frame
Inf = False       # True: run the janestreet submission loop

# Reproducibility: one seed shared by NumPy and TensorFlow
SEED = 2021
np.random.seed(SEED)
tf.random.set_seed(SEED)

## Reading

In [None]:
# Directory the competition CSV files are read from
folder_path = '../input/jane-street-market-prediction/'
# Directory the per-fold weight files are loaded from when TRAINING is False
save_path = '../input/jane-autoencorder/'

In [None]:
# Load the training set: datatable parses the CSV, the result is converted to pandas
print('Reading train.csv')
train_csv = folder_path + 'train.csv'
train = dt.fread(train_csv).to_pandas()

print('Finish Reading')

## Pre-processing

In [None]:
# Pre-process
print('Pre-process start')
# Downcast every float64 column to float32 to limit memory use
float64_columns = train.select_dtypes(include='float64').columns
train = train.astype(dict.fromkeys(float64_columns, np.float32))
# Keep only the rows whose weight is strictly positive
has_weight = train['weight'] > 0.0
train = train.loc[has_weight]
#train = train[train['date'] > 99].reset_index(drop=True)

## Feature Engineering

In [None]:
# Binary training targets: each action column is 1 where the matching resp column
# (a return) is strictly positive, and 0 otherwise.
for suffix in ('', '_1', '_2', '_3', '_4'):
    train['action' + suffix] = (train['resp' + suffix] > 0.0).astype('int8')

# features

select_feature = False # True
feature_columns = [c for c in train.columns if "feature" in c]
if select_feature:
    # lgbm_feature_importance_top...
    numbers = [18]

    # keep only the feature columns whose numeric suffix is listed in `numbers`
    features = [c for c in feature_columns if int(c.split('_')[-1]) in numbers]
else:
    features = feature_columns

target = 'action'
multi_target = ['action'] + ['action_{}'.format(i) for i in range(1, 5)]

In [None]:
# Per-feature means used to impute missing values in the inference loop.
# They are computed before the frame is filled, so the NaN entries must be skipped:
# a plain mean over a column that contains NaN is itself NaN, and a NaN imputation
# value would poison every row it is applied to.
# features[0] is excluded here because the inference loop only imputes columns 1 onward.
f_mean = np.nanmean(train[features[1:]].values, axis=0)

# Replace the missing values of every column with that column's mean
train.fillna(train.mean(),inplace=True)

In [None]:
#train[features] = (train[features]>0.0).astype('int8')

In [None]:
# Optional overview of the training frame, enabled through the Display switch
if Display:
    print('desccribe')
    summary = train.describe()
    display(summary)
    print(train.dtypes)

## Metric

In [None]:
def utility_score_numba(date, weight, resp, action):
    """Compute the utility score from per-row dates, weights, returns and actions.

    The per-date totals ``Pi`` are the sums of ``weight * resp * action`` grouped by
    ``date``. With ``t = sum(Pi) / sqrt(sum(Pi**2)) * sqrt(250 / len(Pi))``, the score
    is ``min(max(t, 0), 6) * sum(Pi)``.

    Args:
        date: 1-D array of non-negative integers (used as bin indices by np.bincount).
        weight: 1-D array of row weights, same length as ``date``.
        resp: 1-D array of row returns, same length as ``date``.
        action: 1-D array of 0/1 decisions, same length as ``date``.

    Returns:
        The utility as a float. When every per-date total is zero (for example when
        no row is acted on) the ratio ``t`` would be 0/0, so 0.0 is returned instead
        of NaN.
    """
    Pi = np.bincount(date, weight * resp * action)
    total = np.sum(Pi)
    sum_of_squares = np.sum(Pi ** 2)
    if sum_of_squares == 0:
        # Nothing was earned or lost on any date: the score is zero, not 0/0.
        return 0.0
    t = total / np.sqrt(sum_of_squares) * np.sqrt(250 / len(Pi))
    # t is clipped to the [0, 6] range before scaling the total
    u = min(max(t, 0), 6) * total
    return u

def jane_utility(data, action_column="action"):
    """Score a frame with utility_score_numba.

    Reads the "date", "weight" and "resp" columns of ``data`` together with the
    column named by ``action_column`` and forwards their underlying arrays.
    """
    dates = data["date"].values
    weights = data["weight"].values
    returns = data["resp"].values
    actions = data[action_column].values
    return utility_score_numba(dates, weights, returns, actions)

## Model

In [None]:
class WeightedBinaryCrossEntropy(keras.losses.Loss):
    """Binary cross-entropy whose positive-label term is scaled by ``pos_weight``.

    Args:
      pos_weight: Scalar to affect the positive labels of the loss function.
      weight: Scalar to affect the entirety of the loss function.
      from_logits: Whether to compute loss from logits or the probability.
      reduction: Type of tf.keras.losses.Reduction to apply to loss.
      name: Name of the loss function.
    """
    def __init__(self, pos_weight, weight, from_logits=False,
                 reduction=keras.losses.Reduction.AUTO,
                 name='weighted_binary_crossentropy'):
        super(WeightedBinaryCrossEntropy, self).__init__(reduction=reduction,
                                                         name=name)
        self.pos_weight = pos_weight
        self.weight = weight
        self.from_logits = from_logits

    def call(self, y_true, y_pred):
        """Return the element-wise weighted loss (reduction is applied by the base class)."""
        # Labels may arrive as integers or float16; bring them to the prediction dtype
        # so the arithmetic below does not fail on a dtype mismatch.
        y_pred = tf.convert_to_tensor(y_pred)
        y_true = tf.cast(y_true, y_pred.dtype)

        if self.from_logits:
            # Use built in function
            return tf.nn.weighted_cross_entropy_with_logits(y_true, y_pred, self.pos_weight) * self.weight

        # Manually calculate the weighted cross entropy.
        # Formula is q * z * -log(p) + (1 - z) * -log(1 - p)
        # where z are the labels, p = y_pred is treated as a probability
        # (the output of a sigmoid) and q is pos_weight.

        # q * z * -log(p); 1e-6 is added as an epsilon to stop passing a zero into the log
        x_1 = y_true * self.pos_weight * -tf.math.log(y_pred + 1e-6)

        # (1 - z) * -log(1 - p); epsilon is added to prevent passing a zero into the log
        x_2 = (1 - y_true) * -tf.math.log(1 - y_pred + 1e-6)

        return tf.add(x_1, x_2) * self.weight

    def get_config(self):
        """Return the constructor arguments so the loss can be re-created from its config."""
        config = super(WeightedBinaryCrossEntropy, self).get_config()
        config.update({
            'pos_weight': self.pos_weight,
            'weight': self.weight,
            'from_logits': self.from_logits,
        })
        return config

In [None]:
def create_mlp(input_dim, output_dim, label_smoothing, learning_rate,
               hidden_units=(160, 160), dropout_rate=0.2):
    """Build and compile the multi-label MLP classifier.

    Architecture: BatchNormalization and Dropout on the inputs, then for every entry
    of ``hidden_units`` a Dense -> BatchNormalization -> swish -> Dropout block, and
    finally a sigmoid Dense layer with ``output_dim`` units.

    Args:
        input_dim: Number of input features (a shape tuple is passed through as is).
        output_dim: Number of sigmoid outputs.
        label_smoothing: Not used by the active loss; it only belongs to the
            label-smoothed BinaryCrossentropy alternative noted in the compile call.
        learning_rate: Learning rate of the Adam optimizer.
        hidden_units: Width of each hidden Dense layer, one entry per block.
        dropout_rate: Dropout rate used after the input normalization and in each block.

    Returns:
        The compiled Keras model (WeightedBinaryCrossEntropy(0.9, 1.0) loss, AUC metric).
    """
    # Give Keras an explicit shape tuple instead of relying on its handling of a bare int
    input_shape = (int(input_dim),) if np.isscalar(input_dim) else input_dim
    inputs = Input(shape=input_shape)

    x = BatchNormalization()(inputs)
    x = Dropout(dropout_rate)(x)

    for units in hidden_units:
        x = Dense(units)(x)
        x = BatchNormalization()(x)
        x = Activation(tf.keras.activations.swish)(x)
        x = Dropout(dropout_rate)(x)

    out = Dense(output_dim, activation='sigmoid')(x)

    model = Model(inputs=inputs, outputs=out)

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=WeightedBinaryCrossEntropy(0.9, 1.0),
        # alternative: tf.keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing)
        metrics=[tf.keras.metrics.AUC(name="AUC")],
    )

    return model

# Training, Validation

In [None]:
# Training hyper-parameters
epochs = 1000            # upper bound only; EarlyStopping ends each fit
batch_size = 4096
label_smoothing = 0.01
learning_rate = 0.001
n_folds = 5

# Probability threshold at or above which a prediction becomes action = 1
opt_th = 0.500

# One model per fold, filled by either branch below
models = []

if TRAINING:
    date = train['date'].unique()

    # Folds are split over the unique dates so that all rows of a date share a fold.
    # random_state is left out on purpose: scikit-learn rejects it when shuffle=False.
    kf = KFold(n_splits=n_folds, shuffle=False)
    for k, (train_pos, valid_pos) in enumerate(kf.split(date)):
        train_date, valid_date = date[train_pos], date[valid_pos]

        index_train = train['date'].isin(train_date)
        index_valid = train['date'].isin(valid_date)

        X_train = train.loc[index_train, features].values
        X_valid = train.loc[index_valid, features].values
        y_train = train.loc[index_train, multi_target].values.astype('float16')
        y_valid = train.loc[index_valid, multi_target].values.astype('float16')

        print('Fold_{}'.format(k))
        print('=============================================================================')
        print('X_train:',X_train.shape)
        print('y_train:',y_train.shape)
        print('X_valid:',X_valid.shape)
        print('y_valid:',y_valid.shape)

        nn_model = create_mlp(len(features), len(multi_target), label_smoothing, learning_rate)

        nn_model.fit(X_train,
                     y_train,
                     epochs=epochs,
                     validation_data=(X_valid, y_valid),
                     batch_size=batch_size,
                     callbacks=[EarlyStopping('val_loss', patience=2, restore_best_weights=True)]
                    )
        # keep the fitted model for the ensemble
        models.append(nn_model)

        # save model
        nn_model.save_weights('./model_fold{}.hdf5'.format(k))

        # validation
        # Work on an explicit copy so the predicted actions never touch `train`;
        # the true labels and their utility are captured before being overwritten.
        Valid = train.loc[index_valid].copy()
        true_action = Valid['action'].values.copy()
        true_utility = jane_utility(Valid, action_column="action")

        # median over the output columns gives one probability per row
        predictions = np.median(nn_model(X_valid, training = False).numpy(), axis=1)
        Valid['action'] = np.where(predictions >= opt_th, 1, 0).astype('int8')
        pred_action = Valid['action'].values

        print("Valid Jane Utility: {:.2f}".format(true_utility))
        print("Pred  Jane Utility: {:.2f}".format(jane_utility(Valid, action_column="action")))
        print('accuracy:{:.3f}'.format(accuracy_score(true_action, pred_action)))
        print('precision:{:.3f}'.format(precision_score(true_action, pred_action)))
        print('recall:{:.3f}'.format(recall_score(true_action, pred_action)))

else:
    # Rebuild one model per fold and load its saved weights, so `models` is populated
    # exactly as it is after training.
    for k in range(n_folds):
        nn_model = create_mlp(len(features), len(multi_target), label_smoothing, learning_rate)
        nn_model.load_weights(save_path+'model_fold{}.hdf5'.format(k))
        models.append(nn_model)

## Sample_test

In [None]:
# Load the example test set the same way as the training set
print('Reading test.csv')

example_test_csv = folder_path + 'example_test.csv'
test = dt.fread(example_test_csv).to_pandas()

print('Finish Reading')

In [None]:
# Impute missing test values with the training column means, then keep only the
# model input columns as a plain array
train_means = train.mean()
test.fillna(train_means, inplace=True)
test = test[features].to_numpy()

In [None]:
# Ensemble prediction: average the fold models' outputs element-wise, then take the
# median over the output columns to get one probability per row
fold_outputs = [model(test, training = False).numpy() for model in models]
ensemble_output = np.mean(fold_outputs, axis=0)
predictions = np.median(ensemble_output, axis=1)

# Distribution of the predicted probabilities with a KDE and a fitted normal curve
kde_style = {'label': 'kde', 'color': 'k'}
norm_style = {'label': 'norm', 'color': 'red'}
sns.distplot(predictions, kde_kws=kde_style, fit=norm, fit_kws=norm_style, rug=False)

plt.legend()
plt.xlim(0.2, 0.8)
plt.show()

In [None]:
# Count the predicted actions at the decision threshold. `th` is tied to opt_th so
# this check cannot drift from the threshold used for validation and inference.
th = opt_th
pd.DataFrame(np.where(predictions >= th, 1, 0).astype(int)).value_counts()

## Inference

In [None]:
# Inference
# Only the first fold's model is used for submission. It is kept as a one-element
# list (not a bare model) because the ensemble expression below iterates over
# `models`; slicing also keeps this cell safe to re-run.
models = models[:1]

if Inf:

    env = janestreet.make_env()

    for (test_df, pred_df) in tqdm(env.iter_test()):

        # rows with zero weight are passed back with pred_df untouched
        if test_df['weight'].item() > 0:
            x_tt = test_df.loc[:, features].values

            # Impute missing values in columns 1.. with the stored training means.
            # np.where only touches the missing cells, so a non-finite entry in f_mean
            # cannot leak into cells that already hold a value.
            missing = np.isnan(x_tt[:, 1:])
            if missing.any():
                x_tt[:, 1:] = np.where(missing, f_mean, x_tt[:, 1:])

            # average over models, then median over the output columns
            pred = np.median(np.mean([model(x_tt, training = False).numpy() for model in models],axis=0),axis=1)
            pred_df.action = np.where(pred >= opt_th, 1, 0).astype(int)

        env.predict(pred_df)
else:
    print('Traning Done')

In [None]:
if Inf:
    # Read back the written submission file and print how often each action occurs
    submission = pd.read_csv('./submission.csv')
    action_counts = submission.action.value_counts()
    print(action_counts)

In [None]:
if Inf:
    # Bar chart of submitted actions; the series is passed by keyword because newer
    # seaborn releases no longer treat the first positional argument as `x`.
    sns.countplot(x=submission.action);