In [1]:
import numpy as np # linear algebra
import pandas as pd
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, Concatenate, Lambda, GaussianNoise, Activation
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers.experimental.preprocessing import Normalization
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold

from tqdm import tqdm
from random import choices
import random

import kerastuner as kt

def set_all_seeds(seed):
    """Seed the NumPy, Python ``random`` and TensorFlow global RNGs.

    Parameters
    ----------
    seed : int
        Value handed unchanged to each of the three seeding functions.
    """
    # Same order as before: NumPy, then the stdlib ``random`` module, then TF.
    for seed_fn in (np.random.seed, random.seed, tf.random.set_seed):
        seed_fn(seed)

ModuleNotFoundError: No module named 'kerastuner'

In [2]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.utils.validation import _deprecate_positional_args

# modified code for group gaps; source
# https://github.com/getgaurav2/scikit-learn/blob/d4a3af5cc9da3a76f0266932644b884c99724c57/sklearn/model_selection/_split.py#L2243

# Use a time-series split: a day's trades must be predicted from earlier days' data only, never from data of the same day.
class PurgedGroupTimeSeriesSplit(_BaseKFold):
    """Time Series cross-validator variant with non-overlapping groups.

    Allows for a gap in groups to avoid potentially leaking info from
    train into test if the model has windowed or lag features.
    Provides train/test indices to split time series data samples
    that are observed at fixed time intervals according to a
    third-party provided group.
    In each split, test indices must be higher than before, and thus shuffling
    in cross validator is inappropriate.
    This cross-validation object is a variation of :class:`KFold`.
    In the kth split, it returns first k folds as train set and the
    (k+1)th fold as test set.
    The same group will not appear in two different folds (the number of
    distinct groups has to be at least equal to the number of folds).
    Note that unlike standard cross-validation methods, successive
    training sets are supersets of those that come before them
    (unless ``max_train_group_size`` caps the training window).
    Read more in the :ref:`User Guide <cross_validation>`.

    Parameters
    ----------
    n_splits : int, default=5
        Number of splits. Must be at least 2. The distinct groups are cut
        into ``n_splits + 1`` folds; the last ``n_splits`` of them serve as
        test folds, one per split.
    max_train_group_size : int, default=Inf
        Maximum number of groups in a single training set.
    group_gap : int, default=None
        Number of groups left out between the end of the training set and
        the start of the test set. ``None`` is treated as ``0`` (no gap).
    max_test_group_size : int, default=Inf
        Maximum number of groups in a single test set.
    verbose : bool, default=False
        Stored on the instance; not used by :meth:`split`.
    """

    @_deprecate_positional_args
    def __init__(self,
                 n_splits=5,
                 *,
                 max_train_group_size=np.inf,
                 max_test_group_size=np.inf,
                 group_gap=None,
                 verbose=False
                 ):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_group_size = max_train_group_size
        self.group_gap = group_gap
        self.max_test_group_size = max_test_group_size
        self.verbose = verbose

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.
        groups : array-like of shape (n_samples,)
            Group labels for the samples used while splitting the dataset into
            train/test set. Groups are ordered by first appearance.

        Yields
        ------
        train : list of int
            The training set indices for that split, in ascending order.
        test : list of int
            The testing set indices for that split, in ascending order.

        Raises
        ------
        ValueError
            If ``groups`` is None, if there are fewer distinct groups than
            ``n_splits + 1``, or if ``group_gap`` leaves no training groups
            before a test fold.
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")
        X, y, groups = indexable(X, y, groups)
        n_splits = self.n_splits
        # The documented default is None; treat it as "no gap" instead of
        # failing on ``int - None`` further down.
        group_gap = 0 if self.group_gap is None else self.group_gap
        max_test_group_size = self.max_test_group_size
        max_train_group_size = self.max_train_group_size
        n_folds = n_splits + 1

        group_labels = np.asarray(groups).ravel()
        sorted_groups, first_seen, inverse = np.unique(
            group_labels, return_index=True, return_inverse=True)
        n_groups = sorted_groups.shape[0]
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds,
                                                     n_groups))

        # Rank every group by the position of its first sample, then map each
        # sample to the rank of its group. A contiguous range of ranks is then
        # exactly a contiguous run of groups in order of appearance.
        appearance_order = np.argsort(first_seen)
        group_rank = np.empty(n_groups, dtype=np.intp)
        group_rank[appearance_order] = np.arange(n_groups)
        sample_rank = group_rank[inverse.ravel()]

        group_test_size = min(n_groups // n_folds, max_test_group_size)
        group_test_starts = range(n_groups - n_splits * group_test_size,
                                  n_groups, group_test_size)
        for group_test_start in group_test_starts:
            train_stop = group_test_start - group_gap
            if train_stop <= 0:
                # A non-positive stop used to be fed to a slice, where a
                # negative value wraps around and pulls test/future groups
                # into the training set. Fail loudly instead.
                raise ValueError(
                    ("group_gap={0} leaves no training groups before the"
                     " test fold starting at group {1}; reduce group_gap"
                     " or n_splits").format(group_gap, group_test_start))
            train_start = max(0, train_stop - max_train_group_size)
            test_stop = group_test_start + group_test_size

            # flatnonzero returns ascending indices, so no extra sort needed.
            train_array = np.flatnonzero(
                (sample_rank >= train_start) & (sample_rank < train_stop))
            test_array = np.flatnonzero(
                (sample_rank >= group_test_start) & (sample_rank < test_stop))

            # NOTE(review): kept from the original implementation - this drops
            # the first ``group_gap`` *samples* (not groups) of the test fold.
            test_array = test_array[group_gap:]

            yield train_array.tolist(), test_array.tolist()

In [3]:
# Cross-validation
# Every layer of a deep-learning model has hyperparameters that can be tuned; the cross-validation tuner class (CVTuner) below is used to search for the best ones.
class CVTuner(kt.engine.tuner.Tuner):
    """Keras Tuner ``Tuner`` whose trials are scored by cross-validation."""

    def run_trial(self, trial, X, y, splits, batch_size=32, epochs=1,callbacks=None):
        """Fit one freshly built model per split and report fold-averaged metrics.

        Parameters
        ----------
        trial : the trial whose ``hyperparameters`` are used to build the model.
        X, y : sequences of indexable arrays (inputs / targets). A sequence
            holding a single array is unwrapped before being passed to ``fit``.
        splits : iterable of ``(train_indices, test_indices)`` pairs.
        batch_size, epochs, callbacks : forwarded to ``model.fit``.
        """
        per_fold_metrics = []
        for train_indices, test_indices in splits:
            X_train = [features[train_indices] for features in X]
            X_test = [features[test_indices] for features in X]
            y_train = [targets[train_indices] for targets in y]
            y_test = [targets[test_indices] for targets in y]

            # Single-input / single-output models receive the bare array.
            if len(X_train) < 2:
                X_train, X_test = X_train[0], X_test[0]
            if len(y_train) < 2:
                y_train, y_test = y_train[0], y_test[0]

            model = self.hypermodel.build(trial.hyperparameters)
            # The held-out fold is the validation set, so callbacks such as
            # early stopping monitor performance on it.
            hist = model.fit(
                X_train,
                y_train,
                validation_data=(X_test, y_test),
                epochs=epochs,
                batch_size=batch_size,
                callbacks=callbacks,
            )

            # Keep the last-epoch value of every tracked metric.
            per_fold_metrics.append(
                [epoch_values[-1] for epoch_values in hist.history.values()])

        per_fold_metrics = np.asarray(per_fold_metrics)
        # Metric names come from the last fold's history; columns share its order.
        averaged_metrics = {}
        for column, metric_name in enumerate(hist.history.keys()):
            averaged_metrics[metric_name] = np.mean(per_fold_metrics[:, column])
        self.oracle.update_trial(trial.trial_id, averaged_metrics)
        # Only the model fitted on the final split is saved for the trial.
        self.save_model(trial.trial_id, model)

NameError: name 'kt' is not defined

In [None]:
# --- Run configuration --------------------------------------------------
# TRAINING and USE_FINETUNE are the only switches that differ between a
# training run and an inference run; the if/else branches in later cells
# read them.
TRAINING = True
USE_FINETUNE = False
FOLDS = xx  # placeholder - set it yourself, typically 5-10
SEED = xx  # placeholder - set it yourself, any integer

# --- Load and clean the training data -------------------------------------
train = pd.read_csv('../input/jane-street-market-prediction/train.csv')

# Drop the first 85 days: the linked discussion argues that Jane Street
# changed their trading model around day 85 and that the earlier data
# behaves differently.
# https://www.kaggle.com/c/jane-street-market-prediction/discussion/201930
train = train.query('date > 85').reset_index(drop = True)

# Downcast float64 columns to float32 to limit memory use.
float64_columns = train.select_dtypes(include='float64').columns
train = train.astype({c: np.float32 for c in float64_columns})

# Missing values are filled with the column mean; the median is worth trying
# as well (train.fillna(train.median())).
train = train.fillna(train.mean())

# Rows with zero weight are dropped (per the original note they do not
# contribute to the scoring evaluation).
train = train.query('weight > 0').reset_index(drop = True)

resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp', 'resp_4']

# action is 1 only when all five resp columns are positive. The rule can be
# adjusted depending on prediction results, e.g.
# train['action']= (train['resp'] > 0).astype('int')
train['action'] = (train[resp_cols] > 0).all(axis=1).astype('int')

features = [c for c in train.columns if 'feature' in c]

# Model input: the feature columns.
X = train[features].values
# Model output: one binary target per resp column (is it > 0?), i.e. a
# multi-target classification problem.
y = (train[resp_cols].values > 0).astype('int')

# Per-feature means (first feature excluded); the inference cell uses them
# to fill missing values.
f_mean = train[features[1:]].values.mean(axis=0)

In [None]:
# Noise and outliers are handled with the approach from the paper linked below: the output of the function contains fewer outliers than its input.
# Creating the autoencoder
# The autoencoder should aid in denoising the data:
# https://www.semanticscholar.org/paper/Deep-Bottleneck-Classifiers-in-Supervised-Dimension-Parviainen/fb86483f7573f6430fe4597432b0cd3e34b16e43

In [None]:
def create_autoencoder(input_dim,output_dim,noise=0.05):
    """Build a supervised denoising autoencoder and the sub-model reused later.

    Layer stack: BatchNormalization -> GaussianNoise -> Dense(64, relu) ->
    Dropout(0.2) -> Dense(input_dim) [named 'decoded'], followed by a small
    classification head Dense(32, relu) -> BatchNormalization -> Dropout(0.2)
    -> Dense(output_dim, sigmoid) [named 'label_output'].

    Parameters
    ----------
    input_dim : int
        Number of input features; also the width of the 'decoded' layer.
    output_dim : int
        Number of sigmoid outputs of the 'label_output' head.
    noise : float, default=0.05
        Standard deviation passed to the GaussianNoise layer. Tunable.

    Returns
    -------
    autoencoder : Model
        Compiled model with outputs ``[decoded, label_output]``, trained with
        MSE on 'decoded' and binary cross-entropy on 'label_output'.
    encoder : Model
        Uncompiled model mapping the input to the 'decoded' tensor. Note that
        this is the reconstruction (width ``input_dim``), not the 64-unit
        bottleneck.

    References
    ----------
    https://machinelearningmastery.com/batch-normalization-for-training-of-deep-neural-networks/
    https://keras.io/api/layers/regularization_layers/gaussian_noise/
    https://keras.io/api/layers/core_layers/dense/
    https://keras.io/api/layers/regularization_layers/dropout/
    https://keras.io/api/models/model_training_apis/
    """
    # Pass the shape as an explicit tuple; a bare int is only tolerated by
    # older tf.keras releases.
    i = Input(shape=(input_dim,))

    # Batch normalisation standardises each mini-batch, which makes training
    # less sensitive to the initial random weights.
    encoded = BatchNormalization()(i)
    # Additive zero-centred Gaussian noise, active during training only; acts
    # as random data augmentation to mitigate overfitting.
    encoded = GaussianNoise(noise)(encoded)
    # Fully connected bottleneck layer with 64 units; relu keeps outputs >= 0.
    encoded = Dense(64,activation='relu')(encoded)

    # Dropout randomly zeroes 20% of the units at each training step.
    decoded = Dropout(0.2)(encoded)
    # Linear reconstruction of the input (input_dim units).
    decoded = Dense(input_dim,name='decoded')(decoded)

    # Classification head stacked on top of the reconstruction.
    x = Dense(32,activation='relu')(decoded)
    x = BatchNormalization()(x)
    x = Dropout(0.2)(x)
    x = Dense(output_dim,activation='sigmoid',name='label_output')(x)

    encoder = Model(inputs=i,outputs=decoded)
    autoencoder = Model(inputs=i,outputs=[decoded,x])

    # Adam with learning rate 0.001; one loss per named output.
    autoencoder.compile(optimizer=Adam(0.001),loss={'decoded':'mse','label_output':'binary_crossentropy'})
    return autoencoder, encoder

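# The encoder built above only transforms the features X; y enters solely as the auxiliary training target of the 'label_output' head.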

In [None]:
# model
def create_model(hp,input_dim,output_dim,encoder):
    """Build the tunable MLP classifier that sits on top of the encoder.

    The network feeds the input through ``encoder``, concatenates the encoder
    output with the raw input, and applies BatchNormalization and Dropout
    followed by 1-3 blocks of Dense -> BatchNormalization -> swish -> Dropout
    and a final sigmoid layer.

    Parameters
    ----------
    hp : Keras Tuner hyperparameters object
        Supplies the searchable values (ranges can be adjusted freely):
        'init_dropout' (0.0-0.5), 'num_layers' (1-3), 'num_units_{i}'
        (64-256), 'dropout_{i}' (0.0-0.5), 'lr' (1e-5 to 0.1, default 1e-3)
        and 'label_smoothing' (0.0-0.1).
    input_dim : int
        Number of input features.
    output_dim : int
        Number of sigmoid outputs.
    encoder : callable Keras model/layer
        Applied to the input tensor; its output is concatenated with the raw
        input.

    Returns
    -------
    Model
        Compiled with Adam, label-smoothed binary cross-entropy and an AUC
        metric named 'auc' (the tuner maximises its validation value).

    Notes
    -----
    Label smoothing replaces each hard target ``t`` in {0, 1} by
    ``t * (1 - s) + 0.5 * s``, i.e. it pulls targets towards 0.5. With
    ``s = 0.2`` the targets 0 and 1 become 0.1 and 0.9. Wrong labels are
    therefore penalised less harshly, which makes training more robust to
    mislabeled samples.
    https://towardsdatascience.com/label-smoothing-making-model-robust-to-incorrect-labels-2fae037ffbd0
    """
    # Pass the shape as an explicit tuple; a bare int is only tolerated by
    # older tf.keras releases.
    inputs = Input(shape=(input_dim,))

    x = encoder(inputs)
    # Use both the raw and the encoded features as the effective input.
    x = Concatenate()([x,inputs])
    x = BatchNormalization()(x)
    x = Dropout(hp.Float('init_dropout',0.0,0.5))(x)

    # Number of hidden blocks is itself a hyperparameter (1, 2 or 3).
    for i in range(hp.Int('num_layers',1,3)):
        x = Dense(hp.Int(f'num_units_{i}',64,256))(x)
        x = BatchNormalization()(x)
        x = Lambda(tf.keras.activations.swish)(x)
        x = Dropout(hp.Float(f'dropout_{i}',0.0,0.5))(x)

    x = Dense(output_dim,activation='sigmoid')(x)
    model = Model(inputs=inputs,outputs=x)
    model.compile(optimizer=Adam(hp.Float('lr',0.00001,0.1,default=0.001)),loss=BinaryCrossentropy(label_smoothing=hp.Float('label_smoothing',0.0,0.1)),metrics=[tf.keras.metrics.AUC(name = 'auc')])
    return model
# you can adjust the range for tuning parameters by yourself

In [None]:
#Defining and training the autoencoder.
#We add gaussian noise with mean and std from training data. 
#After training we lock the layers in the encoder from further training.

In [None]:
# Seed every RNG *before* the network is built: layer weights are initialised
# while the model is constructed, so seeding afterwards leaves the initial
# weights unseeded and the run non-reproducible.
set_all_seeds(SEED)
autoencoder, encoder = create_autoencoder(X.shape[-1],y.shape[-1],noise=0.1)
if TRAINING:
    # Two targets, one per output: X for the 'decoded' reconstruction and y
    # for the 'label_output' head.
    autoencoder.fit(X,(X,y),
                    epochs=1000,
                    # Placeholder - set it yourself, usually a power of two
                    # such as 1024, 2048 or 4096. The batch size is the number
                    # of samples per forward/backward pass; weights are updated
                    # after every batch. Larger batches need more memory and
                    # give fewer updates per epoch; smaller batches give a
                    # noisier gradient estimate.
                    # https://stats.stackexchange.com/questions/153531/what-is-batch-size-in-neural-network
                    batch_size=xx,
                    # 90% of the data is used for training, 10% for validation.
                    validation_split=0.1,
                    # Stop once val_loss has not improved for 10 epochs and
                    # roll back to the weights of the best epoch.
                    callbacks=[EarlyStopping('val_loss',patience=10,restore_best_weights=True)])
    encoder.save_weights('./encoder.hdf5')
else:
    # The folder in this path is a placeholder: replace it with the name of
    # the dataset folder that holds the weights saved by a training run.
    encoder.load_weights('../input/自己命名一个folder之后inference会用到/encoder.hdf5')
# Freeze the encoder so later training does not update its layers.
encoder.trainable = False

In [None]:
# Time series cross-validation
# https://www.kaggle.com/gogo827jz/jane-street-ffill-xgboost-purgedtimeseriescv
# We add the locked encoder as the first layer of the MLP. This seems to help in speeding up the submission compared with first predicting with the encoder and then with the MLP.
# We use a Bayesian optimizer to find the optimal hyperparameters for our model. 20 trials take about 2 hours on GPU.

In [None]:
def model_fn(hp):
    """Build the MLP for the given hyperparameters on top of the frozen encoder."""
    return create_model(hp,X.shape[-1],y.shape[-1],encoder)


# Bayesian optimisation over the search space declared in create_model:
# 4 initial random experiments, at most 20 trials, maximising the
# cross-validated AUC.
tuner = CVTuner(
        hypermodel=model_fn,
        oracle=kt.oracles.BayesianOptimization(
        objective= kt.Objective('val_auc', direction='max'),
        num_initial_points=4,
        max_trials=20))

# FOLDS and SEED are taken from the configuration cell. They are deliberately
# not redefined here, so the seed in the file names below cannot drift away
# from the seed that was actually applied.

if TRAINING:
    # Other group_gap values are worth trying (roughly 10-50).
    gkf = PurgedGroupTimeSeriesSplit(n_splits = FOLDS, group_gap=20)
    splits = list(gkf.split(y, groups=train['date'].values))
    # batch_size=xx is a placeholder - set it yourself; different values are
    # worth trying.
    tuner.search((X,),(y,),splits=splits,batch_size=xx,epochs=100,callbacks=[EarlyStopping('val_auc', mode='max',patience=3)])
    hp  = tuner.get_best_hyperparameters(1)[0]
    pd.to_pickle(hp,f'./best_hp_{SEED}.pkl')
    for fold, (train_indices, test_indices) in enumerate(splits):
        model = model_fn(hp)
        X_train, X_test = X[train_indices], X[test_indices]
        y_train, y_test = y[train_indices], y[test_indices]
        # Use the same batch size as for the tuner search.
        model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=100,batch_size=xx,callbacks=[EarlyStopping('val_auc',mode='max',patience=10,restore_best_weights=True)])
        model.save_weights(f'./model_{SEED}_{fold}.hdf5')
        # Fine-tune: recompile with a 100x smaller learning rate and train
        # for 3 epochs on this fold's held-out data; saved as a separate file.
        model.compile(Adam(hp.get('lr')/100),loss='binary_crossentropy')
        model.fit(X_test,y_test,epochs=3,batch_size=xx)
        model.save_weights(f'./model_{SEED}_{fold}_finetune.hdf5')
    tuner.results_summary()
else:
    # Inference: rebuild one model per fold and load the saved weights. The
    # folder in the paths is a placeholder for the dataset folder that holds
    # the files written by a training run.
    models = []
    hp = pd.read_pickle(f'../input/自己命名一个folder之后inference会用到/best_hp_{SEED}.pkl')
    for f in range(FOLDS):
        model = model_fn(hp)
        if USE_FINETUNE:
            model.load_weights(f'../input/自己命名一个folder之后inference会用到/model_{SEED}_{f}_finetune.hdf5')
        else:
            model.load_weights(f'../input/自己命名一个folder之后inference会用到/model_{SEED}_{f}.hdf5')
        models.append(model)

In [None]:
# will not run for training, will run for prediction
if not TRAINING:
    import janestreet

    f = np.mean          # reducer that collapses the averaged outputs to one score
    th = 0.5             # decision threshold on that score
    models = models[-2:]  # only the last two fold models are used

    env = janestreet.make_env()
    for (test_df, pred_df) in tqdm(env.iter_test()):
        # Anything that is not a strictly positive weight gets action 0
        # without running the models.
        if not test_df['weight'].item() > 0:
            pred_df.action = 0
            env.predict(pred_df)
            continue

        x_tt = test_df.loc[:, features].values
        # A NaN sum signals missing values (first feature excluded): replace
        # them with the training means stored in f_mean.
        if np.isnan(x_tt[:, 1:].sum()):
            x_tt[:, 1:] = np.nan_to_num(x_tt[:, 1:]) + np.isnan(x_tt[:, 1:]) * f_mean

        # Average the model outputs across models, then reduce to one number.
        per_model_outputs = [model(x_tt, training = False).numpy() for model in models]
        pred = f(np.mean(per_model_outputs, axis=0))
        pred_df.action = np.where(pred >= th, 1, 0).astype(int)
        env.predict(pred_df)