# import

In [1]:
import os
import glob
from joblib import Parallel, delayed
import pandas as pd
import numpy as np
import scipy as sc
from sklearn.model_selection import KFold
import lightgbm as lgb
import warnings
# Silence all warnings for the rest of the notebook (blanket filter).
warnings.filterwarnings('ignore')
# Use the fully-qualified option name. The short pattern 'max_columns' is
# ambiguous on pandas >= 1.4 (it also matches 'styler.render.max_columns')
# and raises OptionError there.
pd.set_option('display.max_columns', 300)

In [2]:
# data directory
data_dir = '../input/optiver-realized-volatility-prediction/'

# Recap (data format)

In [9]:
pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv').head()

Unnamed: 0,stock_id,time_id,target
0,0,5,0.004136
1,0,11,0.001445
2,0,16,0.002168
3,0,31,0.002195
4,0,62,0.001747


In [10]:
pd.read_parquet('../input/optiver-realized-volatility-prediction/book_train.parquet/stock_id=0').head()

Unnamed: 0,time_id,seconds_in_bucket,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2
0,5,0,1.001422,1.002301,1.00137,1.002353,3,226,2,100
1,5,1,1.001422,1.002301,1.00137,1.002353,3,100,2,100
2,5,5,1.001422,1.002301,1.00137,1.002405,3,100,2,100
3,5,6,1.001422,1.002301,1.00137,1.002405,3,126,2,100
4,5,7,1.001422,1.002301,1.00137,1.002405,3,126,2,100


In [11]:
pd.read_parquet('../input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id=0').head()

Unnamed: 0,time_id,seconds_in_bucket,price,size,order_count
0,5,21,1.002301,326,12
1,5,46,1.002778,128,4
2,5,50,1.002818,55,1
3,5,57,1.003155,121,5
4,5,68,1.003646,4,1


# WAP

In [16]:
# Function to calculate first WAP
def calc_wap1(df):
    """Return the level-1 weighted average price for each row of ``df``.

    Each price is weighted by the size on the opposite side of the book:
    (bid_price1 * ask_size1 + ask_price1 * bid_size1) / (bid_size1 + ask_size1).
    """
    bid_px, ask_px = df['bid_price1'], df['ask_price1']
    bid_sz, ask_sz = df['bid_size1'], df['ask_size1']
    cross_weighted = bid_px * ask_sz + ask_px * bid_sz
    return cross_weighted / (bid_sz + ask_sz)

# Function to calculate second WAP
def calc_wap2(df):
    """Return the level-2 weighted average price for each row of ``df``.

    Same formula as the level-1 WAP, applied to the second book level:
    (bid_price2 * ask_size2 + ask_price2 * bid_size2) / (bid_size2 + ask_size2).
    """
    bid_px, ask_px = df['bid_price2'], df['ask_price2']
    bid_sz, ask_sz = df['bid_size2'], df['ask_size2']
    cross_weighted = bid_px * ask_sz + ask_px * bid_sz
    return cross_weighted / (bid_sz + ask_sz)

# Log return, volatility

In [3]:
# Function to calculate the log of the return
# Remember that logb(x / y) = logb(x) - logb(y)
def log_return(series):
    """Return the element-to-element log returns of ``series``.

    The first element has no predecessor, so its return is NaN.
    """
    log_values = np.log(series)
    return log_values.diff()

# Calculate the realized volatility
def realized_volatility(series):
    """Return the square root of the sum of squared values in ``series``."""
    squared = series ** 2
    total = np.sum(squared)
    return np.sqrt(total)

In [4]:
# Function to count unique elements of a series
def count_unique(series):
    """Return how many distinct values ``series`` contains."""
    distinct_values = np.unique(series)
    return distinct_values.size

In [5]:
# Function to read our base train and test set
def read_train_test():
    """Load the base train and test tables and add a ``row_id`` merge key.

    The files are read from the module-level ``data_dir`` (the same base
    directory ``preprocessor`` uses) instead of a second hard-coded copy of
    the path.

    Returns:
        (train, test): two DataFrames read from ``train.csv`` and
        ``test.csv``. Each has an extra ``row_id`` column of the form
        ``"<stock_id>-<time_id>"``, the key used to merge with the book and
        trade features.
    """
    train = pd.read_csv(data_dir + 'train.csv')
    test = pd.read_csv(data_dir + 'test.csv')
    # Create a key to merge with book and trade data
    for frame in (train, test):
        frame['row_id'] = frame['stock_id'].astype(str) + '-' + frame['time_id'].astype(str)
    print(f'Our training set has {train.shape[0]} rows')
    return train, test

# Preprocessor (offer book)

まず、offer book に対しての前処理を行う。オリジナルの `book_train.parquet` には以下のカラムが含まれていないので、それを新たに定義する：

- wap
- log_return 
- wap_balance : $|\text{WAP}_1-\text{WAP}_2|$
- price_spread : $\dfrac{p_{a1}-p_{b1}}{(p_{a1}+p_{b1})/2}$
- bid(ask)_spread : $p_1 - p_2$
- total_volume : $(n_{a1}+n_{a2})+(n_{b1}+n_{b2})$
- volume_imbalance : $|(n_{a1}+n_{a2})-(n_{b1}+n_{b2})|$

In [69]:
# Function to preprocess book data (for each stock id)
def book_preprocessor(file_path):
    """Build per-``time_id`` order-book features for a single stock.

    Args:
        file_path: Path of one book parquet partition. The text after the
            last ``=`` is taken as the stock id (e.g. ``.../stock_id=0``).

    Returns:
        DataFrame with one row per ``time_id`` containing:
        - aggregated statistics of every derived feature over the whole
          bucket (columns named ``<feature>_<stat>``),
        - the same statistics restricted to ``seconds_in_bucket >= 450``,
          ``>= 300`` and ``>= 150`` (columns suffixed ``_450``, ``_300``,
          ``_150``; NaN when a ``time_id`` has no rows in that window),
        - ``row_id`` (``"<stock_id>-<time_id>"``) used as the merge key.
    """
    df = pd.read_parquet(file_path)
    # Calculate Wap
    df['wap1'] = calc_wap1(df)
    df['wap2'] = calc_wap2(df)
    # Calculate log returns within each time_id.
    # transform() (not apply()) guarantees the result is aligned to df's own
    # index; on pandas >= 2.0 apply() returns a (time_id, row) MultiIndex that
    # cannot be assigned back as a column.
    df['log_return1'] = df.groupby('time_id')['wap1'].transform(log_return)
    df['log_return2'] = df.groupby('time_id')['wap2'].transform(log_return)
    # Calculate wap balance
    df['wap_balance'] = abs(df['wap1'] - df['wap2'])
    # Calculate spread (relative to the mid price)
    df['price_spread'] = (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1']) / 2)
    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    # Level 1 minus level 2, mirroring bid_spread
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']
    df['total_volume'] = (df['ask_size1'] + df['ask_size2']) + (df['bid_size1'] + df['bid_size2'])
    df['volume_imbalance'] = abs((df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2']))

    # Dict for aggregations.
    # String names select pandas' own sum/mean/std (std with ddof=1), which is
    # what passing np.sum/np.mean/np.std resolved to; pandas 2.x deprecates
    # the numpy-callable aliases. Resulting column names are unchanged.
    basic_stats = ['sum', 'mean', 'std']
    return_stats = ['sum', realized_volatility, 'mean', 'std']
    create_feature_dict = {
        'wap1': basic_stats,
        'wap2': basic_stats,
        'log_return1': return_stats,
        'log_return2': return_stats,
        'wap_balance': basic_stats,
        'price_spread': basic_stats,
        'bid_spread': basic_stats,
        'ask_spread': basic_stats,
        'total_volume': basic_stats,
        'volume_imbalance': basic_stats,
    }

    # Function to get group stats for different windows (seconds in bucket)
    def get_stats_window(seconds_in_bucket, add_suffix = False):
        # Keep only rows at or after the threshold, then aggregate per time_id
        in_window = df[df['seconds_in_bucket'] >= seconds_in_bucket]
        df_feature = in_window.groupby(['time_id']).agg(create_feature_dict).reset_index()
        # Flatten the (feature, stat) MultiIndex; time_id becomes 'time_id_'
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        # Add a suffix to differentiate windows
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
        return df_feature

    # Whole-bucket stats first, then left-join each tail window onto them
    df_feature = get_stats_window(seconds_in_bucket = 0, add_suffix = False)
    for window_start in (450, 300, 150):
        df_window = get_stats_window(seconds_in_bucket = window_start, add_suffix = True)
        window_key = 'time_id__' + str(window_start)
        df_feature = df_feature.merge(df_window, how = 'left', left_on = 'time_id_', right_on = window_key)
        # The window's own copy of time_id is redundant after the join
        df_feature.drop([window_key], axis = 1, inplace = True)

    # Create row_id so we can merge.
    # rsplit on the last '=' so an '=' elsewhere in the directory path cannot
    # be mistaken for the stock id separator.
    stock_id = file_path.rsplit('=', 1)[1]
    df_feature['row_id'] = df_feature['time_id_'].apply(lambda x: f'{stock_id}-{x}')
    df_feature.drop(['time_id_'], axis = 1, inplace = True)
    return df_feature

## get_stats_window について

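新たに定義した特徴量を持った offer book（ここでは `df`）を、さらに `seconds_in_bucket` の下限で絞り込んで集計する（0\~, 150\~, 300\~, 450\~ の 4 通り）。条件は `seconds_in_bucket >= 閾値` なので、各ウィンドウは「閾値以降すべて」であり、互いに重なっている点に注意。集計結果が `df_feature` になる。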

In [59]:
# Walk-through of the book preprocessing steps at module level for stock 0
df = pd.read_parquet('../input/optiver-realized-volatility-prediction/book_train.parquet/stock_id=0')
# Calculate Wap
df['wap1'] = calc_wap1(df)
df['wap2'] = calc_wap2(df)
# Calculate log returns within each time_id.
# transform() keeps the result aligned to df's index; on pandas >= 2.0
# apply() returns a (time_id, row) MultiIndex that cannot be assigned back.
df['log_return1'] = df.groupby('time_id')['wap1'].transform(log_return)
df['log_return2'] = df.groupby('time_id')['wap2'].transform(log_return)
# Calculate wap balance
df['wap_balance'] = abs(df['wap1'] - df['wap2'])
# Calculate spread (relative to the mid price)
df['price_spread'] = (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1']) / 2)
df['bid_spread'] = df['bid_price1'] - df['bid_price2']
df['ask_spread'] = df['ask_price1'] - df['ask_price2']
df['total_volume'] = (df['ask_size1'] + df['ask_size2']) + (df['bid_size1'] + df['bid_size2'])
df['volume_imbalance'] = abs((df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2']))

# Dict for aggregations.
# String names select pandas' own sum/mean/std (std with ddof=1), which is
# what np.sum/np.mean/np.std resolved to; pandas 2.x deprecates the
# numpy-callable aliases. Resulting column names are unchanged.
create_feature_dict = {
    'wap1': ['sum', 'mean', 'std'],
    'wap2': ['sum', 'mean', 'std'],
    'log_return1': ['sum', realized_volatility, 'mean', 'std'],
    'log_return2': ['sum', realized_volatility, 'mean', 'std'],
    'wap_balance': ['sum', 'mean', 'std'],
    'price_spread': ['sum', 'mean', 'std'],
    'bid_spread': ['sum', 'mean', 'std'],
    'ask_spread': ['sum', 'mean', 'std'],
    'total_volume': ['sum', 'mean', 'std'],
    'volume_imbalance': ['sum', 'mean', 'std'],
}

# Function to get group stats for different windows (seconds in bucket)
def get_stats_window(seconds_in_bucket, add_suffix = False):
    """Aggregate ``df`` per time_id over rows with seconds_in_bucket >= threshold.

    Reads the module-level ``df`` and ``create_feature_dict``. Columns are
    flattened to ``<feature>_<stat>`` (``time_id`` becomes ``time_id_``) and,
    when ``add_suffix`` is true, every column gets ``_<seconds_in_bucket>``
    appended.
    """
    # Group by the window
    in_window = df[df['seconds_in_bucket'] >= seconds_in_bucket]
    df_feature = in_window.groupby(['time_id']).agg(create_feature_dict).reset_index()
    # Rename columns joining suffix
    df_feature.columns = ['_'.join(col) for col in df_feature.columns]
    # Add a suffix to differentiate windows
    if add_suffix:
        df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
    return df_feature

### 各データについて

生データは、1 行が (time_id, seconds_in_bucket) の組に対応しており、同じ time_id の行が複数ある。そこで、`get_stats_window` 以降で以下の流れを作っている：

- `groupby('time_id')` : 同じtime window のものをまとめる
    - 元データは (917553,20) --> (3830, 33) になる
    - つまり 3830 のユニークな time_id があるということ
- カラム名は seconds_in_bucket 毎に suffix をつけているので区別できる。
- あとは left join をしていく（キーをどうするかは `left_on, right_on`で指定している）
    - 元データは (3830, 33) --> (3830, 33 $\times$ (結合回数 + 1)) になる（例：1 回結合すると 66 列。重複する time_id 列はその後 drop する）
- 最終的に time_id から row_id を作成する

In [60]:
print(df.shape)
df.head()

(917553, 20)


Unnamed: 0,time_id,seconds_in_bucket,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2,wap1,wap2,log_return1,log_return2,wap_balance,price_spread,bid_spread,ask_spread,total_volume,volume_imbalance
0,5,0,1.001422,1.002301,1.00137,1.002353,3,226,2,100,1.001434,1.00139,,,4.4e-05,0.000878,5.2e-05,-5.2e-05,331,321
1,5,1,1.001422,1.002301,1.00137,1.002353,3,100,2,100,1.001448,1.00139,1.4e-05,0.0,5.8e-05,0.000878,5.2e-05,-5.2e-05,205,195
2,5,5,1.001422,1.002301,1.00137,1.002405,3,100,2,100,1.001448,1.001391,0.0,1e-06,5.7e-05,0.000878,5.2e-05,-0.000103,205,195
3,5,6,1.001422,1.002301,1.00137,1.002405,3,126,2,100,1.001443,1.001391,-5e-06,0.0,5.2e-05,0.000878,5.2e-05,-0.000103,231,221
4,5,7,1.001422,1.002301,1.00137,1.002405,3,126,2,100,1.001443,1.001391,0.0,0.0,5.2e-05,0.000878,5.2e-05,-0.000103,231,221


In [51]:
df_feature     = get_stats_window(seconds_in_bucket = 0, add_suffix = False)
df_feature_450 = get_stats_window(seconds_in_bucket = 450, add_suffix = True)

In [56]:
print(df_feature.shape)
df_feature.head()

(3830, 33)


Unnamed: 0,time_id_,wap1_sum,wap1_mean,wap1_std,wap2_sum,wap2_mean,wap2_std,log_return1_sum,log_return1_realized_volatility,log_return1_mean,log_return1_std,log_return2_sum,log_return2_realized_volatility,log_return2_mean,log_return2_std,wap_balance_sum,wap_balance_mean,wap_balance_std,price_spread_sum,price_spread_mean,price_spread_std,bid_spread_sum,bid_spread_mean,bid_spread_std,ask_spread_sum,ask_spread_mean,ask_spread_std,total_volume_sum,total_volume_mean,total_volume_std,volume_imbalance_sum,volume_imbalance_mean,volume_imbalance_std
0,5,303.125061,1.003725,0.000693,303.105539,1.003661,0.000781,0.002292,0.004499,7.613599e-06,0.00026,0.002325,0.006999,8e-06,0.000404,0.117051,0.000388,0.000295,0.257255,0.000852,0.000211,0.053006,0.000176,0.000162,-0.045557,-0.000151,0.000126,97696,323.496689,138.101214,40738,134.89404,107.260583
1,11,200.047768,1.000239,0.000262,200.041171,1.000206,0.000272,0.00036,0.001204,1.810239e-06,8.6e-05,0.000801,0.002476,4e-06,0.000176,0.042312,0.000212,0.000155,0.078836,0.000394,0.000157,0.028358,0.000142,0.000148,-0.027001,-0.000135,6.5e-05,82290,411.45,172.263581,28410,142.05,102.139758
2,16,187.913849,0.999542,0.000864,187.939824,0.99968,0.000862,-0.002074,0.002369,-1.109201e-05,0.000173,-0.001493,0.004801,-8e-06,0.000352,0.062228,0.000331,0.000246,0.13633,0.000725,0.000164,0.036955,0.000197,0.00017,-0.037243,-0.000198,0.000171,78274,416.351064,138.433034,26586,141.414894,108.891243
3,31,119.859781,0.998832,0.000757,119.835941,0.998633,0.000656,-0.002828,0.002574,-2.376661e-05,0.000236,-0.002053,0.003637,-1.7e-05,0.000334,0.045611,0.00038,0.000248,0.103252,0.00086,0.00028,0.022764,0.00019,0.000199,-0.013001,-0.000108,9.1e-05,52232,435.266667,156.120334,17546,146.216667,121.533215
4,62,175.932865,0.999619,0.000258,175.934256,0.999626,0.000317,-2e-06,0.001894,-1.057099e-08,0.000144,-0.000281,0.003257,-2e-06,0.000247,0.044783,0.000254,0.000188,0.069901,0.000397,0.00013,0.033565,0.000191,8.3e-05,-0.019206,-0.000109,7.6e-05,60407,343.221591,158.054066,21797,123.846591,102.407501


In [58]:
print(df_feature_450.shape)
df_feature_450.head()

(3830, 33)


Unnamed: 0,time_id__450,wap1_sum_450,wap1_mean_450,wap1_std_450,wap2_sum_450,wap2_mean_450,wap2_std_450,log_return1_sum_450,log_return1_realized_volatility_450,log_return1_mean_450,log_return1_std_450,log_return2_sum_450,log_return2_realized_volatility_450,log_return2_mean_450,log_return2_std_450,wap_balance_sum_450,wap_balance_mean_450,wap_balance_std_450,price_spread_sum_450,price_spread_mean_450,price_spread_std_450,bid_spread_sum_450,bid_spread_mean_450,bid_spread_std_450,ask_spread_sum_450,ask_spread_mean_450,ask_spread_std_450,total_volume_sum_450,total_volume_mean_450,total_volume_std_450,volume_imbalance_sum_450,volume_imbalance_mean_450,volume_imbalance_std_450
0,5,68.236749,1.003482,0.000514,68.231672,1.003407,0.00064,-0.000361,0.001721,-5e-06,0.00021,6.8e-05,0.004114,1e-06,0.000503,0.024868,0.000366,0.000277,0.053236,0.000783,0.000181,0.01779,0.000262,0.000178,-0.011274,-0.000166,0.000126,17948,263.941176,116.940077,9620,141.470588,84.467864
1,11,54.027991,1.000518,0.000235,54.021532,1.000399,0.000287,-5.9e-05,0.000918,-1e-06,0.000126,0.000488,0.001883,9e-06,0.000258,0.014524,0.000269,0.000175,0.018812,0.000348,0.000144,0.012598,0.000233,0.000239,-0.007729,-0.000143,6.6e-05,24191,447.981481,177.264272,5275,97.685185,88.144569
2,16,43.922425,0.998237,0.000541,43.933158,0.998481,0.000766,-0.001469,0.001158,-3.3e-05,0.000173,-0.001831,0.002972,-4.2e-05,0.000451,0.016055,0.000365,0.000282,0.026608,0.000605,0.000105,0.008186,0.000186,0.000217,-0.009143,-0.000208,0.000168,20201,459.113636,116.212559,6869,156.113636,102.02467
3,31,17.965415,0.998079,0.00043,17.97163,0.998424,0.000544,-0.000526,0.000993,-2.9e-05,0.000239,-0.000882,0.001424,-4.9e-05,0.000342,0.006441,0.000358,0.000253,0.019047,0.001058,7.4e-05,0.002082,0.000116,4.6e-05,-0.000879,-4.9e-05,1.1e-05,9720,540.0,153.413704,2628,146.0,106.693624
4,62,35.982653,0.999518,0.000257,35.991844,0.999773,0.000212,0.000397,0.001378,1.1e-05,0.000233,-0.000298,0.000966,-8e-06,0.000163,0.013087,0.000364,0.000203,0.0187,0.000519,0.000138,0.00704,0.000196,2.4e-05,-0.004895,-0.000136,6.6e-05,14110,391.944444,123.180227,4212,117.0,99.328028


In [61]:
print(df_feature.merge(df_feature_450, how = 'left', left_on = 'time_id_', right_on = 'time_id__450').shape)
df_feature.merge(df_feature_450, how = 'left', left_on = 'time_id_', right_on = 'time_id__450').head()

(3830, 66)


Unnamed: 0,time_id_,wap1_sum,wap1_mean,wap1_std,wap2_sum,wap2_mean,wap2_std,log_return1_sum,log_return1_realized_volatility,log_return1_mean,log_return1_std,log_return2_sum,log_return2_realized_volatility,log_return2_mean,log_return2_std,wap_balance_sum,wap_balance_mean,wap_balance_std,price_spread_sum,price_spread_mean,price_spread_std,bid_spread_sum,bid_spread_mean,bid_spread_std,ask_spread_sum,ask_spread_mean,ask_spread_std,total_volume_sum,total_volume_mean,total_volume_std,volume_imbalance_sum,volume_imbalance_mean,volume_imbalance_std,time_id__450,wap1_sum_450,wap1_mean_450,wap1_std_450,wap2_sum_450,wap2_mean_450,wap2_std_450,log_return1_sum_450,log_return1_realized_volatility_450,log_return1_mean_450,log_return1_std_450,log_return2_sum_450,log_return2_realized_volatility_450,log_return2_mean_450,log_return2_std_450,wap_balance_sum_450,wap_balance_mean_450,wap_balance_std_450,price_spread_sum_450,price_spread_mean_450,price_spread_std_450,bid_spread_sum_450,bid_spread_mean_450,bid_spread_std_450,ask_spread_sum_450,ask_spread_mean_450,ask_spread_std_450,total_volume_sum_450,total_volume_mean_450,total_volume_std_450,volume_imbalance_sum_450,volume_imbalance_mean_450,volume_imbalance_std_450
0,5,303.125061,1.003725,0.000693,303.105539,1.003661,0.000781,0.002292,0.004499,7.613599e-06,0.00026,0.002325,0.006999,8e-06,0.000404,0.117051,0.000388,0.000295,0.257255,0.000852,0.000211,0.053006,0.000176,0.000162,-0.045557,-0.000151,0.000126,97696,323.496689,138.101214,40738,134.89404,107.260583,5,68.236749,1.003482,0.000514,68.231672,1.003407,0.00064,-0.000361,0.001721,-5e-06,0.00021,6.8e-05,0.004114,1e-06,0.000503,0.024868,0.000366,0.000277,0.053236,0.000783,0.000181,0.01779,0.000262,0.000178,-0.011274,-0.000166,0.000126,17948,263.941176,116.940077,9620,141.470588,84.467864
1,11,200.047768,1.000239,0.000262,200.041171,1.000206,0.000272,0.00036,0.001204,1.810239e-06,8.6e-05,0.000801,0.002476,4e-06,0.000176,0.042312,0.000212,0.000155,0.078836,0.000394,0.000157,0.028358,0.000142,0.000148,-0.027001,-0.000135,6.5e-05,82290,411.45,172.263581,28410,142.05,102.139758,11,54.027991,1.000518,0.000235,54.021532,1.000399,0.000287,-5.9e-05,0.000918,-1e-06,0.000126,0.000488,0.001883,9e-06,0.000258,0.014524,0.000269,0.000175,0.018812,0.000348,0.000144,0.012598,0.000233,0.000239,-0.007729,-0.000143,6.6e-05,24191,447.981481,177.264272,5275,97.685185,88.144569
2,16,187.913849,0.999542,0.000864,187.939824,0.99968,0.000862,-0.002074,0.002369,-1.109201e-05,0.000173,-0.001493,0.004801,-8e-06,0.000352,0.062228,0.000331,0.000246,0.13633,0.000725,0.000164,0.036955,0.000197,0.00017,-0.037243,-0.000198,0.000171,78274,416.351064,138.433034,26586,141.414894,108.891243,16,43.922425,0.998237,0.000541,43.933158,0.998481,0.000766,-0.001469,0.001158,-3.3e-05,0.000173,-0.001831,0.002972,-4.2e-05,0.000451,0.016055,0.000365,0.000282,0.026608,0.000605,0.000105,0.008186,0.000186,0.000217,-0.009143,-0.000208,0.000168,20201,459.113636,116.212559,6869,156.113636,102.02467
3,31,119.859781,0.998832,0.000757,119.835941,0.998633,0.000656,-0.002828,0.002574,-2.376661e-05,0.000236,-0.002053,0.003637,-1.7e-05,0.000334,0.045611,0.00038,0.000248,0.103252,0.00086,0.00028,0.022764,0.00019,0.000199,-0.013001,-0.000108,9.1e-05,52232,435.266667,156.120334,17546,146.216667,121.533215,31,17.965415,0.998079,0.00043,17.97163,0.998424,0.000544,-0.000526,0.000993,-2.9e-05,0.000239,-0.000882,0.001424,-4.9e-05,0.000342,0.006441,0.000358,0.000253,0.019047,0.001058,7.4e-05,0.002082,0.000116,4.6e-05,-0.000879,-4.9e-05,1.1e-05,9720,540.0,153.413704,2628,146.0,106.693624
4,62,175.932865,0.999619,0.000258,175.934256,0.999626,0.000317,-2e-06,0.001894,-1.057099e-08,0.000144,-0.000281,0.003257,-2e-06,0.000247,0.044783,0.000254,0.000188,0.069901,0.000397,0.00013,0.033565,0.000191,8.3e-05,-0.019206,-0.000109,7.6e-05,60407,343.221591,158.054066,21797,123.846591,102.407501,62,35.982653,0.999518,0.000257,35.991844,0.999773,0.000212,0.000397,0.001378,1.1e-05,0.000233,-0.000298,0.000966,-8e-06,0.000163,0.013087,0.000364,0.000203,0.0187,0.000519,0.000138,0.00704,0.000196,2.4e-05,-0.004895,-0.000136,6.6e-05,14110,391.944444,123.180227,4212,117.0,99.328028


## pandas.aggregateについて

`agg` or `aggregate`（同名関数）を用いることで、様々な処理を一括で行うことができる。

In [29]:
tmp = pd.DataFrame({"a":[1,2,3,4,5], "b":[6,7,8,9,10], "c":[11,12,13,14,15]})
tmp.agg([np.sum, np.mean])

Unnamed: 0,a,b,c
sum,15.0,40.0,65.0
mean,3.0,8.0,13.0


また辞書型として与えることで、任意の列に任意の処理を適用することができる。

In [39]:
tmp = pd.DataFrame({"a":[1,2,3,4,5], "b":[1,2,3,4,5], "c":[1,2,3,4,5]})
tmp.agg({"a":[np.mean, np.sum], "b":[np.mean, np.sum]})

Unnamed: 0,a,b
mean,3.0,3.0
sum,15.0,15.0


また、独自に関数を定義して適用することもできる。今回の処理で行っているのは、とある列で groupby して（`time_id`）、aggregate 処理を適用している。そのときに独自の関数も適用している。ただ、そのまま放置すると MultiIndex で返ってしまうので、以降の処理を簡易にするためにカラム名を上書きする必要がある（別にMultiIndexで扱っても良いけど）。また `reset_index` を挟むことで、 一旦groupbyでインデックスになったものをカラムに戻す。

In [48]:
def test1(series):
    """Demo helper for ``agg``: return ``series`` with 1 added."""
    # NOTE(review): this is element-wise, not a reduction; the example below
    # applies it to groups that each contain a single row.
    return series+1
def test2(series):
    """Demo helper for ``agg``: return ``series`` with 1 subtracted."""
    # NOTE(review): element-wise as well; see the single-row groups below.
    return series-1

tmp = pd.DataFrame({"a":[1,2,3,4,5], "b":[1,2,3,4,5], "c":[1,2,3,4,5]})
tmp = tmp.groupby("c").agg({"a":[np.mean, np.sum, test1], "b":[np.mean, np.sum, test2]})
tmp

Unnamed: 0_level_0,a,a,a,b,b,b
Unnamed: 0_level_1,mean,sum,test1,mean,sum,test2
c,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1,1,1,2,1,1,0
2,2,2,3,2,2,1
3,3,3,4,3,3,2
4,4,4,5,4,4,3
5,5,5,6,5,5,4


In [49]:
tmp = tmp.reset_index()
tmp.columns = ['_'.join(col) for col in tmp.columns]
tmp

Unnamed: 0,c_,a_mean,a_sum,a_test1,b_mean,b_sum,b_test2
0,1,1,1,2,1,1,0
1,2,2,2,3,2,2,1
2,3,3,3,4,3,3,2
3,4,4,4,5,4,4,3
4,5,5,5,6,5,5,4


# Preprocessor (trade)

offer book の前処理でやったことと同じことを、trade データに対しても繰り返す。

In [70]:
# Function to preprocess trade data (for each stock id)
def trade_preprocessor(file_path):
    """Build per-``time_id`` trade features for a single stock.

    Args:
        file_path: Path of one trade parquet partition. The text after the
            last ``=`` is taken as the stock id (e.g. ``.../stock_id=0``).

    Returns:
        DataFrame with one row per ``time_id`` that has trades, containing
        ``trade_``-prefixed aggregates (realized volatility of the price log
        return, number of distinct ``seconds_in_bucket``, total ``size``,
        mean ``order_count``) over the whole bucket and over the windows
        ``seconds_in_bucket >= 450 / 300 / 150`` (suffixed ``_450``,
        ``_300``, ``_150``; NaN when there are no trades in that window),
        plus ``row_id`` (``"<stock_id>-<time_id>"``) as the merge key.
    """
    df = pd.read_parquet(file_path)
    # transform() keeps the result aligned to df's index; on pandas >= 2.0
    # apply() returns a (time_id, row) MultiIndex that cannot be assigned back.
    df['log_return'] = df.groupby('time_id')['price'].transform(log_return)

    # Dict for aggregations.
    # 'sum'/'mean' are what np.sum/np.mean resolved to inside agg; pandas 2.x
    # deprecates the numpy-callable aliases. Column names are unchanged.
    create_feature_dict = {
        'log_return': [realized_volatility],
        'seconds_in_bucket': [count_unique],
        'size': ['sum'],
        'order_count': ['mean'],
    }

    # Function to get group stats for different windows (seconds in bucket)
    def get_stats_window(seconds_in_bucket, add_suffix = False):
        # Keep only rows at or after the threshold, then aggregate per time_id
        in_window = df[df['seconds_in_bucket'] >= seconds_in_bucket]
        df_feature = in_window.groupby(['time_id']).agg(create_feature_dict).reset_index()
        # Flatten the (feature, stat) MultiIndex; time_id becomes 'time_id_'
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        # Add a suffix to differentiate windows
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
        return df_feature

    # Whole-bucket stats first, then left-join each tail window onto them
    df_feature = get_stats_window(seconds_in_bucket = 0, add_suffix = False)
    for window_start in (450, 300, 150):
        df_window = get_stats_window(seconds_in_bucket = window_start, add_suffix = True)
        window_key = 'time_id__' + str(window_start)
        df_feature = df_feature.merge(df_window, how = 'left', left_on = 'time_id_', right_on = window_key)
        # The window's own copy of time_id is redundant after the join
        df_feature.drop([window_key], axis = 1, inplace = True)

    # Prefix everything so trade columns cannot collide with book columns
    df_feature = df_feature.add_prefix('trade_')
    # rsplit on the last '=' so an '=' elsewhere in the directory path cannot
    # be mistaken for the stock id separator.
    stock_id = file_path.rsplit('=', 1)[1]
    df_feature['row_id'] = df_feature['trade_time_id_'].apply(lambda x:f'{stock_id}-{x}')
    df_feature.drop(['trade_time_id_'], axis = 1, inplace = True)
    return df_feature

## 前処理の詳細

はじめに groupby して、log_return を追加しておく。

In [64]:
tmp = pd.read_parquet('../input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id=0')
# transform() keeps the per-time_id log returns aligned to tmp's index; on
# pandas >= 2.0 apply() returns a MultiIndex that cannot be assigned back.
tmp['log_return'] = tmp.groupby('time_id')['price'].transform(log_return)
tmp

Unnamed: 0,time_id,seconds_in_bucket,price,size,order_count,log_return
0,5,21,1.002301,326,12,
1,5,46,1.002778,128,4,0.000476
2,5,50,1.002818,55,1,0.000040
3,5,57,1.003155,121,5,0.000336
4,5,68,1.003646,4,1,0.000489
...,...,...,...,...,...,...
123438,32767,471,0.998659,200,3,0.000144
123439,32767,517,0.998515,90,1,-0.000144
123440,32767,523,0.998563,1,1,0.000048
123441,32767,542,0.998803,90,4,0.000240


# Preprocessor

In [72]:
# Function to run the preprocessing functions in parallel (for each stock id)
def preprocessor(list_stock_ids, is_train = True):
    """Build book + trade features for every stock id and stack them.

    Args:
        list_stock_ids: Iterable of stock ids to process.
        is_train: When truthy, read the ``*_train.parquet`` partitions under
            ``data_dir``; otherwise read the ``*_test.parquet`` ones.

    Returns:
        One DataFrame: the per-stock feature frames (book features
        left-joined with trade features on ``row_id``) concatenated
        row-wise with a fresh index.
    """
    # Pick the partition prefixes once; they are the same for every stock
    if is_train:
        book_prefix = "book_train.parquet/stock_id="
        trade_prefix = "trade_train.parquet/stock_id="
    else:
        book_prefix = "book_test.parquet/stock_id="
        trade_prefix = "trade_test.parquet/stock_id="

    # Work unit executed by joblib for a single stock id
    def for_joblib(stock_id):
        file_path_book = data_dir + book_prefix + str(stock_id)
        file_path_trade = data_dir + trade_prefix + str(stock_id)
        book_features = book_preprocessor(file_path_book)
        trade_features = trade_preprocessor(file_path_trade)
        # Keep every book row; trade columns are NaN where no trades exist
        return pd.merge(book_features, trade_features, on = 'row_id', how = 'left')

    # Fan the per-stock jobs out over all available cores
    jobs = (delayed(for_joblib)(stock_id) for stock_id in list_stock_ids)
    per_stock_frames = Parallel(n_jobs = -1, verbose = 1)(jobs)
    # Stack the per-stock frames into one table
    return pd.concat(per_stock_frames, ignore_index = True)

## joblib について

joblibを用いることで、シンプルに並行処理を行うことができる。並行処理には `Parallel` と `delayed` を用いる。

```python
joblib.Parallel(<Parallelへの引数>)(
    joblib.delayed(<実行する関数>)(<関数への引数>) for 変数名 in イテラブル
)
```

ここでは、n_jobs = -1, verbose = 1 を並行処理の引数としている。また並行処理させる関数は `preprocessor` の内部で定義されている `for_joblib` である。この関数はストックID（株銘柄を識別するためのID）を引数に持ち、book と trade を左結合したDFを返す。Parallel の返り値は、これらのDFをIDの個数分だけ並べたリストであり、最終的にそれらを concat して縦方向に連結している（インデックスは無視）。

この preprocessor で作成した DF は、train.csv から作成したDFと結合される。

# get_time_stock

- （いまさらだが）groupby-->agg/apply で新規作成した列名は、その関数名がsuffixとして自動で付いている。なので身に覚えのない `log_return1_realized_volatility_...`という列名を持ったDFをいつの間にか扱っていることに留意する。

In [83]:
# Function to get group stats for the stock_id and time_id
def get_time_stock(df):
    """Append per-stock and per-time statistics of the volatility columns.

    For every realized-volatility column, the mean/std/max/min are computed
    once grouped by ``stock_id`` (columns suffixed ``_stock``) and once
    grouped by ``time_id`` (columns suffixed ``_time``), then left-joined
    back onto ``df``. The input frame is not modified; a new frame is
    returned.
    """
    # Realized volatility columns to summarise
    vol_cols = [
        'log_return1_realized_volatility',
        'log_return2_realized_volatility',
        'log_return1_realized_volatility_450',
        'log_return2_realized_volatility_450',
        'log_return1_realized_volatility_300',
        'log_return2_realized_volatility_300',
        'log_return1_realized_volatility_150',
        'log_return2_realized_volatility_150',
        'trade_log_return_realized_volatility',
        'trade_log_return_realized_volatility_450',
        'trade_log_return_realized_volatility_300',
        'trade_log_return_realized_volatility_150',
    ]
    stat_names = ['mean', 'std', 'max', 'min', ]

    def _grouped_stats(group_key, tag):
        # Aggregate per group, flatten (column, stat) names, then tag them.
        # The key column ends up named '<group_key>__<tag>'.
        stats = df.groupby([group_key])[vol_cols].agg(stat_names).reset_index()
        stats.columns = ['_'.join(parts) for parts in stats.columns]
        return stats.add_suffix('_' + tag)

    # Statistics across all time buckets of each stock
    per_stock = _grouped_stats('stock_id', 'stock')
    # Statistics across all stocks within each time bucket
    per_time = _grouped_stats('time_id', 'time')

    # Attach both stat tables to the original rows
    merged = df.merge(per_stock, how = 'left', left_on = ['stock_id'], right_on = ['stock_id__stock'])
    merged = merged.merge(per_time, how = 'left', left_on = ['time_id'], right_on = ['time_id__time'])
    # The tagged key columns duplicate stock_id / time_id
    merged.drop(['stock_id__stock', 'time_id__time'], axis = 1, inplace = True)
    return merged

# Train 前のデータ最終形態

In [87]:
# Read train.csv and test.csv
# Read train.csv and test.csv (each comes back with a 'row_id' merge key)
train, test = read_train_test()

# Stock ids to preprocess: hard-coded to stock 0 only for this walkthrough,
# not the full set of unique stock ids in train
train_stock_ids = [0,]

# Build book/trade features per stock in parallel, then left-join them onto
# the base table via row_id (rows of other stocks get NaN features)
train_ = preprocessor(train_stock_ids, is_train = True)
train = train.merge(train_, on = ['row_id'], how = 'left')

# Get group stats of time_id and stock_id
train = get_time_stock(train)

# Show the rows of stock 0, the only stock preprocessed above
train[train["stock_id"]==0]

Our training set has 428932 rows


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   25.0s finished


Unnamed: 0,stock_id,time_id,target,row_id,wap1_sum,wap1_mean,wap1_std,wap2_sum,wap2_mean,wap2_std,log_return1_sum,log_return1_realized_volatility,log_return1_mean,log_return1_std,log_return2_sum,log_return2_realized_volatility,log_return2_mean,log_return2_std,wap_balance_sum,wap_balance_mean,wap_balance_std,price_spread_sum,price_spread_mean,price_spread_std,bid_spread_sum,bid_spread_mean,bid_spread_std,ask_spread_sum,ask_spread_mean,ask_spread_std,total_volume_sum,total_volume_mean,total_volume_std,volume_imbalance_sum,volume_imbalance_mean,volume_imbalance_std,wap1_sum_450,wap1_mean_450,wap1_std_450,wap2_sum_450,wap2_mean_450,wap2_std_450,log_return1_sum_450,log_return1_realized_volatility_450,log_return1_mean_450,log_return1_std_450,log_return2_sum_450,log_return2_realized_volatility_450,log_return2_mean_450,log_return2_std_450,wap_balance_sum_450,wap_balance_mean_450,wap_balance_std_450,price_spread_sum_450,price_spread_mean_450,price_spread_std_450,bid_spread_sum_450,bid_spread_mean_450,bid_spread_std_450,ask_spread_sum_450,ask_spread_mean_450,ask_spread_std_450,total_volume_sum_450,total_volume_mean_450,total_volume_std_450,volume_imbalance_sum_450,volume_imbalance_mean_450,volume_imbalance_std_450,wap1_sum_300,wap1_mean_300,wap1_std_300,wap2_sum_300,wap2_mean_300,wap2_std_300,log_return1_sum_300,log_return1_realized_volatility_300,log_return1_mean_300,log_return1_std_300,log_return2_sum_300,log_return2_realized_volatility_300,log_return2_mean_300,log_return2_std_300,wap_balance_sum_300,wap_balance_mean_300,wap_balance_std_300,price_spread_sum_300,price_spread_mean_300,price_spread_std_300,bid_spread_sum_300,bid_spread_mean_300,bid_spread_std_300,ask_spread_sum_300,ask_spread_mean_300,ask_spread_std_300,total_volume_sum_300,total_volume_mean_300,total_volume_std_300,volume_imbalance_sum_300,volume_imbalance_mean_300,volume_imbalance_std_300,wap1_sum_150,wap1_mean_150,wap1_std_150,wap2_sum_150,wap2_mean_150,wap2_std_150,log_return1_sum_150,log_return1_realized_
volatility_150,log_return1_mean_150,log_return1_std_150,log_return2_sum_150,log_return2_realized_volatility_150,log_return2_mean_150,log_return2_std_150,wap_balance_sum_150,wap_balance_mean_150,wap_balance_std_150,price_spread_sum_150,price_spread_mean_150,price_spread_std_150,bid_spread_sum_150,bid_spread_mean_150,bid_spread_std_150,ask_spread_sum_150,ask_spread_mean_150,ask_spread_std_150,total_volume_sum_150,total_volume_mean_150,total_volume_std_150,volume_imbalance_sum_150,volume_imbalance_mean_150,volume_imbalance_std_150,trade_log_return_realized_volatility,trade_seconds_in_bucket_count_unique,trade_size_sum,trade_order_count_mean,trade_log_return_realized_volatility_450,trade_seconds_in_bucket_count_unique_450,trade_size_sum_450,trade_order_count_mean_450,trade_log_return_realized_volatility_300,trade_seconds_in_bucket_count_unique_300,trade_size_sum_300,trade_order_count_mean_300,trade_log_return_realized_volatility_150,trade_seconds_in_bucket_count_unique_150,trade_size_sum_150,trade_order_count_mean_150,log_return1_realized_volatility_mean_stock,log_return1_realized_volatility_std_stock,log_return1_realized_volatility_max_stock,log_return1_realized_volatility_min_stock,log_return2_realized_volatility_mean_stock,log_return2_realized_volatility_std_stock,log_return2_realized_volatility_max_stock,log_return2_realized_volatility_min_stock,log_return1_realized_volatility_450_mean_stock,log_return1_realized_volatility_450_std_stock,log_return1_realized_volatility_450_max_stock,log_return1_realized_volatility_450_min_stock,log_return2_realized_volatility_450_mean_stock,log_return2_realized_volatility_450_std_stock,log_return2_realized_volatility_450_max_stock,log_return2_realized_volatility_450_min_stock,log_return1_realized_volatility_300_mean_stock,log_return1_realized_volatility_300_std_stock,log_return1_realized_volatility_300_max_stock,log_return1_realized_volatility_300_min_stock,log_return2_realized_volatility_300_mean_stock,log_return2_realized_volatilit
y_300_std_stock,log_return2_realized_volatility_300_max_stock,log_return2_realized_volatility_300_min_stock,log_return1_realized_volatility_150_mean_stock,log_return1_realized_volatility_150_std_stock,log_return1_realized_volatility_150_max_stock,log_return1_realized_volatility_150_min_stock,log_return2_realized_volatility_150_mean_stock,log_return2_realized_volatility_150_std_stock,log_return2_realized_volatility_150_max_stock,log_return2_realized_volatility_150_min_stock,trade_log_return_realized_volatility_mean_stock,trade_log_return_realized_volatility_std_stock,trade_log_return_realized_volatility_max_stock,trade_log_return_realized_volatility_min_stock,trade_log_return_realized_volatility_450_mean_stock,trade_log_return_realized_volatility_450_std_stock,trade_log_return_realized_volatility_450_max_stock,trade_log_return_realized_volatility_450_min_stock,trade_log_return_realized_volatility_300_mean_stock,trade_log_return_realized_volatility_300_std_stock,trade_log_return_realized_volatility_300_max_stock,trade_log_return_realized_volatility_300_min_stock,trade_log_return_realized_volatility_150_mean_stock,trade_log_return_realized_volatility_150_std_stock,trade_log_return_realized_volatility_150_max_stock,trade_log_return_realized_volatility_150_min_stock,log_return1_realized_volatility_mean_time,log_return1_realized_volatility_std_time,log_return1_realized_volatility_max_time,log_return1_realized_volatility_min_time,log_return2_realized_volatility_mean_time,log_return2_realized_volatility_std_time,log_return2_realized_volatility_max_time,log_return2_realized_volatility_min_time,log_return1_realized_volatility_450_mean_time,log_return1_realized_volatility_450_std_time,log_return1_realized_volatility_450_max_time,log_return1_realized_volatility_450_min_time,log_return2_realized_volatility_450_mean_time,log_return2_realized_volatility_450_std_time,log_return2_realized_volatility_450_max_time,log_return2_realized_volatility_450_min_time,log_return1_realized_volat
ility_300_mean_time,log_return1_realized_volatility_300_std_time,log_return1_realized_volatility_300_max_time,log_return1_realized_volatility_300_min_time,log_return2_realized_volatility_300_mean_time,log_return2_realized_volatility_300_std_time,log_return2_realized_volatility_300_max_time,log_return2_realized_volatility_300_min_time,log_return1_realized_volatility_150_mean_time,log_return1_realized_volatility_150_std_time,log_return1_realized_volatility_150_max_time,log_return1_realized_volatility_150_min_time,log_return2_realized_volatility_150_mean_time,log_return2_realized_volatility_150_std_time,log_return2_realized_volatility_150_max_time,log_return2_realized_volatility_150_min_time,trade_log_return_realized_volatility_mean_time,trade_log_return_realized_volatility_std_time,trade_log_return_realized_volatility_max_time,trade_log_return_realized_volatility_min_time,trade_log_return_realized_volatility_450_mean_time,trade_log_return_realized_volatility_450_std_time,trade_log_return_realized_volatility_450_max_time,trade_log_return_realized_volatility_450_min_time,trade_log_return_realized_volatility_300_mean_time,trade_log_return_realized_volatility_300_std_time,trade_log_return_realized_volatility_300_max_time,trade_log_return_realized_volatility_300_min_time,trade_log_return_realized_volatility_150_mean_time,trade_log_return_realized_volatility_150_std_time,trade_log_return_realized_volatility_150_max_time,trade_log_return_realized_volatility_150_min_time
0,0,5,0.004136,0-5,303.125061,1.003725,0.000693,303.105539,1.003661,0.000781,0.002292,0.004499,7.613599e-06,0.000260,0.002325,0.006999,0.000008,0.000404,0.117051,0.000388,0.000295,0.257255,0.000852,0.000211,0.053006,0.000176,0.000162,-0.045557,-0.000151,0.000126,97696.0,323.496689,138.101214,40738.0,134.894040,107.260583,68.236749,1.003482,0.000514,68.231672,1.003407,0.000640,-0.000361,0.001721,-0.000005,0.000210,0.000068,0.004114,0.000001,0.000503,0.024868,0.000366,0.000277,0.053236,0.000783,0.000181,0.017790,0.000262,0.000178,-0.011274,-0.000166,0.000126,17948.0,263.941176,116.940077,9620.0,141.470588,84.467864,139.521722,1.003753,0.000487,139.509756,1.003667,0.000585,0.000157,0.002953,1.131529e-06,0.000251,0.000274,0.004863,0.000002,0.000414,0.051757,0.000372,0.000273,0.114272,0.000822,0.000237,0.030976,0.000223,0.000173,-0.022548,-0.000162,0.000131,40995.0,294.928058,136.527199,19065.0,137.158273,97.898813,232.888919,1.003832,0.000445,232.870736,1.003753,0.000519,0.000276,0.003796,0.000001,0.000250,0.000003,0.006087,1.295471e-08,0.000400,0.091997,0.000397,0.000281,0.199058,0.000858,0.000221,0.043697,0.000188,0.000165,-0.034024,-0.000147,0.000120,75964.0,327.431034,142.761068,28672.0,123.586207,103.533216,0.002006,40.0,3179.0,2.750000,0.001060,14.0,1042.0,2.642857,0.001308,21.0,1587.0,2.571429,0.001701,30.0,2069.0,2.433333,0.004459,0.003698,0.052704,0.000747,0.006856,0.005494,0.086671,0.001347,0.00202,0.00169,0.036491,0.000046,0.003102,0.002308,0.028931,0.0,0.002973,0.002408,0.050501,0.000227,0.004559,0.003414,0.047745,0.000454,0.003751,0.003075,0.052446,0.000485,0.005746,0.004383,0.05907,0.000941,0.00233,0.001763,0.028551,0.0,0.001102,0.00086,0.008911,0.0,0.001613,0.001212,0.021736,0.0,0.002028,0.001545,0.027483,0.0,0.004499,,0.004499,0.004499,0.006999,,0.006999,0.006999,0.001721,,0.001721,0.001721,0.004114,,0.004114,0.004114,0.002953,,0.002953,0.002953,0.004863,,0.004863,0.004863,0.003796,,0.003796,0.003796,0.006087,,0.006087,0.006087,0.002006,,0.002006,0.00200
6,0.001060,,0.001060,0.001060,0.001308,,0.001308,0.001308,0.001701,,0.001701,0.001701
1,0,11,0.001445,0-11,200.047768,1.000239,0.000262,200.041171,1.000206,0.000272,0.000360,0.001204,1.810239e-06,0.000086,0.000801,0.002476,0.000004,0.000176,0.042312,0.000212,0.000155,0.078836,0.000394,0.000157,0.028358,0.000142,0.000148,-0.027001,-0.000135,0.000065,82290.0,411.450000,172.263581,28410.0,142.050000,102.139758,54.027991,1.000518,0.000235,54.021532,1.000399,0.000287,-0.000059,0.000918,-0.000001,0.000126,0.000488,0.001883,0.000009,0.000258,0.014524,0.000269,0.000175,0.018812,0.000348,0.000144,0.012598,0.000233,0.000239,-0.007729,-0.000143,0.000066,24191.0,447.981481,177.264272,5275.0,97.685185,88.144569,115.045656,1.000397,0.000207,115.039774,1.000346,0.000241,0.000096,0.000981,8.383753e-07,0.000092,0.000413,0.002009,0.000004,0.000188,0.027445,0.000239,0.000158,0.040589,0.000353,0.000121,0.018873,0.000164,0.000180,-0.014153,-0.000123,0.000059,55720.0,484.521739,168.586713,15584.0,135.513043,110.256349,173.052001,1.000301,0.000221,173.042301,1.000245,0.000266,0.000298,0.001058,0.000002,0.000081,0.000873,0.002262,5.044579e-06,0.000172,0.035454,0.000205,0.000158,0.061017,0.000353,0.000112,0.024394,0.000141,0.000154,-0.022032,-0.000127,0.000058,72535.0,419.277457,178.652395,26221.0,151.566474,104.576846,0.000901,30.0,1289.0,1.900000,0.000501,10.0,828.0,2.200000,0.000587,16.0,900.0,2.250000,0.000813,24.0,1173.0,2.041667,0.004459,0.003698,0.052704,0.000747,0.006856,0.005494,0.086671,0.001347,0.00202,0.00169,0.036491,0.000046,0.003102,0.002308,0.028931,0.0,0.002973,0.002408,0.050501,0.000227,0.004559,0.003414,0.047745,0.000454,0.003751,0.003075,0.052446,0.000485,0.005746,0.004383,0.05907,0.000941,0.00233,0.001763,0.028551,0.0,0.001102,0.00086,0.008911,0.0,0.001613,0.001212,0.021736,0.0,0.002028,0.001545,0.027483,0.0,0.001204,,0.001204,0.001204,0.002476,,0.002476,0.002476,0.000918,,0.000918,0.000918,0.001883,,0.001883,0.001883,0.000981,,0.000981,0.000981,0.002009,,0.002009,0.002009,0.001058,,0.001058,0.001058,0.002262,,0.002262,0.002262,0.000901,,0.000901,0.00090
1,0.000501,,0.000501,0.000501,0.000587,,0.000587,0.000587,0.000813,,0.000813,0.000813
2,0,16,0.002168,0-16,187.913849,0.999542,0.000864,187.939824,0.999680,0.000862,-0.002074,0.002369,-1.109201e-05,0.000173,-0.001493,0.004801,-0.000008,0.000352,0.062228,0.000331,0.000246,0.136330,0.000725,0.000164,0.036955,0.000197,0.000170,-0.037243,-0.000198,0.000171,78274.0,416.351064,138.433034,26586.0,141.414894,108.891243,43.922425,0.998237,0.000541,43.933158,0.998481,0.000766,-0.001469,0.001158,-0.000033,0.000173,-0.001831,0.002972,-0.000042,0.000451,0.016055,0.000365,0.000282,0.026608,0.000605,0.000105,0.008186,0.000186,0.000217,-0.009143,-0.000208,0.000168,20201.0,459.113636,116.212559,6869.0,156.113636,102.024670,67.910601,0.998685,0.000779,67.927550,0.998935,0.000891,-0.002591,0.001295,-3.810560e-05,0.000153,-0.001549,0.003196,-0.000023,0.000390,0.029308,0.000431,0.000294,0.046866,0.000689,0.000162,0.009622,0.000141,0.000185,-0.016945,-0.000249,0.000190,30956.0,455.235294,120.920736,9802.0,144.147059,101.873534,118.896016,0.999126,0.000829,118.918175,0.999312,0.000853,-0.002854,0.002138,-0.000024,0.000195,-0.002986,0.004019,-2.509050e-05,0.000369,0.044347,0.000373,0.000276,0.080811,0.000679,0.000163,0.019100,0.000161,0.000155,-0.028626,-0.000241,0.000195,50996.0,428.537815,135.376048,15718.0,132.084034,114.924631,0.001961,25.0,2161.0,2.720000,0.001048,9.0,1085.0,3.666667,0.001137,12.0,1189.0,3.166667,0.001621,20.0,2010.0,2.950000,0.004459,0.003698,0.052704,0.000747,0.006856,0.005494,0.086671,0.001347,0.00202,0.00169,0.036491,0.000046,0.003102,0.002308,0.028931,0.0,0.002973,0.002408,0.050501,0.000227,0.004559,0.003414,0.047745,0.000454,0.003751,0.003075,0.052446,0.000485,0.005746,0.004383,0.05907,0.000941,0.00233,0.001763,0.028551,0.0,0.001102,0.00086,0.008911,0.0,0.001613,0.001212,0.021736,0.0,0.002028,0.001545,0.027483,0.0,0.002369,,0.002369,0.002369,0.004801,,0.004801,0.004801,0.001158,,0.001158,0.001158,0.002972,,0.002972,0.002972,0.001295,,0.001295,0.001295,0.003196,,0.003196,0.003196,0.002138,,0.002138,0.002138,0.004019,,0.004019,0.004019,0.001961,,0.
001961,0.001961,0.001048,,0.001048,0.001048,0.001137,,0.001137,0.001137,0.001621,,0.001621,0.001621
3,0,31,0.002195,0-31,119.859781,0.998832,0.000757,119.835941,0.998633,0.000656,-0.002828,0.002574,-2.376661e-05,0.000236,-0.002053,0.003637,-0.000017,0.000334,0.045611,0.000380,0.000248,0.103252,0.000860,0.000280,0.022764,0.000190,0.000199,-0.013001,-0.000108,0.000091,52232.0,435.266667,156.120334,17546.0,146.216667,121.533215,17.965415,0.998079,0.000430,17.971630,0.998424,0.000544,-0.000526,0.000993,-0.000029,0.000239,-0.000882,0.001424,-0.000049,0.000342,0.006441,0.000358,0.000253,0.019047,0.001058,0.000074,0.002082,0.000116,0.000046,-0.000879,-0.000049,0.000011,9720.0,540.000000,153.413704,2628.0,146.000000,106.693624,52.917110,0.998436,0.000504,52.918125,0.998455,0.000513,-0.001179,0.001776,-2.224226e-05,0.000245,-0.000440,0.002713,-0.000008,0.000376,0.017525,0.000331,0.000228,0.044159,0.000833,0.000278,0.008375,0.000158,0.000165,-0.005043,-0.000095,0.000076,22163.0,418.169811,146.485459,7669.0,144.698113,101.135778,80.875601,0.998464,0.000432,80.866621,0.998353,0.000477,-0.001290,0.002196,-0.000016,0.000245,-0.001112,0.003273,-1.372564e-05,0.000366,0.029323,0.000362,0.000247,0.074552,0.000920,0.000296,0.013789,0.000170,0.000191,-0.008745,-0.000108,0.000085,34363.0,424.234568,156.628404,12293.0,151.765432,124.293028,0.001561,15.0,1962.0,3.933333,0.000802,3.0,514.0,3.666667,0.001089,9.0,1556.0,5.111111,0.001401,11.0,1631.0,4.545455,0.004459,0.003698,0.052704,0.000747,0.006856,0.005494,0.086671,0.001347,0.00202,0.00169,0.036491,0.000046,0.003102,0.002308,0.028931,0.0,0.002973,0.002408,0.050501,0.000227,0.004559,0.003414,0.047745,0.000454,0.003751,0.003075,0.052446,0.000485,0.005746,0.004383,0.05907,0.000941,0.00233,0.001763,0.028551,0.0,0.001102,0.00086,0.008911,0.0,0.001613,0.001212,0.021736,0.0,0.002028,0.001545,0.027483,0.0,0.002574,,0.002574,0.002574,0.003637,,0.003637,0.003637,0.000993,,0.000993,0.000993,0.001424,,0.001424,0.001424,0.001776,,0.001776,0.001776,0.002713,,0.002713,0.002713,0.002196,,0.002196,0.002196,0.003273,,0.003273,0.003273,0.001561,,0.00156
1,0.001561,0.000802,,0.000802,0.000802,0.001089,,0.001089,0.001089,0.001401,,0.001401,0.001401
4,0,62,0.001747,0-62,175.932865,0.999619,0.000258,175.934256,0.999626,0.000317,-0.000002,0.001894,-1.057099e-08,0.000144,-0.000281,0.003257,-0.000002,0.000247,0.044783,0.000254,0.000188,0.069901,0.000397,0.000130,0.033565,0.000191,0.000083,-0.019206,-0.000109,0.000076,60407.0,343.221591,158.054066,21797.0,123.846591,102.407501,35.982653,0.999518,0.000257,35.991844,0.999773,0.000212,0.000397,0.001378,0.000011,0.000233,-0.000298,0.000966,-0.000008,0.000163,0.013087,0.000364,0.000203,0.018700,0.000519,0.000138,0.007040,0.000196,0.000024,-0.004895,-0.000136,0.000066,14110.0,391.944444,123.180227,4212.0,117.000000,99.328028,88.954468,0.999488,0.000205,88.965742,0.999615,0.000272,0.000645,0.001520,7.249930e-06,0.000162,-0.000201,0.002188,-0.000002,0.000233,0.022397,0.000252,0.000188,0.037820,0.000425,0.000140,0.017016,0.000191,0.000073,-0.010722,-0.000120,0.000076,36275.0,407.584270,165.851509,8851.0,99.449438,93.029811,134.948413,0.999618,0.000259,134.955499,0.999670,0.000293,0.000491,0.001609,0.000004,0.000139,0.000299,0.002927,2.213193e-06,0.000253,0.032718,0.000242,0.000193,0.053347,0.000395,0.000137,0.025220,0.000187,0.000088,-0.015757,-0.000117,0.000080,50121.0,371.266667,162.610706,17749.0,131.474074,109.275622,0.000871,22.0,1791.0,4.045455,0.000360,4.0,43.0,3.500000,0.000453,11.0,1219.0,4.909091,0.000550,16.0,1570.0,4.500000,0.004459,0.003698,0.052704,0.000747,0.006856,0.005494,0.086671,0.001347,0.00202,0.00169,0.036491,0.000046,0.003102,0.002308,0.028931,0.0,0.002973,0.002408,0.050501,0.000227,0.004559,0.003414,0.047745,0.000454,0.003751,0.003075,0.052446,0.000485,0.005746,0.004383,0.05907,0.000941,0.00233,0.001763,0.028551,0.0,0.001102,0.00086,0.008911,0.0,0.001613,0.001212,0.021736,0.0,0.002028,0.001545,0.027483,0.0,0.001894,,0.001894,0.001894,0.003257,,0.003257,0.003257,0.001378,,0.001378,0.001378,0.000966,,0.000966,0.000966,0.001520,,0.001520,0.001520,0.002188,,0.002188,0.002188,0.001609,,0.001609,0.001609,0.002927,,0.002927,0.002927,0.000871,,0.000871,0.0008
71,0.000360,,0.000360,0.000360,0.000453,,0.000453,0.000453,0.000550,,0.000550,0.000550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3825,0,32751,0.002611,0-32751,296.387479,0.997938,0.000747,296.365481,0.997864,0.000769,-0.002136,0.002579,-7.215157e-06,0.000150,-0.002055,0.003821,-0.000007,0.000222,0.062912,0.000212,0.000159,0.163999,0.000552,0.000202,0.024713,0.000083,0.000072,-0.054055,-0.000182,0.000125,111148.0,374.235690,165.473374,58552.0,197.144781,129.451712,64.787079,0.996724,0.000301,64.782468,0.996653,0.000394,-0.001050,0.001148,-0.000016,0.000143,-0.001227,0.002048,-0.000019,0.000255,0.011163,0.000172,0.000141,0.027610,0.000425,0.000098,0.004838,0.000074,0.000039,-0.013995,-0.000215,0.000103,20943.0,322.200000,153.142825,14765.0,227.153846,175.857473,149.627778,0.997519,0.000830,149.612487,0.997417,0.000791,-0.001956,0.001673,-1.303872e-05,0.000136,-0.001652,0.002573,-0.000011,0.000210,0.028973,0.000193,0.000164,0.076295,0.000509,0.000150,0.009261,0.000062,0.000028,-0.025285,-0.000169,0.000107,52584.0,350.560000,142.775530,35092.0,233.946667,139.068560,226.496946,0.997784,0.000779,226.470294,0.997666,0.000755,-0.002261,0.002383,-0.000010,0.000158,-0.002749,0.003507,-1.210854e-05,0.000233,0.050565,0.000223,0.000166,0.130025,0.000573,0.000218,0.014881,0.000066,0.000030,-0.038915,-0.000171,0.000118,79431.0,349.916300,168.444647,44575.0,196.365639,137.189564,0.001519,52.0,3450.0,3.057692,0.000786,19.0,1159.0,2.947368,0.001162,35.0,2365.0,3.257143,0.001409,42.0,2957.0,3.238095,0.004459,0.003698,0.052704,0.000747,0.006856,0.005494,0.086671,0.001347,0.00202,0.00169,0.036491,0.000046,0.003102,0.002308,0.028931,0.0,0.002973,0.002408,0.050501,0.000227,0.004559,0.003414,0.047745,0.000454,0.003751,0.003075,0.052446,0.000485,0.005746,0.004383,0.05907,0.000941,0.00233,0.001763,0.028551,0.0,0.001102,0.00086,0.008911,0.0,0.001613,0.001212,0.021736,0.0,0.002028,0.001545,0.027483,0.0,0.002579,,0.002579,0.002579,0.003821,,0.003821,0.003821,0.001148,,0.001148,0.001148,0.002048,,0.002048,0.002048,0.001673,,0.001673,0.001673,0.002573,,0.002573,0.002573,0.002383,,0.002383,0.002383,0.003507,,0.003507,0.0035
07,0.001519,,0.001519,0.001519,0.000786,,0.000786,0.000786,0.001162,,0.001162,0.001162,0.001409,,0.001409,0.001409
3826,0,32753,0.001190,0-32753,206.063903,1.000310,0.000551,206.100395,1.000487,0.000599,0.000403,0.002206,1.966770e-06,0.000154,0.000959,0.002847,0.000005,0.000199,0.055028,0.000267,0.000193,0.111732,0.000542,0.000147,0.018874,0.000092,0.000088,-0.035445,-0.000172,0.000083,127953.0,621.131068,266.019708,48159.0,233.781553,153.128340,46.028267,1.000614,0.000278,46.036303,1.000789,0.000381,-0.000917,0.001146,-0.000020,0.000170,-0.000842,0.001863,-0.000018,0.000277,0.011711,0.000255,0.000166,0.026312,0.000572,0.000066,0.003454,0.000075,0.000061,-0.007959,-0.000173,0.000114,24448.0,531.478261,201.290916,10774.0,234.217391,153.964919,100.068151,1.000682,0.000439,100.092961,1.000930,0.000427,0.000297,0.001487,2.965882e-06,0.000149,0.000229,0.002255,0.000002,0.000227,0.029996,0.000300,0.000196,0.058826,0.000588,0.000118,0.007409,0.000074,0.000068,-0.017671,-0.000177,0.000100,66864.0,668.640000,264.869952,25792.0,257.920000,176.388041,147.077579,1.000528,0.000431,147.109034,1.000742,0.000458,0.000237,0.001519,0.000002,0.000126,-0.000017,0.002396,-1.181481e-07,0.000198,0.042064,0.000286,0.000186,0.084847,0.000577,0.000115,0.011615,0.000079,0.000073,-0.026583,-0.000181,0.000085,104044.0,707.782313,248.050412,37316.0,253.850340,159.936518,0.001411,28.0,4547.0,3.892857,0.000750,5.0,1158.0,4.600000,0.001066,12.0,2161.0,4.250000,0.001284,19.0,2494.0,3.421053,0.004459,0.003698,0.052704,0.000747,0.006856,0.005494,0.086671,0.001347,0.00202,0.00169,0.036491,0.000046,0.003102,0.002308,0.028931,0.0,0.002973,0.002408,0.050501,0.000227,0.004559,0.003414,0.047745,0.000454,0.003751,0.003075,0.052446,0.000485,0.005746,0.004383,0.05907,0.000941,0.00233,0.001763,0.028551,0.0,0.001102,0.00086,0.008911,0.0,0.001613,0.001212,0.021736,0.0,0.002028,0.001545,0.027483,0.0,0.002206,,0.002206,0.002206,0.002847,,0.002847,0.002847,0.001146,,0.001146,0.001146,0.001863,,0.001863,0.001863,0.001487,,0.001487,0.001487,0.002255,,0.002255,0.002255,0.001519,,0.001519,0.001519,0.002396,,0.002396,0.002396,0.00141
1,,0.001411,0.001411,0.000750,,0.000750,0.000750,0.001066,,0.001066,0.001066,0.001284,,0.001284,0.001284
3827,0,32758,0.004264,0-32758,187.915689,0.999552,0.000743,187.897700,0.999456,0.000736,0.001663,0.002913,8.895445e-06,0.000213,0.002077,0.003266,0.000011,0.000239,0.044629,0.000237,0.000188,0.098700,0.000525,0.000244,0.038039,0.000202,0.000147,-0.015621,-0.000083,0.000060,64622.0,343.734043,140.150429,21776.0,115.829787,105.146411,62.021208,1.000342,0.000188,62.011824,1.000191,0.000227,0.000413,0.001303,0.000007,0.000167,0.000238,0.001972,0.000004,0.000252,0.014819,0.000239,0.000139,0.029162,0.000470,0.000125,0.009371,0.000151,0.000090,-0.005240,-0.000085,0.000055,20533.0,331.177419,122.523805,7443.0,120.048387,92.412374,104.011571,1.000111,0.000454,104.003923,1.000038,0.000396,0.001353,0.001929,1.301021e-05,0.000190,0.000891,0.002646,0.000009,0.000261,0.022432,0.000216,0.000159,0.046396,0.000446,0.000177,0.019901,0.000191,0.000133,-0.007760,-0.000075,0.000046,33983.0,326.759615,124.014444,10965.0,105.432692,82.634975,132.980752,0.999855,0.000646,132.972460,0.999793,0.000593,0.000799,0.002404,0.000006,0.000209,0.001548,0.003006,1.163567e-05,0.000261,0.032494,0.000244,0.000189,0.068435,0.000515,0.000234,0.028567,0.000215,0.000150,-0.011187,-0.000084,0.000066,41648.0,313.142857,129.148560,13702.0,103.022556,84.414783,0.001521,36.0,4250.0,3.500000,0.000780,8.0,416.0,2.000000,0.001242,22.0,2294.0,3.727273,0.001375,27.0,2736.0,3.444444,0.004459,0.003698,0.052704,0.000747,0.006856,0.005494,0.086671,0.001347,0.00202,0.00169,0.036491,0.000046,0.003102,0.002308,0.028931,0.0,0.002973,0.002408,0.050501,0.000227,0.004559,0.003414,0.047745,0.000454,0.003751,0.003075,0.052446,0.000485,0.005746,0.004383,0.05907,0.000941,0.00233,0.001763,0.028551,0.0,0.001102,0.00086,0.008911,0.0,0.001613,0.001212,0.021736,0.0,0.002028,0.001545,0.027483,0.0,0.002913,,0.002913,0.002913,0.003266,,0.003266,0.003266,0.001303,,0.001303,0.001303,0.001972,,0.001972,0.001972,0.001929,,0.001929,0.001929,0.002646,,0.002646,0.002646,0.002404,,0.002404,0.002404,0.003006,,0.003006,0.003006,0.001521,,0.001521,0
.001521,0.000780,,0.000780,0.000780,0.001242,,0.001242,0.001242,0.001375,,0.001375,0.001375
3828,0,32763,0.004352,0-32763,307.723687,1.002357,0.000356,307.732623,1.002386,0.000424,0.000520,0.003046,1.698933e-06,0.000174,0.000614,0.005105,0.000002,0.000292,0.075224,0.000245,0.000187,0.147444,0.000480,0.000145,0.034710,0.000113,0.000104,-0.050912,-0.000166,0.000149,118327.0,385.429967,140.552333,40547.0,132.074919,95.735325,76.176449,1.002322,0.000278,76.169808,1.002234,0.000390,-0.000225,0.001413,-0.000003,0.000163,0.000300,0.002180,0.000004,0.000252,0.015376,0.000202,0.000150,0.039426,0.000519,0.000124,0.007306,0.000096,0.000082,-0.011154,-0.000147,0.000169,34328.0,451.684211,168.076904,9226.0,121.394737,95.503589,163.371123,1.002277,0.000239,163.379414,1.002328,0.000380,-0.000447,0.002137,-2.744205e-06,0.000168,-0.000427,0.003934,-0.000003,0.000309,0.043832,0.000269,0.000182,0.084086,0.000516,0.000124,0.015576,0.000096,0.000069,-0.028557,-0.000175,0.000156,64318.0,394.588957,154.066273,20118.0,123.423313,88.622998,219.499522,1.002281,0.000270,219.509161,1.002325,0.000370,-0.000576,0.002645,-0.000003,0.000179,-0.000280,0.004526,-1.276322e-06,0.000307,0.058440,0.000267,0.000190,0.113443,0.000518,0.000139,0.020769,0.000095,0.000070,-0.032307,-0.000148,0.000145,85251.0,389.273973,146.309737,26493.0,120.972603,90.168594,0.001794,53.0,3217.0,2.150943,0.001012,12.0,1415.0,2.666667,0.001404,25.0,1627.0,1.920000,0.001650,36.0,2296.0,2.055556,0.004459,0.003698,0.052704,0.000747,0.006856,0.005494,0.086671,0.001347,0.00202,0.00169,0.036491,0.000046,0.003102,0.002308,0.028931,0.0,0.002973,0.002408,0.050501,0.000227,0.004559,0.003414,0.047745,0.000454,0.003751,0.003075,0.052446,0.000485,0.005746,0.004383,0.05907,0.000941,0.00233,0.001763,0.028551,0.0,0.001102,0.00086,0.008911,0.0,0.001613,0.001212,0.021736,0.0,0.002028,0.001545,0.027483,0.0,0.003046,,0.003046,0.003046,0.005105,,0.005105,0.005105,0.001413,,0.001413,0.001413,0.002180,,0.002180,0.002180,0.002137,,0.002137,0.002137,0.003934,,0.003934,0.003934,0.002645,,0.002645,0.002645,0.004526,,0.004526,0.004526,0.001794
,,0.001794,0.001794,0.001012,,0.001012,0.001012,0.001404,,0.001404,0.001404,0.001650,,0.001650,0.001650


## 結局やっていること

### book_data
1. WAP, log_return, WAP/bid/ask/size の残差 を計算
1. データをseconds_in_bucketで分割する（0~, 150~, 300~, 450~）
1. それらの情報の sum, mean, std, realized_volatility を、time_id (=time windowのこと) 毎に groupbyして計算する

### trade

1. log_return を計算する
1. データをseconds_in_bucketで分割する（book_data と違ってスパースな時間区分であることに留意）
1. それらの情報を realized_volatility, count_unique, np.sum, mean で計算する

### train

1. 以上で計算した book・trade を stock_id 毎に用意して、それを縦連結する
1. 時間で分割、stock_id でループを回しているのでややこしく見えるが大したことはやってない

# train (LightGBM)

In [66]:
# Root mean squared percentage error (RMSPE) between targets and predictions
def rmspe(y_true, y_pred):
    """Return sqrt(mean(((y_true - y_pred) / y_true) ** 2)).

    The error of each prediction is taken relative to its true value, so
    a true value of zero produces a division by zero in that position.
    """
    relative_error = (y_true - y_pred) / y_true
    squared_error = np.square(relative_error)
    return np.sqrt(np.mean(squared_error))

# Custom evaluation callback so LightGBM reports RMSPE (and can early stop on it)
def feval_rmspe(y_pred, lgb_train):
    """Evaluate predictions against the labels stored in a dataset object.

    Args:
        y_pred: predictions for the rows of ``lgb_train``.
        lgb_train: object exposing ``get_label()``; despite the name it is
            whatever dataset is being evaluated.

    Returns:
        The tuple ``('RMSPE', score, False)``.
    """
    labels = lgb_train.get_label()
    score = rmspe(labels, y_pred)
    # NOTE(review): the trailing False is presumably LightGBM's
    # "is_higher_better" flag (lower RMSPE is better) - confirm against the docs.
    return 'RMSPE', score, False

In [67]:
def train_and_evaluate(train, test, n_splits = 5):
    """Train one LightGBM model per K-fold split and average the test predictions.

    Args:
        train: DataFrame containing 'row_id', 'target', 'time_id', 'stock_id'
            and the feature columns.
        test: DataFrame containing 'row_id', 'time_id', 'stock_id' and the
            same feature columns as ``train``.
        n_splits: number of cross-validation folds. It is also the divisor
            used when averaging the per-fold test predictions, so the two can
            no longer drift apart.

    Returns:
        numpy array with one prediction per row of ``test``: the mean of the
        predictions of the ``n_splits`` fold models.

    Side effects:
        Prints the fold number while training and the out-of-fold RMSPE
        computed over all rows of ``train`` at the end.
    """
    # Hyperparameters (just basic)
    params = {
        'objective': 'rmse',
        'boosting_type': 'gbdt',
        'num_leaves': 100,
        'n_jobs': -1,
        'learning_rate': 0.1,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'verbose': -1
    }

    # Split features and target
    x = train.drop(['row_id', 'target', 'time_id'], axis = 1)
    y = train['target']
    x_test = test.drop(['row_id', 'time_id'], axis = 1)
    # Transform stock id to a numeric value
    x['stock_id'] = x['stock_id'].astype(int)
    x_test['stock_id'] = x_test['stock_id'].astype(int)

    # Create out of folds array
    oof_predictions = np.zeros(x.shape[0])
    # Create test array to store predictions
    test_predictions = np.zeros(x_test.shape[0])
    # Create a KFold object
    kfold = KFold(n_splits = n_splits, random_state = 66, shuffle = True)
    # Iterate through each fold
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(x)):
        print(f'Training fold {fold + 1}')
        x_train, x_val = x.iloc[trn_ind], x.iloc[val_ind]
        y_train, y_val = y.iloc[trn_ind], y.iloc[val_ind]
        # Root mean squared percentage error weights: weighting the squared
        # error by 1 / y^2 gives ((y - y_hat) / y)^2, so the 'rmse' objective
        # effectively minimises RMSPE.
        train_weights = 1 / np.square(y_train)
        val_weights = 1 / np.square(y_val)
        train_dataset = lgb.Dataset(x_train, y_train, weight = train_weights, categorical_feature = ['stock_id'])
        val_dataset = lgb.Dataset(x_val, y_val, weight = val_weights, categorical_feature = ['stock_id'])
        # NOTE(review): `early_stopping_rounds` and `verbose_eval` are
        # deprecated in newer LightGBM releases in favour of
        # `callbacks=[lgb.early_stopping(...), lgb.log_evaluation(...)]`;
        # switch when upgrading LightGBM.
        model = lgb.train(params = params,
                          train_set = train_dataset,
                          valid_sets = [train_dataset, val_dataset],
                          num_boost_round = 10000,
                          early_stopping_rounds = 50,
                          verbose_eval = 50,
                          feval = feval_rmspe)
        # Add predictions to the out of folds array
        oof_predictions[val_ind] = model.predict(x_val)
        # Predict the test set; each fold contributes an equal share of the mean
        test_predictions += model.predict(x_test) / n_splits

    rmspe_score = rmspe(y, oof_predictions)
    print(f'Our out of folds RMSPE is {rmspe_score}')
    # Return test predictions
    return test_predictions

# 実際にLightGBMを回してみる

In [89]:
# Load the raw train / test frames (read_train_test is defined in an earlier cell)
train, test = read_train_test()

# Collect every stock id present in the training data
train_stock_ids = train['stock_id'].unique()
# Local debugging override: only build features for stock 0 so the notebook
# runs quickly on a laptop. This replaces the full list computed just above.
# NOTE(review): presumably preprocessor only returns rows for the requested
# stock ids; if so, the left merge below keeps every row of `train` and leaves
# the features NaN for all other stocks - confirm, and remove this override
# for a full run.
train_stock_ids = [0, ]

# Build the per-stock features and attach them to the train rows by row_id
train_ = preprocessor(train_stock_ids, is_train = True)
train = train.merge(train_, on = ['row_id'], how = 'left')

# Collect every stock id present in the test data (no override here)
test_stock_ids = test['stock_id'].unique()
# Build the per-stock features and attach them to the test rows by row_id
test_ = preprocessor(test_stock_ids, is_train = False)
test = test.merge(test_, on = ['row_id'], how = 'left')

# Add the grouped statistics per time_id and per stock_id (see get_time_stock)
train = get_time_stock(train)
test = get_time_stock(test)

Our training set has 428932 rows


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   25.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.9s finished


In [90]:
# Train and evaluate: fits the cross-validated LightGBM models and returns
# the averaged predictions for the test rows
test_predictions = train_and_evaluate(train, test)
# Store the predictions on the test frame
test['target'] = test_predictions
# Submission export is intentionally disabled here; uncomment to write the file.
#test[['row_id', 'target']].to_csv('submission.csv',index = False)

Training fold 1
Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.00060594	training's RMSPE: 0.280645	valid_1's rmse: 0.000624408	valid_1's RMSPE: 0.288123
[100]	training's rmse: 0.000553776	training's RMSPE: 0.256485	valid_1's rmse: 0.000584433	valid_1's RMSPE: 0.269677
[150]	training's rmse: 0.000528296	training's RMSPE: 0.244684	valid_1's rmse: 0.000568897	valid_1's RMSPE: 0.262509
[200]	training's rmse: 0.000512281	training's RMSPE: 0.237267	valid_1's rmse: 0.000561137	valid_1's RMSPE: 0.258928
[250]	training's rmse: 0.000500215	training's RMSPE: 0.231678	valid_1's rmse: 0.000555248	valid_1's RMSPE: 0.256211
[300]	training's rmse: 0.000490539	training's RMSPE: 0.227196	valid_1's rmse: 0.0005509	valid_1's RMSPE: 0.254204
[350]	training's rmse: 0.000482713	training's RMSPE: 0.223572	valid_1's rmse: 0.000547611	valid_1's RMSPE: 0.252686
[400]	training's rmse: 0.000476017	training's RMSPE: 0.22047	valid_1's rmse: 0.000545258	valid_1's RMSPE: 0.251601
