# Competition

In [36]:
import os
import glob
from joblib import Parallel, delayed
import pandas as pd
import numpy as np
import scipy as sc
from sklearn.model_selection import StratifiedKFold, train_test_split
import random
import optuna
import xgboost as xgb
import warnings
from collections import Counter, defaultdict
from tqdm import tqdm
import numpy.matlib
warnings.filterwarnings('ignore')
pd.set_option('max_columns', 300)

## Preprocessing

### Preprocessing Utilities

In [2]:
def calc_waps(df):
    var1 = df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1']
    var2 = df['bid_price2'] * df['ask_size2'] + df['ask_price2'] * df['bid_size2']
    var3 = df['bid_price1'] * df['bid_size1'] + df['ask_price1'] * df['ask_size1']
    var4 = df['bid_price2'] * df['bid_size2'] + df['ask_price2'] * df['ask_size2']
    volumes = df['bid_size1'] + df['ask_size1'] + df['bid_size2'] + df['ask_size2']
    df['wap1'] = var1 / (df['bid_size1'] + df['ask_size1'])
    df['wap2'] = var2 / (df['bid_size2'] + df['ask_size2'])
    df['wap3'] = var3 / (df['bid_size1'] + df['ask_size1'])
    df['wap4'] = var4 / (df['bid_size2'] + df['ask_size2'])
    df['wap12'] = (var1 + var2) / volumes
    df['wap34'] = (var3 + var4) / volumes
    # Calculate wap balance
    df['wap_balance1'] = abs(df['wap1'] - df['wap2'])
    df['wap_balance2'] = abs(df['wap3'] - df['wap4'])
    return df

def calc_log_returns(df):
    df['log_return1'] = df.groupby(['time_id'])['wap1'].apply(log_return)
    df['log_return2'] = df.groupby(['time_id'])['wap2'].apply(log_return)
    df['log_return3'] = df.groupby(['time_id'])['wap3'].apply(log_return)
    df['log_return4'] = df.groupby(['time_id'])['wap4'].apply(log_return)
    df['log_return12'] = df.groupby(['time_id'])['wap12'].apply(log_return)
    df['log_return34'] = df.groupby(['time_id'])['wap34'].apply(log_return)
    return df

def calc_depth(df):
    df['depth'] = df['bid_price1'] * df['bid_size1'] + df['ask_price1'] * df['ask_size1'] + df['bid_price2'] * df['bid_size2'] + df['ask_price2'] * df['ask_size2']
    return df

def calc_slope(df):
    v0 = (df['bid_size1']+df['ask_size1'])/2
    p0 = (df['bid_price1']+df['ask_price1'])/2
    slope_bid = ((df['bid_size1']/v0)-1)/abs((df['bid_price1']/p0)-1)+(
                (df['bid_size2']/df['bid_size1'])-1)/abs((df['bid_price2']/df['bid_price1'])-1)
    slope_ask = ((df['ask_size1']/v0)-1)/abs((df['ask_price1']/p0)-1)+(
                (df['ask_size2']/df['ask_size1'])-1)/abs((df['ask_price2']/df['ask_price1'])-1)
    df['slope_mid'] = (slope_bid + slope_ask) / 2
    df['slope_spread'] = abs(slope_bid - slope_ask)
    return df

def calc_spread(df):    
    # Calculate spread
    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']
    # Calculate mid
    df['bid_mid1'] = (df['bid_price1'] + df['ask_price1'])/2 - df['bid_price1']
    df['bid_mid2'] = (df['bid_price2'] + df['ask_price2'])/2 - df['bid_price2']
    df['ask_mid1'] = df['ask_price1'] - (df['bid_price1'] + df['ask_price1'])/2
    df['ask_mid2'] = df['ask_price2'] - (df['bid_price2'] + df['ask_price2'])/2
    # Calculate dispersion
    df['price_spread1'] = (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1']) / 2)
    df['price_spread2'] = (df['ask_price2'] - df['bid_price2']) / ((df['ask_price2'] + df['bid_price2']) / 2)
    
    df['bid_ask_spread'] = abs(df['bid_spread'] - df['ask_spread'])
    df['total_volume'] = (df['ask_size1'] + df['ask_size2']) + (df['bid_size1'] + df['bid_size2'])
    df['volume_imbalance'] = abs((df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2']))
    return df

def calc_price_impact(df):
    ask = (df['ask_price1'] * df['ask_size1'] + df['ask_price2'] * df['ask_size2'])/(df['ask_size1']+df['ask_size2'])
    bid = (df['bid_price1'] * df['bid_size1'] + df['bid_price2'] * df['bid_size2'])/(df['bid_size1']+df['bid_size2'])
    df['bid_impact1'] = (df['bid_price1'] - bid)/df['bid_price1']
    df['bid_impact2'] = (df['bid_price2'] - bid)/df['bid_price2']
    df['ask_impact1'] = (df['ask_price1'] - ask)/df['ask_price1']
    df['ask_impact2'] = (df['ask_price2'] - ask)/df['ask_price2']
    return df

In [3]:
def log_return(wap):
    return np.log(wap).diff()

In [4]:
def realized_volatility(series):
    return np.sqrt(np.sum(series**2))

def realized_absvar(series):
    return np.sqrt(np.pi/(2*series.count()))*np.sum(np.abs(series))

def realized_skew(series):
    return np.sqrt(series.count())*np.sum(series**3)/(realized_volatility(series)**3)

def realized_kurtosis(series):
    return series.count()*np.sum(series**4)/(realized_volatility(series)**4)

def realized_quarticity(series):
    return (series.count()/3)*np.sum(series**4)

def realized_skew(series):
    return np.sqrt(series.count())*np.sum(series**3)/(realized_volatility(series)**3)

def realized_kurtosis(series):
    return series.count()*np.sum(series**4)/(realized_volatility(series)**4)

def realized_quarticity(series):
    return (series.count()/3)*np.sum(series**4)

In [5]:
def count_unique(series):
    return len(np.unique(series))

In [6]:
# Function to read our base train and test set
def read_train_test():
    train = pd.read_csv(config.data_dir + 'train.csv')
    test = pd.read_csv(config.data_dir + 'test.csv')
    # Create a key to merge with book and trade data
    train['row_id'] = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
    test['row_id'] = test['stock_id'].astype(str) + '-' + test['time_id'].astype(str)
    print(f'Our training set has {train.shape[0]} rows')
    return train, test

In [7]:
class Config:
    seed = 123
    data_dir = '/data/'
    time_gap = 100
    
    feature_dict_book = {
        'wap1': [np.sum, np.mean, np.std],
        'wap2': [np.sum, np.mean, np.std],
        'wap3': [np.sum, np.mean, np.std],
        'wap4': [np.sum, np.mean, np.std],
        'wap12': [np.sum, np.mean, np.std],
        'wap34': [np.sum, np.mean, np.std],
        'wap_balance1': [np.sum, np.mean, np.std],
        'wap_balance2': [np.sum, np.mean, np.std],
        'log_return1': [np.sum, realized_volatility, realized_absvar, realized_skew, np.mean, np.std],
        'log_return2': [np.sum, realized_volatility, realized_absvar, realized_skew, np.mean, np.std],
        'log_return3': [np.sum, realized_volatility, realized_absvar, realized_skew, np.mean, np.std],
        'log_return4': [np.sum, realized_volatility, realized_absvar, realized_skew, np.mean, np.std],
        'log_return12': [np.sum, realized_volatility, realized_absvar, realized_skew, np.mean, np.std],
        'log_return34': [np.sum, realized_volatility, realized_absvar, realized_skew, np.mean, np.std],
        'depth': [np.sum, np.mean, np.std],
        'slope_mid': [np.sum, np.mean, np.std],
        'slope_spread': [np.sum, np.mean, np.std],
        'bid_spread':[np.sum, np.mean, np.std],
        'ask_spread':[np.sum, np.mean, np.std],
        'bid_mid1':[np.sum, np.mean, np.std],
        'bid_mid2':[np.sum, np.mean, np.std],
        'ask_mid1':[np.sum, np.mean, np.std],
        'ask_mid2':[np.sum, np.mean, np.std],
        'price_spread1':[np.sum, np.mean, np.std],
        'price_spread2':[np.sum, np.mean, np.std],
        'total_volume':[np.sum, np.mean, np.std],
        'volume_imbalance':[np.sum, np.mean, np.std],
        'bid_ask_spread':[np.sum, np.mean, np.std],
        'bid_impact1':[np.sum, np.mean, np.std],
        'bid_impact2':[np.sum, np.mean, np.std],
        'ask_impact1':[np.sum, np.mean, np.std],
        'ask_impact2':[np.sum, np.mean, np.std],
    }
    
    feature_dict_trade = {
        'log_return':[realized_volatility, realized_absvar, realized_skew],
        'seconds_in_bucket':[count_unique],
        'size':[np.sum, realized_volatility, realized_absvar, realized_skew, np.mean, np.std, np.max, np.min],
        'order_count':[np.mean,np.sum,np.max],
    }
    
    model_params = {
        "xgb_bl": {
            "objective": "reg:squarederror",
            "booster": "gblinear",
            "nthread": -1,
            "eta": 0.3,
            "max_depth": 8,
            "min_child_weight": 1,
            "sampling_method": "gradient_based",
            "tree_method": "gpu_hist"  # turn it on for GPU
        },
        "xgb_tuning": {
            "objective": "reg:squarederror",
            "booster": "gbtree",
            "nthread": -1,
            "tree_method": "gpu_hist",
            'max_depth': 7,
            'eta': 0.03,
            'lambda': 0.01,
            "subsample": 0.2,
            "colsample_bytree": 0.33,
            "sampling_method": "uniform"
        },
        "xgb_optuna": {
            "objective": "reg:squarederror",
            "booster": "gblinear",
            "nthread": -1,
            "tree_method": "gpu_hist",
            'max_depth': 7,
            'eta': 0.03,
            'lambda': 1.0979256871605507e-06,
            'gamma': 2.3321112461277414e-08,
            'alpha': 0.006405029944559645,
            "sampling_method": "gradient_based"
        }
    }

In [8]:
config = Config

### Preprocessing of Book

In [9]:
from functools import reduce

def book_preprocessor(file_path):
    # Function to preprocess book data (for each stock id)
    
    df = pd.read_parquet(file_path)
    
    df = (
        df.pipe(calc_waps)
        .pipe(calc_log_returns)
        .pipe(calc_depth)
        .pipe(calc_slope)
        .pipe(calc_spread)
        .pipe(calc_price_impact)
    )
    
    def get_stats_window(seconds_in_bucket, add_suffix = False):
        # Function to get group stats for different windows (seconds in bucket)
        
        # Group by the window
        df_feature = df[df['seconds_in_bucket'] >= seconds_in_bucket].groupby(['time_id']).agg(config.feature_dict_book).reset_index()
        
        # Rename columns joining suffix
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        
        # Add a suffix to differentiate windows
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
            df_feature.rename({f'time_id__{seconds_in_bucket}': 'time_id_'}, axis=1, inplace=True)
        return df_feature
    
    # Get the stats for different windows
    df_list = [get_stats_window(seconds_in_bucket = 0, add_suffix = False)]
    
    time_slices = [t * config.time_gap for t in range(1, 600 // config.time_gap)]
    for t in time_slices:
        df_list += [get_stats_window(seconds_in_bucket = t, add_suffix = True)]
    
    df_feature = reduce(lambda left, right: pd.merge(left, right, on='time_id_'), df_list)
    
    # Create row_id so we can merge
    stock_id = file_path.split('=')[1]
    df_feature.loc[:, 'row_id'] = df_feature['time_id_'].apply(lambda x: f'{stock_id}-{x}')
    df_feature.drop(['time_id_'], axis = 1, inplace = True)
    
    return df_feature

### Preprocessing of Trade

In [10]:
def trade_preprocessor(file_path):
    # Function to preprocess trade data (for each stock id)
    
    df = pd.read_parquet(file_path)
    df['log_return'] = df.groupby('time_id')['price'].apply(log_return)
    
    def get_stats_window(seconds_in_bucket, add_suffix = False):
        # Function to get group stats for different windows (seconds in bucket)
        
        # Group by the window
        df_feature = df[df['seconds_in_bucket'] >= seconds_in_bucket].groupby(['time_id']).agg(config.feature_dict_trade).reset_index()
        
        # Rename columns joining suffix
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        
        # Add a suffix to differentiate windows
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
            df_feature.rename({f'time_id__{seconds_in_bucket}': 'time_id_'}, axis=1, inplace=True)
        return df_feature
    
    def tendency(price, vol):    
        df_diff = np.diff(price)
        val = (df_diff/price[1:])*100
        power = np.sum(val*vol[1:])
        return(power)
    
    def process_trade_features(df):
        lis = []
        for n_time_id in df['time_id'].unique():
            df_id = df[df['time_id'] == n_time_id]        
            tendencyV = tendency(df_id['price'].values, df_id['size'].values)      
            f_max = np.sum(df_id['price'].values > np.mean(df_id['price'].values))
            f_min = np.sum(df_id['price'].values < np.mean(df_id['price'].values))
            df_max =  np.sum(np.diff(df_id['price'].values) > 0)
            df_min =  np.sum(np.diff(df_id['price'].values) < 0)
            abs_diff = np.median(np.abs( df_id['price'].values - np.mean(df_id['price'].values)))        
            energy = np.mean(df_id['price'].values**2)
            iqr_p = np.percentile(df_id['price'].values,75) - np.percentile(df_id['price'].values,25)
            abs_diff_v = np.median(np.abs( df_id['size'].values - np.mean(df_id['size'].values)))        
            energy_v = np.sum(df_id['size'].values**2)
            iqr_p_v = np.percentile(df_id['size'].values,75) - np.percentile(df_id['size'].values,25)

            lis.append({'time_id':n_time_id,'tendency':tendencyV,'f_max':f_max,'f_min':f_min,'df_max':df_max,'df_min':df_min,
                       'abs_diff':abs_diff,'energy':energy,'iqr_p':iqr_p,'abs_diff_v':abs_diff_v,'energy_v':energy_v,'iqr_p_v':iqr_p_v})

        df_lr = pd.DataFrame(lis)
        return df_lr
    
    # Get the stats for different windows
    df_list = [get_stats_window(seconds_in_bucket = 0, add_suffix = False)]
    
    time_slices = [t * 100 for t in range(1, 600 // config.time_gap)]
    for t in time_slices:
        df_list += [get_stats_window(seconds_in_bucket = t, add_suffix = True)]
        
    df_feature = reduce(lambda left, right: pd.merge(left, right, on='time_id_'), df_list)
    df_lr = process_trade_features(df)
    df_feature = df_feature.merge(df_lr, how = 'left', left_on = 'time_id_', right_on = 'time_id')
    
    df_feature = df_feature.add_prefix('trade_')
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = df_feature['trade_time_id_'].apply(lambda x:f'{stock_id}-{x}')
    df_feature.drop(['trade_time_id_'], axis = 1, inplace = True)
    
    return df_feature

### Preprocess of Time Stock

In [11]:
# Function to get group stats for the stock_id and time_id
def get_time_stock(df):
    # Get realized volatility columns
    vol_cols = [col for col in df.columns if 'realized_volatility' in col]

    # Group by the stock id
    df_stock_id = df.groupby(['stock_id'])[vol_cols].agg(['mean', 'std', 'max', 'min', ]).reset_index()
    # Rename columns joining suffix
    df_stock_id.columns = ['_'.join(col) for col in df_stock_id.columns]
    df_stock_id = df_stock_id.add_suffix('_' + 'stock')

    # Group by the stock id
    df_time_id = df.groupby(['time_id'])[vol_cols].agg(['mean', 'std', 'max', 'min', ]).reset_index()
    # Rename columns joining suffix
    df_time_id.columns = ['_'.join(col) for col in df_time_id.columns]
    df_time_id = df_time_id.add_suffix('_' + 'time')
    
    # Merge with original dataframe
    df = df.merge(df_stock_id, how = 'left', left_on = ['stock_id'], right_on = ['stock_id__stock'])
    df = df.merge(df_time_id, how = 'left', left_on = ['time_id'], right_on = ['time_id__time'])
    df.drop(['stock_id__stock', 'time_id__time'], axis = 1, inplace = True)
    return df

### Overall Preprocessing

In [12]:
# Funtion to make preprocessing function in parallel (for each stock id)
def preprocessor(list_stock_ids, is_train = True):
    
    # Parrallel for loop
    def for_joblib(stock_id):
        # Train
        if is_train:
            file_path_book = config.data_dir + "book_train.parquet/stock_id=" + str(stock_id)
            file_path_trade = config.data_dir + "trade_train.parquet/stock_id=" + str(stock_id)
        # Test
        else:
            file_path_book = config.data_dir + "book_test.parquet/stock_id=" + str(stock_id)
            file_path_trade = config.data_dir + "trade_test.parquet/stock_id=" + str(stock_id)
    
        # Preprocess book and trade data and merge them
        df_tmp = pd.merge(book_preprocessor(file_path_book), trade_preprocessor(file_path_trade), on = 'row_id', how = 'left')
        
        # Return the merge dataframe
        return df_tmp
    
    # Use parallel api to call paralle for loop
    df = Parallel(n_jobs = -1, verbose = 1)(delayed(for_joblib)(stock_id) for stock_id in list_stock_ids)
    # Concatenate all the dataframes that return from Parallel
    df = pd.concat(df, ignore_index = True)
    return df

## Traning

### Utilities

In [13]:
# Function to calculate the root mean squared percentage error
def rmspe(y_true, y_pred):
    return np.sqrt(np.mean(np.square((y_true - y_pred) / y_true)))

# Function to early stop with root mean squared percentage error
def feval_rmspe(y_pred, xgb_train):
    y_true = xgb_train.get_label()
    return 'RMSPE', rmspe(y_true, y_pred)

### Modeling

In [37]:
def stratified_group_k_fold(X, y, groups, k, seed=None):
    """ https://www.kaggle.com/jakubwasikowski/stratified-group-k-fold-cross-validation """
    labels_num = np.max(y) + 1
    y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
    y_distr = Counter()
    for label, g in zip(y, groups):
        y_counts_per_group[g][label] += 1
        y_distr[label] += 1

    y_counts_per_fold = defaultdict(lambda: np.zeros(labels_num))
    groups_per_fold = defaultdict(set)

    def eval_y_counts_per_fold(y_counts, fold):
        y_counts_per_fold[fold] += y_counts
        std_per_label = []
        for label in range(labels_num):
            label_std = np.std([y_counts_per_fold[i][label] / y_distr[label] for i in range(k)])
            std_per_label.append(label_std)
        y_counts_per_fold[fold] -= y_counts
        return np.mean(std_per_label)
    
    groups_and_y_counts = list(y_counts_per_group.items())
    random.Random(seed).shuffle(groups_and_y_counts)

    for g, y_counts in tqdm(sorted(groups_and_y_counts, key=lambda x: -np.std(x[1])), total=len(groups_and_y_counts)):
        best_fold = None
        min_eval = None
        for i in range(k):
            fold_eval = eval_y_counts_per_fold(y_counts, i)
            if min_eval is None or fold_eval < min_eval:
                min_eval = fold_eval
                best_fold = i
        y_counts_per_fold[best_fold] += y_counts
        groups_per_fold[best_fold].add(g)

    all_groups = set(groups)
    for i in range(k):
        train_groups = all_groups - groups_per_fold[i]
        test_groups = groups_per_fold[i]

        train_indices = [i for i, g in enumerate(groups) if g in train_groups]
        test_indices = [i for i, g in enumerate(groups) if g in test_groups]

        yield train_indices, test_indices

In [30]:
def train_and_evaluate(train, test):
    params = config.model_params['xgb_tuning']
    
    # Split features and target
    x = train.drop(['row_id', 'target', 'time_id'], axis = 1)
    y = train['target']
    x_test = test.drop(['row_id', 'time_id'], axis = 1)
    # Transform stock id to a numeric value
    feats_nostock = [col for col in x.columns if col not in {"stock_id"}] 
    x['stock_id'] = x['stock_id'].astype(int)
    x_test['stock_id'] = x_test['stock_id'].astype(int)
    dtest = xgb.DMatrix(x_test)
    
    # Create out of folds array
    oof_predictions = np.zeros(x.shape[0])
    # Create test array to store predictions
    test_predictions = np.zeros(x_test.shape[0])
    # Create a KFold object
    skf = stratified_group_k_fold(
        X=x[feats_nostock], y=train['stock_id'].astype('category').cat.codes.values, 
        groups=np.array(train['time_id'].astype('category').cat.codes.values), k=5, seed=config.seed
    )
    # Iterate through each fold
    for fold, (train_idx, val_idx) in enumerate(skf):
        print(f'Training fold {fold + 1}')
        x_train, y_train = x.iloc[train_idx], y.iloc[train_idx]
        x_val, y_val = x.iloc[val_idx], y.iloc[val_idx]

        x_train["stock_id"] = x_train["stock_id"].astype(int)
        x_val["stock_id"] = x_val["stock_id"].astype(int)

        dtrain = xgb.DMatrix(x_train, label=y_train, weight=1/np.square(y_train), enable_categorical=True)
        dval = xgb.DMatrix(x_val, label=y_val, weight=1/np.square(y_val), enable_categorical=True)

        model = xgb.train(params,
                          dtrain=dtrain,
                          evals=[(dtrain, "dtrain"), (dval, "dval")],
                          verbose_eval=50,
                          early_stopping_rounds=100,
                          num_boost_round=1000,
                          feval=feval_rmspe)

        oof_predictions[val_idx] = model.predict(dval)
        test_predictions += model.predict(dtest) / 5
        
    rmspe_score = rmspe(y, oof_predictions)
    print(f'Our out of folds RMSPE is {rmspe_score}')
    # Return test predictions
    return test_predictions

### Running

In [15]:
# Read train and test
train, test = read_train_test()

# Get unique stock ids 
train_stock_ids = train['stock_id'].unique()
# Preprocess them using Parallel and our single stock id functions
train_ = preprocessor(train_stock_ids, is_train = True)
train = train.merge(train_, on = ['row_id'], how = 'left')

# Get unique stock ids 
test_stock_ids = test['stock_id'].unique()
# Preprocess them using Parallel and our single stock id functions
test_ = preprocessor(test_stock_ids, is_train = False)
test = test.merge(test_, on = ['row_id'], how = 'left')

# Get group stats of time_id and stock_id
train = get_time_stock(train)
test = get_time_stock(test)

Our training set has 428932 rows


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed: 10.3min
[Parallel(n_jobs=-1)]: Done 112 out of 112 | elapsed: 33.8min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.3s finished


### Optimization

In [16]:
def objective(trial, train=train):
    x = train.drop(['row_id', 'target', 'time_id'], axis = 1)
    y = train['target']
    train_x, valid_x, train_y, valid_y = train_test_split(x, y, test_size=0.25, random_state=config.seed)
    dtrain = xgb.DMatrix(train_x, label=train_y)
    dvalid = xgb.DMatrix(valid_x, label=valid_y)
    
    param = {
        "objective": "reg:squarederror",
        "booster": "gbtree",
        "max_depth": trial.suggest_int("max_depth", 1, 9),
        "eta": trial.suggest_categorical("eta", [.3, .1]),
        "lambda": trial.suggest_loguniform("lambda", 1e-8, 1.0),
        "gamma": trial.suggest_float("gamma", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_loguniform("alpha", 1e-8, 1.0),
        "tree_method": "gpu_hist"  # turn it on for GPU
    }

    # Add a callback for pruning.
    model = xgb.train(param,
                      dtrain,
                      evals=[(dtrain, "dtrain"), (dvalid, "dval")],
                      verbose_eval=50,
                      feval=feval_rmspe,
                      num_boost_round=1000,
                      early_stopping_rounds=100)
    y_pred = model.predict(dvalid)
    return rmspe(valid_y, y_pred)

In [None]:
# study = optuna.create_study(
#     pruner=optuna.pruners.MedianPruner(n_warmup_steps=5), direction="minimize"
# )
# study.optimize(objective, n_trials=100)
# print(study.best_trial)

# print(study.best_trial)

# study.best_trial.params

#### Exp1 - Optuna Best Param

In [None]:
# # Traing and evaluate
# test_predictions = train_and_evaluate(train, test)
# # Save test predictions
# test['target'] = test_predictions
# test[['row_id', 'target']].to_csv('submission.csv',index = False)

#### Exp2 - Baseline

In [None]:
# Traing and evaluate
test_predictions = train_and_evaluate(train, test)
# Save test predictions
submit_df = test.copy()
submit_df['target'] = test_predictions
submit_df[['row_id', 'target']].to_csv('submission.csv',index = False)

100%|███████████████████████████████████████| 3830/3830 [00:55<00:00, 69.19it/s]


Training fold 1
[0]	dtrain-rmse:0.48326	dtrain-RMSPE:224.11708	dval-rmse:0.48324	dval-RMSPE:221.80678
[50]	dtrain-rmse:0.10539	dtrain-RMSPE:48.87768	dval-rmse:0.10540	dval-RMSPE:48.37676
[100]	dtrain-rmse:0.02300	dtrain-RMSPE:10.66499	dval-rmse:0.02301	dval-RMSPE:10.56178
[150]	dtrain-rmse:0.00504	dtrain-RMSPE:2.33911	dval-rmse:0.00507	dval-RMSPE:2.32473
[200]	dtrain-rmse:0.00119	dtrain-RMSPE:0.55189	dval-rmse:0.00123	dval-RMSPE:0.56309
[250]	dtrain-rmse:0.00051	dtrain-RMSPE:0.23538	dval-rmse:0.00056	dval-RMSPE:0.25839
[300]	dtrain-rmse:0.00044	dtrain-RMSPE:0.20485	dval-rmse:0.00050	dval-RMSPE:0.22933
[350]	dtrain-rmse:0.00043	dtrain-RMSPE:0.20044	dval-rmse:0.00049	dval-RMSPE:0.22718
[400]	dtrain-rmse:0.00043	dtrain-RMSPE:0.19818	dval-rmse:0.00049	dval-RMSPE:0.22653
[450]	dtrain-rmse:0.00042	dtrain-RMSPE:0.19646	dval-rmse:0.00049	dval-RMSPE:0.22631
[500]	dtrain-rmse:0.00042	dtrain-RMSPE:0.19463	dval-rmse:0.00049	dval-RMSPE:0.22651
[550]	dtrain-rmse:0.00042	dtrain-RMSPE:0.19260	dval-rms