In [1]:
import glob
import joblib
import lightgbm as lgb
import numpy as np
import os
import pandas as pd
from abc import ABCMeta, abstractmethod
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from tqdm import tqdm
from typing import Callable, List, Tuple, Union, Optional

In [2]:
DATA_DIR = '../input/optiver-realized-volatility-prediction/'

In [3]:
class CreateFeatures:
    '''
    features
    ========
        - stock_id: [categorical]
        - past_vol: volatility during the feature bucket of each (stock_id, time_id)
        - past_vol2: past volatility calculated with WAP2
        - updown: Compare average WAP of first 10 seconds_in_bucket and last 10 in the feature bucket.
                  +1 if the first 10 < the last 10, -1 otherwise.
        - price_spread1: mean of ask_price1 - bid_price1
        - price_spread2: mean of ask_price2 - bid_price2
        - price_spread_bid: mean of bid_price1 - bid_price2
        - price_spread_ask: mean of ask_price1 - ask_price2
        - size_total_bid: mean of bid_size1 + bid_size2
        - size_total_ask: mean of ask_size1 + ask_size2
        - size_total1: mean of bid_size1 + ask_size1
        - size_total2: mean of bid_size2 + ask_size2
        - size_total3: mean of bid_size1 + bid_size2 + ask_size1 + ask_size2
        - size_spread1: mean of bid_size1 - ask_size1
        - size_spread2: mean of bid_size2 - ask_size2
        - size_spread3: mean of bid_size1 + bid_size2 - ask_size1 - ask_size2
    '''

    def __init__(self, rerun: bool) -> None:
        self.rerun = rerun
        # {feature_name: [aggregation method, aggregation target column in book]}
        self.agg_method_target = {
                                'stock_id': [None, None],
                                'time_id': [None, None],
                                'vol': [None, 'WAP'],
                                'vol2': [None, 'WAP2'],
                                'updown': [None, 'WAP'],
                                'price_spread1': [np.mean, 'price_spread1'],
                                'price_spread2': [np.mean, 'price_spread2'],
                                'price_spread_bid': [np.mean, 'price_spread_bid'],
                                'price_spread_ask': [np.mean, 'price_spread_ask'],
                                'size_total_bid': [np.mean, 'size_total_bid'],
                                'size_total_ask': [np.mean, 'size_total_ask'],
                                'size_total1': [np.mean, 'size_total1'],
                                'size_total2': [np.mean, 'size_total2'],
                                'size_total3': [np.mean, 'size_total3'],
                                'size_spread1': [np.mean, 'size_spread1'],
                                'size_spread2': [np.mean, 'size_spread2'],
                                'size_spread3': [np.mean, 'size_spread3']
                                }

    def create_features(self, train_test: str) -> Tuple[pd.DataFrame, List[str]]:
        ''' creates DataFrame of scaled features of all (stock_id, time_id)
            the DataFrame will be saved as a csv file

        :param train_test: either 'train' or 'test'
        :return: DataFrame of scaled features and list of feature names
        '''
        if not self.rerun:
            # just load an already-prepared feature data
            df_feature = pd.read_csv(f'features_{train_test}.csv')
            feature_names = list(set(df_feature.columns) - set('time_id'))
        else:
            print(f'Creating features for {train_test}ing...')
            book_list = glob.glob(DATA_DIR + f'book_{train_test}.parquet/*')
            # trade_list = glob.glob(DATA_DIR + f'trade_{train_test}.parquet/*')
            self.agg_method_target['vol'][0] = self._calc_vol
            self.agg_method_target['vol2'][0] = self._calc_vol
            self.agg_method_target['updown'][0] = self._calc_updown
            
            feature_dict = {feature_name: [] for feature_name in self.agg_method_target}
            for book_path in tqdm(book_list):
                # for each stock_id
                book = pd.read_parquet(book_path)
                book = self._add_wap_and_etc_to_book(book)

                feature_dict_of_stock_id = {}
                for feature_name in self.agg_method_target:
                    if feature_name == 'stock_id' or feature_name == 'time_id':
                        pass
                    else:
                        agg_method = self.agg_method_target[feature_name][0]
                        agg_target = self.agg_method_target[feature_name][1]
                        feature_dict_of_stock_id[feature_name] = book.groupby('time_id')[agg_target].agg(agg_method)
                        feature_dict[feature_name] += list(feature_dict_of_stock_id[feature_name])

                feature_dict['stock_id'] += [int(book_path.split('=')[1])] * feature_dict_of_stock_id['vol'].shape[0]
                feature_dict['time_id'] += list(feature_dict_of_stock_id['vol'].index)

            df_feature = pd.DataFrame({feature_name: feature_dict[feature_name] for feature_name in self.agg_method_target})

            scaler = MinMaxScaler()
            feature_names = list(set(list(df_feature.columns)) - set(['time_id']))
            feature_names_num = list(set(list(feature_names)) - set(['stock_id']))
            df_feature[feature_names_num] = pd.DataFrame(scaler.fit_transform(df_feature[feature_names_num]),
                                                     columns=feature_names_num)
            df_feature.to_csv(f'features_{train_test}.csv', encoding='utf-8-sig', index=False)
            print(f'Finish creating features for {train_test}ing!')
        return df_feature, feature_names

    # all the functions below are helper functions
    def _add_wap_and_etc_to_book(self, book: pd.DataFrame) -> pd.DataFrame:
        ''' calculates WAP and etc. for a stock for each second in bucket

        :param: pd.DataFrame of a book
        :return: pd.DataFrame of a book with new columns
        '''
        book['WAP'] = (book['bid_price1'] * book['ask_size1'] + book['ask_price1'] * book['bid_size1']) \
                           / (book['bid_size1'] + book['ask_size1'])
        book['WAP2'] = (book['bid_price2'] * book['ask_size2'] + book['ask_price2'] * book['bid_size2']) \
                       / (book['bid_size2'] + book['ask_size2'])
        book['price_spread1'] = book['ask_price1'] - book['bid_price1']
        book['price_spread2'] = book['ask_price2'] - book['bid_price2']
        book['price_spread_bid'] = book['bid_price1'] - book['bid_price2']
        book['price_spread_ask'] = book['ask_price1'] - book['ask_price2']
        book['size_total_bid'] = book['bid_size1'] + book['bid_size2']
        book['size_total_ask'] = book['ask_size1'] + book['ask_size2']
        book['size_total1'] = book['bid_size1'] + book['ask_size1']
        book['size_total2'] = book['bid_size2'] + book['ask_size2']
        book['size_total3'] = book['bid_size1'] + book['bid_size2'] + book['ask_size1'] + book['ask_size2']
        book['size_spread1'] = book['bid_size1'] - book['ask_size1']
        book['size_spread2'] = book['bid_size2'] - book['ask_size2']
        book['size_spread3'] = book['bid_size1'] + book['bid_size2'] - book['ask_size1'] - book['ask_size2']

        return book
        
    def _calc_vol(self, wap_series: pd.Series) -> float:
        ''' calculates volatility of a srtock during a time_id

        :param wap_series: series of weighted average price
        :return: volatility of a stock in the first 10 min
        '''
        # calculate log return log(S_{k}/S_{k-1}) for each k
        # where S_{k} is the price of the stock S at time k
        # and k is the index of the input list.
        list_log_return = np.log(wap_series).diff()
        return np.sqrt(np.sum(list_log_return**2))

    def _calc_updown(self, wap_series: pd.Series) -> float:
        ''' judges if WAP went up or down during the time_id

        :return: 1 when up, -1 when down, or 0 when neutral
        '''
        list_wap = list(wap_series)
        diff = np.mean(list_wap[-10:]) - np.mean(list_wap[:10])
        return np.sign(diff)

In [4]:
class Model(metaclass=ABCMeta):

    def __init__(self, run_fold_name: str, feature_names: List[str],
                 params: dict) -> None:
        ''' Constructor

        :param feature_names: list of feature names to specify columns of feature dataframe
        :param params: hyper parameters
        '''
        self.run_fold_name = run_fold_name
        self.feature_names = feature_names
        self.params = params
        self.model = None

    @abstractmethod
    def train(self, X_train: pd.DataFrame, y_train: pd.Series,
              X_valid: Optional[pd.DataFrame] = None,
              y_valid: Optional[pd.Series] = None
              ) -> None:
        ''' trains a model

        :param X_train: features of training data
        :param y_train: targets of training data
        :param X_valid: features of validation data
        :param y_valid: targets of validation data
        '''
        pass

    @abstractmethod
    def predict(self, X: pd.DataFrame) -> np.array:
        ''' returns prediction output from a learned model

        :param X: features of test data or validation data
        :return: predicted value
        '''
        pass

    @abstractmethod
    def save_model(self) -> None:
        ''' saves a model '''
        pass

    @abstractmethod
    def load_model(self) -> None:
        ''' loads a model '''
        pass

In [5]:
class ModelLGB(Model):


    def train(self, X_train, y_train, X_valid=None, y_valid=None):

        params = self.params
        if X_valid is not None:
            # weights to change feval from RMSE to RMSPE
            # idea from: https://www.kaggle.com/c/optiver-realized-volatility-prediction/discussion/250324
            w_train = 1/np.square(y_train)
            w_valid = 1/np.square(y_valid)
            lgb_train = lgb.Dataset(X_train, y_train, weight=w_train)
            lgb_eval = lgb.Dataset(X_valid, y_valid, weight=w_valid)
            self.model = lgb.train(params,
                                   lgb_train,
                                   num_boost_round=100,
                                   valid_sets=lgb_eval,
                                   feval=rmspe,
                                   verbose_eval=50,
                                   categorical_feature=['stock_id']
                                  )
        else:
            w_train = 1/np.square(y_train)
            lgb_train = lgb.Dataset(X_train, y_train, weight=w_train)
            self.model = lgb.train(params,
                                   lgb_train,
                                   num_boost_round=100,
                                   feval=rmspe,
                                   categorical_feature=['stock_id']
                           )

    def predict(self, X_test):
        return self.model.predict(X_test)

    def save_model(self):
        self.model.save_model('model.txt')

    def load_model(self):
        self.model = lgb.Booster(model_file='model.txt')

In [6]:
# delf-defiend evaluation metric function
def rmspe(y_true: np.array, y_pred: np.array):
    ''' self-defined eval metric
        Root Mean Squared Percentage Error

    :return: name: str, eval_result: float, is_higher_better: bool
    '''
    if type(y_pred) == lgb.basic.Dataset:
        y_pred = y_pred.get_label()
    rmspe = (np.sqrt(np.mean(np.square((y_true - y_pred) / y_true))))
    return 'RMSPE', rmspe, False

In [7]:
class Util:
    @classmethod
    def dump(cls, value, path):
        os.makedirs(os.path.dirname(path), exist_ok=True)
        joblib.dump(value, path, compress=True)

    @classmethod
    def load(cls, path):
        return joblib.load(path)

    @classmethod
    def submission(cls, df_pred: pd.DataFrame) -> None:
        row_id = df_pred['stock_id'].apply(str) + '-' + df_pred['time_id'].apply(str)
        df_submission = pd.DataFrame({'row_id': row_id, 'target':df_pred['target']})
        df_submission.to_csv('submission.csv', index=False)

In [8]:
class Runner:

    def __init__(self, run_name: str, model_cls: Callable[[str, dict], Model],
                 feature_names: List[str], df_features_train: pd.DataFrame,
                 df_features_test: pd.DataFrame, params: dict):
        ''' Constructor

        :param run_name: name of run
        :param model_cls: class of model
        :param feature_names: list of feature names
        :param params: hyper parameters
        '''
        self.run_name = run_name
        self.model_cls = model_cls
        self.feature_names = feature_names
        self.X_train_all = df_features_train
        self.X_test = df_features_test
        self.params = params
        self.n_fold = 4

    def _train_fold(self, i_fold: Union[int, str]) -> Tuple[
                    Model, Optional[np.array],
                    Optional[np.array], Optional[float]]:
        ''' specifies number of fold for cv then learns & evaluates

        :param i_fold: number of fold ('all' for all)
        :return: a tuple of instance of model, index of record,
                 predicted value, and evaluation score
                 (returns only model if i_fold=='all')
        '''
        # load train data
        y_train_all = self._load_y_train()
        self.X_train_all['stock_id'] = self.X_train_all['stock_id'].astype('int64')
        self.X_train_all['time_id'] = self.X_train_all['time_id'].astype('int64')
        Xy_train_all = pd.merge(self.X_train_all, y_train_all,
                                left_on=['stock_id','time_id'], right_on=['stock_id','time_id'],
                                how='inner')
        X_train_all = Xy_train_all[self.feature_names]
        y_train_all = Xy_train_all['target']

        validation = i_fold != 'all'
        if validation:
            # split data into training and validation
            idx_train, idx_valid = self._load_index_fold(i_fold)
            X_train = X_train_all.iloc[idx_train]
            y_train = y_train_all.iloc[idx_train]
            X_valid = X_train_all.iloc[idx_valid]
            y_valid = y_train_all.iloc[idx_valid]

            # execute learning
            model = self._build_model(i_fold)
            model.train(X_train, y_train, X_valid, y_valid)

            # prediction and evaluation with validation data
            pred_valid = model.predict(X_valid)
            _, score, _ = rmspe(y_true=y_valid, y_pred=pred_valid)

            # return model, index, prediction, and score
            return model, idx_valid, pred_valid, score
        else:
            # learining with all data
            model = self._build_model(i_fold)
            model.train(X_train_all, y_train_all)

            # return model
            return model, None, None, None

    def run_train_cv(self) -> None:
        ''' learns and evaluates by CV

        learns, evaluates, and saves models and scores of each fold
        '''
        scores = []
        idxes_valid = []
        preds = []

        # learning for each fold
        for i_fold in range(self.n_fold):
            model, idx_valid, pred_valid, score = self._train_fold(i_fold)

            # hold result
            idxes_valid.append(idx_valid)
            scores.append(score)
            preds.append(pred_valid)

            # save model
            model.save_model()
        print(f'Mean score of the folds: {np.mean(scores)}')

    def run_predict_cv(self) -> pd.DataFrame:
        ''' predicts for test data with the mean of
            each fold's model learned through CV
            
            :return: predicted target as the mean of folds
        '''
        preds = []
        # prediction for each fold's model
        for i_fold in range(self.n_fold):
            model = self._build_model(i_fold)
            model.load_model()
            pred = model.predict(self.X_test[self.feature_names])
            preds.append(pred)

        # mean of the prediction values
        pred_mean = np.mean(preds, axis=0)
        df_pred = self.X_test[self.X_test.columns[:2]]
        df_pred.loc[:, 'target'] = pred_mean
        return df_pred

    def run_train_all(self) -> None:
        ''' learns with all the training data and save the model '''
        # learning
        i_fold = 'all'
        model, _, _, _ = self._train_fold(i_fold)
        model.save_model()

    def run_predict_all(self) -> pd.DataFrame:
        ''' predicts for test data with the model learned with all the training data

        :return: predicted target
        '''
        
        # predict with the mdoel learned with all the learning data
        i_fold = 'all'
        model = self._build_model(i_fold)
        model.load_model()
        pred = model.predict(self.X_test[self.feature_names])
        df_pred = self.X_test[self.X_test.columns[:2]]
        df_pred['target'] = pred
        return df_pred

    def _build_model(self, i_fold: Union[int, str]) -> Model:
        ''' builds a model with a specified fold for cv

        :param i_fold: number of fold
        :return: instance of model
        '''
        # build a model with run name, fold, and class of model
        run_fold_name = f'{self.run_name}-{i_fold}'
        return self.model_cls(run_fold_name, self.feature_names, self.params)

    def _load_y_train(self) -> pd.DataFrame:
        ''' loads target of train data; ['stock_id', 'time_id', 'target']

        :return: target dataframe of train data
        '''
        return pd.read_csv(DATA_DIR + 'train.csv')

    def _load_index_fold(self, i_fold: int) -> np.array:
        ''' returns the record index in response to the fold specified for cv

        :param i_fold: number of the fold
        :return: record index for the fold
        '''
        # return index to split data for learning and validation
        y_train = self._load_y_train()
        x_dummy = np.zeros(len(y_train))
        skf = KFold(n_splits=self.n_fold, shuffle=True, random_state=31)
        return list(skf.split(x_dummy, y_train))[i_fold]

## main

In [9]:
params_lgb = {
            'boosting_type': 'gbdt',
            'num_leaves': 100,
            'learning_rate': 0.05,
            'n_estimators': 1000,
            'metric': 'rmse',
            'early_stopping_rounds': 100
            }

# rerun=False if just using saved features works
cf = CreateFeatures(rerun=True)
df_features_train, feature_names = cf.create_features('train')
df_features_test, _ = cf.create_features('test')

# learning and prediction by lightGBM and create submission file
run_name = 'lgb'
runner = Runner(run_name=run_name,
                model_cls=ModelLGB,
                feature_names=feature_names,
                df_features_train=df_features_train,
                df_features_test=df_features_test,
                params=params_lgb)
runner.run_train_cv()
pred = runner.run_predict_cv()
Util.submission(pred)

  0%|          | 0/112 [00:00<?, ?it/s]

Creating features for training...


100%|██████████| 112/112 [11:03<00:00,  5.92s/it]
100%|██████████| 1/1 [00:00<00:00, 20.00it/s]

Finish creating features for training!
Creating features for testing...
Finish creating features for testing!



New categorical_feature is ['stock_id']


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3684
[LightGBM] [Info] Number of data points in the train set: 321699, number of used features: 16




[LightGBM] [Info] Start training from score 0.001803
Training until validation scores don't improve for 100 rounds
[50]	valid_0's rmse: 0.000543827	valid_0's RMSPE: 0.380046
[100]	valid_0's rmse: 0.000535151	valid_0's RMSPE: 0.361781
[150]	valid_0's rmse: 0.000534749	valid_0's RMSPE: 0.355537
[200]	valid_0's rmse: 0.000535292	valid_0's RMSPE: 0.353013
[250]	valid_0's rmse: 0.000535791	valid_0's RMSPE: 0.351218
Early stopping, best iteration is:
[161]	valid_0's rmse: 0.000534605	valid_0's RMSPE: 0.355014


New categorical_feature is ['stock_id']


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3684
[LightGBM] [Info] Number of data points in the train set: 321699, number of used features: 16




[LightGBM] [Info] Start training from score 0.001801
Training until validation scores don't improve for 100 rounds
[50]	valid_0's rmse: 0.00053513	valid_0's RMSPE: 0.37249
[100]	valid_0's rmse: 0.000525305	valid_0's RMSPE: 0.352498
[150]	valid_0's rmse: 0.000523817	valid_0's RMSPE: 0.348956
[200]	valid_0's rmse: 0.000523273	valid_0's RMSPE: 0.346179
[250]	valid_0's rmse: 0.000523173	valid_0's RMSPE: 0.344354
[300]	valid_0's rmse: 0.00052323	valid_0's RMSPE: 0.342755
Early stopping, best iteration is:
[218]	valid_0's rmse: 0.000523087	valid_0's RMSPE: 0.345355


New categorical_feature is ['stock_id']


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3684
[LightGBM] [Info] Number of data points in the train set: 321699, number of used features: 16




[LightGBM] [Info] Start training from score 0.001800
Training until validation scores don't improve for 100 rounds
[50]	valid_0's rmse: 0.000544708	valid_0's RMSPE: 0.375441
[100]	valid_0's rmse: 0.000537773	valid_0's RMSPE: 0.354413
[150]	valid_0's rmse: 0.000537657	valid_0's RMSPE: 0.350194
[200]	valid_0's rmse: 0.000539015	valid_0's RMSPE: 0.347649
Early stopping, best iteration is:
[136]	valid_0's rmse: 0.0005372	valid_0's RMSPE: 0.351213


New categorical_feature is ['stock_id']


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3684
[LightGBM] [Info] Number of data points in the train set: 321699, number of used features: 16




[LightGBM] [Info] Start training from score 0.001797
Training until validation scores don't improve for 100 rounds
[50]	valid_0's rmse: 0.000539602	valid_0's RMSPE: 0.374903
[100]	valid_0's rmse: 0.000529953	valid_0's RMSPE: 0.354999
[150]	valid_0's rmse: 0.000529051	valid_0's RMSPE: 0.350539
[200]	valid_0's rmse: 0.000528815	valid_0's RMSPE: 0.348501
[250]	valid_0's rmse: 0.000528539	valid_0's RMSPE: 0.346601
[300]	valid_0's rmse: 0.000528444	valid_0's RMSPE: 0.345295
[350]	valid_0's rmse: 0.000528418	valid_0's RMSPE: 0.344135
[400]	valid_0's rmse: 0.000528568	valid_0's RMSPE: 0.343095
Early stopping, best iteration is:
[342]	valid_0's rmse: 0.000528331	valid_0's RMSPE: 0.344261
Mean score of the folds: 0.24566350475139093


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
