In [15]:
import os
import sys
import glob

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold

import lightgbm as lgbm

In [16]:
sys.path.append("../")
from src.utils import calc_wap, calc_wap2, log_return, realized_volatility, count_unique, rmspe, feval_RMSPE

# config

In [None]:
class CFG:
    """Experiment-wide settings read by the preprocessing and export cells."""

    # Experiment number; embedded in the exported CSV file names.
    exp_no = 1

    random_seed = 42
    input_dir = "../input"
    output_dir = "../output/data"

    # ---- preprocessor_book ----
    # Aggregations applied to every engineered column.
    feature_lst = [np.mean, np.sum, np.std, np.median, np.max, np.min]
    # Log-return columns get the same aggregations plus realized volatility.
    feature_log_return = feature_lst + [realized_volatility]

    # Column name -> list of aggregations, passed to groupby(...).agg().
    create_feature_dict = dict(
        log_return=feature_log_return,
        log_return2=feature_log_return,
        wap_balance=feature_lst,
        price_spread=feature_lst,
        bid_spread=feature_lst,
        ask_spread=feature_lst,
        volume_imbalance=feature_lst,
        total_volume=feature_lst,
        wap=feature_lst,
        wap2=feature_lst,
    )

    # Window lengths; the preprocessors re-aggregate the rows with
    # seconds_in_bucket >= 600 - <value> for each entry.
    last_seconds = [150, 300, 450]

    # ---- preprocessor_trade ----
    # Column name -> list of aggregations for the trade data.
    aggregate_dictionary = dict(
        log_return=feature_log_return,
        seconds_in_bucket=[count_unique] + feature_lst,
        size=feature_lst,
        order_count=feature_lst,
    )

    

# feature engineering

In [26]:
def preprocessor_book(file_path):
    """
    Build per-time_id features from one stock's order-book data.

    Row-level columns (two WAPs, their log returns, spreads and volume
    measures) are derived first. They are then aggregated per ``time_id``
    with ``CFG.create_feature_dict``: once over all rows, and once more for
    every entry of ``CFG.last_seconds`` over the rows whose
    ``seconds_in_bucket`` is at least ``600 - entry``.

    Parameters
    ----------
    file_path : str
        Path to the parquet data of a single stock. The text following the
        first "=" in the path is used as the stock id
        (e.g. ".../book_train.parquet/stock_id=0").

    Returns
    -------
    pandas.DataFrame
        One row per time_id holding the aggregated features (window features
        carry a ``_<start second>`` suffix) and a ``row_id`` column formatted
        as "<stock_id>-<time_id>".

    Notes
    -----
    ``CFG.create_feature_dict`` and ``CFG.last_seconds`` are defined outside
    this function.
    """
    df = pd.read_parquet(file_path)

    # Row-level derived columns.
    # group_keys=False keeps the original row index on the apply() result so
    # it can be assigned straight back to df. With the pandas >= 2.0 default
    # (group_keys=True) the result carries a (time_id, row) MultiIndex and the
    # assignment fails.
    # NOTE(review): assumes log_return returns a Series indexed like its
    # input — confirm against src.utils.
    df["wap"] = calc_wap(df)
    df["log_return"] = df.groupby("time_id", group_keys=False)["wap"].apply(log_return)

    df["wap2"] = calc_wap2(df)
    df["log_return2"] = df.groupby("time_id", group_keys=False)["wap2"].apply(log_return)

    df["wap_balance"] = abs(df["wap"] - df["wap2"])

    mid_price = (df["ask_price1"] + df["bid_price1"]) / 2
    df["price_spread"] = (df["ask_price1"] - df["bid_price1"]) / mid_price
    df["bid_spread"] = df["bid_price1"] - df["bid_price2"]
    df["ask_spread"] = df["ask_price1"] - df["ask_price2"]

    ask_volume = df["ask_size1"] + df["ask_size2"]
    bid_volume = df["bid_size1"] + df["bid_size2"]
    df["total_volume"] = ask_volume + bid_volume
    df["volume_imbalance"] = abs(ask_volume - bid_volume)

    # Aggregate over the whole bucket.
    df_feature = df.groupby(["time_id"]).agg(CFG.create_feature_dict).reset_index()
    # Flatten the (column, aggregation) MultiIndex; "time_id" becomes "time_id_".
    df_feature.columns = ["_".join(col) for col in df_feature.columns]

    # Aggregate again over the tail of the bucket for each configured window.
    for last_second in CFG.last_seconds:
        # 600 is the hard-coded reference point the window is measured back from
        # (presumably the bucket length in seconds — confirm).
        start_second = 600 - last_second

        df_feature_sec = (
            df.query(f"seconds_in_bucket >= {start_second}")
            .groupby(["time_id"])
            .agg(CFG.create_feature_dict)
            .reset_index()
        )
        df_feature_sec.columns = ["_".join(col) for col in df_feature_sec.columns]
        # Every window column, including the key, gets the start second as suffix.
        df_feature_sec = df_feature_sec.add_suffix("_" + str(start_second))

        window_key = f"time_id__{start_second}"
        df_feature = pd.merge(
            df_feature,
            df_feature_sec,
            how="left",
            left_on="time_id_",
            right_on=window_key,
        )
        df_feature = df_feature.drop([window_key], axis=1)

    # Build row_id ("<stock_id>-<time_id>") and drop the raw time_id key.
    stock_id = file_path.split("=")[1]
    df_feature["row_id"] = df_feature["time_id_"].apply(lambda x: f"{stock_id}-{x}")
    df_feature = df_feature.drop(["time_id_"], axis=1)

    return df_feature

def preprocessor_trade(file_path):
    """
    Build per-time_id features from one stock's trade data.

    A ``log_return`` column is derived from ``price`` within each time_id.
    The data is then aggregated per ``time_id`` with
    ``CFG.aggregate_dictionary``: once over all rows, and once more for every
    entry of ``CFG.last_seconds`` over the rows whose ``seconds_in_bucket``
    is at least ``600 - entry``.

    Parameters
    ----------
    file_path : str
        Path to the parquet data of a single stock. The text following the
        first "=" in the path is used as the stock id
        (e.g. ".../trade_train.parquet/stock_id=0").

    Returns
    -------
    pandas.DataFrame
        One row per time_id. Feature columns are prefixed with ``trade_``
        (window features also carry a ``_<start second>`` suffix), plus a
        ``row_id`` column formatted as "<stock_id>-<time_id>".

    Notes
    -----
    ``CFG.aggregate_dictionary`` and ``CFG.last_seconds`` are defined outside
    this function.
    """
    df = pd.read_parquet(file_path)

    # group_keys=False keeps the original row index on the apply() result so
    # it can be assigned straight back to df. With the pandas >= 2.0 default
    # (group_keys=True) the result carries a (time_id, row) MultiIndex and the
    # assignment fails.
    # NOTE(review): assumes log_return returns a Series indexed like its
    # input — confirm against src.utils.
    df["log_return"] = df.groupby("time_id", group_keys=False)["price"].apply(log_return)

    # Aggregate over the whole bucket.
    df_feature = df.groupby("time_id").agg(CFG.aggregate_dictionary).reset_index()
    # Flatten the (column, aggregation) MultiIndex; "time_id" becomes "time_id_".
    df_feature.columns = ["_".join(col) for col in df_feature.columns]

    # Aggregate again over the tail of the bucket for each configured window.
    for last_second in CFG.last_seconds:
        # 600 is the hard-coded reference point the window is measured back from
        # (presumably the bucket length in seconds — confirm).
        start_second = 600 - last_second

        df_feature_sec = (
            df.query(f"seconds_in_bucket >= {start_second}")
            .groupby("time_id")
            .agg(CFG.aggregate_dictionary)
            .reset_index()
        )
        df_feature_sec.columns = ["_".join(col) for col in df_feature_sec.columns]
        # Every window column, including the key, gets the start second as suffix.
        df_feature_sec = df_feature_sec.add_suffix("_" + str(start_second))

        window_key = f"time_id__{start_second}"
        df_feature = pd.merge(
            df_feature,
            df_feature_sec,
            how="left",
            left_on="time_id_",
            right_on=window_key,
        )
        df_feature = df_feature.drop([window_key], axis=1)

    # Mark all columns as trade-derived, then build row_id and drop the key.
    df_feature = df_feature.add_prefix("trade_")
    stock_id = file_path.split("=")[1]
    df_feature["row_id"] = df_feature["trade_time_id_"].apply(
        lambda x: f"{stock_id}-{x}"
    )
    df_feature = df_feature.drop(["trade_time_id_"], axis=1)

    return df_feature

In [27]:
def preprocessor(input_dir, list_stock_ids, is_train=True):
    """
    Build the combined book + trade feature table for a list of stocks.

    For each stock id, the book and trade features are computed with
    ``preprocessor_book`` / ``preprocessor_trade`` and left-joined on
    ``row_id``. Stocks are processed in parallel with joblib and the
    per-stock frames are concatenated.

    Parameters
    ----------
    input_dir : str
        Directory containing ``book_<split>.parquet`` and
        ``trade_<split>.parquet``.
    list_stock_ids : iterable
        Stock ids to process; each one is inserted into the
        ``stock_id=<id>`` part of the path.
    is_train : bool, default True
        Read the ``train`` files when True, the ``test`` files otherwise.

    Returns
    -------
    pandas.DataFrame
        All per-stock feature frames stacked together with a fresh index.
    """
    from joblib import Parallel, delayed  # parallel computing to save time

    split = "train" if is_train else "test"

    def for_joblib(stock_id):
        # The paths depend on stock_id, so they must be built here, per call.
        # Building them in the enclosing scope (before stock_id exists) raised
        # NameError, or silently reused a stale global stock_id for every stock.
        file_path_book = os.path.join(
            input_dir, f"book_{split}.parquet/stock_id={stock_id}"
        )
        file_path_trade = os.path.join(
            input_dir, f"trade_{split}.parquet/stock_id={stock_id}"
        )

        # Book features are the base; trade features are attached where present.
        return pd.merge(
            preprocessor_book(file_path_book),
            preprocessor_trade(file_path_trade),
            on="row_id",
            how="left",
        )

    per_stock_frames = Parallel(n_jobs=-1, verbose=1)(
        delayed(for_joblib)(stock_id) for stock_id in list_stock_ids
    )

    return pd.concat(per_stock_frames, ignore_index=True)

In [28]:
def calc_model_importance(model, feature_names=None, importance_type="gain"):
    """
    Collect a model's feature importances into a sorted table.

    Parameters
    ----------
    model : object
        Anything exposing ``feature_importance(importance_type=...)``
        (presumably a LightGBM booster — confirm against the caller).
    feature_names : sequence, optional
        Labels used as the index of the result.
    importance_type : str, default "gain"
        Forwarded unchanged to ``model.feature_importance``.

    Returns
    -------
    pandas.DataFrame
        Single column ``importance``, sorted in ascending order.
    """
    scores = model.feature_importance(importance_type=importance_type)
    table = pd.DataFrame(scores, index=feature_names, columns=["importance"])
    return table.sort_values("importance")


def plot_importance(importance_df, title="", save_filepath=None, figsize=(8, 12)):
    """
    Draw a horizontal bar chart of feature importances.

    Parameters
    ----------
    importance_df : pandas.DataFrame
        Table to plot via ``DataFrame.plot.barh``.
    title : str, default ""
        Figure title; nothing is set when empty.
    save_filepath : str, optional
        When given, the figure is saved there; otherwise it is shown.
    figsize : tuple, default (8, 12)
        Figure size passed to ``plt.subplots``.
    """
    fig, ax = plt.subplots(figsize=figsize)
    importance_df.plot.barh(ax=ax)

    if title:
        ax.set_title(title)
    fig.tight_layout()

    if save_filepath is not None:
        plt.savefig(save_filepath)
    else:
        plt.show()

    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

# training set

In [1]:
### CHECK
# TODO: pass the book / trade paths in via glob

In [None]:
# Load the labels and key every row as "<stock_id>-<time_id>", the same
# row_id format the preprocessing functions produce.
train = pd.read_csv(os.path.join(CFG.input_dir, "train.csv"))
train_ids = train["stock_id"].unique()
train = train.assign(
    row_id=train["stock_id"].astype(str).str.cat(train["time_id"].astype(str), sep="-")
)[["row_id", "target"]]

# Compute the features for every stock and attach them to the labels.
df_train = train.merge(
    preprocessor(CFG.input_dir, list_stock_ids=train_ids, is_train=True),
    on=["row_id"],
    how="left",
)
# df_train.head()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


# test set

In [None]:
# test.csv must already contain a row_id column: it is the merge key below.
test = pd.read_csv(os.path.join(CFG.input_dir, "test.csv"))
test_ids = test["stock_id"].unique()

# Compute the features for every test stock and attach them to the test rows.
df_test = pd.merge(
    test,
    preprocessor(CFG.input_dir, list_stock_ids=test_ids, is_train=False),
    on=["row_id"],
    how="left",
)

# target encoding by stock_id

In [None]:
# Target encoding of stock_id.
# The stock id is the part of row_id in front of the "-".
df_train["stock_id"] = df_train["row_id"].map(lambda row_id: row_id.split("-")[0])
df_test["stock_id"] = df_test["row_id"].map(lambda row_id: row_id.split("-")[0])

# Test set: encode each stock with its mean target over the full training set.
stock_id_target_mean = df_train.groupby("stock_id")["target"].mean()
df_test["stock_id_target_enc"] = df_test["stock_id"].map(stock_id_target_mean)

# Training set.
# CHECK (author's note): this part is not fully understood yet; it is believed
# to be out-of-fold target encoding, and the author wants to rewrite it.
# Each row is encoded with per-stock means computed on the other folds only,
# so a row's own target never enters its encoding.
oof_target_enc = np.full(df_train.shape[0], np.nan)
kf = KFold(n_splits=10, shuffle=True, random_state=CFG.random_seed)
for fit_idx, holdout_idx in kf.split(df_train):
    fold_target_mean = df_train.iloc[fit_idx].groupby("stock_id")["target"].mean()
    holdout_stock_ids = df_train["stock_id"].iloc[holdout_idx]
    oof_target_enc[holdout_idx] = holdout_stock_ids.map(fold_target_mean)
df_train["stock_id_target_enc"] = oof_target_enc

In [None]:
# Create the output directory first: to_csv does not create missing parent
# directories, and failing here would discard the expensive preprocessing.
os.makedirs(CFG.output_dir, exist_ok=True)

# Persist the feature tables, tagged with the experiment number.
df_train.to_csv(os.path.join(CFG.output_dir, f"train_exp{CFG.exp_no}.csv"), index=False)
df_test.to_csv(os.path.join(CFG.output_dir, f"test_exp{CFG.exp_no}.csv"), index=False)