In [1]:
import os
import sys
import glob

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold

import lightgbm as lgbm

In [2]:
sys.path.append("../")
from src.utils import calc_wap, calc_wap2, log_return, realized_volatility, count_unique, rmspe, feval_RMSPE

# config

In [None]:
class CFG:
    """Experiment configuration: paths, CV settings and LightGBM parameters.

    Evaluating the class body also creates ``output_dir`` if it is missing.
    """

    # experiment number (used in input file names and in the output directory)
    exp_no = 1
    # number of cross-validation folds
    n_splits = 5

    # single seed shared by the CV splitter and every LightGBM seed parameter
    random_seed = 42
    input_dir = "../output/data"
    output_dir = f"../output/exp_{exp_no}"

    # exist_ok avoids the check-then-create race and makes re-running the cell safe
    os.makedirs(output_dir, exist_ok=True)

    # params
    # https://www.kaggle.com/felipefonte99/optiver-lgb-with-optimized-params
    # NOTE(review): max_depth=3 allows at most 8 leaves per tree, so
    # num_leaves=769 is presumably not the binding constraint - confirm.
    lgbm_params = {
        'learning_rate': 0.13572437900113307,
        'lambda_l1': 2.154360665259325,
        'lambda_l2': 6.711089761523827,
        'num_leaves': 769,
        'min_sum_hessian_in_leaf': 20.44437160769411,
        'feature_fraction': 0.7921473067441019,
        'feature_fraction_bynode': 0.8083803860191322,
        'bagging_fraction': 0.9726755660563261,
        'bagging_freq': 42,
        'min_data_in_leaf': 690,
        'max_depth': 3,
        # The original referenced an undefined name `seed` here (NameError);
        # the seed defined by this config is `random_seed`.
        'seed': random_seed,
        'feature_fraction_seed': random_seed,
        'bagging_seed': random_seed,
        'drop_seed': random_seed,
        'data_random_seed': random_seed,
        'objective': 'rmse',
        'boosting': 'gbdt',
        'verbosity': -1,
        'n_jobs': -1,
    }


In [28]:
def calc_model_importance(model, feature_names=None, importance_type="gain"):
    """Return one model's feature importances as a sorted DataFrame.

    Parameters
    ----------
    model : object
        Anything exposing ``feature_importance(importance_type=...)``.
    feature_names : sequence, optional
        Labels used as the frame's index; a default integer index is used
        when omitted.
    importance_type : str
        Forwarded unchanged to ``model.feature_importance``.

    Returns
    -------
    pandas.DataFrame
        Single column ``"importance"``, rows sorted ascending by that column.
    """
    raw_values = model.feature_importance(importance_type=importance_type)
    frame = pd.DataFrame(raw_values, index=feature_names, columns=["importance"])
    # Ascending order: the row order therefore depends on this model's values.
    return frame.sort_values("importance")

def calc_mean_importance(importance_df_list):
    """Average per-fold importance frames feature by feature.

    Frames produced by ``calc_model_importance`` are each sorted by their own
    importance, so the same row position can hold different features in
    different folds. Every frame is therefore re-aligned to the index of the
    first frame before averaging; averaging the raw ``.values`` positionally
    (as was done before) mixes unrelated features.

    Parameters
    ----------
    importance_df_list : list of pandas.DataFrame
        Non-empty list of frames with an ``"importance"`` column and a
        feature-label index.

    Returns
    -------
    pandas.DataFrame
        Copy of the first frame (same index order) whose ``"importance"``
        column holds the mean across all frames. A feature missing from any
        frame yields NaN.
    """
    base_df = importance_df_list[0]
    # Align by label, not by position, before stacking the folds.
    aligned = [
        df['importance'].reindex(base_df.index).to_numpy()
        for df in importance_df_list
    ]
    mean_df = base_df.copy()
    mean_df['importance'] = np.mean(np.array(aligned), axis=0)
    return mean_df

def plot_importance(importance_df, title="", save_filepath=None, figsize=(8, 12)):
    """Draw a horizontal bar chart of an importance frame.

    Parameters
    ----------
    importance_df : pandas.DataFrame
        Frame plotted via ``DataFrame.plot.barh``.
    title : str
        Axes title; skipped when empty.
    save_filepath : str or None
        When None the figure is shown, otherwise it is written to this path.
    figsize : tuple
        Figure size passed to ``plt.subplots``.
    """
    fig, ax = plt.subplots(figsize=figsize)
    importance_df.plot.barh(ax=ax)
    if title:
        ax.set_title(title)
    fig.tight_layout()
    if save_filepath is not None:
        fig.savefig(save_filepath)
    else:
        plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
    
def save_lgbm_models(models, output_dir):
    """Write each model to ``<output_dir>/model<N>.pkl`` via ``save_model``.

    The files are produced by ``model.save_model(...)``, not by ``pickle``,
    so they must not be read back with ``pickle.load`` despite the ``.pkl``
    extension (kept unchanged so existing consumers still find the files).
    NOTE(review): assuming the models are ``lightgbm.Booster`` objects, reload
    with ``lgbm.Booster(model_file=path)`` - confirm.

    Parameters
    ----------
    models : iterable
        Objects exposing ``save_model(path)``; numbered in iteration order.
    output_dir : str
        Destination directory, created (including parents) when missing.
    """
    # exist_ok makes the call idempotent and avoids a check-then-create race
    os.makedirs(output_dir, exist_ok=True)

    for num, model in enumerate(models):
        model.save_model(os.path.join(output_dir, f"model{num}.pkl"))

# data load

In [None]:
# Build the full CSV paths first: read_csv takes a single path as its first
# argument, so passing the directory and the file name as two arguments
# (as was done before) hands the file name to `sep` instead of joining them.
train_path = os.path.join(CFG.input_dir, f"train_exp{CFG.exp_no}.csv")
test_path = os.path.join(CFG.input_dir, f"test_exp{CFG.exp_no}.csv")

df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

df_train['stock_id'] = df_train['stock_id'].astype(int)
df_test['stock_id'] = df_test['stock_id'].astype(int)

# Feature importance is computed only when the test frame has exactly 3 rows.
# NOTE(review): presumably this identifies the small sample test set of a
# local run - confirm.
DO_FEAT_IMP = len(df_test) == 3

# model

In [13]:
# Regression target
y = df_train['target']
# Feature matrix: every column except the row identifier and the target
X = df_train.drop(columns=['row_id', 'target'])

NameError: name 'df_train' is not defined

In [None]:
# Accumulators filled while iterating over the folds
models = []
scores = 0.0
oof = pd.DataFrame()
gain_importance_list, split_importance_list = [], []

# Shuffled K-fold splitter, seeded from the experiment config
kf = KFold(
    n_splits=CFG.n_splits,
    shuffle=True,
    random_state=CFG.random_seed,
)

In [None]:
### CHECK
# Sample weights of 1 / y^2 turn the weighted squared error into the squared
# percentage error, so the 'rmse' objective is trained towards RMSPE.
# NOTE(review): no early stopping is configured in this call, so all
# num_boost_round rounds are presumably trained - confirm this is intended.

for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y)):

    print("Fold :", fold+1)

    # create dataset
    # KFold.split yields positional indices, so rows are selected with .iloc;
    # label-based .loc / y[...] is only correct for a default RangeIndex.
    X_train, y_train = X.iloc[trn_idx], y.iloc[trn_idx]
    X_valid, y_valid = X.iloc[val_idx], y.iloc[val_idx]

    # RMSPE weight
    weights = 1/np.square(y_train)
    lgbm_train = lgbm.Dataset(X_train,y_train,weight = weights)

    weights = 1/np.square(y_valid)
    lgbm_valid = lgbm.Dataset(X_valid,y_valid,reference = lgbm_train,weight = weights)

    # model
    model = lgbm.train(params=CFG.lgbm_params,
                      train_set=lgbm_train,
                      valid_sets=[lgbm_train, lgbm_valid],
                      num_boost_round=5000,
                      feval=feval_RMSPE,
                      verbose_eval=100,
                      categorical_feature = ['stock_id']
                     )

    # validation
    y_pred = model.predict(X_valid, num_iteration=model.best_iteration)

    RMSPE = round(rmspe(y_true = y_valid, y_pred = y_pred),3)
    print(f'Performance of the　prediction: , RMSPE: {RMSPE}')

    # keep scores and models; `scores` ends up as the mean of the fold RMSPEs
    scores += RMSPE / CFG.n_splits
    models.append(model)
    print("*" * 100)

    # --- calc model feature importance ---
    if DO_FEAT_IMP:
        feature_names = X_train.columns.values.tolist()
        gain_importance_df = calc_model_importance(
            model, feature_names=feature_names, importance_type='gain')
        gain_importance_list.append(gain_importance_df)

        split_importance_df = calc_model_importance(
            model, feature_names=feature_names, importance_type='split')
        split_importance_list.append(split_importance_df)

In [None]:
# Persist every fold's model under <output_dir>/lgbm_model
lgbm_output_dir = os.path.join(CFG.output_dir, "lgbm_model")
save_lgbm_models(models=models, output_dir=lgbm_output_dir)

# TODO
- cvを算出
- importanceを可視化
- 

In [None]:
if DO_FEAT_IMP:
    # Mean gain importance over the folds: plot it, then store it as CSV
    # with the feature labels moved from the index into a regular column.
    fold_mean_gain = calc_mean_importance(gain_importance_list)
    plot_importance(fold_mean_gain, title='Model feature importance by gain')
    mean_gain_df = (
        fold_mean_gain
        .reset_index()
        .rename(columns={'index': 'feature_names'})
    )
    mean_gain_df.to_csv('gain_importance_mean.csv', index=False)

In [None]:
# Submission skeleton: one row per test row_id (the target is attached later)
y_pred = df_test.loc[:, ['row_id']]
# Test features: drop the identifier columns
X_test = df_test.drop(columns=['time_id', 'row_id'])

In [14]:
target = np.zeros(len(X_test))

# Take the feature columns from the training matrix X rather than from
# X_valid, which only exists as a leftover of the last training-loop
# iteration. The selection is loop-invariant, so it is done once.
# NOTE(review): X_test has 'time_id' dropped; if X still contains it this
# selection raises KeyError - confirm the train/test column sets match.
X_test_features = X_test[X.columns]

# light gbm models: equal-weight average of the per-fold predictions
for model in models:
    pred = model.predict(X_test_features, num_iteration=model.best_iteration)
    target += pred / len(models)

NameError: name 'X_test' is not defined

In [None]:
# Attach the averaged predictions as the 'target' column of the submission frame
y_pred = y_pred.assign(target=target)

In [None]:
# Write the submission file to the working directory, without the index column
y_pred.to_csv('submission.csv', index=False)