# Make Data

In [None]:
# ライブラリのインポート
import os
import random

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn import model_selection  

import lightgbm as lgb
import pickle
import warnings
import gc
 
from collections import Counter, defaultdict
from function import feature_engineering
from function import training_library
warnings.simplefilter('ignore')

# グラフ描画用
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('darkgrid')

## Train

In [None]:
class CFG:
    """Namespace of experiment-wide settings; used as plain class attributes."""

    # --- reproducibility / cross-validation ---
    seed = 42      # value handed to the random number generators
    n_folds = 5    # number of cross-validation splits

    # --- LightGBM settings ---
    metric = "mse"
    boosting_type = "dart"

    # Not referenced in the visible part of the notebook.
    # NOTE(review): class-level list, shared by every user of CFG.
    columns = []
    
def seed_everything(seed):
    """Seed Python's ``random`` and NumPy's global RNG, and export PYTHONHASHSEED.

    Args:
        seed: Integer seed applied to every generator.

    Note:
        PYTHONHASHSEED is written to ``os.environ`` as a string. The running
        interpreter reads that variable at start-up, so this presumably only
        matters for processes launched afterwards.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

In [None]:
def train_and_evaluate(target, save=False):
    """Train one LightGBM regressor per CV fold for ``target`` and summarise feature importance.

    The training frame is loaded with ``feature_engineering.read_data(target)``
    and split with ``training_library.GroupTimeSeriesSplit`` using the
    ``section`` column as groups. For every fold a model is trained, its
    validation RMSE is printed, and (optionally) the model is pickled.

    Args:
        target: Name of the column to predict. It is always removed from the
            feature list, regardless of its spelling/casing.
        save: When True, each fold's model is pickled under
            ``../run_test/submit/model/{target}/``.

    Returns:
        A DataFrame indexed by feature name with one ``importance`` column:
        the gain importance averaged over folds, sorted in descending order
        and restricted to values greater than 100.
    """
    train = feature_engineering.read_data(target)

    # Columns that must never be model inputs. ``target`` is added explicitly
    # so the label cannot leak into the features when its name is not spelled
    # exactly like one of the hard-coded entries (e.g. "allCars" vs "allcars").
    excluded = {"datetime", "start_code", "section", "speed", "allcars", "OCC", "search_1h", target}
    features = [col for col in train.columns if col not in excluded]

    params = {
        "objective": "regression",
        "metric": CFG.metric,
        "boosting": CFG.boosting_type,
        "seed": CFG.seed,
        "force_col_wise": True,
        "device": "gpu",
        "verbosity": -1,
    }

    # Out-of-fold predictions, plus a mask of the rows that actually received one.
    oof_predictions = np.zeros(len(train))
    validated = np.zeros(len(train), dtype=bool)
    models = []

    gtss = training_library.GroupTimeSeriesSplit(n_splits=CFG.n_folds)
    groups = train["section"].to_numpy()
    for fold, (trn_ind, val_ind) in enumerate(gtss.split(train, groups=groups)):
        print(" ")
        print("-" * 50)
        print(f"Training fold {fold} with {len(features)} features...")
        x_train, x_val = train[features].iloc[trn_ind], train[features].iloc[val_ind]
        y_train, y_val = train[target].iloc[trn_ind], train[target].iloc[val_ind]
        lgb_train = lgb.Dataset(x_train, y_train)
        lgb_valid = lgb.Dataset(x_val, y_val)

        # Early stopping is passed as a callback: the ``early_stopping_rounds``
        # keyword of ``lgb.train`` no longer exists in LightGBM >= 4.0, while
        # the callback form works on older releases as well.
        # NOTE(review): LightGBM reports that early stopping is unavailable in
        # "dart" mode, so with CFG.boosting_type == "dart" all 100000 rounds
        # may be run - confirm this is intended.
        model = lgb.train(
            params=params,
            train_set=lgb_train,
            num_boost_round=100000,
            valid_sets=[lgb_train, lgb_valid],
            callbacks=[
                lgb.early_stopping(stopping_rounds=50),
                training_library.LgbmProgressBarCallback(description=f"LGBM for {target}"),
            ],
        )

        if save:
            file = f"../run_test/submit/model/{target}/{target}_lgbm_{CFG.seed}_{fold}.pkl"
            # Create the per-target directory on first use and make sure the
            # file handle is closed even if pickling fails.
            os.makedirs(os.path.dirname(file), exist_ok=True)
            with open(file, "wb") as fh:
                pickle.dump(model, fh)

        # Predict the validation rows and store them in the out-of-fold array.
        val_pred = model.predict(x_val)
        oof_predictions[val_ind] = val_pred
        validated[val_ind] = True

        # Fold metric (RMSE).
        score = np.sqrt(mean_squared_error(y_val, val_pred))
        print(f"Our fold {fold} CV score is {score}")

        # Free the per-fold copies before the next fold allocates its own.
        del x_train, x_val, y_train, y_val, lgb_train, lgb_valid
        gc.collect()

        models.append(model)

    # Overall out-of-fold RMSE. Only rows that were part of some validation
    # fold are scored: depending on how the splitter partitions the data, some
    # rows may never be validated, and their placeholder 0.0 prediction would
    # otherwise distort the metric. If every row was validated the result is
    # identical to scoring the full frame.
    if validated.any():
        score = np.sqrt(
            mean_squared_error(train[target].to_numpy()[validated], oof_predictions[validated])
        )
        print(f"Our out of folds CV score is {score}")

    # # Build a DataFrame holding the datetime, the actual target and the prediction
    # data = {"datetime": train["datetime"],
    #         f"{target}": train[f"{target}"],
    #         "pred": oof_predictions}
    # df = pd.DataFrame(data)

    # # Draw a line chart
    # plt.figure(figsize=(50,10))
    # plt.plot(df["datetime"],df[f"{target}"],label=f"{target}")
    # plt.plot(df["datetime"],df["pred"],label="pred")

    # # Set the X/Y axis labels
    # plt.xlabel("Datetime")
    # plt.ylabel("Value")

    # # Show the legend
    # plt.legend()

    # Collect the gain importance of every fold's model. ``pd.concat`` is used
    # because ``DataFrame.append`` was removed in pandas 2.0.
    feature_importances = pd.concat(
        [
            pd.DataFrame(
                {
                    "feature": model.feature_name(),
                    "importance": model.feature_importance(importance_type="gain"),
                    "fold": fold,
                }
            )
            for fold, model in enumerate(models)
        ]
    )

    FeatImportance = pd.DataFrame(
        feature_importances.groupby("feature")["importance"].mean().sort_values(ascending=False)
    )
    # Keep only the features whose mean importance is greater than 100.
    FeatImportance = FeatImportance[FeatImportance["importance"] > 100]

    # Render the table in the notebook.
    display(FeatImportance)

    return FeatImportance

In [None]:
# Fix the random seeds once, before any training run below.
seed_everything(CFG.seed)
# The run for the "OCC" target is currently disabled.
# FeatImportance_OCC = train_and_evaluate("OCC",save=False)

In [None]:
# Train the fold models for the "allCars" target without pickling them and
# keep the returned feature-importance table.
FeatImportance_allCars = train_and_evaluate("allCars",save=False)

In [None]:
# Train the fold models for the "search_1h" target without pickling them and
# keep the returned feature-importance table.
FeatImportance_search_1h = train_and_evaluate("search_1h",save=False)