# Make Data

In [69]:
# ライブラリのインポート
import os
import random

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split 

import lightgbm as lgb
import pickle
import warnings
import gc

from collections import Counter, defaultdict

warnings.simplefilter('ignore')

## hide_function

In [70]:
def read_data():
    """Load the raw CSV files and join them into one training frame.

    Reads ``../road.csv``, ``../search_data.csv``,
    ``../search_unspec_data.csv`` and ``../train.csv`` (relative to the
    current working directory).

    Returns:
        DataFrame with the rows of ``train.csv`` left-joined with the road
        attributes (on ``start_code``/``end_code``) and with the search
        features (on ``datetime``/``start_code``/``end_code``). The helper
        ``date`` column used for the daily join is dropped.
    """
    section_keys = ["start_code", "end_code"]

    # Road master table: the section names are not needed for prediction.
    road = pd.read_csv("../road.csv").drop(["start_name", "end_name"], axis=1)
    # One-hot encode the remaining categorical columns (e.g. "direction"),
    # dropping the first level. Original author's note: the same conversion
    # is also implemented on the submit side.
    road = pd.get_dummies(road, drop_first=True)

    # Search records carry a full timestamp; derive a midnight "date" column
    # so they can be joined with the per-day table below.
    search = pd.read_csv("../search_data.csv")
    search["datetime"] = pd.to_datetime(search["datetime"])
    search["date"] = pd.to_datetime(search["datetime"].dt.date)

    daily = pd.read_csv("../search_unspec_data.csv")
    daily["date"] = pd.to_datetime(daily["date"])

    # Attach the daily "search_unspec_1d" value to every search record of
    # the same date and road section.
    daily_keys = ["date"] + section_keys
    search = search.merge(
        daily[daily_keys + ["search_unspec_1d"]],
        on=daily_keys,
        how="left",
    )

    observations = pd.read_csv("../train.csv")
    observations["datetime"] = pd.to_datetime(observations["datetime"])

    # train <- road attributes, then train <- search features.
    joined = observations.merge(road, on=section_keys, how="left")
    joined = joined.merge(search, on=["datetime"] + section_keys, how="left")

    return joined.drop(["date"], axis=1)

In [71]:
# 日付などの処理
import jpholiday
import datetime as dt
def DateFeatGen(data):
    """Add calendar features derived from the ``datetime`` column.

    Adds ``month``, ``day``, ``dayofweek`` and ``hour`` (taken from the
    ``.dt`` accessor, so ``datetime`` must be a datetime-like column) and
    ``is_holiday`` (the result of ``jpholiday.is_holiday`` cast to int).

    The columns are written into ``data`` itself, which is also returned.
    """
    stamps = data["datetime"]

    # Plain calendar components, in the order the columns are appended.
    for part in ("month", "day", "dayofweek", "hour"):
        data[part] = getattr(stamps.dt, part)

    # 1 when jpholiday reports the timestamp as a holiday, else 0.
    data["is_holiday"] = stamps.map(jpholiday.is_holiday).astype(int)
    return data

In [72]:
def engineer_time_series_features(data):
    """Append per-``start_code`` lag, difference and rolling-mean features.

    For each of ``OCC_1h_old``, ``allCars_1h_old``, ``search_1h_old`` and
    ``speed_1h_old`` the following columns are computed within each
    ``start_code`` group, in the row order of ``data``:

    * ``shift{lag}_<col>`` and ``diff{lag}_<col>`` for lag = 1..5
    * ``rolling3_mean_<col>`` (window of 3 rows, ``min_periods=1``)

    Args:
        data: DataFrame containing ``start_code`` and the four ``*_1h_old``
            columns.

    Returns:
        A new DataFrame with the same rows and index as ``data`` and the
        feature columns appended on the right.
    """
    base_cols = ["OCC_1h_old", "allCars_1h_old", "search_1h_old", "speed_1h_old"]
    grouped = data.groupby("start_code")[base_cols]

    pieces = [data]

    for lag in range(1, 6):
        # Value `lag` rows earlier within the same start_code.
        pieces.append(grouped.shift(lag).add_prefix(f"shift{lag}_"))
        # Change relative to the value `lag` rows earlier.
        pieces.append(grouped.diff(lag).add_prefix(f"diff{lag}_"))

    for window in [3]:
        # Moving average over the current and preceding rows of the group.
        rolled = grouped.rolling(window, min_periods=1).mean()
        # The rolling result is indexed by (start_code, original index) and
        # ordered by group. Drop only the group-key level so the original
        # row labels survive and the concat below aligns every value with
        # the row it was computed from. (Resetting to 0..n-1 instead would
        # attach values to the wrong rows and add spurious all-NaN rows.)
        rolled = rolled.reset_index(level=0, drop=True)
        pieces.append(rolled.add_prefix(f"rolling{window}_mean_"))

    return pd.concat(pieces, axis=1)

In [73]:
def TargetGenFunc(data):
    """Turn the measurement columns into next-step prediction targets.

    The current values of ``OCC``, ``search_1h``, ``allCars`` and ``speed``
    are preserved in ``OCC_1h_old``, ``search_1h_old``, ``allCars_1h_old``
    and ``speed_1h_old``. The four original columns are then replaced by the
    value of the following row within the same ``start_code`` group, so each
    row pairs the current observation with the next one as its target. The
    last row of every group has no following row and gets NaN targets.

    Args:
        data: DataFrame containing ``start_code`` and the four measurement
            columns. It is not modified.

    Returns:
        A new DataFrame with the same rows, row order and index as ``data``.
    """
    target_cols = ["speed", "OCC", "search_1h", "allCars"]

    # Work on a copy so the caller's frame is left untouched.
    out = data.copy()

    # Keep the current observations as features.
    out["OCC_1h_old"] = out["OCC"]
    out["search_1h_old"] = out["search_1h"]
    out["allCars_1h_old"] = out["allCars"]
    out["speed_1h_old"] = out["speed"]

    # One vectorised pass: shift every target up by one row inside its
    # start_code group. The result keeps the original index, so it can be
    # assigned straight back and all other columns (including start_code)
    # stay exactly as they were.
    out[target_cols] = out.groupby("start_code")[target_cols].shift(-1)
    return out

In [74]:
from typing import Optional
from collections import OrderedDict
from lightgbm.callback import CallbackEnv
from tqdm.auto import tqdm

class LgbmProgressBarCallback:
    """LightGBM training callback that shows boosting progress in a tqdm bar.

    Pass an instance in the ``callbacks`` list of ``lgb.train``. Each call
    advances the bar by one step and shows the latest evaluation results as
    the bar's postfix.
    """

    description: Optional[str]
    pbar: tqdm

    def __init__(self, description: Optional[str] = None):
        # The bar starts without a total; the total is only known once the
        # first callback invocation supplies the iteration range.
        self.pbar = tqdm()
        self.description = description

    def __call__(self, env: CallbackEnv):
        bar = self.pbar

        # On the first iteration only: size the bar and set its label.
        if env.iteration == env.begin_iteration:
            bar.reset(total=env.end_iteration - env.begin_iteration)
            bar.set_description(self.description, refresh=False)

        # Show the current evaluation results, if any were produced.
        results = env.evaluation_result_list
        if len(results) > 0:
            # An OrderedDict keeps the display order stable between updates.
            # Each entry is presumably (dataset name, metric name, value, ...)
            # -- confirm against LightGBM's CallbackEnv.
            metrics = OrderedDict()
            for entry in results:
                metrics[f"{entry[0]}:{entry[1]}"] = str(entry[2])
            bar.set_postfix(ordered_dict=metrics, refresh=False)

        # Advance by one boosting round and redraw.
        bar.update(1)
        bar.refresh()

## Train

In [75]:
class CFG:
    """Run-wide settings, read as plain class attributes (``CFG.seed`` etc.)."""

    # Seed used for the RNGs, the train/validation split and LightGBM.
    seed = 42
    # Passed to LightGBM as the "boosting" parameter.
    boosting_type = "dart"
    # Passed to LightGBM as the "metric" parameter.
    metric = "mse"

In [76]:
def seed_everything(seed):
    """Seed Python's ``random``, NumPy's global RNG and ``PYTHONHASHSEED``.

    Note that ``PYTHONHASHSEED`` is read by the interpreter at start-up, so
    setting it here only affects child processes, not the running one.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

In [77]:
# Load the joined data and keep only the columns used for modelling.
train = read_data()[
    [
        "datetime",
        "start_code",
        "OCC",
        "allCars",
        "speed",
        "KP",
        "limit_speed",
        "direction_下り",
        "search_1h",
    ]
]
# Discard rows where no cars were observed.
train = train.loc[train["allCars"] != 0]
# Calendar features first, then shift the measurements into next-step targets.
train = TargetGenFunc(DateFeatGen(train))
# Rows with any missing value (including the target-less last row of each
# start_code) are removed before the lag/rolling features are built.
train = engineer_time_series_features(train.dropna(how="any"))

In [78]:
def Inference_lgbm(train,target):
    """Train a LightGBM regressor for ``target`` and pickle the fitted model.

    Despite its name, this function fits a model; it does not predict.

    Every column of ``train`` except ``datetime``, ``start_code`` and the
    four target columns (``speed``, ``allCars``, ``OCC``, ``search_1h``) is
    used as a feature. The rows are split at random into training and
    validation parts with ``train_test_split`` (default proportions, seeded
    with ``CFG.seed``), and the booster is written to
    ``../../submit/model/{target}_lgbm_{CFG.seed}.pkl``.

    Args:
        train: Feature frame that also contains the target columns.
        target: Name of the column to predict.

    Returns:
        None. The trained model is only written to disk.
    """
    # Identifier and target columns must not leak into the feature set.
    excluded = {"datetime","start_code","speed","allCars","OCC","search_1h"}
    features = [col for col in train.columns if col not in excluded]

    params = {
        "objective":"regression",
        "metric":CFG.metric,
        "boosting": CFG.boosting_type,
        "seed":CFG.seed,
        "num_leaves":15,
        "max_depth":7,
        "min_data_in_leaf":20,
        "bagging_fraction":0.8,
        "bagging_freq":3,
        "feature_fraction":0.9,
        "lambda_l1":2,
        "lambda_l2":2,
        "learning_rate":0.01,
        "n_jobs":-1,
        "force_col_wise":True,
        "device":"gpu",
        "verbosity": -1
        }

    X_train, X_valid, y_train, y_valid = train_test_split(
        train[features],
        train[target],
        shuffle=True,
        random_state=CFG.seed,
    )
    lgb_train = lgb.Dataset(X_train,y_train)
    lgb_valid = lgb.Dataset(X_valid,y_valid)

    # Early stopping is requested through the callback API: the
    # `early_stopping_rounds=` keyword of lgb.train is not accepted by
    # LightGBM 4.x, while the callback works on both 3.x and 4.x.
    # NOTE(review): LightGBM reports that early stopping is not available in
    # dart mode, so with CFG.boosting_type == "dart" training appears to run
    # for the full num_boost_round -- confirm, and lower num_boost_round or
    # change the boosting type if that is not intended.
    model = lgb.train(
        params=params,
        train_set=lgb_train,
        num_boost_round=1000000,
        valid_sets=[lgb_train,lgb_valid],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),
            LgbmProgressBarCallback(description=f"LGBM for {target}"),
        ],
        )

    file = f"../../submit/model/{target}_lgbm_{CFG.seed}.pkl"
    # Make sure a long training run is not lost to a missing directory.
    os.makedirs(os.path.dirname(file), exist_ok=True)
    # The context manager guarantees the file is flushed and closed.
    with open(file, 'wb') as fh:
        pickle.dump(model, fh)

In [79]:
# Fix the random seeds, then train and save the model for the "OCC" target.
seed_everything(CFG.seed)
Inference_lgbm(train,"OCC")

0it [00:00, ?it/s]

KeyboardInterrupt: 

In [None]:
# Train and save the model for the "allCars" target.
Inference_lgbm(train,"allCars")

In [None]:
# Train and save the model for the "search_1h" target.
Inference_lgbm(train,"search_1h")