In [100]:
import pandas as pd
import datetime as dt
import numpy as np
import pickle
import jpholiday
from tqdm import tqdm

In [101]:
road = pd.read_csv("../road.csv")

# 予測に要らない "start_name"と"end_name" を削除
road = road.drop(["start_name","end_name"],axis=1)

# "direction"を数値に変換　（これはsubmitにも実装）
# road = pd.get_dummies(road,drop_first=True)

search_data = pd.read_csv("../search_data.csv")

# 時刻まで含まれているので、日付を取り出して"date"に格納
search_data["datetime"] = pd.to_datetime(search_data["datetime"]) 
search_data["date"] = search_data["datetime"].dt.date
search_data["date"] = pd.to_datetime(search_data["date"]) 

search_unspec_data = pd.read_csv("../search_unspec_data.csv")
search_unspec_data["date"] = pd.to_datetime(search_unspec_data["date"]) 

# "date","start_code","end_code"をキーとしてsearch_dataとマージ
merged_search_data = search_data.merge(search_unspec_data[["date","start_code","end_code","search_unspec_1d"]],on=["date","start_code","end_code"],how="left")

train = pd.read_csv("../train.csv")
# 時刻まで含まれているので、日付を取り出して"date"に格納
train["datetime"] = pd.to_datetime(train["datetime"]) 

# train と road をマージ
all_data = train.merge(road,on=["start_code","end_code"],how="left")
# train_all と merged_search_data をマージ
all_data = all_data.merge(merged_search_data,on=["datetime","start_code","end_code"],how="left")

all_data = all_data.drop(["date"],axis=1)

In [102]:
data = all_data[all_data["datetime"]<dt.datetime(2021,4,9)]

In [103]:
allCars_model = pickle.load(open("../../submit/model/allcars_lgbm_42.pkl", "rb"))
OCC_model = pickle.load(open("../../submit/model/OCC_lgbm_42.pkl", "rb"))
search_1h_model = pickle.load(open("../../submit/model/search_1h_lgbm_42.pkl", "rb"))
speed_model = pickle.load(open("../../submit/model/speed_lgbm_42.pkl", "rb"))

In [104]:
def DateFeatGen(data):
    # 日付などの処理
    data["month"] = data["datetime"].dt.month
    data["day"] = data["datetime"].dt.day
    data["dayofweek"] = data["datetime"].dt.dayofweek
    data["hour"] = data["datetime"].dt.hour
    data["is_holiday"] = data["datetime"].map(jpholiday.is_holiday).astype(int)
    return data

In [105]:
def engineer_time_series_features(data):
    outputs = [data]
    grp_df = data.groupby("start_code")[["OCC_1h_old","allCars_1h_old","search_1h_old","speed_1h_old"]]

    for lag in range(1,6):
        # shift
        outputs.append(grp_df.shift(lag).add_prefix(f'shift{lag}_'))
        # diff
        outputs.append(grp_df.diff(lag).add_prefix(f'diff{lag}_'))

    # rolling
    for window in [3]:
        tmp_df = grp_df.rolling(window, min_periods=1)
        # 移動平均を取る
        tmp_df = tmp_df.mean().add_prefix(f'rolling{window}_mean_')
        outputs.append(tmp_df.reset_index(drop=True))

    df = pd.concat(outputs, axis=1)
    return df

In [106]:
input = data[["datetime","start_code","end_code","OCC","allCars","speed","KP","limit_speed","direction","search_1h"]]
train = train[train["allCars"] != 0]
input = pd.get_dummies(input, drop_first=True)

In [107]:
# 予測をまとめるリストを作成
predictions = []
# "start_code" と "end_code" を１組として、順に処理していく
for start_end in input[["start_code", "end_code"]].drop_duplicates().values:
    print(f"start_code is{start_end[0]}, end_code is {start_end[1]}")

    # "start_code" と "end_code" が同じデータのみに絞る つまりデータが24行になる
    data = input[(input["start_code"]==start_end[0]) & (input["end_code"]==start_end[1])].copy()

    # -23~0まで24時間分、以下を繰り返す
    for h in range(-23, 1): 

        # 直近24時間分にする
        data_tmp = data[-24:].copy()
        
        # 現時刻のデータを１時間古いものとする（speed予測時のため）
        data_tmp["OCC_1h_old"] = data_tmp["OCC"].copy()
        data_tmp["allCars_1h_old"] = data_tmp["allCars"].copy()
        data_tmp["search_1h_old"] = data_tmp["search_1h"].copy()
        data_tmp["speed_1h_old"] = data_tmp["speed"].copy()

        # ラグ特徴量を生成する
        data_tmp = engineer_time_series_features(data_tmp)

        # 最後の時間のデータのみにする
        data_tmp = data_tmp[-1:].copy()
        # 日付を１日ずらして、データを再生成
        data_tmp["datetime"] += dt.timedelta(hours=1)
        # 提出用のデータ登録
        Onecode_pred = [(data_tmp["datetime"]).values[0],start_end[0],start_end[1]]
        
        # 日付の特徴量生成    
        data_tmp = DateFeatGen(data_tmp)

        # １時間後の"OCC","allCars","search_1h"を予測、格納
        unused_variable_names = ["datetime","inference_date","start_code","end_code","OCC","allCars","search_1h","speed"]
        data_tmp["OCC"] = 0 # OCC_model.predict(data_tmp[[col for col in data_tmp.columns if col not in unused_variable_names+["OCC"]]])
        data_tmp["allCars"] = 0 # allCars_model.predict(data_tmp[[col for col in data_tmp.columns if col not in unused_variable_names+["allCars"]]])
        data_tmp["search_1h"] = 0 # search_1h_model.predict(data_tmp[[col for col in data_tmp.columns if col not in unused_variable_names+["search_1h"]]])  

        data_tmp = DateFeatGen(data_tmp)
        # 現時刻の"speed"を予測、格納
        unused_variable_names = ["datetime","inference_date","start_code","end_code","speed"]
        pred_speed = 0 # speed_model.predict(data_tmp[[col for col in data_tmp.columns if col not in unused_variable_names]])  
        data_tmp["speed"] = pred_speed

        # 提出用データに格納
        Onecode_pred.append(pred_speed)
        # 大元の提出用データに格納
        predictions.append(Onecode_pred)
        # 新しい時刻をマージ
        data = pd.concat([data,data_tmp])
        # 予測に使う変数はいったん捨てる
        data = data.drop(["month","day","dayofweek","hour","is_holiday","OCC_1h_old","allCars_1h_old","search_1h_old","speed_1h_old"],axis=1).reset_index(drop=True)
prediction = pd.DataFrame(predictions,columns=["datetime","start_code","end_code","prediction"])
prediction["datetime"] -= dt.timedelta(days=1)

start_code is1110210, end_code is 1800006
['OCC', 'allCars', 'KP', 'limit_speed', 'search_1h', 'direction_下り', 'OCC_1h_old', 'allCars_1h_old', 'search_1h_old', 'speed_1h_old', 'shift1_OCC_1h_old', 'shift1_allCars_1h_old', 'shift1_search_1h_old', 'shift1_speed_1h_old', 'diff1_OCC_1h_old', 'diff1_allCars_1h_old', 'diff1_search_1h_old', 'diff1_speed_1h_old', 'shift2_OCC_1h_old', 'shift2_allCars_1h_old', 'shift2_search_1h_old', 'shift2_speed_1h_old', 'diff2_OCC_1h_old', 'diff2_allCars_1h_old', 'diff2_search_1h_old', 'diff2_speed_1h_old', 'shift3_OCC_1h_old', 'shift3_allCars_1h_old', 'shift3_search_1h_old', 'shift3_speed_1h_old', 'diff3_OCC_1h_old', 'diff3_allCars_1h_old', 'diff3_search_1h_old', 'diff3_speed_1h_old', 'shift4_OCC_1h_old', 'shift4_allCars_1h_old', 'shift4_search_1h_old', 'shift4_speed_1h_old', 'diff4_OCC_1h_old', 'diff4_allCars_1h_old', 'diff4_search_1h_old', 'diff4_speed_1h_old', 'shift5_OCC_1h_old', 'shift5_allCars_1h_old', 'shift5_search_1h_old', 'shift5_speed_1h_old', 'di

In [108]:
prediction

Unnamed: 0,datetime,start_code,end_code,prediction
