# LightGBM Baseline Model

make models for each route(A, B, C, D) and prediction period(predict 2 weeks ahead, 4 weeks ahead, ...)

## Dateset
- [diff value from previous day](https://github.com/hiroshi-kuriyama/rail_level_gap/issues/2) of [rolling mean over days](https://github.com/hiroshi-kuriyama/rail_level_gap/issues/4)
## Target Variable
- vel_l values X weeks ahead (X = 2,4,6,8)
## Features

- track variables
  - present lev_l value
  - mean and variance of recent lev_l value
  - mean and variance of recent days and whole year values of track data
- equipment variables
  - row value of equipment variables
- seasonal variables
  - date (encoded by trigonometric function)
  - holiday dummy

In [1]:
import os
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb
from hyperopt import hp, tpe, Trials, fmin
%matplotlib inline
# from utils import data_process as dp

In [2]:
input_dir = '../input/'
working_dir = '../working/'
output_dir = '../output/'

## Read data

In [3]:
def read_track(line_name='a'):
    track_df = pd.read_csv(os.path.join(input_dir, 'track_{}.csv'.format(line_name.upper())))
    col_names_track = ['date', 'kilo', 'lev_l', 'lev_r', 'cur_l', 'cur_r', 'cant', 'width', 'speed']
    track_df.columns = col_names_track
    track_df['date'] = pd.to_datetime(track_df['date'])
    print('track_{line_name} shape: {shape}'.format(line_name=line_name.upper(), shape=track_df.shape))
    return track_df

In [4]:
sleeper_type_dict = {
    1: 'pc',
    2: 'wooden',
    3: 'junction',
    4: 'short',
    5: 'synthetic',
    6: 'synth_junc',
    7: 'symth_short',
    8: 'other'
}

In [5]:
def read_equ(line_name='a'):
    equ_df = pd.read_csv(os.path.join(input_dir, 'equipment_{}.csv'.format(line_name.upper())))
    col_names_equ = ['kilo', 'is_ballast', 'is_long', 'sleeper_type', 'is_bridge', 'is_crossing', 'gross_ton', 'radius', 'is_unreliable']
    equ_df.columns = col_names_equ
    equ_df['sleeper_type'] = equ_df['sleeper_type'].replace(sleeper_type_dict).astype('category')
    print('equ_{line_name} shape: {shape}'.format(line_name=line_name.upper(), shape=equ_df.shape))
    return equ_df

In [6]:
abcd_list = ['a', 'b', 'c', 'd']
track = {}
equ = {}
for abcd in abcd_list:
    track[abcd] = read_track(abcd)
    equ[abcd] = read_equ(abcd)

track_A shape: (10185690, 9)
equ_A shape: (27906, 9)
track_B shape: (7815753, 9)
equ_B shape: (21531, 9)
track_C shape: (20324660, 9)
equ_C shape: (55684, 9)
track_D shape: (5601687, 9)
equ_D shape: (15691, 9)


In [7]:
# degrade data types to save memory
def degrade_dtypes(df):
    for col in df.columns:
        if df[col].dtype=='int64':
            df[col] = df[col].astype('int32')
        if df[col].dtype=='float64':
            df[col] = df[col].astype('float32')
    return df

In [8]:
for abcd in abcd_list:
    track[abcd] = degrade_dtypes(track[abcd])
    equ[abcd] = degrade_dtypes(equ[abcd])

## Data Processing
### rolling average, diff

In [9]:
# rolling average params
roll_params = {
    'A': {'window': 21, 'min_periods': 14},
    'B': {'window': 14, 'min_periods': 7},
    'C': {'window': 14, 'min_periods': 7},
    'D': {'window': 14, 'min_periods': 7}
}

In [10]:
def roll_diff(track, abcd):
        # pivot: row is date, col is kilo
        lev_df = track[abcd].pivot(index='date', columns='kilo', values='lev_l')
        lev_df.columns = lev_df.columns.astype('str')
        # rolling average
        lev_df_ra = lev_df.rolling(**roll_params[abcd.upper()], center=True, axis=0).mean()
        # diff
        lev_df_ra_diff = lev_df_ra.diff()
        # reverse pivot
        return pd.melt(lev_df_ra_diff.reset_index(), id_vars='date', value_name='lev_l_diff')  

In [11]:
lev_ra_diff = {}
for abcd in abcd_list:
    lev_ra_diff[abcd] = roll_diff(track, abcd)

In [12]:
lev_ra_diff[abcd].head()

Unnamed: 0,date,kilo,lev_l_diff
0,2017-04-09,10000,
1,2017-04-10,10000,0.015
2,2017-04-11,10000,0.011667
3,2017-04-12,10000,-0.040667
4,2017-04-13,10000,0.0


## Feature processing

In [13]:
track_whole_mean = {}
for abcd in abcd_list:
    track_whole_mean[abcd] = track[abcd].groupby('kilo').mean()
    track_whole_mean[abcd].columns = [i + '_w_mean' for i in track_whole_mean[abcd].columns]
    track_whole_mean[abcd] = track_whole_mean[abcd].reset_index()
    track_whole_mean[abcd]['kilo'] = track_whole_mean[abcd]['kilo'].astype(str)

## Target variable

In [14]:
lev_ra_diff_tgt = {}
for abcd in abcd_list:
    lev_ra_diff_tgt_tmp = lev_ra_diff[abcd].copy()
#     lev_ra_diff_tgt_tmp = lev_ra_diff_tgt_tmp.rename({'lev_l_diff': 'lev_l_diff_tgt'})
    lev_ra_diff_tgt_tmp['date_tgt'] = lev_ra_diff_tgt_tmp['date']
    lev_ra_diff_tgt_tmp['date'] = lev_ra_diff_tgt_tmp['date'] - datetime.timedelta(weeks=2)
    lev_ra_diff_tgt_tmp = lev_ra_diff_tgt_tmp.rename(columns={'lev_l_diff': 'lev_l_diff_tgt'})
    lev_ra_diff_tgt[abcd] = lev_ra_diff_tgt_tmp
    
del lev_ra_diff_tgt_tmp

In [15]:
lev_ra_diff_tgt[abcd].head()

Unnamed: 0,date,kilo,lev_l_diff_tgt,date_tgt
0,2017-03-26,10000,,2017-04-09
1,2017-03-27,10000,0.015,2017-04-10
2,2017-03-28,10000,0.011667,2017-04-11
3,2017-03-29,10000,-0.040667,2017-04-12
4,2017-03-30,10000,0.0,2017-04-13


## Merge Dataset

In [16]:
data_all = {}
for abcd in abcd_list:
    # copy target df
    data_all_tmp = lev_ra_diff_tgt[abcd].copy()
    # merge features
    data_all_tmp = data_all_tmp.merge(lev_ra_diff[abcd], on=['date', 'kilo'])
    data_all_tmp = data_all_tmp.merge(track_whole_mean[abcd], on='kilo')

    data_all_tmp['line_name'] = abcd.upper()
    data_all[abcd] = data_all_tmp
    
del lev_ra_diff_tgt, lev_ra_diff, track_whole_mean, data_all_tmp, track, equ

## Make Dataset

In [17]:
# union
data_u_all = pd.concat([data_all[abcd] for abcd in abcd_list], axis=0)
data_u_all['line_name'] = data_u_all['line_name'].astype('category')
del data_all

In [18]:
data_u_all.to_pickle(os.path.join(input_dir, 'data_u_all_lgbm_baseline.pkl'))
data_u_all = pd.read_pickle(os.path.join(input_dir, 'data_u_all_lgbm_baseline.pkl'))

In [19]:
data_u_all = data_u_all.dropna(how='any', axis=0, subset=['lev_l_diff_tgt', 'lev_l_diff'])

In [21]:
def X_y_split(data, tgt_col='lev_l_diff_tgt', index_cols=['date', 'date_tgt', 'kilo']):
    data = data.set_index(index_cols)
    y = data[tgt_col]
    X = data.drop(tgt_col, axis=1)
    return X, y

In [22]:
# split train and test
threshold_date = '2018-01-01'
X_dev, y_dev = X_y_split(data_u_all.query('date<"{}"'.format(threshold_date)))
X_val, y_val = X_y_split(data_u_all.query('date>"{}"'.format(threshold_date)))

In [23]:
hyperopt_params = {
    'num_leaves': hp.uniform('num_leaves', 500, 700),
    'min_child_samples': hp.uniform('min_child_samples', 200, 300),
    'learning_rate': hp.uniform('learning_rate', 0.05, 0.2),
    'feature_fraction': hp.uniform('feature_fraction', 0.5, 0.7),
    'bagging_freq': hp.uniform('bagging_freq', 6, 8)
}

In [24]:
fit_params = {
    'early_stopping_rounds':20,
    'eval_set':[(X_val, y_val)],
    'eval_metric': 'mean_absolute_error',
    'verbose': False
}

In [25]:
# float to int
tobe_int_params = ['num_leaves', 'min_child_samples', 'bagging_freq']
def int_param_encoder(params):
    for param in tobe_int_params:
        if param in params:
            params[param] = int(params[param])
    return params

In [26]:
num_evals_i = 0
def objective(hyperopt_params):
    # パラメータを適切な型(int)に変換
    hyperopt_params = int_param_encoder(hyperopt_params)
    # モデルのインスタンス化
    model = lgb.LGBMRegressor(**hyperopt_params, objective='mean_absolute_error', n_estimators=1000, random_state=0)
    # trainデータを使ってモデルの学習
    model.fit(X_dev, y_dev, **fit_params)
    # validationデータを使用して、ラベルの予測
    y_val_pred = model.predict(X_val, num_iteration=model.best_iteration_)
    # 予測ラベルと正解ラベルを使用してMAEを計算
    mae_score = mean_absolute_error(y_val, y_val_pred)
    global num_evals_i
    num_evals_i += 1
    print('[{num_evals}] best_ite: {best_ite}\tMAE: {mae_score}'.format(num_evals=str(num_evals_i).zfill(4), mae_score=mae_score, best_ite=model.best_iteration_))
    return mae_score

In [None]:
# iterationする回数
max_evals = 10
# 試行の過程を記録するインスタンス
trials = Trials()

best = fmin(
    # 最小化する値を定義した関数
    fn=objective,
    # 探索するパラメータのdictもしくはlist
    space=hyperopt_params,
    # どのロジックを利用するか、基本的にはtpe.suggestでok
    algo=tpe.suggest,
    max_evals=max_evals,
    trials=trials,
    # 試行の過程を出力
    verbose=-1,
    rstate=np.random.RandomState(0)
)

In [29]:
int_param_encoder(best)

{'bagging_freq': 7,
 'feature_fraction': 0.6570918110525219,
 'learning_rate': 0.15106159518328655,
 'min_child_samples': 266,
 'num_leaves': 607}

In [30]:
del X_dev, y_dev, X_val, y_val
# make submission
best_params = int_param_encoder(best)
X, y = X_y_split(data_u_all)
model_fulldata = lgb.LGBMRegressor(**best_params, n_estimators=10, random_state=0)
model_fulldata.fit(X, y)

LGBMRegressor(bagging_freq=7, boosting_type='gbdt', class_weight=None,
       colsample_bytree=1.0, feature_fraction=0.6570918110525219,
       importance_type='split', learning_rate=0.15106159518328655,
       max_depth=-1, min_child_samples=266, min_child_weight=0.001,
       min_split_gain=0.0, n_estimators=10, n_jobs=-1, num_leaves=607,
       objective=None, random_state=0, reg_alpha=0.0, reg_lambda=0.0,
       silent=True, subsample=1.0, subsample_for_bin=200000,
       subsample_freq=0)

In [31]:
# 最後から2週間の値の平均値を入力としてモデルで2週間後の数値を予測

In [31]:
data_u_all.head()

Unnamed: 0,date,kilo,lev_l_diff_tgt,date_tgt,lev_l_diff,lev_l_w_mean,lev_r_w_mean,cur_l_w_mean,cur_r_w_mean,cant_w_mean,width_w_mean,speed_w_mean,line_name
0,2017-04-01,10000,0.02152,2017-04-15,,-1.330575,-0.679421,0.101544,0.221274,0.281081,3.711467,65.766129,A
1,2017-04-02,10000,-0.007368,2017-04-16,,-1.330575,-0.679421,0.101544,0.221274,0.281081,3.711467,65.766129,A
2,2017-04-03,10000,-0.007368,2017-04-17,,-1.330575,-0.679421,0.101544,0.221274,0.281081,3.711467,65.766129,A
3,2017-04-04,10000,-0.040526,2017-04-18,,-1.330575,-0.679421,0.101544,0.221274,0.281081,3.711467,65.766129,A
4,2017-04-05,10000,0.005789,2017-04-19,,-1.330575,-0.679421,0.101544,0.221274,0.281081,3.711467,65.766129,A


In [59]:
index_tmp = data_u_all.groupby(['line_name', 'kilo'])['date_tgt'].max()

last_date_values_df = index_tmp.reset_index()
last_date_values_df = last_date_values_df.merge(data_u_all, how='left', on=['line_name', 'kilo', 'date_tgt'])
last_date_values_df['date'] = last_date_values_df['date_tgt']
last_date_values_df['lev_l_diff'] = last_date_values_df['lev_l_diff_tgt']
last_date_values_df = last_date_values_df[['line_name', 'kilo', 'date', 'lev_l_diff']]

In [55]:
X_pred_f, y_pred_f_tmp = X_y_split(last_date_values_df)

In [56]:
y_pred_f = model_fulldata.predict(X_pred_f)

In [60]:
last_date_values_df.head()

Unnamed: 0,line_name,kilo,date,lev_l_diff
0,A,10000,2018-03-15,-0.252524
1,A,10001,2018-03-15,-0.201143
2,A,10002,2018-03-15,-0.137143
3,A,10003,2018-03-15,-0.081667
4,A,10004,2018-03-15,-0.03219
