# LightGBM Baseline Model

Build one model for each route (A, B, C, D) and each prediction horizon (2 weeks ahead, 4 weeks ahead, ...).

## Dataset
- [diff value from previous day](https://github.com/hiroshi-kuriyama/rail_level_gap/issues/2) of [rolling mean over days](https://github.com/hiroshi-kuriyama/rail_level_gap/issues/4)
## Target Variable
- lev_l values X weeks ahead (X = 2, 4, 6, 8)
## Features

- track variables
  - present lev_l value
  - mean and variance of recent lev_l value
  - mean and variance of recent days and whole year values of track data
- equipment variables
  - raw values of equipment variables
- seasonal variables
  - date (encoded by trigonometric function)
  - holiday dummy

In [1]:
import os
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import lightgbm as lgb
%matplotlib inline
# from utils import data_process as dp

In [2]:
input_dir = '../input/'
working_dir = '../working/'
output_dir = '../output/'

## Read data

In [3]:
def read_track(line_name='a'):
    """Load the track measurement CSV for one route.

    Reads ``track_<LINE>.csv`` from ``input_dir`` (``<LINE>`` is ``line_name``
    upper-cased), replaces the header with the nine short column names used
    throughout this notebook, parses ``date`` to datetime, prints the shape
    and returns the DataFrame.
    """
    route = line_name.upper()
    csv_path = os.path.join(input_dir, 'track_{}.csv'.format(route))

    track_df = pd.read_csv(csv_path)
    # Positional rename: the file is expected to have exactly these 9 columns.
    track_df.columns = [
        'date', 'kilo', 'lev_l', 'lev_r', 'cur_l', 'cur_r', 'cant', 'width', 'speed',
    ]
    track_df['date'] = pd.to_datetime(track_df['date'])

    print('track_{line_name} shape: {shape}'.format(line_name=route, shape=track_df.shape))
    return track_df

In [4]:
# Integer sleeper-type code (1..8) -> readable label; applied to the
# 'sleeper_type' column when the equipment files are read.
# NOTE(review): 'symth_short' looks like a typo for 'synth_short'. It is kept
# verbatim because the label becomes a data value that other code may match on.
sleeper_type_dict = dict(enumerate(
    (
        'pc',
        'wooden',
        'junction',
        'short',
        'synthetic',
        'synth_junc',
        'symth_short',
        'other',
    ),
    start=1,
))

In [5]:
def read_equ(line_name='a'):
    """Load the equipment CSV for one route.

    Reads ``equipment_<LINE>.csv`` from ``input_dir`` (``<LINE>`` is
    ``line_name`` upper-cased), replaces the header with the nine short column
    names used in this notebook, maps the ``sleeper_type`` codes to labels via
    ``sleeper_type_dict``, prints the shape and returns the DataFrame.
    """
    route = line_name.upper()
    csv_path = os.path.join(input_dir, 'equipment_{}.csv'.format(route))

    equ_df = pd.read_csv(csv_path)
    # Positional rename: the file is expected to have exactly these 9 columns.
    equ_df.columns = [
        'kilo', 'is_ballast', 'is_long', 'sleeper_type', 'is_bridge',
        'is_crossing', 'gross_ton', 'radius', 'is_unreliable',
    ]
    # replace() leaves any code that is missing from the dict unchanged.
    equ_df['sleeper_type'] = equ_df['sleeper_type'].replace(sleeper_type_dict)

    print('equ_{line_name} shape: {shape}'.format(line_name=route, shape=equ_df.shape))
    return equ_df

In [6]:
# Load the track and equipment tables of every route, keyed by route letter.
# The loop variable is deliberately named `abcd` and left bound after the
# loop: later cells index the per-route dicts with it.
abcd_list = list('abcd')
track, equ = {}, {}
for abcd in abcd_list:
    track[abcd] = read_track(line_name=abcd)
    equ[abcd] = read_equ(line_name=abcd)

track_A shape: (10185690, 9)
equ_A shape: (27906, 9)
track_B shape: (7815753, 9)
equ_B shape: (21531, 9)
track_C shape: (20324660, 9)
equ_C shape: (55684, 9)
track_D shape: (5601687, 9)
equ_D shape: (15691, 9)


## Data Processing
### rolling average, diff

In [7]:
# Rolling-average settings per route (keys are upper-case route letters).
# Route A uses a 21-row window; B, C and D share a 14-row window.
roll_params = {'A': {'window': 21, 'min_periods': 14}}
roll_params.update(
    {route: {'window': 14, 'min_periods': 7} for route in 'BCD'}
)

In [19]:
def roll_diff(track, abcd, roll_kwargs=None):
    """Return the row-to-row change of the centred rolling mean of ``lev_l``.

    Parameters
    ----------
    track : mapping
        Route key -> DataFrame with at least 'date', 'kilo' and 'lev_l'
        columns, holding one row per (date, kilo) pair.
    abcd : str
        Route key to process; also selects the default rolling settings.
    roll_kwargs : dict, optional
        Keyword arguments forwarded to ``DataFrame.rolling`` (typically
        'window' and 'min_periods'). Must not contain 'center', which is
        always set here. Defaults to ``roll_params[abcd.upper()]``.

    Returns
    -------
    DataFrame
        Long format with columns 'date', 'kilo' (as str) and 'lev_l_diff'.
        The first date of every kilo is NaN because it has no previous row.
    """
    if roll_kwargs is None:
        roll_kwargs = roll_params[abcd.upper()]

    # pivot: row is date, col is kilo
    lev_df = track[abcd].pivot(index='date', columns='kilo', values='lev_l')
    lev_df.columns = lev_df.columns.astype('str')

    # Rolling average down the date axis. The `axis` keyword is not passed:
    # rows are the default and the keyword is deprecated in recent pandas.
    # NOTE(review): center=True means each smoothed value also uses rows that
    # come after its own date -- confirm that is acceptable wherever this
    # output is used as a model input.
    lev_df_ra = lev_df.rolling(**roll_kwargs, center=True).mean()

    # diff against the previous row (previous date)
    lev_df_ra_diff = lev_df_ra.diff()

    # reverse pivot back to one row per (date, kilo)
    return pd.melt(lev_df_ra_diff.reset_index(), id_vars='date', value_name='lev_l_diff')

In [20]:
# Rolling-mean difference table for every route, keyed by route letter.
# Kept as a plain loop on purpose: `abcd` stays bound to the last route
# afterwards and later cells index the per-route dicts with it.
lev_ra_diff = {}
for abcd in abcd_list:
    lev_ra_diff[abcd] = roll_diff(track, abcd)

In [21]:
# Preview only. `abcd` is whatever the most recently run loop left it as
# (the last entry of abcd_list when the cells are run top to bottom).
lev_ra_diff[abcd].head()

Unnamed: 0,date,kilo,lev_l_diff
0,2017-04-09,10000,
1,2017-04-10,10000,0.015
2,2017-04-11,10000,0.011667
3,2017-04-12,10000,-0.040667
4,2017-04-13,10000,0.0


## Feature processing

In [43]:
# Per-kilo mean of every numeric track column over all dates in the file,
# one table per route. Value columns get a '_w_mean' suffix and 'kilo' is
# cast to str so it can be merged with the str 'kilo' column of the
# rolling-diff tables.
track_whole_mean = {}
for abcd in abcd_list:
    # numeric_only=True is passed explicitly because the default changed in
    # pandas 2.0. Without it, newer versions also average the datetime 'date'
    # column and produce an unintended 'date_w_mean' column.
    whole_mean = track[abcd].groupby('kilo').mean(numeric_only=True)
    whole_mean = whole_mean.add_suffix('_w_mean').reset_index()
    whole_mean['kilo'] = whole_mean['kilo'].astype(str)
    track_whole_mean[abcd] = whole_mean

## Target variable

In [31]:
# Target table per route: the same lev_l_diff values, re-keyed so that each
# row's 'date' is two weeks before the date the value was observed on. The
# observation date is kept in 'date_tgt'. Only the 2-week horizon is built.
lev_ra_diff_tgt = {}
for abcd in abcd_list:
    # rename() returns a new frame, so lev_ra_diff[abcd] itself is untouched.
    lev_ra_diff_tgt_tmp = lev_ra_diff[abcd].rename(columns={'lev_l_diff': 'lev_l_diff_tgt'})
    lev_ra_diff_tgt_tmp['date_tgt'] = lev_ra_diff_tgt_tmp['date']
    lev_ra_diff_tgt_tmp['date'] = lev_ra_diff_tgt_tmp['date_tgt'] - datetime.timedelta(weeks=2)
    lev_ra_diff_tgt[abcd] = lev_ra_diff_tgt_tmp

In [32]:
# Preview only. `abcd` is whatever the most recently run loop left it as
# (the last entry of abcd_list when the cells are run top to bottom).
lev_ra_diff_tgt[abcd].head()

Unnamed: 0,date,kilo,lev_l_diff_tgt,date_tgt
0,2017-03-26,10000,,2017-04-09
1,2017-03-27,10000,0.015,2017-04-10
2,2017-03-28,10000,0.011667,2017-04-11
3,2017-03-29,10000,-0.040667,2017-04-12
4,2017-03-30,10000,0.0,2017-04-13


## Merge Dataset

In [45]:
# Assemble the modelling table for the single route currently bound to `abcd`:
# start from the target rows, attach the lev_l_diff value of the same
# (date, kilo), then the per-kilo whole-period means. Both merges use the
# default inner join, so target rows without a match are dropped.
# merge() returns a new frame, so the source tables are left untouched.
data_all_tmp = (
    lev_ra_diff_tgt[abcd]
    .merge(lev_ra_diff[abcd], on=['date', 'kilo'])
    .merge(track_whole_mean[abcd], on='kilo')
)