# 평가지표 함수화

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb

In [18]:
# Load the hourly supply CSV into `total`; the file is read with the CP949 encoding.
total = pd.read_csv('data/한국가스공사_시간별 공급량_20181231.csv', encoding='cp949')

In [19]:
# Convert the '구분' column to integer codes, numbered in order of first appearance.
# `d_map` is kept so the same encoding can be applied to other frames later.
d_map = {label: code for code, label in enumerate(total['구분'].unique())}
total['구분'] = total['구분'].map(d_map)

In [20]:
# '연월일' is loaded as object dtype, so parse it to datetime and then
# derive one column per calendar component.
total['연월일'] = pd.to_datetime(total['연월일'])
for part in ('year', 'month', 'day', 'weekday'):
    total[part] = getattr(total['연월일'].dt, part)

## 훈련용 / 검증용 데이터 분리하기


In [21]:
# Rows from 2013-2017 are used for training and rows from 2018 for validation.
train_years = list(range(2013, 2018))
val_years = [2018]

train = total.loc[total['year'].isin(train_years)]  # training data
val = total.loc[total['year'].isin(val_years)]  # validation (hold-out) data

In [22]:
# Model inputs; the target column is '공급량'.
features = ['구분', 'month', 'day', 'weekday', '시간']

# Feature matrix / target pairs for the training and validation splits.
train_x, train_y = train[features], train['공급량']
val_x, val_y = val[features], val['공급량']

# LGBM(ensemble 모델) 학습

In [23]:
# Wrap the training and validation splits in LightGBM Dataset containers.
d_train = lgb.Dataset(train_x, train_y)
d_val = lgb.Dataset(val_x, val_y)

# Regression objective, evaluated with MAE (reported as "l1" in the training log).
params = {
    'objective': 'regression',
    'metric': 'mae',
    'seed': 42
}

# Train for at most 500 rounds. Early stopping and periodic logging are passed
# as callbacks: the `early_stopping_rounds=` / `verbose_eval=` keyword arguments
# were removed from `lgb.train` in LightGBM 4.0 and raise TypeError there.
model = lgb.train(
    params,
    d_train,
    num_boost_round=500,
    valid_sets=[d_val],
    callbacks=[
        lgb.early_stopping(stopping_rounds=10),  # stop after 10 rounds without improvement
        lgb.log_evaluation(period=20),  # print the validation metric every 20 rounds
    ],
)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84
[LightGBM] [Info] Number of data points in the train set: 306768, number of used features: 5
[LightGBM] [Info] Start training from score 934.864036
Training until validation scores don't improve for 10 rounds
[20]	valid_0's l1: 244.857
[40]	valid_0's l1: 174.855
[60]	valid_0's l1: 158.739
[80]	valid_0's l1: 153.323
[100]	valid_0's l1: 150.948
[120]	valid_0's l1: 150.463
Early stopping, best iteration is:
[112]	valid_0's l1: 150.297


### 추론 및 결과

In [24]:
# Load the inference inputs and the sample submission file.
test = pd.read_csv('data/test.csv')
submission = pd.read_csv('data/sample_submission.csv')

In [25]:
# The '일자|시간|구분' key is space-separated; split it once and reuse the pieces:
# piece 0 -> date string, piece 1 -> hour (cast to int), piece 2 -> category label.
key_parts = test['일자|시간|구분'].str.split(' ')
test['일자'] = key_parts.str[0]
test['시간'] = key_parts.str[1].astype(int)
test['구분'] = key_parts.str[2]

In [26]:
# Parse the date string and derive the same calendar columns that were built for `total`.
test['일자'] = pd.to_datetime(test['일자'])
for part in ('year', 'month', 'day', 'weekday'):
    test[part] = getattr(test['일자'].dt, part)

In [27]:
# Apply the integer encoding built from `total` to the test categories.
# NOTE(review): labels missing from `d_map` would become NaN here - confirm none occur.
test['구분'] = test['구분'].map(d_map)

In [28]:
# Select the same feature columns, in the same order, that the model was trained on.
test_x = test[features]

In [29]:
# Predict the target for the test rows with the trained model.
# NOTE(review): `preds` is not written into `submission` in the cells visible here.
preds = model.predict(test_x)

# 평가지표

### 평가 지표 함수화

In [34]:
def _regression_metrics(y_true, y_pred):
    """Return R2, MSE, RMSE and MAE for two equal-length 1-D float arrays."""
    residual = y_true - y_pred
    squared_error = residual ** 2
    mse = float(np.mean(squared_error))
    mae = float(np.mean(np.abs(residual)))
    ss_res = float(np.sum(squared_error))
    ss_tot = float(np.sum((y_true - y_true.mean()) ** 2))
    # R2 is undefined when the target has zero variance; report NaN rather
    # than dividing by zero.
    r2 = 1.0 - ss_res / ss_tot if ss_tot > 0 else float("nan")
    return {"R2": r2, "MSE": mse, "RMSE": mse ** 0.5, "MAE": mae}


def evaluation_index(x, y):
    """Print R2, MSE, RMSE and MAE of the notebook-level `model` on (x, y).

    Predictions are obtained with `model.predict(x)`, where `model` is the
    variable defined at notebook level (not a parameter of this function).

    Parameters
    ----------
    x : feature data accepted by `model.predict`.
    y : array-like of true target values (pandas Series, list, ndarray, ...).
        Its name and index are irrelevant; only the values are used.

    Returns
    -------
    None. The four metrics are printed, one per line, as `NAME = value`.

    Raises
    ------
    ValueError
        If `y` is empty or its length differs from the number of predictions.
    """
    # Work on plain 1-D float arrays so the result does not depend on the
    # target's column name or index layout.
    y_true = np.asarray(y, dtype=float).ravel()
    y_pred = np.asarray(model.predict(x), dtype=float).ravel()

    if y_true.size == 0:
        raise ValueError("evaluation_index requires at least one target value")
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"length mismatch: {y_true.size} targets vs {y_pred.size} predictions"
        )

    for name, value in _regression_metrics(y_true, y_pred).items():
        print(f"{name} = {value}")

In [35]:
# Report R2 / MSE / RMSE / MAE on the 2018 validation split.
evaluation_index(val_x, val_y)

R2 = 0.9328467493591341
MSE = 68453.67846858497
RMSE = 261.6365388637164
MAE = 150.29651193751283
