# 평가지표 함수화

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb

In [2]:
# Load the hourly gas-supply history; the CSV is decoded with the cp949 codec.
total = pd.read_csv('data/한국가스공사_시간별 공급량_20181231.csv', encoding='cp949')

In [3]:
# Encode the categorical '구분' column as integers.
# Codes are assigned in order of first appearance; d_map is kept so the same
# encoding can be applied to other frames later.
d_map = {label: code for code, label in enumerate(total['구분'].unique())}
total['구분'] = total['구분'].map(d_map)

In [4]:
# '연월일' is loaded as object dtype, so parse it to datetime first and then
# derive one integer column per calendar component.
total['연월일'] = pd.to_datetime(total['연월일'])
for part in ('year', 'month', 'day', 'weekday'):
    total[part] = getattr(total['연월일'].dt, part)

## 훈련용 / 검증용 데이터 분류하기


In [5]:
# Split by calendar year: 2013-2017 for fitting, 2018 held out for validation.
train_years = [2013, 2014, 2015, 2016, 2017]
val_years = [2018]

in_train = total['year'].isin(train_years)
in_val = total['year'].isin(val_years)

train = total[in_train]  # rows used to fit the model
val = total[in_val]      # held-out rows used to evaluate the model

In [6]:
# Model inputs and the regression target column.
features = ['구분', 'month', 'day', 'weekday', '시간']
target = '공급량'

# Training inputs / target
train_x, train_y = train[features], train[target]
# Validation inputs / target
val_x, val_y = val[features], val[target]

# LGBM(ensemble 모델) 학습

In [7]:
# Wrap the train / validation frames in LightGBM datasets.
d_train = lgb.Dataset(train_x, train_y)
d_val = lgb.Dataset(val_x, val_y)

params = {
    'objective': 'regression',
    'metric': 'mae',
    'seed': 42
}

# Train for at most 500 rounds, logging the validation MAE every 20 rounds and
# stopping once it has not improved for 10 consecutive rounds.
# The `verbose_eval=` / `early_stopping_rounds=` keyword arguments were removed
# from lgb.train() in LightGBM 4.0; the callbacks below are the supported
# equivalent.
model = lgb.train(
    params,
    d_train,
    num_boost_round=500,
    valid_sets=[d_val],
    callbacks=[
        lgb.early_stopping(stopping_rounds=10),
        lgb.log_evaluation(period=20),
    ],
)



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84
[LightGBM] [Info] Number of data points in the train set: 306768, number of used features: 5
[LightGBM] [Info] Start training from score 934.864036
Training until validation scores don't improve for 10 rounds
[20]	valid_0's l1: 244.857
[40]	valid_0's l1: 174.855
[60]	valid_0's l1: 158.739
[80]	valid_0's l1: 153.323
[100]	valid_0's l1: 150.948
[120]	valid_0's l1: 150.463
Early stopping, best iteration is:
[112]	valid_0's l1: 150.297


### 추론 및 결과

In [8]:
# Load the rows to predict and the submission template.
test = pd.read_csv('data/test.csv')
submission = pd.read_csv('data/sample_submission.csv')

In [9]:
# The combined key column holds "date hour category" separated by spaces.
# Split it once and reuse the pieces instead of re-splitting the whole column
# for every derived field.
key_parts = test['일자|시간|구분'].str.split(' ', expand=True)
test['일자'] = key_parts[0]
test['시간'] = key_parts[1].astype(int)
test['구분'] = key_parts[2]

In [10]:
# Parse the date string and derive the same calendar columns that were built
# for the training frame.
test['일자'] = pd.to_datetime(test['일자'])
for part in ('year', 'month', 'day', 'weekday'):
    test[part] = getattr(test['일자'].dt, part)

In [11]:
# Apply the integer encoding built from the training data (d_map).
# Any label missing from d_map becomes NaN.
test['구분'] = test['구분'].map(d_map)

In [12]:
# Select the same feature columns, in the same order, that the model was trained on.
test_x = test[features]

In [13]:
# Predict supply for the test rows.
preds = model.predict(test_x)

# 평가지표

In [14]:
# Inspect the validation target (a Series that keeps the row labels of `total`).
val_y

306768    1765.008
306769    1679.186
306770    1610.885
306771    1604.123
306772    1711.506
            ...   
368083     681.033
368084     669.961
368085     657.941
368086     610.953
368087     560.896
Name: 공급량, Length: 61320, dtype: float64

In [15]:
# Work on a copy so the experiments below do not modify val_y.
prac = val_y.copy()
prac

306768    1765.008
306769    1679.186
306770    1610.885
306771    1604.123
306772    1711.506
            ...   
368083     681.033
368084     669.961
368085     657.941
368086     610.953
368087     560.896
Name: 공급량, Length: 61320, dtype: float64

In [16]:
# prac['공급량'] # KeyError: '공급량'

In [17]:
# reset_index() turns the Series into a DataFrame: the old row labels move into
# an 'index' column and the values get a '공급량' column.
prac = prac.reset_index()
prac

Unnamed: 0,index,공급량
0,306768,1765.008
1,306769,1679.186
2,306770,1610.885
3,306771,1604.123
4,306772,1711.506
...,...,...
61315,368083,681.033
61316,368084,669.961
61317,368085,657.941
61318,368086,610.953


In [18]:
# Label-based lookup: row label 61315, column '공급량'.
prac.loc[61315, '공급량']

681.033

In [19]:
# Position-based lookup of the same cell: row position 61315, column position 1
# ('공급량' comes after 'index').
prac.iloc[61315,1]

681.033

In [20]:
# Mean of the validation target.
prac_mean = prac['공급량'].mean()
prac_mean

1014.3163017123462

In [21]:
# Deviation of every observation from the mean.
result = prac['공급량'] - prac_mean
result

0        750.691698
1        664.869698
2        596.568698
3        589.806698
4        697.189698
            ...    
61315   -333.283302
61316   -344.355302
61317   -356.375302
61318   -403.363302
61319   -453.420302
Name: 공급량, Length: 61320, dtype: float64

### 평가 지표 함수화

In [22]:
def evaluation_index(model, x, y):
    """Print R2, MSE, RMSE, MAE and NMAE for ``model`` evaluated on ``(x, y)``.

    Parameters
    ----------
    model : object
        Any fitted estimator exposing ``predict(x)``.
    x : array-like
        Features passed unchanged to ``model.predict``.
    y : array-like
        True target values (pandas Series, single-column DataFrame, NumPy
        array or list). The name and index of a Series are ignored, so the
        target does not have to be called '공급량'.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of targets.

    Notes
    -----
    NMAE here is ``mean(|y - pred| / y)``. Rows where ``y`` is 0 make that
    term infinite or NaN, so the metric is only meaningful for non-zero
    targets.
    """
    # Use flat NumPy arrays so the arithmetic is purely positional and does not
    # depend on the Series name or index labels.
    actual = np.asarray(y, dtype=float).ravel()
    pred = np.asarray(model.predict(x), dtype=float).ravel()
    if actual.shape != pred.shape:
        raise ValueError(
            f"y has {actual.shape[0]} values but the model returned "
            f"{pred.shape[0]} predictions"
        )

    n = len(pred)
    error = actual - pred
    abs_error = np.abs(error)
    squared_error_sum = (error ** 2).sum()

    # MSE
    mse = squared_error_sum / n

    # RMSE
    rmse = mse ** (1/2)

    # MAE
    mae = abs_error.sum() / n

    # R2 = 1 - SS_res / SS_tot
    total_sum_of_squares = ((actual - actual.mean()) ** 2).sum()
    r2 = 1 - squared_error_sum / total_sum_of_squares

    # NMAE (Normalized Mean Absolute Error): absolute error relative to the
    # true value, averaged over all rows.
    nmae = (abs_error / actual).sum() / n

    # Print the metrics
    print(f"R2 = {r2}")
    print(f"MSE = {mse}")
    print(f"RMSE = {rmse}")
    print(f"MAE = {mae}")
    print(f"NMAE = {nmae}")

In [24]:
# Report all metrics for the trained model on the 2018 validation split.
evaluation_index(model, val_x, val_y)

R2 = 0.932846749359134
MSE = 68453.67846858544
RMSE = 261.63653886371725
MAE = 150.29651193751076
NMAE = 0.5161785945623045
