# Daicon 가스공급량 수요예측 모델개발
* 일정 : 2021.10.11 ~ 2021.12.10 23:00
* 주소 : https://dacon.io/competitions/official/235830/overview/description
* 데이터 주소1(dacon) : https://dacon.io/competitions/official/235830/data
* 데이터 주소2(공공 데이터 포털) : https://www.data.go.kr/data/15091497/fileData.do

# csv 파일위치
* D:\Data\documents\GitHub\LikeLion_13th_DataCourse\codeclass\2021.10.15\data\한국가스공사_시간별 공급량_20181231.csv
* D:\Data\documents\GitHub\LikeLion_13th_DataCourse\codeclass\2021.10.15\data\test.csv
* D:\Data\documents\GitHub\LikeLion_13th_DataCourse\codeclass\2021.10.15\data\sample_submission.csv

In [7]:
# lightgbm 설치
!pip install lightgbm



In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb

In [123]:
total = pd.read_csv('data/한국가스공사_시간별 공급량_20181231.csv', encoding='cp949')

In [124]:
total.head()

Unnamed: 0,연월일,시간,구분,공급량
0,2013-01-01,1,A,2497.129
1,2013-01-01,2,A,2363.265
2,2013-01-01,3,A,2258.505
3,2013-01-01,4,A,2243.969
4,2013-01-01,5,A,2344.105


## 전처리

In [125]:
total['구분'].unique()

array(['A', 'B', 'C', 'D', 'E', 'G', 'H'], dtype=object)

In [126]:
# 구분 열을 숫자로 바꾸기
d_map = {}
for i, d in enumerate(total['구분'].unique()):
    d_map[d] = i
total['구분'] = total['구분'].map(d_map)
total

Unnamed: 0,연월일,시간,구분,공급량
0,2013-01-01,1,0,2497.129
1,2013-01-01,2,0,2363.265
2,2013-01-01,3,0,2258.505
3,2013-01-01,4,0,2243.969
4,2013-01-01,5,0,2344.105
...,...,...,...,...
368083,2018-12-31,20,6,681.033
368084,2018-12-31,21,6,669.961
368085,2018-12-31,22,6,657.941
368086,2018-12-31,23,6,610.953


In [127]:
total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 368088 entries, 0 to 368087
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   연월일     368088 non-null  object 
 1   시간      368088 non-null  int64  
 2   구분      368088 non-null  int64  
 3   공급량     368088 non-null  float64
dtypes: float64(1), int64(2), object(1)
memory usage: 11.2+ MB


In [128]:
# 연월일 열은 object형이므로 년, 월, 일로 나눈다.
total['연월일'] = pd.to_datetime(total['연월일'])
total['year'] = total['연월일'].dt.year
total['month'] = total['연월일'].dt.month
total['day'] = total['연월일'].dt.day
total['weekday'] = total['연월일'].dt.weekday

In [129]:
total

Unnamed: 0,연월일,시간,구분,공급량,year,month,day,weekday
0,2013-01-01,1,0,2497.129,2013,1,1,1
1,2013-01-01,2,0,2363.265,2013,1,1,1
2,2013-01-01,3,0,2258.505,2013,1,1,1
3,2013-01-01,4,0,2243.969,2013,1,1,1
4,2013-01-01,5,0,2344.105,2013,1,1,1
...,...,...,...,...,...,...,...,...
368083,2018-12-31,20,6,681.033,2018,12,31,0
368084,2018-12-31,21,6,669.961,2018,12,31,0
368085,2018-12-31,22,6,657.941,2018,12,31,0
368086,2018-12-31,23,6,610.953,2018,12,31,0


## 훈련용 테스트용 데이터 분류하기


In [130]:
train_years = [2013,2014,2015,2016,2017] 
val_years = [2018]

train = total[total['year'].isin(train_years)] # 훈련용 데이터
val = total[total['year'].isin(val_years)] # 테스트용 데이터

In [131]:
features = ['구분', 'month', 'day', 'weekday', '시간']

# 훈련용 데이터
train_x = train[features]
train_y = train['공급량']
# 테스트용 데이터
val_x = val[features]
val_y = val['공급량']

# LGBM(ensemble 모델) 학습

In [132]:
d_train = lgb.Dataset(train_x, train_y)
d_val = lgb.Dataset(val_x, val_y)

params = {
    'objective': 'regression',
    'metric':'mae',
    'seed':42
}

model = lgb.train(params, d_train, 500, d_val, verbose_eval=20, early_stopping_rounds=10)



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84
[LightGBM] [Info] Number of data points in the train set: 306768, number of used features: 5
[LightGBM] [Info] Start training from score 934.864036
Training until validation scores don't improve for 10 rounds
[20]	valid_0's l1: 244.857
[40]	valid_0's l1: 174.855
[60]	valid_0's l1: 158.739
[80]	valid_0's l1: 153.323
[100]	valid_0's l1: 150.948
[120]	valid_0's l1: 150.463
Early stopping, best iteration is:
[112]	valid_0's l1: 150.297


## 추론 및 결과 제출

In [133]:
test = pd.read_csv('data/test.csv')
submission = pd.read_csv('data/sample_submission.csv')

In [134]:
test.head()

Unnamed: 0,일자|시간|구분
0,2019-01-01 01 A
1,2019-01-01 02 A
2,2019-01-01 03 A
3,2019-01-01 04 A
4,2019-01-01 05 A


In [135]:
test['일자'] = test['일자|시간|구분'].str.split(' ').str[0]
test['시간'] = test['일자|시간|구분'].str.split(' ').str[1].astype(int)
test['구분'] = test['일자|시간|구분'].str.split(' ').str[2]

In [136]:
test['일자'] = pd.to_datetime(test['일자'])
test['year'] = test['일자'].dt.year
test['month'] = test['일자'].dt.month
test['day'] = test['일자'].dt.day
test['weekday'] = test['일자'].dt.weekday

In [137]:
test['구분'] = test['구분'].map(d_map)

In [138]:
test_x = test[features]

In [139]:
test_x

Unnamed: 0,구분,month,day,weekday,시간
0,0,1,1,1,1
1,0,1,1,1,2
2,0,1,1,1,3
3,0,1,1,1,4
4,0,1,1,1,5
...,...,...,...,...,...
15115,6,3,31,6,20
15116,6,3,31,6,21
15117,6,3,31,6,22
15118,6,3,31,6,23


In [140]:
preds = model.predict(test_x)

In [141]:
submission.head()

Unnamed: 0,일자|시간|구분,공급량
0,2019-01-01 01 A,0
1,2019-01-01 02 A,0
2,2019-01-01 03 A,0
3,2019-01-01 04 A,0
4,2019-01-01 05 A,0


In [142]:
submission['공급량'] = preds

In [32]:
submission.to_csv('baseline.csv', index=False)

## 평가지표

In [98]:
# 결정 계수
# 실제값, 실제값 평균, 예측값
val_y# 실제값

306768    1765.008
306769    1679.186
306770    1610.885
306771    1604.123
306772    1711.506
            ...   
368083     681.033
368084     669.961
368085     657.941
368086     610.953
368087     560.896
Name: 공급량, Length: 61320, dtype: float64

## MSE

In [151]:
type(val_y)

pandas.core.series.Series

In [152]:
val_y = val_y.reset_index()

In [153]:
val_y

Unnamed: 0,index,공급량
0,306768,1765.008
1,306769,1679.186
2,306770,1610.885
3,306771,1604.123
4,306772,1711.506
...,...,...
61315,368083,681.033
61316,368084,669.961
61317,368085,657.941
61318,368086,610.953


In [154]:
val_y.iloc[0,1]

1765.008

In [155]:
# 실제값 평균
val_y_mean = val_y['공급량'].mean()
val_y_mean

1014.3163017123462

In [156]:
pred = model.predict(val_x)
print(len(pred), pred)

61320 [1692.95022318 1552.98266028 1517.79609818 ...  545.14642348  502.77442403
  426.92784381]


In [157]:
pred[0]

1692.9502231767278

In [158]:
s = 0
for i in range(len(pred)):
    minus = val_y.iloc[i,1] - pred[i]
    sq = minus * minus
    s += sq
mse = s / len(pred)
print(f"MSE = {mse}")

MSE = 68453.67846858497


## RMSE

In [159]:
RMSE = mse **(1/2)
RMSE

261.6365388637164

## MAE

In [164]:
s = 0
for i in range(len(pred)):
    minus = val_y.iloc[i,1] - pred[i]
    a = abs(minus)
    s += a
mae = s / len(pred)
print(f"MAE = {mae}")

MAE = 150.29651193751283


## R2

In [161]:
pred = model.predict(val_x)
val_y_mean = val_y['공급량'].mean()

lower_s = 0

for i in range(len(pred)):
    lower_minus = val_y.iloc[i,1] - val_y_mean
    lower = lower_minus ** 2
    lower_s += lower
print(lower_s)

62507466483.524315


In [119]:
pred = model.predict(val_x)
val_y_mean = val_y['공급량'].mean()

upper_s = 0

for i in range(len(pred)):
    upper_minus = val_y.iloc[i,1] - pred
    
print(len(upper_minus), upper_minus)

print(len(upper_minus ** 2), upper_minus ** 2)

61320 [-1132.05422318  -992.08666028  -956.90009818 ...    15.74957652
    58.12157597   133.96815619]
61320 [1.28154676e+06 9.84235942e+05 9.15657798e+05 ... 2.48049161e+02
 3.37811759e+03 1.79474669e+04]


In [162]:
pred = model.predict(val_x)
val_y_mean = val_y['공급량'].mean()
lower_s = 0
upper_s = 0
for i in range(len(pred)):
    lower_minus = val_y.iloc[i,1] - val_y_mean
    lower_sq = lower_minus ** 2
    lower_s += lower_sq
    upper_minus = val_y.iloc[i,1] - pred[i]
    upper_sq = upper_minus ** 2
    upper_s += upper_sq
    
fraction = upper_s / lower_s
r2 = 1 - fraction
print(f"R2 = {r2}")

R2 = 0.9328467493591341


# 평가지표 함수화

In [149]:
def evaluation_index(x, y):
    y = y.reset_index()
    pred = model.predict(x)
    # MSE
    s = 0
    for i in range(len(pred)):
        minus = y.iloc[i,1] - pred[i]
        sq = minus ** 2
        s += sq
    mse = s / len(pred)
    
    # RMSE
    rmse = mse **(1/2)
    
    # MAE
    s = 0
    for i in range(len(pred)):
        minus = y.iloc[i,1] - pred[i]
        a = abs(minus)
        s += a
    mae = s / len(pred)
    
    # R2
    y_mean = y['공급량'].mean()
    lower_s = 0
    upper_s = 0
    for i in range(len(pred)):
        lower_minus = y.iloc[i,1] - y_mean
        lower_sq = lower_minus ** 2
        lower_s += lower_sq
        upper_minus = y.iloc[i,1] - pred[i]
        upper_sq = upper_minus ** 2
        upper_s += upper_sq
    fraction = upper_s / lower_s
    r2 = 1 - fraction
    
    print(f"R2 = {r2}")
    print(f"MSE = {mse}")
    print(f"RMSE = {rmse}")
    print(f"MAE = {mae}")

In [150]:
evaluation_index(val_x ,val_y)

R2 = -3.394374503130951
MSE = 0.004369476718418022
RMSE = 0.06610201750641218
MAE = 0.002184738359209011
