# pycaret을 이용한 가스공급량 예측
* 2019년 test 데이터 : '../data/test.csv'
* 2019년 제출 파일 : '../data/sample_submission.csv'
* 2013-2018년 가스공급량과 기온·습도·기압 자료 : '../2013-2018년_가스공급량_기온_습도_기압01.csv'
* 특성
    * 'month', 'weekday', '시간'으로 기온 예측
    * 기온, 기압, 습도 순서로 예측
    * 각 특성 예측 마다 앞에서 예측한 특성 활용
    * 공급량 예측(7개) : 'month', 'weekday', '시간', '구분', '기온', '기압', '습도'
* 공급량 log적용
* 2개 모델('lightgbm', 'catboost') 예측 평균
* 튜닝X, finalize_model X

In [1]:
# 제출 파일명 : sub23_pycaret11.csv
# 최고점 파일과 비교 점수(NMAE/R2) : 0.02240 / 0.98544
# 제출 점수 : 0.1052374771(2021-12-02 09:06:26)

## 데이터 가져오기
* 2019년 test 데이터 : '../data/test.csv'
* 2019년 제출 파일 : '../data/sample_submission.csv'
* 2013-2018년 가스공급량과 기온·습도·기압 자료 : '../2013-2018년_가스공급량_기온_습도_기압01.csv'

In [2]:
from pycaret.regression import *

In [3]:
import time
import datetime
import pandas as pd

### 2013-2018년 가스공급량과 기온·습도·기압 자료

In [4]:
# Load the 2013-2018 training table (gas supply plus weather columns) and preview it.
total_path = '../2013-2018년_가스공급량_기온_습도_기압01.csv'
total = pd.read_csv(total_path)
total.head(n=5)

Unnamed: 0,year,month,day,weekday,시간,구분,공급량,기온,습도,기압
0,2013,1,1,1,1,0,2497.129,-8.5,57.0,1010.0
1,2013,1,1,1,1,1,2169.093,-8.5,57.0,1010.0
2,2013,1,1,1,1,2,226.178,-8.5,57.0,1010.0
3,2013,1,1,1,1,3,1434.516,-8.5,57.0,1010.0
4,2013,1,1,1,1,4,3272.837,-8.5,57.0,1010.0


### 2019년 데이터

In [5]:
# Load the 2019 rows to be predicted; each row is a single "date hour group" key string.
test_path = '../data/test.csv'
test2019 = pd.read_csv(test_path)
test2019.head(n=5)

Unnamed: 0,일자|시간|구분
0,2019-01-01 01 A
1,2019-01-01 02 A
2,2019-01-01 03 A
3,2019-01-01 04 A
4,2019-01-01 05 A


In [6]:
# Split the combined "date hour group" key into its three space-separated parts.
key_parts = test2019['일자|시간|구분'].str.split(' ', expand=True)
test2019['연월일'] = key_parts[0]
# The hour arrives as zero-padded text ('01'); cast it to int so the column is numeric
# like '시간' in the training table (shown as integers in total.head()).
test2019['시간'] = key_parts[1].astype(int)
test2019['구분'] = key_parts[2]
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분
0,2019-01-01 01 A,2019-01-01,1,A
1,2019-01-01 02 A,2019-01-01,2,A
2,2019-01-01 03 A,2019-01-01,3,A
3,2019-01-01 04 A,2019-01-01,4,A
4,2019-01-01 05 A,2019-01-01,5,A


In [7]:
# Replace the '구분' (group) letters with integer codes, numbered in order of first appearance.
# NOTE(review): presumably this reproduces the 0-based codes used for '구분' in the
# training table — confirm against how that file was built.
d_map = {label: code for code, label in enumerate(test2019['구분'].unique())}
test2019['구분'] = test2019['구분'].map(d_map)
test2019.head(n=5)

Unnamed: 0,일자|시간|구분,연월일,시간,구분
0,2019-01-01 01 A,2019-01-01,1,0
1,2019-01-01 02 A,2019-01-01,2,0
2,2019-01-01 03 A,2019-01-01,3,0
3,2019-01-01 04 A,2019-01-01,4,0
4,2019-01-01 05 A,2019-01-01,5,0


In [8]:
# '연월일' holds text (object dtype); parse it to datetime once, then derive the
# calendar columns from the parsed values.
test2019['연월일'] = pd.to_datetime(test2019['연월일'])
date_accessor = test2019['연월일'].dt
for part in ('year', 'month', 'day', 'weekday'):
    test2019[part] = getattr(date_accessor, part)
test2019.head(n=5)

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1


## 2019년 기온 훈련/예측

In [9]:
# Preview the training table before fitting the temperature model.
total.head(n=5)

Unnamed: 0,year,month,day,weekday,시간,구분,공급량,기온,습도,기압
0,2013,1,1,1,1,0,2497.129,-8.5,57.0,1010.0
1,2013,1,1,1,1,1,2169.093,-8.5,57.0,1010.0
2,2013,1,1,1,1,2,226.178,-8.5,57.0,1010.0
3,2013,1,1,1,1,3,1434.516,-8.5,57.0,1010.0
4,2013,1,1,1,1,4,3272.837,-8.5,57.0,1010.0


In [10]:
start = time.time()

# Stage 1 of the weather chain: predict temperature ('기온') from calendar features.
# Training features: 'month', 'weekday', '시간'
# Target: '기온'
# session_id fixes PyCaret's random seed so the train/hold-out split, the shuffled
# folds and the fitted models are the same on every run; without it the submission
# cannot be reproduced.
# NOTE(review): total.head() shows identical weather values repeated for each '구분',
# so the weather rows look duplicated per hour — consider de-duplicating before
# fitting; confirm against the full table.
exp = setup(total, target='기온',
            ignore_features=['공급량', 'year', 'day', '구분', '습도', '기압'],
            silent=True, fold=7, fold_shuffle=True, verbose=False,
            session_id=42)

# Fit the two models with cross-validation (no tuning, no finalize_model).
lgbm = create_model('lightgbm')
cat = create_model('catboost')

# Predict the 2019 temperature with each model.
df = test2019[['month', 'weekday', '시간']]
temp2019_pred_lgbm = predict_model(lgbm, data=df)
temp2019_pred_cat = predict_model(cat, data=df)

# Average the two models' predictions ('Label' is PyCaret's prediction column).
temp2019_pred = (temp2019_pred_lgbm['Label'] + temp2019_pred_cat['Label']) / 2

# Store the predicted temperature so later stages can use it as a feature.
test2019['기온'] = temp2019_pred

# Report elapsed wall-clock time as H:MM:SS (fractional seconds dropped).
end = time.time()
sec = end - start
result = datetime.timedelta(seconds=sec)
print(str(result).split('.')[0])

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,3.0086,15.0358,3.8776,0.881,0.5164,0.803
1,2.9799,14.7841,3.845,0.8817,0.5088,0.7984
2,3.006,15.0278,3.8766,0.8817,0.5221,0.7818
3,2.988,14.8552,3.8542,0.8829,0.5189,0.7927
4,2.9804,14.8499,3.8536,0.8838,0.5177,0.8071
5,2.9893,14.8637,3.8553,0.8835,0.5225,0.7916
6,2.9955,14.9162,3.8621,0.8821,0.5176,0.7672
Mean,2.9925,14.9047,3.8606,0.8824,0.5177,0.7917
SD,0.0106,0.088,0.0114,0.001,0.0042,0.0126


0:03:11


In [11]:
# Check that the predicted '기온' column was added to the 2019 table.
test2019.head(n=5)

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,기온
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1,-3.623291
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1,-4.043813
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1,-4.490847
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1,-5.047465
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1,-5.071368


## 2019년 기압 훈련/예측

In [12]:
# Preview the training table before fitting the pressure model.
total.head(n=5)

Unnamed: 0,year,month,day,weekday,시간,구분,공급량,기온,습도,기압
0,2013,1,1,1,1,0,2497.129,-8.5,57.0,1010.0
1,2013,1,1,1,1,1,2169.093,-8.5,57.0,1010.0
2,2013,1,1,1,1,2,226.178,-8.5,57.0,1010.0
3,2013,1,1,1,1,3,1434.516,-8.5,57.0,1010.0
4,2013,1,1,1,1,4,3272.837,-8.5,57.0,1010.0


In [13]:
start = time.time()

# Stage 2 of the weather chain: predict pressure ('기압').
# Training features: 'month', 'weekday', '시간', '기온'
# Target: '기압'
# session_id fixes PyCaret's random seed so the train/hold-out split, the shuffled
# folds and the fitted models are the same on every run; without it the submission
# cannot be reproduced.
exp = setup(total, target='기압',
            ignore_features=['공급량', 'year', 'day', '습도', '구분'],
            silent=True, fold=7, fold_shuffle=True, verbose=False,
            session_id=42)

# Fit the two models with cross-validation (no tuning, no finalize_model).
lgbm = create_model('lightgbm')
cat = create_model('catboost')

# Predict the 2019 pressure; '기온' in test2019 is the value predicted in the
# temperature step, not an observation.
df = test2019[['month', 'weekday', '시간', '기온']]
pressure2019_pred_lgbm = predict_model(lgbm, data=df)
pressure2019_pred_cat = predict_model(cat, data=df)

# Average the two models' predictions ('Label' is PyCaret's prediction column).
pressure2019_pred = (pressure2019_pred_lgbm['Label'] + pressure2019_pred_cat['Label']) / 2

# Store the predicted pressure so later stages can use it as a feature.
test2019['기압'] = pressure2019_pred

# Report elapsed wall-clock time as H:MM:SS (fractional seconds dropped).
end = time.time()
sec = end - start
result = datetime.timedelta(seconds=sec)
print(str(result).split('.')[0])

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,3.239,17.6549,4.2018,0.7301,0.0042,0.0032
1,3.2512,17.6947,4.2065,0.7284,0.0042,0.0032
2,3.2433,17.5184,4.1855,0.7323,0.0042,0.0032
3,3.2405,17.4483,4.1771,0.7316,0.0042,0.0032
4,3.244,17.7429,4.2122,0.7225,0.0042,0.0032
5,3.2618,17.6887,4.2058,0.7293,0.0042,0.0032
6,3.2538,17.8594,4.226,0.7256,0.0042,0.0032
Mean,3.2477,17.6582,4.2021,0.7285,0.0042,0.0032
SD,0.0077,0.1273,0.0152,0.0032,0.0,0.0


0:03:12


In [14]:
# Check that the predicted '기압' column was added to the 2019 table.
test2019.head(n=5)

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,기온,기압
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1,-3.623291,1014.90914
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1,-4.043813,1015.304981
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1,-4.490847,1015.528668
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1,-5.047465,1015.455786
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1,-5.071368,1014.991495


## 2019년 습도(Humidity) 훈련/예측

In [15]:
# Preview the training table before fitting the humidity model.
total.head(n=5)

Unnamed: 0,year,month,day,weekday,시간,구분,공급량,기온,습도,기압
0,2013,1,1,1,1,0,2497.129,-8.5,57.0,1010.0
1,2013,1,1,1,1,1,2169.093,-8.5,57.0,1010.0
2,2013,1,1,1,1,2,226.178,-8.5,57.0,1010.0
3,2013,1,1,1,1,3,1434.516,-8.5,57.0,1010.0
4,2013,1,1,1,1,4,3272.837,-8.5,57.0,1010.0


In [16]:
start = time.time()

# Stage 3 of the weather chain: predict humidity ('습도').
# Training features: 'month', 'weekday', '시간', '기온', '기압'
# Target: '습도'
# session_id fixes PyCaret's random seed so the train/hold-out split, the shuffled
# folds and the fitted models are the same on every run; without it the submission
# cannot be reproduced.
exp = setup(total, target='습도',
            ignore_features=['공급량', 'year', 'day', '구분'],
            fold=7, fold_shuffle=True, silent=True, verbose=False,
            session_id=42)

# Fit the two models with cross-validation (no tuning, no finalize_model).
lgbm = create_model('lightgbm')
cat = create_model('catboost')

# Predict the 2019 humidity; '기온' and '기압' in test2019 are the values predicted
# in the two earlier steps, not observations.
df = test2019[['month', 'weekday', '시간', '기온', '기압']]
humidity2019_pred_lgbm = predict_model(lgbm, data=df)
humidity2019_pred_cat = predict_model(cat, data=df)

# Average the two models' predictions ('Label' is PyCaret's prediction column).
humidity2019_pred = (humidity2019_pred_lgbm['Label'] + humidity2019_pred_cat['Label']) / 2

# Store the predicted humidity so the gas-supply model can use it as a feature.
test2019['습도'] = humidity2019_pred

# Report elapsed wall-clock time as H:MM:SS (fractional seconds dropped).
end = time.time()
sec = end - start
result = datetime.timedelta(seconds=sec)
print(str(result).split('.')[0])

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,9.2026,138.854,11.7836,0.6474,0.2286,0.1885
1,9.2099,139.954,11.8302,0.6464,0.2285,0.1877
2,9.1603,138.3336,11.7615,0.6495,0.2284,0.1875
3,9.1763,139.2174,11.799,0.6527,0.2297,0.1887
4,9.2027,140.528,11.8544,0.6419,0.2292,0.1877
5,9.1579,138.2288,11.7571,0.6498,0.2287,0.1878
6,9.2298,140.1708,11.8394,0.6518,0.231,0.1901
Mean,9.1913,139.3267,11.8036,0.6485,0.2292,0.1883
SD,0.025,0.8431,0.0357,0.0034,0.0009,0.0009


0:03:09


In [17]:
# Check that the predicted '습도' column was added to the 2019 table.
test2019.head(n=5)

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,기온,기압,습도
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1,-3.623291,1014.90914,59.765961
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1,-4.043813,1015.304981,58.202277
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1,-4.490847,1015.528668,59.089852
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1,-5.047465,1015.455786,60.510779
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1,-5.071368,1014.991495,61.643057


## 2019년 가스 공급량 훈련/예측

In [18]:
import numpy as np

In [19]:
# Add a log-scale copy of the supply target: log1p(x) = log(1 + x).
log_supply = np.log1p(total['공급량'])
total['log_공급량'] = log_supply
total.head(n=5)

Unnamed: 0,year,month,day,weekday,시간,구분,공급량,기온,습도,기압,log_공급량
0,2013,1,1,1,1,0,2497.129,-8.5,57.0,1010.0,7.823297
1,2013,1,1,1,1,1,2169.093,-8.5,57.0,1010.0,7.682525
2,2013,1,1,1,1,2,226.178,-8.5,57.0,1010.0,5.425734
3,2013,1,1,1,1,3,1434.516,-8.5,57.0,1010.0,7.26928
4,2013,1,1,1,1,4,3272.837,-8.5,57.0,1010.0,8.093718


In [20]:
start = time.time()

# Final stage: predict gas supply on the log1p scale.
# Training features: 'month', 'weekday', '시간', '구분', '기온', '기압', '습도'
# Target: 'log_공급량'; the raw '공급량' column is ignored so the target cannot leak in.
# session_id fixes PyCaret's random seed so the train/hold-out split, the shuffled
# folds and the fitted models are the same on every run; without it the submission
# cannot be reproduced.
exp = setup(total, target='log_공급량',
            ignore_features=['year', '공급량', 'day'],
            silent=True, fold=7, fold_shuffle=True,
            session_id=42)

# Fit the two models with cross-validation (no tuning, no finalize_model).
lgbm = create_model('lightgbm')
cat = create_model('catboost')

# Predict log supply for 2019; the weather columns in test2019 are the values
# predicted in the earlier steps, not observations.
df = test2019[['month', 'weekday', '시간', '구분', '기온', '기압', '습도']]
log_gas2019_pred_lgbm = predict_model(lgbm, data=df)
log_gas2019_pred_cat = predict_model(cat, data=df)

# Undo the log1p transform to return to the original supply units.
gas_pred2019_lgbm = np.expm1(log_gas2019_pred_lgbm['Label'])
gas_pred2019_cat = np.expm1(log_gas2019_pred_cat['Label'])

# Average the two models in the original units (after the inverse transform).
gas2019_pred = (gas_pred2019_lgbm + gas_pred2019_cat) / 2

# Report elapsed wall-clock time as H:MM:SS (fractional seconds dropped).
end = time.time()
sec = end - start
result = datetime.timedelta(seconds=sec)
print(str(result).split('.')[0])

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.1144,0.0774,0.2782,0.9462,0.0771,0.0367
1,0.1145,0.0827,0.2876,0.9427,0.0804,0.0383
2,0.1158,0.0846,0.2909,0.9416,0.0813,0.0389
3,0.1138,0.0779,0.279,0.9454,0.0777,0.0368
4,0.115,0.08,0.2828,0.9447,0.0788,0.0376
5,0.1125,0.074,0.272,0.9476,0.0748,0.0351
6,0.1122,0.0747,0.2733,0.9476,0.0761,0.0359
Mean,0.114,0.0788,0.2806,0.9451,0.078,0.0371
SD,0.0012,0.0037,0.0065,0.0021,0.0021,0.0012


0:03:24


In [21]:
# Preview the averaged 2019 gas-supply predictions.
gas2019_pred.head(n=5)

0    1986.488313
1    1777.625538
2    1710.387693
3    1777.399099
4    1936.957736
Name: Label, dtype: float64

## 제출 파일 만들기

In [22]:
# Load the submission template (key column plus a placeholder '공급량' column).
sample_path = '../data/sample_submission.csv'
sub = pd.read_csv(sample_path)
sub.head(n=5)

Unnamed: 0,일자|시간|구분,공급량
0,2019-01-01 01 A,0
1,2019-01-01 02 A,0
2,2019-01-01 03 A,0
3,2019-01-01 04 A,0
4,2019-01-01 05 A,0


In [23]:
# Overwrite the placeholder supply column with the averaged predictions
# (pandas aligns the assigned Series on the row index).
sub['공급량'] = gas2019_pred
sub.head(n=5)

Unnamed: 0,일자|시간|구분,공급량
0,2019-01-01 01 A,1986.488313
1,2019-01-01 02 A,1777.625538
2,2019-01-01 03 A,1710.387693
3,2019-01-01 04 A,1777.399099
4,2019-01-01 05 A,1936.957736


In [24]:
# Print the column dtypes and non-null counts to confirm every row received a prediction.
sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15120 entries, 0 to 15119
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   일자|시간|구분  15120 non-null  object 
 1   공급량       15120 non-null  float64
dtypes: float64(1), object(1)
memory usage: 236.4+ KB


In [25]:
# Write the submission file without the row index.
output_path = "sub23_pycaret11.csv"
sub.to_csv(output_path, index=False)