# pycaret을 이용한 가스공급량 예측
* 2019년 test 데이터 : '../data/test.csv'
* 2019년 제출 파일 : '../data/sample_submission.csv'
* 2013-2018년 가스공급량과 기온 자료 : '../2013-2018년_가스공급량_기온_습도_기압01.csv'
* 특성
    * 'month', 'weekday', '시간'으로 기온 예측
    * 기온, 기압, 습도 순서로 예측
    * 각 특성 예측 마다 앞에서 예측한 특성 활용
    * 공급량 예측(6개) : 'month', 'weekday', '시간', '구분', '기온', '기압' (예측한 '습도'는 공급량 모델의 학습 특성에 포함되지 않음)
* 공급량 log적용
* 2개 모델('lightgbm', 'catboost') 예측 평균
* 튜닝X(create_model 기본 설정 사용), finalize_model X
* 구분별 훈련/예측
* 1~3월 데이터로만 훈련

In [1]:
# 제출 파일명 : sub28_pycaret16.csv
# 최고점 파일과 비교 점수(NMAE/R2) : 0.02058 / 0.98496
# 제출 점수 : 0.1035994724(2021-12-04 19:28:04)

## 데이터 가져오기
* 2019년 test 데이터 : '../data/test.csv'
* 2019년 제출 파일 : '../data/sample_submission.csv'
* 2013-2018년 가스공급량과 기온 자료 : '../2013-2018년_가스공급량_기온_습도_기압01.csv'

In [2]:
from pycaret.regression import *

In [3]:
import time
import datetime
import pandas as pd
import numpy as np
from tqdm import tqdm

### 2013-2018년 가스공급량과 기온 자료

In [4]:
# Load the 2013-2018 history: gas supply plus temperature, humidity and pressure.
history_csv = '../2013-2018년_가스공급량_기온_습도_기압01.csv'
total = pd.read_csv(history_csv)
total.head()

Unnamed: 0,year,month,day,weekday,시간,구분,공급량,기온,습도,기압
0,2013,1,1,1,1,0,2497.129,-8.5,57.0,1010.0
1,2013,1,1,1,1,1,2169.093,-8.5,57.0,1010.0
2,2013,1,1,1,1,2,226.178,-8.5,57.0,1010.0
3,2013,1,1,1,1,3,1434.516,-8.5,57.0,1010.0
4,2013,1,1,1,1,4,3272.837,-8.5,57.0,1010.0


### 2019년 데이터

In [5]:
# Load the 2019 rows to predict; each row carries one '일자|시간|구분' key string.
test_csv = '../data/test.csv'
test2019 = pd.read_csv(test_csv)
test2019.head()

Unnamed: 0,일자|시간|구분
0,2019-01-01 01 A
1,2019-01-01 02 A
2,2019-01-01 03 A
3,2019-01-01 04 A
4,2019-01-01 05 A


In [6]:
# Split the space-separated 'date hour division' key into its three parts.
# expand=True returns the parts as DataFrame columns 0, 1, 2 directly.
key_parts = test2019['일자|시간|구분'].str.split(' ', expand=True)
test2019['연월일'] = key_parts[0]
# The hour comes out of the split as text (e.g. '01'); cast it to int so the
# column is numeric like the '시간' column of the training data.
test2019['시간'] = key_parts[1].astype(int)
test2019['구분'] = key_parts[2]
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분
0,2019-01-01 01 A,2019-01-01,1,A
1,2019-01-01 02 A,2019-01-01,2,A
2,2019-01-01 03 A,2019-01-01,3,A
3,2019-01-01 04 A,2019-01-01,4,A
4,2019-01-01 05 A,2019-01-01,5,A


In [7]:
# Encode the division labels as integers, numbered in order of first appearance.
division_labels = test2019['구분'].unique()
d_map = {label: code for code, label in enumerate(division_labels)}
test2019['구분'] = test2019['구분'].map(d_map)
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분
0,2019-01-01 01 A,2019-01-01,1,0
1,2019-01-01 02 A,2019-01-01,2,0
2,2019-01-01 03 A,2019-01-01,3,0
3,2019-01-01 04 A,2019-01-01,4,0
4,2019-01-01 05 A,2019-01-01,5,0


In [8]:
# '연월일' is an object (string) column: parse it to datetime, then derive
# the year, month, day and weekday columns from it.
test2019['연월일'] = pd.to_datetime(test2019['연월일'])
for part in ('year', 'month', 'day', 'weekday'):
    test2019[part] = getattr(test2019['연월일'].dt, part)
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1


## 2019년 기온 훈련/예측

In [9]:
start = time.time()

# Features: 'month', 'weekday', '시간'
# Target  : '기온'
# Train on January-March rows only.
temp_train = total[['month', 'weekday', '시간', '기온']]
temp_train = temp_train[temp_train['month'].between(1, 3)]
temp_train = temp_train.reset_index(drop=True)

temp_test = test2019[['month', 'weekday', '시간']]

# PyCaret setup. session_id fixes the random seed so the shuffled folds and
# the fitted models are the same on every run (any fixed integer works).
exp = setup(temp_train, target='기온', silent=True, fold=7, fold_shuffle=True,
            verbose=False, session_id=42)

# Fit the two models.
lgbm = create_model('lightgbm')
cat = create_model('catboost')

# Predict the 2019 temperature with each model.
temp2019_pred_lgbm = predict_model(lgbm, data=temp_test)
temp2019_pred_cat = predict_model(cat, data=temp_test)

# Average the two models' predictions.
temp2019_pred = (temp2019_pred_lgbm['Label'] + temp2019_pred_cat['Label']) / 2

# Store the predicted temperature as a feature for the later models.
test2019['기온'] = temp2019_pred

# Report elapsed wall-clock time as H:MM:SS (fractional seconds dropped).
end = time.time()
sec = end - start
result = datetime.timedelta(seconds=sec)
print(str(result).split('.')[0])

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,3.7093,21.9151,4.6814,0.5026,0.7732,1.6523
1,3.6482,21.2675,4.6117,0.5023,0.7629,1.7515
2,3.6655,21.3758,4.6234,0.4907,0.764,1.6448
3,3.6967,21.7386,4.6625,0.4989,0.7709,1.6442
4,3.6311,21.0865,4.592,0.4975,0.7645,1.6829
5,3.6512,21.3435,4.6199,0.5047,0.7628,1.6012
6,3.6357,21.2538,4.6102,0.5022,0.7485,1.6814
Mean,3.6625,21.4258,4.6287,0.4998,0.7638,1.6655
SD,0.0278,0.2716,0.0293,0.0044,0.0073,0.0433


0:01:04


In [10]:
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,기온
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1,-3.726847
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1,-4.403736
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1,-4.545435
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1,-4.957938
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1,-5.154883


## 2019년 기압 훈련/예측

In [11]:
start = time.time()

# Features: 'month', 'weekday', '시간', '기온'
# Target  : '기압'
# Train on January-March rows only.
temp_train = total[['month', 'weekday', '시간', '기온', '기압']]
temp_train = temp_train[temp_train['month'].between(1, 3)]
temp_train = temp_train.reset_index(drop=True)

# '기온' in the test frame is the value predicted in the temperature step.
temp_test = test2019[['month', 'weekday', '시간', '기온']]

# PyCaret setup. session_id fixes the random seed so the shuffled folds and
# the fitted models are the same on every run (any fixed integer works).
exp = setup(temp_train, target='기압', silent=True, fold=7, fold_shuffle=True,
            verbose=False, session_id=42)

# Fit the two models.
lgbm = create_model('lightgbm')
cat = create_model('catboost')

# Predict the 2019 pressure with each model.
pressure2019_pred_lgbm = predict_model(lgbm, data=temp_test)
pressure2019_pred_cat = predict_model(cat, data=temp_test)

# Average the two models' predictions.
pressure2019_pred = (pressure2019_pred_lgbm['Label'] + pressure2019_pred_cat['Label']) / 2

# Store the predicted pressure as a feature for the later models.
test2019['기압'] = pressure2019_pred

# Report elapsed wall-clock time as H:MM:SS (fractional seconds dropped).
end = time.time()
sec = end - start
result = datetime.timedelta(seconds=sec)
print(str(result).split('.')[0])

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,3.1154,15.8066,3.9758,0.4716,0.0039,0.0031
1,3.1461,16.2562,4.0319,0.4747,0.004,0.0031
2,3.146,16.2882,4.0359,0.471,0.004,0.0031
3,3.1254,16.1598,4.0199,0.4815,0.004,0.0031
4,3.1094,15.9204,3.99,0.4812,0.0039,0.0031
5,3.1451,16.2184,4.0272,0.4793,0.004,0.0031
6,3.1265,16.0299,4.0037,0.4821,0.004,0.0031
Mean,3.1305,16.0971,4.0121,0.4774,0.004,0.0031
SD,0.0142,0.1693,0.0211,0.0045,0.0,0.0


0:00:53


In [12]:
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,기온,기압
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1,-3.726847,1014.694317
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1,-4.403736,1015.041032
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1,-4.545435,1015.395001
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1,-4.957938,1015.29827
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1,-5.154883,1015.071694


## 2019년 습도(Humidity) 훈련/예측

In [13]:
start = time.time()

# Features: 'month', 'weekday', '시간', '기온', '기압'
# Target  : '습도'
# Train on January-March rows only.
temp_train = total[['month', 'weekday', '시간', '기온', '기압', '습도']]
temp_train = temp_train[temp_train['month'].between(1, 3)]
temp_train = temp_train.reset_index(drop=True)

# '기온' and '기압' in the test frame are the values predicted in earlier steps.
temp_test = test2019[['month', 'weekday', '시간', '기온', '기압']]

# PyCaret setup. session_id fixes the random seed so the shuffled folds and
# the fitted models are the same on every run (any fixed integer works).
exp = setup(temp_train, target='습도', fold=7, fold_shuffle=True, silent=True,
            verbose=False, session_id=42)

# Fit the two models.
lgbm = create_model('lightgbm')
cat = create_model('catboost')

# Predict the 2019 humidity with each model.
humidity2019_pred_lgbm = predict_model(lgbm, data=temp_test)
humidity2019_pred_cat = predict_model(cat, data=temp_test)

# Average the two models' predictions.
humidity2019_pred = (humidity2019_pred_lgbm['Label'] + humidity2019_pred_cat['Label']) / 2

# Add the predicted humidity column to the 2019 frame.
test2019['습도'] = humidity2019_pred

# Report elapsed wall-clock time as H:MM:SS (fractional seconds dropped).
end = time.time()
sec = end - start
result = datetime.timedelta(seconds=sec)
print(str(result).split('.')[0])

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,8.2368,110.6283,10.518,0.7027,0.2246,0.1872
1,8.2383,110.738,10.5232,0.7008,0.227,0.1895
2,8.2246,110.2275,10.4989,0.7033,0.2232,0.1863
3,8.2471,111.8479,10.5758,0.6989,0.2276,0.1894
4,8.3809,115.0375,10.7256,0.6931,0.2296,0.1919
5,8.3935,114.9132,10.7198,0.6941,0.232,0.1949
6,8.301,113.0482,10.6324,0.6903,0.2268,0.1892
Mean,8.2889,112.3487,10.5991,0.6976,0.2273,0.1898
SD,0.0662,1.8747,0.0883,0.0047,0.0027,0.0027


0:00:51


In [14]:
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,기온,기압,습도
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1,-3.726847,1014.694317,59.965608
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1,-4.403736,1015.041032,58.620164
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1,-4.545435,1015.395001,59.880628
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1,-4.957938,1015.29827,60.875524
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1,-5.154883,1015.071694,61.475688


## 2019년 가스 공급량 훈련/예측

### log_공급량으로 진행

In [15]:
# Add the training target on a log scale: log(1 + supply).
supply = total['공급량']
total['log_공급량'] = np.log1p(supply)
total.head()

Unnamed: 0,year,month,day,weekday,시간,구분,공급량,기온,습도,기압,log_공급량
0,2013,1,1,1,1,0,2497.129,-8.5,57.0,1010.0,7.823297
1,2013,1,1,1,1,1,2169.093,-8.5,57.0,1010.0,7.682525
2,2013,1,1,1,1,2,226.178,-8.5,57.0,1010.0,5.425734
3,2013,1,1,1,1,3,1434.516,-8.5,57.0,1010.0,7.26928
4,2013,1,1,1,1,4,3272.837,-8.5,57.0,1010.0,8.093718


## 2019년 log_공급량 훈련/예측, 구분별

In [16]:
start = time.time()

# Train and predict separately for each division present in the test set.
divisions = test2019['구분'].unique()

# Features: 'month', 'weekday', '시간', '구분', '기온', '기압'
# Target  : 'log_공급량'
# NOTE(review): the predicted '습도' column is not among the features used
# here — confirm whether leaving it out is intentional.
feature_cols = ['month', 'weekday', '시간', '구분', '기온', '기압']

# One Series of predictions per division, each indexed by the test rows it
# belongs to, so the final result can be put back into test-row order.
division_preds = []
for division in tqdm(divisions):
    # Training rows: this division, January-March only.
    train_mask = (total['구분'] == division) & total['month'].between(1, 3)
    temp_train = total.loc[train_mask, feature_cols + ['log_공급량']].reset_index(drop=True)

    # Test rows for this division; remember their original row labels before
    # resetting the index for prediction.
    test_mask = test2019['구분'] == division
    row_ids = test2019.index[test_mask]
    temp_test = test2019.loc[test_mask, feature_cols].reset_index(drop=True)

    # PyCaret setup. session_id fixes the random seed so the shuffled folds
    # and the fitted models are the same on every run.
    exp = setup(temp_train, target='log_공급량', silent=True, fold=7,
                fold_shuffle=True, verbose=False, session_id=42)

    # Fit the two models.
    lgbm = create_model('lightgbm')
    cat = create_model('catboost')

    # Predict log supply with each model.
    log_gas2019_pred_lgbm = predict_model(lgbm, data=temp_test)
    log_gas2019_pred_cat = predict_model(cat, data=temp_test)

    # Undo the log1p transform to get supply values.
    gas_pred2019_lgbm = np.expm1(log_gas2019_pred_lgbm['Label'])
    gas_pred2019_cat = np.expm1(log_gas2019_pred_cat['Label'])

    # Average the two models' supply predictions.
    gas2019_division_pred = (gas_pred2019_lgbm + gas_pred2019_cat) / 2

    # Re-attach the original test row labels (positions are unchanged by the
    # reset_index above, so values map one-to-one onto row_ids).
    division_preds.append(pd.Series(gas2019_division_pred.to_numpy(), index=row_ids))

# Combine all divisions once, restore the test frame's row order, and keep the
# one-column DataFrame shape (column label 0) that later cells work with.
gas2019_pred = pd.concat(division_preds).reindex(test2019.index).to_frame(0)

# Report elapsed wall-clock time as H:MM:SS (fractional seconds dropped).
end = time.time()
sec = end - start
result = datetime.timedelta(seconds=sec)
print(str(result).split('.')[0])

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.0593,0.0081,0.0899,0.9324,0.0123,0.0099
1,0.0577,0.0058,0.076,0.9414,0.011,0.0097
2,0.059,0.006,0.0773,0.9436,0.0113,0.0099
3,0.0551,0.0061,0.0778,0.9442,0.011,0.0092
4,0.0574,0.0056,0.0752,0.9493,0.0109,0.0097
5,0.0567,0.0055,0.0741,0.9467,0.0107,0.0095
6,0.0566,0.0056,0.075,0.9479,0.0108,0.0095
Mean,0.0574,0.0061,0.0779,0.9436,0.0112,0.0096
SD,0.0013,0.0008,0.005,0.0052,0.0005,0.0002


100%|██████████| 7/7 [01:57<00:00, 16.82s/it]

0:01:57





In [17]:
# Inspect the first few combined supply predictions.
gas2019_pred.head()

Unnamed: 0,0
0,2001.788319
1,1791.452341
2,1674.101796
3,1746.044287
4,1927.334172


## 제출 파일 불러오기

In [18]:
# Load the sample submission; its '공급량' column is filled in further below.
submission_csv = '../data/sample_submission.csv'
sub = pd.read_csv(submission_csv)
sub.head()

Unnamed: 0,일자|시간|구분,공급량
0,2019-01-01 01 A,0
1,2019-01-01 02 A,0
2,2019-01-01 03 A,0
3,2019-01-01 04 A,0
4,2019-01-01 05 A,0


In [19]:
# Show the last rows of the submission template.
sub.tail()

Unnamed: 0,일자|시간|구분,공급량
15115,2019-03-31 20 H,0
15116,2019-03-31 21 H,0
15117,2019-03-31 22 H,0
15118,2019-03-31 23 H,0
15119,2019-03-31 24 H,0


In [20]:
# Copy the predictions into the submission frame.
# gas2019_pred is a one-column DataFrame, so select that column explicitly.
predicted_supply = gas2019_pred.iloc[:, 0]
# Assignment aligns on the index; a row-count mismatch would otherwise
# silently leave NaN in the submission, so fail loudly instead.
if len(predicted_supply) != len(sub):
    raise ValueError(
        f"prediction count {len(predicted_supply)} does not match "
        f"submission row count {len(sub)}"
    )
sub['공급량'] = predicted_supply
sub.head()

Unnamed: 0,일자|시간|구분,공급량
0,2019-01-01 01 A,2001.788319
1,2019-01-01 02 A,1791.452341
2,2019-01-01 03 A,1674.101796
3,2019-01-01 04 A,1746.044287
4,2019-01-01 05 A,1927.334172


In [21]:
# Print column dtypes and non-null counts of the filled submission.
sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15120 entries, 0 to 15119
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   일자|시간|구분  15120 non-null  object 
 1   공급량       15120 non-null  float64
dtypes: float64(1), object(1)
memory usage: 236.4+ KB


## 제출 파일 출력

In [23]:
# Write the submission file without the index column.
submission_name = "sub28_pycaret16.csv"
sub.to_csv(submission_name, index=False)