# pycaret을 이용한 가스공급량 예측
* 2019년 test 데이터 : '../data/test.csv'
* 2019년 제출 파일 : '../data/sample_submission.csv'
* 2013-2018년 가스공급량과 기온 자료 : '../2013-2018년_가스공급량_기온_습도_기압01.csv'
* 특성
    * 'month', 'weekay', '시간'으로 기온 예측
    * 기온, 기압, 습도 순서로 예측
    * 각 특성 예측 마다 앞에서 예측한 특성 활용
    * 공급량 예측(7개) : 'month', 'weekday', '시간', '구분', '기온', '기압', '습도'
* 공급량 log적용
* top3 모델 블랜드
* 튜닝X, finalize_model X
* 구분별 훈련/예측
* 이상치 제거(outliers_threshold=0.1)

In [1]:
# 제출 파일명 : sub34_pycaret22(ver2.5).csv
# 최고점 파일과 비교 점수(NMAE/R2) : 0.02539 / 0.96779
# 제출 점수 : X

## 데이터 가져오기
* 2019년 test 데이터 : '../data/test.csv'
* 2019년 제출 파일 : '../data/sample_submission.csv'
* 2013-2018년 가스공급량과 기온 자료 : '../2013-2018년_가스공급량_기온_습도_기압01.csv'

In [2]:
from pycaret.regression import *

In [3]:
import time
import datetime
import pandas as pd
import numpy as np
from tqdm import tqdm

### 2013-2018년 가스공급량과 기온, 기압, 습도 자료

In [4]:
total = pd.read_csv('../2013-2018년_가스공급량_기온_습도_기압01.csv')
total.head()

Unnamed: 0,연월일,year,month,day,weekday,시간,구분,공급량,기온,습도,기압
0,2013-01-01,2013,1,1,1,1,0,2497.129,-8.5,57.0,1010.0
1,2013-01-01,2013,1,1,1,1,1,2169.093,-8.5,57.0,1010.0
2,2013-01-01,2013,1,1,1,1,2,226.178,-8.5,57.0,1010.0
3,2013-01-01,2013,1,1,1,1,3,1434.516,-8.5,57.0,1010.0
4,2013-01-01,2013,1,1,1,1,4,3272.837,-8.5,57.0,1010.0


### 2019년 테스트 데이터

In [5]:
test2019 = pd.read_csv('../data/test.csv')
test2019.head()

Unnamed: 0,일자|시간|구분
0,2019-01-01 01 A
1,2019-01-01 02 A
2,2019-01-01 03 A
3,2019-01-01 04 A
4,2019-01-01 05 A


In [6]:
test2019[['연월일', '시간', '구분']] = test2019['일자|시간|구분'].str.split(' ').tolist()
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분
0,2019-01-01 01 A,2019-01-01,1,A
1,2019-01-01 02 A,2019-01-01,2,A
2,2019-01-01 03 A,2019-01-01,3,A
3,2019-01-01 04 A,2019-01-01,4,A
4,2019-01-01 05 A,2019-01-01,5,A


In [7]:
# 구분 열을 숫자로 바꾸기
d_map = {}
for i, d in enumerate(test2019['구분'].unique()):
    d_map[d] = i
test2019['구분'] = test2019['구분'].map(d_map)
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분
0,2019-01-01 01 A,2019-01-01,1,0
1,2019-01-01 02 A,2019-01-01,2,0
2,2019-01-01 03 A,2019-01-01,3,0
3,2019-01-01 04 A,2019-01-01,4,0
4,2019-01-01 05 A,2019-01-01,5,0


In [8]:
# 연월일 열은 object형이므로 년, 월, 일로 나눈다.
test2019['연월일'] = pd.to_datetime(test2019['연월일'])
test2019['year'] = test2019['연월일'].dt.year
test2019['month'] = test2019['연월일'].dt.month
test2019['day'] = test2019['연월일'].dt.day
test2019['weekday'] = test2019['연월일'].dt.weekday
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1


## 2019년 기온 훈련/예측

In [9]:
start = time.time()

# 학습 특성 : 'month', 'weekday', '시간'
# 타겟 특성 : '기온'
test_col = ['month', 'weekday', '시간']
target_col = ['기온']
train_col = test_col + target_col

temp_train = total[train_col]
temp_test = test2019[test_col]

# pycaret 설정
exp = setup(temp_train, target=target_col[0], session_id=0, silent=True, fold=7, fold_shuffle=True, verbose=False)

# 모델 생성(lgbm, cat, xgb)
top3_models = compare_models(sort='MAPE', n_select=3)

end = time.time()
sec = end - start
result = datetime.timedelta(seconds=sec)
print(str(result).split('.')[0])

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,2.9867,14.8477,3.8532,0.8831,0.5169,0.8,1.2314
catboost,CatBoost Regressor,2.9891,14.8784,3.8572,0.8829,0.5191,0.8015,20.5157
xgboost,Extreme Gradient Boosting,2.9924,14.9107,3.8614,0.8826,0.5186,0.8024,11.0229
et,Extra Trees Regressor,2.9998,14.985,3.871,0.882,0.5199,0.8038,33.11
dt,Decision Tree Regressor,2.9998,14.985,3.871,0.882,0.5199,0.8038,0.4529
rf,Random Forest Regressor,3.0,14.9861,3.8712,0.882,0.5202,0.8039,26.4071
gbr,Gradient Boosting Regressor,3.0347,15.2176,3.901,0.8802,0.5221,0.8158,10.67
huber,Huber Regressor,3.5595,20.1918,4.4935,0.841,0.6011,0.8233,1.9657
ridge,Ridge Regression,3.5634,20.1686,4.4909,0.8412,0.5932,0.8338,0.0943
br,Bayesian Ridge,3.5634,20.1686,4.4909,0.8412,0.5932,0.8338,0.2629


0:16:25


In [10]:
start = time.time()

# 블랜드 모델
blended_top3_models = blend_models(estimator_list=top3_models, optimize='MAPE')

# 기온 예측
temp2019_pred_top3 = predict_model(blended_top3_models, data=temp_test)

# 2019년 예측 기온 열 추가
test2019[target_col[0]] = temp2019_pred_top3['Label']

end = time.time()
sec = end - start
result = datetime.timedelta(seconds=sec)
print(str(result).split('.')[0])

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,2.9731,14.7102,3.8354,0.8833,0.5121,0.8168
1,2.9872,14.887,3.8584,0.882,0.5156,0.7825
2,2.9907,14.9187,3.8625,0.8829,0.5202,0.802
3,2.9861,14.9033,3.8605,0.8824,0.5186,0.819
4,3.0061,15.0172,3.8752,0.8828,0.5222,0.8082
5,2.9707,14.6959,3.8335,0.8847,0.5184,0.7846
6,3.0057,14.962,3.8681,0.8825,0.5202,0.7945
Mean,2.9885,14.8706,3.8562,0.8829,0.5182,0.8011
SD,0.0129,0.1132,0.0147,0.0008,0.0031,0.0135


0:04:01


In [11]:
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,기온
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1,-3.894176
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1,-4.363952
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1,-4.476645
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1,-4.922204
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1,-5.048251


## 2019년 기압 훈련/예측

In [12]:
start = time.time()

# 학습 특성 : 'month', 'weekday', '시간', '기온'
# 타겟 특성 : '기압'
test_col = ['month', 'weekday', '시간', '기온']
target_col = ['기압']
train_col = test_col + target_col

temp_train = total[train_col]
temp_test = test2019[test_col]

# pycaret 설정
exp = setup(temp_train, target=target_col[0], session_id=0, silent=True, fold=7, fold_shuffle=True, verbose=False)

# 모델 생성(dt, et, rf)
top3_models = compare_models(sort='MAPE', n_select=3)

end = time.time()
sec = end - start
result = datetime.timedelta(seconds=sec)
print(str(result).split('.')[0])

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
dt,Decision Tree Regressor,0.514,2.4898,1.5778,0.9617,0.0016,0.0005,0.6014
et,Extra Trees Regressor,0.5115,2.445,1.5636,0.9624,0.0016,0.0005,42.8614
rf,Random Forest Regressor,0.6781,2.6323,1.6224,0.9595,0.0016,0.0007,37.6586
knn,K Neighbors Regressor,1.607,6.6596,2.5806,0.8975,0.0026,0.0016,2.8343
catboost,CatBoost Regressor,3.2454,17.6192,4.1975,0.7288,0.0042,0.0032,19.4957
xgboost,Extreme Gradient Boosting,3.2041,17.2745,4.1563,0.7341,0.0041,0.0032,10.9057
lightgbm,Light Gradient Boosting Machine,3.381,19.0631,4.3661,0.7066,0.0043,0.0034,0.8214
gbr,Gradient Boosting Regressor,3.5261,20.7527,4.5555,0.6806,0.0045,0.0035,14.89
lr,Linear Regression,3.6156,21.7702,4.6658,0.6649,0.0046,0.0036,0.1243
ridge,Ridge Regression,3.6156,21.7702,4.6658,0.6649,0.0046,0.0036,0.0857


0:18:14


In [13]:
start = time.time()

# 블랜드 모델
blended_top3_models = blend_models(estimator_list=top3_models, optimize='MAPE')

# 기압 예측
pressure2019_pred_top3 = predict_model(blended_top3_models, data=temp_test)

# 2019년 예측 기압 열 추가
test2019[target_col[0]] = pressure2019_pred_top3['Label']

end = time.time()
sec = end - start
result = datetime.timedelta(seconds=sec)
print(str(result).split('.')[0])

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.5613,2.3826,1.5436,0.9631,0.0015,0.0006
1,0.5678,2.4687,1.5712,0.9619,0.0016,0.0006
2,0.5602,2.5127,1.5851,0.9614,0.0016,0.0006
3,0.5707,2.4924,1.5787,0.9616,0.0016,0.0006
4,0.5709,2.5535,1.598,0.9607,0.0016,0.0006
5,0.5727,2.4334,1.5599,0.9629,0.0016,0.0006
6,0.566,2.4517,1.5658,0.9622,0.0016,0.0006
Mean,0.5671,2.4707,1.5718,0.962,0.0016,0.0006
SD,0.0045,0.0516,0.0164,0.0008,0.0,0.0


0:11:26


In [14]:
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,기온,기압
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1,-3.894176,1012.103393
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1,-4.363952,1015.612686
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1,-4.476645,1024.535644
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1,-4.922204,1014.696643
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1,-5.048251,1006.996327


## 2019년 습도(Humidity) 훈련/예측

In [15]:
start = time.time()

# 학습 특성 : 'month', 'weekday', '시간', '기온', '기압'
# 타겟 특성 : '습도'
test_col = ['month', 'weekday', '시간', '기온', '기압']
target_col = ['습도']
train_col = test_col + target_col

temp_train = total[train_col]
temp_test = test2019[test_col]

# pycaret 설정
exp = setup(temp_train, target=target_col[0], session_id=0, silent=True, fold=7, fold_shuffle=True, verbose=False)

# 모델 생성(et, dt, rf)
top3_models = compare_models(sort='MAPE', n_select=3)

end = time.time()
sec = end - start
result = datetime.timedelta(seconds=sec)
print(str(result).split('.')[0])

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,0.0429,0.5902,0.7675,0.9985,0.0141,0.0008,70.2886
dt,Decision Tree Regressor,0.0519,1.0219,1.0083,0.9974,0.0193,0.001,0.7586
rf,Random Forest Regressor,0.4825,1.7154,1.3096,0.9957,0.0271,0.0099,53.1357
knn,K Neighbors Regressor,2.5277,25.2091,5.0205,0.9365,0.0996,0.0514,2.8214
xgboost,Extreme Gradient Boosting,9.1373,139.3099,11.8027,0.6491,0.2283,0.1866,17.3929
catboost,CatBoost Regressor,9.2096,139.6841,11.8187,0.6482,0.2293,0.1885,27.9071
lightgbm,Light Gradient Boosting Machine,10.319,169.9603,13.0368,0.5719,0.2502,0.2114,1.2543
gbr,Gradient Boosting Regressor,11.2802,202.0262,14.2135,0.4911,0.2708,0.2324,21.3214
ada,AdaBoost Regressor,13.5165,269.3497,16.4117,0.3216,0.3187,0.2911,9.8943
lr,Linear Regression,13.9772,289.533,17.0156,0.2707,0.3239,0.2928,0.16


0:27:45


In [16]:
start = time.time()

# 블랜드 모델
blended_top3_models = blend_models(estimator_list=top3_models, optimize='MAPE')

# 습도 예측
humidity2019_pred_top3 = predict_model(blended_top3_models, data=temp_test)

# 2019년 예측 습도 열 추가
test2019[target_col[0]] = humidity2019_pred_top3['Label']

end = time.time()
sec = end - start
result = datetime.timedelta(seconds=sec)
print(str(result).split('.')[0])

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.1854,0.6559,0.8099,0.9984,0.0156,0.0038
1,0.1974,0.8248,0.9082,0.9979,0.0179,0.004
2,0.1894,0.7628,0.8734,0.9981,0.0172,0.0039
3,0.1868,0.7193,0.8481,0.9982,0.0155,0.0037
4,0.1895,0.7535,0.8681,0.9981,0.0168,0.0039
5,0.1905,0.7707,0.8779,0.9981,0.0176,0.0038
6,0.1928,0.7904,0.889,0.998,0.0168,0.0039
Mean,0.1903,0.7539,0.8678,0.9981,0.0168,0.0039
SD,0.0037,0.05,0.0292,0.0001,0.0009,0.0001


0:17:42


In [17]:
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,기온,기압,습도
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1,-3.894176,1012.103393,59.38
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1,-4.363952,1015.612686,61.166667
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1,-4.476645,1024.535644,54.536667
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1,-4.922204,1014.696643,60.153333
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1,-5.048251,1006.996327,61.723333


### 2019년 예측 기온, 기압, 습도 데이터 파일로 출력

In [25]:
test2019.to_csv('2019년_예측_기온_기압_습도.csv', index=False)

## 2019년 가스 공급량 훈련/예측

### log_공급량으로 진행

In [18]:
total['log_공급량'] = np.log1p(total['공급량'])
total.head()

Unnamed: 0,연월일,year,month,day,weekday,시간,구분,공급량,기온,습도,기압,log_공급량
0,2013-01-01,2013,1,1,1,1,0,2497.129,-8.5,57.0,1010.0,7.823297
1,2013-01-01,2013,1,1,1,1,1,2169.093,-8.5,57.0,1010.0,7.682525
2,2013-01-01,2013,1,1,1,1,2,226.178,-8.5,57.0,1010.0,5.425734
3,2013-01-01,2013,1,1,1,1,3,1434.516,-8.5,57.0,1010.0,7.26928
4,2013-01-01,2013,1,1,1,1,4,3272.837,-8.5,57.0,1010.0,8.093718


## 2019년 log_공급량 훈련/예측, 구분별

In [19]:
# 구분에 따라 예측하기
divisions = test2019['구분'].unique()

# 학습 특성 : 'month', 'weekday', '시간', '구분', '기온', '기압', '습도'
# 타겟 특성 : 'log_공급량'
test_col = ['month', 'weekday', '시간', '구분', '기온', '기압', '습도']
target_col = ['log_공급량']
train_col = test_col + target_col

gas2019_pred = pd.DataFrame()
for division in tqdm(divisions):
    # 구분별 훈련, 테스트 셋 설정
    temp_train = total[total['구분'] == division].reset_index(drop=True)
    temp_train = temp_train[train_col]

    temp_test = test2019[test2019['구분'] == division].reset_index(drop=True)
    temp_test = temp_test[test_col]

    # pycaret 설정
    exp = setup(temp_train, target=target_col[0], session_id=0, silent=True, remove_outliers=True, outliers_threshold=0.1, fold=7, fold_shuffle=True)

    # 모델 생성{ (cat, xgb, et) or (cat, xgb, lgbm) or (et, dt, rf) }
    top3_models = compare_models(sort='MAPE', n_select=3)

    # 블랜드 모델
    blended_top3_models = blend_models(estimator_list=top3_models, optimize='MAPE')

    # 기압 예측
    log_gas2019_pred_top3 = predict_model(blended_top3_models, data=temp_test)

    # log_공급량 값을 공급량 값으로 변환
    gas_pred2019_top3 = np.expm1(log_gas2019_pred_top3['Label'])

    # 구분별 예측 값 합치기
    gas2019_pred = pd.concat([gas2019_pred, gas_pred2019_top3])
    gas2019_pred = gas2019_pred.reset_index(drop=True)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.0711,0.0101,0.1004,0.9838,0.0179,0.0145
1,0.0719,0.0108,0.104,0.9823,0.0179,0.0145
2,0.0723,0.0131,0.1143,0.979,0.0222,0.015
3,0.0716,0.0106,0.1028,0.9836,0.0183,0.0146
4,0.0724,0.0112,0.1056,0.9819,0.0184,0.0146
5,0.0724,0.0125,0.1119,0.9793,0.0188,0.0146
6,0.0701,0.0096,0.0982,0.9846,0.0173,0.0142
Mean,0.0717,0.0111,0.1053,0.9821,0.0187,0.0146
SD,0.0008,0.0012,0.0054,0.002,0.0015,0.0002


100%|██████████| 7/7 [41:25<00:00, 355.10s/it]


In [20]:
# 예측 결과 확인
gas2019_pred.head()

Unnamed: 0,0
0,2010.968452
1,1820.834736
2,1703.292898
3,1696.259298
4,1904.915174


## 제출 파일 가져오기

In [21]:
sub = pd.read_csv('../data/sample_submission.csv')
sub.head()

Unnamed: 0,일자|시간|구분,공급량
0,2019-01-01 01 A,0
1,2019-01-01 02 A,0
2,2019-01-01 03 A,0
3,2019-01-01 04 A,0
4,2019-01-01 05 A,0


In [22]:
sub['공급량'] = gas2019_pred
sub.head()

Unnamed: 0,일자|시간|구분,공급량
0,2019-01-01 01 A,2010.968452
1,2019-01-01 02 A,1820.834736
2,2019-01-01 03 A,1703.292898
3,2019-01-01 04 A,1696.259298
4,2019-01-01 05 A,1904.915174


In [23]:
sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15120 entries, 0 to 15119
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   일자|시간|구분  15120 non-null  object 
 1   공급량       15120 non-null  float64
dtypes: float64(1), object(1)
memory usage: 236.4+ KB


## 제출 파일 출력

In [24]:
sub.to_csv("sub34_pycaret22(ver2.5).csv", index=False)