# pycaret을 이용한 가스공급량 예측
* 2019년 test 데이터 : '../data/test.csv'
* 2019년 제출 파일 : '../data/sample_submission.csv'
* 2013-2018년 가스공급량과 기온 자료 : '../2013-2018년_가스공급량_기온_습도_기압01.csv'
* 사용 특성 : 'month', 'day', 'weekday', '시간', '구분', '기온', '습도', '기압'
* log적용
* 3개 모델

In [None]:
# 제출 점수 : 

## 데이터 가져오기
* 2019년 test 데이터 : '../data/test.csv'
* 2019년 제출 파일 : '../data/sample_submission.csv'
* 2013-2018년 가스공급량과 기온 자료 : '../2013-2018년_가스공급량_기온_습도_기압01.csv'

In [22]:
import pandas as pd

### 2013-2018년 가스공급량과 기온 자료

In [23]:
# Load the 2013-2018 training table (gas supply plus temperature, humidity and pressure columns).
total = pd.read_csv(filepath_or_buffer='../2013-2018년_가스공급량_기온_습도_기압01.csv')
total.head()

Unnamed: 0,year,month,day,weekday,시간,구분,공급량,기온,습도,기압
0,2013,1,1,1,1,0,2497.129,-8.5,57.0,1010.0
1,2013,1,1,1,1,1,2169.093,-8.5,57.0,1010.0
2,2013,1,1,1,1,2,226.178,-8.5,57.0,1010.0
3,2013,1,1,1,1,3,1434.516,-8.5,57.0,1010.0
4,2013,1,1,1,1,4,3272.837,-8.5,57.0,1010.0


#### 2019년 데이터

In [24]:
# Load the 2019 test rows; each row carries one combined '일자|시간|구분' key.
test2019 = pd.read_csv(filepath_or_buffer='../data/test.csv')
test2019.head()

Unnamed: 0,일자|시간|구분
0,2019-01-01 01 A
1,2019-01-01 02 A
2,2019-01-01 03 A
3,2019-01-01 04 A
4,2019-01-01 05 A


In [25]:
# Break the combined key on single spaces into its three parts (date, hour, category)
# and store them as separate columns.
split_rows = test2019['일자|시간|구분'].str.split(' ').tolist()
test2019[['연월일', '시간', '구분']] = split_rows
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분
0,2019-01-01 01 A,2019-01-01,1,A
1,2019-01-01 02 A,2019-01-01,2,A
2,2019-01-01 03 A,2019-01-01,3,A
3,2019-01-01 04 A,2019-01-01,4,A
4,2019-01-01 05 A,2019-01-01,5,A


In [26]:
# Convert the '구분' column to numbers: each distinct value gets an integer code
# in order of first appearance.
# NOTE(review): assumes these codes line up with the integer '구분' codes already
# present in the training table — confirm.
d_map = {label: code for code, label in enumerate(test2019['구분'].unique())}
test2019['구분'] = test2019['구분'].map(d_map)
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분
0,2019-01-01 01 A,2019-01-01,1,0
1,2019-01-01 02 A,2019-01-01,2,0
2,2019-01-01 03 A,2019-01-01,3,0
3,2019-01-01 04 A,2019-01-01,4,0
4,2019-01-01 05 A,2019-01-01,5,0


In [27]:
# '연월일' is an object-typed column: parse it to datetime, then derive
# year, month, day and weekday columns from it.
test2019['연월일'] = pd.to_datetime(test2019['연월일'])
for part in ('year', 'month', 'day', 'weekday'):
    test2019[part] = getattr(test2019['연월일'].dt, part)
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1


## 2019년 기온 예측하기
* 습도 예측
* 기압 예측
* 기온 예측 (예측한 습도·기압을 특성으로 사용)

In [28]:
from pycaret.regression import *

#### 습도(Humidity) 예측

In [29]:
# Load the saved humidity model 'humidity_prediction_model_01'.
# Training features: 'month', 'day', 'weekday', '시간', '구분'
# Target: '습도'

humidity_prediction_model = load_model('humidity_prediction_model_01')

Transformation Pipeline and Model Successfully Loaded


In [30]:
%%time
# Pick the feature columns for the humidity model and predict 2019 humidity.
humidity_features = ['month', 'day', 'weekday', '시간', '구분']
df_for_temp = test2019[humidity_features]
humidity2019_pred = predict_model(humidity_prediction_model, data=df_for_temp)

Wall time: 3.04 s


In [31]:
# Preview the humidity predictions. They are stored in `humidity2019_pred`;
# `humidity_pred` is never assigned in this notebook and raised NameError on a clean run.
humidity2019_pred.head()

Unnamed: 0,month,day,weekday,시간,구분,Label
0,1,1,1,1,0,64.463024
1,1,1,1,2,0,65.66405
2,1,1,1,3,0,65.659699
3,1,1,1,4,0,66.32069
4,1,1,1,5,0,67.903796


In [32]:
# Preview the test frame before the predicted humidity column is attached.
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1


In [33]:
# Attach the predicted humidity ('Label' column of the prediction frame)
# to the test frame as the '습도' feature.
test2019['습도'] = humidity2019_pred['Label']
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,습도
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1,64.463024
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1,65.66405
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1,65.659699
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1,66.32069
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1,67.903796


#### 기압 예측

In [34]:
# Pressure model experiment.
# Training features: 'month', 'day', 'weekday', '시간', '구분'
# Target: '기압'
# '공급량', 'year', '기온' and '습도' are excluded through ignore_features.

exp = setup(total, target='기압', ignore_features=['공급량', 'year', '기온', '습도'])

Unnamed: 0,Description,Value
0,session_id,6480
1,Target,기압
2,Original Data,"(368088, 10)"
3,Missing Values,False
4,Numeric Features,2
5,Categorical Features,3
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(257661, 28)"


In [35]:
%%time
# Compare candidate regressors sorted by MAPE and keep the best three.
# NOTE(review): rows that share a timestamp across '구분' carry the same '기압',
# so a random split may leak the target between train and validation —
# confirm the scores with a time-based split.
pressure_model3 = compare_models(sort='MAPE', n_select=3)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
dt,Decision Tree Regressor,0.0088,0.046,0.2088,0.9993,0.0002,0.0,0.797
rf,Random Forest Regressor,0.0453,0.0096,0.098,0.9999,0.0001,0.0,48.754
et,Extra Trees Regressor,0.079,0.025,0.158,0.9996,0.0002,0.0001,60.446
xgboost,Extreme Gradient Boosting,2.0973,7.3468,2.7101,0.8872,0.0027,0.0021,12.601
catboost,CatBoost Regressor,2.3382,8.7624,2.9601,0.8655,0.0029,0.0023,19.517
lightgbm,Light Gradient Boosting Machine,2.9777,14.5325,3.812,0.7769,0.0038,0.003,0.863
knn,K Neighbors Regressor,3.2405,18.0919,4.2534,0.7223,0.0042,0.0032,6.335
gbr,Gradient Boosting Regressor,3.5071,20.5911,4.5376,0.684,0.0045,0.0035,16.482
lr,Linear Regression,3.7272,23.0266,4.7985,0.6466,0.0048,0.0037,0.91
ridge,Ridge Regression,3.7261,23.0177,4.7976,0.6467,0.0048,0.0037,0.111


Wall time: 33min 8s


In [36]:
%%time
# Blend the three selected pressure models into one estimator.
blended_pressure_model3 = blend_models(estimator_list=pressure_model3)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.0391,0.0093,0.0965,0.9999,0.0001,0.0
1,0.0403,0.0088,0.0941,0.9999,0.0001,0.0
2,0.039,0.0104,0.1017,0.9998,0.0001,0.0
3,0.0388,0.0065,0.0808,0.9999,0.0001,0.0
4,0.0425,0.0142,0.1192,0.9998,0.0001,0.0
5,0.0401,0.0115,0.1073,0.9998,0.0001,0.0
6,0.0419,0.0152,0.1232,0.9998,0.0001,0.0
7,0.0399,0.0087,0.0931,0.9999,0.0001,0.0
8,0.0418,0.0148,0.1217,0.9998,0.0001,0.0
9,0.0401,0.0102,0.1008,0.9998,0.0001,0.0


Wall time: 21min 6s


In [37]:
%%time
# Finalize the blended pressure model (per the pycaret docs, this refits it on the
# whole dataset passed to setup, hold-out included).
final_pressure_model3 = finalize_model(blended_pressure_model3)

Wall time: 14min 19s


In [38]:
# Predict 2019 pressure for every test row; predictions come back in a 'Label' column.
pressure2019_pred = predict_model(final_pressure_model3, data=test2019)

In [39]:
# Preview the pressure predictions.
pressure2019_pred.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,습도,Label
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1,64.463024,1010.0
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1,65.66405,1009.400024
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1,65.659699,1009.200012
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1,66.32069,1008.200012
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1,67.903796,1007.297321


In [40]:
# Attach the predicted pressure ('Label') to the test frame as the '기압' feature.
test2019['기압'] = pressure2019_pred['Label']
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,습도,기압
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1,64.463024,1010.0
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1,65.66405,1009.400024
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1,65.659699,1009.200012
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1,66.32069,1008.200012
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1,67.903796,1007.297321


#### 기온예측

In [41]:
# Temperature model experiment.
# Training features: 'month', 'day', 'weekday', '시간', '구분', '습도', '기압'
# Target: '기온'
# '공급량' and 'year' are excluded through ignore_features.

exp = setup(total, target='기온', ignore_features=['공급량', 'year'])

Unnamed: 0,Description,Value
0,session_id,3903
1,Target,기온
2,Original Data,"(368088, 10)"
3,Missing Values,False
4,Numeric Features,4
5,Categorical Features,3
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(257661, 30)"


In [42]:
%%time
# Candidates chosen with reference to 07-05_결과_출력(pycaret)_ver0.1(2021.11.15);
# the decision-tree model was judged to be overfitting and is left out.
temp_candidates = ['knn', 'catboost', 'lightgbm']
temp_models3 = compare_models(include=temp_candidates, n_select=3, sort='MAPE')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
knn,K Neighbors Regressor,0.7226,2.8834,1.6979,0.9773,0.2204,0.2164,2.854
catboost,CatBoost Regressor,1.4373,3.5221,1.8767,0.9723,0.3087,0.399,21.144
lightgbm,Light Gradient Boosting Machine,2.0176,6.8165,2.6108,0.9463,0.399,0.5588,1.262


Wall time: 4min 47s


In [43]:
%%time
# Tune each selected temperature model in turn, preserving the selection order.
tuned_temp_models3 = []
for candidate in temp_models3:
    tuned_temp_models3.append(tune_model(candidate))

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,1.2291,2.5729,1.604,0.9798,0.2619,0.3332
1,1.2282,2.5765,1.6051,0.9797,0.264,0.3399
2,1.2085,2.5066,1.5832,0.9803,0.2618,0.3273
3,1.2283,2.5727,1.604,0.9797,0.2616,0.3256
4,1.221,2.5544,1.5983,0.98,0.264,0.3397
5,1.2222,2.5419,1.5943,0.98,0.264,0.3271
6,1.2162,2.5218,1.588,0.9801,0.2647,0.3375
7,1.2321,2.6113,1.6159,0.9793,0.2677,0.3463
8,1.2142,2.5125,1.5851,0.9801,0.2591,0.3353
9,1.2052,2.4881,1.5774,0.9804,0.2654,0.3308


Wall time: 17min 31s


In [None]:
%%time
# Blend the tuned temperature models into one estimator.
blend_models3 = blend_models(estimator_list=tuned_temp_models3)

IntProgress(value=0, description='Processing: ', max=6)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE


In [None]:
%%time
# Finalize the blended temperature model for use on the 2019 data.
temp_prediction_model = finalize_model(blend_models3)

In [None]:
# Persist the finalized temperature model under the name 'temp_prediction_model01'.
save_model(temp_prediction_model, 'temp_prediction_model01')

In [None]:
# Predict 2019 temperature from the test frame (which by now also carries the
# predicted '습도' and '기압' columns) and preview the result.
temp_pred = predict_model(temp_prediction_model, data=test2019)
temp_pred.head()

In [None]:
# Attach the predicted temperature ('Label') to the test frame as the '기온' feature.
test2019['기온'] = temp_pred['Label']
test2019.head()

### 가스 공급량 예측

In [None]:
import numpy as np

In [None]:
# Add a log1p-transformed copy of the supply column; it serves as the regression target.
total['log_공급량'] = total['공급량'].pipe(np.log1p)
total.head()

In [None]:
# Gas supply model experiment; the target is the log1p-transformed supply.
# Training features: 'month', 'day', 'weekday', '시간', '구분', '습도', '기압', '기온'
# 'year' and the raw '공급량' column are excluded through ignore_features.
exp = setup(total, target='log_공급량', ignore_features=['year', '공급량'])

In [None]:
# Model comparison (disabled; kept for reference).

# compare_models(sort='MAPE', exclude=['lr','lasso','ridge','rf','dt','et'])

In [None]:
# Train the three base regressors, in order: CatBoost, LightGBM, XGBoost.
cat, lgbm, xgb = [create_model(model_id) for model_id in ('catboost', 'lightgbm', 'xgboost')]

In [None]:
%%time
# Hyperparameter-tune each base regressor, keeping the CatBoost / LightGBM / XGBoost order.
tuned_cat, tuned_lgbm, tuned_xgb = [tune_model(base) for base in (cat, lgbm, xgb)]

In [None]:
%%time
# Blend the three tuned regressors into one estimator.
# blend_models takes the model list as `estimator_list` (as in the pressure and
# temperature cells); the former `estimators=` keyword raised TypeError.
blend_model = blend_models(estimator_list=[tuned_cat, tuned_lgbm, tuned_xgb])

In [None]:
%%time
# Finalize the blended gas supply model for use on the 2019 data.
gas_prediction_model = finalize_model(blend_model)

In [None]:
# Persist the finalized gas supply model under the name 'gas_prediction_model02'.
save_model(gas_prediction_model, 'gas_prediction_model02')

In [None]:
# Predict the 2019 gas supply. The model was trained on log1p(공급량), so the
# 'Label' column is on the log1p scale and still has to be inverted with expm1.
# The result is bound to `gas_pred`, the name read by the preview below and by the
# submission step; it used to be bound to `log_gas_pred`, which left `gas_pred` undefined.
gas_pred = predict_model(gas_prediction_model, data=test2019)
gas_pred.head()

### 제출 파일 만들기

In [None]:
# Load the submission template that will receive the predictions.
sub = pd.read_csv(filepath_or_buffer='../data/sample_submission.csv')
sub.head()

In [None]:
# Invert the log1p transform used for the training target so the predictions
# are back on the original 공급량 scale.
pred = np.expm1(gas_pred['Label'])

In [None]:
# Write the back-transformed predictions into the submission's '공급량' column.
# NOTE(review): assigning a Series aligns on index — presumably `sub` and `pred`
# share the same default row index; confirm the row order matches test.csv.
sub['공급량'] = pred
sub.head()

In [None]:
# Check column dtypes and non-null counts before writing the file.
sub.info()

In [None]:
# Write the submission file without the row index.
sub.to_csv(path_or_buf="sub14_pycaret02.csv", index=False)