# pycaret을 이용한 가스공급량 예측
* 2019년 test 데이터 : '../data/test.csv'
* 2019년 제출 파일 : '../data/sample_submission.csv'
* 2013-2018년 가스공급량과 기온·습도·기압 자료 : '../2013-2018년_가스공급량_기온_습도_기압01.csv'
* 사용 특성 : 'month', 'day', 'weekday', '시간', '구분', '기온', '습도', '기압'
* 타깃(공급량)에 log1p 변환을 적용하고, 예측값은 expm1로 복원
* 3개 모델(catboost, lightgbm, xgboost)을 튜닝한 뒤 블렌딩

In [None]:
# 제출 점수 : 0.1230232924

## 데이터 가져오기
* 2019년 test 데이터 : '../data/test.csv'
* 2019년 제출 파일 : '../data/sample_submission.csv'
* 2013-2018년 가스공급량과 기온·습도·기압 자료 : '../2013-2018년_가스공급량_기온_습도_기압01.csv'

In [22]:
import pandas as pd

### 2013-2018년 가스공급량과 기온·습도·기압 자료

In [23]:
# Load the 2013-2018 training table. The preview below shows the columns
# year, month, day, weekday, 시간, 구분, 공급량, 기온, 습도 and 기압.
total = pd.read_csv('../2013-2018년_가스공급량_기온_습도_기압01.csv')
# Notebook display of the first rows.
total.head()

Unnamed: 0,year,month,day,weekday,시간,구분,공급량,기온,습도,기압
0,2013,1,1,1,1,0,2497.129,-8.5,57.0,1010.0
1,2013,1,1,1,1,1,2169.093,-8.5,57.0,1010.0
2,2013,1,1,1,1,2,226.178,-8.5,57.0,1010.0
3,2013,1,1,1,1,3,1434.516,-8.5,57.0,1010.0
4,2013,1,1,1,1,4,3272.837,-8.5,57.0,1010.0


#### 2019년 데이터

In [24]:
# Load the 2019 test keys; the file has a single combined column '일자|시간|구분'.
test2019 = pd.read_csv('../data/test.csv')
# Notebook display of the first rows.
test2019.head()

Unnamed: 0,일자|시간|구분
0,2019-01-01 01 A
1,2019-01-01 02 A
2,2019-01-01 03 A
3,2019-01-01 04 A
4,2019-01-01 05 A


In [25]:
# Split the combined '일자|시간|구분' key on spaces into three new columns:
# '연월일' (date), '시간' (hour) and '구분' (category).
# NOTE(review): str.split yields strings, so '시간' is not an integer column at
# this point — confirm the downstream models cast it to the training dtype.
test2019[['연월일', '시간', '구분']] = test2019['일자|시간|구분'].str.split(' ').tolist()
# Notebook display of the first rows.
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분
0,2019-01-01 01 A,2019-01-01,1,A
1,2019-01-01 02 A,2019-01-01,2,A
2,2019-01-01 03 A,2019-01-01,3,A
3,2019-01-01 04 A,2019-01-01,4,A
4,2019-01-01 05 A,2019-01-01,5,A


In [26]:
# Encode the '구분' column as integer codes, numbered in order of first
# appearance in the test file.
# NOTE(review): confirm this numbering matches the integer '구분' codes used in
# the training table; it depends on the row order of test.csv.
d_map = {label: code for code, label in enumerate(test2019['구분'].unique())}
test2019['구분'] = test2019['구분'].map(d_map)
# Notebook display of the first rows.
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분
0,2019-01-01 01 A,2019-01-01,1,0
1,2019-01-01 02 A,2019-01-01,2,0
2,2019-01-01 03 A,2019-01-01,3,0
3,2019-01-01 04 A,2019-01-01,4,0
4,2019-01-01 05 A,2019-01-01,5,0


In [27]:
# '연월일' is an object (string) column: parse it to datetime, then derive the
# calendar features year / month / day / weekday from the parsed dates.
test2019['연월일'] = pd.to_datetime(test2019['연월일'])
for part in ('year', 'month', 'day', 'weekday'):
    test2019[part] = getattr(test2019['연월일'].dt, part)
# Notebook display of the first rows.
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1


## 2019년 기온 예측하기
* 습도 예측
* 기압 예측
* 기온 예측 (예측한 습도·기압을 특성으로 사용)

In [28]:
from pycaret.regression import *

#### 습도(Humidity) 예측

In [29]:
# Training features: 'month', 'day', 'weekday', '시간', '구분'
# Target: '습도'

# Load the previously saved humidity model (pipeline + estimator) from disk
# instead of retraining it in this notebook.
humidity_prediction_model = load_model('humidity_prediction_model_01')

Transformation Pipeline and Model Successfully Loaded


In [30]:
%%time
# Select only the calendar/category features the humidity model was trained on.
# NOTE(review): the frame is named df_for_temp but is used here for humidity.
df_for_temp = test2019[['month', 'day', 'weekday', '시간', '구분']]
# Predict 2019 humidity; the output preview shows predictions in a 'Label' column.
humidity2019_pred = predict_model(humidity_prediction_model, data=df_for_temp)

Wall time: 3.04 s


In [31]:
# Preview the 2019 humidity predictions ('Label' holds the predicted values).
# The frame returned by predict_model in the previous cell is humidity2019_pred;
# `humidity_pred` is not defined in any visible cell, so a clean top-to-bottom
# run would fail with NameError.
humidity2019_pred.head()

Unnamed: 0,month,day,weekday,시간,구분,Label
0,1,1,1,1,0,64.463024
1,1,1,1,2,0,65.66405
2,1,1,1,3,0,65.659699
3,1,1,1,4,0,66.32069
4,1,1,1,5,0,67.903796


In [32]:
# Notebook display of test2019 before the predicted humidity is attached.
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1


In [33]:
# Attach the predicted humidity to the test frame as the '습도' feature
# (pandas aligns the assigned Series on the row index).
test2019['습도'] = humidity2019_pred['Label']
# Notebook display of the first rows.
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,습도
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1,64.463024
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1,65.66405
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1,65.659699
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1,66.32069
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1,67.903796


#### 기압 예측

In [34]:
# Training features: 'month', 'day', 'weekday', '시간', '구분'
# Target: '기압'

# Initialise the pycaret experiment for pressure; the ignored columns are
# excluded so that only the features listed above are used.
exp = setup(total, target='기압', ignore_features=['공급량', 'year', '기온', '습도'])

Unnamed: 0,Description,Value
0,session_id,6480
1,Target,기압
2,Original Data,"(368088, 10)"
3,Missing Values,False
4,Numeric Features,2
5,Categorical Features,3
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(257661, 28)"


In [35]:
%%time
# Compare the candidate regressors, rank them by MAPE and keep the top three.
# NOTE(review): the result table shows near-zero error for the tree models
# (dt, rf, et); the temperature section excludes decision trees as overfitting —
# consider whether the same concern applies here.
pressure_model3 = compare_models(sort='MAPE', n_select=3)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
dt,Decision Tree Regressor,0.0088,0.046,0.2088,0.9993,0.0002,0.0,0.797
rf,Random Forest Regressor,0.0453,0.0096,0.098,0.9999,0.0001,0.0,48.754
et,Extra Trees Regressor,0.079,0.025,0.158,0.9996,0.0002,0.0001,60.446
xgboost,Extreme Gradient Boosting,2.0973,7.3468,2.7101,0.8872,0.0027,0.0021,12.601
catboost,CatBoost Regressor,2.3382,8.7624,2.9601,0.8655,0.0029,0.0023,19.517
lightgbm,Light Gradient Boosting Machine,2.9777,14.5325,3.812,0.7769,0.0038,0.003,0.863
knn,K Neighbors Regressor,3.2405,18.0919,4.2534,0.7223,0.0042,0.0032,6.335
gbr,Gradient Boosting Regressor,3.5071,20.5911,4.5376,0.684,0.0045,0.0035,16.482
lr,Linear Regression,3.7272,23.0266,4.7985,0.6466,0.0048,0.0037,0.91
ridge,Ridge Regression,3.7261,23.0177,4.7976,0.6467,0.0048,0.0037,0.111


Wall time: 33min 8s


In [36]:
%%time
# Blend the three selected pressure models into a single ensemble.
blended_pressure_model3 = blend_models(estimator_list=pressure_model3)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.0391,0.0093,0.0965,0.9999,0.0001,0.0
1,0.0403,0.0088,0.0941,0.9999,0.0001,0.0
2,0.039,0.0104,0.1017,0.9998,0.0001,0.0
3,0.0388,0.0065,0.0808,0.9999,0.0001,0.0
4,0.0425,0.0142,0.1192,0.9998,0.0001,0.0
5,0.0401,0.0115,0.1073,0.9998,0.0001,0.0
6,0.0419,0.0152,0.1232,0.9998,0.0001,0.0
7,0.0399,0.0087,0.0931,0.9999,0.0001,0.0
8,0.0418,0.0148,0.1217,0.9998,0.0001,0.0
9,0.0401,0.0102,0.1008,0.9998,0.0001,0.0


Wall time: 21min 6s


In [37]:
%%time
# Finalize the blended pressure model for use on the 2019 data.
final_pressure_model3 = finalize_model(blended_pressure_model3)

Wall time: 14min 19s


In [68]:
# Persist the finalized pressure model to disk.
# NOTE(review): this cell was executed out of order (In [68]); the echoed
# pipeline below shows target='log_공급량' and features_todrop=['year', '공급량'],
# i.e. the gas-supply setup rather than the pressure setup. Presumably
# save_model bundles the preprocessing pipeline of the most recent setup() —
# re-run this cell right after finalize_model, before any later setup() call,
# and confirm the saved pipeline reports target='기압'.
save_model(final_pressure_model3, 'final_pressure_model01')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True,
                                       features_todrop=['year', '공급량'],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=[], target='log_공급량',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 nume...
                                                                   criterion='mse',
                                                                   max_depth=None,
                                                                   max_features='auto',
                  

In [38]:
# Predict 2019 pressure from the test frame; predictions come back in 'Label'.
pressure2019_pred = predict_model(final_pressure_model3, data=test2019)

In [39]:
# Notebook display of the first pressure predictions.
pressure2019_pred.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,습도,Label
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1,64.463024,1010.0
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1,65.66405,1009.400024
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1,65.659699,1009.200012
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1,66.32069,1008.200012
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1,67.903796,1007.297321


In [40]:
# Attach the predicted pressure to the test frame as the '기압' feature.
test2019['기압'] = pressure2019_pred['Label']
# Notebook display of the first rows.
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,습도,기압
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1,64.463024,1010.0
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1,65.66405,1009.400024
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1,65.659699,1009.200012
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1,66.32069,1008.200012
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1,67.903796,1007.297321


#### 기온 예측

In [41]:
# Training features: 'month', 'day', 'weekday', '시간', '구분', '습도', '기압'
# Target: '기온'

# Initialise the pycaret experiment for temperature; only supply ('공급량') and
# 'year' are excluded, so humidity and pressure are used as inputs.
exp = setup(total, target='기온', ignore_features=['공급량', 'year'])

Unnamed: 0,Description,Value
0,session_id,3903
1,Target,기온
2,Original Data,"(368088, 10)"
3,Missing Values,False
4,Numeric Features,4
5,Categorical Features,3
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(257661, 30)"


In [42]:
%%time
# Models chosen with reference to notebook '07-05_결과_출력(pycaret)_ver0.1(2021.11.15)';
# the decision-tree model was excluded because it was judged to overfit.
# Only knn, catboost and lightgbm are compared, ranked by MAPE.
temp_models3 = compare_models(sort='MAPE',n_select=3, include=['knn','catboost','lightgbm'])

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
knn,K Neighbors Regressor,0.7226,2.8834,1.6979,0.9773,0.2204,0.2164,2.854
catboost,CatBoost Regressor,1.4373,3.5221,1.8767,0.9723,0.3087,0.399,21.144
lightgbm,Light Gradient Boosting Machine,2.0176,6.8165,2.6108,0.9463,0.399,0.5588,1.262


Wall time: 4min 47s


In [43]:
%%time
# Tune each of the three selected temperature models with tune_model's
# default settings, keeping the same order as temp_models3.
tuned_temp_models3 = [tune_model(model) for model in temp_models3]

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,1.2291,2.5729,1.604,0.9798,0.2619,0.3332
1,1.2282,2.5765,1.6051,0.9797,0.264,0.3399
2,1.2085,2.5066,1.5832,0.9803,0.2618,0.3273
3,1.2283,2.5727,1.604,0.9797,0.2616,0.3256
4,1.221,2.5544,1.5983,0.98,0.264,0.3397
5,1.2222,2.5419,1.5943,0.98,0.264,0.3271
6,1.2162,2.5218,1.588,0.9801,0.2647,0.3375
7,1.2321,2.6113,1.6159,0.9793,0.2677,0.3463
8,1.2142,2.5125,1.5851,0.9801,0.2591,0.3353
9,1.2052,2.4881,1.5774,0.9804,0.2654,0.3308


Wall time: 17min 31s


In [44]:
%%time
# Blend the three tuned temperature models into a single ensemble.
blend_models3 = blend_models(estimator_list=tuned_temp_models3)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.861,1.2964,1.1386,0.9898,0.195,0.2352
1,0.8623,1.2928,1.137,0.9898,0.1978,0.2382
2,0.8579,1.2973,1.139,0.9898,0.1993,0.2273
3,0.8698,1.3315,1.1539,0.9895,0.1981,0.2284
4,0.8637,1.3135,1.1461,0.9897,0.2032,0.24
5,0.8644,1.3158,1.1471,0.9896,0.1939,0.2287
6,0.8546,1.296,1.1384,0.9898,0.1962,0.2323
7,0.8649,1.3043,1.1421,0.9897,0.2009,0.2458
8,0.8522,1.2792,1.131,0.9899,0.1925,0.2326
9,0.8566,1.2929,1.1371,0.9898,0.1999,0.2371


Wall time: 2min 51s


In [45]:
%%time
# Finalize the blended temperature model for use on the 2019 data.
temp_prediction_model = finalize_model(blend_models3)

Wall time: 3min 34s


In [46]:
# Persist the finalized temperature model to disk; the echoed pipeline below
# reports target='기온', matching the setup above.
save_model(temp_prediction_model, 'temp_prediction_model01')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True,
                                       features_todrop=['공급량', 'year'],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=[], target='기온',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_s...
                                                             importance_type='split',
                                                             learning_rate=0.1,
                                                             max_depth=-1,
                                

In [47]:
# Predict 2019 temperature; test2019 already carries the predicted '습도' and
# '기압' columns, which this model uses as features.
temp_pred = predict_model(temp_prediction_model, data=test2019)
# Notebook display of the first predictions.
temp_pred.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,습도,기압,Label
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1,64.463024,1010.0,-0.74253
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1,65.66405,1009.400024,-1.873191
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1,65.659699,1009.200012,-2.116468
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1,66.32069,1008.200012,-0.642952
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1,67.903796,1007.297321,1.665344


In [48]:
# Attach the predicted temperature to the test frame as the '기온' feature.
test2019['기온'] = temp_pred['Label']
# Notebook display of the first rows.
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,습도,기압,기온
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1,64.463024,1010.0,-0.74253
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1,65.66405,1009.400024,-1.873191
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1,65.659699,1009.200012,-2.116468
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1,66.32069,1008.200012,-0.642952
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1,67.903796,1007.297321,1.665344


### 가스 공급량 예측

In [49]:
import numpy as np

In [50]:
# Add the log1p-transformed supply as the training target; predictions are
# mapped back with expm1 when the submission is built.
total['log_공급량'] = np.log1p(total['공급량'])
# Notebook display of the first rows.
total.head()

Unnamed: 0,year,month,day,weekday,시간,구분,공급량,기온,습도,기압,log_공급량
0,2013,1,1,1,1,0,2497.129,-8.5,57.0,1010.0,7.823297
1,2013,1,1,1,1,1,2169.093,-8.5,57.0,1010.0,7.682525
2,2013,1,1,1,1,2,226.178,-8.5,57.0,1010.0,5.425734
3,2013,1,1,1,1,3,1434.516,-8.5,57.0,1010.0,7.26928
4,2013,1,1,1,1,4,3272.837,-8.5,57.0,1010.0,8.093718


In [51]:
# Training features: 'month', 'day', 'weekday', '시간', '구분', '습도', '기압', '기온'
# Target: 'log_공급량'. The raw '공급량' is ignored so the untransformed target
# cannot be used as a feature; 'year' is ignored as well.
exp = setup(total, target='log_공급량', ignore_features=['year', '공급량'])

Unnamed: 0,Description,Value
0,session_id,2117
1,Target,log_공급량
2,Original Data,"(368088, 11)"
3,Missing Values,False
4,Numeric Features,5
5,Categorical Features,3
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(257661, 31)"


In [52]:
# 모델 비교

# compare_models(sort='MAPE', exclude=['lr','lasso','ridge','rf','dt','et'])

In [53]:
# Train the three base regressors for the gas-supply target; each call prints
# its own cross-validation table.
cat = create_model('catboost')
lgbm = create_model('lightgbm')
xgb = create_model('xgboost')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.1053,0.0618,0.2486,0.9572,0.0701,0.0328
1,0.1042,0.0569,0.2386,0.9597,0.0671,0.0315
2,0.1058,0.0616,0.2482,0.9574,0.0703,0.0331
3,0.1046,0.0568,0.2384,0.9606,0.0656,0.0307
4,0.1075,0.066,0.2568,0.954,0.0718,0.0336
5,0.1074,0.0642,0.2533,0.955,0.0707,0.0332
6,0.1075,0.0661,0.2572,0.9542,0.0713,0.0333
7,0.1044,0.0582,0.2413,0.959,0.0674,0.0315
8,0.1073,0.0644,0.2538,0.9553,0.0706,0.0331
9,0.1068,0.0666,0.258,0.9533,0.0721,0.0336


In [54]:
%%time
# Tune each base regressor with tune_model's default settings.
tuned_cat = tune_model(cat)
tuned_lgbm = tune_model(lgbm)
tuned_xgb = tune_model(xgb)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.0609,0.0188,0.137,0.987,0.0388,0.0163
1,0.0605,0.0185,0.1359,0.9869,0.0397,0.0165
2,0.0614,0.0195,0.1396,0.9865,0.0405,0.0169
3,0.0613,0.019,0.1379,0.9868,0.0392,0.0165
4,0.0605,0.0199,0.1411,0.9861,0.0404,0.0167
5,0.0614,0.0192,0.1384,0.9866,0.0397,0.0167
6,0.0617,0.0204,0.1428,0.9859,0.0414,0.0172
7,0.061,0.0201,0.1418,0.9858,0.0416,0.0171
8,0.0628,0.0218,0.1476,0.9849,0.0424,0.0176
9,0.0626,0.0223,0.1492,0.9844,0.0438,0.0179


Wall time: 1h 13min 12s


In [58]:
%%time
# Blend the three tuned regressors into a single ensemble.
# NOTE(review): the blend's CV table (MAPE ~0.024) is worse than the table
# printed by the tuning cell (MAPE ~0.017, presumably the last tuned model) —
# consider checking whether a single tuned model outperforms the blend.
blend_model = blend_models(estimator_list=[tuned_cat, tuned_lgbm, tuned_xgb])

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.08,0.0328,0.181,0.9773,0.0542,0.0241
1,0.0792,0.032,0.179,0.9773,0.0539,0.0239
2,0.0802,0.0333,0.1825,0.9769,0.0552,0.0246
3,0.0804,0.0326,0.1807,0.9774,0.0534,0.0238
4,0.08,0.034,0.1844,0.9763,0.0553,0.0245
5,0.0798,0.0328,0.1811,0.977,0.054,0.024
6,0.0807,0.0358,0.1891,0.9752,0.0567,0.025
7,0.0794,0.0338,0.1838,0.9762,0.0552,0.0242
8,0.0824,0.0369,0.192,0.9744,0.0569,0.0253
9,0.0811,0.0372,0.1928,0.9739,0.0578,0.0254


Wall time: 14min 51s


In [59]:
%%time
# Finalize the blended gas-supply model for use on the 2019 data.
gas_prediction_model = finalize_model(blend_model)

Wall time: 16min 50s


In [60]:
# Persist the finalized gas-supply model to disk; the echoed pipeline below
# reports target='log_공급량', matching the setup above.
save_model(gas_prediction_model, 'gas_prediction_model02')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True,
                                       features_todrop=['year', '공급량'],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=[], target='log_공급량',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 nume...
                                                            max_depth=11,
                                                            min_child_weight=4,
                                                            missing=nan,
                                              

In [62]:
# Predict the 2019 gas supply; 'Label' holds predictions on the log1p scale
# because the model was trained on 'log_공급량'.
log_gas_pred = predict_model(gas_prediction_model, data=test2019)
# Notebook display of the first predictions.
log_gas_pred.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,습도,기압,기온,Label
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1,64.463024,1010.0,-0.74253,7.539387
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1,65.66405,1009.400024,-1.873191,7.426608
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1,65.659699,1009.200012,-2.116468,7.354074
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1,66.32069,1008.200012,-0.642952,7.33664
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1,67.903796,1007.297321,1.665344,7.409057


### 제출 파일 만들기

In [63]:
# Load the submission template (key column '일자|시간|구분' plus a '공급량'
# column shown as 0 in the preview).
sub = pd.read_csv('../data/sample_submission.csv')
# Notebook display of the first rows.
sub.head()

Unnamed: 0,일자|시간|구분,공급량
0,2019-01-01 01 A,0
1,2019-01-01 02 A,0
2,2019-01-01 03 A,0
3,2019-01-01 04 A,0
4,2019-01-01 05 A,0


In [64]:
# Undo the log1p transform applied to the training target to get supply in
# its original scale.
pred = np.expm1(log_gas_pred['Label'])

In [65]:
# Write the predictions into the template (pandas aligns the Series on index).
# NOTE(review): assumes sub and test2019 list the keys in the same row order —
# verify against the '일자|시간|구분' column.
sub['공급량'] = pred
# Notebook display of the first rows.
sub.head()

Unnamed: 0,일자|시간|구분,공급량
0,2019-01-01 01 A,1879.677671
1,2019-01-01 02 A,1679.098661
2,2019-01-01 03 A,1561.549214
3,2019-01-01 04 A,1534.543857
4,2019-01-01 05 A,1649.868094


In [66]:
# Sanity check: row count, dtypes and non-null counts of the submission frame.
sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15120 entries, 0 to 15119
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   일자|시간|구분  15120 non-null  object 
 1   공급량       15120 non-null  float64
dtypes: float64(1), object(1)
memory usage: 236.4+ KB


In [67]:
# Write the submission file without the row index column.
sub.to_csv("sub14_pycaret02.csv", index=False)