# pycaret을 이용한 가스공급량 예측
* 2019년 test 데이터 : '../data/test.csv'
* 2019년 제출 파일 : '../data/sample_submission.csv'
* 2013-2018년 가스공급량과 기온 자료 : '../2013-2018년_가스공급량_기온_습도_기압01.csv'
* 사용 특성 : 'month', 'day', 'weekday', '시간', '구분', '기온', '습도', '기압'
* 3개 모델

In [None]:
# 제출 점수 : 

## 데이터 가져오기
* 2019년 test 데이터 : '../data/test.csv'
* 2019년 제출 파일 : '../data/sample_submission.csv'
* 2013-2018년 가스공급량과 기온 자료 : '../2013-2018년_가스공급량_기온_습도_기압01.csv'

In [1]:
import pandas as pd

### 2013-2018년 가스공급량과 기온 자료

In [2]:
# Load the 2013-2018 training table (gas supply plus weather columns) and preview the first rows.
total = pd.read_csv('../2013-2018년_가스공급량_기온_습도_기압01.csv')
total.head()

Unnamed: 0,year,month,day,weekday,시간,구분,공급량,기온,습도,기압
0,2013,1,1,1,1,0,2497.129,-8.5,57.0,1010.0
1,2013,1,1,1,1,1,2169.093,-8.5,57.0,1010.0
2,2013,1,1,1,1,2,226.178,-8.5,57.0,1010.0
3,2013,1,1,1,1,3,1434.516,-8.5,57.0,1010.0
4,2013,1,1,1,1,4,3272.837,-8.5,57.0,1010.0


#### 2019년 데이터

In [3]:
# Load the 2019 test rows; each row carries one combined '일자|시간|구분' key string.
test2019 = pd.read_csv('../data/test.csv')
test2019.head()

Unnamed: 0,일자|시간|구분
0,2019-01-01 01 A
1,2019-01-01 02 A
2,2019-01-01 03 A
3,2019-01-01 04 A
4,2019-01-01 05 A


In [4]:
# Split the combined key on single spaces into date ('연월일'), hour ('시간') and category ('구분') columns.
# NOTE(review): str.split yields strings, so '시간' is presumably still text at this point — confirm the dtype.
test2019[['연월일', '시간', '구분']] = test2019['일자|시간|구분'].str.split(' ').tolist()
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분
0,2019-01-01 01 A,2019-01-01,1,A
1,2019-01-01 02 A,2019-01-01,2,A
2,2019-01-01 03 A,2019-01-01,3,A
3,2019-01-01 04 A,2019-01-01,4,A
4,2019-01-01 05 A,2019-01-01,5,A


In [5]:
# Convert the '구분' column to integers: each distinct label gets a code in order of first appearance.
# NOTE(review): the codes depend on row order in the test file — confirm they match the encoding in the training table.
d_map = {label: code for code, label in enumerate(test2019['구분'].unique())}
test2019['구분'] = test2019['구분'].map(d_map)
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분
0,2019-01-01 01 A,2019-01-01,1,0
1,2019-01-01 02 A,2019-01-01,2,0
2,2019-01-01 03 A,2019-01-01,3,0
3,2019-01-01 04 A,2019-01-01,4,0
4,2019-01-01 05 A,2019-01-01,5,0


In [6]:
# The '연월일' column is object-typed, so parse it to datetime and derive the calendar parts from it.
test2019['연월일'] = pd.to_datetime(test2019['연월일'])
for part in ('year', 'month', 'day', 'weekday'):
    # Each name is an attribute of the .dt accessor; columns are added in this order.
    test2019[part] = getattr(test2019['연월일'].dt, part)
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1


## 2019년 기온 예측하기
* 습도 예측
* 기압 예측
* 기온 예측 (예측한 습도·기압을 특성으로 사용)

In [7]:
from pycaret.regression import *

#### 습도(Humidity) 예측

In [8]:
# Training features: 'month', 'day', 'weekday', '시간', '구분'
# Target: '습도' (humidity)

# Load the previously saved humidity pipeline/model instead of retraining it here.
humidity_prediction_model = load_model('humidity_prediction_model_01')

Transformation Pipeline and Model Successfully Loaded


In [15]:
%%time
# Predict 2019 humidity from the calendar/category columns only.
df_for_temp = test2019[['month', 'day', 'weekday', '시간', '구분']]
humidity2019_pred = predict_model(humidity_prediction_model, data=df_for_temp)

Wall time: 2.69 s


In [16]:
# Preview the humidity predictions; the result of predict_model is stored in
# `humidity2019_pred` (there is no `humidity_pred` variable in this notebook).
humidity2019_pred.head()

Unnamed: 0,month,day,weekday,시간,구분,Label
0,1,1,1,1,0,64.463024
1,1,1,1,2,0,65.66405
2,1,1,1,3,0,65.659699
3,1,1,1,4,0,66.32069
4,1,1,1,5,0,67.903796


In [17]:
# Check the test frame before the humidity predictions are attached.
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1


In [18]:
# Store the predicted humidity (the 'Label' column of the prediction output) as the '습도' feature.
test2019['습도'] = humidity2019_pred['Label']
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,습도
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1,64.463024
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1,65.66405
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1,65.659699
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1,66.32069
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1,67.903796


#### 기압 예측

In [23]:
# Training features: 'month', 'day', 'weekday', '시간', '구분'
# Target: '기압' (pressure)

# Gas supply, year and the other weather columns are excluded from the features.
exp = setup(total, target='기압', ignore_features=['공급량', 'year', '기온', '습도'])

Unnamed: 0,Description,Value
0,session_id,4853
1,Target,기압
2,Original Data,"(368088, 10)"
3,Missing Values,False
4,Numeric Features,2
5,Categorical Features,3
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(257661, 28)"


In [25]:
%%time
# Compare candidate regressors ranked by MAPE and keep the top three.
pressure_model3 = compare_models(sort='MAPE', n_select=3)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
dt,Decision Tree Regressor,0.0059,0.0189,0.1298,0.9997,0.0001,0.0,0.83
rf,Random Forest Regressor,0.045,0.0096,0.098,0.9999,0.0001,0.0,51.215
et,Extra Trees Regressor,0.0787,0.025,0.158,0.9996,0.0002,0.0001,64.95
xgboost,Extreme Gradient Boosting,2.1009,7.3405,2.7093,0.8873,0.0027,0.0021,13.754
catboost,CatBoost Regressor,2.3363,8.7558,2.959,0.8656,0.0029,0.0023,21.093
lightgbm,Light Gradient Boosting Machine,2.9767,14.537,3.8127,0.7769,0.0038,0.003,1.032
knn,K Neighbors Regressor,3.2381,18.0498,4.2485,0.723,0.0042,0.0032,6.72
gbr,Gradient Boosting Regressor,3.5052,20.5728,4.5357,0.6843,0.0045,0.0035,19.546
lr,Linear Regression,3.7266,23.0261,4.7985,0.6466,0.0048,0.0037,0.761
ridge,Ridge Regression,3.7261,23.0018,4.796,0.647,0.0048,0.0037,0.097


Wall time: 35min 34s


In [26]:
%%time
# Blend the three selected pressure models into a single ensemble.
blended_pressure_model3 = blend_models(estimator_list=pressure_model3)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.0379,0.0052,0.0721,0.9999,0.0001,0.0
1,0.04,0.0091,0.0953,0.9999,0.0001,0.0
2,0.0381,0.0054,0.0735,0.9999,0.0001,0.0
3,0.0384,0.0096,0.0978,0.9999,0.0001,0.0
4,0.0402,0.0079,0.0888,0.9999,0.0001,0.0
5,0.0391,0.0074,0.0862,0.9999,0.0001,0.0
6,0.0393,0.006,0.0773,0.9999,0.0001,0.0
7,0.0403,0.0091,0.0956,0.9999,0.0001,0.0
8,0.0397,0.0073,0.0852,0.9999,0.0001,0.0
9,0.0398,0.0069,0.0831,0.9999,0.0001,0.0


In [27]:
%%time
# Finalize the blended pressure model before using it for prediction.
final_pressure_model3 = finalize_model(blended_pressure_model3)

In [28]:
# Predict 2019 pressure; the whole test frame is passed in, including non-feature columns.
pressure_pred = predict_model(final_pressure_model3, data=test2019)

In [29]:
# Preview the pressure predictions (the 'Label' column).
pressure_pred.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,습도,Label
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1,64.463024,1010.0
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1,65.66405,1009.401358
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1,65.659699,1009.200012
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1,66.32069,1008.198012
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1,67.903796,1007.294988


In [30]:
# Store the predicted pressure (the 'Label' column of the prediction output) as the '기압' feature.
# NOTE(review): the recorded output of this cell shows '기압' identical to '습도', which does not
# match pressure_pred['Label'] — re-run this cell and the downstream cells to confirm the values.
test2019['기압'] = pressure_pred['Label']
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,습도,기압
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1,64.463024,64.463024
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1,65.66405,65.66405
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1,65.659699,65.659699
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1,66.32069,66.32069
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1,67.903796,67.903796


#### 기온 예측

In [33]:
# Training features: 'month', 'day', 'weekday', '시간', '구분', '습도', '기압'
# Target: '기온' (temperature)

# Only the gas supply and the year are excluded; humidity and pressure stay in as features.
exp = setup(total, target='기온', ignore_features=['공급량', 'year'])

Unnamed: 0,Description,Value
0,session_id,2138
1,Target,기온
2,Original Data,"(368088, 10)"
3,Missing Values,False
4,Numeric Features,4
5,Categorical Features,3
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(257661, 30)"


In [35]:
%%time
# Models chosen with reference to notebook 07-05_결과_출력(pycaret)_ver0.1(2021.11.15);
# decision-tree models were excluded because they were judged to be overfitting.
temp_models3 = compare_models(sort='MAPE',n_select=3, include=['knn','catboost','lightgbm'])

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
knn,K Neighbors Regressor,0.7257,2.9182,1.7082,0.977,0.223,0.2228,3.335
catboost,CatBoost Regressor,1.4397,3.5307,1.879,0.9722,0.3083,0.4001,19.303
lightgbm,Light Gradient Boosting Machine,2.0151,6.8078,2.6091,0.9463,0.399,0.5619,1.145


Wall time: 4min 30s


In [36]:
%%time
# Tune each selected temperature model in turn; tune_model is called with no extra arguments.
tuned_temp_models3 = [tune_model(model) for model in temp_models3]

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.8661,1.3074,1.1434,0.9897,0.1878,0.2277
1,0.8845,1.3593,1.1659,0.9892,0.1966,0.2262
2,0.8862,1.3699,1.1704,0.9891,0.197,0.2384
3,0.8745,1.3308,1.1536,0.9896,0.1964,0.2286
4,0.8845,1.377,1.1735,0.9891,0.2029,0.2416
5,0.8873,1.3486,1.1613,0.9894,0.1959,0.2378
6,0.9056,1.4257,1.194,0.9888,0.204,0.2589
7,0.8994,1.4163,1.1901,0.9889,0.2024,0.241
8,0.8763,1.3492,1.1616,0.9894,0.1994,0.2296
9,0.8819,1.3635,1.1677,0.9893,0.2006,0.2349


Wall time: 18min 18s


In [37]:
%%time
# Blend the tuned temperature models into a single ensemble.
blend_models3 = blend_models(estimator_list=tuned_temp_models3)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.7178,0.892,0.9445,0.993,0.1653,0.192
1,0.7325,0.9224,0.9604,0.9926,0.1711,0.1935
2,0.7367,0.9256,0.9621,0.9927,0.1714,0.2026
3,0.7318,0.9182,0.9582,0.9928,0.1757,0.201
4,0.7324,0.9275,0.9631,0.9926,0.1744,0.2007
5,0.7397,0.9236,0.961,0.9927,0.1692,0.199
6,0.7372,0.9357,0.9673,0.9927,0.173,0.2096
7,0.7359,0.9321,0.9654,0.9927,0.1707,0.1946
8,0.728,0.9109,0.9544,0.9928,0.1714,0.1957
9,0.7408,0.9475,0.9734,0.9926,0.1742,0.1994


Wall time: 1min 40s


In [38]:
%%time
# Finalize the blended temperature model before saving it and predicting with it.
temp_prediction_model = finalize_model(blend_models3)

Wall time: 2min 7s


In [39]:
# Persist the finalized temperature pipeline/model under the name 'temp_prediction_model01'.
save_model(temp_prediction_model, 'temp_prediction_model01')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True,
                                       features_todrop=['공급량', 'year'],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=[], target='기온',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_s...
                                                             feature_fraction=0.9,
                                                             importance_type='split',
                                                             learning_rate=0.2,
                        

In [40]:
# Predict 2019 temperature; test2019 now also carries the predicted '습도' and '기압' columns.
temp_pred = predict_model(temp_prediction_model, data=test2019)
temp_pred.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,습도,기압,Label
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1,64.463024,64.463024,2.807896
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1,65.66405,65.66405,0.471236
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1,65.659699,65.659699,0.052972
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1,66.32069,66.32069,1.895545
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1,67.903796,67.903796,3.859672


In [41]:
# Store the predicted temperature (the 'Label' column of the prediction output) as the '기온' feature.
test2019['기온'] = temp_pred['Label']
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,습도,기압,기온
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1,64.463024,64.463024,2.807896
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1,65.66405,65.66405,0.471236
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1,65.659699,65.659699,0.052972
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1,66.32069,66.32069,1.895545
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1,67.903796,67.903796,3.859672


### 가스 공급량 예측

In [42]:
# Training features: 'month', 'day', 'weekday', '시간', '구분', '습도', '기압', '기온'
# Target: '공급량' (gas supply); only 'year' is excluded from the features.
exp = setup(total, target='공급량', ignore_features=['year'])

Unnamed: 0,Description,Value
0,session_id,3708
1,Target,공급량
2,Original Data,"(368088, 10)"
3,Missing Values,False
4,Numeric Features,5
5,Categorical Features,3
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(257661, 31)"


In [47]:
# Compare models ranked by MAPE, skipping the model ids listed in `exclude`.
# The returned best model is only displayed, not assigned to a variable.

compare_models(sort='MAPE', exclude=['lr','lasso','ridge','rf','dt','et'])

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,60.6618,9993.8675,99.6812,0.9884,0.3482,0.4804,20.572
xgboost,Extreme Gradient Boosting,64.6768,11174.4744,105.497,0.987,0.3573,0.4989,15.212
lightgbm,Light Gradient Boosting Machine,74.0938,14680.7124,121.0082,0.9829,0.3714,0.5324,1.097
gbr,Gradient Boosting Regressor,123.7098,34227.4243,184.9473,0.9602,0.4731,0.6787,21.043
huber,Huber Regressor,346.2966,232245.0631,477.0521,0.7297,0.8286,2.0986,6.908
br,Bayesian Ridge,318.044,174394.1045,417.5952,0.7971,0.8435,2.5031,0.656
lar,Least Angle Regression,318.0536,174394.1069,417.5952,0.7971,0.8436,2.5034,0.123
omp,Orthogonal Matching Pursuit,448.5083,336516.4532,580.0924,0.6085,1.1235,2.5817,0.111
en,Elastic Net,478.86,430205.8406,655.895,0.4995,0.9873,3.0931,0.205
knn,K Neighbors Regressor,553.9561,592453.225,769.7044,0.3107,1.0834,4.0398,2.73


<catboost.core.CatBoostRegressor at 0x1e6e3770340>

In [48]:
# Create the CatBoost, LightGBM and XGBoost models individually so each can be tuned afterwards.
cat = create_model('catboost')
lgbm = create_model('lightgbm')
xgb = create_model('xgboost')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,64.5142,10216.4072,101.0762,0.988,0.3447,0.4894
1,63.7018,10459.2568,102.2705,0.9877,0.3539,0.4867
2,64.1366,15332.7197,123.8254,0.9822,0.3581,0.5135
3,64.7307,10389.4834,101.9288,0.9879,0.3713,0.5318
4,65.345,10476.9932,102.3572,0.9878,0.3617,0.5352
5,65.3129,10667.7588,103.2848,0.9877,0.3657,0.5154
6,64.954,12312.0938,110.9599,0.9858,0.359,0.4829
7,65.695,10727.2451,103.5724,0.9877,0.3481,0.4641
8,64.3299,10177.998,100.8861,0.9881,0.3596,0.4967
9,64.0478,10984.7881,104.8083,0.9871,0.3505,0.4738


In [None]:
%%time
# Tune each of the three gas-supply models; tune_model is called with no extra arguments.
tuned_cat = tune_model(cat)
tuned_lgbm = tune_model(lgbm)
tuned_xgb = tune_model(xgb)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,50.2504,5698.6002,75.4891,0.9933,0.3445,0.4471
1,50.396,6037.1283,77.699,0.9929,0.338,0.4313
2,50.5092,11258.9897,106.1084,0.987,0.3469,0.4611
3,51.8535,6320.4873,79.5015,0.9926,0.3597,0.4949
4,51.8424,6177.4992,78.5971,0.9928,0.3598,0.4931
5,50.9022,6126.381,78.2712,0.9929,0.3511,0.4769
6,51.3626,8059.6151,89.7754,0.9907,0.3507,0.4419
7,50.6233,5938.4877,77.0616,0.9932,0.3492,0.4421
8,50.1936,5688.7736,75.424,0.9934,0.3483,0.4426
9,49.9755,6778.8506,82.3338,0.992,0.3319,0.4114


IntProgress(value=0, description='Processing: ', max=7)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE


Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 14.5min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 56.1min finished


In [None]:
%%time
blend_model = blend_models(estimators=[tuned_cat, tuned_lgbm, tuned_xgb])

In [None]:
%%time
# Finalize the blended gas-supply model before saving it and predicting with it.
gas_prediction_model = finalize_model(blend_model)

In [None]:
# Persist the finalized gas-supply pipeline/model under the name 'gas_prediction_model01'.
save_model(gas_prediction_model, 'gas_prediction_model01')

In [None]:
# Predict 2019 gas supply from the test frame, which now holds the predicted weather columns.
gas_pred = predict_model(gas_prediction_model, data=test2019)
gas_pred.head()

### 제출 파일 만들기

In [None]:
# Load the sample submission file to use as the template for the output.
sub = pd.read_csv('../data/sample_submission.csv')
sub.head()

In [None]:
# Fill the submission's '공급량' column with the predicted gas supply.
# NOTE(review): the assignment aligns on index, so this assumes `sub` and `gas_pred` share the same row order — confirm.
sub['공급량'] = gas_pred['Label']
sub.head()

In [None]:
# Check column dtypes and non-null counts before writing the file.
sub.info()

In [None]:
# Write the submission CSV without the index column.
sub.to_csv("sub13_pycaret01.csv", index=False)