# pycaret을 이용한 가스공급량 예측
* 2019년 test 데이터 : '../data/test.csv'
* 2019년 제출 파일 : '../data/sample_submission.csv'
* 2013-2018년 가스공급량과 기온 자료 : '../2013-2018년_가스공급량_기온_습도_기압01.csv'
* 특성
    * 'month', 'weekday', '시간'으로 기온 예측
    * 기온, 기압, 습도 순서로 예측
    * 각 특성 예측 마다 앞에서 예측한 특성 활용
    * 공급량 예측(7개) : 'month', 'weekday', '시간', '구분', '기온', '기압', '습도'
* 공급량 log적용
* 2개 모델('lightgbm', 'catboost') 예측 평균
* 튜닝X, finalize_model X
* 구분별 훈련/예측

In [1]:
# 제출 파일명 : sub24_pycaret12.csv
# 제출 점수 : 0.1032155541(2021-12-02 10:57:53)

## 데이터 가져오기
* 2019년 test 데이터 : '../data/test.csv'
* 2019년 제출 파일 : '../data/sample_submission.csv'
* 2013-2018년 가스공급량과 기온 자료 : '../2013-2018년_가스공급량_기온_습도_기압01.csv'

In [2]:
from pycaret.regression import *

In [26]:
import time
import datetime
import pandas as pd
import numpy as np
from tqdm import tqdm

### 2013-2018년 가스공급량과 기온 자료

In [4]:
total = pd.read_csv('../2013-2018년_가스공급량_기온_습도_기압01.csv')
total.head()

Unnamed: 0,year,month,day,weekday,시간,구분,공급량,기온,습도,기압
0,2013,1,1,1,1,0,2497.129,-8.5,57.0,1010.0
1,2013,1,1,1,1,1,2169.093,-8.5,57.0,1010.0
2,2013,1,1,1,1,2,226.178,-8.5,57.0,1010.0
3,2013,1,1,1,1,3,1434.516,-8.5,57.0,1010.0
4,2013,1,1,1,1,4,3272.837,-8.5,57.0,1010.0


### 2019년 데이터

In [5]:
test2019 = pd.read_csv('../data/test.csv')
test2019.head()

Unnamed: 0,일자|시간|구분
0,2019-01-01 01 A
1,2019-01-01 02 A
2,2019-01-01 03 A
3,2019-01-01 04 A
4,2019-01-01 05 A


In [6]:
test2019[['연월일', '시간', '구분']] = test2019['일자|시간|구분'].str.split(' ').tolist()
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분
0,2019-01-01 01 A,2019-01-01,1,A
1,2019-01-01 02 A,2019-01-01,2,A
2,2019-01-01 03 A,2019-01-01,3,A
3,2019-01-01 04 A,2019-01-01,4,A
4,2019-01-01 05 A,2019-01-01,5,A


In [7]:
# 구분 열을 숫자로 바꾸기
d_map = {}
for i, d in enumerate(test2019['구분'].unique()):
    d_map[d] = i
test2019['구분'] = test2019['구분'].map(d_map)
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분
0,2019-01-01 01 A,2019-01-01,1,0
1,2019-01-01 02 A,2019-01-01,2,0
2,2019-01-01 03 A,2019-01-01,3,0
3,2019-01-01 04 A,2019-01-01,4,0
4,2019-01-01 05 A,2019-01-01,5,0


In [8]:
# 연월일 열은 object형이므로 년, 월, 일로 나눈다.
test2019['연월일'] = pd.to_datetime(test2019['연월일'])
test2019['year'] = test2019['연월일'].dt.year
test2019['month'] = test2019['연월일'].dt.month
test2019['day'] = test2019['연월일'].dt.day
test2019['weekday'] = test2019['연월일'].dt.weekday
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1


## 2019년 기온 훈련/예측

In [9]:
total.head()

Unnamed: 0,year,month,day,weekday,시간,구분,공급량,기온,습도,기압
0,2013,1,1,1,1,0,2497.129,-8.5,57.0,1010.0
1,2013,1,1,1,1,1,2169.093,-8.5,57.0,1010.0
2,2013,1,1,1,1,2,226.178,-8.5,57.0,1010.0
3,2013,1,1,1,1,3,1434.516,-8.5,57.0,1010.0
4,2013,1,1,1,1,4,3272.837,-8.5,57.0,1010.0


In [10]:
start = time.time()

# 학습 특성 : 'month', 'weekday', '시간'
# 타겟 특성 : '기온'
exp = setup(total, target='기온',
 ignore_features=['공급량', 'year', 'day', '구분', '습도', '기압'],
 silent=True, fold=7, fold_shuffle=True, verbose=False)

# 모델 생성
lgbm = create_model('lightgbm')
cat = create_model('catboost')

# 기온 예측
df = test2019[['month', 'weekday', '시간']]
temp2019_pred_lgbm = predict_model(lgbm, data=df)
temp2019_pred_cat = predict_model(cat, data=df)

# 두 모델 기온 예측 평균
temp2019_pred = (temp2019_pred_lgbm['Label'] + temp2019_pred_cat['Label']) / 2

# 2019년 예측 기온 열 추가
test2019['기온'] = temp2019_pred

end = time.time()
sec = end - start
result = datetime.timedelta(seconds=sec)
print(str(result).split('.')[0])

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,2.9885,14.9009,3.8602,0.882,0.5125,0.7864
1,2.9614,14.5989,3.8208,0.8841,0.5111,0.7773
2,3.0047,15.0197,3.8755,0.8817,0.5226,0.8001
3,2.9862,14.84,3.8523,0.8844,0.5186,0.7923
4,2.9716,14.7278,3.8377,0.8838,0.5203,0.8138
5,3.0202,15.0808,3.8834,0.8816,0.5228,0.7931
6,2.9773,14.7567,3.8414,0.8837,0.512,0.7931
Mean,2.9871,14.8464,3.853,0.8831,0.5171,0.7937
SD,0.0185,0.1564,0.0203,0.0011,0.0047,0.0105


0:03:00


In [11]:
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,기온
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1,-3.818043
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1,-3.997455
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1,-4.814473
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1,-4.993942
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1,-5.254072


## 2019년 기압 훈련/예측

In [12]:
total.head()

Unnamed: 0,year,month,day,weekday,시간,구분,공급량,기온,습도,기압
0,2013,1,1,1,1,0,2497.129,-8.5,57.0,1010.0
1,2013,1,1,1,1,1,2169.093,-8.5,57.0,1010.0
2,2013,1,1,1,1,2,226.178,-8.5,57.0,1010.0
3,2013,1,1,1,1,3,1434.516,-8.5,57.0,1010.0
4,2013,1,1,1,1,4,3272.837,-8.5,57.0,1010.0


In [13]:
start = time.time()

# 학습 특성 : 'month', 'weekday', '시간', '기온'
# 타겟 특성 : '기압'
exp = setup(total, target='기압', ignore_features=['공급량', 'year', 'day', '습도', '구분'], 
            silent=True, fold=7, fold_shuffle=True, verbose=False)

# 모델 생성
lgbm = create_model('lightgbm')
cat = create_model('catboost')

# 기압 예측
df = test2019[['month', 'weekday', '시간', '기온']]
pressure2019_pred_lgbm = predict_model(lgbm, data=df)
pressure2019_pred_cat = predict_model(cat, data=df)

# 두 모델 기압 예측 평균
pressure2019_pred = (pressure2019_pred_lgbm['Label'] + pressure2019_pred_cat['Label']) / 2

# 2019년 예측 기압 열 추가
test2019['기압'] = pressure2019_pred

end = time.time()
sec = end - start
result = datetime.timedelta(seconds=sec)
print(str(result).split('.')[0])

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,3.2362,17.5122,4.1848,0.731,0.0042,0.0032
1,3.25,17.6952,4.2066,0.7277,0.0042,0.0032
2,3.2246,17.3028,4.1597,0.7321,0.0041,0.0032
3,3.2567,17.9577,4.2376,0.7255,0.0042,0.0032
4,3.2614,17.7423,4.2122,0.7275,0.0042,0.0032
5,3.282,18.0046,4.2432,0.725,0.0042,0.0033
6,3.2322,17.4716,4.1799,0.7314,0.0042,0.0032
Mean,3.249,17.6695,4.2034,0.7286,0.0042,0.0032
SD,0.0183,0.2389,0.0284,0.0027,0.0,0.0


0:02:48


In [14]:
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,기온,기압
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1,-3.818043,1014.955863
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1,-3.997455,1015.052326
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1,-4.814473,1015.342391
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1,-4.993942,1015.248838
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1,-5.254072,1015.170581


## 2019년 습도(Humidity) 훈련/예측

In [15]:
total.head()

Unnamed: 0,year,month,day,weekday,시간,구분,공급량,기온,습도,기압
0,2013,1,1,1,1,0,2497.129,-8.5,57.0,1010.0
1,2013,1,1,1,1,1,2169.093,-8.5,57.0,1010.0
2,2013,1,1,1,1,2,226.178,-8.5,57.0,1010.0
3,2013,1,1,1,1,3,1434.516,-8.5,57.0,1010.0
4,2013,1,1,1,1,4,3272.837,-8.5,57.0,1010.0


In [16]:
start = time.time()

# 학습 특성 : 'month', 'weekday', '시간', '기온', '기압'
# 타겟 특성 : '습도'
exp = setup(total, target='습도', ignore_features=['공급량', 'year', 'day', '구분'],
 fold=7, fold_shuffle=True, silent=True, verbose=False)

# 모델 생성
lgbm = create_model('lightgbm')
cat = create_model('catboost')

# 습도 예측
df = test2019[['month', 'weekday', '시간', '기온', '기압']]
humidity2019_pred_lgbm = predict_model(lgbm, data=df)
humidity2019_pred_cat = predict_model(cat, data=df)

# 두 모델 습도 예측 평균
humidity2019_pred = (humidity2019_pred_lgbm['Label'] + humidity2019_pred_cat['Label']) / 2

# 2019년 예측 습도 열 추가
test2019['습도'] = humidity2019_pred

end = time.time()
sec = end - start
result = datetime.timedelta(seconds=sec)
print(str(result).split('.')[0])

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,9.1964,138.9603,11.7881,0.6514,0.2284,0.1881
1,9.1516,138.4298,11.7656,0.6528,0.2293,0.1883
2,9.166,138.9538,11.7879,0.6503,0.229,0.1877
3,9.1761,139.4257,11.8079,0.6481,0.229,0.1877
4,9.1562,138.1792,11.755,0.6496,0.2279,0.1869
5,9.3292,142.817,11.9506,0.6405,0.2318,0.1911
6,9.1972,139.3827,11.806,0.6474,0.2291,0.1885
Mean,9.1961,139.4498,11.8087,0.6486,0.2292,0.1883
SD,0.0568,1.4384,0.0606,0.0037,0.0011,0.0012


0:02:41


In [17]:
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,기온,기압,습도
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1,-3.818043,1014.955863,58.921387
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1,-3.997455,1015.052326,59.321405
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1,-4.814473,1015.342391,60.090852
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1,-4.993942,1015.248838,62.056605
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1,-5.254072,1015.170581,62.604837


## 2019년 가스 공급량 훈련/예측
* 구분별로 예측하기(구분 7개)

### log_공급량으로 진행

In [19]:
total['log_공급량'] = np.log1p(total['공급량'])
total.head()

Unnamed: 0,year,month,day,weekday,시간,구분,공급량,기온,습도,기압,log_공급량
0,2013,1,1,1,1,0,2497.129,-8.5,57.0,1010.0,7.823297
1,2013,1,1,1,1,1,2169.093,-8.5,57.0,1010.0,7.682525
2,2013,1,1,1,1,2,226.178,-8.5,57.0,1010.0,5.425734
3,2013,1,1,1,1,3,1434.516,-8.5,57.0,1010.0,7.26928
4,2013,1,1,1,1,4,3272.837,-8.5,57.0,1010.0,8.093718


In [20]:
test2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15120 entries, 0 to 15119
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   일자|시간|구분  15120 non-null  object        
 1   연월일       15120 non-null  datetime64[ns]
 2   시간        15120 non-null  object        
 3   구분        15120 non-null  int64         
 4   year      15120 non-null  int64         
 5   month     15120 non-null  int64         
 6   day       15120 non-null  int64         
 7   weekday   15120 non-null  int64         
 8   기온        15120 non-null  float64       
 9   기압        15120 non-null  float64       
 10  습도        15120 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int64(5), object(2)
memory usage: 1.3+ MB


In [27]:
test2019['구분'].unique()

array([0, 1, 2, 3, 4, 5, 6], dtype=int64)

In [23]:
test2019.head(30)

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,기온,기압,습도
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1,-3.818043,1014.955863,58.921387
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1,-3.997455,1015.052326,59.321405
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1,-4.814473,1015.342391,60.090852
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1,-4.993942,1015.248838,62.056605
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1,-5.254072,1015.170581,62.604837
5,2019-01-01 06 A,2019-01-01,6,0,2019,1,1,1,-5.317249,1015.213498,63.175403
6,2019-01-01 07 A,2019-01-01,7,0,2019,1,1,1,-5.348648,1015.252051,63.48375
7,2019-01-01 08 A,2019-01-01,8,0,2019,1,1,1,-5.551069,1015.161462,63.049614
8,2019-01-01 09 A,2019-01-01,9,0,2019,1,1,1,-5.187105,1015.845699,58.303817
9,2019-01-01 10 A,2019-01-01,10,0,2019,1,1,1,-3.643272,1015.997635,50.544167


In [24]:
test2019[test2019['구분'] == 0]

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,기온,기압,습도
0,2019-01-01 01 A,2019-01-01,01,0,2019,1,1,1,-3.818043,1014.955863,58.921387
1,2019-01-01 02 A,2019-01-01,02,0,2019,1,1,1,-3.997455,1015.052326,59.321405
2,2019-01-01 03 A,2019-01-01,03,0,2019,1,1,1,-4.814473,1015.342391,60.090852
3,2019-01-01 04 A,2019-01-01,04,0,2019,1,1,1,-4.993942,1015.248838,62.056605
4,2019-01-01 05 A,2019-01-01,05,0,2019,1,1,1,-5.254072,1015.170581,62.604837
...,...,...,...,...,...,...,...,...,...,...,...
2155,2019-03-31 20 A,2019-03-31,20,0,2019,3,31,6,7.385245,1009.223597,51.768945
2156,2019-03-31 21 A,2019-03-31,21,0,2019,3,31,6,6.731474,1009.409736,54.040092
2157,2019-03-31 22 A,2019-03-31,22,0,2019,3,31,6,6.167349,1009.537493,55.721268
2158,2019-03-31 23 A,2019-03-31,23,0,2019,3,31,6,5.577864,1009.474469,58.445519


In [25]:
test2019[test2019['구분'] == 1]

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,기온,기압,습도
2160,2019-01-01 01 B,2019-01-01,01,1,2019,1,1,1,-3.818043,1014.955863,58.921387
2161,2019-01-01 02 B,2019-01-01,02,1,2019,1,1,1,-3.997455,1015.052326,59.321405
2162,2019-01-01 03 B,2019-01-01,03,1,2019,1,1,1,-4.814473,1015.342391,60.090852
2163,2019-01-01 04 B,2019-01-01,04,1,2019,1,1,1,-4.993942,1015.248838,62.056605
2164,2019-01-01 05 B,2019-01-01,05,1,2019,1,1,1,-5.254072,1015.170581,62.604837
...,...,...,...,...,...,...,...,...,...,...,...
4315,2019-03-31 20 B,2019-03-31,20,1,2019,3,31,6,7.385245,1009.223597,51.768945
4316,2019-03-31 21 B,2019-03-31,21,1,2019,3,31,6,6.731474,1009.409736,54.040092
4317,2019-03-31 22 B,2019-03-31,22,1,2019,3,31,6,6.167349,1009.537493,55.721268
4318,2019-03-31 23 B,2019-03-31,23,1,2019,3,31,6,5.577864,1009.474469,58.445519


In [29]:
total[total['구분'] == 1].reset_index(drop=True)

Unnamed: 0,year,month,day,weekday,시간,구분,공급량,기온,습도,기압,log_공급량
0,2013,1,1,1,1,1,2169.093,-8.5,57.0,1010.0,7.682525
1,2013,1,1,1,2,1,2051.733,-8.4,60.0,1009.4,7.626927
2,2013,1,1,1,3,1,1923.253,-8.1,58.0,1009.2,7.562293
3,2013,1,1,1,4,1,1927.253,-8.2,58.0,1008.2,7.564370
4,2013,1,1,1,5,1,2047.333,-8.2,61.0,1007.3,7.624782
...,...,...,...,...,...,...,...,...,...,...,...
52579,2018,12,31,0,20,1,2668.688,-3.7,39.0,1024.9,7.889717
52580,2018,12,31,0,21,1,2606.681,-4.6,44.0,1024.8,7.866217
52581,2018,12,31,0,22,1,2540.169,-5.4,46.0,1024.4,7.840379
52582,2018,12,31,0,23,1,2407.945,-5.2,47.0,1024.6,7.786944


## 구분에 따라 log_공급량 훈련/예측

In [30]:
start = time.time()

# 구분에 따라 예측하기
divisions = test2019['구분'].unique()

gas2019_pred = pd.DataFrame()
for division in tqdm(divisions):
    # 구분별 훈련, 테스트 셋 설정
    temp_train = total[total['구분'] == division].reset_index(drop=True)
    temp_test = test2019[test2019['구분'] == division].reset_index(drop=True)

    # 학습 특성 : 'month', 'weekday', '시간', '구분', '기온', '기압', '습도'
    exp = setup(temp_train, target='log_공급량', ignore_features=['year', '공급량', 'day'],
                silent=True, fold=7, fold_shuffle=True, verbose=False)

    # 모델 생성
    lgbm = create_model('lightgbm')
    cat = create_model('catboost')

    # log_공급량 예측
    pop_test = temp_test[['month', 'weekday', '시간', '구분', '기온', '기압', '습도']]
    log_gas2019_pred_lgbm = predict_model(lgbm, data=pop_test)
    log_gas2019_pred_cat = predict_model(cat, data=pop_test)

    # log_공급량 값을 공급량 값으로 변환
    gas_pred2019_lgbm = np.expm1(log_gas2019_pred_lgbm['Label'])
    gas_pred2019_cat = np.expm1(log_gas2019_pred_cat['Label'])

    # 두 모델 공급량 예측 평균
    gas2019_division_pred = (gas_pred2019_lgbm + gas_pred2019_cat) / 2

    # 구분별 예측 값 합치기
    gas2019_pred = pd.concat([gas2019_pred, gas2019_division_pred])
    gas2019_pred = gas2019_pred.reset_index(drop=True)

end = time.time()
sec = end - start
result = datetime.timedelta(seconds=sec)
print(str(result).split('.')[0])

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.0721,0.0104,0.1018,0.9831,0.0179,0.0146
1,0.0707,0.0096,0.0977,0.9845,0.0173,0.0144
2,0.0742,0.0164,0.1282,0.973,0.0233,0.0153
3,0.0718,0.0099,0.0997,0.9837,0.0176,0.0146
4,0.073,0.0111,0.1055,0.9816,0.0186,0.0148
5,0.0696,0.0097,0.0984,0.9842,0.0174,0.0141
6,0.0699,0.0109,0.1046,0.9827,0.0181,0.0142
Mean,0.0716,0.0112,0.1051,0.9818,0.0186,0.0146
SD,0.0015,0.0022,0.0098,0.0037,0.002,0.0004


100%|██████████| 7/7 [04:57<00:00, 42.57s/it]

0:04:57





In [31]:
# 예측 결과 확인
gas2019_pred.head()

Unnamed: 0,0
0,2066.196096
1,1830.912959
2,1757.392366
3,1771.582285
4,1992.175751


## 제출 파일 만들기

In [32]:
sub = pd.read_csv('../data/sample_submission.csv')
sub.head()

Unnamed: 0,일자|시간|구분,공급량
0,2019-01-01 01 A,0
1,2019-01-01 02 A,0
2,2019-01-01 03 A,0
3,2019-01-01 04 A,0
4,2019-01-01 05 A,0


In [33]:
sub.tail()

Unnamed: 0,일자|시간|구분,공급량
15115,2019-03-31 20 H,0
15116,2019-03-31 21 H,0
15117,2019-03-31 22 H,0
15118,2019-03-31 23 H,0
15119,2019-03-31 24 H,0


In [34]:
sub['공급량'] = gas2019_pred
sub.head()

Unnamed: 0,일자|시간|구분,공급량
0,2019-01-01 01 A,2066.196096
1,2019-01-01 02 A,1830.912959
2,2019-01-01 03 A,1757.392366
3,2019-01-01 04 A,1771.582285
4,2019-01-01 05 A,1992.175751


In [35]:
sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15120 entries, 0 to 15119
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   일자|시간|구분  15120 non-null  object 
 1   공급량       15120 non-null  float64
dtypes: float64(1), object(1)
memory usage: 236.4+ KB


In [36]:
sub.to_csv("sub24_pycaret12.csv", index=False)