# pycaret을 이용한 가스공급량 예측
* 2019년 test 데이터 : '../data/test.csv'
* 2019년 제출 파일 : '../data/sample_submission.csv'
* 2013-2018년 가스공급량과 기온, 습도, 기압 자료 : '../2013-2018년_가스공급량_기온_습도_기압01.csv'
* 특성
    * 'month', 'weekday', '시간'으로 기온 예측
    * 기온, 기압, 습도 순서로 예측
    * 각 특성 예측 마다 앞에서 예측한 특성 활용
    * 공급량 예측(7개) : 'month', 'weekday', '시간', '구분', '기온', '기압', '습도'
* 공급량 log적용
* 3개 모델 예측 평균
* 튜닝X, finalize_model X
* 구분별 훈련/예측
* 이상치 제거(outliers_threshold=0.1)

In [1]:
# Submission file name: sub35_pycaret23(ver2.6).csv
# Score compared against the best-scoring file (NMAE/R2): 0.02275 / 0.98309
# Private submission score: X

## 데이터 가져오기
* 2019년 test 데이터 : '../data/test.csv'
* 2019년 제출 파일 : '../data/sample_submission.csv'
* 2013-2018년 가스공급량과 기온, 습도, 기압 자료 : '../2013-2018년_가스공급량_기온_습도_기압01.csv'

In [2]:
from pycaret.regression import *

In [3]:
import time
import datetime
import pandas as pd
import numpy as np
from tqdm import tqdm

### 2013-2018년 가스공급량과 기온, 기압, 습도 자료

In [4]:
# Load the 2013-2018 history: one row per date / hour (시간) / division (구분)
# with gas supply (공급량), temperature (기온), humidity (습도) and pressure (기압).
total = pd.read_csv('../2013-2018년_가스공급량_기온_습도_기압01.csv')
# Preview the first rows.
total.head()

Unnamed: 0,연월일,year,month,day,weekday,시간,구분,공급량,기온,습도,기압
0,2013-01-01,2013,1,1,1,1,0,2497.129,-8.5,57.0,1010.0
1,2013-01-01,2013,1,1,1,1,1,2169.093,-8.5,57.0,1010.0
2,2013-01-01,2013,1,1,1,1,2,226.178,-8.5,57.0,1010.0
3,2013-01-01,2013,1,1,1,1,3,1434.516,-8.5,57.0,1010.0
4,2013-01-01,2013,1,1,1,1,4,3272.837,-8.5,57.0,1010.0


### 2019년 테스트 데이터

In [5]:
# Load the 2019 test rows; the only column is the combined "date hour division" key.
test2019 = pd.read_csv('../data/test.csv')
# Preview the first rows.
test2019.head()

Unnamed: 0,일자|시간|구분
0,2019-01-01 01 A
1,2019-01-01 02 A
2,2019-01-01 03 A
3,2019-01-01 04 A
4,2019-01-01 05 A


In [6]:
# Split the combined "date hour division" key into its three parts.
key_parts = test2019['일자|시간|구분'].str.split(' ', expand=True)
test2019['연월일'] = key_parts[0]
# The hour arrives as zero-padded text ("01", "02", ...). Store it as an integer
# so the feature has the same numeric dtype as 시간 in the training frame,
# instead of relying on downstream coercion of string values.
test2019['시간'] = key_parts[1].astype(int)
test2019['구분'] = key_parts[2]
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분
0,2019-01-01 01 A,2019-01-01,1,A
1,2019-01-01 02 A,2019-01-01,2,A
2,2019-01-01 03 A,2019-01-01,3,A
3,2019-01-01 04 A,2019-01-01,4,A
4,2019-01-01 05 A,2019-01-01,5,A


In [7]:
# 구분 열을 숫자로 바꾸기
# Encode the division letters in the 구분 column as integer codes.
# Sorting the labels makes the letter -> code mapping deterministic instead of
# depending on the order in which rows happen to appear in test.csv.
# NOTE(review): assumes the integer 구분 codes in the training file follow the
# same alphabetical order of division letters — confirm against the
# preprocessing that produced that file.
d_map = {label: code for code, label in enumerate(sorted(test2019['구분'].unique()))}
test2019['구분'] = test2019['구분'].map(d_map)
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분
0,2019-01-01 01 A,2019-01-01,1,0
1,2019-01-01 02 A,2019-01-01,2,0
2,2019-01-01 03 A,2019-01-01,3,0
3,2019-01-01 04 A,2019-01-01,4,0
4,2019-01-01 05 A,2019-01-01,5,0


In [8]:
# 연월일 열은 object형이므로 년, 월, 일로 나눈다.
# 연월일 is still an object (text) column: parse it to datetime, then derive the
# calendar features year, month, day and weekday from it.
test2019['연월일'] = pd.to_datetime(test2019['연월일'])
date_accessor = test2019['연월일'].dt
for part in ('year', 'month', 'day', 'weekday'):
    test2019[part] = getattr(date_accessor, part)
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1


## 2019년 기온 훈련/예측

In [9]:
# Start the wall-clock timer for this cell.
start = time.time()

# Features: 'month', 'weekday', '시간'
# Target  : '기온' (temperature)
target_col = ['기온']
test_col = ['month', 'weekday', '시간']
train_col = [*test_col, *target_col]

temp_test = test2019[test_col]
temp_train = total[train_col]

# Configure the PyCaret experiment (7 shuffled folds, fixed seed).
exp = setup(
    temp_train,
    target=target_col[0],
    session_id=0,
    silent=True,
    fold=7,
    fold_shuffle=True,
    verbose=False,
)

# Build the three models, in the same order as before.
lgbm, cat, xgb = (
    create_model(model_id) for model_id in ('lightgbm', 'catboost', 'xgboost')
)

# Predict the 2019 temperature with each model.
temp2019_pred_lgbm, temp2019_pred_cat, temp2019_pred_xgb = (
    predict_model(model, data=temp_test) for model in (lgbm, cat, xgb)
)

# Average the three models' predictions (the 'Label' column).
label_total = (
    temp2019_pred_lgbm['Label']
    + temp2019_pred_cat['Label']
    + temp2019_pred_xgb['Label']
)
temp2019_pred = label_total / 3

# Store the predicted temperature as a new column on the 2019 test frame.
test2019[target_col[0]] = temp2019_pred

# Report elapsed time as H:MM:SS (fractional seconds dropped).
elapsed = datetime.timedelta(seconds=time.time() - start)
print(str(elapsed).split('.')[0])
# Recorded run time: 00:06:58

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,2.9769,14.7525,3.8409,0.8829,0.5129,0.8185
1,2.9912,14.9267,3.8635,0.8817,0.5148,0.7835
2,2.9948,14.9598,3.8678,0.8826,0.5204,0.8038
3,2.99,14.941,3.8654,0.8821,0.5195,0.8199
4,3.01,15.0591,3.8806,0.8825,0.523,0.8091
5,2.9746,14.7322,3.8383,0.8844,0.5182,0.786
6,3.0095,15.0037,3.8735,0.8821,0.5211,0.7962
Mean,2.9924,14.9107,3.8614,0.8826,0.5186,0.8024
SD,0.0129,0.1141,0.0148,0.0008,0.0033,0.0135


0:06:58


In [10]:
# Preview the test frame with the newly added 기온 (temperature) column.
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,기온
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1,-3.894176
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1,-4.363952
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1,-4.476645
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1,-4.922204
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1,-5.048251


## 2019년 기압 훈련/예측

In [11]:
# Start the wall-clock timer for this cell.
start = time.time()

# Features: 'month', 'weekday', '시간', '기온' (the temperature predicted above)
# Target  : '기압' (pressure)
target_col = ['기압']
test_col = ['month', 'weekday', '시간', '기온']
train_col = [*test_col, *target_col]

temp_test = test2019[test_col]
temp_train = total[train_col]

# Configure the PyCaret experiment (7 shuffled folds, fixed seed).
exp = setup(
    temp_train,
    target=target_col[0],
    session_id=0,
    silent=True,
    fold=7,
    fold_shuffle=True,
    verbose=False,
)

# Build the three models, in the same order as before.
dt, et, rf = (create_model(model_id) for model_id in ('dt', 'et', 'rf'))

# Predict the 2019 pressure with each model.
pressure2019_pred_dt, pressure2019_pred_et, pressure2019_pred_rf = (
    predict_model(model, data=temp_test) for model in (dt, et, rf)
)

# Average the three models' predictions (the 'Label' column).
label_total = (
    pressure2019_pred_dt['Label']
    + pressure2019_pred_et['Label']
    + pressure2019_pred_rf['Label']
)
pressure2019_pred = label_total / 3

# Store the predicted pressure as a new column on the 2019 test frame.
test2019[target_col[0]] = pressure2019_pred

# Report elapsed time as H:MM:SS (fractional seconds dropped).
elapsed = datetime.timedelta(seconds=time.time() - start)
print(str(elapsed).split('.')[0])
# Recorded run time: 00:13:11

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.6738,2.5616,1.6005,0.9603,0.0016,0.0007
1,0.6777,2.627,1.6208,0.9595,0.0016,0.0007
2,0.6689,2.6507,1.6281,0.9592,0.0016,0.0007
3,0.6837,2.6617,1.6315,0.959,0.0016,0.0007
4,0.6815,2.7132,1.6472,0.9582,0.0016,0.0007
5,0.6829,2.5859,1.6081,0.9606,0.0016,0.0007
6,0.6781,2.6256,1.6204,0.9595,0.0016,0.0007
Mean,0.6781,2.6323,1.6224,0.9595,0.0016,0.0007
SD,0.0049,0.0463,0.0143,0.0008,0.0,0.0


0:13:11


In [12]:
# Preview the test frame with the newly added 기압 (pressure) column.
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,기온,기압
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1,-3.894176,1012.103393
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1,-4.363952,1015.612686
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1,-4.476645,1024.535644
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1,-4.922204,1014.696643
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1,-5.048251,1006.996327


## 2019년 습도(Humidity) 훈련/예측

In [14]:
# Start the wall-clock timer for this cell.
start = time.time()

# Features: 'month', 'weekday', '시간', '기온', '기압' (both predicted above)
# Target  : '습도' (humidity)
target_col = ['습도']
test_col = ['month', 'weekday', '시간', '기온', '기압']
train_col = [*test_col, *target_col]

temp_test = test2019[test_col]
temp_train = total[train_col]

# Configure the PyCaret experiment (7 shuffled folds, fixed seed).
exp = setup(
    temp_train,
    target=target_col[0],
    session_id=0,
    silent=True,
    fold=7,
    fold_shuffle=True,
    verbose=False,
)

# Build the three models, in the same order as before.
et, dt, rf = (create_model(model_id) for model_id in ('et', 'dt', 'rf'))

# Predict the 2019 humidity with each model.
humidity2019_pred_et, humidity2019_pred_dt, humidity2019_pred_rf = (
    predict_model(model, data=temp_test) for model in (et, dt, rf)
)

# Average the three models' predictions (the 'Label' column).
label_total = (
    humidity2019_pred_et['Label']
    + humidity2019_pred_dt['Label']
    + humidity2019_pred_rf['Label']
)
humidity2019_pred = label_total / 3

# Store the predicted humidity as a new column on the 2019 test frame.
test2019[target_col[0]] = humidity2019_pred

# Report elapsed time as H:MM:SS (fractional seconds dropped).
elapsed = datetime.timedelta(seconds=time.time() - start)
print(str(elapsed).split('.')[0])
# Recorded run time: 00:17:41

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.4784,1.6201,1.2728,0.996,0.0266,0.0098
1,0.4913,1.7666,1.3292,0.9955,0.0269,0.0099
2,0.4821,1.7994,1.3414,0.9955,0.0285,0.01
3,0.477,1.6772,1.2951,0.9958,0.0262,0.0096
4,0.4846,1.7376,1.3182,0.9956,0.0282,0.0101
5,0.4782,1.7104,1.3078,0.9957,0.0268,0.0097
6,0.4855,1.6963,1.3024,0.9957,0.0265,0.0098
Mean,0.4825,1.7154,1.3096,0.9957,0.0271,0.0099
SD,0.0047,0.0549,0.021,0.0001,0.0008,0.0002


0:17:41


In [15]:
# Preview the test frame with the newly added 습도 (humidity) column.
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,기온,기압,습도
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1,-3.894176,1012.103393,59.38
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1,-4.363952,1015.612686,61.166667
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1,-4.476645,1024.535644,54.536667
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1,-4.922204,1014.696643,60.153333
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1,-5.048251,1006.996327,61.723333


## 2019년 가스 공급량 훈련/예측

### log_공급량 열 추가

In [16]:
# Add the log-scale training target: log1p(x) = log(1 + x) of the gas supply.
# The predictions are mapped back with expm1 later in the notebook.
supply = total['공급량']
total['log_공급량'] = np.log1p(supply)
total.head()

Unnamed: 0,연월일,year,month,day,weekday,시간,구분,공급량,기온,습도,기압,log_공급량
0,2013-01-01,2013,1,1,1,1,0,2497.129,-8.5,57.0,1010.0,7.823297
1,2013-01-01,2013,1,1,1,1,1,2169.093,-8.5,57.0,1010.0,7.682525
2,2013-01-01,2013,1,1,1,1,2,226.178,-8.5,57.0,1010.0,5.425734
3,2013-01-01,2013,1,1,1,1,3,1434.516,-8.5,57.0,1010.0,7.26928
4,2013-01-01,2013,1,1,1,1,4,3272.837,-8.5,57.0,1010.0,8.093718


## 2019년 log_공급량 훈련/예측, 구분별

In [17]:
# 구분에 따라 예측하기
divisions = test2019['구분'].unique()

# 학습 특성 : 'month', 'weekday', '시간', '구분', '기온', '기압', '습도'
# 타겟 특성 : 'log_공급량'
test_col = ['month', 'weekday', '시간', '구분', '기온', '기압', '습도']
target_col = ['log_공급량']
train_col = test_col + target_col

gas2019_pred = pd.DataFrame()
for division in tqdm(divisions):
    # 구분별 훈련, 테스트 셋 설정
    temp_train = total[total['구분'] == division].reset_index(drop=True)
    temp_train = temp_train[train_col]

    temp_test = test2019[test2019['구분'] == division].reset_index(drop=True)
    temp_test = temp_test[test_col]

    # pycaret 설정
    exp = setup(temp_train, target=target_col[0], session_id=0, silent=True, remove_outliers=True, outliers_threshold=0.1, fold=7, fold_shuffle=True, verbose=False)

    # 모델 생성
    cat = create_model('catboost')
    xgb = create_model('xgboost')
    et = create_model('et')

    # log_공급량 예측
    log_gas2019_pred_cat = predict_model(cat, data=temp_test)
    log_gas2019_pred_xgb = predict_model(xgb, data=temp_test)
    log_gas2019_pred_et = predict_model(et, data=temp_test)

    # log_공급량 값을 공급량 값으로 변환
    gas_pred2019_cat = np.expm1(log_gas2019_pred_cat['Label'])
    gas_pred2019_xgb = np.expm1(log_gas2019_pred_xgb['Label'])
    gas_pred2019_et = np.expm1(log_gas2019_pred_et['Label'])

    # 두 모델 공급량 예측 평균
    gas2019_division_pred = (gas_pred2019_cat + gas_pred2019_xgb + gas_pred2019_et) / 3

    # 구분별 예측 값 합치기
    gas2019_pred = pd.concat([gas2019_pred, gas2019_division_pred])
    gas2019_pred = gas2019_pred.reset_index(drop=True)
    #00:13:40

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.0769,0.0123,0.1107,0.9803,0.0199,0.0157
1,0.0761,0.012,0.1096,0.9804,0.0192,0.0154
2,0.0775,0.0154,0.1241,0.9752,0.0239,0.0162
3,0.0772,0.0124,0.1113,0.9807,0.0201,0.0159
4,0.0782,0.0129,0.1136,0.9791,0.0201,0.0159
5,0.0763,0.014,0.1184,0.9768,0.0202,0.0155
6,0.0755,0.0124,0.1116,0.9801,0.0205,0.0154
Mean,0.0768,0.0131,0.1142,0.979,0.0205,0.0157
SD,0.0008,0.0011,0.0049,0.002,0.0014,0.0003


100%|██████████| 7/7 [13:40<00:00, 117.26s/it]


In [18]:
# 예측 결과 확인
# Preview the combined supply predictions.
gas2019_pred.head()

Unnamed: 0,0
0,2011.612581
1,1820.897795
2,1703.468438
3,1697.320911
4,1908.195283


## 제출 파일 가져오기

In [19]:
# Load the submission template (key column plus a 공급량 column to fill in).
sub = pd.read_csv('../data/sample_submission.csv')
# Preview the first rows.
sub.head()

Unnamed: 0,일자|시간|구분,공급량
0,2019-01-01 01 A,0
1,2019-01-01 02 A,0
2,2019-01-01 03 A,0
3,2019-01-01 04 A,0
4,2019-01-01 05 A,0


In [20]:
# Fill the submission's 공급량 column with the predicted gas supply.
# NOTE(review): this relies on gas2019_pred being in the same row order as
# sample_submission.csv — confirm the two files list rows identically.
sub['공급량'] = gas2019_pred
# Preview the filled submission.
sub.head()

Unnamed: 0,일자|시간|구분,공급량
0,2019-01-01 01 A,2011.612581
1,2019-01-01 02 A,1820.897795
2,2019-01-01 03 A,1703.468438
3,2019-01-01 04 A,1697.320911
4,2019-01-01 05 A,1908.195283


In [21]:
# Check row count, dtypes and non-null counts of the submission frame before writing it.
sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15120 entries, 0 to 15119
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   일자|시간|구분  15120 non-null  object 
 1   공급량       15120 non-null  float64
dtypes: float64(1), object(1)
memory usage: 236.4+ KB


## 제출 파일 출력

In [22]:
# Write the submission CSV to the working directory, without the index column.
sub.to_csv("sub35_pycaret23(ver2.6).csv", index=False)