# pycaret을 이용한 가스공급량 예측
* 2019년 test 데이터 : '../data/test.csv'
* 2019년 제출 파일 : '../data/sample_submission.csv'
* 2013-2018년 가스공급량과 기온·습도·기압 자료 : '../2013-2018년_가스공급량_기온_습도_기압01.csv'
* 특성
    * 'month', 'weekday', '시간'으로 기온 예측
    * 기온, 기압, 습도 순서로 예측
    * 각 특성 예측 마다 앞에서 예측한 특성 활용
    * 공급량 예측(7개) : 'month', 'weekday', '시간', '구분', '기온', '기압', '습도'
* 공급량 log적용
* 2개 모델('lightgbm', 'catboost') 예측 평균
* 튜닝X, finalize_model X
* 구분별 훈련/예측
* 이상치 제거

In [1]:
# Submission file name: sub32_pycaret20.csv
# Score compared against the best-scoring file (NMAE/R2): 0.01162 / 0.99611
# Submission score:

## 데이터 가져오기
* 2019년 test 데이터 : '../data/test.csv'
* 2019년 제출 파일 : '../data/sample_submission.csv'
* 2013-2018년 가스공급량과 기온·습도·기압 자료 : '../2013-2018년_가스공급량_기온_습도_기압01.csv'

In [2]:
from pycaret.regression import *

In [3]:
import time
import datetime
import pandas as pd
import numpy as np
from tqdm import tqdm

### 2013-2018년 가스공급량과 기온, 기압, 습도 자료

In [4]:
# Load the 2013-2018 table (gas supply plus temperature, humidity and
# pressure columns) used as training data throughout the notebook.
total_csv_path = '../2013-2018년_가스공급량_기온_습도_기압01.csv'
total = pd.read_csv(total_csv_path)
# Preview the first five rows.
total.head(n=5)

Unnamed: 0,연월일,year,month,day,weekday,시간,구분,공급량,기온,습도,기압
0,2013-01-01,2013,1,1,1,1,0,2497.129,-8.5,57.0,1010.0
1,2013-01-01,2013,1,1,1,1,1,2169.093,-8.5,57.0,1010.0
2,2013-01-01,2013,1,1,1,1,2,226.178,-8.5,57.0,1010.0
3,2013-01-01,2013,1,1,1,1,3,1434.516,-8.5,57.0,1010.0
4,2013-01-01,2013,1,1,1,1,4,3272.837,-8.5,57.0,1010.0


### 2019년 테스트 데이터

In [5]:
# Load the 2019 test rows; each row is a single combined
# '일자|시간|구분' key.
test_csv_path = '../data/test.csv'
test2019 = pd.read_csv(test_csv_path)
# Preview the first five rows.
test2019.head(n=5)

Unnamed: 0,일자|시간|구분
0,2019-01-01 01 A
1,2019-01-01 02 A
2,2019-01-01 03 A
3,2019-01-01 04 A
4,2019-01-01 05 A


In [6]:
# Split the combined '일자|시간|구분' key ("<date> <hour> <division>") into
# three separate columns.
key_parts = test2019['일자|시간|구분'].str.split(' ', expand=True)
test2019['연월일'] = key_parts[0]
# The split yields zero-padded strings such as '01'. Cast the hour to an
# integer so it has a numeric dtype like the hour column of the training
# frame (presumably integer there — see total.head()).
test2019['시간'] = key_parts[1].astype('int64')
test2019['구분'] = key_parts[2]
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분
0,2019-01-01 01 A,2019-01-01,1,A
1,2019-01-01 02 A,2019-01-01,2,A
2,2019-01-01 03 A,2019-01-01,3,A
3,2019-01-01 04 A,2019-01-01,4,A
4,2019-01-01 05 A,2019-01-01,5,A


In [7]:
# Convert the '구분' (division) labels to integers, numbered in order of
# first appearance in the test file.
# NOTE(review): presumably this numbering matches the integer codes already
# stored in total['구분'] — confirm against how that file was built.
d_map = {label: code for code, label in enumerate(test2019['구분'].unique())}
test2019['구분'] = test2019['구분'].map(d_map)
test2019.head(n=5)

Unnamed: 0,일자|시간|구분,연월일,시간,구분
0,2019-01-01 01 A,2019-01-01,1,0
1,2019-01-01 02 A,2019-01-01,2,0
2,2019-01-01 03 A,2019-01-01,3,0
3,2019-01-01 04 A,2019-01-01,4,0
4,2019-01-01 05 A,2019-01-01,5,0


In [8]:
# '연월일' is still an object column, so parse it to datetime and derive
# the year, month, day and weekday columns from it.
test2019['연월일'] = pd.to_datetime(test2019['연월일'])
date_accessor = test2019['연월일'].dt
for part in ('year', 'month', 'day', 'weekday'):
    test2019[part] = getattr(date_accessor, part)
test2019.head(n=5)

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1


## 2019년 기온 훈련/예측

In [9]:
# Train on the 2013-2018 data and predict the 2019 temperature ('기온').
# perf_counter is a monotonic clock, which suits elapsed-time measurement.
start = time.perf_counter()

# Features: 'month', 'weekday', '시간'
# Target:   '기온'
test_col = ['month', 'weekday', '시간']
target_col = ['기온']
train_col = test_col + target_col

temp_train = total[train_col]
temp_test = test2019[test_col]

# PyCaret setup. session_id fixes the experiment's random seed so the
# split, fold shuffling and model training are repeatable across runs.
exp = setup(
    temp_train,
    target=target_col[0],
    silent=True,
    remove_outliers=True,
    fold=7,
    fold_shuffle=True,
    verbose=False,
    session_id=42,
)

# Create the two models
lgbm = create_model('lightgbm')
cat = create_model('catboost')

# Predict the temperature with each model
temp2019_pred_lgbm = predict_model(lgbm, data=temp_test)
temp2019_pred_cat = predict_model(cat, data=temp_test)

# Average the two models' temperature predictions
temp2019_pred = (temp2019_pred_lgbm['Label'] + temp2019_pred_cat['Label']) / 2

# Add the predicted 2019 temperature as a new column
test2019[target_col[0]] = temp2019_pred

# Print the elapsed time as H:MM:SS (fractional seconds dropped)
end = time.perf_counter()
sec = end - start
result = datetime.timedelta(seconds=sec)
print(str(result).split('.')[0])

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,3.0012,14.9367,3.8648,0.8819,0.5156,0.8085
1,2.9963,14.8513,3.8537,0.8832,0.5134,0.8012
2,3.0018,14.9622,3.8681,0.8827,0.511,0.7779
3,2.9863,14.7941,3.8463,0.8836,0.5111,0.7602
4,2.9838,14.8489,3.8534,0.8823,0.5098,0.7895
5,3.0116,15.0545,3.88,0.8812,0.5135,0.796
6,3.0202,15.1344,3.8903,0.8802,0.5124,0.7958
Mean,3.0002,14.9403,3.8652,0.8821,0.5124,0.7899
SD,0.012,0.1127,0.0146,0.0011,0.0018,0.015


0:02:56


In [10]:
# Preview the first five rows, now including the predicted '기온' column.
test2019.head(n=5)

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,기온
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1,-3.995499
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1,-4.182278
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1,-4.351876
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1,-4.47119
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1,-4.858772


## 2019년 기압 훈련/예측

In [11]:
# Train on the 2013-2018 data and predict the 2019 pressure ('기압'),
# using the previously predicted temperature as an extra feature.
# perf_counter is a monotonic clock, which suits elapsed-time measurement.
start = time.perf_counter()

# Features: 'month', 'weekday', '시간', '기온'
# Target:   '기압'
test_col = ['month', 'weekday', '시간', '기온']
target_col = ['기압']
train_col = test_col + target_col

temp_train = total[train_col]
temp_test = test2019[test_col]

# PyCaret setup. session_id fixes the experiment's random seed so the
# split, fold shuffling and model training are repeatable across runs.
exp = setup(
    temp_train,
    target=target_col[0],
    silent=True,
    remove_outliers=True,
    fold=7,
    fold_shuffle=True,
    verbose=False,
    session_id=42,
)

# Create the two models
lgbm = create_model('lightgbm')
cat = create_model('catboost')

# Predict the pressure with each model
pressure2019_pred_lgbm = predict_model(lgbm, data=temp_test)
pressure2019_pred_cat = predict_model(cat, data=temp_test)

# Average the two models' pressure predictions
pressure2019_pred = (pressure2019_pred_lgbm['Label'] + pressure2019_pred_cat['Label']) / 2

# Add the predicted 2019 pressure as a new column
test2019[target_col[0]] = pressure2019_pred

# Print the elapsed time as H:MM:SS (fractional seconds dropped)
end = time.perf_counter()
sec = end - start
result = datetime.timedelta(seconds=sec)
print(str(result).split('.')[0])

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,3.2629,17.8234,4.2218,0.7134,0.0042,0.0032
1,3.2558,17.5127,4.1848,0.7157,0.0042,0.0032
2,3.2558,17.8452,4.2244,0.7128,0.0042,0.0032
3,3.2222,17.5151,4.1851,0.7137,0.0042,0.0032
4,3.2495,17.6706,4.2036,0.7121,0.0042,0.0032
5,3.2329,17.5432,4.1885,0.7163,0.0042,0.0032
6,3.254,17.6926,4.2063,0.7136,0.0042,0.0032
Mean,3.2476,17.6575,4.2021,0.7139,0.0042,0.0032
SD,0.0135,0.1301,0.0155,0.0014,0.0,0.0


0:02:55


In [12]:
# Preview the first five rows, now including the predicted '기압' column.
test2019.head(n=5)

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,기온,기압
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1,-3.995499,1014.222253
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1,-4.182278,1016.893713
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1,-4.351876,1016.013267
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1,-4.47119,1015.664004
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1,-4.858772,1015.777433


## 2019년 습도(Humidity) 훈련/예측

In [13]:
# Train on the 2013-2018 data and predict the 2019 humidity ('습도'),
# using the previously predicted temperature and pressure as features.
# perf_counter is a monotonic clock, which suits elapsed-time measurement.
start = time.perf_counter()

# Features: 'month', 'weekday', '시간', '기온', '기압'
# Target:   '습도'
test_col = ['month', 'weekday', '시간', '기온', '기압']
target_col = ['습도']
train_col = test_col + target_col

temp_train = total[train_col]
temp_test = test2019[test_col]

# PyCaret setup. session_id fixes the experiment's random seed so the
# split, fold shuffling and model training are repeatable across runs.
exp = setup(
    temp_train,
    target=target_col[0],
    silent=True,
    remove_outliers=True,
    fold=7,
    fold_shuffle=True,
    verbose=False,
    session_id=42,
)

# Create the two models
lgbm = create_model('lightgbm')
cat = create_model('catboost')

# Predict the humidity with each model
humidity2019_pred_lgbm = predict_model(lgbm, data=temp_test)
humidity2019_pred_cat = predict_model(cat, data=temp_test)

# Average the two models' humidity predictions
humidity2019_pred = (humidity2019_pred_lgbm['Label'] + humidity2019_pred_cat['Label']) / 2

# Add the predicted 2019 humidity as a new column
test2019[target_col[0]] = humidity2019_pred

# Print the elapsed time as H:MM:SS (fractional seconds dropped)
end = time.perf_counter()
sec = end - start
result = datetime.timedelta(seconds=sec)
print(str(result).split('.')[0])

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,9.3153,141.9086,11.9125,0.6461,0.2325,0.1919
1,9.2775,141.0996,11.8785,0.6467,0.2298,0.189
2,9.2199,138.9974,11.7897,0.6542,0.229,0.1884
3,9.2513,140.2973,11.8447,0.6463,0.2305,0.1896
4,9.2749,140.7538,11.864,0.6445,0.2297,0.1893
5,9.306,142.6399,11.9432,0.6448,0.2319,0.1909
6,9.3231,142.8973,11.954,0.6421,0.2329,0.1915
Mean,9.2811,141.2277,11.8838,0.6464,0.2309,0.1901
SD,0.0343,1.272,0.0536,0.0035,0.0014,0.0012


0:02:45


In [14]:
# Preview the first five rows, now including the predicted '습도' column.
test2019.head(n=5)

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,기온,기압,습도
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1,-3.995499,1014.222253,57.817913
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1,-4.182278,1016.893713,59.147953
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1,-4.351876,1016.013267,59.717267
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1,-4.47119,1015.664004,60.220933
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1,-4.858772,1015.777433,63.435311


## 2019년 가스 공급량 훈련/예측

### log_공급량으로 진행

In [15]:
# Store log(1 + supply) in a new 'log_공급량' column next to the raw supply.
supply = total['공급량']
total['log_공급량'] = np.log1p(supply)
total.head(n=5)

Unnamed: 0,연월일,year,month,day,weekday,시간,구분,공급량,기온,습도,기압,log_공급량
0,2013-01-01,2013,1,1,1,1,0,2497.129,-8.5,57.0,1010.0,7.823297
1,2013-01-01,2013,1,1,1,1,1,2169.093,-8.5,57.0,1010.0,7.682525
2,2013-01-01,2013,1,1,1,1,2,226.178,-8.5,57.0,1010.0,5.425734
3,2013-01-01,2013,1,1,1,1,3,1434.516,-8.5,57.0,1010.0,7.26928
4,2013-01-01,2013,1,1,1,1,4,3272.837,-8.5,57.0,1010.0,8.093718


## 2019년 log_공급량 훈련/예측, 구분별

In [16]:
# 구분에 따라 예측하기
divisions = test2019['구분'].unique()

# 학습 특성 : 'month', 'weekday', '시간', '구분', '기온', '기압', '습도'
# 타겟 특성 : 'log_공급량'
test_col = ['month', 'weekday', '시간', '구분', '기온', '기압', '습도']
target_col = ['log_공급량']
train_col = test_col + target_col

gas2019_pred = pd.DataFrame()
for division in tqdm(divisions):
    # 구분별 훈련, 테스트 셋 설정
    temp_train = total[total['구분'] == division].reset_index(drop=True)
    temp_train = temp_train[train_col]

    temp_test = test2019[test2019['구분'] == division].reset_index(drop=True)
    temp_test = temp_test[test_col]

    # pycaret 설정
    exp = setup(temp_train, target=target_col[0], silent=True, remove_outliers=True, fold=7, fold_shuffle=True, verbose=False)

    # 모델 생성
    lgbm = create_model('lightgbm')
    cat = create_model('catboost')

    # log_공급량 예측
    log_gas2019_pred_lgbm = predict_model(lgbm, data=temp_test)
    log_gas2019_pred_cat = predict_model(cat, data=temp_test)

    # log_공급량 값을 공급량 값으로 변환
    gas_pred2019_lgbm = np.expm1(log_gas2019_pred_lgbm['Label'])
    gas_pred2019_cat = np.expm1(log_gas2019_pred_cat['Label'])

    # 두 모델 공급량 예측 평균
    gas2019_division_pred = (gas_pred2019_lgbm + gas_pred2019_cat) / 2

    # 구분별 예측 값 합치기
    gas2019_pred = pd.concat([gas2019_pred, gas2019_division_pred])
    gas2019_pred = gas2019_pred.reset_index(drop=True)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.0718,0.0097,0.0987,0.9841,0.0175,0.0146
1,0.0705,0.0105,0.1026,0.9825,0.0176,0.0142
2,0.0706,0.0098,0.0992,0.9838,0.0175,0.0143
3,0.071,0.0107,0.1036,0.9829,0.018,0.0144
4,0.0729,0.0104,0.1018,0.9833,0.0182,0.0149
5,0.0715,0.0102,0.1012,0.9838,0.0179,0.0145
6,0.073,0.0154,0.124,0.9753,0.0231,0.0151
Mean,0.0716,0.011,0.1044,0.9823,0.0185,0.0146
SD,0.0009,0.0018,0.0081,0.0029,0.0019,0.0003


100%|██████████| 7/7 [04:34<00:00, 39.26s/it]


In [17]:
# 예측 결과 확인
gas2019_pred.head()

Unnamed: 0,0
0,2005.533436
1,1812.946352
2,1712.445154
3,1717.366088
4,1959.75848


## 제출 파일 가져오기

In [18]:
# Load the sample submission file that will receive the predicted supply.
submission_csv_path = '../data/sample_submission.csv'
sub = pd.read_csv(submission_csv_path)
# Preview the first five rows.
sub.head(n=5)

Unnamed: 0,일자|시간|구분,공급량
0,2019-01-01 01 A,0
1,2019-01-01 02 A,0
2,2019-01-01 03 A,0
3,2019-01-01 04 A,0
4,2019-01-01 05 A,0


In [19]:
# Fill the submission's '공급량' column with the predicted gas supply.
# NOTE(review): gas2019_pred is a one-column DataFrame (see its head()
# output); this assignment presumably relies on its rows being in the same
# order as the rows of the submission file — confirm.
sub['공급량'] = gas2019_pred
sub.head()

Unnamed: 0,일자|시간|구분,공급량
0,2019-01-01 01 A,2005.533436
1,2019-01-01 02 A,1812.946352
2,2019-01-01 03 A,1712.445154
3,2019-01-01 04 A,1717.366088
4,2019-01-01 05 A,1959.75848


In [20]:
# Print the submission frame's summary (row count, non-null counts, dtypes)
# as a final check before writing it out.
sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15120 entries, 0 to 15119
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   일자|시간|구분  15120 non-null  object 
 1   공급량       15120 non-null  float64
dtypes: float64(1), object(1)
memory usage: 236.4+ KB


## 제출 파일 출력

In [21]:
# Write the submission file without the index column.
submission_out_path = "sub32_pycaret20.csv"
sub.to_csv(submission_out_path, index=False)