# pycaret을 이용한 가스공급량 예측
* 2019년 test 데이터 : '../data/test.csv'
* 2019년 제출 파일 : '../data/sample_submission.csv'
* 2013-2018년 가스공급량과 기온 자료 : '../2013-2018년_가스공급량_기온_습도_기압01.csv'
* 특성
    * 'month', 'weekday', '시간'으로 기온 예측
    * 기온, 기압, 습도 순서로 예측
    * 각 특성 예측 마다 앞에서 예측한 특성 활용
    * 공급량 예측(7개) : 'month', 'weekday', '시간', '구분', '기온', '기압', '습도'
* 공급량 log적용
* 2개 모델('lightgbm', 'catboost') 예측 평균
* 튜닝X, finalize_model X
* 구분별 훈련/예측
* 이상치 제거

In [1]:
# Submission file name: sub32_pycaret20.csv
# Comparison score against the best-scoring file (NMAE/R2): 0.01250 / 0.99547
# Submission score: 0.1030720506 (2021-12-04 18:44:17)

## 데이터 가져오기
* 2019년 test 데이터 : '../data/test.csv'
* 2019년 제출 파일 : '../data/sample_submission.csv'
* 2013-2018년 가스공급량과 기온 자료 : '../2013-2018년_가스공급량_기온_습도_기압01.csv'

In [2]:
from pycaret.regression import *

In [3]:
import time
import datetime
import pandas as pd
import numpy as np
from tqdm import tqdm

### 2013-2018년 가스공급량과 기온, 기압, 습도 자료

In [4]:
# Load the 2013-2018 history. The preview shows one row per date / 시간 / 구분
# with 공급량 plus the weather columns 기온, 습도 and 기압.
total = pd.read_csv('../2013-2018년_가스공급량_기온_습도_기압01.csv')
total.head()

Unnamed: 0,연월일,year,month,day,weekday,시간,구분,공급량,기온,습도,기압
0,2013-01-01,2013,1,1,1,1,0,2497.129,-8.5,57.0,1010.0
1,2013-01-01,2013,1,1,1,1,1,2169.093,-8.5,57.0,1010.0
2,2013-01-01,2013,1,1,1,1,2,226.178,-8.5,57.0,1010.0
3,2013-01-01,2013,1,1,1,1,3,1434.516,-8.5,57.0,1010.0
4,2013-01-01,2013,1,1,1,1,4,3272.837,-8.5,57.0,1010.0


### 2019년 테스트 데이터

In [5]:
# Load the 2019 test keys; the file has a single combined '일자|시간|구분' column.
test2019 = pd.read_csv('../data/test.csv')
test2019.head()

Unnamed: 0,일자|시간|구분
0,2019-01-01 01 A
1,2019-01-01 02 A
2,2019-01-01 03 A
3,2019-01-01 04 A
4,2019-01-01 05 A


In [6]:
# Split the combined key (e.g. "2019-01-01 01 A") into date, hour and 구분.
key_parts = test2019['일자|시간|구분'].str.split(' ', expand=True)
test2019['연월일'] = key_parts[0]
# str.split yields strings ('01'..'24'); cast explicitly so 시간 is numeric like
# the 시간 column of the training frame instead of relying on implicit casting
# when the column is later fed to the models.
test2019['시간'] = key_parts[1].astype(int)
test2019['구분'] = key_parts[2]
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분
0,2019-01-01 01 A,2019-01-01,1,A
1,2019-01-01 02 A,2019-01-01,2,A
2,2019-01-01 03 A,2019-01-01,3,A
3,2019-01-01 04 A,2019-01-01,4,A
4,2019-01-01 05 A,2019-01-01,5,A


In [7]:
# Convert the 구분 letters to integers, numbered in order of first appearance
# in the test file (the previews show 'A' becoming 0).
# NOTE(review): presumably this has to agree with the integer 구분 codes already
# stored in `total` — confirm against how that file was encoded.
d_map = {label: code for code, label in enumerate(test2019['구분'].unique())}
test2019['구분'] = test2019['구분'].map(d_map)
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분
0,2019-01-01 01 A,2019-01-01,1,0
1,2019-01-01 02 A,2019-01-01,2,0
2,2019-01-01 03 A,2019-01-01,3,0
3,2019-01-01 04 A,2019-01-01,4,0
4,2019-01-01 05 A,2019-01-01,5,0


In [8]:
# 연월일 is still an object (string) column: parse it to datetime, then derive
# the calendar features year / month / day / weekday from it.
test2019['연월일'] = pd.to_datetime(test2019['연월일'])
for part in ('year', 'month', 'day', 'weekday'):
    test2019[part] = getattr(test2019['연월일'].dt, part)
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1


## 2019년 기온 훈련/예측

In [9]:
# Train and predict 2019 기온 (temperature); wall-clock time is printed at the end.
start = time.time()

# Features: 'month', 'weekday', '시간'
# Target:   '기온'
test_col = ['month', 'weekday', '시간']
target_col = ['기온']
train_col = test_col + target_col

# Training rows come from the 2013-2018 history, prediction rows from the 2019 keys.
temp_train = total[train_col]
temp_test = test2019[test_col]

# PyCaret experiment setup: outlier removal enabled, 7 shuffled folds, no prompts/output.
exp = setup(temp_train, target=target_col[0], silent=True, remove_outliers=True, fold=7, fold_shuffle=True, verbose=False)

# Create the two models (no tuning, no finalize_model).
lgbm = create_model('lightgbm')
cat = create_model('catboost')

# Predict 기온 with each model; the prediction is read from the 'Label' column below.
temp2019_pred_lgbm = predict_model(lgbm, data=temp_test)
temp2019_pred_cat = predict_model(cat, data=temp_test)

# Average the two models' 기온 predictions.
temp2019_pred = (temp2019_pred_lgbm['Label'] + temp2019_pred_cat['Label']) / 2

# Add the predicted 2019 기온 as a new column of test2019.
test2019[target_col[0]] = temp2019_pred

# Print the elapsed time as H:MM:SS (fractional seconds are cut off).
end = time.time()
sec = end - start
result = datetime.timedelta(seconds=sec)
print(str(result).split('.')[0])

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,2.9956,14.9129,3.8617,0.8828,0.5062,0.7694
1,2.9868,14.8858,3.8582,0.8824,0.509,0.8034
2,2.9937,14.8863,3.8583,0.8823,0.5024,0.7759
3,2.9981,14.893,3.8591,0.8828,0.5141,0.8017
4,2.9943,14.9128,3.8617,0.8822,0.5091,0.8124
5,3.0171,15.0507,3.8795,0.8819,0.517,0.7914
6,2.9947,14.905,3.8607,0.8833,0.5119,0.7861
Mean,2.9972,14.9209,3.8628,0.8825,0.51,0.7915
SD,0.0087,0.0541,0.007,0.0004,0.0045,0.0144


0:04:00


In [10]:
# Preview test2019 to confirm the predicted '기온' column was added.
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,기온
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1,-3.832914
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1,-4.400807
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1,-4.613112
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1,-4.899615
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1,-5.011229


## 2019년 기압 훈련/예측

In [11]:
# Train and predict 2019 기압 (pressure); wall-clock time is printed at the end.
start = time.time()

# Features: 'month', 'weekday', '시간', '기온'
# Target:   '기압'
test_col = ['month', 'weekday', '시간', '기온']
target_col = ['기압']
train_col = test_col + target_col

# The model is trained on the 기온 values stored in `total`, but predicts from the
# 기온 column of test2019, which holds model predictions added in an earlier cell.
temp_train = total[train_col]
temp_test = test2019[test_col]

# PyCaret experiment setup: outlier removal enabled, 7 shuffled folds, no prompts/output.
exp = setup(temp_train, target=target_col[0], silent=True, remove_outliers=True, fold=7, fold_shuffle=True, verbose=False)

# Create the two models (no tuning, no finalize_model).
lgbm = create_model('lightgbm')
cat = create_model('catboost')

# Predict 기압 with each model; the prediction is read from the 'Label' column below.
pressure2019_pred_lgbm = predict_model(lgbm, data=temp_test)
pressure2019_pred_cat = predict_model(cat, data=temp_test)

# Average the two models' 기압 predictions.
pressure2019_pred = (pressure2019_pred_lgbm['Label'] + pressure2019_pred_cat['Label']) / 2

# Add the predicted 2019 기압 as a new column of test2019.
test2019[target_col[0]] = pressure2019_pred

# Print the elapsed time as H:MM:SS (fractional seconds are cut off).
end = time.time()
sec = end - start
result = datetime.timedelta(seconds=sec)
print(str(result).split('.')[0])

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,3.2365,17.5202,4.1857,0.7166,0.0042,0.0032
1,3.2489,17.6898,4.2059,0.7154,0.0042,0.0032
2,3.2178,17.4874,4.1818,0.7155,0.0042,0.0032
3,3.2517,17.6549,4.2018,0.7137,0.0042,0.0032
4,3.2627,17.8565,4.2257,0.7113,0.0042,0.0032
5,3.2681,18.0271,4.2458,0.7104,0.0042,0.0033
6,3.266,17.8402,4.2238,0.7107,0.0042,0.0032
Mean,3.2503,17.7251,4.2101,0.7134,0.0042,0.0032
SD,0.0168,0.1798,0.0213,0.0024,0.0,0.0


0:03:25


In [12]:
# Preview test2019 to confirm the predicted '기압' column was added.
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,기온,기압
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1,-3.832914,1013.405642
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1,-4.400807,1015.146309
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1,-4.613112,1015.593736
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1,-4.899615,1015.335428
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1,-5.011229,1015.023332


## 2019년 습도(Humidity) 훈련/예측

In [13]:
# Train and predict 2019 습도 (humidity); wall-clock time is printed at the end.
start = time.time()

# Features: 'month', 'weekday', '시간', '기온', '기압'
# Target:   '습도'
test_col = ['month', 'weekday', '시간', '기온', '기압']
target_col = ['습도']
train_col = test_col + target_col

# The model is trained on the 기온/기압 values stored in `total`, but predicts from the
# 기온/기압 columns of test2019, which hold model predictions added in earlier cells.
temp_train = total[train_col]
temp_test = test2019[test_col]

# PyCaret experiment setup: outlier removal enabled, 7 shuffled folds, no prompts/output.
exp = setup(temp_train, target=target_col[0], silent=True, remove_outliers=True, fold=7, fold_shuffle=True, verbose=False)

# Create the two models (no tuning, no finalize_model).
lgbm = create_model('lightgbm')
cat = create_model('catboost')

# Predict 습도 with each model; the prediction is read from the 'Label' column below.
humidity2019_pred_lgbm = predict_model(lgbm, data=temp_test)
humidity2019_pred_cat = predict_model(cat, data=temp_test)

# Average the two models' 습도 predictions.
humidity2019_pred = (humidity2019_pred_lgbm['Label'] + humidity2019_pred_cat['Label']) / 2

# Add the predicted 2019 습도 as a new column of test2019.
test2019[target_col[0]] = humidity2019_pred

# Print the elapsed time as H:MM:SS (fractional seconds are cut off).
end = time.time()
sec = end - start
result = datetime.timedelta(seconds=sec)
print(str(result).split('.')[0])

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,9.27,140.7188,11.8625,0.6491,0.2294,0.1889
1,9.2776,140.3193,11.8456,0.6517,0.2315,0.1912
2,9.2377,139.6802,11.8186,0.6489,0.2296,0.1889
3,9.3761,143.6736,11.9864,0.6429,0.2348,0.194
4,9.2506,140.1552,11.8387,0.6461,0.2299,0.189
5,9.3935,144.2066,12.0086,0.6407,0.2318,0.191
6,9.2122,139.4188,11.8076,0.6482,0.2288,0.1881
Mean,9.2882,141.1675,11.8812,0.6468,0.2308,0.1902
SD,0.0644,1.8021,0.0756,0.0036,0.0019,0.0019


0:03:28


In [14]:
# Preview test2019 to confirm the predicted '습도' column was added.
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,기온,기압,습도
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1,-3.832914,1013.405642,58.864857
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1,-4.400807,1015.146309,59.603353
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1,-4.613112,1015.593736,59.82586
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1,-4.899615,1015.335428,62.6704
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1,-5.011229,1015.023332,63.95385


## 2019년 가스 공급량 훈련/예측

### log_공급량으로 진행

In [15]:
# Add log1p(공급량) as the training target; the supply predictions are mapped
# back to the original scale with np.expm1 in the per-구분 prediction cell.
total['log_공급량'] = np.log1p(total['공급량'])
total.head()

Unnamed: 0,연월일,year,month,day,weekday,시간,구분,공급량,기온,습도,기압,log_공급량
0,2013-01-01,2013,1,1,1,1,0,2497.129,-8.5,57.0,1010.0,7.823297
1,2013-01-01,2013,1,1,1,1,1,2169.093,-8.5,57.0,1010.0,7.682525
2,2013-01-01,2013,1,1,1,1,2,226.178,-8.5,57.0,1010.0,5.425734
3,2013-01-01,2013,1,1,1,1,3,1434.516,-8.5,57.0,1010.0,7.26928
4,2013-01-01,2013,1,1,1,1,4,3272.837,-8.5,57.0,1010.0,8.093718


## 2019년 log_공급량 훈련/예측, 구분별

In [16]:
# Train and predict 2019 log_공급량 separately for every 구분, convert the
# predictions back to 공급량, and collect them in the row order of test2019.
divisions = test2019['구분'].unique()

# Features: 'month', 'weekday', '시간', '구분', '기온', '기압'
# Target:   'log_공급량'
# NOTE(review): the predicted '습도' column is not part of this feature list even
# though the notebook header lists it among the supply features — confirm which
# of the two is intended.
test_col = ['month', 'weekday', '시간', '구분', '기온', '기압']
target_col = ['log_공급량']
train_col = test_col + target_col

# One Series per 구분, each labelled with the original test2019 row index.
division_preds = []
for division in tqdm(divisions):
    # Training / prediction subsets for this 구분.
    temp_train = total[total['구분'] == division].reset_index(drop=True)
    temp_train = temp_train[train_col]

    # Keep the un-reset subset so its index can be re-attached to the predictions.
    test_rows = test2019[test2019['구분'] == division]
    temp_test = test_rows[test_col].reset_index(drop=True)

    # PyCaret experiment setup: outlier removal enabled, 7 shuffled folds, no prompts/output.
    exp = setup(temp_train, target=target_col[0], silent=True, remove_outliers=True, fold=7, fold_shuffle=True, verbose=False)

    # Create the two models (no tuning, no finalize_model).
    lgbm = create_model('lightgbm')
    cat = create_model('catboost')

    # Predict log_공급량 with each model.
    log_gas2019_pred_lgbm = predict_model(lgbm, data=temp_test)
    log_gas2019_pred_cat = predict_model(cat, data=temp_test)

    # Undo the log1p transform to get back to 공급량.
    gas_pred2019_lgbm = np.expm1(log_gas2019_pred_lgbm['Label'])
    gas_pred2019_cat = np.expm1(log_gas2019_pred_cat['Label'])

    # Average the two models' 공급량 predictions.
    gas2019_division_pred = (gas_pred2019_lgbm + gas_pred2019_cat) / 2

    # Label each prediction with the test2019 row it belongs to. Stacking the
    # per-구분 results positionally would only match the submission rows when
    # the test file happens to be grouped by 구분 in exactly this order.
    division_preds.append(
        pd.Series(np.asarray(gas2019_division_pred), index=test_rows.index)
    )

# Combine once after the loop (no repeated concat) and restore the test2019 row
# order. The result stays a one-column DataFrame (column 0), as before.
if division_preds:
    gas2019_pred = pd.concat(division_preds).sort_index().to_frame()
else:
    gas2019_pred = pd.DataFrame()

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.0731,0.0117,0.108,0.9813,0.019,0.0147
1,0.0736,0.0118,0.1084,0.9811,0.0187,0.0149
2,0.072,0.0138,0.1173,0.9776,0.0195,0.0145
3,0.0726,0.0126,0.1122,0.9795,0.0221,0.0151
4,0.0725,0.0104,0.1021,0.9834,0.0181,0.0148
5,0.0719,0.0105,0.1024,0.9831,0.0182,0.0145
6,0.0715,0.0101,0.1006,0.9829,0.0175,0.0144
Mean,0.0724,0.0115,0.1073,0.9813,0.019,0.0147
SD,0.0006,0.0012,0.0056,0.002,0.0014,0.0002


100%|██████████| 7/7 [05:14<00:00, 44.88s/it]


In [17]:
# Check the combined prediction results.
gas2019_pred.head()

Unnamed: 0,0
0,2017.317663
1,1796.894612
2,1707.909671
3,1735.39218
4,1926.688007


## 제출 파일 가져오기

In [18]:
# Load the submission template; the preview shows the key column and a 공급량 column of zeros.
sub = pd.read_csv('../data/sample_submission.csv')
sub.head()

Unnamed: 0,일자|시간|구분,공급량
0,2019-01-01 01 A,0
1,2019-01-01 02 A,0
2,2019-01-01 03 A,0
3,2019-01-01 04 A,0
4,2019-01-01 05 A,0


In [19]:
# Fill the submission's 공급량 column with the predictions.
# NOTE(review): the values are matched to `sub` through the index, so this presumably
# relies on gas2019_pred being in the same row order as the submission file — confirm.
sub['공급량'] = gas2019_pred
sub.head()

Unnamed: 0,일자|시간|구분,공급량
0,2019-01-01 01 A,2017.317663
1,2019-01-01 02 A,1796.894612
2,2019-01-01 03 A,1707.909671
3,2019-01-01 04 A,1735.39218
4,2019-01-01 05 A,1926.688007


In [20]:
# Check the row count and that 공급량 has no missing values before writing the file.
sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15120 entries, 0 to 15119
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   일자|시간|구분  15120 non-null  object 
 1   공급량       15120 non-null  float64
dtypes: float64(1), object(1)
memory usage: 236.4+ KB


## 제출 파일 출력

In [21]:
# Write the submission file without the DataFrame index column.
sub.to_csv("sub32_pycaret20.csv", index=False)