# pycaret을 이용한 가스공급량 예측
* 2019년 test 데이터 : '../data/test.csv'
* 2019년 제출 파일 : '../data/sample_submission.csv'
* 2013-2018년 가스공급량과 기온, 습도, 기압 차 자료 : '../2013-2018년_가스공급량_기온_습도_기압_차.csv'
* 특성
    * 'month', 'weekday', '시간'으로 기온차 예측
    * 기온차, 기압차, 습도차 순서로 예측
    * 각 특성 예측 마다 앞에서 예측한 특성 활용
    * 공급량차 예측(7개) : 'month', 'weekday', '시간', '구분', '기온차', '기압차', '습도차'
* 2개 모델('lightgbm', 'catboost') 예측 평균
* 튜닝X, finalize_model X
* 구분별 훈련/예측
* 차로 계산
* outliers 제거

In [1]:
# Submission file name: sub31_pycaret19.csv
# Score compared against the best-scoring file (NMAE/R2): 0.45226 / -6.44388
# Submission score: -

## 데이터 가져오기
* 2019년 test 데이터 : '../data/test.csv'
* 2019년 제출 파일 : '../data/sample_submission.csv'
* 2013-2018년 가스공급량과 기온, 습도, 기압 차 자료 : '../2013-2018년_가스공급량_기온_습도_기압_차.csv'

In [1]:
from pycaret.regression import *

In [2]:
import time
import datetime
import pandas as pd
from tqdm import tqdm

### 2013-2018년 가스공급량과 기온, 습도, 기압 차 자료 불러오기

In [3]:
# Load the 2013-2018 training table. Per the preview below it holds calendar
# fields, hour, division, supply/temperature/humidity/pressure and a
# difference ("차") column for each of those four measurements.
total = pd.read_csv('../2013-2018년_가스공급량_기온_습도_기압_차.csv')
total.head()

Unnamed: 0,연월일,year,month,day,weekday,시간,구분,공급량,기온,습도,기압,공급량차,기온차,습도차,기압차
0,2013-01-01,2013,1,1,1,1,0,2497.129,-8.5,57.0,1010.0,0.0,0.0,0.0,0.0
1,2013-01-01,2013,1,1,1,2,0,2363.265,-8.4,60.0,1009.4,-133.864,0.1,3.0,-0.6
2,2013-01-01,2013,1,1,1,3,0,2258.505,-8.1,58.0,1009.2,-104.76,0.3,-2.0,-0.2
3,2013-01-01,2013,1,1,1,4,0,2243.969,-8.2,58.0,1008.2,-14.536,-0.1,0.0,-1.0
4,2013-01-01,2013,1,1,1,5,0,2344.105,-8.2,61.0,1007.3,100.136,0.0,3.0,-0.9


### 2019년 데이터

In [4]:
# Load the 2019 test keys; each row is a single "date hour division" string.
test2019 = pd.read_csv('../data/test.csv')
test2019.head()

Unnamed: 0,일자|시간|구분
0,2019-01-01 01 A
1,2019-01-01 02 A
2,2019-01-01 03 A
3,2019-01-01 04 A
4,2019-01-01 05 A


In [5]:
# Break the combined "date hour division" key into its three
# space-separated parts and store each part in its own column.
key_parts = test2019['일자|시간|구분'].str.split(' ', expand=True)
test2019[['연월일', '시간', '구분']] = key_parts
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분
0,2019-01-01 01 A,2019-01-01,1,A
1,2019-01-01 02 A,2019-01-01,2,A
2,2019-01-01 03 A,2019-01-01,3,A
3,2019-01-01 04 A,2019-01-01,4,A
4,2019-01-01 05 A,2019-01-01,5,A


In [6]:
# Encode each division label as an integer, numbered in order of first
# appearance in the test data.
# NOTE(review): assumes this numbering matches the integer '구분' codes already
# present in the training table - confirm.
division_labels = test2019['구분'].unique()
d_map = dict(zip(division_labels, range(len(division_labels))))
test2019['구분'] = test2019['구분'].map(d_map)
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분
0,2019-01-01 01 A,2019-01-01,1,0
1,2019-01-01 02 A,2019-01-01,2,0
2,2019-01-01 03 A,2019-01-01,3,0
3,2019-01-01 04 A,2019-01-01,4,0
4,2019-01-01 05 A,2019-01-01,5,0


In [7]:
# The date column arrives as strings: parse it to datetime, then derive one
# integer column per calendar component used by the models.
test2019['연월일'] = pd.to_datetime(test2019['연월일'])
date_accessor = test2019['연월일'].dt
for part in ('year', 'month', 'day', 'weekday'):
    test2019[part] = getattr(date_accessor, part)
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1


In [8]:
# Check column dtypes; the hour column ('시간') is still object at this point.
test2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15120 entries, 0 to 15119
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   일자|시간|구분  15120 non-null  object        
 1   연월일       15120 non-null  datetime64[ns]
 2   시간        15120 non-null  object        
 3   구분        15120 non-null  int64         
 4   year      15120 non-null  int64         
 5   month     15120 non-null  int64         
 6   day       15120 non-null  int64         
 7   weekday   15120 non-null  int64         
dtypes: datetime64[ns](1), int64(5), object(2)
memory usage: 945.1+ KB


In [9]:
# Cast the hour column from string to integer so it can be used as a numeric
# feature; info() confirms the resulting dtype.
test2019['시간']= test2019['시간'].astype(int)
test2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15120 entries, 0 to 15119
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   일자|시간|구분  15120 non-null  object        
 1   연월일       15120 non-null  datetime64[ns]
 2   시간        15120 non-null  int32         
 3   구분        15120 non-null  int64         
 4   year      15120 non-null  int64         
 5   month     15120 non-null  int64         
 6   day       15120 non-null  int64         
 7   weekday   15120 non-null  int64         
dtypes: datetime64[ns](1), int32(1), int64(5), object(1)
memory usage: 886.1+ KB


## 2019년 기온차 훈련/예측

In [10]:
start = time.time()

# Features: 'month', 'weekday', '시간'
# Target:   '기온차' (temperature difference)
test_col = ['month', 'weekday', '시간']
target_col = ['기온차']
train_col = test_col + target_col

train = total[train_col]
test = test2019[test_col]

# PyCaret experiment setup. session_id pins the random seed so the train/test
# split, fold shuffling and model training are reproducible between runs.
exp = setup(train, target=target_col[0], silent=True, remove_outliers=True, fold=7, fold_shuffle=True, verbose=False, session_id=42)

# Train the two models with default hyperparameters (no tuning, no finalize).
lgbm = create_model('lightgbm')
cat = create_model('catboost')

# Predict the 2019 temperature difference with each model.
temp2019_pred_lgbm = predict_model(lgbm, data=test)
temp2019_pred_cat = predict_model(cat, data=test)

# Ensemble: simple mean of the two models' predictions.
temp2019_pred = (temp2019_pred_lgbm['Label'] + temp2019_pred_cat['Label']) / 2

# Store the predicted temperature difference as a new 2019 feature column.
test2019[target_col[0]] = temp2019_pred

# Report elapsed wall-clock time as H:MM:SS (fractional seconds dropped).
end = time.time()
sec = end - start
result = datetime.timedelta(seconds=sec)
print(str(result).split('.')[0])

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.3958,0.303,0.5505,0.6674,0.2613,0.9596
1,0.3988,0.3082,0.5551,0.6636,0.2625,0.9632
2,0.3982,0.3066,0.5537,0.6635,0.2633,0.9536
3,0.3949,0.3022,0.5497,0.6685,0.2611,0.9482
4,0.3953,0.3025,0.55,0.6698,0.2619,0.9619
5,0.3936,0.3032,0.5507,0.6686,0.2614,0.9414
6,0.3975,0.3068,0.5539,0.6606,0.2628,0.9738
Mean,0.3963,0.3046,0.5519,0.666,0.262,0.9574
SD,0.0018,0.0023,0.0021,0.0032,0.0008,0.0099


0:04:06


In [11]:
# Preview the test frame with the predicted '기온차' column added.
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,기온차
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1,-0.367125
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1,-0.370324
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1,-0.335105
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1,-0.298285
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1,-0.23088


## 2019년 기압차 훈련/예측

In [13]:
start = time.time()

# Features: 'month', 'weekday', '시간', '기온차'
# Target:   '기압차' (pressure difference)
test_col = ['month', 'weekday', '시간', '기온차']
target_col = ['기압차']
train_col = test_col + target_col

train = total[train_col]
test = test2019[test_col]

# PyCaret experiment setup. session_id pins the random seed so the train/test
# split, fold shuffling and model training are reproducible between runs.
exp = setup(train, target=target_col[0], silent=True, remove_outliers=True, fold=7, fold_shuffle=True, verbose=False, session_id=42)

# Train the two models with default hyperparameters (no tuning, no finalize).
lgbm = create_model('lightgbm')
cat = create_model('catboost')

# Predict the 2019 pressure difference with each model.
pressure2019_pred_lgbm = predict_model(lgbm, data=test)
pressure2019_pred_cat = predict_model(cat, data=test)

# Ensemble: simple mean of the two models' predictions.
pressure2019_pred = (pressure2019_pred_lgbm['Label'] + pressure2019_pred_cat['Label']) / 2

# Store the predicted pressure difference as a new 2019 feature column.
test2019[target_col[0]] = pressure2019_pred

# Report elapsed wall-clock time as H:MM:SS (fractional seconds dropped).
end = time.time()
sec = end - start
result = datetime.timedelta(seconds=sec)
print(str(result).split('.')[0])

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.253,0.1142,0.338,0.4576,0.2048,0.8576
1,0.254,0.1145,0.3384,0.4529,0.205,0.865
2,0.2568,0.1166,0.3415,0.452,0.207,0.8655
3,0.2554,0.1155,0.3398,0.454,0.2061,0.8619
4,0.2558,0.1184,0.3441,0.4447,0.2065,0.8617
5,0.2552,0.1157,0.3401,0.4568,0.2057,0.8619
6,0.2541,0.1154,0.3398,0.4531,0.2057,0.8602
Mean,0.2549,0.1158,0.3402,0.453,0.2058,0.862
SD,0.0012,0.0013,0.0019,0.0039,0.0007,0.0025


0:03:57


In [14]:
# Preview the test frame with the predicted '기압차' column added.
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,기온차,기압차
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1,-0.367125,-0.01411
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1,-0.370324,0.115782
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1,-0.335105,0.201484
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1,-0.298285,-0.105884
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1,-0.23088,-0.001142


## 2019년 습도차 훈련/예측

In [15]:
start = time.time()

# Features: 'month', 'weekday', '시간', '기온차', '기압차'
# Target:   '습도차' (humidity difference)
test_col = ['month', 'weekday', '시간', '기온차', '기압차']
target_col = ['습도차']
train_col = test_col + target_col

train = total[train_col]
test = test2019[test_col]

# PyCaret experiment setup. session_id pins the random seed so the train/test
# split, fold shuffling and model training are reproducible between runs.
exp = setup(train, target=target_col[0], silent=True, remove_outliers=True, fold=7, fold_shuffle=True, verbose=False, session_id=42)

# Train the two models with default hyperparameters (no tuning, no finalize).
lgbm = create_model('lightgbm')
cat = create_model('catboost')

# Predict the 2019 humidity difference with each model.
humidity2019_pred_lgbm = predict_model(lgbm, data=test)
humidity2019_pred_cat = predict_model(cat, data=test)

# Ensemble: simple mean of the two models' predictions.
humidity2019_pred = (humidity2019_pred_lgbm['Label'] + humidity2019_pred_cat['Label']) / 2

# Store the predicted humidity difference as a new 2019 feature column.
test2019[target_col[0]] = humidity2019_pred

# Report elapsed wall-clock time as H:MM:SS (fractional seconds dropped).
end = time.time()
sec = end - start
result = datetime.timedelta(seconds=sec)
print(str(result).split('.')[0])

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,2.3383,11.0147,3.3188,0.512,0.6855,0.829
1,2.3299,10.9168,3.3041,0.5199,0.6806,0.8362
2,2.3456,11.0725,3.3275,0.5199,0.6835,0.8365
3,2.3105,10.6019,3.256,0.5344,0.6813,0.824
4,2.3431,11.0021,3.3169,0.5197,0.6862,0.8289
5,2.319,11.0157,3.319,0.5224,0.6825,0.8273
6,2.3497,11.3261,3.3654,0.5225,0.6829,0.8266
Mean,2.3337,10.9928,3.3154,0.5215,0.6832,0.8298
SD,0.0135,0.199,0.0301,0.0062,0.0019,0.0044


0:04:06


In [16]:
# Preview the test frame with the predicted '습도차' column added.
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,기온차,기압차,습도차
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1,-0.367125,-0.01411,1.247158
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1,-0.370324,0.115782,1.374254
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1,-0.335105,0.201484,1.016894
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1,-0.298285,-0.105884,1.128415
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1,-0.23088,-0.001142,1.181394


## 2019년 공급량차 구분별 훈련/예측

In [17]:
# Features: 'month', 'weekday', '시간', '구분', '기온차', '기압차', '습도차'
# Target:   '공급량차' (supply difference)
test_col = ['month', 'weekday', '시간', '구분', '기온차', '기압차', '습도차']
target_col = ['공급량차']
train_col = test_col + target_col

train = total[train_col]
test = test2019[test_col]

# One model pair is trained and applied per division.
divisions = test['구분'].unique()

# Create the target column up front; each division fills in its own rows.
test2019[target_col[0]] = float('nan')

division_preds = []
for division in tqdm(divisions):
    # Train/test subsets for this division only.
    temp_train = train[train['구분'] == division].reset_index(drop=True)
    division_mask = test['구분'] == division
    temp_test = test[division_mask].reset_index(drop=True)

    # PyCaret experiment setup. session_id pins the random seed so the
    # split, fold shuffling and model training are reproducible.
    exp = setup(temp_train, target=target_col[0], silent=True, remove_outliers=True, fold=7, fold_shuffle=True, verbose=False, session_id=42)

    # Train the two models with default hyperparameters.
    lgbm = create_model('lightgbm')
    cat = create_model('catboost')

    # Predict the supply difference with each model.
    gas_rate_2019_pred_lgbm = predict_model(lgbm, data=temp_test)
    gas_rate_2019_pred_cat = predict_model(cat, data=temp_test)

    # Ensemble: simple mean of the two models' predictions.
    gas_rate_2019_division_pred = (gas_rate_2019_pred_lgbm['Label'] + gas_rate_2019_pred_cat['Label']) / 2

    # Write the predictions straight into this division's rows. Using the
    # mask (instead of relying on concatenation order) keeps the result
    # correct even if test2019 is not grouped contiguously by division.
    test2019.loc[division_mask, target_col[0]] = gas_rate_2019_division_pred.to_numpy()
    division_preds.append(gas_rate_2019_division_pred)

# All division predictions stacked in division order, concatenated once.
gas_rate_2019_pred = pd.concat(division_preds, ignore_index=True)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,9.6523,3973.9739,63.0395,-0.1393,0.685,1.9231
1,9.0498,1083.9122,32.9228,0.3185,0.6732,1.3771
2,8.7666,216.1708,14.7027,0.8289,0.6674,1.49
3,8.958,308.6741,17.5691,0.7681,0.6836,1.9487
4,9.446,707.0604,26.5906,0.4741,0.7069,1.8985
5,10.1931,7787.9164,88.2492,0.1282,0.6682,1.7497
6,9.5624,2482.0936,49.8206,-0.026,0.6898,2.1509
Mean,9.3754,2365.6859,41.8421,0.3361,0.682,1.7912
SD,0.4529,2542.3954,24.7977,0.3483,0.0129,0.2527


100%|██████████| 7/7 [04:55<00:00, 42.21s/it]


In [18]:
# Inspect the prediction results
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday,기온차,기압차,습도차,공급량차
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1,-0.367125,-0.01411,1.247158,-234.323065
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1,-0.370324,0.115782,1.374254,-240.075409
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1,-0.335105,0.201484,1.016894,-117.782227
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1,-0.298285,-0.105884,1.128415,48.134819
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1,-0.23088,-0.001142,1.181394,175.905266


## 2019년 공급량차로 공급량 구하기

In [19]:
# Keep only the columns needed to look up the last observed supply value.
train = total[['year', 'month', 'day', 'weekday', '시간', '구분', '공급량']]

# Supply is reconstructed separately for each division.
divisions = test2019['구분'].unique()

total_gas = []
for division in tqdm(divisions):
    # Rows belonging to this division only.
    temp_train = train[train['구분'] == division].reset_index(drop=True)
    temp_test = test2019[test2019['구분'] == division].reset_index(drop=True)

    # Starting level: the last training row's supply for this division,
    # looked up by column name rather than by position.
    # NOTE(review): assumed to be the 2018-12-31 24:00 value, i.e. training
    # rows are in chronological order - confirm.
    temp_gas = temp_train['공급량'].iloc[-1]

    # supply[t] = supply[t-1] + predicted difference[t] (rows are hourly).
    # Prepending the starting level and taking a running sum reproduces that
    # recurrence in the same summation order; skipna=False lets a missing
    # difference propagate forward instead of being silently skipped.
    seeded_diffs = pd.concat([pd.Series([temp_gas]), temp_test['공급량차']], ignore_index=True)

    # Drop the first element: it is the starting level, not a 2019 value.
    one_div_gas = seeded_diffs.cumsum(skipna=False).iloc[1:].tolist()

    # Append this division's reconstructed supply to the overall list.
    total_gas += one_div_gas

100%|██████████| 7/7 [00:00<00:00, 20.10it/s]


In [20]:
# Expect 15120 values in total (one per test row)
len(total_gas)

15120

## 제출 파일 불러오기

In [21]:
# Load the submission template: one key column plus a '공급량' column of zeros.
sub = pd.read_csv('../data/sample_submission.csv')
sub.head()

Unnamed: 0,일자|시간|구분,공급량
0,2019-01-01 01 A,0
1,2019-01-01 02 A,0
2,2019-01-01 03 A,0
3,2019-01-01 04 A,0
4,2019-01-01 05 A,0


In [22]:
# Check the last rows of the template (final division, end of the period).
sub.tail()

Unnamed: 0,일자|시간|구분,공급량
15115,2019-03-31 20 H,0
15116,2019-03-31 21 H,0
15117,2019-03-31 22 H,0
15118,2019-03-31 23 H,0
15119,2019-03-31 24 H,0


In [23]:
# Fill the supply column with the reconstructed values.
# NOTE(review): the assignment is positional - it assumes the submission rows
# are in the same order as total_gas (grouped by division) - confirm.
sub['공급량'] = total_gas
sub.head()

Unnamed: 0,일자|시간|구분,공급량
0,2019-01-01 01 A,2349.016935
1,2019-01-01 02 A,2108.941525
2,2019-01-01 03 A,1991.159299
3,2019-01-01 04 A,2039.294118
4,2019-01-01 05 A,2215.199384


In [24]:
# Check the last rows after filling in the predicted supply.
sub.tail()

Unnamed: 0,일자|시간|구분,공급량
15115,2019-03-31 20 H,205.321971
15116,2019-03-31 21 H,206.585266
15117,2019-03-31 22 H,183.106325
15118,2019-03-31 23 H,148.369404
15119,2019-03-31 24 H,118.094997


In [25]:
# Confirm row count, non-null counts and that '공급량' is now float.
sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15120 entries, 0 to 15119
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   일자|시간|구분  15120 non-null  object 
 1   공급량       15120 non-null  float64
dtypes: float64(1), object(1)
memory usage: 236.4+ KB


## 제출 파일 만들기

In [26]:
# Write the submission file to the working directory, omitting the row index.
sub.to_csv("sub31_pycaret19.csv", index=False)