# Colab 이용
* Colab을 이용했으나 GPU가 제대로 동작하지 않아, 기압 예측부터는 PC에서 진행

# pycaret을 이용한 가스공급량 예측
* 2019년 test 데이터 : 'test.csv'
* 2019년 제출 파일 : 'sample_submission.csv'
* 2013-2018년 가스공급량·기온·습도·기압 자료 : '2013-2018년_가스공급량_기온_습도_기압01.csv'
* 사용 특성 : 'month', 'day', '시간', '구분', '기온','습도','기압'
* log적용
* top3 모델 결과 평균으로 제출

## 데이터 가져오기
* 2019년 test 데이터 : 'test.csv'
* 2019년 제출 파일 : 'sample_submission.csv'
* 2013-2018년 가스공급량·기온·습도·기압 자료 : '2013-2018년_가스공급량_기온_습도_기압01.csv'

In [1]:
import pandas as pd

### 2013-2018년 가스공급량·기온·습도·기압 자료

In [2]:
# Load the 2013-2018 training table (gas supply plus temperature, humidity
# and pressure readings) and preview its first rows.
TRAIN_CSV = '2013-2018년_가스공급량_기온_습도_기압01.csv'
total = pd.read_csv(TRAIN_CSV)
total.head(n=5)

Unnamed: 0,year,month,day,weekday,시간,구분,공급량,기온,습도,기압
0,2013,1,1,1,1,0,2497.129,-8.5,57.0,1010.0
1,2013,1,1,1,1,1,2169.093,-8.5,57.0,1010.0
2,2013,1,1,1,1,2,226.178,-8.5,57.0,1010.0
3,2013,1,1,1,1,3,1434.516,-8.5,57.0,1010.0
4,2013,1,1,1,1,4,3272.837,-8.5,57.0,1010.0


#### 2019년 데이터

In [3]:
# Load the 2019 test keys; each row is one combined '일자|시간|구분' string.
TEST_CSV = 'test.csv'
test2019 = pd.read_csv(TEST_CSV)
test2019.head(n=5)

Unnamed: 0,일자|시간|구분
0,2019-01-01 01 A
1,2019-01-01 02 A
2,2019-01-01 03 A
3,2019-01-01 04 A
4,2019-01-01 05 A


In [4]:
# Split the combined key (e.g. "2019-01-01 01 A") into date, hour and
# category columns; expand=True returns one column per space-separated part.
test2019[['연월일', '시간', '구분']] = test2019['일자|시간|구분'].str.split(' ', expand=True)
# The hour comes out of the split as zero-padded text ('01'). Store it as an
# integer so it matches the integer 시간 column of the training table
# instead of relying on an implicit cast at prediction time.
test2019['시간'] = test2019['시간'].astype(int)
test2019.head()

Unnamed: 0,일자|시간|구분,연월일,시간,구분
0,2019-01-01 01 A,2019-01-01,1,A
1,2019-01-01 02 A,2019-01-01,2,A
2,2019-01-01 03 A,2019-01-01,3,A
3,2019-01-01 04 A,2019-01-01,4,A
4,2019-01-01 05 A,2019-01-01,5,A


In [5]:
# Convert the 구분 category labels to integer codes, numbered in order of
# first appearance in the test frame.
# NOTE(review): presumably this ordering reproduces the numeric 구분 codes
# already present in the training table — confirm.
d_map = {label: code for code, label in enumerate(test2019['구분'].unique())}
test2019['구분'] = test2019['구분'].map(d_map)
test2019.head(n=5)

Unnamed: 0,일자|시간|구분,연월일,시간,구분
0,2019-01-01 01 A,2019-01-01,1,0
1,2019-01-01 02 A,2019-01-01,2,0
2,2019-01-01 03 A,2019-01-01,3,0
3,2019-01-01 04 A,2019-01-01,4,0
4,2019-01-01 05 A,2019-01-01,5,0


In [6]:
# The 연월일 column is object-typed text, so parse it to datetime and derive
# the year / month / day / weekday columns from it.
test2019['연월일'] = pd.to_datetime(test2019['연월일'])
date_parts = test2019['연월일'].dt
for part in ('year', 'month', 'day', 'weekday'):
    test2019[part] = getattr(date_parts, part)
test2019.head(n=5)

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1


## 2019년 기상 특성 예측하기
* 습도 예측
* 기압 예측

In [7]:
!pip install pycaret

Collecting pycaret
  Downloading pycaret-2.3.4-py3-none-any.whl (266 kB)
[K     |████████████████████████████████| 266 kB 4.3 MB/s 
[?25hCollecting lightgbm>=2.3.1
  Downloading lightgbm-3.3.1-py3-none-manylinux1_x86_64.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 35.0 MB/s 
[?25hCollecting mlxtend>=0.17.0
  Downloading mlxtend-0.19.0-py2.py3-none-any.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 22.1 MB/s 
Collecting Boruta
  Downloading Boruta-0.3-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 4.8 MB/s 
[?25hCollecting umap-learn
  Downloading umap-learn-0.5.2.tar.gz (86 kB)
[K     |████████████████████████████████| 86 kB 5.1 MB/s 
Collecting pyLDAvis
  Downloading pyLDAvis-3.3.1.tar.gz (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 50.9 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?

In [8]:
# Install CatBoost so the 'catboost' estimator requested in compare_models
# is available (the logged run resolved to catboost 1.0.3).
!pip install catboost

Collecting catboost
  Downloading catboost-1.0.3-cp37-none-manylinux1_x86_64.whl (76.3 MB)
[K     |████████████████████████████████| 76.3 MB 17 kB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.3


In [9]:
from pycaret.regression import *

#### 습도(Humidity) 예측

In [10]:
# 학습 특성 : 'month', 'day', 'weekday', '시간', '구분'
# 타겟 특성 : '습도'

%%time
exp = setup(total, target='습도', ignore_features=['공급량', 'year', '기온', '기압'], use_gpu = True)

Unnamed: 0,Description,Value
0,session_id,679
1,Target,습도
2,Original Data,"(368088, 10)"
3,Missing Values,False
4,Numeric Features,2
5,Categorical Features,3
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(257661, 28)"


CPU times: user 6.01 s, sys: 428 ms, total: 6.44 s
Wall time: 10 s


In [12]:
# 시간 단축을 위해 모델 3개만 이용
%%time
models3 = compare_models(sort='MAPE', n_select=3, include=['knn','catboost','lightgbm'])
# GPU 사용 안됨

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
knn,K Neighbors Regressor,9.5536,153.6965,12.3972,0.6132,0.2361,0.1942,10.857
catboost,CatBoost Regressor,9.695,149.1967,12.2144,0.6246,0.2307,0.1948,8.794
lightgbm,Light Gradient Boosting Machine,11.4841,207.2427,14.3957,0.4785,0.2712,0.2339,1.177


CPU times: user 2min 14s, sys: 58.5 s, total: 3min 13s
Wall time: 3min 42s


In [13]:
%%time
tuned_models3 = [tune_model(i, optimize='MAPE') for i in models3]
# GPU 사용 안됨

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,8.4888,118.332,10.8781,0.702,0.2053,0.169
1,8.5445,119.7785,10.9443,0.6995,0.2084,0.1714
2,8.7312,123.4011,11.1086,0.6918,0.2111,0.1753
3,8.7626,125.0468,11.1824,0.6865,0.2118,0.1754
4,8.8265,125.7462,11.2137,0.6881,0.2138,0.178
5,8.6425,120.7821,10.9901,0.6966,0.2076,0.172
6,8.7549,123.9199,11.1319,0.6849,0.211,0.1756
7,8.5465,118.5791,10.8894,0.698,0.2058,0.1706
8,8.6152,119.6295,10.9375,0.6972,0.2055,0.1708
9,8.57,119.6126,10.9368,0.6984,0.2061,0.1704


CPU times: user 6min 30s, sys: 3min 21s, total: 9min 52s
Wall time: 37min 20s


In [14]:
%%time
blend_models3 = blend_models(estimator_list=tuned_models3)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,8.1819,107.8397,10.3846,0.7285,0.1992,0.1647
1,8.1833,107.8079,10.3831,0.7295,0.2008,0.1657
2,8.2545,109.0965,10.4449,0.7275,0.2013,0.167
3,8.2507,109.0298,10.4417,0.7267,0.2012,0.1669
4,8.2886,109.2852,10.454,0.7289,0.2023,0.1684
5,8.1878,106.6937,10.3293,0.732,0.1982,0.1644
6,8.2098,107.6273,10.3744,0.7263,0.2001,0.1662
7,8.1069,105.4203,10.2674,0.7315,0.1972,0.1634
8,8.192,107.0935,10.3486,0.729,0.1978,0.1642
9,8.2001,107.6644,10.3761,0.7286,0.1992,0.1651


In [15]:
%%time
humidity_prediction_model = finalize_model(blend_models3)



In [17]:
%%time
df_for_temp = total[['month', 'day', 'weekday', '시간', '구분']]
humidity_pred = predict_model(humidity_prediction_model, data=df_for_temp)

CPU times: user 4min 37s, sys: 77.1 ms, total: 4min 37s
Wall time: 2min 22s


In [18]:
# Preview the predictions; the model output is in the 'Label' column.
humidity_pred.head(n=5)

Unnamed: 0,month,day,weekday,시간,구분,Label
0,1,1,1,1,0,64.463024
1,1,1,1,1,1,64.764833
2,1,1,1,1,2,64.593622
3,1,1,1,1,3,64.657404
4,1,1,1,1,4,64.448516


In [20]:
from pycaret.utils import check_metric

# MAPE of predicted vs. recorded humidity on the 2013-2018 table.
# NOTE: these are the same rows passed to setup(), so the score is largely
# in-sample and will look better than the cross-validated figures.
check_metric(total['습도'], humidity_pred['Label'], 'MAPE')

0.1009

In [21]:
# MAE of predicted vs. recorded humidity on the same 2013-2018 rows
# (largely in-sample, since the model was built from this table).
check_metric(total['습도'], humidity_pred['Label'], 'MAE')

5.0718

In [22]:
# Persist the finalized humidity pipeline under the given base name. The call
# stays the cell's last expression so its return value is displayed.
save_model(model=humidity_prediction_model, model_name='humidity_prediction_model_01')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True,
                                       features_todrop=['공급량', 'year', '기온',
                                                        '기압'],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=[], target='습도',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None...
                                                             importance_type='split',
                                                             learning_rate=0.1,
                                                             max_depth=-1,
        

In [23]:
# Re-check the prepared 2019 test frame before predicting on it.
test2019.head(n=5)

Unnamed: 0,일자|시간|구분,연월일,시간,구분,year,month,day,weekday
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1


In [24]:
# The same five columns that were listed as training features for humidity.
col = ['month', 'day', 'weekday', '시간', '구분']
# Predict 2019 humidity from the calendar/category columns of the test frame.
# NOTE(review): 시간 must be numeric here to match the training table — verify
# its dtype in test2019 before relying on these predictions.
humidity2019_pred = predict_model(estimator=humidity_prediction_model, data=test2019.loc[:, col])