### 1. 데이터 로드

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Default figure size and font family for every plot in this notebook.
# NOTE(review): AppleGothic is a macOS font while the data path used below is a
# Windows path — confirm the font resolves on the machine running this.
plt.rcParams.update({
    'figure.figsize': (10, 10),
    'font.family': 'AppleGothic',
})

# Suppress all warnings for the rest of the session.
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
def read_csv_by_dir(path, index_col=None):
    """Load every ``.csv`` file directly inside ``path`` into one DataFrame.

    Parameters
    ----------
    path : str
        Directory to scan (sub-directories are not searched).
    index_col : int, str or None, default None
        Passed unchanged to ``pd.read_csv`` for every file.

    Returns
    -------
    pd.DataFrame
        Row-wise concatenation of all CSV files, read in sorted file-name
        order. An empty DataFrame when the directory contains no CSV file.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order reproducible between runs and machines.
    csv_names = sorted(name for name in os.listdir(path)
                       if name.endswith('.csv'))

    # Only CSV files contribute rows. Concatenating once, after the loop,
    # avoids re-appending the previous file whenever a non-CSV entry is seen
    # and avoids growing the frame inside the loop.
    frames = [pd.read_csv('/'.join([path, name]), index_col=index_col)
              for name in csv_names]
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, axis=0)

In [3]:
# Root folder of the competition data (absolute, machine-specific path).
path = 'C:/Users/liked/OneDrive/바탕 화면/데이콘/팔당댐'

# Each data folder is read file by file; the first CSV column becomes the index.
_df_rf_raw = read_csv_by_dir(f'{path}/rf_data', index_col=0)
_df_water_raw = read_csv_by_dir(f'{path}/water_data', index_col=0)

# The submission template is a single CSV next to the data folders.
_submission_raw = pd.read_csv(f'{path}/sample_submission.csv', index_col=0)

In [4]:
# Work on copies so the frames loaded from disk stay untouched.
df_rf, df_water, submission = (
    _df_rf_raw.copy(),
    _df_water_raw.copy(),
    _submission_raw.copy(),
)

# Attach a label to each copy as a plain attribute.
df_rf.name = "rain_data"
df_water.name = "water_data"
submission.name = "submission"

### 2. 데이터 전처리

In [7]:
# Display the water-level frame as the cell output.
df_water

Unnamed: 0_level_0,swl,inf,sfw,ecpc,tototf,tide_level,wl_1018662,fw_1018662,wl_1018680,fw_1018680,wl_1018683,fw_1018683,wl_1019630,fw_1019630
ymdhm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2012-05-01 00:00,24.800,555.00,219.07,24.93,555.00,445.0,310.7,469.05,300.2,0.0,290.0,729.80,275.3,540.18
2012-05-01 00:10,24.794,464.60,218.86,25.15,562.90,449.0,314.7,498.00,300.2,0.0,290.0,731.48,275.3,540.18
2012-05-01 00:20,24.789,478.10,218.69,25.31,576.40,451.0,313.7,490.68,301.2,0.0,290.0,726.42,275.3,540.18
2012-05-01 00:30,24.789,464.80,218.69,25.31,563.10,452.0,311.7,476.21,301.2,0.0,290.0,726.42,276.3,552.17
2012-05-01 00:40,24.789,478.10,218.69,25.31,576.40,450.0,311.7,476.21,301.2,0.0,291.0,707.17,277.3,564.29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-07-18 23:10,25.040,259.23,212.86,31.14,259.23,510.0,0.0,319.84,0.0,,0.0,-456.41,0.0,974.40
2022-07-18 23:20,25.040,260.46,212.86,31.14,260.46,492.0,0.0,314.01,0.0,,0.0,-717.30,0.0,1006.88
2022-07-18 23:30,25.040,259.37,212.86,31.14,259.37,475.0,0.0,387.55,0.0,,0.0,-843.37,0.0,1039.90
2022-07-18 23:40,25.040,259.13,212.86,31.14,259.13,458.0,0.0,454.91,0.0,,0.0,-1023.37,0.0,1073.46


In [8]:
def index_to_datetime(df,format):
    """Parse ``df``'s index into datetimes using the given ``format`` string.

    The index is replaced on ``df`` itself (no copy is made) and that same
    object is returned.
    """
    parsed_index = pd.to_datetime(df.index, format=format)
    df.index = parsed_index
    return df

In [9]:
# All three frames use the same timestamp layout in their index.
ymdhm_format = '%Y-%m-%d %H:%M'

df_rf = index_to_datetime(df=df_rf, format=ymdhm_format)
df_water = index_to_datetime(df=df_water, format=ymdhm_format)
submission = index_to_datetime(df=submission, format=ymdhm_format)

# Put every frame into ascending index order, in place.
for frame in (df_rf, df_water, submission):
    frame.sort_index(inplace=True)

In [10]:
# Check the time span covered by a frame
def check_datetime(df):
    """Print a frame's label followed by its first and last index values.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to inspect. Its ``name`` attribute is printed as the label when
        it exists; otherwise ``<unnamed>`` is printed.

    Returns
    -------
    None

    Raises
    ------
    IndexError
        If ``df`` has no rows.
    """
    # `name` is an attribute this notebook sets by hand, so a frame may not
    # carry it; fall back to a placeholder instead of raising.
    print(getattr(df, 'name', '<unnamed>'))
    # Read the index directly: select_dtypes() filters columns only and never
    # looks at the index, so routing through it added nothing.
    print(df.index[0])
    print(df.index[-1])

# Report the covered time span of each frame, in the same order as before.
for frame in (df_rf, df_water, submission):
    check_datetime(frame)

rain_data
2012-05-01 00:00:00
2022-07-18 23:50:00
water_data
2012-05-01 00:00:00
2022-07-18 23:50:00
submission
2022-06-01 00:00:00
2022-07-18 23:50:00


In [11]:
# Split into targets and features: the targets are the df_water columns that
# also appear in the submission template; every other column is a feature.
target_columns = submission.columns
target = df_water[target_columns]
data = pd.concat([df_rf, df_water.drop(columns=target_columns)], axis=1)

In [12]:
# Display the target frame as the cell output.
target

Unnamed: 0_level_0,wl_1018662,wl_1018680,wl_1018683,wl_1019630
ymdhm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2012-05-01 00:00:00,310.7,300.2,290.0,275.3
2012-05-01 00:10:00,314.7,300.2,290.0,275.3
2012-05-01 00:20:00,313.7,301.2,290.0,275.3
2012-05-01 00:30:00,311.7,301.2,290.0,276.3
2012-05-01 00:40:00,311.7,301.2,291.0,277.3
...,...,...,...,...
2022-07-18 23:10:00,0.0,0.0,0.0,0.0
2022-07-18 23:20:00,0.0,0.0,0.0,0.0
2022-07-18 23:30:00,0.0,0.0,0.0,0.0
2022-07-18 23:40:00,0.0,0.0,0.0,0.0


In [13]:
# Re-pair features and targets with a one-row lag: after this cell, each row
# holds the features of original row k-1 next to the targets of original row k.
# NOTE: `target` and `data` are overwritten here, so running this cell a second
# time applies the lag again.
_target = target.reset_index(drop=True)
_data = data.reset_index(drop=True)
# Relabel the feature rows 0..n-1 -> 1..n so the column-wise concat below lines
# each feature row up with the following target row.
_data.index += 1
tot=pd.concat((_data,_target),axis=1)
tot=tot.sort_index()

# Drop the first row (targets only, no lagged features) and the last row
# (features only, no target).
tot=tot.iloc[1:-1]
target = tot.loc[:,submission.columns]
data = tot.drop(submission.columns,axis=1)

In [14]:
# The final len(submission) rows are the prediction period; everything before
# them is used for training.
n_test = len(submission)

train_target, test_target = target.iloc[:-n_test], target.iloc[-n_test:]
train_data, test_data = data.iloc[:-n_test], data.iloc[-n_test:]

In [15]:
# Fill missing values with the training-set column means; the test frames use
# the training means as well, so no test statistics are involved.
# The results are reassigned instead of using inplace=True: these frames are
# iloc slices of `target` / `data`, and filling a slice in place is a
# chained-assignment pattern (its warning is hidden by the global warnings
# filter) that may or may not write through to the parent frame depending on
# the pandas version.
train_target = train_target.fillna(train_target.mean())
test_target = test_target.fillna(train_target.mean())
train_data = train_data.fillna(train_data.mean())
test_data = test_data.fillna(train_data.mean())

In [16]:
# Display the training features as the cell output.
train_data

Unnamed: 0,rf_10184100,rf_10184110,rf_10184140,swl,inf,sfw,ecpc,tototf,tide_level,fw_1018662,fw_1018680,fw_1018683,fw_1019630
1,0.0,0.0,0.0,24.800,555.00,219.07,24.93,555.00,445.0,469.05,0.0,729.80,540.18
2,0.0,0.0,0.0,24.794,464.60,218.86,25.15,562.90,449.0,498.00,0.0,731.48,540.18
3,0.0,0.0,0.0,24.789,478.10,218.69,25.31,576.40,451.0,490.68,0.0,726.42,540.18
4,0.0,0.0,0.0,24.789,464.80,218.69,25.31,563.10,452.0,476.21,0.0,726.42,552.17
5,0.0,0.0,0.0,24.789,478.10,218.69,25.31,576.40,450.0,476.21,0.0,707.17,564.29
...,...,...,...,...,...,...,...,...,...,...,...,...,...
269419,0.0,0.0,0.0,25.180,140.89,217.99,26.01,140.89,232.0,314.01,0.0,597.98,471.08
269420,0.0,0.0,0.0,25.180,140.94,217.99,26.01,140.94,220.0,285.72,0.0,575.57,493.58
269421,0.0,0.0,0.0,25.180,141.07,217.99,26.01,141.07,208.0,274.78,0.0,501.04,505.03
269422,0.0,0.0,0.0,25.180,141.01,217.99,26.01,141.01,196.0,269.40,0.0,425.89,505.03


In [17]:
# Print (rows, columns) of the train and test splits, features first.
print('--data--')
for frame in (train_data, test_data):
    print(frame.shape)
print('--target--')
for frame in (train_target, test_target):
    print(frame.shape)

--data--
(269423, 13)
(6912, 13)
--target--
(269423, 4)
(6912, 4)


In [15]:
def null(x):
    """Print the number of missing values in ``x``, summed along its first axis."""
    missing_counts = x.isna().sum()
    print(missing_counts)

### 3. Grid Search 사용

In [16]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

In [17]:
# Squared-error splitting is RandomForestRegressor's default criterion, so it
# is left implicit: the explicit 'mse' spelling was renamed to 'squared_error'
# in scikit-learn 1.0 and removed in 1.2, whereas the default works on every
# version. random_state fixes the forest's randomness for reproducibility.
model = RandomForestRegressor(random_state=2020)

In [18]:
# Candidate RandomForestRegressor settings; the grid search tries every
# combination (3 x 3 x 3 = 27).
params = dict(
    n_estimators=[200, 300, 500],
    max_features=[5, 6, 8],
    min_samples_leaf=[1, 3, 5],
)

In [19]:
# Exhaustive search over `params` with 3-fold cross-validation on all cores.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's own score method rather than by RMSE — confirm this is intended.
greedy_CV = GridSearchCV(
    estimator=model,
    param_grid=params,
    cv=3,
    n_jobs=-1,
)

In [20]:
# Run the grid search on the training split; the fitted search object is the
# cell output.
greedy_CV.fit(train_data,train_target)

GridSearchCV(cv=3,
             estimator=RandomForestRegressor(criterion='mse',
                                             random_state=2020),
             n_jobs=-1,
             param_grid={'max_features': [5, 6, 8],
                         'min_samples_leaf': [1, 3, 5],
                         'n_estimators': [200, 300, 500]})

In [21]:
# Show the estimator selected by the grid search.
greedy_CV.best_estimator_

RandomForestRegressor(criterion='mse', max_features=6, min_samples_leaf=5,
                      n_estimators=300, random_state=2020)

In [23]:
# GridSearchCV was created with its default refit=True, so best_estimator_ has
# already been retrained on all of train_data with the winning parameters.
# It is used directly; fitting it again on the same data with the same
# random_state would only repeat that work.
model = greedy_CV.best_estimator_
y_pred = model.predict(test_data)

In [24]:
# Fill a copy of the template so _submission_raw keeps the values loaded from
# sample_submission.csv (the raw frames are meant to stay unmodified).
# NOTE(review): y_pred follows the time-sorted row order of test_data; this
# assumes the rows of sample_submission.csv are already in ascending time
# order — confirm.
submission_rf = _submission_raw.copy()
submission_rf.iloc[:, :] = y_pred
submission_rf.to_csv('submit.csv')

RMSE 결과: 3.976으로 baseline보다 개선된 것을 확인했다. 베이지안 최적화 등으로 성능을 더 개선할 예정이다.

### 4. 다른 알고리즘 사용

In [22]:
import xgboost as xgb

In [27]:
  # 탐색 대상 함수 (XGBRegressor)
def XGB_cv(max_depth, learning_rate, n_estimators, gamma,
           min_child_weight, subsample,
           colsample_bytree, silent=True, nthread=-1,
           valid_fraction=0.2):
    """Objective for Bayesian optimisation: negative hold-out RMSE of an XGBRegressor.

    The model is trained on the leading part of the notebook-level
    ``train_data`` / ``train_target`` frames and scored on the trailing
    ``valid_fraction`` of their rows. Because the frames were sorted by time
    before the split, the hold-out is the most recent part of the training
    period.

    Parameters
    ----------
    max_depth, n_estimators : float
        Truncated to ``int`` before being passed to the model, since the
        optimiser proposes continuous values.
    learning_rate, gamma, min_child_weight, subsample, colsample_bytree : float
        Passed to ``xgb.XGBRegressor`` unchanged.
    silent : bool, default True
        Not used; kept so existing callers keep working.
    nthread : int, default -1
        Passed to ``xgb.XGBRegressor`` unchanged.
    valid_fraction : float, default 0.2
        Share of the training rows (taken from the end) held out for scoring.
        Expected to lie strictly between 0 and 1.

    Returns
    -------
    float
        ``-RMSE`` over all target columns on the hold-out rows. The sign is
        flipped because the optimiser maximises its objective.
    """
    # Hold out the trailing rows for scoring. The previous version predicted on
    # test_data, discarded the result and returned None, which left the
    # optimiser with no value to maximise.
    n_valid = max(1, int(len(train_data) * valid_fraction))
    fit_data, valid_data = train_data.iloc[:-n_valid], train_data.iloc[-n_valid:]
    fit_target, valid_target = (train_target.iloc[:-n_valid],
                                train_target.iloc[-n_valid:])

    # Model definition
    model = xgb.XGBRegressor(max_depth=int(max_depth),
                             learning_rate=learning_rate,
                             n_estimators=int(n_estimators),
                             gamma=gamma,
                             min_child_weight=min_child_weight,
                             subsample=subsample,
                             colsample_bytree=colsample_bytree,
                             nthread=nthread
                             )
    # Model training
    model.fit(fit_data, fit_target)

    # Score on the hold-out rows
    valid_pred = model.predict(valid_data)
    rmse = float(np.sqrt(np.mean((valid_target.to_numpy() - valid_pred) ** 2)))
    return -rmse


In [30]:
from bayes_opt import BayesianOptimization
import numpy as np

  # Hyperparameter search space: (lower bound, upper bound) per parameter
pbounds = {
    'max_depth': (3, 7),
    'learning_rate': (0.01, 0.2),
    'n_estimators': (200,400),
    'gamma': (0, 100),
    'min_child_weight': (0, 3),
    'subsample': (0.5, 1),
    'colsample_bytree' :(0.2, 1),
}

  # Create the Bayesian optimisation object
  # f : function to optimise, pbounds : hyperparameter search space
  # verbose = 2 always prints, verbose = 1 prints only when a new maximum is found, verbose = 0 prints nothing
  # random_state : fixes the random parts of the optimisation
bo = BayesianOptimization(
    f=XGB_cv,
    pbounds=pbounds,
    verbose=2,
    random_state=1,
)

  # Run the maximisation
  # init_points : number of initial random-search evaluations
  # n_iter : number of optimisation steps (more input/value pairs give a more accurate result)
  # acq : acquisition function; Expected Improvement (EI) is used here
  # xi : exploration strength (default 0.0)
bo.maximize(
    init_points=2,
    n_iter=10,
    acq='ei',
    xi=0.01,
)

  # In the printed table, 'iter' is the iteration number, 'target' is the objective value
  # and the remaining columns are the inputs. When an iteration reaches a new maximum
  # compared with all earlier ones, the bayesian-optimization library prints that row
  # in a different colour.

  # Show the best parameters found
print(bo.max)

|   iter    |  target   | colsam... |   gamma   | learni... | max_depth | min_ch... | n_esti... | subsample |
-------------------------------------------------------------------------------------------------------------
