In [None]:
### Environment setup


## 1. Basics
import numpy as np  # import the numpy package
import matplotlib.pyplot as plt # import the plotting package

## 2. Data loading
import pandas as pd # csv -> DataFrame conversion
from sklearn import datasets # datasets bundled with scikit-learn

## 4. Train/validation split
from sklearn.model_selection import train_test_split 

## 5. Classification models
from sklearn.tree import DecisionTreeClassifier # decision tree
from sklearn.neighbors import KNeighborsClassifier # k-nearest neighbours
from sklearn.linear_model import LogisticRegression # logistic regression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

## 5_1. Ensemble models
from sklearn.ensemble import VotingClassifier # majority voting 

## 6. Model evaluation
from sklearn.metrics import confusion_matrix, classification_report # confusion matrix / report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score # accuracy, sensitivity, etc.
from sklearn.metrics import roc_curve, auc # for drawing ROC curves
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

## 7. Optimisation
from sklearn.model_selection import learning_curve, validation_curve # learning curve, validation curve
from sklearn.model_selection import GridSearchCV # hyperparameter tuning
from sklearn.model_selection import cross_val_score # cross-validation (added)


## 8. Data preprocessing
from sklearn.preprocessing import StandardScaler # standardisation
from sklearn.preprocessing import MinMaxScaler # min-max normalisation
from sklearn.preprocessing import LabelEncoder # convert string labels to integers
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA

## 9. Data visualisation
import pandas as pd
import seaborn as sns
from scipy import stats # needed for probability-distribution analysis and the related plots
import plotly.express as px 

## 10. Suppress warnings
import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# deprecation warnings raised by the libraries imported in this notebook.
warnings.filterwarnings('ignore')

## 11. Tensorflow
import tensorflow as tf
from tensorflow.keras import Sequential # builds Sequential-style models
from tensorflow.keras import layers # used when creating Dense layers
from tensorflow.keras import activations # used when creating activations
from tensorflow.keras.layers import Dense, Activation, Flatten, Input # import the layer classes directly
from tensorflow.keras.utils import plot_model # visualise a model

# 12. File loading
from glob import glob

In [None]:
# xgboost 설치
!pip install xgboost



In [None]:
# optuna 설치
!pip install optuna



In [None]:
# Load the four pre-split sample files. Each one is cp949-encoded and stores
# the row index in its first column.
X_train_sample, X_test_sample, y_train_sample, y_test_sample = (
    pd.read_csv(csv_path, encoding='cp949', index_col=0)
    for csv_path in (
        'X_train_sample.csv',
        'X_test_sample.csv',
        'y_train_sample.csv',
        'y_test_sample.csv',
    )
)

In [None]:
# Preview the training features read from X_train_sample.csv.
X_train_sample

Unnamed: 0,팔당댐 유입량 (단위: m^3/s),총 방류량 (단위: m^3/s),대곡교 강수량,진관교 강수량,송정동 강수량,대곡교 누적 강수량,진관교 누적 강수량,송정동 누적 강수량,청담대교 유량 부호,한강대교 유량 부호,행주대교 유량 부호,청담대교 유량_abs,한강대교 유량_abs,행주대교 유량_abs,장마
0,555.0,555.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1,469.05,729.80,540.18,0
1,464.6,562.9,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1,498.00,731.48,540.18,0
2,478.1,576.4,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1,490.68,726.42,540.18,0
3,464.8,563.1,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1,476.21,726.42,552.17,0
4,478.1,576.4,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1,476.21,707.17,564.29,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247033,183.0,284.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1,264.07,291.89,376.45,0
247034,183.0,284.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1,264.07,275.45,386.43,0
247035,183.0,284.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1,258.79,279.56,386.43,0
247036,83.0,284.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1,264.07,287.78,376.45,0


In [None]:
# Preview the test features read from X_test_sample.csv.
X_test_sample

Unnamed: 0,팔당댐 유입량 (단위: m^3/s),총 방류량 (단위: m^3/s),대곡교 강수량,진관교 강수량,송정동 강수량,대곡교 누적 강수량,진관교 누적 강수량,송정동 누적 강수량,청담대교 유량 부호,한강대교 유량 부호,행주대교 유량 부호,청담대교 유량_abs,한강대교 유량_abs,행주대교 유량_abs,장마
247038,277.00,277.00,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1,264.07,263.12,386.43,0
247039,173.00,173.00,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1,269.40,287.78,386.43,0
247040,179.00,179.00,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1,269.40,300.12,386.43,0
247041,178.00,178.00,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1,264.07,296.01,386.43,0
247042,379.00,178.00,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1,264.07,291.89,386.43,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253006,140.94,140.94,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1,285.72,575.57,493.58,0
253007,141.07,141.07,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1,274.78,501.04,505.03,0
253008,141.01,141.01,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1,269.40,425.89,505.03,0
253009,755.75,140.75,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1,264.07,198.19,493.58,0


In [None]:
# Preview the training targets read from y_train_sample.csv.
y_train_sample

Unnamed: 0,청담대교 수위 (단위: cm),잠수교 수위 (단위: cm),한강대교 수위 (단위: cm),행주대교 수위 (단위: cm)
0,310.7,300.2,290.0,275.3
1,314.7,300.2,290.0,275.3
2,313.7,301.2,290.0,275.3
3,311.7,301.2,290.0,276.3
4,311.7,301.2,291.0,277.3
...,...,...,...,...
247033,277.7,266.2,273.0,260.3
247034,277.7,266.2,273.0,261.3
247035,276.7,267.2,273.0,261.3
247036,277.7,267.2,273.0,260.3


In [None]:
# Preview the test targets read from y_test_sample.csv.
y_test_sample

Unnamed: 0,청담대교 수위 (단위: cm),잠수교 수위 (단위: cm),한강대교 수위 (단위: cm),행주대교 수위 (단위: cm)
247038,277.7,267.2,273.0,261.3
247039,278.7,267.2,273.0,261.3
247040,278.7,267.2,273.0,261.3
247041,277.7,267.2,273.0,261.3
247042,277.7,267.2,273.0,261.3
...,...,...,...,...
253006,281.7,281.2,278.0,271.3
253007,279.7,279.2,278.0,272.3
253008,278.7,277.2,277.0,272.3
253009,277.7,276.2,276.0,271.3


In [None]:
# Pair feature row i with target row i+1 (presumably the next time step --
# confirm against how the CSVs were built): drop the last feature row and the
# first target row, then renumber both from 0 so they line up positionally.
# Positional slicing replaces the previous hard-coded index labels, so this
# keeps working if the sample files are regenerated with different row counts.
X_train_sample_index = X_train_sample.iloc[:-1].reset_index(drop=True)
y_train_sample_index = y_train_sample.iloc[1:].reset_index(drop=True)

X_test_sample_index = X_test_sample.iloc[:-1].reset_index(drop=True)
y_test_sample_index = y_test_sample.iloc[1:].reset_index(drop=True)

# Features and targets must stay row-aligned after the shift.
assert len(X_train_sample_index) == len(y_train_sample_index), "train X/y length mismatch"
assert len(X_test_sample_index) == len(y_test_sample_index), "test X/y length mismatch"

In [None]:
# Preview the training features after the last row was dropped and the index reset.
X_train_sample_index

Unnamed: 0,팔당댐 유입량 (단위: m^3/s),총 방류량 (단위: m^3/s),대곡교 강수량,진관교 강수량,송정동 강수량,대곡교 누적 강수량,진관교 누적 강수량,송정동 누적 강수량,청담대교 유량 부호,한강대교 유량 부호,행주대교 유량 부호,청담대교 유량_abs,한강대교 유량_abs,행주대교 유량_abs,장마
0,555.0,555.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1,469.05,729.80,540.18,0
1,464.6,562.9,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1,498.00,731.48,540.18,0
2,478.1,576.4,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1,490.68,726.42,540.18,0
3,464.8,563.1,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1,476.21,726.42,552.17,0
4,478.1,576.4,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1,476.21,707.17,564.29,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247032,183.0,284.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1,264.07,304.23,376.45,0
247033,183.0,284.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1,264.07,291.89,376.45,0
247034,183.0,284.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1,264.07,275.45,386.43,0
247035,183.0,284.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1,258.79,279.56,386.43,0


In [None]:
# Preview the training targets after the first row was dropped and the index reset.
y_train_sample_index

Unnamed: 0,청담대교 수위 (단위: cm),잠수교 수위 (단위: cm),한강대교 수위 (단위: cm),행주대교 수위 (단위: cm)
0,314.7,300.2,290.0,275.3
1,313.7,301.2,290.0,275.3
2,311.7,301.2,290.0,276.3
3,311.7,301.2,291.0,277.3
4,311.7,301.2,291.0,277.3
...,...,...,...,...
247032,277.7,266.2,273.0,260.3
247033,277.7,266.2,273.0,261.3
247034,276.7,267.2,273.0,261.3
247035,277.7,267.2,273.0,260.3


In [None]:
from optuna.samplers import TPESampler
import optuna
from xgboost import XGBRegressor
from xgboost import plot_importance

def objective(trial):
    """Optuna objective: train an XGBRegressor with sampled hyperparameters and return its RMSE.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        RMSE on ``X_test_sample_index`` / ``y_test_sample_index``, computed per
        target column and then averaged (the same value that
        ``mean_squared_error(..., squared=False)`` produced for multi-output targets).

    Notes
    -----
    Reads the module-level frames ``X_train_sample_index``, ``y_train_sample_index``,
    ``X_test_sample_index`` and ``y_test_sample_index``.

    NOTE(review): the test split is used both for early stopping and as the
    value being minimised, so the reported score is optimistically biased.
    Consider carving a separate validation split out of the training data.
    """
    param = {
        'tree_method': 'exact',
        'objective': 'reg:squarederror',
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'gamma': trial.suggest_float('gamma', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        # suggest_float replaces the deprecated suggest_uniform.
        'learning_rate': trial.suggest_float('learning_rate', 0.00001, 1),
        'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        # Single-choice categorical: keeps the seed fixed while still recording it
        # in the trial's params.
        'random_state': trial.suggest_categorical('random_state', [2022]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    }

    # Early stopping is configured on the estimator: passing it to fit() is
    # deprecated since xgboost 1.6 and rejected by xgboost 2.x.
    model = XGBRegressor(**param, early_stopping_rounds=50)

    model.fit(
        X_train_sample_index,
        y_train_sample_index,
        eval_set=[(X_test_sample_index, y_test_sample_index)],
        verbose=False,
    )

    pred = model.predict(X_test_sample_index)

    # `squared=False` was removed from recent scikit-learn releases; take the
    # root of each target's MSE and average them, which is what it computed.
    per_target_mse = mean_squared_error(y_test_sample_index, pred, multioutput='raw_values')
    rmse = float(np.mean(np.sqrt(per_target_mse)))

    return rmse

In [None]:
# Seed the TPE sampler so the hyperparameter search is reproducible across
# runs (2022 matches the random_state given to the model in `objective`).
# NOTE(review): the saved output further down reports 50 finished trials, which
# does not match n_trials=30 here -- it appears to come from an earlier run;
# re-run the notebook to refresh it.
study = optuna.create_study(direction='minimize', sampler=TPESampler(seed=2022))
study.optimize(objective, n_trials=30)

[32m[I 2022-09-01 20:12:51,185][0m A new study created in memory with name: no-name-5e0c14d9-66c3-4e67-9721-b58f246859e5[0m
[32m[I 2022-09-01 20:13:46,948][0m Trial 0 finished with value: 4.132307834397587 and parameters: {'lambda': 0.1060809619486914, 'alpha': 0.5140118480905642, 'gamma': 4.46695558657611, 'colsample_bytree': 0.5592087656941143, 'subsample': 0.8687166824069275, 'learning_rate': 0.6559737284794043, 'n_estimators': 2569, 'max_depth': 10, 'random_state': 2022, 'min_child_weight': 77}. Best is trial 0 with value: 4.132307834397587.[0m
[32m[I 2022-09-01 20:14:23,889][0m Trial 1 finished with value: 4.14695500066132 and parameters: {'lambda': 1.9777148372860047, 'alpha': 0.013840501096745979, 'gamma': 0.11931357678997877, 'colsample_bytree': 0.6599708683977039, 'subsample': 0.9062840576534786, 'learning_rate': 0.7782197058101781, 'n_estimators': 659, 'max_depth': 6, 'random_state': 2022, 'min_child_weight': 170}. Best is trial 0 with value: 4.132307834397587.[0m
[

    4) subsample. 하나의 트리를 만들 때 사용할 데이터의 비율을 나타냅니다. 0.0 ~ 1.0 사이의 값을 넣으면 지정한 비율만큼만 랜덤하게 데이터를 사용합니다.
    5) colsample_bytree. 하나의 트리를 만들 때 사용할 feature의 비율을 나타냅니다. 0.0 ~ 1.0 사이의 값을 넣으면 트리를 만들 때 지정한 비율만큼만 랜덤하게 feature를 사용합니다.
    6) lambda : 가중치에 대한 L2 정규화(규제) 항의 강도를 의미한다. 값이 클수록 모델이 보수적으로 학습되어 과적합이 줄어든다.
    7) alpha : 가중치에 대한 L1 정규화(규제) 항의 강도를 의미한다.
        -> alpha 값이 증가할수록 리프 가중치가 0에 가깝게 줄어들어 모델의 복잡도가 낮아지고 과적합이 줄어듦
    8) gamma : 과적합을 방지하기 위한 파라미터 (min_split_loss). 리프 노드를 추가로 분할하기 위해 필요한 최소 손실 감소량을 의미한다.
        -> gamma가 크면 손실 감소가 큰 분할만 허용되므로 트리가 덜 자라 모델이 보수적(단순)이 됨
        -> gamma가 작으면 작은 손실 감소로도 분할이 일어나 트리가 더 깊고 복잡해짐
        -> 참고: decision boundary의 곡률(reach)에 관한 설명은 SVM RBF 커널의 gamma에 해당하며, XGBoost의 gamma와는 다른 개념임

In [None]:
# Report how many trials the study holds and the hyperparameters of the best one.
for _label, _value in (
    ('Number of finished trials:', len(study.trials)),
    ('Best trial:', study.best_trial.params),
):
    print(_label, _value)

Number of finished trials: 50
Best trial: {'lambda': 6.42624959349907, 'alpha': 3.698153718736625, 'gamma': 0.010761744241057256, 'colsample_bytree': 0.9568847274563008, 'subsample': 0.9028949888156346, 'learning_rate': 0.2373366639379164, 'n_estimators': 2145, 'max_depth': 3, 'random_state': 2022, 'min_child_weight': 11}
