In [None]:
import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt

# 인코딩
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split
# GridSearchCV
from sklearn.model_selection import GridSearchCV

# 다중 회귀 분석
import statsmodels.formula.api as smf
# Decision Tree
from sklearn.tree import DecisionTreeRegressor
# RandomForest, GradientBoosting
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# 평가
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error

# Global plot styling. NanumGothic is presumably chosen so Hangul labels render
# (confirm the font is installed); unicode_minus=False draws negative tick
# labels with an ASCII hyphen instead of the Unicode minus sign.
matplotlib.rcParams["font.family"] = "NanumGothic"
matplotlib.rcParams["axes.unicode_minus"] = False

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
!pip install tqdm

In [None]:
import time
from tqdm import tqdm

## Parameter Candidates

In [None]:
# short.ver -- small estimator counts for a quick run

## parameter candidates (one grid per model family)
dt_params = dict(
    min_samples_leaf=list(range(10, 51, 10)),     # 10, 20, 30, 40, 50
    min_samples_split=list(range(10, 51, 10)),
    max_depth=list(range(3, 10, 2)),              # 3, 5, 7, 9
)
rf_params = dict(
    max_features=["sqrt", "log2"],
    n_estimators=list(range(10, 51, 10)),
    max_depth=list(range(3, 10, 2)),
    min_samples_split=list(range(10, 51, 10)),
    min_samples_leaf=list(range(10, 51, 10)),
)
gb_params = dict(
    max_features=["sqrt", "log2"],
    n_estimators=list(range(10, 51, 10)),
    min_samples_leaf=list(range(10, 51, 10)),
    min_samples_split=list(range(10, 51, 10)),
    max_depth=list(range(3, 10, 2)),
)

## parameter dictionary: position -> grid (0 = dt, 1 = rf, 2 = gb)
param_dict = dict(enumerate([dt_params, rf_params, gb_params]))

In [None]:
# long.ver -- same grids as the short version except for larger n_estimators
# (running this cell overrides the short.ver definitions above)

## parameter candidates (one grid per model family)
dt_params = dict(zip(
    ("min_samples_leaf", "min_samples_split", "max_depth"),
    ([10, 20, 30, 40, 50], [10, 20, 30, 40, 50], [3, 5, 7, 9]),
))
rf_params = dict(zip(
    ("max_features", "n_estimators", "max_depth", "min_samples_split", "min_samples_leaf"),
    (["sqrt", "log2"], [200, 300], [3, 5, 7, 9], [10, 20, 30, 40, 50], [10, 20, 30, 40, 50]),
))
gb_params = dict(zip(
    ("max_features", "n_estimators", "min_samples_leaf", "min_samples_split", "max_depth"),
    (["sqrt", "log2"], [200, 300], [10, 20, 30, 40, 50], [10, 20, 30, 40, 50], [3, 5, 7, 9]),
))

## parameter dictionary: position -> grid (0 = dt, 1 = rf, 2 = gb)
param_dict = {}
for _position, _grid in enumerate((dt_params, rf_params, gb_params)):
    param_dict[_position] = _grid

# !!!!! tqdm 잘 돌아가는지 정상 작동 확인 용 data로 한번 돌려봐주세요 !!!!!

## Model Selection

In [None]:
# Grid-search each model, score the refit best estimator on the test split,
# and draw its feature importances (one subplot per model).
start_time = time.time()

# NOTE: models, model_names and param_dict are matched by position, so they
# must stay in the same order and be extended together.
models = [dt, rf, gb]
model_names = ["DecisionTree", "RandomForest", "GradientBoosting"]

fig1 = plt.figure(figsize = (10, 13))

## test-set metrics; entry i belongs to model_names[i]
mse, rmse, mae, mape = [], [], [], []
## best hyperparameters picked by GridSearchCV, keyed by model name
best_params = {}

# TODO (carried over from the original note): remove the fixed random_state
# values from the models later.
for n, model in enumerate(tqdm(models)):
    model_name = model_names[n]

    # GridSearchCV settings:
    #   cv: left at its default (5 folds, per the original note)
    #   refit=True: refit on the training data with the best parameters found
    #   return_train_score=True: also record the training scores
    grid = GridSearchCV(
        model, param_dict[n], scoring="r2", n_jobs = -1, return_train_score = True, refit = True
    )
    grid.fit(train_x, train_y)

    # predict with the estimator refit on the best parameter set
    _pred = grid.best_estimator_.predict(test_x)

    # store metrics
    mse.append(mean_squared_error(test_y, _pred))
    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
    # taking the root of the MSE gives the same RMSE for a single-target y.
    rmse.append(np.sqrt(mse[-1]))
    mae.append(mean_absolute_error(test_y, _pred))
    mape.append(mean_absolute_percentage_error(test_y, _pred))

    ## store best parameters
    best_params[model_name] = grid.best_params_

    ## feature importance
    col_name = model_name + " Importance"
    # feature_importances_ is assigned by position, so df_importance must keep
    # its original row order across iterations. Sorting it in place (as this
    # cell used to) reordered the rows and paired the importances of every
    # later model with the wrong "Feature" entries.
    # Presumably df_importance rows follow the column order of train_x --
    # it is built outside this cell; confirm there.
    df_importance[col_name] = grid.best_estimator_.feature_importances_

    # plot from a sorted copy; ascending order puts the most important
    # feature at the top of the horizontal bar chart
    importance_sorted = df_importance.sort_values(col_name, ascending = True)
    coordinates = range(len(importance_sorted))

    ax = fig1.add_subplot(len(models), 1, n + 1)
    ax.barh(y = coordinates, width = importance_sorted[col_name], label=col_name)
    ax.set_yticks(coordinates)
    ax.set_yticklabels(importance_sorted["Feature"])
    ax.set_title(col_name)


end_time = time.time()