<a href="https://colab.research.google.com/github/jooeun921/Big-Data-Analyst/blob/main/Part03_Section_01_scikit_learn_model_evaluation_tunning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Part 03. 데이터 처리
scikit-learn을 활용한 머신러닝 모델 구축과 최적화를 다룸.
- 모델 평가 및 튜닝 : 교차 검증, 성능 지표, 하이퍼파라미터 최적화
- 회귀분석 : KNN, 트리 기반 회귀, SVR
- 분류모델 : KNN, 트리기반 분류, SVM
- 군집분석 : k-평균, 계층적 군집, DBSCAN, 군집 유효성 평가

In [None]:
import pandas as pd
import numpy as np

### Section 01 학습 : scikit-learn을 활용한 모델 평가 & 파라미터 튜닝

In [None]:
# Hold-out method
# Split the data into training, validation and test sets.

train = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/data/main/s11_train.csv')
test = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/data/main/s11_test.csv')

# 'grade' is the target (Y); every remaining column is kept as a feature (X).
X_train, Y_train = train.drop(columns=['grade']), train['grade']
X_test, Y_test = test.drop(columns=['grade']), test['grade']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the training rows for validation; the fixed seed keeps the split reproducible.
X_train_sub, X_valid, Y_train_sub, Y_valid = train_test_split(
    X_train,
    Y_train,
    test_size=0.3,
    random_state=1,
)

# Quick sanity check of the resulting partition sizes.
print(X_train_sub.shape, X_valid.shape, Y_train_sub.shape, Y_valid.shape)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training portion of the hold-out split only.
lr = LinearRegression()
lr.fit(X=X_train_sub, y=Y_train_sub)

In [None]:
from sklearn.metrics import root_mean_squared_error

# Score the fitted model on the validation split, which was not used for fitting.
pred_val = lr.predict(X_valid)
# Label corrected: the metric is RMSE (root mean squared error); it was misspelled "RSME".
print("Valid RMSE: ", root_mean_squared_error(Y_valid, pred_val))

In [None]:
# K-fold cross-validation
# Split the training data into k folds; one fold serves as validation data and the other k-1 as
# training data, and this is repeated k times.
# This yields k validation estimates (e.g. MSE); the final estimate is the mean of those k values.

from sklearn.model_selection import cross_val_score

# The scorer reports negated RMSE, so the sign is flipped back afterwards.
cv_score = cross_val_score(
    lr,
    X_train,
    Y_train,
    scoring='neg_root_mean_squared_error',
)
rmse_score = np.negative(cv_score)

mean_rmse_score = rmse_score.mean()

print('폴드별 RMSE: ', rmse_score)
print('교차검증 RMSE: ', mean_rmse_score)

In [None]:
# sklearn.model_selection.KFold lets you customise the details of the K-fold split.

from sklearn.model_selection import KFold, cross_val_score

# Five shuffled folds; the fixed seed makes the fold assignment reproducible.
cv = KFold(n_splits=5, shuffle=True, random_state=0)

cv2_score = cross_val_score(
    estimator=lr,
    X=X_train,
    y=Y_train,
    scoring='neg_root_mean_squared_error',
    cv=cv,
)
# Scores are negated RMSE values; negate them to get RMSE per fold.
rmse_score2 = np.negative(cv2_score)
mean_rmse_score2 = rmse_score2.mean()

print('폴드별 RMSE: ', rmse_score2)
print('교차검증 RMSE: ', mean_rmse_score2)

In [None]:
# Reload the raw train/test CSVs so the following cells start from unmodified data.
train = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/data/main/s11_train.csv')
test = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/data/main/s11_test.csv')

# Separate the 'grade' target from the feature columns for each set.
train_X, train_y = train.drop(columns=['grade']), train['grade']
test_X, test_y = test.drop(columns=['grade']), test['grade']

In [None]:
# Grid search.
# Exhaustively evaluates every hyper-parameter combination to find the best setting. Because the
# exam limits model training time to one minute, either skip tuning or restrict the
# hyper-parameter ranges that are searched.

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Base estimator to be tuned; random_state is fixed so repeated runs build the same forest.
rf = RandomForestRegressor(random_state = 1)

In [None]:
# List every hyper-parameter of the forest together with its current value.
params = rf.get_params()

for param_name, param_value in params.items():
    print(param_name, ":", param_value)

In [None]:
# Hyper-parameter grid: every max_depth x ccp_alpha combination (3 x 3) is evaluated.
param_grid = dict(
    max_depth=[10, 20, 30],
    ccp_alpha=[0.1, 0.3, 0.5],
)

# 5-fold grid search scored by negated RMSE (the search maximises the score).
rf_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
)

rf_search.fit(train_X, train_y)

In [None]:
# Hyper-parameter combination that achieved the best cross-validated score in the search.
best_params = rf_search.best_params_
print('최적 조합 : ', best_params)

In [None]:
# The search was scored with 'neg_root_mean_squared_error', so negate best_score_ to get RMSE.
mean_rmse_score = -rf_search.best_score_
print('교차검증 RMSE: ', mean_rmse_score)

In [None]:
# Show the estimator configured with the best hyper-parameter combination found by the search.
print(rf_search.best_estimator_)

In [None]:
# Reload the raw data and rebuild the feature/target frames.
train = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/data/main/s11_train.csv')
test = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/data/main/s11_test.csv')

train_X, train_y = train.drop(columns=['grade']), train['grade']
test_X, test_y = test.drop(columns=['grade']), test['grade']

from sklearn.model_selection import train_test_split

# 70/30 train/validation split of the training data with a fixed seed.
train_X_sub, valid_X, train_y_sub, valid_y = train_test_split(
    train_X,
    train_y,
    test_size=0.3,
    random_state=1,
)

In [None]:
# To prevent data leakage, preprocessing must be fitted on the training split only; the
# validation and test sets are then transformed with what was learned there (never call fit on them).
from sklearn.preprocessing import StandardScaler

std_scaler = StandardScaler()

num_columns = train_X.select_dtypes('number').columns

# Fit on train_X_sub rather than train_X: valid_X was carved out of train_X, so fitting on the
# full frame would let the validation rows influence the scaler's mean and variance.
train_X_numeric_scaled = std_scaler.fit_transform(train_X_sub[num_columns])
# Validation and test data are only transformed with the statistics learned above.
valid_X_numeric_scaled = std_scaler.transform(valid_X[num_columns])
test_X_numeric_scaled = std_scaler.transform(test_X[num_columns])

In [None]:
# Reload the raw train/test CSVs for the pipeline examples.
train = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/data/main/s11_train.csv')
test = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/data/main/s11_test.csv')

# Split each frame into features and the 'grade' target.
train_X, train_y = train.drop(columns='grade'), train['grade']
test_X, test_y = test.drop(columns='grade'), test['grade']

In [None]:
# Pipeline.
# A way to run data preprocessing and modelling as a single unit when the same steps are
# applied to the data repeatedly.

# Applying a pipeline to tabular (structured) data
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR

# Named steps: standardise the features, then fit a support vector regressor.
svr_pipe = Pipeline(
    steps=[
        ('preprocess', StandardScaler()),
        ('regressor', SVR()),
    ]
)

svr_pipe.fit(train_X, train_y)

In [None]:
from sklearn.pipeline import make_pipeline

# Same scaler + SVR chain, built with make_pipeline so the steps need no explicit names.
svr_pipe2 = make_pipeline(StandardScaler(), SVR())

svr_pipe2.fit(train_X, train_y)

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the whole pipeline, scored by negated RMSE.
cv_score4 = cross_val_score(
    svr_pipe,
    train_X,
    train_y,
    scoring='neg_root_mean_squared_error',
    cv=5,
)

# Negate to recover RMSE per fold, then average across folds.
rmse_score4 = np.negative(cv_score4)
mean_rmse_score4 = rmse_score4.mean()
print(mean_rmse_score4)

In [None]:
# Show the hyper-parameters of a freshly constructed SVR (constructor defaults).
print(SVR().get_params())

In [None]:
# Search grid for C of the step named 'regressor' (<step>__<param>): 1, 21, 41, 61, 81.
SVR_params = dict(regressor__C=np.arange(1, 100, 20))

In [None]:
from sklearn.model_selection import GridSearchCV

# Tune the pipeline as a whole with 5-fold CV, scored by negated RMSE.
SVR_search = GridSearchCV(
    estimator=svr_pipe,
    param_grid=SVR_params,
    scoring='neg_root_mean_squared_error',
    cv=5,
)

SVR_search.fit(train_X, train_y)

In [None]:
# Best parameter combination, followed by its cross-validated RMSE
# (best_score_ is negated RMSE for this search, hence the sign flip).
print(SVR_search.best_params_)
print(-SVR_search.best_score_)

In [None]:
# Applying a pipeline to data that mixes categorical and numerical variables

train = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/data/main/s11_train.csv')
test = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/data/main/s11_test.csv')

train_X, train_y = train.drop(columns=['grade']), train['grade']
test_X, test_y = test.drop(columns=['grade']), test['grade']

# Feature names grouped by dtype: numeric columns vs. object (string) columns.
numeric_part = train_X.select_dtypes('number')
object_part = train_X.select_dtypes('object')
num_columns = numeric_part.columns.tolist()
cat_columns = object_part.columns.tolist()

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Categorical branch: dense one-hot encoding with handle_unknown='ignore'.
cat_preprocess = make_pipeline(
    OneHotEncoder(handle_unknown='ignore', sparse_output=False),
)
# Numerical branch: fill missing values with the column mean, then standardise.
num_preprocess = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
)

# Send each column group through its own branch.
preprocess = ColumnTransformer(
    transformers=[
        ("num", num_preprocess, num_columns),
        ("cat", cat_preprocess, cat_columns),
    ]
)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR

# End-to-end model: column-wise preprocessing followed by a support vector regressor.
full_pipe = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("regressor", SVR()),
    ]
)

In [None]:
# Candidate values for C of the 'regressor' step: 1, 21, 41, 61, 81.
SVR_param = dict(regressor__C=np.arange(1, 100, 20))

In [None]:
from sklearn.model_selection import GridSearchCV

# 5-fold grid search over the full preprocessing + SVR pipeline, scored by negated RMSE.
SVR_search = GridSearchCV(
    estimator=full_pipe,
    param_grid=SVR_param,
    scoring='neg_root_mean_squared_error',
    cv=5,
)
SVR_search.fit(train_X, train_y)

# Best C and its cross-validated RMSE (sign flipped back from the negated score).
print(SVR_search.best_params_)
print(-SVR_search.best_score_)

In [None]:
# Predict the test set with the fitted search object and wrap the result in a
# single-column frame named 'pred'.
test_pred = SVR_search.predict(test_X)
test_pred = pd.DataFrame({'pred': test_pred})

In [None]:
# Write the predictions to submission.csv without the index column.
test_pred.to_csv('submission.csv', index = False)