In [2]:
import pandas as pd
# Load the train and test splits from the remote CSV files.
test = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/data/main/st_test.csv')
train = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/data/main/st_train.csv')

# Separate the predictors from the `grade` target column in each split.
train_X, train_y = train.drop(columns=['grade']), train['grade']
test_X, test_y = test.drop(columns=['grade']), test['grade']

In [3]:
# 파이프라인과 ColumnTransformer()를 사용하여 데이터 전처리 진행

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer # 열마다 다른 전처리기를 쓰고 싶을 때
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV

# Partition the feature columns by dtype. `.columns` is a pandas Index, so
# `.tolist()` converts it into a plain Python list of column names.
cat_columns = train_X.select_dtypes(include='object').columns.tolist()
num_columns = train_X.select_dtypes(include='number').columns.tolist()

# Numeric features: fill missing values with the column mean, then standardize.
num_preprocess = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())

# Categorical features: one-hot encode into a dense array; categories that
# were not seen during fit are ignored at transform time instead of raising.
cat_preprocess = make_pipeline(
    OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
)

# Route each group of columns through its own sub-pipeline.
preprocess = ColumnTransformer(
    transformers=[
        ("num", num_preprocess, num_columns),
        ("cat", cat_preprocess, cat_columns),
    ],
)

#### 회귀 분석 알고리즘

1. K-Nearest Neighbors (KNN)

In [10]:
from sklearn.neighbors import KNeighborsRegressor

# End-to-end model: the shared preprocessing step followed by a KNN regressor
# created with its default settings.
full_pipe = Pipeline(steps=[
    ("preprocess", preprocess),
    ("regressor", KNeighborsRegressor()),
])

In [11]:
# Inspect the parameter names exposed by the KNN regressor.

KNeighborsRegressor().get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [16]:
# Tune K over 5..10 inclusive. np.arange excludes its stop value, so the stop
# must be 11 for K = 10 to be part of the grid (a stop of 10 ends at K = 9).
import numpy as np

# The `regressor__` prefix addresses the pipeline step named 'regressor'.
knn_params = {'regressor__n_neighbors': np.arange(5, 11, 1)}

In [17]:
# Tune the KNN pipeline with GridSearchCV using 3-fold cross-validation.
# Candidates are scored by negated mean squared error, so higher is better.
knn_search = GridSearchCV(
    full_pipe,
    knn_params,
    scoring='neg_mean_squared_error',
    cv=3,
)

knn_search.fit(train_X, train_y)