## 파이프라인 설명 예제

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

# Load the breast-cancer dataset and separate features from labels.
data = load_breast_cancer()
X = data.data
y = data.target

# Hold out 20% of the rows for the final evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale -> reduce dimensionality -> classify.
# TruncatedSVD's default solver is randomized, so it is seeded here; without a
# seed the selected parameters and scores can differ from one run to the next.
# n_components=2 is only a placeholder: the grid below overrides it.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svd', TruncatedSVD(n_components=2, random_state=42)),
    ('logreg', LogisticRegression(max_iter=1000))
])

# Keys use the "<step name>__<parameter>" form so GridSearchCV can route each
# value to the matching pipeline step.
param_grid = {
    'svd__n_components': [2, 5, 10],
    'logreg__C': [0.1, 1, 10]
}

# 5-fold cross-validated search over the 3 x 3 grid, ranked by accuracy.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print(f'Best parameters: {grid_search.best_params_}')
print(f'Best cross-validation score: {grid_search.best_score_}')

# Score the held-out rows with the fitted search object.
y_pred = grid_search.predict(X_test)
# Column 1 of predict_proba is used as the score for ROC AUC.
y_pred_proba = grid_search.predict_proba(X_test)[:, 1]
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'ROC AUC: {roc_auc_score(y_test, y_pred_proba)}')
print(classification_report(y_test, y_pred))



Best parameters: {'logreg__C': 1, 'svd__n_components': 10}
Best cross-validation score: 0.9758241758241759
Accuracy: 0.9824561403508771
ROC AUC: 0.9977071732721913
              precision    recall  f1-score   support

           0       0.98      0.98      0.98        43
           1       0.99      0.99      0.99        71

    accuracy                           0.98       114
   macro avg       0.98      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114



In [2]:
# function to evaluate the model

import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

# Load the breast-cancer dataset and separate features from labels.
data = load_breast_cancer()
X = data.data
y = data.target

# Hold out 20% of the rows for the final evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale -> reduce dimensionality -> classify.
# TruncatedSVD's default solver is randomized, so it is seeded here; without a
# seed the selected parameters and scores can differ from one run to the next.
# n_components=2 is only a placeholder: the grid below overrides it.
pipeline = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("svd", TruncatedSVD(n_components=2, random_state=42)),
        ("logreg", LogisticRegression(max_iter=1000)),
    ]
)

# Keys use the "<step name>__<parameter>" form so GridSearchCV can route each
# value to the matching pipeline step.
param_grid = {"svd__n_components": [2, 5, 10], "logreg__C": [0.1, 1, 10]}

# 5-fold cross-validated search over the 3 x 3 grid, ranked by accuracy.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring="accuracy")
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")

def evaluate_model(model, X_test, y_test):
    """Print accuracy, the classification report and ROC AUC for ``model``.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: Feature rows to score.
        y_test: True labels aligned with ``X_test``.

    Returns:
        None. The metrics are printed, not returned.
    """
    predicted_labels = model.predict(X_test)
    accuracy_line = f"Accuracy: {accuracy_score(y_test, predicted_labels)}"
    report_text = classification_report(y_test, predicted_labels)

    # Column 1 of predict_proba is used as the score for ROC AUC.
    positive_scores = model.predict_proba(X_test)[:, 1]
    roc_auc_line = f"ROC AUC: {roc_auc_score(y_test, positive_scores)}"

    # Everything is computed before anything is printed, so a failing metric
    # produces no partial output.
    for line in (accuracy_line, "\n Classification report:", report_text, roc_auc_line):
        print(line)
    
# Print the hold-out metrics for the fitted grid search on the test split.
evaluate_model(grid_search, X_test, y_test)
    

Best parameters: {'logreg__C': 1, 'svd__n_components': 10}
Accuracy: 0.9824561403508771

 Classification report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98        43
           1       0.99      0.99      0.99        71

    accuracy                           0.98       114
   macro avg       0.98      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114

ROC AUC: 0.9977071732721913


## California_housing 데이터셋으로 아래 사항을 참조하여 주택 가격을 예측하는 회귀 모델을 개발하세요.

- 전처리와 피처 엔지니어링을 통해 데이터를 준비 (파이프라인을 사용해도 됨)
- 회귀 모델 전체(9개 모델)를 적용
- 각 모델별 최적 하이퍼파라미터 - GridSearchCV 활용
- 평가 지표 MSE 기준으로 가장 성능이 좋은 모델과 파라미터를 적용하여 평가 결과를 출력

In [9]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# Seed shared by every stochastic estimator so repeated runs of this cell
# select the same hyper-parameters and report the same MSE.
RANDOM_STATE = 42

housing = fetch_california_housing()
y = housing.target
X = housing.data

df = pd.DataFrame(X, columns=housing.feature_names)
df["Target"] = y

# Drop rows whose target lies outside the 1.5 * IQR fences.
# NOTE(review): this filter runs before the train/test split, so the test set
# is filtered by its own target values as well -- confirm that is intended.
q1 = df["Target"].quantile(0.25)
q3 = df["Target"].quantile(0.75)
iqr = q3 - q1
lower = q1 - 1.5 * iqr
upper = q3 + 1.5 * iqr

df = df[(df["Target"] >= lower) & (df["Target"] <= upper)]

X = df.drop(["Target"], axis=1)
y = df["Target"]


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Candidate regressors. The ensemble models are seeded; the linear models are
# left at their defaults.
models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "RandomForest": RandomForestRegressor(random_state=RANDOM_STATE),
    "GradientBoosting": GradientBoostingRegressor(random_state=RANDOM_STATE),
    "XGBoost": XGBRegressor(random_state=RANDOM_STATE),
    "LightGBM": LGBMRegressor(random_state=RANDOM_STATE),
}

# Every candidate is wrapped in the same scaler -> model pipeline, so grid keys
# are addressed as "model__<parameter>".
pipelines = {
    name: Pipeline([("scaler", StandardScaler()), ("model", model)])
    for name, model in models.items()
}

# Search spaces, grouped by model family.
alpha_grid = {"model__alpha": [0.05, 0.1, 1]}
tree_grid = {
    "model__n_estimators": [300, 500, 1000],
    "model__max_depth": [5, 7, 9],
}
boosting_grid = {**tree_grid, "model__learning_rate": [0.01, 0.1, 0.3]}

param_grids = {
    "LinearRegression": {},  # nothing to tune; GridSearchCV fits it once
    "Ridge": alpha_grid,
    "Lasso": alpha_grid,
    "RandomForest": tree_grid,
    "GradientBoosting": tree_grid,
    "XGBoost": boosting_grid,
    "LightGBM": boosting_grid,
}

# Train and evaluate each model with GridSearchCV.
results = {}

for name, pipeline in pipelines.items():
    # scoring is negated MSE because GridSearchCV maximises its score.
    grid_search = GridSearchCV(
        pipeline, param_grids[name], cv=5, scoring="neg_mean_squared_error", n_jobs=-1
    )
    grid_search.fit(X_train, y_train)

    # Named separately from `best_model` below, which holds the winner's name.
    best_estimator = grid_search.best_estimator_
    mse = mean_squared_error(y_test, best_estimator.predict(X_test))

    results[name] = {
        "best_params": grid_search.best_params_,
        "mse": mse,
    }

for name, result in results.items():
    print(f"\n{name}:")
    print(f"Best parameters: {result['best_params']}")
    print(f"MSE: {result['mse']:.4f}")

# Name of the model with the lowest test-set MSE.
# NOTE(review): the winner is picked on the same test set whose MSE is then
# reported, so the figure is optimistic; consider ranking by the
# cross-validated score (grid_search.best_score_) instead.
best_model = min(results, key=lambda x: results[x]["mse"])
print(f"\nBest model: {best_model}")
print(f"MSE: {results[best_model]['mse']:.4f}")


LinearRegression:
Best parameters: {}
MSE: 0.3688

Ridge:
Best parameters: {'model__alpha': 1}
MSE: 0.3688

Lasso:
Best parameters: {'model__alpha': 0.05}
MSE: 0.4566

RandomForest:
Best parameters: {'model__max_depth': 9, 'model__n_estimators': 500}
MSE: 0.2412

GradientBoosting:
Best parameters: {'model__max_depth': 7, 'model__n_estimators': 1000}
MSE: 0.1617

XGBoost:
Best parameters: {'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__n_estimators': 1000}
MSE: 0.1585

LightGBM:
Best parameters: {'model__learning_rate': 0.1, 'model__max_depth': 9, 'model__n_estimators': 500}
MSE: 0.1540

Best performing model: LightGBM
Test MSE: 0.1540
