<a href="https://colab.research.google.com/github/hyojunyee/kita_2404/blob/main/m5_%EB%A8%B8%EC%8B%A0%EB%9F%AC%EB%8B%9D/Task/ML_training1_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Pipeline 연습

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

# Load the breast-cancer dataset.
data = load_breast_cancer()
X, y = data.data, data.target

# Hold out 20% of the samples as a test set; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline: standardize -> truncated SVD -> logistic regression.
# TruncatedSVD uses a randomized solver by default, so it is seeded here to
# keep the grid-search outcome reproducible from run to run.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('svd', TruncatedSVD(n_components=2, random_state=42)),
    ('lr', LogisticRegression(max_iter=1000))
])

# Hyperparameter grid; keys follow the "<step name>__<parameter>" convention,
# so 'svd__n_components' overrides the n_components=2 given above.
param_grid = {
    'svd__n_components': [2, 5, 10],
    'lr__C': [0.1, 1, 10]
}

# Search the grid with 5-fold cross-validation, scored by accuracy.
grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Report the best hyperparameter combination found.
print(f"Best parameters found: {grid_search.best_params_}")


# Evaluation helper
def evaluate_model(model, X_test, y_test):
    """Print accuracy, a classification report, and ROC AUC for a fitted classifier.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: Feature matrix of the held-out samples.
        y_test: True labels for ``X_test``.

    Returns:
        None. The metrics are printed to stdout.
    """
    # Hard class predictions drive accuracy and the per-class report.
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    # ROC AUC is a ranking metric, so it is computed from the predicted
    # probability rather than the thresholded labels. Column 1 is taken as the
    # positive-class probability (assumes a binary target).
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    # Print the results.
    print(f"Test Accuracy: {accuracy:.4f}\n")
    print(f"Classification Report:\n{report}\n")
    print(f"ROC AUC: {roc_auc:.4f}")

# Evaluate the tuned model (the fitted search object) on the held-out test set.
evaluate_model(model=grid_search, X_test=X_test, y_test=y_test)

Best parameters found: {'lr__C': 1, 'svd__n_components': 10}
Test Accuracy: 0.9825

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98        43
           1       0.99      0.99      0.99        71

    accuracy                           0.98       114
   macro avg       0.98      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114


ROC AUC: 0.9813


California_housing 데이터셋으로 아래 사항을 참조하여 주택 가격을 예측하는 회귀 모델을 개발하세요.
- 전체 회귀 모델을 적용
- 각 모델별 최적 하이퍼파라미터 탐색 - GridSearchCV 활용
- 평가지표 MSE 기준으로 가장 성능이 좋은 모델과 파라미터를 적용하여 평가 결과를 출력

In [None]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Fetch the California housing dataset and pull out its pieces.
data = fetch_california_housing()
X = data.data
y = data.target
feature_names = data.feature_names

# Tabular view of the same data: one column per feature plus a 'target' column.
df = pd.DataFrame(X, columns=feature_names)
df['target'] = y

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Candidate regressors, keyed by display name.
# The ensemble models are seeded (same seed as the train/test split) so that
# repeated runs produce comparable scores.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'ElasticNet': ElasticNet(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42),
    'LightGBM': LGBMRegressor(random_state=42),
    'SVR': SVR()
}

# One pipeline per model: standardize the features, then fit the regressor.
# The step is named 'model' so grid keys can address it as 'model__<param>'.
pipelines = {name: Pipeline([
    ('scaler', StandardScaler()),  # feature scaling
    ('model', model)               # regression model
]) for name, model in models.items()}

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
from sklearn.model_selection import GridSearchCV

# Grid-search helper applied to each pipeline
def perform_grid_search(pipeline, param_grid, X_train, y_train, cv_folds=5):
    """Tune ``pipeline`` over ``param_grid`` with cross-validated grid search.

    Args:
        pipeline: Estimator (here a scikit-learn Pipeline) to tune.
        param_grid: Mapping of parameter names to candidate values.
        X_train: Training features.
        y_train: Training targets.
        cv_folds: Value passed to GridSearchCV as ``cv`` (default 5).

    Returns:
        Tuple ``(best_estimator, best_params, best_score)`` read from the
        fitted search. The search is scored with 'neg_mean_squared_error',
        so ``best_score`` is a negated MSE (closer to zero is better).
    """
    search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=cv_folds,
        n_jobs=-1,  # use all available cores
    )
    search.fit(X_train, y_train)

    return search.best_estimator_, search.best_params_, search.best_score_

In [None]:
# Hyperparameter grid for each model, keyed by the model's display name.
# Parameter names carry the 'model__' prefix to target the pipeline's 'model' step.
# Models that share an identical grid are generated by comprehension; each one
# still receives its own independent dict and lists.
param_grids = {
    # No tunable parameters: a single fit with the defaults.
    'Linear Regression': {},
    **{
        name: {'model__alpha': [0.1, 1, 10, 100]}
        for name in ('Ridge', 'Lasso')
    },
    'ElasticNet': {
        'model__alpha': [0.1, 1, 10, 100],
        'model__l1_ratio': [0.1, 0.5, 0.9],
    },
    'Random Forest': {
        'model__n_estimators': [100, 200, 500],
        'model__max_depth': [None, 10, 20, 30, 50],
        'model__min_samples_leaf': [1, 5, 10],
    },
    **{
        name: {
            'model__n_estimators': [100, 200, 500],
            'model__learning_rate': [0.01, 0.1, 0.2],
        }
        for name in ('Gradient Boosting', 'XGBoost', 'LightGBM')
    },
    'SVR': {
        'model__C': [0.1, 1, 10],
        'model__epsilon': [0.01, 0.1, 0.2],
    },
}


In [None]:
# Run the grid search for every pipeline and collect the best result of each.
results = {}
for name, pipeline in pipelines.items():
    print(f"Performing Grid Search for {name}...")
    # Fall back to an empty grid when no grid is registered for this model.
    param_grid = param_grids.get(name, {})
    best_model, best_params, best_score = perform_grid_search(pipeline, param_grid, X_train, y_train)
    results[name] = {
        'Best Model': best_model,
        'Best Params': best_params,
        'Best Score': best_score
    }

# Print the results.
# The search is scored with 'neg_mean_squared_error', so the stored score is a
# negated MSE. The sign is flipped here so the number shown under "MSE" is the
# actual (non-negative) mean squared error; `results` keeps the raw score.
for model_name, result in results.items():
    print(f"\nModel: {model_name}")
    print(f"Best Parameters: {result['Best Params']}")
    print(f"Best Score (MSE): {-result['Best Score']}")

Performing Grid Search for Linear Regression...
Performing Grid Search for Ridge...
Performing Grid Search for Lasso...
Performing Grid Search for ElasticNet...
Performing Grid Search for Random Forest...
Performing Grid Search for Gradient Boosting...
Performing Grid Search for XGBoost...
Performing Grid Search for LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001366 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1838
[LightGBM] [Info] Number of data points in the train set: 16512, number of used features: 8
[LightGBM] [Info] Start training from score 2.071947
Performing Grid Search for SVR...

Model: Linear Regression
Best Parameters: {}
Best Score (MSE): -0.5192652011433679

Model: Ridge
Best Parameters: {'model__alpha': 0.1}
Best Score (MSE): -0.5192652367434867

Model: Lasso
Best Parameters: {'model__alpha': 0.1}
Best Score (MSE): -0.6720918682458477

Model: ElasticNet
Best Param

- LightGBM

    - Hyperparameters: {'model__learning_rate': 0.1, 'model__n_estimators': 500}

In [None]:
# from sklearn.model_selection import cross_val_score
# import numpy as np

# # 사용할 모델 리스트
# selected_models = ['Random Forest', 'Gradient Boosting', 'XGBoost', 'LightGBM']

# # 각 모델에 대해 크로스 밸리데이션 수행
# cv_results = {}
# for name in selected_models:
#     pipeline = pipelines[name]
#     print(f"Performing Cross Val Score for {name}...")
#     scores = cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_squared_error')
#     mse_scores = -scores
#     avg_mse = np.mean(mse_scores)

#     cv_results[name] = {
#         'Average MSE': avg_mse,
#         'Scores': mse_scores
#     }

# # 결과 출력
# for model_name, result in cv_results.items():
#     print(f"\nModel: {model_name}")
#     print(f"Average MSE: {result['Average MSE']:.4f}")
#     print(f"Individual Fold MSEs: {result['Scores']}")

Performing Cross Val Score for Random Forest...
Performing Cross Val Score for Gradient Boosting...
Performing Cross Val Score for XGBoost...
Performing Cross Val Score for LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001644 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1838
[LightGBM] [Info] Number of data points in the train set: 16512, number of used features: 8
[LightGBM] [Info] Start training from score 2.164930
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002277 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1838
[LightGBM] [Info] Number of data points in the train set: 16512, number of used features: 8
[LightGBM] [Info] Start training from score 2.034871
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001274 seconds.
You can set `force_col_wise=t