<a href="https://colab.research.google.com/github/inhamjchoi/SafetyDataClass/blob/main/Ex03_2_Classification_HeartData_AutoML_lazypredict.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# This notebook uses LazyPredict, a low-code library that fits and compares many scikit-learn models with one call.
# LazyPredict is suitable for beginners, quick baseline comparisons, and works well in Google Colab.
# It automatically handles basic preprocessing, model training, and comparison of many algorithms
# (models are scored on the supplied test split; no cross-validation is performed).
# Ideal for tabular data and classification or regression tasks.

# LazyPredict is especially useful when:
# - You want to compare many models quickly without writing separate code for each
# - You prefer automated preprocessing and want a fast baseline before tuning hyperparameters yourself
# - You are working in a limited environment like Google Colab or Jupyter

# If you need full control or custom model architecture (e.g., deep learning, text/image data), consider:
# - scikit-learn for customizable ML pipelines
# - XGBoost or LightGBM for high-performance gradient boosting
# - PyTorch or TensorFlow for deep learning tasks
# - Auto-sklearn or Google AutoML for advanced AutoML capabilities

In [3]:
!pip install lazypredict



In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyClassifier

In [5]:
# Read the heart-disease table from the CSV file located next to this notebook.
df = pd.read_csv(filepath_or_buffer='Ex03_2_heartData.csv')

In [6]:
# Separate the label column from the predictors.
y = df['HeartDisease']                            # target label
X = df.drop(labels=['HeartDisease'], axis=1)      # every remaining column is an input feature

In [7]:
# One-hot encode the categorical columns; drop_first removes one level per
# category so the dummy columns are not perfectly collinear.
# dtype=int forces 0/1 integer dummies: pandas >= 2.0 would otherwise emit
# boolean columns, and tools that pick features by numeric dtype can skip those.
# NOTE(review): LazyPredict appears to select numeric columns by dtype — confirm
# that boolean dummies were not being silently ignored in earlier runs.
X = pd.get_dummies(X, drop_first=True, dtype=int)

In [8]:
# Hold out 30% of the rows as a test set, stratified on the label so both splits
# keep the same class ratio; the fixed seed makes the split repeatable.
# (LazyPredict does not use cross-validation internally, so this split is what it scores on.)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y, test_size=0.3)

In [9]:
# Run LazyClassifier: each supported classifier is fitted with the training split
# and evaluated against the test split passed to fit().
clf = LazyClassifier(
    custom_metric=None,
    ignore_warnings=True,
    verbose=0,
)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# Show the per-model metric table.
print(models)

  0%|          | 0/32 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 165, number of negative: 133
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000198 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 429
[LightGBM] [Info] Number of data points in the train set: 298, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.553691 -> initscore=0.215596
[LightGBM] [Info] Start training from score 0.215596
                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
AdaBoostClassifier                 0.72               0.71     0.71      0.72   
Perceptron                         0.71               0.71     0.71      0.71   
BernoulliNB                        0.71               0.71     0.71      0.71   
ExtraTreesClassifier               0.71               0.71     0.71      0.71   
LabelPropagation             

Grid Search

In [10]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [11]:
# Weak learner that AdaBoost will boost: a decision tree with a fixed seed.
base_model = DecisionTreeClassifier(
    random_state=42,
)

In [12]:
# AdaBoost ensemble built on top of the decision-tree weak learner.
ada = AdaBoostClassifier(
    random_state=42,
    estimator=base_model,
)

In [13]:
# Hyperparameter search space:
#   n_estimators         - number of boosting rounds (how many weak models are combined)
#   learning_rate        - contribution of each model; smaller values learn more slowly
#                          but more stably (usually between 0.01 and 1.0)
#   estimator__max_depth - depth of the weak learner (decision tree); too deep overfits,
#                          too shallow limits performance (1-3 is usually recommended)
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'estimator__max_depth': [1, 2, 3],
}

# Exhaustive grid search over the space above:
# accuracy as the selection metric, 5-fold cross-validation, all CPU cores,
# and progress messages enabled.
grid_search = GridSearchCV(ada, param_grid, scoring='accuracy', cv=5, n_jobs=-1, verbose=1)



In [14]:
# Train: fit every candidate in the grid on the training split.
grid_search.fit(X=X_train, y=y_train)


Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [15]:
# Report the best hyperparameter combination and its cross-validation accuracy.
print(
    "Best Parameters:",
    grid_search.best_params_,
)
print(
    "Best Cross-Validation Accuracy:",
    grid_search.best_score_,
)



Best Parameters: {'estimator__max_depth': 1, 'learning_rate': 0.1, 'n_estimators': 100}
Best Cross-Validation Accuracy: 0.7383050847457626


In [16]:
# Check performance on the held-out test set using the best estimator
# found by the grid search.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X=X_test)
print("Test Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))

Test Accuracy: 0.6821705426356589
