In [16]:
import warnings
warnings.filterwarnings('ignore')

In [17]:
# Record the interpreter version for reproducibility.
# NOTE(review): `!python` runs whichever `python` the shell resolves, which is
# presumably the same interpreter as the kernel — confirm if environments differ.
!python --version

Python 3.7.16


In [29]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# 1. Load the breast-cancer dataset and make a stratified 70/30 train/test split
cancer = load_breast_cancer()
X = cancer.data
y = cancer.target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# SVC is very sensitive to feature scale: learn the scaling statistics from the
# training split only, then apply the same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 2. Train an SVC with the RBF kernel
# Main hyperparameters:
#   kernel - kernel type ('linear', 'poly', 'rbf', 'sigmoid')
#   C      - regularization parameter
#   gamma  - kernel coefficient
svc_rbf = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42).fit(X_train_scaled, y_train)
svc_pred = svc_rbf.predict(X_test_scaled)
print(f"SVC (RBF Kernel) 정확도: {accuracy_score(y_test, svc_pred):.3f}") # recorded run: 0.977

SVC (RBF Kernel) 정확도: 0.977


In [31]:
# 3. Hyperparameter search with GridSearchCV
# Local import: the scaler has to be refit inside every CV fold, which requires
# wrapping scaler + classifier into a single estimator.
from sklearn.pipeline import Pipeline

param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.01, 0.1, 1]
}

# Searching over X_train_scaled leaks information: that scaler was fit on the whole
# training split, so every validation fold has already influenced the mean/variance
# used to scale its own training folds. Running the search on the raw training data
# with a Pipeline refits the scaler on the training part of each fold only.
svc_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC(kernel='rbf', random_state=42)),
])

# Pipeline hyperparameters are addressed as '<step name>__<parameter name>'.
pipeline_grid = {f"svc__{name}": values for name, values in param_grid.items()}

grid_svc = GridSearchCV(svc_pipeline, pipeline_grid, cv=5, scoring='accuracy', n_jobs=1)
grid_svc.fit(X_train, y_train)

# Strip the step prefix so the report keeps the plain SVC parameter names.
best_svc_params = {name.split('__', 1)[1]: value for name, value in grid_svc.best_params_.items()}
print("\nGridSearchCV 최적 파라미터 (SVC):", best_svc_params)
# Mean cross-validated accuracy of the best candidate; it can differ slightly from
# a search run on pre-scaled data because scaling now happens inside each fold.
print(f"최적 파라미터 적용 시 정확도: {grid_svc.best_score_:.3f} (교차검증 평균)")

# Predict and evaluate with the best pipeline, refit on the full training split.
# The pipeline scales its input itself, so it receives the raw X_test.
best_pipeline = grid_svc.best_estimator_
# The fitted SVC step on its own; it expects already-scaled features.
best_svc = best_pipeline.named_steps['svc']
best_pred = best_pipeline.predict(X_test)
print("\nBest SVC 분류 리포트:\n", classification_report(y_test, best_pred))


GridSearchCV 최적 파라미터 (SVC): {'C': 10, 'gamma': 0.01}
최적 파라미터 적용 시 정확도: 0.975 (교차검증 평균)

Best SVC 분류 리포트:
               precision    recall  f1-score   support

           0       0.97      0.98      0.98        64
           1       0.99      0.98      0.99       107

    accuracy                           0.98       171
   macro avg       0.98      0.98      0.98       171
weighted avg       0.98      0.98      0.98       171



In [32]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

# 1. Data preparation: stratified 80/20 split of the breast-cancer dataset
cancer = load_breast_cancer()
X = cancer.data
y = cancer.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Hyperparameters shared by all three boosting models, so the comparison is like-for-like
boosting_params = dict(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# 2. Gradient boosting (scikit-learn)
from sklearn.ensemble import GradientBoostingClassifier
gb_clf = GradientBoostingClassifier(**boosting_params)
gb_clf.fit(X_train, y_train)
gb_pred = gb_clf.predict(X_test)
print(f"Gradient Boosting 정확도: {accuracy_score(y_test, gb_pred):.3f}") # recorded run: 0.956

# 3. XGBoost
# !pip install xgboost
import xgboost as xgb
# For binary classification, use objective='binary:logistic'
xgb_clf = xgb.XGBClassifier(
    **boosting_params,
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
)
xgb_clf.fit(X_train, y_train)
xgb_pred = xgb_clf.predict(X_test)
print(f"XGBoost 정확도: {accuracy_score(y_test, xgb_pred):.3f}") # recorded run: 0.956

# 4. LightGBM
# !pip install lightgbm
import lightgbm as lgb
lgbm_clf = lgb.LGBMClassifier(**boosting_params)
lgbm_clf.fit(X_train, y_train)
lgbm_pred = lgbm_clf.predict(X_test)
print(f"LightGBM 정확도: {accuracy_score(y_test, lgbm_pred):.3f}") # recorded run: 0.956

# 5. Early stopping (XGBoost example)
# Hold out part of the training split as the validation set monitored during fitting
X_train_sub, X_val, y_train_sub, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)

xgb_clf_early = xgb.XGBClassifier(
    n_estimators=1000,
    learning_rate=0.05,
    random_state=42,
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
)

# NOTE(review): newer XGBoost releases reportedly expect early_stopping_rounds on the
# constructor and no longer accept use_label_encoder — confirm against the installed
# version before upgrading.
early_fit_options = dict(
    eval_set=[(X_val, y_val)],     # validation set scored during training
    early_stopping_rounds=100,     # rounds to keep waiting without validation improvement
    verbose=False,                 # set to True to print the training log
)
xgb_clf_early.fit(X_train_sub, y_train_sub, **early_fit_options)

early_pred = xgb_clf_early.predict(X_test)
# Column 1 of predict_proba is the probability of the positive class (label 1)
early_pred_proba = xgb_clf_early.predict_proba(X_test)[:, 1]
print(f"\nXGBoost (Early Stopping) 정확도: {accuracy_score(y_test, early_pred):.3f}") # recorded run: 0.939
print(f"XGBoost (Early Stopping) AUC: {roc_auc_score(y_test, early_pred_proba):.3f}") # recorded run: 0.990

Gradient Boosting 정확도: 0.956
XGBoost 정확도: 0.956
LightGBM 정확도: 0.956

XGBoost (Early Stopping) 정확도: 0.939
XGBoost (Early Stopping) AUC: 0.990


In [33]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# 1. Data preparation: stratified 70/30 split of the iris dataset
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# 2. Train a Gaussian naive Bayes model
# Main GaussianNB hyperparameters:
#   priors        - prior probability of each class; estimated from the data when None
#   var_smoothing - small value added to the variances for numerical stability (default 1e-9)
gnb_clf = GaussianNB()
gnb_clf.fit(X_train, y_train)

# 3. Predict and evaluate
y_pred = gnb_clf.predict(X_test)
y_pred_proba = gnb_clf.predict_proba(X_test)

print("--- 가우시안 나이브 베이즈 모델 평가 ---")
print(f"정확도: {accuracy_score(y_test, y_pred):.3f}") # recorded run: 0.911
print("\n혼동 행렬:\n", confusion_matrix(y_test, y_pred))
print("\n분류 리포트:\n", classification_report(y_test, y_pred))

# 4. Inspect the fitted parameters
print("\n클래스별 사전 확률 (priors):", gnb_clf.class_prior_) # recorded run: [0.33333333 0.33333333 0.33333333]
print("클래스별 특성의 평균 (theta):", gnb_clf.theta_)
# scikit-learn renamed GaussianNB.sigma_ to var_ (deprecated in 1.0, removed in 1.2),
# so reading sigma_ raises AttributeError on current releases. Prefer var_ and fall
# back to sigma_ only on older versions that do not have it yet.
gnb_variances = gnb_clf.var_ if hasattr(gnb_clf, "var_") else gnb_clf.sigma_
print("클래스별 특성의 분산 (sigma):", gnb_variances)

--- 가우시안 나이브 베이즈 모델 평가 ---
정확도: 0.911

혼동 행렬:
 [[15  0  0]
 [ 0 14  1]
 [ 0  3 12]]

분류 리포트:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       0.82      0.93      0.87        15
           2       0.92      0.80      0.86        15

    accuracy                           0.91        45
   macro avg       0.92      0.91      0.91        45
weighted avg       0.92      0.91      0.91        45


클래스별 사전 확률 (priors): [0.33333333 0.33333333 0.33333333]
클래스별 특성의 평균 (theta): [[4.98857143 3.42571429 1.48571429 0.24      ]
 [5.94857143 2.73142857 4.23714286 1.30857143]
 [6.68285714 3.00857143 5.63142857 2.06857143]]
클래스별 특성의 분산 (sigma): [[0.10329796 0.17391021 0.02293878 0.00925715]
 [0.24078368 0.08558368 0.21147755 0.03564082]
 [0.42484898 0.11735511 0.32272653 0.06386939]]


'\n[[0.10329796 0.17391021 0.02293878 0.00925715]\n [0.24078368 0.08558368 0.21147755 0.03564082]\n [0.42484898 0.11735511 0.32272653 0.06386939]]\n'

In [34]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 개별 모델 임포트
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# 앙상블 모델 임포트
from sklearn.ensemble import VotingClassifier

from sklearn.metrics import accuracy_score

# 1. Data preparation and preprocessing: stratified 70/30 split of the breast-cancer dataset
cancer = load_breast_cancer()
X = cancer.data
y = cancer.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Scaling (needed by logistic regression, KNN and SVC); statistics come from the training split only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 2. Individual models
# SVC needs probability=True to take part in soft voting
lr_clf = LogisticRegression(solver='liblinear', random_state=42)
knn_clf = KNeighborsClassifier(n_neighbors=5)
svc_clf = SVC(kernel='rbf', C=10, gamma=0.1, probability=True, random_state=42)
dt_clf = DecisionTreeClassifier(max_depth=3, random_state=42)

# 3. Voting classifiers
# estimators: list of (model name, model object) pairs
# voting: 'hard' or 'soft'
base_estimators = [('lr', lr_clf), ('knn', knn_clf), ('svc', svc_clf), ('dt', dt_clf)]
# Each ensemble receives its own copy of the list
hard_voting_clf = VotingClassifier(estimators=list(base_estimators), voting='hard')
soft_voting_clf = VotingClassifier(estimators=list(base_estimators), voting='soft')


def _fit_and_predict(model):
    """Fit `model` on the scaled training split and return its predictions for the scaled test split."""
    model.fit(X_train_scaled, y_train)
    return model.predict(X_test_scaled)


# 4. Train and evaluate
# VotingClassifier forwards the data to each member model, so it is given the scaled data
hard_pred = _fit_and_predict(hard_voting_clf)
print(f"Hard Voting 정확도: {accuracy_score(y_test, hard_pred):.3f}") # recorded run: 0.977

soft_pred = _fit_and_predict(soft_voting_clf)
print(f"Soft Voting 정확도: {accuracy_score(y_test, soft_pred):.3f}") # recorded run: 0.953

# Compare against each individual model
lr_pred = _fit_and_predict(lr_clf)
print(f"\nLogistic Regression 정확도: {accuracy_score(y_test, lr_pred):.3f}") # recorded run: 0.988

knn_pred = _fit_and_predict(knn_clf)
print(f"KNN 정확도: {accuracy_score(y_test, knn_pred):.3f}") # recorded run: 0.959

svc_pred = _fit_and_predict(svc_clf)
print(f"SVC 정확도: {accuracy_score(y_test, svc_pred):.3f}") # recorded run: 0.953

# A decision tree does not need scaling, but it is trained on the scaled data for a like-for-like comparison
dt_pred = _fit_and_predict(dt_clf)
print(f"Decision Tree 정확도: {accuracy_score(y_test, dt_pred):.3f}") # recorded run: 0.924

Hard Voting 정확도: 0.977
Soft Voting 정확도: 0.953

Logistic Regression 정확도: 0.988
KNN 정확도: 0.959
SVC 정확도: 0.953
Decision Tree 정확도: 0.924
