### 차트 작성 시 한글 깨짐 방지를 위한 koreanize-matplotlib 설치

In [1]:
!pip install koreanize-matplotlib

Collecting koreanize-matplotlib
  Downloading koreanize_matplotlib-0.1.1-py3-none-any.whl.metadata (992 bytes)
Downloading koreanize_matplotlib-0.1.1-py3-none-any.whl (7.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m40.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: koreanize-matplotlib
Successfully installed koreanize-matplotlib-0.1.1


### 라이브러리 import

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import koreanize_matplotlib
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

### 1. 데이터 로드 및 분할

In [3]:
cancer = load_breast_cancer()
df = pd.DataFrame(cancer['data'], columns=cancer['feature_names'])
df['target'] = cancer['target']

In [4]:
# Feature(X)와 Target(y) 분리
X = df.drop('target', axis=1)
y = df['target']

# 학습용/테스트용 데이터 분할 (8:2)
# stratify=y: 타겟 클래스 비율 유지 (강의자료 강조 사항)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"학습 데이터: {X_train.shape}")
print(f"테스트 데이터: {X_test.shape}")

학습 데이터: (455, 30)
테스트 데이터: (114, 30)


### 2. 모델 학습(DecisionTree)

In [5]:
# 모델 객체 생성
gnb = GaussianNB()

# 모델 학습
gnb.fit(X_train, y_train)
print("모델 학습 완료.")
print(f"클래스별 사전 확률 (Prior): {gnb.class_prior_}") # P(Y)
print(f"클래스별 특성 평균 (Theta):\n{gnb.theta_[:2]}") # 각 클래스별 특성의 평균값 일부 출력

모델 학습 완료.
클래스별 사전 확률 (Prior): [0.37362637 0.62637363]
클래스별 특성 평균 (Theta):
[[1.73812353e+01 2.15798824e+01 1.14830647e+02 9.66820000e+02
  1.02993941e-01 1.46144824e-01 1.62362882e-01 8.76072353e-02
  1.93630588e-01 6.28292941e-02 5.94710588e-01 1.20105000e+00
  4.23124118e+00 6.98808824e+01 6.79774118e-03 3.24939824e-02
  4.31357059e-02 1.50777235e-02 2.09235235e-02 4.01020588e-03
  2.09670588e+01 2.92767647e+01 1.40258176e+02 1.39677118e+03
  1.44847588e-01 3.78881824e-01 4.63242824e-01 1.82847412e-01
  3.26841176e-01 9.17371176e-02]
 [1.20904281e+01 1.78560351e+01 7.76751228e+01 4.58690526e+02
  9.20954737e-02 7.86516842e-02 4.55483218e-02 2.49237474e-02
  1.72856140e-01 6.28141053e-02 2.82215439e-01 1.22927474e+00
  1.98227825e+00 2.09689649e+01 7.25420351e-03 2.13887860e-02
  2.62396828e-02 9.64176491e-03 2.06925228e-02 3.67027754e-03
  1.33201333e+01 2.34823509e+01 8.65635789e+01 5.54231579e+02
  1.24861649e-01 1.80033649e-01 1.65234375e-01 7.27803895e-02
  2.69405965e-01 7.929645

### 3. 모델 예측 및 성능 평가

In [6]:
# 테스트 데이터 예측
y_pred = gnb.predict(X_test)

# 정확도 계산
accuracy = accuracy_score(y_test, y_pred)
print(f"모델의 정확도: {accuracy:.4f}")

# 상세 리포트 출력
print("\n[Classification Report]")
print(classification_report(y_test, y_pred, target_names=['No cancer (0)', 'cancer (1)']))

모델의 정확도: 0.9386

[Classification Report]
               precision    recall  f1-score   support

No cancer (0)       0.93      0.90      0.92        42
   cancer (1)       0.95      0.96      0.95        72

     accuracy                           0.94       114
    macro avg       0.94      0.93      0.93       114
 weighted avg       0.94      0.94      0.94       114

