In [77]:
import seaborn as sns
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score, r2_score, precision_score, recall_score, roc_auc_score, roc_curve

In [78]:
penguins = sns.load_dataset("penguins")

In [79]:
num_species = penguins["species"].nunique()
num_species

3

In [80]:
penguins.dropna(subset=["bill_length_mm"], inplace=True)
penguins.isna().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  9
dtype: int64

In [81]:
# 나이브 베이즈, knn(k=3), 의사결정트리(depth=3) 로
# 펭귄의 종을 예측하는 모델을 만들고 학습하자!

# 스탠다드 스케일러로 정규화 해야 함

# 각 모델의 정확도, r2스코어, precision스코어, recall스코어
# roc auc 스코어를 각각 출력하자!

In [82]:
# [나이브 베이즈 : model01]
# 1. 데이터 준비
x = penguins[["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]]
y = penguins["species"].map(lambda x: 0 if x=="Adelie" else 1 if x=="Gentoo" else 2)

# 1-1. 스케일링
x_scale = StandardScaler().fit_transform(x)

# 2. 데이터 나누기
x_train, x_test, y_train, y_test = train_test_split(x_scale, y, test_size=0.2, random_state=1)

# 3. 모델 선택
model01 = GaussianNB()

# 4. 학습
model01.fit(x_train, y_train)

# 5. 평가
model01.score(x_test, y_test)

# 6. 예측
predict = model01.predict(x_test)
predict

array([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 2, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 2, 2, 1, 0, 2, 1, 0, 0, 1, 0, 1, 2, 2, 2,
       1, 0, 0, 2, 1, 0, 1, 0, 1, 1, 0, 2, 0, 1, 1, 2, 0, 0, 1, 0, 0, 0,
       0, 1, 1], dtype=int64)

In [83]:
# 정확도(accuracy)
accuracy01 = accuracy_score(y_test, predict)

# r2 스코어
r201 = r2_score(y_test, predict)

# precision 스코어
precision01 = precision_score(y_test, predict, average=None)

# recall 스코어
recall01 = recall_score(y_test, predict, average=None)

print(accuracy01)
print(r201)
print(precision01)
print(recall01)


0.9710144927536232
0.7630901287553649
[0.96551724 1.         0.9       ]
[0.96551724 1.         0.9       ]


In [97]:
predict_proba = model01.predict_proba(x_test)
predict_proba

array([[9.98805163e-01, 1.60602438e-11, 1.19483698e-03],
       [9.97557731e-01, 1.09290759e-09, 2.44226797e-03],
       [9.92475033e-01, 2.42592255e-11, 7.52496672e-03],
       [3.82599407e-14, 1.00000000e+00, 1.45887237e-10],
       [1.28728904e-13, 1.00000000e+00, 3.25140737e-10],
       [7.32897875e-10, 9.99999904e-01, 9.51323244e-08],
       [8.36775813e-01, 3.28513875e-07, 1.63223859e-01],
       [9.99411571e-01, 2.13303285e-09, 5.88426724e-04],
       [9.98297886e-01, 1.33838571e-07, 1.70198036e-03],
       [9.58614419e-01, 1.71590994e-08, 4.13855639e-02],
       [9.99835942e-01, 3.00966090e-10, 1.64057389e-04],
       [1.96783129e-11, 9.99999992e-01, 8.09045612e-09],
       [9.99965265e-01, 1.90763249e-10, 3.47343961e-05],
       [3.95535122e-02, 2.81730397e-06, 9.60443671e-01],
       [9.94808843e-01, 1.93009313e-13, 5.19115738e-03],
       [4.66364089e-12, 1.00000000e+00, 1.02776481e-10],
       [1.77384690e-10, 9.99999921e-01, 7.83721862e-08],
       [2.91755632e-05, 9.99919

In [98]:
roc_auc01 = roc_auc_score(y_test, predict_proba, multi_class='ovr')
print("ROC-AUC Score:", roc_auc01)

ROC-AUC Score: 0.9948860315604909


In [86]:
# roc_auc  
# roc_auc = roc_auc_score(y_test, predict)

# 이진분류가 아니라서 roc_curve 함수를 그릴 수 없는건가?

"""
fpr, tpr, thresholds = roc_curve(y_test, predict)

plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.title('ROC Curve')
plt.show()
"""

"\nfpr, tpr, thresholds = roc_curve(y_test, predict)\n\nplt.plot(fpr, tpr)\nplt.xlabel('False Positive Rate (FPR)')\nplt.ylabel('True Positive Rate (TPR)')\nplt.title('ROC Curve')\nplt.show()\n"

In [87]:
# [Knn : model02]
# 1. 데이터 준비
x = penguins[["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]]
y = penguins["species"].map(lambda x: 0 if x=="Adelie" else 1 if x=="Gentoo" else 2)

# 1-1. 스케일링
x_scale = StandardScaler().fit_transform(x)

# 2. 데이터 나누기
x_train, x_test, y_train, y_test = train_test_split(x_scale, y, test_size=0.2, random_state=1)

# 3. 모델 선택
model02 = KNeighborsClassifier()

# 4. 학습
model02.fit(x_train, y_train)

# 5. 평가
model02.score(x_test, y_test)

# 6. 예측
predict = model02.predict(x_test)
predict

array([0, 0, 0, 1, 1, 1, 0, 0, 0, 2, 0, 1, 0, 2, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 2, 2, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 2, 2,
       1, 0, 0, 2, 1, 0, 1, 0, 1, 1, 0, 2, 0, 1, 1, 2, 0, 0, 1, 0, 0, 0,
       0, 1, 1], dtype=int64)

In [102]:
# 정확도(accuracy)
accuracy02 = accuracy_score(y_test, predict)

# r2 스코어
r202 = r2_score(y_test, predict)

# precision 스코어
precision02 = precision_score(y_test, predict, average=None)

# recall 스코어
recall02 = recall_score(y_test, predict, average=None)

print(accuracy02)
print(r202)
print(precision02)
print(recall02)

# roc_auc_score
predict_proba = model02.predict_proba(x_test)
predict_proba
roc_auc02 = roc_auc_score(y_test, predict_proba, multi_class='ovr')
print("ROC-AUC Score:", roc_auc02)


0.9710144927536232
0.851931330472103
[0.96551724 1.         0.90909091]
[0.96551724 0.96666667 1.        ]
ROC-AUC Score: 0.9991476719267484


In [89]:
# [의사결정트리(depth=3) : model03]
# 1. 데이터 준비
x = penguins[["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]]
y = penguins["species"].map(lambda x: 0 if x=="Adelie" else 1 if x=="Gentoo" else 2)

# 1-1. 스케일링
x_scale = StandardScaler().fit_transform(x)

# 2. 데이터 나누기
x_train, x_test, y_train, y_test = train_test_split(x_scale, y, test_size=0.2, random_state=1)

# 3. 모델 선택
model03 = DecisionTreeClassifier()

# 4. 학습
model03.fit(x_train, y_train)

# 5. 평가
model03.score(x_test, y_test)

# 6. 예측
predict = model03.predict(x_test)
predict

array([0, 0, 0, 1, 1, 1, 0, 0, 0, 2, 0, 1, 0, 2, 0, 1, 1, 1, 1, 1, 1, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 2, 2, 1, 0, 2, 1, 0, 0, 1, 0, 1, 2, 2, 2,
       1, 0, 0, 2, 1, 0, 1, 0, 1, 1, 0, 2, 0, 1, 1, 2, 0, 0, 1, 0, 0, 0,
       0, 1, 1], dtype=int64)

In [100]:
# 정확도(accuracy)
accuracy03 = accuracy_score(y_test, predict)

# r2 스코어
r203 = r2_score(y_test, predict)

# precision 스코어
precision03 = precision_score(y_test, predict, average=None)

# recall 스코어
recall03 = recall_score(y_test, predict, average=None)

print(accuracy03)
print(r203)
print(precision03)
print(recall03)

# roc_auc_score
predict_proba = model03.predict_proba(x_test)
predict_proba
roc_auc03 = roc_auc_score(y_test, predict_proba, multi_class='ovr')
print("ROC-AUC Score:", roc_auc03)

0.9710144927536232
0.851931330472103
[0.96551724 1.         0.90909091]
[0.96551724 0.96666667 1.        ]
ROC-AUC Score: 0.981705792583934
