In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [None]:
# Load the diabetes CSV into a DataFrame.
# NOTE(review): the absolute '/content/...' path looks environment-specific (presumably Colab) — adjust when running elsewhere.
data = pd.read_csv(filepath_or_buffer='/content/diabetes.csv')

In [None]:
# Preview the first five rows.
data.head(n=5)

# 데이터 기초 정보 확인

In [None]:
# (rows, columns) of the loaded frame.
data.shape

In [None]:
# Summary statistics for the columns of `data`.
data.describe()

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Number of missing (NaN) entries per column.
data.isna().sum()

In [None]:
# Column labels of the frame.
data.columns

In [None]:
# Pairwise correlation matrix of the columns.
data.corr()

# 데이터 시각화하기

In [None]:
# Class balance of the target column as a pie chart.
plt.figure(figsize=(4, 3))
outcome_counts = data.Outcome.value_counts()
# value_counts() orders classes by frequency, not by value, so positional labels
# would silently swap if class 1 ever outnumbered class 0. Derive each label from
# the class value it belongs to instead.
# NOTE(review): assumes Outcome 0 means "No" and 1 means "Yes" — confirm against the dataset description.
outcome_names = {0: "No", 1: "Yes"}
outcome_labels = [outcome_names.get(value, str(value)) for value in outcome_counts.index]
outcome_counts.plot.pie(labels=outcome_labels, autopct="%.2f%%")

In [None]:
# Annotated heatmap of the pairwise column correlations, drawn on an explicit axes.
fig, ax = plt.subplots(figsize=(7, 5))
sns.heatmap(data.corr(), annot=True, ax=ax)

In [None]:
# One histogram per column, drawn on a single large figure.
data.hist(figsize=(18,12))
plt.show()

In [None]:
# Box plots of six features on a 2x3 grid, to eyeball outliers.
plt.figure(figsize=(14, 8))
sns.set_style(style='whitegrid')
boxplot_columns = ['Glucose', 'BloodPressure', 'Insulin', 'BMI', 'Age', 'SkinThickness']
for position, column in enumerate(boxplot_columns, start=1):
    plt.subplot(2, 3, position)
    sns.boxplot(x=column, data=data)

In [None]:
# Glucose vs. Insulin, coloured by the Outcome class.
sns.scatterplot(data=data, x='Glucose', y='Insulin', hue='Outcome')

# Dataset 만들기


In [None]:
# Separate the target column from the predictor columns.
y = data['Outcome']
x = data.drop(columns=['Outcome'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(x, y, test_size=0.2, random_state=0)
x_train, x_test, y_train, y_test = split_parts

In [None]:
# Row counts of the four split parts, in the order they were unpacked.
print(*(len(part) for part in (x_train, x_test, y_train, y_test)))

# Logistic Regression Model

In [None]:
# 1. Import the machine-learning model to use.
from sklearn.linear_model import LogisticRegression

# 2-3. Create the model object and train it on the training data
#      (fit() returns the estimator, so both steps chain into one statement).
lr = LogisticRegression(random_state=0).fit(x_train, y_train)

# 4. Predict on the test data with predict().
y_pred = lr.predict(x_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Accuracy on both splits, as percentages.
train_accuracy = lr.score(x_train, y_train) * 100
test_accuracy = accuracy_score(y_test, y_pred) * 100
print("Training Accuracy: ", train_accuracy)
print("Test Accuracy: ", test_accuracy)

# 데이터 전처리

In [None]:
# Zero Glucose readings are replaced by the column median. The median is taken
# over the non-zero readings only, so the placeholder zeros being replaced do not
# drag the replacement value down.
glucose_median = data.loc[data['Glucose'] != 0, 'Glucose'].median()
data['Glucose'] = data['Glucose'].replace(0, glucose_median)

# **[문제 1]**
BloodPressure/SkinThickness/Insulin/BMI의 0인 값들을 각 feature들의 평균값으로 대체하시오.(4점)

In [None]:
# BloodPressure


In [None]:
# SkinThickness


In [None]:
# Insulin


In [None]:
# BMI


In [None]:
# Re-check the summary statistics after the zero-value replacement above.
data.describe()

# 다양한 성능지표

In [None]:
# New logistic regression model, trained on the preprocessed data.
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report, roc_auc_score, RocCurveDisplay

# `x` was created with data.drop(...) before the zero-value replacement, so it is a
# separate frame and the existing split still holds the uncleaned values. Rebuild
# the features and the split from the current `data`, using the same test_size and
# random_state as the first split.
x = data.drop(columns='Outcome')
y = data['Outcome']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

lr_new = LogisticRegression(random_state=0)
lr_new.fit(x_train, y_train)
y_pred = lr_new.predict(x_test)
# ROC AUC is a ranking metric: score it with the positive-class probability
# rather than the thresholded 0/1 predictions.
y_score = lr_new.predict_proba(x_test)[:, 1]

print("Training Accuracy: ", lr_new.score(x_train, y_train)*100)
print("Test Accuracy: ", accuracy_score(y_test,y_pred)*100)
print("\nPrecision: ", precision_score(y_test,y_pred)*100)
print("Recall: ", recall_score(y_test,y_pred)*100)
print("F1 Score: ", f1_score(y_test,y_pred)*100)
print("ROC_AUC Score : ",roc_auc_score(y_test, y_score))
print("\nNew Logistic Regression Classification Report: \n",classification_report(y_test,y_pred))

In [None]:
# Confusion matrix for the most recent predictions in `y_pred`.
plt.figure(figsize=(3, 2))
# scikit-learn's argument order is (y_true, y_pred); with that order rows are the
# actual classes and columns the predicted ones. Swapping them transposes the matrix.
cm = confusion_matrix(y_test, y_pred)
# fmt='d' prints the integer counts as-is instead of the default float formatting.
sns.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree baseline with a fixed seed.
dt = DecisionTreeClassifier(random_state=0)

dt.fit(x_train,y_train)
y_pred=dt.predict(x_test)
# Positive-class probability, used for the ROC AUC instead of the 0/1 predictions.
y_score = dt.predict_proba(x_test)[:, 1]

print("Decision Tree Training Accuracy: {:.2f}%".format(dt.score(x_train, y_train)*100))
print("Decision Tree Accuracy: {:.2f}%".format(accuracy_score(y_test,y_pred)*100))
print("\nPrecision: ", precision_score(y_test,y_pred)*100)
print("Recall: ", recall_score(y_test,y_pred)*100)
print("F1 Score: ", f1_score(y_test,y_pred)*100)
print("ROC_AUC Score : ",roc_auc_score(y_test, y_score))

print("\nDecision Tree Classification Report: \n", classification_report(y_test,y_pred))

plt.figure(figsize = (3,2))
# Argument order is (y_true, y_pred): rows are actual classes, columns predicted.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier, BaggingClassifier

In [None]:
# Random forest with a fixed seed.
rf = RandomForestClassifier(random_state=0)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)
# Positive-class probability, used for the ROC AUC instead of the 0/1 predictions.
y_score = rf.predict_proba(x_test)[:, 1]

print("Random Forest Training Accuracy: {:.2f}%".format(rf.score(x_train, y_train)*100))
print("Random Forest Accuracy: {:.2f}%".format(accuracy_score(y_test,y_pred)*100))
print("\nPrecision: ", precision_score(y_test,y_pred)*100)
print("Recall: ", recall_score(y_test,y_pred)*100)
print("F1 Score: ", f1_score(y_test,y_pred)*100)
print("ROC_AUC Score : ",roc_auc_score(y_test, y_score))
print("\nRandom Forest Classification Report: \n", classification_report(y_test,y_pred))

plt.figure(figsize = (3,2))
# Argument order is (y_true, y_pred): rows are actual classes, columns predicted.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Bagging Model

In [None]:
# Bagging ensemble of decision trees.
tree = DecisionTreeClassifier()
# The base learner is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, while the first
# positional slot works under either name.
bagging = BaggingClassifier(tree, n_estimators=1500, random_state=42)
bagging.fit(x_train, y_train)
y_pred = bagging.predict(x_test)
# Positive-class probability, used for the ROC AUC instead of the 0/1 predictions.
y_score = bagging.predict_proba(x_test)[:, 1]

print("Bagging Training Accuracy: {:.2f}%".format(bagging.score(x_train, y_train)*100))
print("Bagging Test Accuracy: {:.2f}%".format(accuracy_score(y_test,y_pred)*100))

print("\nPrecision: ", precision_score(y_test,y_pred)*100)
print("Recall: ", recall_score(y_test,y_pred)*100)
print("F1 Score: ", f1_score(y_test,y_pred)*100)
print("ROC_AUC Score : ",roc_auc_score(y_test, y_score))
print("\nBagging Classification Report: \n", classification_report(y_test,y_pred))
# Argument order is (y_true, y_pred): rows are actual classes, columns predicted.
print(f"\nCONFUSION MATRIX:\n{confusion_matrix(y_test, y_pred)}")

# AdaBoost

In [None]:
# AdaBoost with 200 boosting rounds and a fixed seed.
ab = AdaBoostClassifier(n_estimators=200, random_state=0)
ab.fit(x_train, y_train)
y_pred = ab.predict(x_test)
# Positive-class probability, used for the ROC AUC instead of the 0/1 predictions.
y_score = ab.predict_proba(x_test)[:, 1]

print("AdaBoost Training Accuracy: {:.2f}%".format(ab.score(x_train, y_train)*100))
print("AdaBoost Test Accuracy: {:.2f}%".format(accuracy_score(y_test,y_pred)*100))
print("\nPrecision: ", precision_score(y_test,y_pred)*100)
print("Recall: ", recall_score(y_test,y_pred)*100)
print("F1 Score: ", f1_score(y_test,y_pred)*100)
print("ROC_AUC Score : ",roc_auc_score(y_test, y_score))
print("AdaBoost Classification Report: \n", classification_report(y_test,y_pred))
# Argument order is (y_true, y_pred): rows are actual classes, columns predicted.
print(f"CONFUSION MATRIX:\n{confusion_matrix(y_test, y_pred)}")

# Gradient Boosting Machine

# **[문제 2]**
Gradient Boosting Machine 모델을 생성하고 accuracy와 confusion matrix를 구하는 프로그램을 완성하시오.(6점)

In [None]:
# NOTE(review): the ROC cell further down calls roc_auc(gbm, ...), so the fitted
# model created here presumably needs to be named `gbm` — confirm.
# Create the machine-learning model object


# Train on the training data with fit()


# Predict on the test data with predict()


# Print the performance metrics


In [None]:
def roc_auc(model, x_train, x_test, y_train, y_test):
    """Draw two ROC curves for `model` on the test split, side by side.

    The left panel is built with RocCurveDisplay.from_estimator(model, x_test, y_test).
    The right panel is built with RocCurveDisplay.from_predictions, fed with the
    output of model.predict(x_test).

    `x_train` and `y_train` are accepted but not used by the body.
    """
    predicted = model.predict(x_test)
    fig, (ax_estimator, ax_predictions) = plt.subplots(1, 2, figsize=(20,5))
    ax_estimator.set_title('ROC-AUC from estimator')
    ax_predictions.set_title('ROC-AUC from predictions')
    RocCurveDisplay.from_estimator(model, x_test, y_test, ax=ax_estimator)
    RocCurveDisplay.from_predictions(y_test, predicted, ax=ax_predictions)
    plt.show()

In [None]:
# NOTE(review): no assignment to `gbm` is visible above this cell (the only one in
# view is in the later Voting cell), so this presumably relies on the exercise
# cell above defining and fitting `gbm` — confirm before running top to bottom.
roc_auc(gbm, x_train, x_test, y_train, y_test)

# Voting model

In [None]:
# Voting ensemble over four different model families.
# Each member gets random_state=0, like the standalone models in this notebook,
# so repeated runs of this cell give the same result.
estimators = []
log = LogisticRegression(random_state=0)
estimators.append(('Logistic', log))

tree = DecisionTreeClassifier(random_state=0)
estimators.append(('Tree', tree))

ada = AdaBoostClassifier(random_state=0)
estimators.append(('AdaBoost', ada))

gbm = GradientBoostingClassifier(random_state=0)
estimators.append(('GBM', gbm))

voting = VotingClassifier(estimators=estimators)
voting.fit(x_train, y_train)
y_pred = voting.predict(x_test)

print("Voting Training Accuracy: {:.2f}%".format(voting.score(x_train, y_train)*100))
print("Voting Test Accuracy: {:.2f}%".format(accuracy_score(y_test,y_pred)*100))
print("\nPrecision: ", precision_score(y_test,y_pred)*100)
print("Recall: ", recall_score(y_test,y_pred)*100)
print("F1 Score: ", f1_score(y_test,y_pred)*100)
# NOTE(review): VotingClassifier is built with its default voting mode here, which
# presumably exposes no predict_proba, so the AUC below is computed from the 0/1
# predictions — confirm, or switch to voting='soft' and score with probabilities.
print("ROC_AUC Score : ",roc_auc_score(y_test, y_pred))
print("\nVoting Classification Report: \n", classification_report(y_test,y_pred))

# Argument order is (y_true, y_pred): rows are actual classes, columns predicted.
print(f"CONFUSION MATRIX:\n{confusion_matrix(y_test, y_pred)}")