In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the diabetes CSV from the Colab content directory and preview
# the first ten rows as a quick sanity check of the columns.
df = pd.read_csv(filepath_or_buffer='/content/diabetes.csv')
df.head(n=10)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression

# Separate the "Outcome" target column from the predictor columns.
y = df["Outcome"]
X = df.drop(columns=["Outcome"])

# Hold out 25% of the rows for testing; random_state pins the shuffle
# so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Logistic regression: fit on the training split, then report hold-out
# metrics and draw the confusion matrix as a heatmap.
LR = LogisticRegression(max_iter=1000, random_state=42)
LR.fit(X_train, y_train)

# Hard class predictions for the test split
y_pred = LR.predict(X_test)

# Training accuracy comes from the estimator itself; the remaining scores
# compare the test labels with the predictions above.
print(f'train_accuracy: {LR.score(X_train, y_train):.4f}')
for metric_label, metric_fn in (
    ('accuracy', accuracy_score),
    ('precision', precision_score),
    ('recall', recall_score),
    ('f1-score', f1_score),
):
    print(f'{metric_label}: {metric_fn(y_test, y_pred):.4f}')

# Confusion matrix of the test predictions (rows: true label, columns: predicted)
conf_matrix = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(3, 2))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
plt.show()

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree: fit on the training split, then report hold-out metrics
# and draw the confusion matrix as a heatmap.
# random_state is fixed (matching the other models in this notebook) because
# the tree's split search is randomized; without a seed the reported metrics
# change from one run to the next.
dt_clf = DecisionTreeClassifier(random_state=42)
dt_clf.fit(X_train, y_train)

# Hard class predictions for the test split
dt_pred = dt_clf.predict(X_test)

print('train_accuracy: {:.4f}'.format(dt_clf.score(X_train, y_train)))
print('accuracy: {:.4f}'.format(accuracy_score(y_test, dt_pred)))
print('precision: {:.4f}'.format(precision_score(y_test, dt_pred)))
print('recall: {:.4f}'.format(recall_score(y_test, dt_pred)))
print('f1-score: {:.4f}'.format(f1_score(y_test, dt_pred)))

# Confusion matrix of the test predictions (rows: true label, columns: predicted)
conf_matrix = confusion_matrix(y_test, dt_pred)
plt.figure(figsize=(3, 2))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

In [None]:
# evaluate Method
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
def _print_split_report(split_title, y_true, y_pred):
    """Print the confusion matrix, accuracy and classification report for one data split."""
    print(f"{split_title}: \n===============================")
    clf_report = pd.DataFrame(classification_report(y_true, y_pred, output_dict=True))
    print(f"CONFUSION MATRIX:\n{confusion_matrix(y_true, y_pred)}")
    print(f"ACCURACY SCORE:\n{accuracy_score(y_true, y_pred):.4f}")
    print(f"CLASSIFICATION REPORT:\n{clf_report}")


def evaluate(model, X_train, X_test, y_train, y_test):
    """Print train and test classification diagnostics for a fitted model.

    For each split the confusion matrix, the accuracy (4 decimals) and the
    classification report (rendered as a DataFrame) are printed.

    Parameters
    ----------
    model : fitted estimator exposing ``predict(X)``.
    X_train, X_test : feature sets for the training and test splits.
    y_train, y_test : true labels matching ``X_train`` / ``X_test``.

    Returns
    -------
    None. Results are only printed.
    """
    y_test_pred = model.predict(X_test)
    y_train_pred = model.predict(X_train)

    _print_split_report("TRAINING RESULTS", y_train, y_train_pred)
    # The leading newline separates the test section from the training section.
    _print_split_report("\nTESTING RESULTS", y_test, y_test_pred)

In [None]:
# Train/test diagnostics for the logistic regression model
evaluate(model=LR, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

In [None]:
# Train/test diagnostics for the decision tree model
evaluate(model=dt_clf, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

In [None]:
# Random Forest Model
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees and a fixed seed; fit on the training split
# and print train/test diagnostics.
rf_settings = {"n_estimators": 100, "random_state": 42}
RF_clf = RandomForestClassifier(**rf_settings)
RF_clf.fit(X_train, y_train)
evaluate(RF_clf, X_train, X_test, y_train, y_test)

In [None]:
# Same random forest with 1000 trees. RF_clf is rebound here, so after this
# cell the name refers to the 1000-tree model only.
RF_clf = RandomForestClassifier(
    n_estimators=1000,
    random_state=42,
)
RF_clf.fit(X=X_train, y=y_train)
evaluate(RF_clf, X_train, X_test, y_train, y_test)

In [None]:
# AdaBoost Model
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with up to 1000 boosting rounds and a fixed seed; fit on the
# training split and print train/test diagnostics.
ada_settings = {"n_estimators": 1000, "random_state": 42}
Ada_clf = AdaBoostClassifier(**ada_settings)
Ada_clf.fit(X_train, y_train)
evaluate(Ada_clf, X_train, X_test, y_train, y_test)

In [None]:
# GBM Model (max_depth is not passed, so the library default is used)
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting: 1000 stages with a 0.01 learning rate and a fixed seed;
# fit on the training split and print train/test diagnostics.
GBM_clf = GradientBoostingClassifier(
    n_estimators=1000,
    learning_rate=0.01,
    random_state=42,
)
GBM_clf.fit(X=X_train, y=y_train)
evaluate(GBM_clf, X_train, X_test, y_train, y_test)

In [None]:
# XGBoost Model
from xgboost import XGBClassifier

# XGBoost: 1000 boosting rounds with a 0.01 learning rate and a fixed seed;
# fit on the training split and print train/test diagnostics.
xgb_settings = {
    "n_estimators": 1000,
    "learning_rate": 0.01,
    "random_state": 42,
}
XGB_clf = XGBClassifier(**xgb_settings)
XGB_clf.fit(X_train, y_train)
evaluate(XGB_clf, X_train, X_test, y_train, y_test)

In [None]:
# LightGBM Model
from lightgbm import LGBMClassifier

# LightGBM: 1000 rounds at a 0.01 learning rate with shallow trees
# (max_depth=3, num_leaves=8) and at least 20 samples per leaf.
# verbose=-1 is passed to quiet the library's logging.
lgbm_settings = {
    "n_estimators": 1000,
    "learning_rate": 0.01,
    "random_state": 42,
    "max_depth": 3,
    "num_leaves": 8,
    "min_child_samples": 20,
    "verbose": -1,
}
LGBM_clf = LGBMClassifier(**lgbm_settings)
LGBM_clf.fit(X_train, y_train)
evaluate(LGBM_clf, X_train, X_test, y_train, y_test)

In [None]:
!pip install catboost

In [None]:
from catboost import CatBoostClassifier

# CatBoost: 1000 trees of depth 3 at a 0.01 learning rate with a fixed seed;
# verbose=0 turns off the per-iteration training log.
catboost_settings = {
    "n_estimators": 1000,
    "learning_rate": 0.01,
    "random_state": 42,
    "max_depth": 3,
    "verbose": 0,
}
CB_clf = CatBoostClassifier(**catboost_settings)
CB_clf.fit(X_train, y_train)
evaluate(CB_clf, X_train, X_test, y_train, y_test)

In [None]:
import shap
import matplotlib.pyplot as plt

# Step 1: compute SHAP values for the test split with a TreeExplainer
# built on the fitted CatBoost model
explainer = shap.TreeExplainer(CB_clf)
shap_values = explainer.shap_values(X_test)

# Step 2: summary plot over the whole test split
plt.title("SHAP Summary Plot (X_test)")
shap.summary_plot(shap_values, features=X_test)

# Step 3: bar chart of feature importance
plt.title("SHAP Feature Importance (Bar)")
shap.summary_plot(shap_values, features=X_test, plot_type="bar")

# Step 4: force plot explaining a single prediction (row position sample_idx)
sample_idx = 0
shap.initjs()
shap.force_plot(
    base_value=explainer.expected_value,
    shap_values=shap_values[sample_idx],
    features=X_test.iloc[sample_idx],
)


In [None]:
# 1. Build a TreeExplainer for the fitted CatBoost model
explainer = shap.TreeExplainer(CB_clf)
# 2. Compute SHAP values for every row of the test split
shap_values = explainer.shap_values(X_test)

# 3. Decision plot: the full SHAP matrix is passed, so one trace is drawn
#    per test sample (not just the first one).
shap.decision_plot(
    explainer.expected_value,     # single base value shared by all traces
    shap_values,                  # SHAP values for all test rows
    X_test,                       # test data (feature values / names)
    # decision_plot orders features by ascending importance, so the five most
    # important features are the LAST five; slice(None, 5) selected the five
    # least important ones instead.
    feature_display_range=slice(-1, -6, -1),
    show=True
)

In [None]:

# Explain the fitted CatBoost model on the test split
explainer = shap.TreeExplainer(CB_clf)
shap_values = explainer.shap_values(X_test)

# Column names used to label the features in the plot
feature_names = list(X_test.columns)

# Decision plot for a single observation: the row at position 0 of X_test,
# so only one trace is drawn.
first_row_shap = shap_values[0]
first_row_features = X_test.iloc[0]
shap.decision_plot(
    explainer.expected_value,
    first_row_shap,
    first_row_features,
    feature_names=feature_names,
)

In [None]:
# Explain the fitted CatBoost model on the test split
explainer = shap.TreeExplainer(CB_clf)
shap_values = explainer.shap_values(X_test)

# Column names used to label the features in the plot
feature_names = list(X_test.columns)

# Decision plot for a single observation: the row at position 191 of X_test
# (not the first sample).
# NOTE(review): 191 is hard-coded and requires X_test to have at least
# 192 rows — confirm against the dataset size.
row_position = 191
shap.decision_plot(
    explainer.expected_value,
    shap_values[row_position],
    X_test.iloc[row_position],
    feature_names=feature_names,
)

In [None]:
#KNN Model
from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbours with k=3 and Euclidean distance; fit on the training
# split and print train/test diagnostics.
knn_settings = {"n_neighbors": 3, "metric": 'euclidean'}
KNN_clf = KNeighborsClassifier(**knn_settings)
KNN_clf.fit(X_train, y_train)
evaluate(KNN_clf, X_train, X_test, y_train, y_test)

In [None]:
#KNN Model
from sklearn.neighbors import KNeighborsClassifier

# Same k-nearest-neighbours setup with k=9. KNN_clf is rebound here, so after
# this cell the name refers to the k=9 model only.
KNN_clf = KNeighborsClassifier(
    n_neighbors=9,
    metric='euclidean',
)
KNN_clf.fit(X=X_train, y=y_train)
evaluate(KNN_clf, X_train, X_test, y_train, y_test)

In [None]:
#SVM Model
from sklearn.svm import SVC

# Support vector classifier with an RBF kernel and a fixed seed; fit on the
# training split and print train/test diagnostics.
svm_settings = {"kernel": 'rbf', "random_state": 42}
SVM_clf = SVC(**svm_settings)
SVM_clf.fit(X_train, y_train)
evaluate(SVM_clf, X_train, X_test, y_train, y_test)

In [None]:
# K-means Model
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Reload the dataset and keep only the predictor columns; the "Outcome"
# label is dropped because clustering is unsupervised.
df = pd.read_csv('/content/diabetes.csv')
X = df.drop(columns=["Outcome"])

# Standardize the features before clustering.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Partition the standardized rows into three clusters (seeded).
kmeans = KMeans(n_clusters=3, random_state=42)
cluster_labels = kmeans.fit_predict(X_scaled)

# Attach each row's cluster id to the original-scale features.
results_df = X.assign(Cluster=cluster_labels)

# Per-cluster feature means, in the original units.
cluster_means = results_df.groupby('Cluster').mean()
print("\n 클러스터별 평균값:")
print(cluster_means)

# Cluster centres as fitted by KMeans; these are in standardized units
# because the model was fit on X_scaled.
cluster_centers_df = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)
print("\n 클러스터별 중심점:")
print(cluster_centers_df)

In [None]:
# One boxplot per feature, split by cluster, laid out on a 2x4 grid.
plt.figure(figsize=(15, 6))
for panel_no, feature in enumerate(X.columns, start=1):
    # Subplot positions are 1-based.
    panel_ax = plt.subplot(2, 4, panel_no)
    sns.boxplot(data=results_df, x='Cluster', y=feature, ax=panel_ax)
    plt.title(feature)
    plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Scatter of Pregnancies (x) against Age (y), coloured by K-means cluster.
plt.figure(figsize=(9, 7))

scatter = plt.scatter(results_df['Pregnancies'], results_df['Age'], c=results_df['Cluster'], cmap='viridis')
# Axis labels follow the plotted data: the first scatter argument is the
# x-axis, the second is the y-axis (they were previously swapped).
plt.xlabel('Pregnancies')
plt.ylabel('Age')
plt.title('K-means Clustering')
plt.colorbar(scatter)
plt.show()

In [None]:
# Scatter of Insulin (x) against Glucose (y), coloured by K-means cluster.
fig, ax = plt.subplots(figsize=(10, 7))

scatter = ax.scatter(X['Insulin'], X['Glucose'], c=cluster_labels, cmap='viridis')
ax.set_xlabel('Insulin')
ax.set_ylabel('Glucose')
ax.set_title('Clusters: Insulin vs Glucose')
# Colour bar mapping the point colours to cluster ids
fig.colorbar(scatter, ax=ax)
plt.show()