# Klasyfikacja ryzyka cukrzycy na podstawie wskaźników zdrowotnych
**Autorzy:** *Imię Nazwisko*

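Projekt realizowany w ramach przedmiotu **Metody Analizy Danych**. Celem jest predykcja wystąpienia cukrzycy na podstawie zestawu wskaźników zdrowotnych pochodzących z badania BRFSS 2015. Trzyklasowa zmienna celu `Diabetes_012` jest sprowadzana do postaci binarnej `Diabetes_binary` (0 dla wartości 0, 1 dla wartości większych od 0).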

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# NOTE(review): this silences every warning, library deprecation notices included.
warnings.filterwarnings('ignore')


In [None]:
# Read the health-indicator CSV into a DataFrame, report its shape and
# preview the first rows as the cell output.
df = pd.read_csv(filepath_or_buffer="diabetes_012_health_indicators_BRFSS2015.csv")
dataset_shape = df.shape
print("Rozmiar danych:", dataset_shape)
df.head(n=5)

In [None]:
# Mapowanie 3-klasowego celu na zmienną binarną
df["Diabetes_binary"] = df["Diabetes_012"].map(lambda x: 1 if x > 0 else 0)

# Obsługa outlierów w BMI
q_low = df["BMI"].quantile(0.01)
q_hi = df["BMI"].quantile(0.99)
df = df[(df["BMI"] >= q_low) & (df["BMI"] <= q_hi)]

# Dane i etykiety
X = df.drop(columns=["Diabetes_012", "Diabetes_binary"])
y = df["Diabetes_binary"]

# Standaryzacja zmiennych ilościowych
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("Dane przygotowane.")

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, stratify=y, random_state=42)

In [None]:
# Base classifiers that are later evaluated individually and averaged into
# the hybrid model.
logreg = LogisticRegression(max_iter=1000)
rf = RandomForestClassifier(n_estimators=100, max_features='sqrt', random_state=42)
# With probability=True, SVC shuffles the data to produce its probability
# estimates; a fixed random_state makes those estimates reproducible, in line
# with the seeded random forest above.
# NOTE(review): SVC training time grows quickly with the number of rows —
# confirm it is feasible on the full training set.
svm = SVC(probability=True, random_state=42)
knn = KNeighborsClassifier(n_neighbors=5)

# Display name -> estimator; insertion order is the training order.
models = {'Logistic': logreg, 'Random Forest': rf, 'SVM': svm, 'kNN': knn}

for name, model in models.items():
    model.fit(X_train, y_train)
    print(f"{name} trained.")

In [None]:
# Column 1 of predict_proba (the positive class) for each fitted model on the
# hold-out set, evaluated in the order: logistic regression, random forest,
# SVM, kNN.
y_proba_log, y_proba_rf, y_proba_svm, y_proba_knn = (
    clf.predict_proba(X_test)[:, 1] for clf in (logreg, rf, svm, knn)
)

# Hybrid model: unweighted mean of the four probabilities; a sample is
# labelled 1 only when that mean is strictly above 0.5.
proba_sum = y_proba_log + y_proba_rf + y_proba_svm + y_proba_knn
y_proba_hybrid = proba_sum / 4
y_pred_hybrid = np.where(y_proba_hybrid > 0.5, 1, 0)

In [None]:
def print_metrics(y_true, y_pred, y_proba, model_name):
    """Print a short evaluation report for one model.

    Args:
        y_true: Ground-truth labels.
        y_pred: Hard label predictions; used for accuracy, F1 and the
            confusion matrix.
        y_proba: Scores passed to ``roc_auc_score``.
        model_name: Title shown in the report header.

    Returns:
        None. The report is written to standard output.
    """
    print(f"\n--- {model_name} ---")

    # (label, metric function, estimate fed to it), printed in this order.
    scalar_metrics = (
        ("Accuracy:", accuracy_score, y_pred),
        ("F1 Score:", f1_score, y_pred),
        ("ROC AUC:", roc_auc_score, y_proba),
    )
    for label, metric, estimate in scalar_metrics:
        print(label, metric(y_true, estimate))

    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))

# Report every base model on the hold-out set, then the averaged hybrid.
# Each entry: (report title, fitted estimator, positive-class probabilities).
individual_reports = (
    ("Logistic Regression", logreg, y_proba_log),
    ("Random Forest", rf, y_proba_rf),
    ("SVM", svm, y_proba_svm),
    ("kNN", knn, y_proba_knn),
)
for report_title, fitted_clf, positive_proba in individual_reports:
    print_metrics(y_test, fitted_clf.predict(X_test), positive_proba, report_title)

# The hybrid has no estimator object; its labels were thresholded beforehand.
print_metrics(y_test, y_pred_hybrid, y_proba_hybrid, "Hybrid Model")

In [None]:
# 5-fold stratified cross-validation of logistic regression, scored by ROC AUC.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# The scaler is placed in a pipeline with the classifier and the raw features
# are cross-validated, so the scaler is re-fitted on the training part of
# every fold and validation rows never influence the preprocessing.
# cross_val_score clones the estimator it receives, so the already fitted
# `logreg` is left untouched.
cv_model = make_pipeline(StandardScaler(), logreg)
cv_score = cross_val_score(cv_model, X, y, cv=cv, scoring='roc_auc')
print("Średni wynik ROC AUC (Logistic Regression, CV):", cv_score.mean())

In [None]:
# Balance the classes with SMOTE using the training split only. Resampling
# the full data set would create synthetic samples from rows that belong to
# the hold-out set, contaminating any later evaluation on X_test.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)
print("Rozmiar po SMOTE:", X_res.shape, y_res.shape)

In [None]:
# Bagging ensemble of 50 decision trees, evaluated on the hold-out set.
# The base estimator is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 (the old name was
# removed in 1.4); the positional form works with both.
bagging = BaggingClassifier(DecisionTreeClassifier(), n_estimators=50, random_state=42)
bagging.fit(X_train, y_train)

# Column 1 of predict_proba is the positive-class probability.
y_proba_bag = bagging.predict_proba(X_test)[:, 1]
print_metrics(y_test, bagging.predict(X_test), y_proba_bag, "Bagging Classifier")