<a href="https://colab.research.google.com/github/karim-mammadov/Kaggle-Datasets-MyMLProject/blob/main/diabets_health.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files
# Bind the return value instead of leaving the call as the cell's last
# expression: files.upload() returns a {filename: bytes} mapping, and echoing it
# would print the contents of kaggle.json (the API key) into the saved output.
uploaded = files.upload()

In [None]:
# Install the uploaded Kaggle API token under ~/.kaggle with owner-only
# read/write permissions.
# NOTE(review): `mv` removes kaggle.json from the working directory, so
# re-running this cell without uploading the file again fails on that line.
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the dataset archive with the Kaggle CLI.
# NOTE(review): the unzip cell below opens the archive under /content, so this
# presumably runs with /content as the working directory (Colab) — confirm.
!kaggle datasets download alexteboul/diabetes-health-indicators-dataset

In [None]:
import zipfile
# Unpack the downloaded Kaggle archive. No target directory is passed, so
# extractall() writes the members into the current working directory.
with zipfile.ZipFile(
    "/content/diabetes-health-indicators-dataset.zip", mode="r"
) as archive:
    archive.extractall()

In [None]:
import pandas as pd

# DATA CLEANING and VISUALIZATION

In [None]:
# Load the extracted CSV into `df`, the working DataFrame for the cells below.
# NOTE(review): the file name suggests a 50/50 class split; the value_counts
# cell further down is where that can be checked.
df = pd.read_csv('/content/diabetes_binary_5050split_health_indicators_BRFSS2015.csv')

In [None]:
# Preview the loaded frame.
df

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Count rows that exactly repeat an earlier row.
# NOTE(review): duplicates are only counted here; no visible cell drops them.
df.duplicated().sum()

In [None]:
# Class balance of the target column.
print(df['Diabetes_binary'].value_counts())

In [None]:
# Per-column summary statistics.
df.describe()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Pairwise correlation table restricted to the numeric columns.
df.corr(numeric_only=True)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Correlation matrix of the numeric columns, drawn as an annotated heatmap.
# (The former bare `df_corr` line was dropped: an expression that is not the
# last statement of a cell is never displayed, so it had no effect.)
df_corr = df.corr(numeric_only=True)

plt.figure(figsize=(16,12))
sns.heatmap(df_corr, annot=True)
plt.title("Feature correlation matrix")
# End on plt.show() so the cell renders the figure instead of echoing the
# Axes object returned by sns.heatmap.
plt.show()

In [None]:
# Share of each value of `Sex`.
# Labels are taken from the value_counts() index so that every wedge is paired
# with its own category: value_counts() orders by frequency while unique()
# orders by first appearance, so the two sequences need not line up.
sex_counts = df['Sex'].value_counts()
plt.pie(sex_counts, labels=sex_counts.index, autopct='%1.1f%%')
plt.show()

In [None]:
# Share of each class of the target column.
# Labels are taken from the value_counts() index so that every wedge is paired
# with its own class: value_counts() orders by frequency while unique() orders
# by first appearance, so the two sequences need not line up.
target_counts = df['Diabetes_binary'].value_counts()
plt.pie(target_counts, labels=target_counts.index, autopct='%1.1f%%')
plt.show()

In [None]:
# Show the frame once more before the outlier filter below rebinds `df`.
df

In [None]:
# Columns screened with the 1.5 * IQR fence rule.
numeric_columns = ['BMI', 'MentHlth', 'PhysHlth', 'Education', 'Income']

# Per-column quartiles and interquartile range.
Q1 = df[numeric_columns].quantile(0.25)
Q3 = df[numeric_columns].quantile(0.75)
IQR = Q3 - Q1

# Fences outside of which a value is treated as an outlier.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Flag every cell of the screened columns that falls outside its column's fences.
screened = df[numeric_columns]
is_outlier = (screened < lower_fence) | (screened > upper_fence)

# Keep only the rows in which no screened column was flagged.
# NOTE(review): this rebinds `df`, so re-running the cell filters the already
# filtered frame again with freshly computed fences.
df = df[~is_outlier.any(axis=1)]

print("Yeni ölçü (outlier təmizlənmiş):", df.shape)

In [None]:
# categorical_features = ['Sex', 'Age', 'HighBP', 'HighChol', 'CholCheck', 'Smoker', 'HeartDiseaseorAttack',
#                         'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost',
#                         'GenHlth', 'DiffWalk']

# numeric_features = ['BMI', 'MentHlth', 'PhysHlth', 'Education', 'Income']

# Build a Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Feature matrix: every column except the target.
X = df.drop(columns=["Diabetes_binary"])
# Target vector, copied so it is independent of `df`.
y = df.loc[:, "Diabetes_binary"].copy()

In [None]:
# Partition the feature columns by dtype for the ColumnTransformer below:
# numeric dtypes go to the scaling branch, everything else to the one-hot branch.
# If every column of X is numeric, `categorical_features` is simply empty.
numeric_features = X.select_dtypes(include="number").columns
categorical_features = X.select_dtypes(exclude="number").columns

In [None]:
# Hold out 20% of the rows for testing with a fixed seed.
# stratify=y keeps the class proportions identical in the train and test
# splits, so metrics on the two sets are computed on comparable class mixes.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Numeric branch: fill gaps with the column median, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its branch.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Baseline model: preprocessing followed by a random forest with default
# hyper-parameters and a fixed seed.
full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

In [None]:
# Fit the preprocessing steps and the default random forest on the training split.
full_pipeline.fit(X_train, y_train)

In [None]:
# (train score, test score) of the baseline pipeline; comparing the two shows
# how much the model's fit on the training rows carries over to held-out rows.
full_pipeline.score(X_train,y_train), full_pipeline.score(X_test,y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Second random forest with tree depth capped at 7 (same seed, 100 trees).
shallow_forest = RandomForestClassifier(
    n_estimators=100,
    max_depth=7,
    random_state=42,
)

# NOTE(review): `preprocessor` is the same object already used by
# `full_pipeline`, so fitting here refits that shared instance (on the same
# training data) — confirm this sharing is intended.
full_pipeline_2 = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', shallow_forest),
])

full_pipeline_2.fit(X_train, y_train)

# Hard class predictions of the depth-limited forest on the held-out split.
y_pred_rf = full_pipeline_2.predict(X_test)

In [None]:
# (train score, test score) of the depth-limited forest pipeline.
full_pipeline_2.score(X_train,y_train), full_pipeline_2.score(X_test,y_test)

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import GridSearchCV


# Rebuild the preprocessing steps so the XGBoost pipeline gets its own
# transformer instances rather than reusing objects created earlier.

# Numeric branch: median imputation followed by standardization.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical branch: most-frequent imputation followed by one-hot encoding.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

full_pipeline_xgb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    # `use_label_encoder` is deliberately not passed: the option is deprecated
    # and no longer used by current XGBoost releases, where supplying it only
    # triggers an "unused parameter" warning on every fit of the grid search.
    ('classifier', XGBClassifier(eval_metric='logloss', random_state=42))
])

In [None]:
# Search space for the XGBoost step; the `classifier__` prefix addresses the
# pipeline step named 'classifier'. 2 * 3 * 3 * 2 = 36 combinations.
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [3, 5, 7],
    'classifier__learning_rate': [0.01, 0.1, 0.2],
    'classifier__subsample': [0.8, 1.0],
}

# Exhaustive search scored by F1 with 3-fold cross-validation, run on all
# available cores with progress messages enabled.
grid_search = GridSearchCV(
    estimator=full_pipeline_xgb,
    param_grid=param_grid,
    scoring='f1',
    cv=3,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the cross-validated search over `param_grid` on the training split.
grid_search.fit(X_train, y_train)

In [None]:
# Report the winning hyper-parameters and keep the corresponding fitted pipeline.
print("Ən yaxşı parametrlər:", grid_search.best_params_)
best_model = grid_search.best_estimator_

# Positive-class scores (second predict_proba column) and hard predictions on
# the held-out split.
y_proba = best_model.predict_proba(X_test)[:, 1]
y_pred = best_model.predict(X_test)

# Compute the three summaries first, then print them in the original order.
test_report = classification_report(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_auc = roc_auc_score(y_test, y_proba)

print("Classification Report:\n", test_report)
print("Confusion Matrix:\n", test_confusion)
print("ROC-AUC Score:", test_auc)


In [None]:
# Positive-class scores of the tuned model on the held-out split (second
# predict_proba column); consumed by the threshold analysis below.
# NOTE(review): the preceding evaluation cell already computes the same value.
y_proba = best_model.predict_proba(X_test)[:, 1]

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

def evaluate_threshold(thresh, y_true=None, proba=None):
    """Print classification metrics for a custom probability cut-off.

    Parameters
    ----------
    thresh : float
        Scores greater than or equal to this value are labelled 1, all
        others 0.
    y_true : array-like, optional
        Ground-truth labels. Defaults to the notebook-level ``y_test``.
    proba : array-like, optional
        Positive-class scores aligned with ``y_true``. Defaults to the
        notebook-level ``y_proba``.

    Returns
    -------
    None
        The classification report and confusion matrix are printed, not
        returned.
    """
    # Fall back to the notebook globals so existing one-argument calls behave
    # exactly as before; passing the arrays explicitly lets the same helper
    # evaluate any other model or split.
    if y_true is None:
        y_true = y_test
    if proba is None:
        proba = y_proba

    # np.asarray lets plain lists be compared element-wise as well.
    y_pred_thresh = (np.asarray(proba) >= thresh).astype(int)
    print(f"Threshold = {thresh}")
    print(classification_report(y_true, y_pred_thresh))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred_thresh))
    print("-" * 50)


# Print the metrics at the default 0.5 cut-off and at progressively lower ones.
for cutoff in (0.5, 0.4, 0.35, 0.3, 0.25):
    evaluate_threshold(cutoff)

In [None]:
from sklearn.metrics import roc_curve

# ROC operating points of the tuned model on the held-out split.
fpr, tpr, thresholds = roc_curve(y_test, y_proba)

import matplotlib.pyplot as plt

# roc_curve puts a sentinel first in `thresholds` for the "nothing is predicted
# positive" point (np.inf in recent scikit-learn, max(score) + 1 in older
# releases). It is not a usable probability cut-off and would distort the
# threshold axis, so the curves are drawn from the second point onwards.
plot_thresholds = thresholds[1:]
plot_tpr = tpr[1:]
plot_one_minus_fpr = 1 - fpr[1:]

plt.figure(figsize=(10, 6))
plt.plot(plot_thresholds, plot_tpr, label='True Positive Rate (Recall)')
plt.plot(plot_thresholds, plot_one_minus_fpr, label='1 - False Positive Rate')
plt.xlabel("Threshold")
plt.ylabel("Rate")
plt.title("Threshold vs TPR & 1-FPR")
plt.grid()
plt.legend()
plt.show()