In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# -----------------------------
# Load dataset
# -----------------------------
df = pd.read_csv("Dataset/diabetes.csv")

# -----------------------------
# Replace 0 with NaN (medical logic)
# -----------------------------
# In these columns a stored 0 is treated as "value missing" rather than a
# real measurement, so it is converted to NaN before imputation.
cols_with_zero = [
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
]

df[cols_with_zero] = df[cols_with_zero].replace(0, np.nan)

# -----------------------------
# Fill missing values with median
# -----------------------------
# Assign the filled columns back explicitly. The previous
# `df[col].fillna(..., inplace=True)` form is chained in-place assignment:
# it emits a FutureWarning on pandas 2.x and does not modify `df` at all once
# Copy-on-Write is active, which would leave the NaNs in place.
# NOTE(review): the medians are computed on the full dataset, i.e. before the
# train/test split below, so test rows influence the imputed values. Consider
# deriving them from the training split only.
column_medians = df[cols_with_zero].median()
df[cols_with_zero] = df[cols_with_zero].fillna(column_medians)

# Fail fast if imputation left gaps (e.g. a column with no valid values).
assert not df[cols_with_zero].isna().any().any(), "imputation left missing values"

# -----------------------------
# Features & Target
# -----------------------------
X = df.drop("Outcome", axis=1)
y = df["Outcome"]

# -----------------------------
# Train-test split
# -----------------------------
# Stratify on the target so both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# -----------------------------
# Feature scaling
# -----------------------------
scaler = StandardScaler()

# Fit the scaler on the training split only, then reuse its statistics
# for the test split.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Diabetes dataset cleaned and prepared")
print("X_train shape:", X_train_scaled.shape)
print("X_test shape:", X_test_scaled.shape)


Diabetes dataset cleaned and prepared
X_train shape: (614, 8)
X_test shape: (154, 8)


In [16]:
import pandas as pd

# Wrap the split targets as Series so value_counts() is available
# regardless of the container type returned by the split.
y_train_series = pd.Series(y_train)
y_test_series = pd.Series(y_test)

# For each split, print the absolute class counts followed by the class
# shares in percent. Every heading except the very first one is preceded
# by a blank line.
for split_position, (split_label, split_targets) in enumerate(
    (("Training", y_train_series), ("Test", y_test_series))
):
    heading_gap = "\n" if split_position else ""
    print(f"{heading_gap}{split_label} set distribution:")
    print(split_targets.value_counts())
    print(f"\n{split_label} set percentage:")
    print(split_targets.value_counts(normalize=True) * 100)


Training set distribution:
Outcome
0    400
1    214
Name: count, dtype: int64

Training set percentage:
Outcome
0    65.14658
1    34.85342
Name: proportion, dtype: float64

Test set distribution:
Outcome
0    100
1     54
Name: count, dtype: int64

Test set percentage:
Outcome
0    64.935065
1    35.064935
Name: proportion, dtype: float64


In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# -----------------------------
# Train Logistic Regression
# -----------------------------
# fit() returns the estimator itself, so construction and training
# can be expressed as a single chained statement.
model_diabetes = LogisticRegression(max_iter=1000, random_state=42).fit(
    X_train_scaled, y_train
)

# -----------------------------
# Predictions
# -----------------------------
# Hard class labels, plus the probability column for class 1
# (the second column returned by predict_proba).
y_pred = model_diabetes.predict(X_test_scaled)
y_prob = model_diabetes.predict_proba(X_test_scaled)[:, 1]

# -----------------------------
# Evaluation
# -----------------------------
# Compute every metric first, then report them together.
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
test_roc_auc = roc_auc_score(y_test, y_prob)

print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", test_report)
print("ROC-AUC:", test_roc_auc)


Accuracy: 0.7077922077922078

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.82      0.78       100
           1       0.60      0.50      0.55        54

    accuracy                           0.71       154
   macro avg       0.68      0.66      0.67       154
weighted avg       0.70      0.71      0.70       154

ROC-AUC: 0.812962962962963


In [4]:
import joblib

# Persist the fitted estimator and the fitted scaler side by side so both
# can be reloaded together later.
# NOTE(review): only the model and scaler are written here; the median values
# used for imputation are not persisted — confirm whether inference needs them.
for fitted_object, target_path in (
    (model_diabetes, "diabetes_model.pkl"),
    (scaler, "diabetes_scaler.pkl"),
):
    joblib.dump(fitted_object, target_path)

print("✅ Diabetes model saved successfully")


✅ Diabetes model saved successfully
