In [1]:
import os

import numpy as np
import pandas as pd

In [3]:
# Location of the diabetes CSV. The original machine-specific path is kept as
# the default; set the DIABETES_CSV environment variable to point at the file on
# another machine instead of editing the notebook.
DATA_PATH = os.environ.get(
    "DIABETES_CSV",
    r"C:\Users\bbuser\Downloads\archive (1)\diabetes.csv",
)
df = pd.read_csv(DATA_PATH)

In [4]:
# Show the loaded frame as the cell's output (pandas elides the middle rows)
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [5]:
# Coerce every feature column to a numeric dtype. Values that cannot be parsed
# become NaN (errors="coerce"); the "Outcome" target column is left untouched.
feature_cols = [col for col in df.columns if col != "Outcome"]
for col in feature_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Report the resulting dtypes and the number of NaNs per column
print(df.dtypes)
print("\nNaN counts after coercion:")
print(df.isna().sum())

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object

NaN counts after coercion:
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [6]:
# Columns in which a literal 0 is treated as "not recorded" rather than a value.
# Only the candidates actually present in the frame are kept.
candidate_cols = ("Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI")
zero_is_missing = [name for name in candidate_cols if name in df.columns]

print("Zero counts (before fixing):")
zero_counts = df[zero_is_missing].eq(0).sum()
for name, n_zero in zero_counts.items():
    print(f"{name:>15}: {n_zero}")

Zero counts (before fixing):
        Glucose: 5
  BloodPressure: 35
  SkinThickness: 227
        Insulin: 374
            BMI: 11


In [7]:
# Replace the placeholder zeros in the selected columns with NaN.
# Series.mask returns a new column with NaN wherever the condition holds, so
# integer columns are upcast to float through a whole-column assignment instead
# of writing NaN into an int64 column via .loc (which recent pandas versions
# warn about and plan to reject).
for col in zero_is_missing:
    df[col] = df[col].mask(df[col] == 0)

print("NaN counts after replacing zeros:")
print(df[zero_is_missing].isna().sum())


NaN counts after replacing zeros:
Glucose            5
BloodPressure     35
SkinThickness    227
Insulin          374
BMI               11
dtype: int64


In [18]:
# Impute the missing entries of each affected column with that column's median.
# NOTE(review): the medians are computed over every row, i.e. before the
# train/test split performed later in this notebook, so test rows influence the
# fill values. Consider imputing inside the model pipeline instead.
for col in zero_is_missing:
    column = df[col]
    df[col] = column.fillna(column.median())

print("NaN counts AFTER imputation (should be 0 now):")
remaining_nans = df[zero_is_missing].isna().sum()
print(remaining_nans)

NaN counts AFTER imputation (should be 0 now):
Glucose          0
BloodPressure    0
SkinThickness    0
Insulin          0
BMI              0
dtype: int64


In [19]:
# Drop any duplicate rows and report how many were removed
n_rows_before = df.shape[0]
df = df.drop_duplicates()
dropped = n_rows_before - df.shape[0]

print(f"Number of duplicate rows dropped: {dropped}")
print("New shape:", df.shape)


Number of duplicate rows dropped: 0
New shape: (768, 9)


In [20]:
# Sanity-check the cleaned data: summary statistics plus the class balance

print("Descriptive statistics (after cleaning):")
display(df.describe())

if "Outcome" in df.columns:
    outcome = df["Outcome"]

    # Absolute number of rows per class
    print("\nTarget distribution (Outcome):")
    print(outcome.value_counts())

    # Same counts expressed as proportions, rounded to three decimals
    print("\nTarget ratio (normalized):")
    print(outcome.value_counts(normalize=True).round(3))

Descriptive statistics (after cleaning):


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,121.65625,72.386719,29.108073,140.671875,32.455208,0.471876,33.240885,0.348958
std,3.369578,30.438286,12.096642,8.791221,86.38306,6.875177,0.331329,11.760232,0.476951
min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.75,64.0,25.0,121.5,27.5,0.24375,24.0,0.0
50%,3.0,117.0,72.0,29.0,125.0,32.3,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0



Target distribution (Outcome):
Outcome
0    500
1    268
Name: count, dtype: int64

Target ratio (normalized):
Outcome
0    0.651
1    0.349
Name: proportion, dtype: float64


# Train & Evaluate the models (Logistic Regression & KNN)

In [11]:

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [21]:
# Separate the target vector from the feature matrix (everything but "Outcome")
y = df["Outcome"].values
X = df.drop(columns=["Outcome"]).values

# Hold out 30% of the rows for testing; stratify on y so both splits keep the
# class proportions, and fix the seed so the split is repeatable.
split = train_test_split(X, y, test_size=0.30, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split

In [22]:
# Both models share the same preprocessing: standardize the features, then
# hand them to the classifier.
pipe_lr = make_pipeline(
    StandardScaler(),
    LogisticRegression(solver="lbfgs", max_iter=1000),
)
pipe_knn = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=5),
)

In [23]:
# Train
# Fit both pipelines on the training split only; X_test / y_test are not used
# here and stay untouched until the prediction step.
pipe_lr.fit(X_train, y_train)
pipe_knn.fit(X_train, y_train)


In [24]:
# Predict the held-out test rows with each fitted pipeline
pred_lr, pred_knn = (
    fitted.predict(X_test) for fitted in (pipe_lr, pipe_knn)
)


In [25]:
# Metrics
def metrics(y_true, y_pred):
    """Return accuracy, precision, recall and F1 for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, in the same order as ``y_true``.

    Returns
    -------
    dict
        Keys ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"f1"``, in
        that order. Precision, recall and F1 are computed with
        ``zero_division=0``.
    """
    scores = {"accuracy": accuracy_score(y_true, y_pred)}
    scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    for label, scorer in scorers:
        scores[label] = scorer(y_true, y_pred, zero_division=0)
    return scores

In [16]:
# Headline metrics for each model, followed by the full per-class reports.
# NOTE(review): this cell's recorded execution count (16) is lower than those of
# the split/fit/predict cells (21-25), so the printed numbers may come from an
# earlier run. Restart the kernel and run all cells to confirm them.
m_lr, m_knn = metrics(y_test, pred_lr), metrics(y_test, pred_knn)

reports = (
    ("[Logistic Regression]\n", pred_lr),
    ("[KNN]\n", pred_knn),
)
for header, preds in reports:
    print(header, classification_report(y_test, preds, digits=3))


[Logistic Regression]
               precision    recall  f1-score   support

           0      0.772     0.860     0.814       150
           1      0.672     0.531     0.593        81

    accuracy                          0.745       231
   macro avg      0.722     0.695     0.703       231
weighted avg      0.737     0.745     0.736       231

[KNN]
               precision    recall  f1-score   support

           0      0.786     0.833     0.809       150
           1      0.653     0.580     0.614        81

    accuracy                          0.745       231
   macro avg      0.719     0.707     0.712       231
weighted avg      0.739     0.745     0.741       231



In [17]:
# Side-by-side comparison of the headline metrics: one row per model, one
# column per metric, rounded to three decimals.
summary = pd.DataFrame([m_lr, m_knn], index=["LogReg", "KNN"]).round(3)
print("Comparison of Logistic Regression vs KNN:\n")
display(summary)


Comparison of Logistic Regression vs KNN:



Unnamed: 0,accuracy,precision,recall,f1
LogReg,0.745,0.672,0.531,0.593
KNN,0.745,0.653,0.58,0.614


# Discuss which one performs better and why:

Both Logistic Regression and KNN reached the same accuracy of 0.745 on the held-out test set. KNN, however, performed slightly better on the positive (diabetes) class, with higher recall (0.580 vs. 0.531) and a higher F1-score (0.614 vs. 0.593). This matters in a medical setting: higher recall means the model identifies more of the actual diabetes cases and produces fewer false negatives. Logistic Regression had marginally better precision (0.672 vs. 0.653), but its lower recall means it missed more true cases. Because recall and F1 are generally more important than overall accuracy in healthcare, we can conclude that KNN provides a slight advantage for this problem. Note that the gap is small and comes from a single 70/30 split with only 81 positive test cases, so it should be confirmed with cross-validation before drawing a firm conclusion.