In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from xgboost import XGBClassifier

In [2]:
df = pd.read_csv('diabetes.csv')
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [3]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [4]:
x = df.drop('Outcome', axis=1)
y = df['Outcome']

In [5]:
# First split: 70% train, 30% temporary
x_train, x_temp, y_train, y_temp = train_test_split(x, y, test_size=0.3, random_state=42, stratify=y)

# Second split: Split temporary into validation and test (15% each)
x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

print(f"Training set: {len(x_train)} examples")
print(f"Validation set: {len(x_val)} examples")
print(f"Test set: {len(x_test)} examples")

Training set: 537 examples
Validation set: 115 examples
Test set: 116 examples


In [6]:
# ============================================
# PREPROCESSING - Handle Missing Values
# ============================================

# Check for zero values (they represent missing data in diabetes dataset)
zero_columns = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

print("Zero values before imputation:")
for col in zero_columns:
    print(f"  {col}: {(df[col] == 0).sum()}")

# Replace zeros with NaN
df_processed = df.copy()
for col in zero_columns:
    df_processed[col] = df_processed[col].replace(0, np.nan)

# Impute with mean
for col in zero_columns:
    df_processed[col].fillna(df_processed[col].mean(), inplace=True)

print("\nMissing values after imputation: 0")

# Update x and y with processed data
x = df_processed.drop('Outcome', axis=1)
y = df_processed['Outcome']

Zero values before imputation:
  Glucose: 5
  BloodPressure: 35
  SkinThickness: 227
  Insulin: 374
  BMI: 11

Missing values after imputation: 0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_processed[col].fillna(df_processed[col].mean(), inplace=True)


In [7]:
# ============================================
# NORMALIZATION - Min-Max Scaling
# ============================================

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Fit ONLY on training data
x_train_scaled = scaler.fit_transform(x_train)

# Transform validation and test using training parameters
x_val_scaled = scaler.transform(x_val)
x_test_scaled = scaler.transform(x_test)

print("Data normalized!")

# Use scaled data for training
x_train = x_train_scaled
x_val = x_val_scaled
x_test = x_test_scaled

Data normalized!


In [8]:
print("Train class distribution:")
print(y_train.value_counts())
print()

Train class distribution:
Outcome
0    350
1    187
Name: count, dtype: int64



In [9]:
print("Test class distribution:")
print(y_test.value_counts())
print()

Test class distribution:
Outcome
0    75
1    41
Name: count, dtype: int64



In [10]:
# ============================================
# Train Multiple Models
# ============================================

# 1. XGBoost (your original)
xgb = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
xgb.fit(x_train, y_train)

# 2. Logistic Regression (NEW)
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter=100, random_state=42)
lr.fit(x_train, y_train)

# 3. Naive Bayes (NEW)
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(x_train, y_train)

0,1,2
,priors,
,var_smoothing,1e-09


In [11]:
y_pred = xgb.predict(x_test)

In [12]:
# ============================================
# Evaluate on Validation Set
# ============================================

# XGBoost
y_val_pred_xgb = xgb.predict(x_val)
val_acc_xgb = accuracy_score(y_val, y_val_pred_xgb)
print(f"XGBoost Validation Accuracy: {val_acc_xgb:.4f}")

# Logistic Regression
y_val_pred_lr = lr.predict(x_val)
val_acc_lr = accuracy_score(y_val, y_val_pred_lr)
print(f"Logistic Regression Validation Accuracy: {val_acc_lr:.4f}")

# Naive Bayes
y_val_pred_nb = nb.predict(x_val)
val_acc_nb = accuracy_score(y_val, y_val_pred_nb)
print(f"Naive Bayes Validation Accuracy: {val_acc_nb:.4f}")

XGBoost Validation Accuracy: 0.7391
Logistic Regression Validation Accuracy: 0.7391
Naive Bayes Validation Accuracy: 0.7043


In [13]:
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

# For each model
print("=" * 50)
print("XGBoost Results:")
y_pred_xgb = xgb.predict(x_test)
print(f"Accuracy:  {accuracy_score(y_test, y_pred_xgb):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_xgb):.4f}")
print(f"Recall:    {recall_score(y_test, y_pred_xgb):.4f}")
print(f"F1-Score:  {f1_score(y_test, y_pred_xgb):.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_xgb))
print()

print("=" * 50)
print("Logistic Regression Results:")
y_pred_lr = lr.predict(x_test)
print(f"Accuracy:  {accuracy_score(y_test, y_pred_lr):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_lr):.4f}")
print(f"Recall:    {recall_score(y_test, y_pred_lr):.4f}")
print(f"F1-Score:  {f1_score(y_test, y_pred_lr):.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_lr))
print()

print("=" * 50)
print("Naive Bayes Results:")
y_pred_nb = nb.predict(x_test)
print(f"Accuracy:  {accuracy_score(y_test, y_pred_nb):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_nb):.4f}")
print(f"Recall:    {recall_score(y_test, y_pred_nb):.4f}")
print(f"F1-Score:  {f1_score(y_test, y_pred_nb):.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_nb))

XGBoost Results:
Accuracy:  0.7586
Precision: 0.7241
Recall:    0.5122
F1-Score:  0.6000
Confusion Matrix:
[[67  8]
 [20 21]]

Logistic Regression Results:
Accuracy:  0.7672
Precision: 0.7917
Recall:    0.4634
F1-Score:  0.5846
Confusion Matrix:
[[70  5]
 [22 19]]

Naive Bayes Results:
Accuracy:  0.7845
Precision: 0.7353
Recall:    0.6098
F1-Score:  0.6667
Confusion Matrix:
[[66  9]
 [16 25]]


In [14]:
# ============================================
# K-Fold Cross-Validation
# ============================================

from sklearn.model_selection import cross_val_score

# Combine train and validation for CV
import numpy as np
x_combined = np.vstack([x_train, x_val])
y_combined = pd.concat([y_train, y_val])

print("=" * 50)
print("5-Fold Cross-Validation")
print("=" * 50)

# XGBoost CV
cv_scores_xgb = cross_val_score(xgb, x_combined, y_combined, cv=5)
print("\nXGBoost CV Scores:")
for i, score in enumerate(cv_scores_xgb, 1):
    print(f"  Fold {i}: {score:.4f}")
print(f"Average: {cv_scores_xgb.mean():.4f} (+/- {cv_scores_xgb.std():.4f})")

# Logistic Regression CV
cv_scores_lr = cross_val_score(lr, x_combined, y_combined, cv=5)
print("\nLogistic Regression CV Scores:")
for i, score in enumerate(cv_scores_lr, 1):
    print(f"  Fold {i}: {score:.4f}")
print(f"Average: {cv_scores_lr.mean():.4f} (+/- {cv_scores_lr.std():.4f})")

# Naive Bayes CV
cv_scores_nb = cross_val_score(nb, x_combined, y_combined, cv=5)
print("\nNaive Bayes CV Scores:")
for i, score in enumerate(cv_scores_nb, 1):
    print(f"  Fold {i}: {score:.4f}")
print(f"Average: {cv_scores_nb.mean():.4f} (+/- {cv_scores_nb.std():.4f})")

5-Fold Cross-Validation

XGBoost CV Scores:
  Fold 1: 0.7099
  Fold 2: 0.7481
  Fold 3: 0.7923
  Fold 4: 0.7231
  Fold 5: 0.7692
Average: 0.7485 (+/- 0.0299)

Logistic Regression CV Scores:
  Fold 1: 0.7710
  Fold 2: 0.7939
  Fold 3: 0.7692
  Fold 4: 0.7385
  Fold 5: 0.7462
Average: 0.7637 (+/- 0.0197)

Naive Bayes CV Scores:
  Fold 1: 0.6870
  Fold 2: 0.7786
  Fold 3: 0.7615
  Fold 4: 0.7077
  Fold 5: 0.7231
Average: 0.7316 (+/- 0.0339)


In [15]:
# ============================================
# Hyperparameter Tuning
# ============================================

print("\n" + "=" * 50)
print("Hyperparameter Tuning - Logistic Regression")
print("=" * 50)

learning_rates = [0.01, 0.05, 0.1, 0.2]
max_iters = [50, 100, 150]

print("\nMax Iter\tTrain Acc\tVal Acc\t\tTest Acc")
print("-" * 60)

best_val_acc = 0
best_params = {}

for max_iter in max_iters:
    model = LogisticRegression(max_iter=max_iter, random_state=42)
    model.fit(x_train, y_train)
    
    train_acc = accuracy_score(y_train, model.predict(x_train))
    val_acc = accuracy_score(y_val, model.predict(x_val))
    test_acc = accuracy_score(y_test, model.predict(x_test))
    
    print(f"{max_iter}\t\t{train_acc:.4f}\t\t{val_acc:.4f}\t\t{test_acc:.4f}")
    
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_params = {'max_iter': max_iter}

print(f"\nBest Validation Accuracy: {best_val_acc:.4f}")
print(f"Best Parameters: {best_params}")


Hyperparameter Tuning - Logistic Regression

Max Iter	Train Acc	Val Acc		Test Acc
------------------------------------------------------------
50		0.7877		0.7391		0.7672
100		0.7877		0.7391		0.7672
150		0.7877		0.7391		0.7672

Best Validation Accuracy: 0.7391
Best Parameters: {'max_iter': 50}


In [16]:
# ============================================
# Predictions on Individual Patients
# ============================================

print("\n" + "=" * 50)
print("Single Patient Predictions")
print("=" * 50)

# Example patients
example_patients = [
    [6, 148, 72, 35, 0, 33.6, 0.627, 50],
    [1, 85, 66, 29, 0, 26.6, 0.351, 31],
    [8, 183, 64, 0, 0, 23.3, 0.672, 32]
]

for i, patient in enumerate(example_patients, 1):
    # Preprocess patient data
    patient_df = pd.DataFrame([patient], columns=x.columns)
    patient_scaled = scaler.transform(patient_df)
    
    # Predictions
    pred_xgb = xgb.predict(patient_scaled)[0]
    pred_lr = lr.predict(patient_scaled)[0]
    pred_nb = nb.predict(patient_scaled)[0]
    
    # Probabilities
    prob_xgb = xgb.predict_proba(patient_scaled)[0][1]
    prob_lr = lr.predict_proba(patient_scaled)[0][1]
    
    print(f"\nPatient {i}:")
    print(f"  XGBoost: {'Diabetic' if pred_xgb == 1 else 'Non-Diabetic'} (prob: {prob_xgb:.3f})")
    print(f"  Logistic Reg: {'Diabetic' if pred_lr == 1 else 'Non-Diabetic'} (prob: {prob_lr:.3f})")
    print(f"  Naive Bayes: {'Diabetic' if pred_nb == 1 else 'Non-Diabetic'}")


Single Patient Predictions

Patient 1:
  XGBoost: Diabetic (prob: 0.803)
  Logistic Reg: Diabetic (prob: 0.588)
  Naive Bayes: Diabetic

Patient 2:
  XGBoost: Non-Diabetic (prob: 0.034)
  Logistic Reg: Non-Diabetic (prob: 0.101)
  Naive Bayes: Non-Diabetic

Patient 3:
  XGBoost: Diabetic (prob: 0.943)
  Logistic Reg: Diabetic (prob: 0.674)
  Naive Bayes: Diabetic


In [17]:
print("Train class distribution:\n", y_train.value_counts())
print("Test class distribution:\n", y_test.value_counts())

Train class distribution:
 Outcome
0    350
1    187
Name: count, dtype: int64
Test class distribution:
 Outcome
0    75
1    41
Name: count, dtype: int64


In [18]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

dt = DecisionTreeClassifier(random_state=0)
dt.fit(x_train, y_train)
y_pred_dt = dt.predict(x_test)
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))

rf = RandomForestClassifier(n_estimators=100, random_state=0)
rf.fit(x_train, y_train)
y_pred_rf = rf.predict(x_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))

Decision Tree Accuracy: 0.7413793103448276
Random Forest Accuracy: 0.7413793103448276


In [19]:
# ============================================
# Final Summary
# ============================================

print("\n" + "=" * 50)
print("FINAL SUMMARY")
print("=" * 50)

print("\nDataset Information:")
print(f"  Total examples: {len(df)}")
print(f"  Training: {len(x_train)} (70%)")
print(f"  Validation: {len(x_val)} (15%)")
print(f"  Test: {len(x_test)} (15%)")

print("\nPreprocessing Applied:")
print("  ✓ Missing value imputation")
print("  ✓ Min-Max normalization")

print("\nTest Set Performance:")
print(f"  XGBoost:           {accuracy_score(y_test, xgb.predict(x_test)):.4f}")
print(f"  Logistic Reg:      {accuracy_score(y_test, lr.predict(x_test)):.4f}")
print(f"  Naive Bayes:       {accuracy_score(y_test, nb.predict(x_test)):.4f}")
print(f"  Decision Tree:     {accuracy_score(y_test, y_pred_dt):.4f}")
print(f"  Random Forest:     {accuracy_score(y_test, y_pred_rf):.4f}")

print("\nCross-Validation (5-Fold):")
print(f"  XGBoost:           {cv_scores_xgb.mean():.4f} (+/- {cv_scores_xgb.std():.4f})")
print(f"  Logistic Reg:      {cv_scores_lr.mean():.4f} (+/- {cv_scores_lr.std():.4f})")
print(f"  Naive Bayes:       {cv_scores_nb.mean():.4f} (+/- {cv_scores_nb.std():.4f})")

print("\n" + "=" * 50)


FINAL SUMMARY

Dataset Information:
  Total examples: 768
  Training: 537 (70%)
  Validation: 115 (15%)
  Test: 116 (15%)

Preprocessing Applied:
  ✓ Missing value imputation
  ✓ Min-Max normalization

Test Set Performance:
  XGBoost:           0.7586
  Logistic Reg:      0.7672
  Naive Bayes:       0.7845
  Decision Tree:     0.7414
  Random Forest:     0.7414

Cross-Validation (5-Fold):
  XGBoost:           0.7485 (+/- 0.0299)
  Logistic Reg:      0.7637 (+/- 0.0197)
  Naive Bayes:       0.7316 (+/- 0.0339)



### Ordinary Least Squares Regression

#### What is Ordinary Least Squares Regression

     X     | Y     | x − x̄  | y − ȳ  | (x − x̄)(y − ȳ) | (x − x̄)²
     72    | 84    | -0.17  | 10.00  | -1.70          | 0.03
     50    | 63    | -22.17 | -11.00 | 243.87         | 491.11
     81    | 77    | 8.83   | 3.00   | 26.49          | 77.97
     74    | 78    | 1.83   | 4.00   | 7.32           | 3.35
     94    | 90    | 21.83  | 16.00  | 349.28         | 476.55
     86    | 75    | 13.83  | 1.00   | 13.83          | 191.27
     59    | 49    | -13.17 | -25.00 | 329.25         | 173.45
     83    | 79    | 10.83  | 5.00   | 54.15          | 117.29
     65    | 77    | -7.17  | 3.00   | -21.51         | 51.41
     33    | 52    | -39.17 | -22.00 | 861.74         | 1534.29
     88    | 74    | 15.83  | -0.00  | -0.00          | 250.69
     81    | 90    | 8.83   | 16.00  | 141.28         | 77.97
 ----------------------------------------------------------------     
Sum |72.17 | 74.00 |------------|------------| 2004.00        | 3445.67

###### b = Σ(x − x̄)(y − ȳ) / Σ(x − x̄)²
###### b = 2004.00 / 3445.67
###### b = 0.58

###### ȳ = a + b × x̄
###### a = ȳ − b × x̄
###### a = 74.00 − (0.58 × 72.17)
###### a = 32.03

###### ŷ = 32.03 + 0.58x
###### ŷ = 32.03 + 0.58 × 86
###### ŷ = 32.03 + 49.88
###### ŷ = 81.91

     x | y | x − x̄ | y − ȳ | (x − x̄)(y − ȳ) | (x − x̄)²
     1 | 2 | -2    | -2.2  | 4.4            | 4
     2 | 4 | -1    | -0.2  | 0.2            | 1
     3 | 5 | 0     | 0.8   | 0.0            | 0
     4 | 4 | 1     | -0.2  | -0.2           | 1
     5 | 6 | 2     | 1.8   | 3.6            | 4
Sum ///|///|///////|///////| 8.0            | 10

n = 5

x̄ = (1 + 2 + 3 + 4 + 5) / 5 = 3.0

ȳ = (2 + 4 + 5 + 4 + 6) / 5 = 4.2

b = 8.0 / 10 = 0.8

a = 4.2 − 0.8 × 3.0 = 1.8

ŷ = 1.8 + 0.8x

ŷ = 1.8 + 0.8 × 6

ŷ = 1.8 + 4.8

ŷ = 6.6

     x | y | x − x̄ | y − ȳ | (x − x̄)(y − ȳ) | (x − x̄)²
     1 | 3 | -1    | -2    | 2              | 1
     2 | 5 |  0    |  0    | 0              | 0
     3 | 7 |  1    |  2    | 2              | 1
Sum ///|///|///////|///////| 4.0            | 2

n = 3

x̄ = (1 + 2 + 3) / 3 = 2.0

ȳ = (3 + 5 + 7) / 3 = 5.0

b = 4.0 / 2 = 2.0

a = 5.0 − 2.0 × 2.0 = 1.0

ŷ = 1.0 + 2.0x

ŷ = 1.0 + 2.0 × 4

ŷ = 1.0 + 8.0

ŷ = 9.0