In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import joblib
import os

In [5]:
df = pd.read_csv("diabetes.csv")

print("‚úÖ Dataset Loaded Successfully!")
print(df.head())

‚úÖ Dataset Loaded Successfully!
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [6]:
cols_with_zeros = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

df[cols_with_zeros] = df[cols_with_zeros].replace(0, np.nan)

imputer = SimpleImputer(strategy='mean')
df[cols_with_zeros] = imputer.fit_transform(df[cols_with_zeros])

print("‚úÖ Missing values handled!")

‚úÖ Missing values handled!


In [7]:
X = df.drop('Outcome', axis=1)
y = df['Outcome']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"‚úÖ Data split complete! Training samples: {X_train.shape[0]}, Testing samples: {X_test.shape[0]}")


‚úÖ Data split complete! Training samples: 614, Testing samples: 154


In [8]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("‚úÖ Features scaled successfully!")

‚úÖ Features scaled successfully!


In [9]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "SVM": SVC(probability=True, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42)
}

results = []

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    y_prob = model.predict_proba(X_test_scaled)[:, 1]
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_prob)
    
    results.append({
        "Model": name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1,
        "ROC-AUC": roc_auc
    })

print("‚úÖ Model training complete!")

‚úÖ Model training complete!


In [10]:
results_df = pd.DataFrame(results)
os.makedirs("models", exist_ok=True)
results_df.to_csv("models/model_evaluation.csv", index=False)

print("üìÅ Model evaluation saved at: models/model_evaluation.csv")
print(results_df)

üìÅ Model evaluation saved at: models/model_evaluation.csv
                 Model  Accuracy  Precision    Recall  F1-Score   ROC-AUC
0  Logistic Regression  0.694805   0.577778  0.481481  0.525253  0.811667
1                  SVM  0.733766   0.644444  0.537037  0.585859  0.791204
2        Random Forest  0.740260   0.645833  0.574074  0.607843  0.818889


In [11]:
# Identify best model based on ROC-AUC
best_model_name = results_df.sort_values(by="ROC-AUC", ascending=False).iloc[0]["Model"]
best_model = models[best_model_name]

print(f"üèÜ Best model selected: {best_model_name}")

# Save model + scaler together using joblib
import joblib

save_object = {
    "model": best_model,
    "scaler": scaler  # make sure your scaler variable name is 'scaler'
}

joblib.dump(save_object, "models/best_model.joblib")

print("üíæ Best model and scaler saved at: models/best_model.joblib")


üèÜ Best model selected: Random Forest
üíæ Best model and scaler saved at: models/best_model.joblib
