In [None]:
# Importing the necessary Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve
import pickle
import os

In [None]:
# Read the diabetes CSV into a DataFrame, then report its dimensions and preview the first rows
data = pd.read_csv('diabetes.csv')
n_rows_cols = data.shape
print("Shape:", n_rows_cols)
preview = data.head()
print(preview)

In [None]:
# Show column dtypes and non-null counts, then count missing values per column.
# DataFrame.info() writes its report itself and returns None, so it is called
# bare; wrapping it in print() would emit an extra "None" line after the report.
data.info()
print("\nMissing values per column:")
print(data.isnull().sum())

In [None]:
# Summary statistics for the numeric columns of the dataset
summary_stats = data.describe()
print(summary_stats)

In [None]:
# Human-readable description of every column in the dataset, printed one per line
feature_info = dict(
    Pregnancies="Number of times pregnant",
    Glucose="Plasma glucose concentration (mg/dL)",
    BloodPressure="Diastolic blood pressure (mm Hg)",
    SkinThickness="Triceps skin fold thickness (mm)",
    Insulin="2-Hour serum insulin (mu U/ml)",
    BMI="Body mass index (weight in kg/(height in m)^2)",
    DiabetesPedigreeFunction="Diabetes pedigree function",
    Age="Age in years",
    Outcome="Class variable (0: non-diabetic, 1: diabetic)",
)
print("Feature descriptions:")
for feature, description in feature_info.items():
    print(f"{feature} : {description}")

In [None]:
# A zero in these measurement columns is not a possible value, so it is treated
# as missing: swap each 0 for NaN and report how many gaps that creates per column
cols_with_zero_not_valid = [
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
]
zero_as_missing = data[cols_with_zero_not_valid].replace(0, np.nan)
data[cols_with_zero_not_valid] = zero_as_missing
missing_counts = data[cols_with_zero_not_valid].isnull().sum()
print(missing_counts)


In [None]:
# Fill the missing values with the median of each affected column.
# The result is assigned back explicitly: `data[col].fillna(..., inplace=True)`
# operates on a temporary column object, which raises a FutureWarning on
# pandas 2.x and leaves `data` unchanged under Copy-on-Write.
data = data.copy()
column_medians = data[cols_with_zero_not_valid].median()
# fillna with a Series fills each column using the value under the matching label
data[cols_with_zero_not_valid] = data[cols_with_zero_not_valid].fillna(column_medians)
print("Missing values after imputation:")
print(data.isnull().sum())


In [None]:
# Class balance of the target: absolute counts first, then the share of each class in percent
outcome_counts = data['Outcome'].value_counts()
outcome_share = data['Outcome'].value_counts(normalize=True) * 100
print("Outcome value counts:")
print(outcome_counts)
print("\nOutcome percentages:")
print(outcome_share)


In [None]:
# Plot a histogram for every numeric column and save the figure.
# savefig does not create missing directories, so make sure the output
# folder exists before writing (required for a clean Restart & Run All).
os.makedirs('results', exist_ok=True)
data.hist(figsize=(12,10))
plt.tight_layout()
plt.savefig('results/feature_histograms.png')
plt.show()


In [None]:
# Plot a heatmap of the pairwise correlations between all columns and save it.
# savefig does not create missing directories, so make sure the output
# folder exists before writing (required for a clean Restart & Run All).
os.makedirs('results', exist_ok=True)
plt.figure(figsize=(8,6))
corr = data.corr()
sns.heatmap(corr, annot=True, fmt=".2f")
plt.title("Correlation matrix")
plt.savefig('results/correlation_heatmap.png')
plt.show()


In [None]:
# Separate the target column from the predictors: y is the label, X is everything else
target_column = 'Outcome'
y = data[target_column]
X = data.drop(columns=target_column)
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")


In [None]:
# Hold out 20% of the rows as a test set; the split is stratified on the
# target and uses a fixed seed so it is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print(f"Train shape: {X_train.shape}")
print(f"Test shape: {X_test.shape}")


In [None]:
# Standardize the features. The scaler learns its statistics from the training
# split only and the same fitted transform is then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Build the three candidate classifiers, keyed by display name, and fit each
# one on the standardized training data
models = dict(
    LogisticRegression=LogisticRegression(max_iter=200),
    RandomForest=RandomForestClassifier(n_estimators=100, random_state=42),
    KNN=KNeighborsClassifier(n_neighbors=5),
)
for label in models:
    models[label].fit(X_train_scaled, y_train)
    print(label, "trained")


In [None]:
# Evaluate each fitted model on the test set and collect its metrics.
# Each entry of `results` holds the model name, accuracy, precision, recall,
# F1, ROC AUC (None when the model cannot produce probabilities) and the
# confusion matrix.
results = []
for name, model in models.items():
    y_pred = model.predict(X_test_scaled)
    # ROC AUC needs positive-class scores. Check for the capability explicitly
    # instead of using a bare `except`, which would also hide genuine errors
    # (bad input shapes, interrupts) behind a silent "no probabilities".
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test_scaled)[:, 1]
        roc = roc_auc_score(y_test, y_proba)
    else:
        y_proba = None
        roc = None
    results.append({
        "model": name,
        "accuracy": accuracy_score(y_test, y_pred),
        # zero_division=0 reports 0 instead of warning when a class is never predicted
        "precision": precision_score(y_test, y_pred, zero_division=0),
        "recall": recall_score(y_test, y_pred, zero_division=0),
        "f1": f1_score(y_test, y_pred, zero_division=0),
        "roc_auc": roc,
        "confusion_matrix": confusion_matrix(y_test, y_pred)
    })
print("Evaluation complete")


In [None]:
# Report the collected metrics for every model, rounded to three decimals
scalar_metrics = (
    ("accuracy", " Accuracy:"),
    ("precision", " Precision:"),
    ("recall", " Recall:"),
    ("f1", " F1:"),
)
for entry in results:
    print("Model:", entry["model"])
    for metric_key, metric_label in scalar_metrics:
        print(metric_label, round(entry[metric_key], 3))
    # ROC AUC is None for models without probability estimates
    auc_value = entry["roc_auc"]
    if auc_value is not None:
        auc_value = round(auc_value, 3)
    print(" ROC AUC:", auc_value)
    print(" Confusion Matrix:\n", entry["confusion_matrix"])
    print("-" * 30)


In [None]:
# Draw a bar chart comparing the test accuracy of each model and save it.
# savefig does not create missing directories, so make sure the output
# folder exists before writing (required for a clean Restart & Run All).
os.makedirs('results', exist_ok=True)
model_names = [r["model"] for r in results]
accuracies = [r["accuracy"] for r in results]
plt.figure(figsize=(6,4))
plt.bar(model_names, accuracies)
plt.ylabel("Accuracy")
plt.ylim(0,1)  # accuracy is a fraction, so show the full 0-1 range
plt.title("Model accuracy comparison")
plt.savefig('results/model_accuracy_comparison.png')
plt.show()


In [None]:
# Plot the ROC curve of every model that provides probability estimates and save the figure.
# savefig does not create missing directories, so make sure the output
# folder exists before writing (required for a clean Restart & Run All).
os.makedirs('results', exist_ok=True)
plt.figure(figsize=(6,5))
for r in results:
    # roc_auc is None for models evaluated without probability estimates
    if r["roc_auc"] is not None:
        model = models[r["model"]]
        y_proba = model.predict_proba(X_test_scaled)[:,1]
        fpr, tpr, _ = roc_curve(y_test, y_proba)
        plt.plot(fpr, tpr, label=f"{r['model']} (AUC = {r['roc_auc']:.2f})")
# Diagonal reference line: the ROC of a classifier that guesses at random
plt.plot([0,1], [0,1], linestyle='--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curves")
plt.legend()
plt.savefig('results/roc_curves.png')
plt.show()


In [None]:
# Pick the model with the highest test accuracy and save it to disk with pickle.
# NOTE: pickle files must only be loaded from trusted sources.
best = max(results, key=lambda x: x["accuracy"])
best_model_name = best["model"]
best_model = models[best_model_name]
os.makedirs('results', exist_ok=True)
with open('results/best_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)
# The models were trained on standardized features, so the saved model is only
# usable together with the fitted scaler; persist it alongside the model.
with open('results/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
print("Saved best model:", best_model_name)


In [None]:
# Defining a function to predict diabetes for a single new sample
def predict_single(sample_dict):
    """Predict the diabetes outcome for one new sample.

    Parameters
    ----------
    sample_dict : dict
        Maps feature names (the columns of ``X``) to raw, unscaled values.
        Keys that are not feature columns are ignored. A feature that is
        absent or ``None`` falls back to that feature's training-set median.

    Returns
    -------
    dict
        ``{"prediction": int, "probability": float or None}``, where
        ``probability`` is the predicted probability of class 1, or ``None``
        when the selected model does not provide probability estimates.
    """
    cols = X.columns.tolist()
    # A default of 0 is not a plausible measurement for features such as
    # Glucose or BMI (the notebook treats those zeros as missing), so use the
    # training median as the stand-in for any feature that was not supplied.
    fallback = X_train.median()
    row = {}
    for c in cols:
        value = sample_dict.get(c)
        row[c] = fallback[c] if value is None else value
    # The scaler was fitted on a DataFrame, so hand it a DataFrame with the
    # same column names and order; a bare ndarray triggers a feature-name warning.
    x_frame = pd.DataFrame([row], columns=cols)
    x_scaled = scaler.transform(x_frame)
    pred = best_model.predict(x_scaled)[0]
    proba = None
    # Explicit capability check instead of a bare `except`, so real errors surface
    if hasattr(best_model, "predict_proba"):
        proba = float(best_model.predict_proba(x_scaled)[0][1])
    return {"prediction": int(pred), "probability": proba}

# Demonstration: run the predictor on one hand-written sample and show the result
example_input = {
    "Pregnancies": 2,
    "Glucose": 120,
    "BloodPressure": 70,
    "SkinThickness": 20,
    "Insulin": 79,
    "BMI": 25.0,
    "DiabetesPedigreeFunction": 0.5,
    "Age": 33,
}
example_prediction = predict_single(example_input)
print(f"Example prediction: {example_prediction}")
