#  Domain: Disease Prediction



# Project Title:  Disease Prediction Toolkit: Building and Evaluating ML Models

In [None]:
# 1. Install dependencies
!pip install pandas scikit-learn matplotlib seaborn

# 2. Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns
import os



## Load & Preprocess Data

In [None]:
print("🔹 Loading & Preprocessing Data...")

# Location of the dataset CSV. Defaults to the Colab checkout path; set the
# DISEASE_DATASET_PATH environment variable to point somewhere else without
# editing the notebook.
dataset_path = os.environ.get(
    "DISEASE_DATASET_PATH",
    "/content/Disease-Prediction-Toolkit/data/diabetes.csv",
)
df_raw = pd.read_csv(dataset_path)

# Handle missing values: drop every row that contains at least one NaN.
# The untouched frame stays available as `df_raw`, so re-running this cell
# always starts from the same input.
# NOTE(review): dropna() only removes NaN/None. If this dataset encodes
# missing measurements as 0 (as some public diabetes datasets do), those rows
# are kept -- confirm against the data.
df = df_raw.dropna()

# Split features/target
target_column = "Outcome"  # label column: the disease outcome to predict
if target_column not in df.columns:
    raise KeyError(
        f"Target column {target_column!r} not found in {dataset_path}; "
        f"available columns: {list(df.columns)}"
    )
X = df.drop(columns=[target_column])  # every remaining column is a feature
y = df[target_column]

# Train-Test Split: 80/20, stratified on the label so both parts keep the
# same class proportions; fixed seed for reproducibility.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale features. The scaler is fitted on the training part only and then
# applied to the test part, so no test-set statistics leak into training.
# X_train / X_test are the scaled NumPy arrays used by the later cells.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

🔹 Loading & Preprocessing Data...


## Train Models

In [None]:
print("🔹 Training Models...")

# Candidate classifiers, keyed by the display name used in reports and in the
# file names of the saved plots.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    # random_state is fixed so the tree (and therefore its metrics) is the
    # same on every run, matching the seeded split and the seeded forest.
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
}

# fit() trains each estimator in place; trained_models maps the same display
# names to the fitted estimators for the evaluation cell.
trained_models = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    trained_models[name] = model
    print(f"✅ Trained {name}")

🔹 Training Models...
✅ Trained Logistic Regression
✅ Trained Decision Tree
✅ Trained Random Forest


## Evaluate Models

In [None]:

print("🔹 Evaluating Models...")

# Per-model metric dictionaries, keyed by model display name.
results = {}

# Create results directory if it doesn't exist
results_dir = "results"
os.makedirs(results_dir, exist_ok=True)

# ROC Curve - combined plot.
# An explicit figure/axes pair is kept for the ROC curves so that drawing does
# not depend on which figure pyplot considers "current" after the per-model
# confusion-matrix figures are opened and closed inside the loop.
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))

for name, model in trained_models.items():
    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is the probability of the second entry of
    # model.classes_ (assumed to be the positive class for a 0/1 target).
    y_prob = model.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_prob)  # AUC is computed from probabilities, not hard labels

    results[name] = {"Accuracy": acc, "Precision": prec, "Recall": rec, "F1-score": f1, "ROC-AUC": roc_auc}

    # Confusion Matrix: one separate figure per model, saved to disk and closed.
    cm = confusion_matrix(y_test, y_pred)
    cm_fig, cm_ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap="Blues", ax=cm_ax)
    cm_ax.set_title(f"Confusion Matrix - {name}")
    cm_ax.set_xlabel("Predicted")
    cm_ax.set_ylabel("Actual")
    cm_filepath = os.path.join(results_dir, f"{name}_confusion_matrix.png")
    cm_fig.savefig(cm_filepath)
    plt.close(cm_fig)  # Close the figure to prevent displaying

    print(f"✅ Saved Confusion Matrix for {name} to {cm_filepath}")

    # ROC Curve data for combined plot
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    roc_ax.plot(fpr, tpr, label=f"{name} (AUC={roc_auc:.2f})")


# Finalize and save the combined ROC Curve
roc_ax.plot([0, 1], [0, 1], 'k--')  # diagonal = chance-level reference
roc_ax.set_title("ROC Curve")
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.legend()
roc_filepath = os.path.join(results_dir, "roc_curve.png")
roc_fig.savefig(roc_filepath)
plt.close(roc_fig)  # Close the figure to prevent displaying

print(f"✅ Saved combined ROC Curve to {roc_filepath}")

🔹 Evaluating Models...
✅ Saved Confusion Matrix for Logistic Regression to results/Logistic Regression_confusion_matrix.png
✅ Saved Confusion Matrix for Decision Tree to results/Decision Tree_confusion_matrix.png
✅ Saved Confusion Matrix for Random Forest to results/Random Forest_confusion_matrix.png
✅ Saved combined ROC Curve to results/roc_curve.png


## Print Results

In [None]:
# Print every metric of every evaluated model, rounded to four decimals.
# The loop variables are deliberately not called `model`: in a notebook they
# are globals, and reusing that name would replace the estimator bound to
# `model` by the earlier cells with a plain string.
print("\n📊 Final Model Results:")
for model_name, model_metrics in results.items():
    print(f"\n{model_name}:")
    for metric_name, metric_value in model_metrics.items():
        print(f"  {metric_name}: {metric_value:.4f}")


📊 Final Model Results:

Logistic Regression:
  Accuracy: 0.7143
  Precision: 0.6087
  Recall: 0.5185
  F1-score: 0.5600
  ROC-AUC: 0.6693

Decision Tree:
  Accuracy: 0.7468
  Precision: 0.6744
  Recall: 0.5370
  F1-score: 0.5979
  ROC-AUC: 0.6985

Random Forest:
  Accuracy: 0.7597
  Precision: 0.6809
  Recall: 0.5926
  F1-score: 0.6337
  ROC-AUC: 0.7213
