[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/kaivalyagnik/ml-practice-portfolio/blob/d360e59f1e1dc7f703814cb2bacbf097e4b3c...)

You can also open this notebook in Google Colab to run it interactively.

# 🧠 Diabetes Prediction using Machine Learning
This notebook demonstrates a complete classification pipeline to predict whether a person is diabetic based on medical features.

**Goals:**
- Perform preprocessing on health-related data
- Build machine learning models (e.g., Logistic Regression, Random Forest)
- Evaluate models with appropriate metrics
- Visualize performance using ROC-AUC curves and classification reports

📦 Dataset: [Pima Indians Diabetes Dataset](https://www.kaggle.com/datasets/uciml/pima-indians-diabetes-database)

## 📊 1. Data Preprocessing
This section handles data loading, scaling, and train-test splitting.

We also explore null values, class distribution, and encoding if needed.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sb

# Load the diabetes table into `data`: try a kagglehub download first, then
# fall back to a local copy so the notebook still runs when kagglehub is not
# installed or the download cannot complete.
import os

try:
    import kagglehub
    path = kagglehub.dataset_download("akshaydattatraykhare/diabetes-dataset")
    # Join with os.path so the separator is right on every platform.
    data = pd.read_csv(os.path.join(path, "diabetes.csv"))
except Exception as exc:
    # Deliberately broad: any failure on the kagglehub path (missing package,
    # failed download, missing file) should lead to the same fallback. The
    # cause is reported rather than hidden so the failure stays diagnosable.
    print(f"kagglehub download failed ({exc!r}); falling back to local 'diabetes.csv'.")
    # Fallback: expect a diabetes.csv in the working directory or mounted drive
    data = pd.read_csv("diabetes.csv")

data.head()

In [None]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts).
data.info()

In [None]:
# Pairwise scatter plots of four selected columns, colored by Outcome, with
# kernel-density curves on the diagonal.
features = ['Glucose', 'BloodPressure', 'BMI', 'Age']
sb.pairplot(data, vars=features, hue='Outcome', diag_kind='kde')
# suptitle targets pyplot's current figure (the one pairplot just drew);
# y=1.02 places the title just above the top edge of the figure.
plt.suptitle("Pairplot of Selected Features by Outcome", y=1.02)
plt.show()

In [None]:
# Histogram grid for the columns of `data`, 20 bins per histogram.
data.hist(bins=20, figsize=(15, 10))
# suptitle targets pyplot's current figure, i.e. the one data.hist created.
plt.suptitle("Histograms of Features")
plt.show()

In [None]:
# Annotated heatmap of the pairwise column correlations of `data`.
corr_matrix = data.corr()
plt.figure(figsize=(10, 8))
sb.heatmap(
    corr_matrix,
    cmap='coolwarm',
    annot=True,
)
plt.title("Correlation Heatmap")
plt.show()

In [None]:
# Box plots of four selected columns, split by Outcome, laid out on a 2x2 grid.
features = ['Glucose', 'BloodPressure', 'BMI', 'Age']
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 8))
# axes.flat walks the grid row by row, matching subplot positions 1..4.
for ax, feature in zip(axes.flat, features):
    sb.boxplot(data=data, x='Outcome', y=feature, ax=ax)
    ax.set_title(f"{feature} by Outcome")
fig.tight_layout()
plt.show()

In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, roc_curve, auc

# Split the frame into the target column (`Outcome`) and the remaining predictors.
y = data['Outcome']
X = data.drop(columns=['Outcome'])

scaler = StandardScaler()

# Hold out 20% of the rows for testing; stratify so both splits keep the
# class proportions of `y`.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Baseline classifiers keyed by display name. Insertion order is the order in
# which they are evaluated and drawn.
models = {
    "Logistic Regression (L2)": LogisticRegression(
        penalty='l2', C=1.0, solver='liblinear', max_iter=1000
    ),
    "Logistic Regression (L1)": LogisticRegression(
        penalty='l1', C=1.0, solver='liblinear', max_iter=1000
    ),
    "ElasticNet (LogReg)": LogisticRegression(
        penalty='elasticnet', C=1.0, solver='saga', l1_ratio=0.5, max_iter=1000
    ),
    "SVM (RBF Kernel)": SVC(C=1.0, kernel='rbf', probability=True),
    "Decision Tree": DecisionTreeClassifier(random_state=42, max_depth=5),
    "Random Forest": RandomForestClassifier(
        n_estimators=100, max_depth=5, random_state=42
    ),
    "XGBoost": XGBClassifier(
        n_estimators=100,
        max_depth=5,
        learning_rate=0.1,
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42,
    ),
}

def evaluate_classifiers_with_roc(models, X_train, X_test, y_train, y_test, scale_models=None):
    """Fit each model, print its test-set metrics, and plot all ROC curves together.

    Args:
        models: Mapping of display name -> estimator. Each estimator is fitted
            in place on the training split.
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels. Assumed binary, since only the score of the
            second class is used for the ROC curve.
        scale_models: Names of the models that should be trained on
            standardized features. Defaults to every model whose name contains
            "Logistic", "SVM", or "ElasticNet".

    Returns:
        None. Accuracy and a classification report are printed per model, and
        one figure with a ROC curve per model is shown.
    """
    if scale_models is None:
        scale_keywords = ("Logistic", "SVM", "ElasticNet")
        scale_models = [
            name for name in models
            if any(keyword in name for keyword in scale_keywords)
        ]

    # Fit the scaler on the training split only; the test split is transformed
    # with the training statistics.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    plt.figure(figsize=(10, 8))
    for name, model in models.items():
        print(f"\n🔍 Evaluating: {name}")

        # Pick the feature representation once so the fit/predict/score steps
        # below are shared by scaled and unscaled models.
        if name in scale_models:
            train_input, test_input = X_train_scaled, X_test_scaled
        else:
            train_input, test_input = X_train, X_test

        model.fit(train_input, y_train)
        y_pred = model.predict(test_input)

        # Prefer positive-class probabilities; fall back to decision scores
        # for estimators that do not expose predict_proba.
        if hasattr(model, 'predict_proba'):
            y_scores = model.predict_proba(test_input)[:, 1]
        else:
            y_scores = model.decision_function(test_input)

        acc = accuracy_score(y_test, y_pred)
        print(f"✅ Accuracy: {acc:.4f}")
        print("📋 Classification Report:\n", classification_report(y_test, y_pred))

        fpr, tpr, _ = roc_curve(y_test, y_scores)
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=2, label=f'{name} (AUC = {roc_auc:.2f})')

    # Diagonal reference line.
    plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
    plt.title("🏁 ROC-AUC Curve for All Models")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.legend(loc='lower right')
    plt.grid(True)
    plt.tight_layout()
    plt.show()

# Run evaluation (uncomment to run)
# evaluate_classifiers_with_roc(models, X_train, X_test, y_train, y_test)

In [None]:
# Hyper-parameter search space per classifier: display name -> base estimator
# ("model") and the GridSearchCV parameter grid ("params").
# The tree-based estimators are seeded with random_state=42, matching the
# seeds used for the same model families in `models`, so that repeated tuning
# runs select the same parameters and report the same scores.
model_grid = {
    "Logistic Regression (L2)": {
        "model": LogisticRegression(solver='liblinear', max_iter=1000),
        "params": {"C": [0.1, 1, 10], "penalty": ['l2'], "max_iter": [100, 500]}
    },
    "Logistic Regression (L1)": {
        "model": LogisticRegression(solver='liblinear', max_iter=1000),
        "params": {"C": [0.1, 1, 10], "penalty": ['l1'], "max_iter": [100, 500]}
    },
    "ElasticNet (LogReg)": {
        "model": LogisticRegression(solver='saga', max_iter=1000),
        "params": {"C": [0.1, 1], "penalty": ['elasticnet'], "l1_ratio": [0.5], "max_iter": [1000]}
    },
    "SVM (RBF Kernel)": {
        "model": SVC(probability=True),
        "params": {"C": [0.1, 1, 10], "gamma": ['scale', 'auto']}
    },
    "Decision Tree": {
        "model": DecisionTreeClassifier(random_state=42),
        "params": {"max_depth": [3, 5, 7], "min_samples_split": [2, 5]}
    },
    "Random Forest": {
        "model": RandomForestClassifier(random_state=42),
        "params": {"n_estimators": [100, 200], "max_depth": [5, 10], "min_samples_split": [2, 5]}
    },
    "XGBoost": {
        "model": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
        "params": {"n_estimators": [100, 200], "max_depth": [3, 5], "learning_rate": [0.1, 0.01]}
    }
}

def _prefix_param_grid(params, prefix):
    """Return a copy of a GridSearchCV parameter grid with ``prefix`` added to every key.

    Accepts either a single ``{name: values}`` dict or a list of such dicts.
    """
    if isinstance(params, dict):
        return {f"{prefix}{key}": values for key, values in params.items()}
    return [_prefix_param_grid(grid, prefix) for grid in params]


def evaluate_tuned_classifiers(model_grid, X_train, X_test, y_train, y_test):
    """Grid-search each classifier, report test-set metrics, and plot ROC curves.

    Models whose name contains "Logistic Regression", "ElasticNet", or "SVM"
    are trained on standardized features. For those models the scaler is part
    of a Pipeline that GridSearchCV refits on every training fold, so the
    validation fold never contributes to the scaling statistics used to score
    a parameter combination.

    Args:
        model_grid: Mapping of display name -> ``{"model": estimator,
            "params": param_grid}``, where ``param_grid`` is a dict (or list
            of dicts) as accepted by GridSearchCV.
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels. Assumed binary, since only the score of the
            second class is used for the ROC curve.

    Returns:
        None. Best parameters, accuracy and a classification report are
        printed per model, and one figure with a ROC curve per model is shown.
    """
    # Imported here because Pipeline is not among the names this file imports
    # at cell level.
    from sklearn.pipeline import Pipeline

    scale_models = ["Logistic Regression", "ElasticNet", "SVM"]
    step_prefix = "model__"

    # Same seeded splitter for every model so all searches see identical folds.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    plt.figure(figsize=(10, 8))
    for name, config in model_grid.items():
        print(f"\n🔍 Tuning and Evaluating: {name}")
        model = config['model']
        params = config['params']

        scaled = any(keyword in name for keyword in scale_models)
        if scaled:
            # Scaling happens inside the estimator, so it is re-fitted per fold.
            estimator = Pipeline([("scaler", StandardScaler()), ("model", model)])
            param_grid = _prefix_param_grid(params, step_prefix)
        else:
            estimator = model
            param_grid = params

        grid = GridSearchCV(estimator, param_grid=param_grid, cv=cv, scoring='accuracy', n_jobs=-1)
        grid.fit(X_train, y_train)

        # best_estimator_ is refitted on the whole training split; for scaled
        # models it applies its own fitted scaler to X_test.
        best_model = grid.best_estimator_
        best_params = grid.best_params_
        if scaled:
            # Report parameters under the names used in model_grid.
            best_params = {key[len(step_prefix):]: value for key, value in best_params.items()}

        y_pred = best_model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        print(f"✅ Best Params: {best_params}")
        print(f"✅ Accuracy: {acc:.4f}")
        print("📋 Classification Report:\n", classification_report(y_test, y_pred))

        # Prefer positive-class probabilities; fall back to decision scores
        # for estimators that do not expose predict_proba.
        if hasattr(best_model, 'predict_proba'):
            y_scores = best_model.predict_proba(X_test)[:, 1]
        else:
            y_scores = best_model.decision_function(X_test)

        fpr, tpr, _ = roc_curve(y_test, y_scores)
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=2, label=f'{name} (AUC = {roc_auc:.2f})')

    # Diagonal reference line.
    plt.plot([0, 1], [0, 1], 'k--', lw=1)
    plt.title("🏁 ROC-AUC Curve (Fine-Tuned Models)")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.grid(True)
    plt.legend(loc='lower right')
    plt.tight_layout()
    plt.show()


Notes:
- Outputs were cleared to keep the notebook compact and valid. Re-run cells in your environment to regenerate outputs.
- If you want me to directly commit this fixed notebook into the repository, tell me and I will push a branch and open a PR for you.