In [11]:
import pandas as pd
import numpy as np
import os
import csv
import joblib
import gradio as gr
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Load the data
def load_and_prepare_data(url=None, target="Diabetes_binary"):
    """Load the diabetes dataset and split it into features and target.

    Parameters
    ----------
    url : str, path-like or file-like, optional
        Anything ``pandas.read_csv`` accepts. When omitted, the project's
        hosted CSV is downloaded, which is the original behaviour.
    target : str, default "Diabetes_binary"
        Name of the label column. Rows where it is missing are dropped.

    Returns
    -------
    (X, y, df) : tuple
        ``X`` is every column except ``target``, ``y`` is the ``target``
        column, and ``df`` is the full frame after dropping unlabeled rows.

    Raises
    ------
    KeyError
        If ``target`` is not a column of the loaded file.
    """
    if url is None:
        url = "https://raw.githubusercontent.com/juliocezarcarneiro/diabetes-risk-prediction-model/main/Resources/full_cleaned_merged.csv"

    df = pd.read_csv(url)

    # Fail with a readable message instead of an error from deep inside pandas.
    if target not in df.columns:
        raise KeyError(f"Target column {target!r} not found in loaded data")

    # Non-inplace drop keeps the function free of hidden mutation; rows without
    # a label cannot be used for supervised training.
    df = df.dropna(subset=[target])

    X = df.drop(columns=[target])
    y = df[target]
    return X, y, df

# Load the dataset at notebook scope: X (features), y (target) and df (full frame)
X, y, df = load_and_prepare_data()

# Preview the first rows of the loaded frame with the notebook's rich display
display(df.head())

In [None]:
# Train optimized model
def train_optimized_model(X_train_scaled, y_train):
    """Fit a random forest through a cross-validated grid search.

    The search space holds a single candidate configuration, so the search
    runs 3-fold cross-validation on it and then returns the refitted
    estimator.

    Parameters
    ----------
    X_train_scaled : array-like
        Training features, passed straight to ``GridSearchCV.fit``.
    y_train : array-like
        Training labels.

    Returns
    -------
    The ``best_estimator_`` of the fitted grid search.
    """
    search_space = dict(
        n_estimators=[200],
        max_depth=[None],
        min_samples_split=[2],
        min_samples_leaf=[1],
        bootstrap=[True],
    )

    searcher = GridSearchCV(
        estimator=RandomForestClassifier(random_state=42),  # fixed seed
        param_grid=search_space,
        cv=3,
        n_jobs=-1,
        verbose=2,
    )
    searcher.fit(X_train_scaled, y_train)

    print("✅ Best Parameters:", searcher.best_params_)
    return searcher.best_estimator_

In [None]:
def create_gradio_interface(model, scaler, feature_names):
    """Build the Gradio app that predicts diabetes risk from a few user inputs.

    Parameters
    ----------
    model : fitted classifier
        Must expose ``predict_proba``; column 1 of its output is treated as
        the diabetes probability.
    scaler : fitted transformer
        Must expose ``transform``; applied to the feature row before the model.
    feature_names : sequence of str
        Column names in the order the scaler/model were trained on. Must
        contain 'BMI', 'Age', 'HighBP', 'GenHlth', 'HighChol', 'CholCheck'
        and 'Smoker'. Every other feature is fed to the model as 0.

    Returns
    -------
    gr.Blocks
        The assembled (not yet launched) interface.

    Raises
    ------
    ValueError
        If any required feature is absent from ``feature_names``. This is
        raised at build time instead of on every prediction request.

    Side effects
    ------------
    Each prediction appends a row to ``user_predictions.csv`` in the current
    working directory.
    """
    # Accept any sequence of names (list, tuple, pandas Index).
    feature_names = list(feature_names)

    required = ('BMI', 'Age', 'HighBP', 'GenHlth', 'HighChol', 'CholCheck', 'Smoker')
    missing = [name for name in required if name not in feature_names]
    if missing:
        raise ValueError(f"feature_names is missing required columns: {missing}")

    # Resolve the column positions once instead of on every request.
    feature_index = {name: feature_names.index(name) for name in required}

    default_weight, default_height = 150, 65

    def compute_bmi(weight, height):
        # Imperial BMI: pounds / inches^2 * 703. Single source of truth for
        # both the live BMI field and the prediction.
        return (weight / (height ** 2)) * 703

    def update_bmi(weight, height):
        return round(compute_bmi(weight, height), 1)

    def predict(weight, height, age, high_bp, gen_hlth, high_chol):
        bmi = compute_bmi(weight, height)

        # Features not collected from the user stay at 0.
        input_values = np.zeros(len(feature_names))
        input_values[feature_index['BMI']] = bmi
        # NOTE(review): age is passed through in years; confirm the training
        # data's Age column is in years and not an age-bucket code.
        input_values[feature_index['Age']] = age
        input_values[feature_index['HighBP']] = high_bp
        input_values[feature_index['GenHlth']] = gen_hlth
        input_values[feature_index['HighChol']] = high_chol
        input_values[feature_index['CholCheck']] = 1
        input_values[feature_index['Smoker']] = 0

        row = input_values.reshape(1, -1)
        # A scaler fitted on a DataFrame remembers its column names; give it a
        # named frame so scikit-learn can validate the column order instead of
        # warning about missing feature names.
        if hasattr(scaler, "feature_names_in_"):
            row = pd.DataFrame(row, columns=feature_names)

        scaled_input = scaler.transform(row)
        proba = model.predict_proba(scaled_input)[0]
        positive = float(proba[1])
        risk_level = "High" if positive > 0.5 else "Low"
        prob = round(positive * 100, 1)
        conf = round(float(max(proba)) * 100, 1)
        bmi_rounded = round(bmi, 1)

        # Save to CSV; write the header for a new file or an empty one.
        filename = "user_predictions.csv"
        needs_header = (not os.path.isfile(filename)) or os.path.getsize(filename) == 0
        with open(filename, mode='a', newline='') as file:
            writer = csv.writer(file)
            if needs_header:
                writer.writerow([
                    "weight", "height", "bmi", "age", "high_bp",
                    "gen_hlth", "high_chol", "risk_level", "probability", "confidence"
                ])
            writer.writerow([
                weight, height, bmi_rounded, age, high_bp, gen_hlth, high_chol, risk_level, prob, conf
            ])

        return risk_level, prob, conf, bmi_rounded

    with gr.Blocks() as demo:
        gr.Markdown("## 🩺 Diabetes Risk Predictor\n_BMI is auto-calculated from your weight and height_")
        with gr.Row():
            weight_input = gr.Slider(50, 400, step=0.1, label="Weight (lbs)", value=default_weight)
            height_input = gr.Slider(48, 84, step=0.1, label="Height (inches)", value=default_height)
            # Pre-fill so the field is not blank before a slider is moved.
            bmi_output = gr.Number(
                label="BMI (auto-calculated)",
                interactive=False,
                value=update_bmi(default_weight, default_height),
            )

        # Keep the BMI field in sync with either slider.
        weight_input.change(fn=update_bmi, inputs=[weight_input, height_input], outputs=bmi_output)
        height_input.change(fn=update_bmi, inputs=[weight_input, height_input], outputs=bmi_output)

        age_input = gr.Dropdown(list(range(18, 100)), label="Age", value=45)
        high_bp_input = gr.Radio([0, 1], label="High Blood Pressure (0=No, 1=Yes)", value=0)
        # NOTE(review): verify the direction of this scale against the
        # training data's GenHlth encoding before relying on the label.
        gen_hlth_input = gr.Slider(1, 5, step=1, label="General Health (1=Poor, 5=Excellent)", value=3)
        high_chol_input = gr.Radio([0, 1], label="High Cholesterol (0=No, 1=Yes)", value=0)

        predict_btn = gr.Button("Predict Risk")

        risk_label = gr.Label(label="Risk Level")
        prob_output = gr.Number(label="Diabetes Probability (%)")
        conf_output = gr.Number(label="Confidence Score (%)")
        bmi_display = gr.Number(label="Calculated BMI")

        predict_btn.click(
            fn=predict,
            inputs=[weight_input, height_input, age_input, high_bp_input, gen_hlth_input, high_chol_input],
            outputs=[risk_label, prob_output, conf_output, bmi_display]
        )

    return demo

# Train, evaluate, save, and serve the model

if __name__ == "__main__":
    # End-to-end pipeline: load data, split, scale, train, evaluate, persist, serve.
    # NOTE(review): this fetches the dataset again even when an earlier cell has
    # already populated X, y and df.
    X, y, df = load_and_prepare_data()
    print("✅ Data loaded. Sample:")
    print(df.head())

    # Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Fit the scaler on the training rows only, then apply the same transform to the test rows.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model = train_optimized_model(X_train_scaled, y_train)

    # Evaluate on the held-out rows
    y_pred = model.predict(X_test_scaled)
    print(f"\n🎯 Optimized Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print("\n🔍 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\n📋 Classification Report:\n", classification_report(y_test, y_pred))

    # Persist the model, the scaler and the training column order to the working directory
    joblib.dump(model, "diabetes_model.pkl")
    joblib.dump(scaler, "scaler.pkl")
    joblib.dump(X.columns.tolist(), "feature_names.pkl")

    # Build and launch the Gradio app with the freshly trained artifacts.
    # NOTE(review): share=True requests a publicly reachable link from Gradio —
    # confirm that is intended, since predict() logs user inputs to a local CSV.
    demo = create_gradio_interface(model, scaler, X.columns.tolist())
    demo.launch(share=True)