In [180]:
import dash
from dash import html, dcc
import dash_bootstrap_components as dbc
import ollama
import plotly.express as px
import pandas as pd
import missingno as msno
import matplotlib.pyplot as plt
import plotly.figure_factory as ff
import io
import base64
import json
import numpy as np
import os
import plotly.graph_objects as go


In [169]:
assets_folder = "metadata"

json_file_path_user_ip = os.path.join(assets_folder, "user_inputs.json")
with open(json_file_path_user_ip, 'r') as f:
    user_inputs = json.load(f)

json_file_path_user_dtype = os.path.join(assets_folder, "user_data_types.json")
with open(json_file_path_user_dtype, 'r') as f:
    user_data_types = json.load(f)

json_file_path_prepocess_report_ip = os.path.join(assets_folder, "preprocessing_report_inputs.json")
with open(json_file_path_prepocess_report_ip,'r') as f:
    preprocessing_report_inputs = json.load(f)

json_file_path_prepocess_ip = os.path.join(assets_folder, "preprocessing_inputs.json")    
with open(json_file_path_prepocess_ip,'r') as f:
    preprocessing_inputs = json.load(f)

In [170]:
assets_folder = "data"
csv_file_path_raw = os.path.join(assets_folder, "df_raw.csv")
df_dtype_converted = pd.read_csv(csv_file_path_raw)

data = df_dtype_converted.sample(frac=0.95,random_state=123) # 95% data for Model training and testing
evaluation_data = df_dtype_converted.drop(data.index) # 5% data for Final Evaluation


In [171]:
assets_folder = "metadata"
json_file_path_ml_report = os.path.join(assets_folder, "ml_model_report_inputs.json")
with open(json_file_path_ml_report,'r') as f:
    ml_model_report_inputs= json.load(f)

In [172]:
ml_model_report_inputs

{'data_split': {'train_data_size': 889,
  'test_data_size': 381,
  'evaluation_data_size': 67},
 'total_preprocess_param_comb': 120,
 'intial_models_comparison': {'model': ['Gradient Boosting Regressor',
   'Random Forest Regressor',
   'AdaBoost Regressor',
   'Lasso Regression',
   'Ridge Regression',
   'Linear Regression',
   'Decision Tree Regressor',
   'K Neighbors Regressor',
   'Support Vector Regression'],
  'rmse': [4741.4442,
   4944.5859,
   5335.6514,
   6102.0297,
   6101.5581,
   6102.0969,
   6781.1992,
   11765.2596,
   12888.0853],
  'r2': [0.844,
   0.8308,
   0.8058,
   0.7447,
   0.7447,
   0.7446,
   0.687,
   0.0633,
   -0.1229]},
 'model_comparison': {'total_fits': 60,
  'top_model': 'Gradient Boosting Regressor',
  'top_param': {'Transformation': True,
   'Transform_Method': 'yeo-johnson',
   'Normalization': True,
   'Normalize_Method': 'robust',
   'Outlier': False,
   'Outlier_Method': None,
   'Multicollinearity': False,
   'Target Transformation': False,


In [173]:
# List all SHAP images in the assets folder
shap_images = [img for img in os.listdir("assets/") if img.startswith("shap_partial_")]

# Create options for dropdown (Remove "shap_partial_" from names)
dropdown_options = [{"label": img.replace("shap_partial_", "").replace(".png", ""), "value": img} for img in shap_images]

In [174]:
shap_images_loc = [img for img in os.listdir("assets/") if img.startswith("shap_local_explain_sample")]

# ✅ Create dropdown options
dropdown_options_loc = [{"label": f"Instance {i+1}", "value": img} for i, img in enumerate(shap_images_loc)]

In [175]:
def generate_explanation(stage):
    prompts = { 
        "data_split" : f"Explain the data split summary based on {ml_model_report_inputs['data_split']}. Minimum 100 words maximum 150 word",
        "initial_modle_compare" : f"Summary on compare 10 models on {int(user_inputs['K-Fold'])} number of Kfold cross validation. Top 3 Models {ml_model_report_inputs['intial_models_comparison']['model'][:3]} are choosen and their R2 values respectively are {ml_model_report_inputs['intial_models_comparison']['r2'][0:3]}  provide justification based on this",
        "top3_model_para_compare" : f"Provide summary : Top 3 Models {ml_model_report_inputs['intial_models_comparison']['model'][:3]} fitted with total {ml_model_report_inputs['model_comparison']['total_fits']} models based on all the combination of preprocessing parameters - Normalizaton,Transformation,outliers,Feature seletion.The top model found {ml_model_report_inputs['model_comparison']['top_model']} and the preprocessing parameters are {ml_model_report_inputs['model_comparison']['top_param']}",
        "final_model_build" : f"Provide summary of the Final Model built based on {ml_model_report_inputs['best_model_final_results']}",
        "ensemble_model" : f"Best Model {ml_model_report_inputs['final_best_model']}.Provide Summary of Model comparison based on {ml_model_report_inputs['ensemble_results']}",
        "residual_plot" : f"Explain how to read Residual plot in Mahine learning. Maximum 200 Words",
        "error_plot" : f"Explain how to read Error plot in Mahine learning. Maximum 200 Words",
        "shap_summary_plot" : f"""Explain how to read SHAP Summary Plot in explainable AI.
                                    Features Are Sorted by ImportanceTop to bottom: 
                                    The most important features are at the top.
                                    SHAP Values Indicate Impact:
                                    The X-axis represents SHAP values (impact on model output).
                                    Negative SHAP values (left) → Decrease the prediction.
                                    Positive SHAP values (right) → Increase the prediction.
                                    Colors Represent Feature Values:
                                    Color Gradient (from Blue to Red):
                                    Blue - Lower feature values
                                    Red - Higher feature valuesMaximum 200 Words""",
        "shap_summary_feature" : f"Explain how to read Feature Importance Plot.Maximum 200 Words",
        "partial_dependence" : f"Exlpain about partial dependence plot in Global explainability.Maximum 150 Words",
        "local_explain" : """Your task is to explain how to interpret a SHAP waterfall plot in a structured and concise manner.

                            ### **📊 Understanding a SHAP Waterfall Plot**

                            A SHAP waterfall plot visualizes how **each feature** in a machine learning model contributes to a specific prediction by **adding or subtracting value** from a base (expected) prediction.

                            #### **1️⃣ Key Components**
                            - **Base Value (E[f(X)])**: This is the model’s average prediction across all data points.
                            - **Final Prediction (f(x))**: The final predicted value for this specific instance.
                            - **Feature Contributions**:
                            - **Positive Contributions** (🔴 Red bars): Features that **increase** the prediction.
                            - **Negative Contributions** (🔵 Blue bars): Features that **decrease** the prediction.
                            - **Feature Importance Order**: The most impactful features are placed at the **top**, while less significant ones are at the **bottom**.

                            #### **2️⃣ How to Read the Plot**
                            1. **Start from the base value** (leftmost value at the bottom).
                            2. **Observe how each feature modifies the base value** step by step.
                            - Features **pushing the prediction higher** are in **red**.
                            - Features **pulling the prediction lower** are in **blue**.
                            3. **The final prediction (f(x))** is obtained by summing the base value and the feature effects.

                            #### **3️⃣ Interpretation**
                            - **Which features have the most impact?**  
                            - The largest bars (longest red/blue bars) indicate the strongest contributors.
                            - **Is the prediction higher or lower than expected?**  
                            - If most contributions are **positive (red)**, the prediction is **above average**.  
                            - If mostly **negative (blue)**, the prediction is **below average**.
                            - **How do categorical and numerical features affect the outcome?**  
                            - Binary features (e.g., **smoker = 1**) often have a sharp impact.
                            - Continuous features (e.g., **age, BMI**) gradually modify the prediction."""
    }
    
    response = ollama.chat(model="llama2:chat", messages=[{"role": "user", "content": prompts[stage]}])
    
    return response['message']['content']

explanations = {
           "data_split_ex" : generate_explanation("data_split"),
           "initial_modle_compare" : generate_explanation("initial_modle_compare"),
           "top3_model_para_compare" : generate_explanation("top3_model_para_compare"),
           "final_model_build" : generate_explanation("final_model_build"),
           "ensemble_model" : generate_explanation("ensemble_model"),
           "residual_plot" : generate_explanation("residual_plot"),
           "error_plot" : generate_explanation("error_plot"),
           "shap_summary_plot" : generate_explanation("shap_summary_plot"),
           "shap_summary_feature" : generate_explanation("shap_summary_feature"),
           "partial_dependence" : generate_explanation("partial_dependence"),
           "local_explain" : generate_explanation("local_explain")
}

In [176]:
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
app.layout = dbc.Container([
    dbc.Row(dbc.Col(html.H2(f"ML Model Build Report - {user_inputs['Machine Learning Task']}"), className="text-center my-3")),
    dbc.Row([dbc.Col(html.H4("Data Split"))]),
    dbc.Row([dbc.Col(dcc.Markdown(explanations["data_split_ex"]),width = 6),
            dbc.Col(dcc.Graph(id = 'data_split_id'),className="d-flex justify-content-center",width = 6)]),
    dcc.Interval(id='interval-component', interval=1*1000, n_intervals=0, max_intervals=1),
    
    dbc.Row([dbc.Col(html.H4("Initial Model Comparision"))]),
    dbc.Row([dbc.Col(dcc.Graph(id = 'intial_model_compare' ))]),
    dcc.Interval(id='interval-component', interval=1*1000, n_intervals=0, max_intervals=1),
    dbc.Row([dbc.Col(dcc.Markdown(explanations['initial_modle_compare']))]),
    dbc.Row([dbc.Col(html.H4("Fine Tunning Model PreProcessing Parameters"))]),
    dbc.Row([dbc.Col(dcc.Markdown(explanations['top3_model_para_compare']))]),
    dbc.Row([dbc.Col(html.H4("Final Model Build"))]),
    dbc.Row([dbc.Col(dcc.Graph(id = 'final_model_build' ))]),
    dbc.Row([dbc.Col(dcc.Markdown(explanations['final_model_build']))]),
    dbc.Row([dbc.Col(html.H4("Ensemble Model Comaprison"))]),
    dbc.Row([dbc.Col(dcc.Graph(id = 'ensemble_model' ))]),
    dbc.Row([dbc.Col(dcc.Markdown(explanations['ensemble_model']))]),
    dbc.Row([dbc.Col(html.H4("Model Analysis"))]),
    dbc.Row([dbc.Col(html.H4("Residuals Plot"))]),
    dbc.Row(dbc.Col(html.Img(src="/assets/residuals_plot.png", style={"width": "60%"}))),
    dbc.Row([dbc.Col(dcc.Markdown(explanations['residual_plot']))]),
    dbc.Row([dbc.Col(html.H4("Error Plot"))]),
    dbc.Row(dbc.Col(html.Img(src="/assets/error_plot.png", style={"width": "60%"}))),
    dbc.Row([dbc.Col(dcc.Markdown(explanations['error_plot']))]),
    dbc.Row([dbc.Col(html.H4("Explainable AI"))]),
    dbc.Row([dbc.Col(html.H4("Global Explainability - Summary Plot"))]),
    dbc.Row(dbc.Col(html.Img(src="/assets/shap_summary_plot.png", style={"width": "60%"}))),
    dbc.Row([dbc.Col(dcc.Markdown(explanations['shap_summary_plot']))]),
    dbc.Row([dbc.Col(html.H4("Global Explainability - Feature Importance"))]),
    dbc.Row(dbc.Col(html.Img(src="/assets/shap_summary_plot_feature.png", style={"width": "60%"}))),
    dbc.Row([dbc.Col(dcc.Markdown(explanations['shap_summary_feature']))]),
    dbc.Row([dbc.Col(html.H4("Global Explainability - Partial Dependence Plot"))]),
    dbc.Row([dbc.Col(dcc.Dropdown(
                                    id="shap_plot_selector",
                                    options=dropdown_options,
                                    value=shap_images[0],  # Default selection
                                    clearable=False),)]),
    dbc.Row([html.Img(id="selected_shap_plot", style={"width": "60%", "margin-top": "20px"})]),
    dbc.Row([dbc.Col(dcc.Markdown(explanations['partial_dependence']))]),
    dbc.Row([dbc.Col(html.H4("Local Explainability - SHAP Waterfall Plot"))]),
    dbc.Row([dcc.Dropdown(
        id="shap_waterfall_selector",
        options=dropdown_options_loc,
        value=shap_images[0],  # Default selection
        clearable=False
    ),]),
    dbc.Row([html.Img(id="selected_shap_waterfall", style={"width": "60%", "margin-top": "20px"})]),
    dbc.Row([dbc.Col(dcc.Markdown(explanations['local_explain']))]),
])

@app.callback(
    dash.dependencies.Output('data_split_id', 'figure'),
    [dash.dependencies.Input('interval-component', 'n_intervals')]
)

def data_split_plot(n_intervals):
    
    labels = ['Train', 'Test', 'Evaluation Data']
    values = [ml_model_report_inputs['data_split']['train_data_size'], ml_model_report_inputs['data_split']['test_data_size'], ml_model_report_inputs['data_split']['evaluation_data_size']]

    # Define the "pull" effect (only Evaluation Data is pulled out)
    pull_effect = [0.2, 0, 0]  # Only Evaluation Data is pulled out

    # Create Pie Chart
    fig = go.Figure(data=[go.Pie(labels=labels, values=values, pull=pull_effect)])

    # Update Layout
    fig.update_layout(width=450, height=450,title = "Pie Chart",
                      legend=dict(orientation="h",   # Horizontal legend
                                    x=0.5, y=1.1,     # Centered below the chart
                                    xanchor="center", yanchor="top",
                                    font=dict(size=12)))

    return fig

@app.callback(
    dash.dependencies.Output('intial_model_compare', 'figure'),
    [dash.dependencies.Input('interval-component', 'n_intervals')]
)

def intial_model_compare(n_intercals):
    data = ml_model_report_inputs['intial_models_comparison']

    df = pd.DataFrame(data)

    # Create figure with secondary y-axis
    fig = make_subplots(specs=[[{"secondary_y": True}]])

    # Add RMSE as Bar Chart (Primary Y-Axis)
    fig.add_trace(go.Bar(
        x=df["model"], y=df["rmse"], name="RMSE (Error)", marker_color="indigo"
    ), secondary_y=False)

    # Add R² as Line Chart (Secondary Y-Axis)
    fig.add_trace(go.Scatter(
        x=df["model"], y=df["r2"], name="R² Score (Accuracy)",
        mode="lines+markers", marker=dict(color="red")
    ), secondary_y=True)

    # Update Layout
    fig.update_layout(
        title="📊 Model Performance: RMSE vs R²",
        xaxis_title="Model",
        yaxis_title="RMSE (Lower is Better)",  # Left Y-Axis Label
        yaxis2_title="R² Score (Higher is Better)",# Right Y-Axis Label
        height = 500,
        legend=dict(orientation="h", x=0.5, y=1.1, xanchor="center")  # Horizontal legend below the chart
    )

    return fig 

@app.callback(
    dash.dependencies.Output('final_model_build', 'figure'),
    [dash.dependencies.Input('interval-component', 'n_intervals')]
)

def final_model_build(n_intervals):
    
    # Given dictionary with mixed data types
    raw_results = ml_model_report_inputs['best_model_final_results']

    # Function to extract numeric values from the dictionary
    def extract_numeric(value):
        if isinstance(value, str):
            return float(value.split()[1])  # Extract the numeric part from string
        return value  # Return as is if already a number

    # Convert dictionary into structured data format
    converted_data = {
        "Stage": ["Preprocess", "Test", "Hyperparameter Test", "Evaluation"],
        "R2": [
            extract_numeric(raw_results["preprocess_results"]["R2"]),
            extract_numeric(raw_results["test_result"]["R2"]),
            extract_numeric(raw_results["hyperpar_test_result"]["R2"]),
            extract_numeric(raw_results["eval_result"]["R2"])
        ],
        "RMSE": [
            extract_numeric(raw_results["preprocess_results"]["RMSE"]),
            extract_numeric(raw_results["test_result"]["RMSE"]),
            extract_numeric(raw_results["hyperpar_test_result"]["RMSE"]),
            extract_numeric(raw_results["eval_result"]["RMSE"])
        ]
    }

    df = pd.DataFrame(converted_data)

    # Create figure with secondary y-axis
    fig = make_subplots(specs=[[{"secondary_y": True}]])

    # Add RMSE as Bar Chart (Primary Y-Axis)
    fig.add_trace(go.Bar(
        x=df["Stage"], y=df["RMSE"], name="RMSE (Error)", marker_color="indigo"
    ), secondary_y=False)

    # Add R² as Line Chart (Secondary Y-Axis)
    fig.add_trace(go.Scatter(
        x=df["Stage"], y=df["R2"], name="R² Score (Accuracy)",
        mode="lines+markers", marker=dict(color="crimson")
    ), secondary_y=True)

    # Update Layout
    fig.update_layout(
        title="📊 Model Performance at Different Stages",
        xaxis_title="Stage",
        yaxis_title="RMSE (Lower is Better)",  # Left Y-Axis Label
        yaxis2_title="R² Score (Higher is Better)",  # Right Y-Axis Label
        width=800, height=500,
        legend=dict(orientation="h", x=0.5, y=-0.2, xanchor="center")  # Horizontal legend
    )

    return fig

@app.callback(
    dash.dependencies.Output('ensemble_model', 'figure'),
    [dash.dependencies.Input('interval-component', 'n_intervals')]
)

def ensemble_model(n_intervals):
    
    data = ml_model_report_inputs['ensemble_results']

    df = pd.DataFrame(data)

    # Create figure with secondary y-axis
    fig = make_subplots(specs=[[{"secondary_y": True}]])  # Enables dual-axis

    # Add RMSE as Bar Chart (Primary Y-Axis)
    fig.add_trace(go.Bar(
        x=df["Model"], y=df["RMSE"], name="RMSE (Error)", marker_color="indigo"
    ), secondary_y=False)

    # Add R² as Line Chart (Secondary Y-Axis)
    fig.add_trace(go.Scatter(
        x=df["Model"], y=df["R2"], name="R² Score (Accuracy)",
        mode="lines+markers", marker=dict(color="crimson")
    ), secondary_y=True)

    # Update Layout
    fig.update_layout(
        title="📊 Model Performance: RMSE vs R²",
        xaxis_title="Model",
        yaxis_title="RMSE (Lower is Better)",  # Left Y-Axis Label
        yaxis2_title="R² Score (Higher is Better)",  # Right Y-Axis Label
        width=900, height=500,
        legend=dict(orientation="h", x=0.5, y=-0.2, xanchor="center")  # Horizontal legend below the chart
    )
    return fig

@app.callback(
    dash.dependencies.Output("selected_shap_plot", "src"),
    [dash.dependencies.Input("shap_plot_selector", "value")]
)
def update_image(selected_image):
    return f"/assets/{selected_image}"

@app.callback(
    dash.dependencies.Output("selected_shap_waterfall", "src"),
    [dash.dependencies.Input("shap_waterfall_selector", "value")]
)
def update_image(selected_image):
    return f"/assets/{selected_image}"



if __name__ == "__main__":
    app.run_server(debug=True,port = 8086)

---------------------------------------------------------------------------
DuplicateIdError                          Traceback (most recent call last)
DuplicateIdError: Duplicate component id found in the initial layout: `interval-component`



---------------------------------------------------------------------------
NotFound                                  Traceback (most recent call last)
NotFound: 404 Not Found: The requested URL was not found on the server. If you entered the URL manually please check your spelling and try again.

---------------------------------------------------------------------------
NotFound                                  Traceback (most recent call last)
NotFound: 404 Not Found: The requested URL was not found on the server. If you entered the URL manually please check your spelling and try again.

---------------------------------------------------------------------------
NotFound                                  Traceback (most recent call last)
NotFound: 404 Not Found: The requested URL was not found on the server. If you entered the URL manually please check your spelling and try again.

---------------------------------------------------------------------------
NotFound                   

In [177]:
ml_model_report_inputs['ensemble_results']

{'Model': ['Gradient Boosting Regressor',
  'Gradient Boosting Regressor(Bagging)',
  'Gradient Boosting Regressor(Boosting)',
  'Voting Regressor',
  'Stacking Regressor'],
 'R2': [0.9067, 0.903, 0.903, 0.8933, 0.9029],
 'RMSE': [3945.0074, 4022.1087, 4022.1087, 4217.5356, 4023.606]}

In [178]:
# Given dictionary with mixed data types
raw_results = ml_model_report_inputs['best_model_final_results']

# Function to extract numeric values from the dictionary
def extract_numeric(value):
    if isinstance(value, str):
        return float(value.split()[1])  # Extract the numeric part from string
    return value  # Return as is if already a number

# Convert dictionary into structured data format
converted_data = {
    "Stage": ["Preprocess", "Test", "Hyperparameter Test", "Evaluation"],
    "R2": [
        extract_numeric(raw_results["preprocess_results"]["R2"]),
        extract_numeric(raw_results["test_result"]["R2"]),
        extract_numeric(raw_results["hyperpar_test_result"]["R2"]),
        extract_numeric(raw_results["eval_result"]["R2"])
    ],
    "RMSE": [
        extract_numeric(raw_results["preprocess_results"]["RMSE"]),
        extract_numeric(raw_results["test_result"]["RMSE"]),
        extract_numeric(raw_results["hyperpar_test_result"]["RMSE"]),
        extract_numeric(raw_results["eval_result"]["RMSE"])
    ]
}

# Display structured data
import pandas as pd
df_converted = pd.DataFrame(converted_data)
df_converted


Unnamed: 0,Stage,R2,RMSE
0,Preprocess,0.8445,4732.9684
1,Test,0.8464,4624.9717
2,Hyperparameter Test,0.7918,5506.2722
3,Evaluation,0.9067,3945.0074


In [179]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd

# Given Data
data = {
    "Stage": ["Preprocess", "Test", "Hyperparameter Test", "Evaluation"],
    "R2": [0.8358, 0.8593, 0.7909, 0.9067],  # Converted from string
    "RMSE": [4749.9178, 4645.4526, 5399.806, 3945.0074]  # Converted from string
}

df = pd.DataFrame(data)

# Create figure with secondary y-axis
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add RMSE as Bar Chart (Primary Y-Axis)
fig.add_trace(go.Bar(
    x=df["Stage"], y=df["RMSE"], name="RMSE (Error)", marker_color="royalblue"
), secondary_y=False)

# Add R² as Line Chart (Secondary Y-Axis)
fig.add_trace(go.Scatter(
    x=df["Stage"], y=df["R2"], name="R² Score (Accuracy)",
    mode="lines+markers", marker=dict(color="crimson")
), secondary_y=True)

# Update Layout
fig.update_layout(
    title="📊 Model Performance at Different Stages",
    xaxis_title="Stage",
    yaxis_title="RMSE (Lower is Better)",  # Left Y-Axis Label
    yaxis2_title="R² Score (Higher is Better)",  # Right Y-Axis Label
    width=800, height=500,
    legend=dict(orientation="h", x=0.5, y=-0.2, xanchor="center")  # Horizontal legend
)

fig.show()
