In [25]:
import dash
from dash import html, dcc
import dash_bootstrap_components as dbc
import ollama
import plotly.express as px
import pandas as pd
import missingno as msno
import matplotlib.pyplot as plt
import plotly.figure_factory as ff
import io
import base64
import json
import numpy as np
import os
import plotly.graph_objects as go


In [16]:
assets_folder = "metadata"

json_file_path_user_ip = os.path.join(assets_folder, "user_inputs.json")
with open(json_file_path_user_ip, 'r') as f:
    user_inputs = json.load(f)

json_file_path_user_dtype = os.path.join(assets_folder, "user_data_types.json")
with open(json_file_path_user_dtype, 'r') as f:
    user_data_types = json.load(f)

json_file_path_prepocess_report_ip = os.path.join(assets_folder, "preprocessing_report_inputs.json")
with open(json_file_path_prepocess_report_ip,'r') as f:
    preprocessing_report_inputs = json.load(f)

json_file_path_prepocess_ip = os.path.join(assets_folder, "preprocessing_inputs.json")    
with open(json_file_path_prepocess_ip,'r') as f:
    preprocessing_inputs = json.load(f)

In [17]:
assets_folder = "data"
csv_file_path_raw = os.path.join(assets_folder, "df_raw.csv")
df_dtype_converted = pd.read_csv(csv_file_path_raw)

data = df_dtype_converted.sample(frac=0.95,random_state=123) # 95% data for Model training and testing
evaluation_data = df_dtype_converted.drop(data.index) # 5% data for Final Evaluation


In [18]:
assets_folder = "metadata"
json_file_path_ml_report = os.path.join(assets_folder, "ml_model_report_inputs.json")
with open(json_file_path_ml_report,'r') as f:
    ml_model_report_inputs= json.load(f)

In [19]:
ml_model_report_inputs

{'data_split': {'train_data_size': 510,
  'test_data_size': 220,
  'evaluation_data_size': 38},
 'total_preprocess_param_comb': 15,
 'intial_models_comparison': {'model': ['Ridge Classifier',
   'Logistic Regression',
   'Gradient Boosting Classifier',
   'Random Forest Classifier',
   'Ada Boost Classifier',
   'K Neighbors Classifier',
   'Decision Tree Classifier',
   'SVM - Linear Kernel'],
  'accuracy': [0.7549, 0.7686, 0.749, 0.7412, 0.7431, 0.6922, 0.6882, 0.5725],
  'auc': [0.8194, 0.8186, 0.8131, 0.7993, 0.7732, 0.6995, 0.6668, 0.5287],
  'recall': [0.5221, 0.5556, 0.5841, 0.5619, 0.5783, 0.4943, 0.5956, 0.393],
  'precision': [0.6973, 0.716, 0.6554, 0.6498, 0.6465, 0.5672, 0.5486, 0.3094],
  'f1': [0.5962, 0.6245, 0.6158, 0.5996, 0.6098, 0.5277, 0.5704, 0.2769]},
 'model_comparison': {'total_fits': 15,
  'top_model': 'Ridge Classifier',
  'top_param': {'Transformation': True,
   'Transform_Method': 'yeo-johnson',
   'Normalization': False,
   'Normalize_Method': None,
   'Out

In [20]:
# List all SHAP images in the assets folder
shap_images = [img for img in os.listdir("assets/") if img.startswith("shap_partial_class_")]

# Create options for dropdown (Remove "shap_partial_" from names)
dropdown_options = [{"label": img.replace("shap_partial_", "").replace(".png", ""), "value": img} for img in shap_images]

In [21]:
shap_images_loc = [img for img in os.listdir("assets/") if img.startswith("shap_local_explain_sample_class")]

# ✅ Create dropdown options
dropdown_options_loc = [{"label": f"Instance {i+1}", "value": img} for i, img in enumerate(shap_images_loc)]

In [22]:
def generate_explanation(stage):
    prompts = { 
        "data_split" : f"Explain the data split summary based on {ml_model_report_inputs['data_split']}. Minimum 100 words maximum 150 word",
        "initial_modle_compare" : f"Summary on compare 10 models on {int(user_inputs['K-Fold'])} number of Kfold cross validation. Top 3 Models {ml_model_report_inputs['intial_models_comparison']['model'][:3]} are choosen and their R2 values respectively are {ml_model_report_inputs['intial_models_comparison']['auc'][0:3]}  provide justification based on this",
        "top3_model_para_compare" : f"Provide summary : Top 3 Models {ml_model_report_inputs['intial_models_comparison']['model'][:3]} fitted with total {ml_model_report_inputs['model_comparison']['total_fits']} models based on all the combination of preprocessing parameters - Normalizaton,Transformation,outliers,Feature seletion.The top model found {ml_model_report_inputs['model_comparison']['top_model']} and the preprocessing parameters are {ml_model_report_inputs['model_comparison']['top_param']}",
        "final_model_build" : f"Provide summary of the Final Model built based on {ml_model_report_inputs['best_model_final_results']}",
        "ensemble_model" : f"Best Model {ml_model_report_inputs['final_best_model']}.Provide Summary of Model comparison based on {ml_model_report_inputs['ensemble_results']}",
        "boundary_plot" : f"Explain how to read Boundary plot in {user_inputs['Machine Learning Task']} Mahine learning task. Maximum 200 Words",
        "confusion_matrix" : f"Explain how to read confusion matrix and Class Report plot in {user_inputs['Machine Learning Task']} Mahine learning task. Maximum 200 Words",
        "shap_summary_plot" : f"""Explain how to read SHAP Summary Plot in explainable AI.
                                    Features Are Sorted by ImportanceTop to bottom: 
                                    The most important features are at the top.
                                    SHAP Values Indicate Impact:
                                    The X-axis represents SHAP values (impact on model output).
                                    Negative SHAP values (left) → Decrease the prediction.
                                    Positive SHAP values (right) → Increase the prediction.
                                    Colors Represent Feature Values:
                                    Color Gradient (from Blue to Red):
                                    Blue - Lower feature values
                                    Red - Higher feature valuesMaximum 200 Words""",
        "shap_summary_feature" : f"Explain how to read Feature Importance Plot.Maximum 200 Words",
        "partial_dependence" : f"Exlpain about partial dependence plot in Global explainability.Maximum 150 Words",
        "local_explain" : """Your task is to explain how to interpret a SHAP waterfall plot in a structured and concise manner.

                            ### **📊 Understanding a SHAP Waterfall Plot**

                            A SHAP waterfall plot visualizes how **each feature** in a machine learning model contributes to a specific prediction by **adding or subtracting value** from a base (expected) prediction.

                            #### **1️⃣ Key Components**
                            - **Base Value (E[f(X)])**: This is the model’s average prediction across all data points.
                            - **Final Prediction (f(x))**: The final predicted value for this specific instance.
                            - **Feature Contributions**:
                            - **Positive Contributions** (🔴 Red bars): Features that **increase** the prediction.
                            - **Negative Contributions** (🔵 Blue bars): Features that **decrease** the prediction.
                            - **Feature Importance Order**: The most impactful features are placed at the **top**, while less significant ones are at the **bottom**.

                            #### **2️⃣ How to Read the Plot**
                            1. **Start from the base value** (leftmost value at the bottom).
                            2. **Observe how each feature modifies the base value** step by step.
                            - Features **pushing the prediction higher** are in **red**.
                            - Features **pulling the prediction lower** are in **blue**.
                            3. **The final prediction (f(x))** is obtained by summing the base value and the feature effects.

                            #### **3️⃣ Interpretation**
                            - **Which features have the most impact?**  
                            - The largest bars (longest red/blue bars) indicate the strongest contributors.
                            - **Is the prediction higher or lower than expected?**  
                            - If most contributions are **positive (red)**, the prediction is **above average**.  
                            - If mostly **negative (blue)**, the prediction is **below average**.
                            - **How do categorical and numerical features affect the outcome?**  
                            - Binary features (e.g., **smoker = 1**) often have a sharp impact.
                            - Continuous features (e.g., **age, BMI**) gradually modify the prediction."""
    }
    
    response = ollama.chat(model="llama2:chat", messages=[{"role": "user", "content": prompts[stage]}])
    
    return response['message']['content']

explanations = {
           "data_split_ex" : generate_explanation("data_split"),
           "initial_modle_compare" : generate_explanation("initial_modle_compare"),
           "top3_model_para_compare" : generate_explanation("top3_model_para_compare"),
           "final_model_build" : generate_explanation("final_model_build"),
           "ensemble_model" : generate_explanation("ensemble_model"),
           "boundary_plot" : generate_explanation("boundary_plot"),
           "confusion_matrix" : generate_explanation("confusion_matrix"),
           "shap_summary_plot" : generate_explanation("shap_summary_plot"),
           "shap_summary_feature" : generate_explanation("shap_summary_feature"),
           "partial_dependence" : generate_explanation("partial_dependence"),
           "local_explain" : generate_explanation("local_explain")
}

In [26]:
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
app.layout = dbc.Container([
    dbc.Row(dbc.Col(html.H2(f"ML Model Build Report - {user_inputs['Machine Learning Task']}"), className="text-center my-3")),
    dbc.Row([dbc.Col(html.H4("Data Split"))]),
    dbc.Row([dbc.Col(dcc.Markdown(explanations["data_split_ex"]),width = 6),
            dbc.Col(dcc.Graph(id = 'data_split_id'),className="d-flex justify-content-center",width = 6)]),
    dcc.Interval(id='interval-component', interval=1*1000, n_intervals=0, max_intervals=1),
    
    dbc.Row([dbc.Col(html.H4("Initial Model Comparision"))]),
    dbc.Row([dbc.Col(dcc.Graph(id = 'intial_model_compare' ))]),
    dcc.Interval(id='interval-component', interval=1*1000, n_intervals=0, max_intervals=1),
    dbc.Row([dbc.Col(dcc.Markdown(explanations['initial_modle_compare']))]),
    dbc.Row([dbc.Col(html.H4("Fine Tunning Model PreProcessing Parameters"))]),
    dbc.Row([dbc.Col(dcc.Markdown(explanations['top3_model_para_compare']))]),
    dbc.Row([dbc.Col(html.H4("Final Model Build"))]),
    dbc.Row([dbc.Col(dcc.Graph(id = 'final_model_build' ))]),
    dbc.Row([dbc.Col(dcc.Markdown(explanations['final_model_build']))]),
    dbc.Row([dbc.Col(html.H4("Ensemble Model Comaprison"))]),
    dbc.Row([dbc.Col(dcc.Graph(id = 'ensemble_model' ))]),
    dbc.Row([dbc.Col(dcc.Markdown(explanations['ensemble_model']))]),
    dbc.Row([dbc.Col(html.H4("Model Analysis"))]),
    dbc.Row([dbc.Col(html.H4("Boundary Plot"))]),
    dbc.Row(dbc.Col(html.Img(src="/assets/boundary_plot.png", style={"width": "60%"}))),
    dbc.Row([dbc.Col(dcc.Markdown(explanations['boundary_plot']))]),
    dbc.Row([dbc.Col(html.H4("Confusion Matrix and Class Report"))]),
    dbc.Row([dbc.Col(html.Img(src="/assets/confusion_matrix_plot.png", style={"width": "100%"})),
            dbc.Col(html.Img(src="/assets/class_report_plot.png", style={"width": "100%"}))]),
    dbc.Row([dbc.Col(dcc.Markdown(explanations['confusion_matrix']))]),
    dbc.Row([dbc.Col(html.H4("Explainable AI"))]),
    dbc.Row([dbc.Col(html.H4("Global Explainability - Summary Plot"))]),
    dbc.Row(dbc.Col(html.Img(src="/assets/shap_summary_plot_class.png", style={"width": "60%"}))),
    dbc.Row([dbc.Col(dcc.Markdown(explanations['shap_summary_plot']))]),
    dbc.Row([dbc.Col(html.H4("Global Explainability - Feature Importance"))]),
    dbc.Row(dbc.Col(html.Img(src="/assets/shap_summary_plot_feature_class.png", style={"width": "60%"}))),
    dbc.Row([dbc.Col(dcc.Markdown(explanations['shap_summary_feature']))]),
    dbc.Row([dbc.Col(html.H4("Global Explainability - Partial Dependence Plot"))]),
    dbc.Row([dbc.Col(dcc.Dropdown(
                                    id="shap_plot_selector",
                                    options=dropdown_options,
                                    value=shap_images[0],  # Default selection
                                    clearable=False),)]),
    dbc.Row([html.Img(id="selected_shap_plot", style={"width": "60%", "margin-top": "20px"})]),
    dbc.Row([dbc.Col(dcc.Markdown(explanations['partial_dependence']))]),
    dbc.Row([dbc.Col(html.H4("Local Explainability - SHAP Waterfall Plot"))]),
    dbc.Row([dcc.Dropdown(
        id="shap_waterfall_selector",
        options=dropdown_options_loc,
        value=shap_images[0],  # Default selection
        clearable=False
    ),]),
    dbc.Row([html.Img(id="selected_shap_waterfall", style={"width": "60%", "margin-top": "20px"})]),
    dbc.Row([dbc.Col(dcc.Markdown(explanations['local_explain']))]),
])

@app.callback(
    dash.dependencies.Output('data_split_id', 'figure'),
    [dash.dependencies.Input('interval-component', 'n_intervals')]
)

def data_split_plot(n_intervals):
    
    labels = ['Train', 'Test', 'Evaluation Data']
    values = [ml_model_report_inputs['data_split']['train_data_size'], ml_model_report_inputs['data_split']['test_data_size'], ml_model_report_inputs['data_split']['evaluation_data_size']]

    # Define the "pull" effect (only Evaluation Data is pulled out)
    pull_effect = [0.2, 0, 0]  # Only Evaluation Data is pulled out

    # Create Pie Chart
    fig = go.Figure(data=[go.Pie(labels=labels, values=values, pull=pull_effect)])

    # Update Layout
    fig.update_layout(width=450, height=450,title = "Pie Chart",
                      legend=dict(orientation="h",   # Horizontal legend
                                    x=0.5, y=1.1,     # Centered below the chart
                                    xanchor="center", yanchor="top",
                                    font=dict(size=12)))

    return fig

@app.callback(
    dash.dependencies.Output('intial_model_compare', 'figure'),
    [dash.dependencies.Input('interval-component', 'n_intervals')]
)

def intial_model_compare(n_intercals):
    data = ml_model_report_inputs['intial_models_comparison']

    df = pd.DataFrame(data)

    # Melt the DataFrame to long format for Plotly
    df_long = df.melt(id_vars='model', 
                    value_vars=['accuracy', 'auc', 'recall', 'precision', 'f1'],
                    var_name='metric', 
                    value_name='value')

    # Create a grouped bar chart
    fig = px.bar(df_long, 
                x='model', 
                y='value', 
                color='metric', 
                barmode='group',
                title='Model Comparison for Classification Metrics',
                text='value')

    # Optionally format the text on bars
    fig.update_traces(texttemplate='%{text:.4f}', textposition='outside')
    fig.update_layout(xaxis_title='Model', yaxis_title='Metric Value', uniformtext_minsize=8)


    return fig 

@app.callback(
    dash.dependencies.Output('final_model_build', 'figure'),
    [dash.dependencies.Input('interval-component', 'n_intervals')]
)

def final_model_build(n_intervals):
    
    # Given dictionary with mixed data types
    data = ml_model_report_inputs['best_model_final_results']

    df = pd.DataFrame(data)
    df.index.name = 'metric'
    df = df.rename(columns={
        'preprocess_results': 'Preprocess',
        'test_result': 'Test',
        'hyperpar_test_result': 'Hyperparameter Test',
        'eval_result': 'Evaluation'
    })
    # Reset the index so that 'metric' becomes a column
    df = df.reset_index()

    # Melt the DataFrame to a long format
    df_long = df.melt(id_vars='metric', var_name='model', value_name='value')

    # Create a grouped bar chart using Plotly Express
    fig = px.bar(df_long,
                x='model',
                y='value',
                color='metric',
                barmode='group',
                title='Model Comparison for Classification Metrics',
                text='value')

    # Format the text on bars
    fig.update_traces(texttemplate='%{text:.4f}', textposition='outside')
    fig.update_layout(xaxis_title='Result Type', yaxis_title='Metric Value', uniformtext_minsize=8)

    return fig

@app.callback(
    dash.dependencies.Output('ensemble_model', 'figure'),
    [dash.dependencies.Input('interval-component', 'n_intervals')]
)

def ensemble_model(n_intervals):
    
    data = ml_model_report_inputs['ensemble_results']

    df = pd.DataFrame(data)

    # Create figure with secondary y-axis
   

# Convert the DataFrame to a long format for plotting
    df_long = df.melt(id_vars='Model', 
                    value_vars=['Accuracy', 'AUC', 'Recall', 'Precision', 'F1'], 
                    var_name='Metric', 
                    value_name='Value')

    # Create a grouped bar chart using Plotly Express
    fig = px.bar(df_long, 
                x='Model', 
                y='Value', 
                color='Metric', 
                barmode='group', 
                title="Evaluation Metrics for Different Models",
                text='Value')

    # Format the text on bars
    fig.update_traces(texttemplate='%{text:.4f}', textposition='outside')
    fig.update_layout(yaxis_title='Metric Value', xaxis_title='Model', uniformtext_minsize=8)
    
    return fig

@app.callback(
    dash.dependencies.Output("selected_shap_plot", "src"),
    [dash.dependencies.Input("shap_plot_selector", "value")]
)
def update_image(selected_image):
    return f"/assets/{selected_image}"

@app.callback(
    dash.dependencies.Output("selected_shap_waterfall", "src"),
    [dash.dependencies.Input("shap_waterfall_selector", "value")]
)
def update_image(selected_image):
    return f"/assets/{selected_image}"



if __name__ == "__main__":
    app.run_server(debug=True,port = 8090)

---------------------------------------------------------------------------
DuplicateIdError                          Traceback (most recent call last)
DuplicateIdError: Duplicate component id found in the initial layout: `interval-component`



---------------------------------------------------------------------------
NotFound                                  Traceback (most recent call last)
NotFound: 404 Not Found: The requested URL was not found on the server. If you entered the URL manually please check your spelling and try again.

---------------------------------------------------------------------------
NotFound                                  Traceback (most recent call last)
NotFound: 404 Not Found: The requested URL was not found on the server. If you entered the URL manually please check your spelling and try again.



In [None]:
import pandas as pd
import plotly.express as px

# Define the data
data = {
    'preprocess_results': {
        'Accuracy': 0.7745,
        'AUC': 0.8246,
        'Recall': 0.5498,
        'Prec': 0.7433,
        'F1': 0.6279
    },
    'test_result': {
        'Accuracy': 0.7773,
        'AUC': 0.7328,
        'Recall': 0.5844,
        'Prec': 0.7258,
        'F1': 0.6475
    },
    'hyperpar_test_result': {
        'Accuracy': 0.7725,
        'AUC': 0.8257,
        'Recall': 0.55,
        'Prec': 0.7372,
        'F1': 0.6267
    },
    'eval_result': {
        'Accuracy': 0.7895,
        'AUC': 0.6923,
        'Recall': 0.3846,
        'Prec': 1.0,
        'F1': 0.5556
    }
}

# Create a DataFrame; the dictionary keys become column names and the inner keys become the index.
df = pd.DataFrame(data)
df.index.name = 'metric'
df = df.rename(columns={
    'preprocess_results': 'Preprocess',
    'test_result': 'Test',
    'hyperpar_test_result': 'Hyperparameter Test',
    'eval_result': 'Evaluation'
})
# Reset the index so that 'metric' becomes a column
df = df.reset_index()

# Melt the DataFrame to a long format
df_long = df.melt(id_vars='metric', var_name='model', value_name='value')

# Create a grouped bar chart using Plotly Express
fig = px.bar(df_long,
             x='model',
             y='value',
             color='metric',
             barmode='group',
             title='Model Comparison for Classification Metrics',
             text='value')

# Format the text on bars
fig.update_traces(texttemplate='%{text:.4f}', textposition='outside')
fig.update_layout(xaxis_title='Result Type', yaxis_title='Metric Value', uniformtext_minsize=8)

fig.show()


In [None]:
df

Unnamed: 0,preprocess_results,test_result,hyperpar_test_result,eval_result
0,"{'Accuracy': 0.7745, 'AUC': 0.8246, 'Recall': ...","{'Accuracy': 0.7773, 'AUC': 0.7328, 'Recall': ...","{'Accuracy': 0.7725, 'AUC': 0.8257, 'Recall': ...","{'Accuracy': 0.7895, 'AUC': 0.6923, 'Recall': ..."
