

This notebook demonstrates how to use Dash to interactively display the results of the analysis discussed in Pat Walters' ***'Comparing Classification Models - You’re Probably Doing It Wrong'***
[post](http://practicalcheminformatics.blogspot.com/2023/11/comparing-classification-models-youre.html).



In [1]:
import dash
from dash import dcc, html
from dash import dash_table
from dash.dependencies import Input, Output
from dash_bootstrap_templates import template_from_url
import dash_bootstrap_components as dbc

import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import pingouin as pg

import seaborn as sns
from sklearn.metrics import roc_auc_score, average_precision_score, matthews_corrcoef
import plotly.express as px

In [2]:
# Function copied from the original post

def calc_classification_metrics(df_in, cycle_col, val_col, prob_col, pred_col):
    """
    Compute ROC AUC, PR AUC (average precision) and MCC for every
    (cross-validation fold, method, split) group in ``df_in``.

    :param df_in: input dataframe; must contain the columns "method" and "split"
                  in addition to the columns named by the other arguments
    :param cycle_col: name of the column identifying the cross-validation fold
    :param val_col: name of the column holding the ground-truth labels
    :param prob_col: name of the column holding predicted probabilities
                     (e.g. from sklearn predict_proba)
    :param pred_col: name of the column holding binary predictions
                     (e.g. from sklearn predict)
    :return: a dataframe with one row per group and the columns
             [cv_cycle, method, split, roc_auc, pr_auc, mcc]; the fold column is
             always called "cv_cycle" in the output, whatever ``cycle_col`` is
    """
    output_columns = ["cv_cycle", "method", "split", "roc_auc", "pr_auc", "mcc"]
    rows = []
    grouped = df_in.groupby([cycle_col, "method", "split"])
    for (cycle, method, split), group_df in grouped:
        y_true = group_df[val_col]
        y_prob = group_df[prob_col]
        # The two AUC metrics are scored on probabilities, MCC on the hard predictions.
        roc_auc = roc_auc_score(y_true, y_prob)
        pr_auc = average_precision_score(y_true, y_prob)
        mcc = matthews_corrcoef(y_true, group_df[pred_col])
        rows.append([cycle, method, split, roc_auc, pr_auc, mcc])
    return pd.DataFrame(rows, columns=output_columns)


In [3]:
# Load the prediction table from the working directory. Later cells read the
# columns cv_cycle, method, split, BSEP, BSEP_prob and BSEP_pred from it.
df = pd.read_csv("BSEP_classification_ChemProp_LightGBM.csv")
#df.head()

In [4]:
# One row of ROC AUC / PR AUC / MCC per (cv_cycle, method, split) group.
df_metrics = calc_classification_metrics(
    df,
    cycle_col="cv_cycle",
    val_col="BSEP",
    prob_col="BSEP_prob",
    pred_col="BSEP_pred",
)

In [5]:
# Dash app

app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])

# Dash hands `style` dictionaries to React, so the keys are camelCased CSS
# property names (`borderRadius`, not `border-radius`).
# Shared look for the labels of both radio-button groups.
RADIO_LABEL_STYLE = {
    'display': 'inline-block',
    'border': '1px solid #ddd',
    'borderRadius': '5px',
    'cursor': 'pointer',
    'backgroundColor': 'grey',
    'padding': '2px',
}

# Layout of the dashboard:
#   * two sets of radio buttons, one to pick the split and one to pick the metric
#   * a div that shows the p-value of the Friedman test
#   * the graph itself
app.layout = html.Div([
    html.H2("Comparing Distributions Across Cross-Validation Folds"),

    # Selector row
    html.Div([
        html.Div([
            html.Label('Select split:'),
            dcc.RadioItems(
                id='radio-selector',
                options=[
                    {'label': 'Random', 'value': 'random'},
                    {'label': 'Scaffold', 'value': 'scaffold'},
                ],
                value='random',
                labelStyle=RADIO_LABEL_STYLE,
            ),
        ], style={'padding': '10px', 'marginRight': '50px', 'display': 'inline-block', 'width': '20%'}),

        html.Div([
            html.Label('Select metric:'),
            # The id says "method", but the option values are metric column names
            # (roc_auc / pr_auc / mcc); the id is kept because the callback refers to it.
            dbc.RadioItems(
                id="method-selector",
                options=[
                    {'label': 'ROC AUC', 'value': 'roc_auc'},
                    {'label': 'PR AUC', 'value': 'pr_auc'},
                    {'label': 'MCC', 'value': 'mcc'},
                ],
                value='roc_auc',
                inline=True,
                labelStyle=RADIO_LABEL_STYLE,
            ),
        ], style={'padding': '10px', 'marginRight': '50px', 'display': 'inline-block'}),
    ]),

    # Output area: Friedman test p-value above the box plot
    html.Div([
        html.Div(
            id='friedman-coefficient-output',
            style={'padding': '10px', 'backgroundColor': '#272B30', 'color': 'orange'},
        ),
        dcc.Graph(id='graph'),
    ], style={'position': 'relative'}),
])
   
    

# Callback: redraw the box plot and recompute the Friedman test whenever either
# radio-button group changes
@app.callback(
    Output("graph", "figure"),
    Output('friedman-coefficient-output', 'children'),
    Input('radio-selector', 'value'),
    Input("method-selector", "value"),
)
def update_graph(selected_subset, method_selector):
    """
    Build the box plot and the Friedman test summary for the current selection.

    :param selected_subset: value of the split radio buttons; rows of ``df_metrics``
                            whose "split" equals this value are kept
    :param method_selector: value of the metric radio buttons; the name of the
                            ``df_metrics`` column to plot and test
    :return: tuple (plotly figure, text reporting the Friedman test p-value)
    """
    # Keep only the rows belonging to the selected split
    split_mask = df_metrics['split'] == selected_subset
    filtered_df = df_metrics[split_mask]

    # One box per method, showing the spread of the chosen metric
    fig = px.box(filtered_df, x='method', y=method_selector, color='method')

    # Prettify the graph
    fig.update_layout(transition_duration=500)
    axis_label = method_selector.replace("_", " ").upper()
    fig.update_layout(
        template='plotly_dark',
        plot_bgcolor='#272B30',
        paper_bgcolor='#272B30',
        yaxis_tickformat=",.2f",
        yaxis_title=axis_label,
        xaxis_title='',
        showlegend=False,
    )

    # Friedman test across methods, with the cross-validation cycle as the subject
    friedman_result = pg.friedman(
        data=filtered_df,
        dv=method_selector,
        within="method",
        subject="cv_cycle",
    )
    friedman = friedman_result['p-unc'].values[0].round(3)

    return fig, f"Friedman Coefficient p-value: {friedman}"


# Run the app
if __name__ == '__main__':
    # `app.run` is the current entry point; `app.run_server` is deprecated in its
    # favour. Fall back to `run_server` only when `run` is missing (old Dash releases).
    run_app = getattr(app, "run", None) or app.run_server
    run_app(debug=True)
    
    
