In [15]:
import polars.selectors as cs
import polars as pl
import plotly as plt
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
from ipywidgets import widgets
from IPython.display import display, HTML, clear_output
from dash import Dash, dcc, html, Input, Output, callback, jupyter_dash, State, no_update, dash_table
from dash.exceptions import PreventUpdate
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import io
from PyPDF2 import PdfMerger
from sklearn.cluster import KMeans  
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import dash_ag_grid as dag
import xlsxwriter

In [2]:
# Default display mode for Dash apps started from this notebook.
# Note: the app.run(...) call at the bottom of the notebook passes jupyter_mode="tab" explicitly.
jupyter_dash.default_mode="external"

In [3]:
# Relative path of the parquet file that is loaded into `df` in the next cell.
parquet_path = "./parquet4visual.parquet"

In [4]:
# Only these columns are loaded: the competition and position columns
# plus the five metric columns offered in the app.
benchmark_columns = [
    "competition_region_division",
    "position_grouped",
    "P90 Distance",
    "P90 Running Distance",
    "P90 HSR Distance",
    "P90 Sprinting Distance",
    "PSV-99",
]

df = pl.read_parquet(parquet_path, columns=benchmark_columns)

## App to analyze physical benchmark data from matches

In [5]:
# Recurrent inline styles shared by the layout components.
# Dash hands these dicts to React, which expects camelCase CSS property names
# (fontSize, marginBottom, ...), so every key uses camelCase.
tab_style = {"fontWeight": "bold", "fontFamily": "arial", "fontSize": "25px"}
selected_tab_style = {"fontWeight": "bold", "fontFamily": "Arial", "color": "mediumblue", "fontSize": "25px"}

download_button_style = {"textAlign": "right", "marginTop": "10px", "marginRight": "15px", "fontFamily": "arial"}

dropdown_style = {"width": "95%", "marginRight": "100px", "fontFamily": "arial", "marginBottom": "8px"}
dropdown_label_style = {"marginRight": "10px", "fontWeight": "bold", "fontFamily": "arial"}

# Sidebar (controls) and figure area sit side by side: 18% + 82% of the width.
sidebar_style = {'width': '18%', 'display': 'inline-block', 'verticalAlign': 'middle', 'paddingTop': '10px',}
figure_div_style = {'width': '82%', 'display': 'inline-block', 'verticalAlign': 'top',}

In [6]:
class Dropdown:
    """Factory for the labelled dropdown controls used in the tab sidebars.

    An instance is bound to one component id; each ``*_dropdown`` method
    returns an ``html.Div`` containing a label and a ``dcc.Dropdown``
    registered under that id.
    """

    # Position groups offered by ``position_dropdown``.
    POSITION_OPTIONS = [
        'CB', 'RCB|LCB', 'RWB|LWB', 'DM', 'RM|LM', 'AM',
        'RW|LW', 'CF', 'RF|LF'
    ]

    def __init__(self, drop_id):
        """Store the component id given to the generated ``dcc.Dropdown``."""
        self.drop_id = drop_id

    def _labelled_dropdown(self, label, options, value, placeholder, multi=False):
        """Return a Div with a styled label followed by a styled dropdown."""
        return html.Div([
            html.Label(label, style=dropdown_label_style),
            dcc.Dropdown(
                options=options,
                id=self.drop_id,
                value=value,
                multi=multi,
                placeholder=placeholder,
                style=dropdown_style,
            ),
        ])

    def plottype_dropdown(self):
        """Create a single-choice dropdown to select the plot type (default: Boxplot)."""
        return self._labelled_dropdown(
            "Plottype:",
            ["Boxplot", "Histogram"],
            "Boxplot",
            "Select a plot type",
        )

    def metric_dropdown(self, metrics):
        """Create a multi-select dropdown of metrics.

        The options are the numeric columns of the module-level ``df``;
        ``metrics`` is the list of initially selected metrics.
        """
        return self._labelled_dropdown(
            "Metric:",
            df.select(cs.numeric()).columns,
            metrics,
            "Select a metric",
            multi=True,
        )

    def single_metric_dropdown(self, metric):
        """Create a single-choice dropdown of metrics.

        The options are the numeric columns of the module-level ``df``;
        ``metric`` is the initially selected metric.
        """
        return self._labelled_dropdown(
            "Metric:",
            df.select(cs.numeric()).columns,
            metric,
            "Select a metric",
        )

    def competition_dropdown(self, competitions):
        """Create a multi-select dropdown of competitions.

        The options are the sorted, non-null unique values of
        ``df['competition_region_division']``; ``competitions`` is the list of
        initially selected competitions.
        """
        competition_options = sorted(
            x for x in df['competition_region_division'].unique() if x is not None
        )
        return self._labelled_dropdown(
            "Competitions:",
            competition_options,
            competitions,
            "Select a competition",
            multi=True,
        )

    def position_dropdown(self, positions):
        """Create a multi-select dropdown of position groups.

        ``positions`` is the list of initially selected positions.
        """
        return self._labelled_dropdown(
            "Positions:",
            list(self.POSITION_OPTIONS),
            positions,
            "Select a position",
            multi=True,
        )
    
    

In [7]:
def _as_choice_list(choice):
    """Normalise a dropdown value: None -> [], a single string -> [string], else list(choice)."""
    if choice is None:
        return []
    if isinstance(choice, str):
        return [choice]
    return list(choice)


def dataframe_filtering(dataframe, competition_choice, position_choice):
    """Filter a dataframe on the selected competitions and positions.

    Parameters
    ----------
    dataframe
        Frame with the columns 'competition_region_division' and
        'position_grouped'.
    competition_choice, position_choice
        Selected values. A cleared dropdown (``None``) is treated as an empty
        selection and a bare string as a single selected value, so the
        function never passes ``None`` to ``is_in``.

    Returns
    -------
    The rows whose competition AND position are both among the selected values
    (no rows when either selection is empty).
    """
    competitions = _as_choice_list(competition_choice)
    positions = _as_choice_list(position_choice)

    return dataframe.filter(
        dataframe['competition_region_division'].is_in(competitions) &
        dataframe['position_grouped'].is_in(positions)
    )

In [8]:
class Plotting:
    """Builds the plotly express figures shown in the comparison tabs."""

    def __init__(self, dataframe, metric, title_input):
        """Keep the data, the metric column and the title suffix used by every plot."""
        self.title_input = title_input
        self.metric = metric
        self.dataframe = dataframe

    @staticmethod
    def _bold_arial(size):
        """Font settings shared by the figure, axis and legend titles."""
        return dict(size=size, family="Arial", color="black", weight="bold")

    def boxplot(self, xax, col, label, legendT):
        """Return a boxplot of the metric.

        ``xax`` is the x-axis column (displayed as ``label``), ``col`` the
        colour column and ``legendT`` the legend title.
        """
        box_options = {
            "x": xax,
            "y": self.metric,
            "color": col,
            "title": f"Boxplot of {self.metric} across {self.title_input}",
            "labels": {xax: label},
        }
        fig = px.box(self.dataframe, **box_options)

        layout_options = {
            "title": {'x': 0.5, 'xanchor': 'center', 'yanchor': 'top'},
            "title_font": self._bold_arial(20),
            "xaxis_title_font": self._bold_arial(16),
            "yaxis_title_font": self._bold_arial(16),
            "legend_title": legendT,
            "legend_title_font": self._bold_arial(16),
        }
        fig.update_layout(**layout_options)

        return fig

    def histogram(self, col, facet, label, legendT):
        """Return overlaid histograms of the metric.

        ``col`` is the colour column, ``facet`` the column that splits the
        figure into facet columns (labelled ``label``) and ``legendT`` the
        legend title.
        """
        histogram_options = {
            "y": self.metric,
            "color": col,
            "opacity": 0.4,
            "facet_col": facet,
            "barnorm": None,
            "title": f"Histogram of {self.metric} across {self.title_input}",
            "labels": {facet: label},
        }
        fig = px.histogram(self.dataframe, **histogram_options)

        def keep_facet_value(annotation):
            # Facet annotations read "<label>=<value>"; keep only the value part.
            return annotation.update(text=annotation.text.rsplit("=", 1)[-1])

        fig.for_each_annotation(keep_facet_value)

        layout_options = {
            "barmode": 'overlay',
            "title": {'x': 0.5, 'xanchor': 'center', 'yanchor': 'top'},
            "title_font": self._bold_arial(20),
            "yaxis_title_font": self._bold_arial(16),
            "legend_title": legendT,
            "legend_title_font": self._bold_arial(16),
        }
        fig.update_layout(**layout_options)

        return fig

In [9]:
class Clustering:
    """Cluster competitions on the mean of two metrics and plot the result."""

    # Largest number of clusters tried when searching for the best silhouette score.
    MAX_CLUSTERS = 5

    def __init__(self, df, xax, yax, position):
        """Store the data, the two metric columns (x and y axis) and the position filter."""
        self.df = df
        self.xax = xax
        self.yax = yax
        self.position = position

    def cluster_position_df(self):
        """Return one row per competition with the mean of both metrics.

        Rows are restricted to ``self.position`` when positions are given; an
        empty or ``None`` selection keeps every position. The result is sorted
        by competition so the row order is stable between calls (``group_by``
        alone does not guarantee an order).
        """
        if self.position:
            filtered = self.df.filter(pl.col("position_grouped").is_in(self.position))
        else:
            filtered = self.df

        return (
            filtered
            .group_by("competition_region_division")
            .agg([
                pl.col(self.xax).mean().alias(f"{self.xax} mean"),
                pl.col(self.yax).mean().alias(f"{self.yax} mean"),
            ])
            .sort("competition_region_division")
        )

    def cluster_df(self, pos_df):
        """Add a string column "Clusters" with the KMeans label of every competition.

        The number of clusters (2 .. ``MAX_CLUSTERS``) is chosen by the highest
        silhouette score on the standardised features; on a tie the smaller
        number of clusters wins.

        Rows containing a null are dropped first, because KMeans cannot handle
        missing values. The silhouette score is only defined for
        ``2 <= k <= n_samples - 1``, so the range of k is capped accordingly;
        when fewer than three rows remain (or no valid clustering is found)
        every row is assigned to cluster "0" instead of raising.
        """
        pos_df = pos_df.drop_nulls()
        n_samples = pos_df.height

        max_k = min(self.MAX_CLUSTERS, n_samples - 1)
        if max_k < 2:
            return pos_df.with_columns(pl.Series("Clusters", ["0"] * n_samples))

        # The competition name is not numerical and not needed to create clusters.
        features = pos_df.drop("competition_region_division").to_numpy()

        # Scale the features to avoid influence of the metric's magnitude.
        scaled_features = StandardScaler().fit_transform(features)

        best_labels = None
        best_score = -1
        for k in range(2, max_k + 1):
            model = KMeans(n_clusters=k, random_state=42, init='k-means++', n_init=50, max_iter=1000)
            labels = model.fit_predict(scaled_features)

            # Duplicate points can yield fewer distinct clusters than requested;
            # the silhouette score is undefined outside 2 .. n_samples - 1 labels.
            n_labels = len(set(labels.tolist()))
            if not 2 <= n_labels <= n_samples - 1:
                continue

            score = silhouette_score(scaled_features, labels)
            if best_labels is None or score > best_score:
                best_score = score
                best_labels = labels

        if best_labels is None:
            return pos_df.with_columns(pl.Series("Clusters", ["0"] * n_samples))

        return pos_df.with_columns(pl.Series("Clusters", best_labels.astype(str)))

    def clusterplot(self, cluster_df):
        """Return a scatter plot visualising the competition clusters.

        ``cluster_df`` is the frame returned by ``cluster_df``: the two
        "<metric> mean" columns are the axes and "Clusters" sets the colour.
        """
        x_column = f"{self.xax} mean"
        y_column = f"{self.yax} mean"

        fig = px.scatter(
            cluster_df,
            x = x_column,
            y = y_column,
            color = "Clusters",
            hover_data = 'competition_region_division',
            title = "Clustered of competitions based on positional metrics",
            color_discrete_sequence = px.colors.qualitative.Plotly,
            # Strip list brackets from the axis labels in case a metric arrives as "['name']".
            labels = {
                x_column: x_column.replace("['", "").replace("']", ""),
                y_column: y_column.replace("['", "").replace("']", "")
            }
        )

        fig.update_traces(marker=dict(size=10))

        fig.update_layout(
            title={'x':0.5, 'xanchor': 'center', 'yanchor': 'top'},
            title_font=dict(size=20, family="Arial", color="black", weight="bold"),
            xaxis_title_font=dict(size=16, family="Arial", color="black", weight="bold"),
            yaxis_title_font=dict(size=16, family="Arial", color="black", weight="bold"),
            legend_title = "Clusters",
            legend_title_font=dict(size=16, family="Arial", color="black", weight="bold"),
        )

        return fig

In [10]:
def excell_maker(data):
    """Write the clustering data to an in-memory Excel workbook.

    Parameters
    ----------
    data
        Frame exposing ``write_excel(workbook=...)`` (a polars DataFrame such
        as the clustering result).

    Returns
    -------
    io.BytesIO
        Buffer holding the .xlsx file, positioned at the start.
    """
    output_xlsx = io.BytesIO()
    workbook = xlsxwriter.Workbook(output_xlsx)
    try:
        data.write_excel(
            workbook=workbook,
            autofit=True,
            float_precision=3,
            freeze_panes=(1, 0),
        )
    finally:
        # Closing the workbook is what writes the file into the buffer.
        workbook.close()

    output_xlsx.seek(0)
    return output_xlsx

In [11]:
def pdf_maker(figures):
    """Store the different plots in one in-memory PDF.

    Every figure is rendered to its own PDF (800x600, scale 1.5) and the
    results are merged in the given order.

    Parameters
    ----------
    figures
        Iterable of plotly figures.

    Returns
    -------
    io.BytesIO
        Buffer holding the combined PDF, positioned at the start.

    Any exception raised while rendering or merging propagates to the caller;
    the merger is closed in every case.
    """
    merger = PdfMerger()
    page_buffers = []  # keep references so the page streams stay alive until the merge is written
    output_pdf = io.BytesIO()

    try:
        for fig in figures:
            page = io.BytesIO()
            pio.write_image(fig, page, format='pdf', width=800, height=600, scale=1.5)
            page.seek(0)
            merger.append(page)  # append each rendered figure
            page_buffers.append(page)

        # Final combined PDF in memory
        merger.write(output_pdf)
    finally:
        merger.close()

    output_pdf.seek(0)
    return output_pdf

In [22]:
def _benchmark_tab(label, value, button_label, button_id, download_id, controls, output_ids):
    """Assemble one tab: a download bar, a sidebar with the controls and the output area."""
    download_bar = html.Div(
        [
            html.Button(button_label, id=button_id, n_clicks=0),
            dcc.Download(id=download_id),
        ],
        style=download_button_style,
    )
    sidebar = html.Div(controls, style=sidebar_style)
    output_area = html.Div(
        [html.Div(id=output_id) for output_id in output_ids],
        style=figure_div_style,
    )
    return dcc.Tab(
        label=label,
        value=value,
        style=tab_style,
        selected_style=selected_tab_style,
        children=[download_bar, sidebar, output_area],
    )


app = Dash()
app.title = "Benchmarking App"

app.layout = html.Div([
    html.H1(
        "Benchmark analysis",
        style={"color":"Black", "fontFamily":"arial"}
    ),
    dcc.Tabs(
        id="benchmark_tabs",
        value="pos_comp",
        children=[
            # Tab 1: comparing a physiological metric in a certain competition across positions
            _benchmark_tab(
                "Position Comparison",
                'pos_comp',
                'Download pdf',
                'position_button',
                "download_pdf_pos",
                [
                    Dropdown("plottype_pos").plottype_dropdown(),
                    Dropdown("metric_pos").metric_dropdown(["PSV-99"]),
                    Dropdown("comp_dd1").competition_dropdown(["Belgium 1"]),
                    Dropdown("pos_dd1").position_dropdown(['CB', 'RCB|LCB', 'RWB|LWB', 'DM','RM|LM', 'AM', 'RW|LW', 'CF', 'RF|LF']),
                ],
                ['position_boxplot'],
            ),

            # Tab 2: comparing a physiological metric on a certain position across competitions
            _benchmark_tab(
                "Competition Comparison",
                "comp_comp",
                'Download pdf',
                'competition_button',
                "download_pdf_comp",
                [
                    Dropdown("plottype_comp").plottype_dropdown(),
                    Dropdown("metric_comp").metric_dropdown(["PSV-99"]),
                    Dropdown("pos_dd2").position_dropdown(['CB']),
                    Dropdown("comp_dd2").competition_dropdown(['Belgium 1', 'Netherlands 1', 'Spain 1', 'France 1', 'England 1', 'Germany 1', 'Italy 1']),
                ],
                ['competition_boxplot'],
            ),

            # Tab 3: clustering competitions based on physiological metrics per position
            _benchmark_tab(
                "Competition Clustering",
                "comp_clus",
                'Download Excel',
                'excel_button',
                "download_excel",
                [
                    Dropdown("clustering_metric1").single_metric_dropdown("PSV-99"),
                    Dropdown("clustering_metric2").single_metric_dropdown("P90 HSR Distance"),
                    Dropdown("clustering_pos").position_dropdown(['CB']),
                ],
                ["clustering_chart", "clustering_table"],
            ),
        ],
    ),
])

# add controls in the interaction
@callback(
    Output(component_id='position_boxplot', component_property='children'),
    Output(component_id='competition_boxplot', component_property='children'),
    Output(component_id='clustering_chart', component_property='children'),
    Output(component_id='clustering_table', component_property='children'),
    Output(component_id='download_excel', component_property='data'),

    # the active tab decides which branch of the callback runs
    Input(component_id='benchmark_tabs', component_property='value'),

    # input for the 1st graph
    Input(component_id='plottype_pos', component_property='value'),
    Input(component_id='metric_pos', component_property='value'),
    Input(component_id='comp_dd1', component_property='value'),
    Input(component_id='pos_dd1', component_property='value'),

    # input for the 2nd graph
    Input(component_id='plottype_comp', component_property='value'),
    Input(component_id='metric_comp', component_property='value'),
    Input(component_id='pos_dd2', component_property='value'),
    Input(component_id='comp_dd2', component_property='value'),

    # input for the 3rd graph
    Input(component_id='clustering_metric1', component_property='value'),
    Input(component_id='clustering_metric2', component_property='value'),
    Input(component_id='clustering_pos', component_property='value'),
    Input(component_id='excel_button', component_property='n_clicks'),
)
def update_graph(tab, plottype_pos, metric_pos, comp_dd1, pos_dd1, plottype_comp, metric_comp, pos_dd2, comp_dd2, clustering_metric1, clustering_metric2, clustering_pos, excel_button):
    """Rebuild the content of the active tab.

    Returns a 5-tuple matching the outputs above: (position graphs,
    competition graphs, clustering chart, clustering table, Excel download).
    Only the entries belonging to the active tab are filled; the others are
    ``no_update``.

    Cleared dropdowns (``None``) are treated as empty selections. The Excel
    file is only sent when the Excel button itself triggered the callback, so
    changing a dropdown after a click does not start a new download.

    Raises
    ------
    PreventUpdate
        When ``tab`` is not one of the three known tab values.
    """
    # Function-scope import: `ctx` is not part of the notebook's import cell.
    from dash import ctx

    if tab == 'pos_comp':
        filtered_df = dataframe_filtering(df, comp_dd1 or [], pos_dd1 or [])

        figs = []
        for metric in metric_pos or []:
            plot = Plotting(filtered_df, metric, 'positions')
            if plottype_pos == "Boxplot":
                figs.append(dcc.Graph(
                    figure = plot.boxplot('position_grouped', 'competition_region_division', 'Positions', 'Competition')
                ))
            elif plottype_pos == "Histogram":
                figs.append(dcc.Graph(
                    figure = plot.histogram('competition_region_division', 'position_grouped', 'Position', 'Competition')
                ))

        return figs, no_update, no_update, no_update, no_update

    if tab == 'comp_comp':
        filtered_df = dataframe_filtering(df, comp_dd2 or [], pos_dd2 or [])

        figs = []
        for metric in metric_comp or []:
            plot = Plotting(filtered_df, metric, 'competitions')
            if plottype_comp == "Boxplot":
                figs.append(dcc.Graph(
                    figure = plot.boxplot('competition_region_division', 'position_grouped', 'Competitions', 'Position')
                ))
            elif plottype_comp == "Histogram":
                # Colour = position, facet = competition: the facet label and the
                # legend title have to name those columns in that order.
                figs.append(dcc.Graph(
                    figure = plot.histogram('position_grouped', 'competition_region_division', 'Competition', 'Position')
                ))

        return no_update, figs, no_update, no_update, no_update

    if tab == 'comp_clus':
        # Validate the metric selection before clustering.
        if clustering_metric1 is None or clustering_metric2 is None:
            message = "❌ Select a metric for both axes to run the clustering."
        elif clustering_metric1 == clustering_metric2:
            message = "❌ X-axis and Y-axis metrics must be different for clustering."
        else:
            message = None

        if message is not None:
            fig = go.Figure().add_annotation(
                text=message,
                x=0.5, y=0.5,
                xref="paper", yref="paper",
                showarrow=False,
                font=dict(size=20, color="red"),
                align="center",
                bgcolor="white",
                bordercolor="red",
                borderwidth=2)

            # Clear the table so no stale clustering result stays below the message.
            return no_update, no_update, [dcc.Graph(figure = fig)], [], no_update

        cluster_obj = Clustering(df, clustering_metric1, clustering_metric2, clustering_pos)
        cluster_df = cluster_obj.cluster_df(cluster_obj.cluster_position_df())

        figs = [dcc.Graph(figure = cluster_obj.clusterplot(cluster_df))]

        table = dag.AgGrid(
            rowData=cluster_df.to_dicts(),
            columnDefs=[{"field": i, "flex": 1} for i in cluster_df.columns],
        )

        download = no_update
        # n_clicks stays > 0 after the first click, so check which input fired instead.
        if ctx.triggered_id == 'excel_button':
            buffer = io.BytesIO()
            # Create an in-memory workbook and let Polars write to it
            workbook = xlsxwriter.Workbook(buffer)
            cluster_df.write_excel(
                workbook=workbook,
                autofit=True,
                float_precision=3,
                freeze_panes=(1, 0),
            )
            workbook.close()
            buffer.seek(0)

            download = dcc.send_bytes(buffer.read(), "clustering_results.xlsx")

        return no_update, no_update, figs, table, download

    # Unknown tab value: leave every output untouched.
    raise PreventUpdate
    
    
# Download PDF - Position Tab
@callback(
    Output("download_pdf_pos", "data"),
    Input("position_button", "n_clicks"),
    State('plottype_pos', 'value'),
    State("metric_pos", "value"),
    State("comp_dd1", "value"),
    State("pos_dd1", "value"),
    prevent_initial_call=True # prevents function from running unless button is clicked
)
def download_pos_pdf(n, plottype_pos, metric_pos, comp_dd1, pos_dd1):
    """Send the figures of the position tab to the browser as one PDF.

    ``n`` is the click count of the download button; it only triggers the
    callback. The remaining arguments are the current dropdown values of the
    position tab; cleared dropdowns (``None``) count as empty selections.

    Raises
    ------
    PreventUpdate
        When the current selection yields no figure, so no page-less PDF is
        sent.
    """
    filtered_df = dataframe_filtering(df, comp_dd1 or [], pos_dd1 or [])

    figs = []
    for metric in metric_pos or []:
        plot = Plotting(filtered_df, metric, 'positions')
        if plottype_pos == "Boxplot":
            figs.append(
                plot.boxplot('position_grouped', 'competition_region_division', 'Positions', 'Competition')
            )
        elif plottype_pos == "Histogram":
            figs.append(
                plot.histogram('competition_region_division', 'position_grouped', 'Position', 'Competition')
            )

    if not figs:
        raise PreventUpdate

    return dcc.send_bytes(
        pdf_maker(figs).read(),
        filename="position_comparison.pdf"
    ) # sends pdf to browser as downloadable file

# Download PDF - Competition Comparison Tab
@callback(
    Output("download_pdf_comp", "data"),
    Input("competition_button", "n_clicks"),
    State('plottype_comp', 'value'),
    State("metric_comp", "value"),
    State("pos_dd2", "value"),
    State("comp_dd2", "value"),
    prevent_initial_call=True # prevents function from running unless button is clicked
)
def download_comp_pdf(n, plottype_comp, metric_comp, pos_dd2, comp_dd2):
    """Send the figures of the competition tab to the browser as one PDF.

    ``n`` is the click count of the download button; it only triggers the
    callback. The remaining arguments are the current dropdown values of the
    competition tab; cleared dropdowns (``None``) count as empty selections.

    Raises
    ------
    PreventUpdate
        When the current selection yields no figure, so no page-less PDF is
        sent.
    """
    filtered_df = dataframe_filtering(df, comp_dd2 or [], pos_dd2 or [])

    figs = []
    for metric in metric_comp or []:
        plot = Plotting(filtered_df, metric, 'competitions')
        if plottype_comp == "Boxplot":
            figs.append(
                plot.boxplot('competition_region_division', 'position_grouped', 'Competitions', 'Position')
            )
        elif plottype_comp == "Histogram":
            # Colour = position, facet = competition: the facet label and the
            # legend title have to name those columns in that order.
            figs.append(
                plot.histogram('position_grouped', 'competition_region_division', 'Competition', 'Position')
            )

    if not figs:
        raise PreventUpdate

    return dcc.send_bytes(
        pdf_maker(figs).read(), filename="competition_comparison.pdf"
        ) # sends pdf to browser as downloadable file

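# Download clusters Excel: there is no separate callback; the export is handled inside update_graph via the 'excel_button' input.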

if __name__ == '__main__':
    app.run(jupyter_mode="tab", debug=True, use_reloader=False) # remove "external" or "tab" if you want to run inside the notebook itself; jupyter_server_url="<your-url>" for hosted notebooks
    # specify a port, e.g. app.run(port=8060), when the current port is already in use; option 2: close the other program

Dash app running on http://127.0.0.1:8050/


<IPython.core.display.Javascript object>

[2025-05-07 12:58:37,590] ERROR in app: Exception on /_dash-update-component [POST]
Traceback (most recent call last):
  File "/home/ianh/miniconda3/envs/club_brugge/lib/python3.13/site-packages/flask/app.py", line 880, in full_dispatch_request
    rv = self.dispatch_request()
  File "/home/ianh/miniconda3/envs/club_brugge/lib/python3.13/site-packages/flask/app.py", line 865, in dispatch_request
    return self.ensure_sync(self.view_functions[rule.endpoint])(**view_args)  # type: ignore[no-any-return]
           ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^
  File "/home/ianh/miniconda3/envs/club_brugge/lib/python3.13/site-packages/dash/dash.py", line 1414, in dispatch
    ctx.run(
    ~~~~~~~^
        functools.partial(
        ^^^^^^^^^^^^^^^^^^
    ...<7 lines>...
        )
        ^
    )
    ^
  File "/home/ianh/miniconda3/envs/club_brugge/lib/python3.13/site-packages/dash/_callback.py", line 536, in add_context
    raise err
  File "/home/ianh/miniconda3/envs/