In [2]:
import polars.selectors as cs
import polars as pl
import plotly as plt
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
from scipy.stats import f_oneway
from scipy.stats import tukey_hsd
from tqdm import tqdm
import xlsxwriter
import openpyxl
from openpyxl import load_workbook
from openpyxl.utils.dataframe import dataframe_to_rows
import statistics
import os
from sklearn.cluster import KMeans  
from sklearn.model_selection import train_test_split  
import matplotlib.pyplot as plt                                          
from sklearn.preprocessing import LabelEncoder 
import altair as alt
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from kneed import KneeLocator
from sklearn.metrics import silhouette_score



In [3]:
# Relative location of the parquet file that the loading cell reads.
parquet_path: str = "./parquet4visual.parquet"

In [4]:
# Read only the columns used by the clustering: the competition label,
# the position group, and the two metrics.
df = pl.read_parquet(
    parquet_path,
    columns=[
        "competition_region_division",
        "position_grouped",
        "PSV-99",
        "P90 HSR Distance",
    ],
)

In [5]:
# Unique position names as a one-column DataFrame ("position_grouped").
# `unique()` gives no ordering guarantee, so the result is sorted to make the
# iteration order (and therefore the worksheet order in the exported
# workbook) identical on every run.
positions = df.select("position_grouped").unique().sort("position_grouped")

# 2 metrics
Clustering competitions based on 2 metrics (PSV-99 and P90 HSR Distance).
Within each position group, competitions are clustered together based on their group means of these metrics.


In [6]:
# Single scaler instance shared by the clustering loop below, which calls
# `fit_transform` on it for each position (so it is re-fitted every iteration).
scaler = StandardScaler()

In [8]:
# Cluster the competitions of every position group on their mean metric values
# and write one worksheet per position to ./competition_clusters.xlsx.
#
# For each position:
#   1. average "PSV-99" and "P90 HSR Distance" per competition,
#   2. standardise both means so neither metric dominates the distance,
#   3. fit KMeans for k = 2..5 (capped by the number of competitions) and keep
#      the k with the highest silhouette score,
#   4. export the competition means together with their cluster label.
with xlsxwriter.Workbook("./competition_clusters.xlsx") as wb:

    # loop through the positions
    for position in tqdm(positions["position_grouped"]):

        # Per-competition means for this position. group_by() returns groups in
        # arbitrary order, so sort on the competition name to keep the row
        # order -- and with it the KMeans labels and the sheet -- reproducible.
        pos_df = (
            df.filter(pl.col("position_grouped") == position)
            .drop("position_grouped")
            .group_by("competition_region_division")
            .agg([
                pl.col("PSV-99").mean().alias("PSV-99 mean"),
                pl.col("P90 HSR Distance").mean().alias("P90 HSR Distance mean"),
            ])
            .sort("competition_region_division")
        )

        # Nothing to cluster or export (e.g. a null position matches no rows);
        # the scaler would raise on an empty frame.
        if pos_df.height == 0:
            continue

        # create the features --> drop competitions: not numerical & not needed to create clusters
        features = pos_df.drop("competition_region_division")

        # scale the features to avoid influence of metric size
        scaled_features = scaler.fit_transform(features)

        # silhouette_score needs 2 <= n_clusters <= n_samples - 1, and KMeans
        # needs n_clusters <= n_samples, so cap the candidate range by the
        # number of competitions. With six or more competitions this is the
        # original 2..5 range.
        largest_k = min(5, pos_df.height - 1)

        # Fit one model per candidate k and record its silhouette score.
        cluster_models = {}
        for k in range(2, largest_k + 1):
            model = KMeans(n_clusters=k, random_state=42, init='k-means++', n_init=50, max_iter=1000)
            labels = model.fit_predict(scaled_features)
            cluster_models[k] = {
                'model': model,
                'labels': labels,
                'score': silhouette_score(scaled_features, labels),
            }

        if cluster_models:
            # Highest silhouette score wins; on ties max() keeps the first
            # (smallest) k, matching a strict ">" comparison.
            best_k = max(cluster_models, key=lambda candidate: cluster_models[candidate]['score'])
            best_score = cluster_models[best_k]['score']
            cluster_column = pl.Series("Clusters", cluster_models[best_k]['labels'])
        else:
            # Fewer than three competitions: no valid k exists, so put every
            # competition in a single cluster instead of aborting the export.
            best_k = 1
            best_score = -1
            cluster_column = pl.Series("Clusters", [0] * pos_df.height, dtype=pl.Int32)

        final_df = pos_df.with_columns(cluster_column)

        # NOTE(review): the position name is used verbatim as the sheet name;
        # Excel restricts sheet names (length, some special characters) --
        # confirm all position names comply.
        final_df.write_excel(
            workbook=wb,
            worksheet=position,
            autofit=True,
            float_precision=3,
            freeze_panes=(1, 0),
            header_format={"bold": True},
        )
            
            
    

100%|██████████| 9/9 [00:02<00:00,  4.05it/s]
