In [1833]:
import polars.selectors as cs
import polars as pl
import plotly as plt
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
from scipy.stats import f_oneway
from scipy.stats import tukey_hsd
from tqdm import tqdm
import xlsxwriter
import openpyxl
from openpyxl import load_workbook
from openpyxl.utils.dataframe import dataframe_to_rows
import statistics
import os
from sklearn.cluster import KMeans  
from sklearn.model_selection import train_test_split  
import matplotlib.pyplot as plt                                          
from sklearn.preprocessing import LabelEncoder 
import altair as alt
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from kneed import KneeLocator
from sklearn.metrics import silhouette_score



In [1850]:
parquet_path = "./parquet4visual.parquet"

In [1851]:
# get the required columns
df = pl.read_parquet(parquet_path,
   columns = [
      'competition_region_division', 
      'position_grouped', 
      'PSV-99',
      'P90 HSR Distance',
   ])

In [None]:
# get the unique position names
positions = df.select("position_grouped").unique()

# 2 metrics
Clustering competitions based on 2 metrics
Competitions are clusterd together  based on there group mean


In [1879]:
scaler = StandardScaler()

In [1915]:
with xlsxwriter.Workbook(f"./competition_clusters.xlsx") as wb:
    
    # loop through the positions
    for position in tqdm(positions["position_grouped"]): 

        #filter on the correct position
        pos_df = df.filter(
            pl.col("position_grouped") == position
        ).drop(
            "position_grouped"
        ).group_by(
            "competition_region_division"
        ).agg([
            pl.col("PSV-99").mean().alias("PSV-99 mean"),
            pl.col("P90 HSR Distance").mean().alias("P90 HSR Distance mean"),
        ])

        # create the features --> drop competitions: not numerical & not needed to create clusters
        features = pos_df.drop("competition_region_division")
        
        # scale the features to avoid influence of metric size
        scaled_features = scaler.fit_transform(features)
        scaled_features_df = pl.DataFrame(scaled_features, schema=features.columns)
        
        # initialize optimal number of clusters (lowest possible)
        best_k = 0
        # initialize silhoutte score (lowest possible)
        best_score = -1

        # calculate silhouette score for different cluster numbers
        # save best scoring cluster
        for k in range(2, 11):
            model = KMeans(n_clusters=k, random_state=42,init='k-means++', n_init=50)
            labels = model.fit_predict(scaled_features)
            score = silhouette_score(scaled_features, labels)
            
            if score > best_score:
                best_k = k
                best_score = score
        
        print(best_k)  
        
        model = KMeans(n_clusters=best_k, init='k-means++', n_init=50, random_state=42, max_iter=1000) 
                        
        model.fit(scaled_features_df)
        
        labels = model.labels_
        
        final_df = pos_df.with_columns(pl.Series("Clusters", labels))
        
        final_df.write_excel(
            workbook = wb,
            worksheet = position,
            autofit = True,
            float_precision = 3,
            freeze_panes = (1,0),
            header_format = {"bold": True}
        )
            
            
    

 11%|█         | 1/9 [00:00<00:03,  2.20it/s]

7


 22%|██▏       | 2/9 [00:00<00:03,  2.17it/s]

2


 33%|███▎      | 3/9 [00:01<00:02,  2.02it/s]

2


 44%|████▍     | 4/9 [00:02<00:02,  1.93it/s]

3


 56%|█████▌    | 5/9 [00:02<00:02,  1.84it/s]

10


 67%|██████▋   | 6/9 [00:03<00:01,  1.84it/s]

10


 78%|███████▊  | 7/9 [00:03<00:01,  1.84it/s]

9


 89%|████████▉ | 8/9 [00:04<00:00,  1.82it/s]

4


100%|██████████| 9/9 [00:04<00:00,  1.90it/s]

6



