In [None]:
import polars as pl
from tqdm import tqdm
import xlsxwriter 
from sklearn import set_config
from sklearn.cluster import KMeans  
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

In [None]:
# Relative path of the parquet file that the data-loading cell reads.
parquet_path = "./data/parquet4visual.parquet"

In [None]:
# Read only the columns this notebook works with, then discard every row
# that has no competition assigned.
df = (
    pl.read_parquet(
        parquet_path,
        columns=[
            "competition_region_division",
            "position_grouped",
            "PSV-99",
            "P90 HSR Distance",
        ],
    )
    .filter(pl.col("competition_region_division").is_not_null())
)

In [60]:
# Get the unique position names as a one-column DataFrame.
# - drop_nulls: a null position can never match the `==` filter used in the
#   clustering loop, so it would only produce an empty (un-clusterable) frame.
# - sort: `unique()` does not guarantee row order by default; sorting makes the
#   order in which positions are processed (and worksheets are written)
#   reproducible across runs.
positions = (
    df.select("position_grouped")
    .unique()
    .drop_nulls()
    .sort("position_grouped")
)

# 2 metrics
Clustering competitions based on 2 metrics
Competitions are clustered together based on their group means.


In [61]:
# Configure scikit-learn globally so that transformers (e.g. the scaler's
# fit_transform) return polars output.
# NOTE(review): presumably requires a scikit-learn release that supports
# polars as a transform output — confirm against the installed version.
set_config(transform_output="polars")

In [62]:
# Single scaler instance shared by the clustering loop; it is re-fitted via
# fit_transform on each position's features.
scaler = StandardScaler()

In [None]:
# Cluster competitions per position on their mean PSV-99 and mean
# P90 HSR Distance, and write one worksheet per position into a single
# Excel workbook (./output/competition_clusters.xlsx).
#
# For every position, KMeans is fitted for each candidate number of clusters
# and the labelling with the highest silhouette score is kept.

# Mean of each metric per (position, competition). This does not need the
# workbook, so it is computed before any output file is opened.
grouped_df = df.group_by(
    "position_grouped", "competition_region_division"
).agg([
    pl.col("PSV-99").mean().alias("PSV-99 mean"),
    pl.col("P90 HSR Distance").mean().alias("P90 HSR Distance mean"),
])

with xlsxwriter.Workbook("./output/competition_clusters.xlsx") as wb:

    # loop through the positions
    for position in tqdm(positions["position_grouped"]):

        # A null position never matches the `==` filter below, so there is
        # nothing to cluster for it.
        if position is None:
            continue

        # Keep only this position's competitions. Rows whose metric mean is
        # null (no observed values in the group) are dropped because KMeans
        # cannot handle missing values.
        pos_df = grouped_df.filter(
            pl.col("position_grouped") == position
        ).drop(
            "position_grouped"
        ).drop_nulls(["PSV-99 mean", "P90 HSR Distance mean"])

        # silhouette_score requires 2 <= n_clusters <= n_samples - 1, so the
        # largest usable k depends on how many competitions this position has.
        n_competitions = pos_df.height
        max_k = min(5, n_competitions - 1)
        if max_k < 2:
            tqdm.write(
                f"Skipping {position!r}: only {n_competitions} competition(s) "
                "with complete metrics, too few to cluster"
            )
            continue

        # create the features --> drop competitions: not numerical & not needed to create clusters
        features = pos_df.drop("competition_region_division")

        # scale the features to avoid influence of metric size
        scaled_features = scaler.fit_transform(features)

        # Try every candidate k and remember its labels and silhouette score.
        labels_by_k = {}
        score_by_k = {}
        for k in range(2, max_k + 1):
            model = KMeans(n_clusters=k, random_state=42, init='k-means++', n_init=50, max_iter=1000)
            labels = model.fit_predict(scaled_features)
            # With duplicate points KMeans can return fewer distinct clusters
            # than requested; silhouette is undefined for a single cluster.
            if len(set(labels.tolist())) < 2:
                continue
            labels_by_k[k] = labels
            score_by_k[k] = silhouette_score(scaled_features, labels)

        if not score_by_k:
            tqdm.write(f"Skipping {position!r}: no valid clustering found")
            continue

        # Highest silhouette score wins; on ties max() keeps the first
        # (i.e. smallest) k, matching a strict `>` comparison in ascending k.
        best_k = max(score_by_k, key=score_by_k.get)

        final_df = pos_df.with_columns(
            pl.Series("Clusters", labels_by_k[best_k])
        ).sort("Clusters")

        final_df.write_excel(
            workbook = wb,
            worksheet = position,
            autofit = True,
            float_precision = 3,
            freeze_panes = (1,0),
            header_format = {"bold": True}
        )
            
            
    

  0%|          | 0/9 [00:00<?, ?it/s]

100%|██████████| 9/9 [00:01<00:00,  4.68it/s]
