In [27]:
import polars.selectors as cs
import polars as pl
import plotly as plt
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
from scipy.stats import f_oneway
from scipy.stats import tukey_hsd
from tqdm import tqdm
import xlsxwriter
import openpyxl
from openpyxl import load_workbook
from openpyxl.utils.dataframe import dataframe_to_rows
import statistics
import os


In [2]:
parquet_path = "./parquet4visual.parquet"

In [None]:
# get the required columns
df = pl.read_parquet(parquet_path,
   columns = [
      'competition_region_division', 
      'position_grouped', 
      'PSV-99',
      'P90 HSR Distance',
   ])

In [34]:
pl.Config.set_tbl_rows(50)
df.unique('competition_region_division')

competition_region_division,position_grouped,PSV-99,P90 HSR Distance
str,str,f64,f64
"""Poland 1""","""AM""",26.6,276.678846
"""Romania 1""","""AM""",28.5,463.297484
"""Hungary 1""","""AM""",28.6,741.275012
"""Portugal 1""","""AM""",26.2,459.012805
"""Serbia 1""","""AM""",27.9,581.763298
"""France 3""","""AM""",25.8,629.076501
"""Croatia 1""","""AM""",31.4,721.324882
"""Sweden 1""","""AM""",28.1,606.639839
"""France 2""","""AM""",29.4,519.974835
"""Netherlands 2""","""AM""",26.9,898.374473


In [38]:
positions = df.unique('position_grouped')

In [39]:
# hardcode the names of the different metric columns that have to be analyzed
metrics = ['PSV-99', 'P90 HSR Distance',]

In [None]:

values_by_metric = {} 

# loop through the metrics 
for metric in tqdm(metrics):
    # create an empty dictionary to store the metric values per position
    values_by_metric[metric] = {} 
    
    # select only the columns competition name, position and the metric of interest for further use
    mdf = df.select(['competition_region_division', 'position_grouped', metric])
    
    # loop through the different positions
    for position in sorted(positions['position_grouped'].to_list()):
        
        # filter the data frame on the position
        stat_df = mdf.filter(pl.col("position_grouped") == position)
        
        # group the metric values per competition in a list (needed as input for tukey hsd)
        grouped_stats = stat_df.group_by("competition_region_division").agg([
            pl.col(metric)
        ])
        
        valid_groups = {
            row["competition_region_division"]: row[metric] # returns a dictionary --> competition_name: [metric values]
            for row in grouped_stats.iter_rows(named=True) # iter_rows returns a dictionary row{col1_value: col2_value}
            if row["competition_region_division"] is not None and len(row[metric]) > 1 # filters out null competitions and competitions with 1 or less metric values
        }
        #print(valid_groups)
        group_names = sorted(valid_groups.keys()) # sort the competition names so results are in same order and can be joined across competitions
        group_values = [valid_groups[key] for key in group_names]
        result = tukey_hsd(*group_values)

        n_groups = len(group_names)

        rows = []
        # Loop through all pairs
        for i in range(n_groups):
            for j in range(i + 1, n_groups):
                rows.append({
                    f"Competitions": f"{group_names[i]} - {group_names[j]}",
                    f"Statistic {position}": round(result.statistic[i, j], 4),
                    f"P-value {position}": round(result.pvalue[i, j], 4),
                    f"Standard Error {position}": round(result._stand_err[i, j], 4),
                    #f"Mean {group_names[i]}": statistics.mean(group_values[i]),
                    #f"Mean {group_names[j]}": statistics.mean(group_values[j]),
                    #f"SD {group_names[i]}":statistics.stdev(group_values[i]),
                    #f"SD {group_names[j]}":statistics.stdev(group_values[j]),
                })
        
        values_by_metric[metric][position] = pl.DataFrame(rows)
    


100%|██████████| 9/9 [02:46<00:00, 18.55s/it]
100%|██████████| 9/9 [03:28<00:00, 23.15s/it]


In [41]:
values_by_metric

{'PSV-99': {'AM': shape: (1_176, 4)
  ┌────────────────────────────────┬──────────────┬────────────┬───────────────────┐
  │ Competitions                   ┆ Statistic AM ┆ P-value AM ┆ Standard Error AM │
  │ ---                            ┆ ---          ┆ ---        ┆ ---               │
  │ str                            ┆ f64          ┆ f64        ┆ f64               │
  ╞════════════════════════════════╪══════════════╪════════════╪═══════════════════╡
  │ Argentina 1 - Australia 1      ┆ 0.2564       ┆ 0.9996     ┆ 0.0943            │
  │ Argentina 1 - Austria 1        ┆ 0.0918       ┆ 1.0        ┆ 0.0734            │
  │ Argentina 1 - Belgium 1        ┆ -0.1377      ┆ 1.0        ┆ 0.0602            │
  │ Argentina 1 - Belgium 2        ┆ 0.1328       ┆ 1.0        ┆ 0.1183            │
  │ Argentina 1 - Brazil 1         ┆ -0.087       ┆ 1.0        ┆ 0.0647            │
  │ Argentina 1 - Brazil 2         ┆ 0.5964       ┆ 0.0055     ┆ 0.093             │
  │ Argentina 1 - Chile 1    

In [42]:
values_by_metric['PSV-99']['AM']

Competitions,Statistic AM,P-value AM,Standard Error AM
str,f64,f64,f64
"""Argentina 1 - Australia 1""",0.2564,0.9996,0.0943
"""Argentina 1 - Austria 1""",0.0918,1.0,0.0734
"""Argentina 1 - Belgium 1""",-0.1377,1.0,0.0602
"""Argentina 1 - Belgium 2""",0.1328,1.0,0.1183
"""Argentina 1 - Brazil 1""",-0.087,1.0,0.0647
"""Argentina 1 - Brazil 2""",0.5964,0.0055,0.093
"""Argentina 1 - Chile 1""",0.4643,0.0386,0.081
"""Argentina 1 - Colombia 1""",0.3208,0.2633,0.066
"""Argentina 1 - Croatia 1""",0.0829,1.0,0.0837
"""Argentina 1 - Czech Republic 1""",-0.5691,0.0005,0.0796


In [43]:
dataframes = {}

for metric in metrics:
    
    dataframes[metric] = values_by_metric[metric]['CB']
    
    for position in tqdm(positions['position_grouped']):
        if position != 'CB':
            dataframes[metric] = dataframes[metric].join(values_by_metric[metric][position], on = ("Competitions"), how = 'left')

100%|██████████| 9/9 [00:00<00:00, 105.74it/s]
100%|██████████| 9/9 [00:00<00:00, 136.05it/s]


In [44]:
dataframes

{'PSV-99': shape: (1_176, 28)
 ┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
 │ Competiti ┆ Statistic ┆ P-value   ┆ Standard  ┆ … ┆ Standard  ┆ Statistic ┆ P-value   ┆ Standard │
 │ ons       ┆ CB        ┆ CB        ┆ Error CB  ┆   ┆ Error DM  ┆ CF        ┆ CF        ┆ Error CF │
 │ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---      │
 │ str       ┆ f64       ┆ f64       ┆ f64       ┆   ┆ f64       ┆ f64       ┆ f64       ┆ f64      │
 ╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
 │ Argentina ┆ 1.0231    ┆ 0.8573    ┆ 0.2693    ┆ … ┆ 0.1485    ┆ 0.3192    ┆ 0.0119    ┆ 0.0518   │
 │ 1 -       ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆          │
 │ Australia ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆          │
 │ 1         ┆           ┆           ┆           ┆  

In [45]:
if not os.path.exists("./Tukey_Full_Results/"): 
    os.makedirs("./Tukey_Full_Results/")  

In [None]:
# write data for all the positions to 1 sheet
for metric in metrics:
    with xlsxwriter.Workbook(f"./Tukey_Full_Results/tukey_hsd_full_{metric}.xlsx") as wb:
        dataframes[metric].write_excel(
            workbook = wb,
            worksheet = 'competitions comparison',
            autofit = True,
            float_precision = 3,
            freeze_panes = (1,0),
            header_format = {"bold": True}
        )    

In [47]:
if not os.path.exists("./Tukey_PerPos_Results/"): 
    os.makedirs("./Tukey_PerPos_Results/")  

In [None]:
# write the data to a different sheet per postition
for metric in metrics:
    with xlsxwriter.Workbook(f"./Tukey_PerPos_Results/tukey_hsd_{metric}_PerPos.xlsx") as wb:
        for key, value in tqdm(values_by_metric[metric].items()):
            value.write_excel(
                workbook = wb,
                worksheet = key,
                autofit = True,
                float_precision = 3,
                freeze_panes = (1,0),
                header_format = {"bold": True},
                autofilter = True
            )
     



100%|██████████| 9/9 [00:00<00:00, 30.36it/s]
100%|██████████| 9/9 [00:00<00:00, 92.76it/s]
