In [1]:
import polars.selectors as cs
import polars as pl
import plotly as plt
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
from scipy.stats import f_oneway
from scipy.stats import tukey_hsd
from tqdm import tqdm
import xlsxwriter
import openpyxl
from openpyxl import load_workbook
from openpyxl.utils.dataframe import dataframe_to_rows
import statistics
import os


In [2]:
# Location of the input parquet file, given relative to the current working directory.
parquet_path = "./parquet4visual.parquet"

In [3]:
# Read only the columns the analysis uses: the two grouping keys
# (competition and position) followed by the metric columns.
required_columns = [
    'competition_region_division',
    'position_grouped',
    'PSV-99',
    'P90 HSR Distance',
]
df = pl.read_parquet(parquet_path, columns=required_columns)

In [4]:
# Let rendered polars tables show up to 50 rows.
pl.Config.set_tbl_rows(50)
# Preview: one row per distinct competition (last expression, so it is displayed).
df.unique(subset=['competition_region_division'])

competition_region_division,position_grouped,PSV-99,P90 HSR Distance
str,str,f64,f64
"""Europe 1""","""AM""",32.1,645.109908
"""Czech Republic 1""","""AM""",28.9,763.000598
"""Switzerland 1""","""AM""",28.2,618.16765
"""Sweden 1""","""AM""",28.1,606.639839
"""Belgium 2""","""AM""",28.6,663.331181
"""Greece 1""","""AM""",27.7,496.597215
"""Ecuador 1""","""AM""",28.4,646.158494
"""USA 3""","""AM""",31.7,373.921666
"""Netherlands 1""","""AM""",29.7,547.112462
"""Chile 1""","""AM""",28.3,654.080908


In [5]:
# One row per distinct position group; later cells read its 'position_grouped' column.
positions = df.unique(subset=['position_grouped'])

In [6]:
# Metric columns to analyse; each name must match a column loaded into `df`.
metrics = [
    'PSV-99',
    'P90 HSR Distance',
]

In [None]:

def _pairwise_tukey_rows(group_names, group_values, position):
    """Run Tukey's HSD over the groups and return one result row per unordered pair.

    Parameters
    ----------
    group_names : list of str
        Group labels, in the same order as ``group_values``.
    group_values : list of sequences of float
        Observations per group. SciPy requires at least two groups and at
        least two observations in every group.
    position : str
        Suffix appended to the result column names so that tables of
        different positions can later be joined side by side.

    Returns
    -------
    list of dict
        One dict per pair ``(i, j)`` with ``i < j``, holding the keys
        "Competitions", "Statistic <position>", "P-value <position>" and
        "Standard Error <position>" (values rounded to 4 decimals).
    """
    result = tukey_hsd(*group_values)

    # NOTE: `_stand_err` is a private SciPy attribute (no public accessor is used here).
    # SciPy stores it as a scalar when every group has the same number of observations
    # and as a matrix otherwise, so broadcast it to the pairwise shape before indexing
    # with [i, j]; indexing the scalar directly raises an IndexError.
    stand_err = np.broadcast_to(np.asarray(result._stand_err), result.statistic.shape)

    n_groups = len(group_names)
    return [
        {
            "Competitions": f"{group_names[i]} - {group_names[j]}",
            f"Statistic {position}": round(result.statistic[i, j], 4),
            f"P-value {position}": round(result.pvalue[i, j], 4),
            f"Standard Error {position}": round(stand_err[i, j], 4),
        }
        for i in range(n_groups)
        for j in range(i + 1, n_groups)
    ]


# Nested result store: values_by_metric[metric][position] -> DataFrame of pairwise comparisons
values_by_metric = {}

# Null positions cannot be sorted next to strings and cannot be analysed, so drop them up front.
position_names = sorted(positions['position_grouped'].drop_nulls().to_list())

# loop through the metrics (this is the slow part of the notebook: minutes per metric)
for metric in tqdm(metrics):
    values_by_metric[metric] = {}

    # keep only the competition name, the position and the metric of interest
    mdf = df.select(['competition_region_division', 'position_grouped', metric])

    for position in position_names:

        # restrict to the current position
        stat_df = mdf.filter(pl.col("position_grouped") == position)

        # collect the metric values of every competition into one list (input format for tukey_hsd);
        # nulls are dropped so they neither count as observations nor reach SciPy as None
        grouped_stats = stat_df.group_by("competition_region_division").agg(
            pl.col(metric).drop_nulls()
        )

        # iter_rows(named=True) yields one dict per row, mapping column name -> value.
        # Keep competitions that have a name and at least two observations.
        valid_groups = {
            row["competition_region_division"]: row[metric]
            for row in grouped_stats.iter_rows(named=True)
            if row["competition_region_division"] is not None and len(row[metric]) > 1
        }

        # sort the competition names so the pair labels are built the same way for every
        # position and the per-position tables can be joined on them
        group_names = sorted(valid_groups)
        group_values = [valid_groups[name] for name in group_names]

        if len(group_names) < 2:
            # tukey_hsd raises for fewer than two groups; store an empty, correctly typed
            # table instead so one sparse position does not abort the whole run.
            values_by_metric[metric][position] = pl.DataFrame(
                schema={
                    "Competitions": pl.Utf8,
                    f"Statistic {position}": pl.Float64,
                    f"P-value {position}": pl.Float64,
                    f"Standard Error {position}": pl.Float64,
                }
            )
            continue

        values_by_metric[metric][position] = pl.DataFrame(
            _pairwise_tukey_rows(group_names, group_values, position)
        )
    


  0%|          | 0/2 [00:00<?, ?it/s]

  quad_r = quad(f, low, high, args=args, full_output=self.full_output,
100%|██████████| 2/2 [09:37<00:00, 288.68s/it]


In [16]:
#values_by_metric

In [15]:
#values_by_metric['PSV-99']['AM']

In [10]:
# Combine the per-position Tukey tables of every metric into one wide table:
# one row per competition pair, one block of result columns per position.
dataframes = {}

for metric in metrics:
    per_position = values_by_metric[metric]

    # Deterministic column layout: 'CB' first (when present), remaining positions
    # alphabetically. The positions come from the result dict itself, so the layout
    # does not depend on the arbitrary row order of a `unique()` call.
    ordered_positions = sorted(per_position, key=lambda name: (name != 'CB', name))

    # Start from the union of all competition pairs, not only the pairs found for one
    # reference position; otherwise pairs that do not occur for that position are
    # silently dropped by the left joins below. maintain_order keeps the pairs of the
    # first position in their original order and appends additional pairs after them.
    combined = pl.concat(
        [per_position[name].select("Competitions") for name in ordered_positions]
    ).unique(maintain_order=True)

    for position in tqdm(ordered_positions):
        # pairs without a result for this position get nulls in its columns
        combined = combined.join(per_position[position], on="Competitions", how="left")

    dataframes[metric] = combined

100%|██████████| 9/9 [00:00<00:00, 86.98it/s]
100%|██████████| 9/9 [00:00<00:00, 100.63it/s]


In [17]:
#dataframes

In [11]:
# Create the output folder for the combined workbooks. exist_ok=True makes the cell
# safe to re-run and avoids the check-then-create race of a separate exists() test.
os.makedirs("./Tukey_Full_Results/", exist_ok=True)

In [12]:
# Export the combined table (all positions side by side): one workbook per metric,
# each with a single worksheet.
full_sheet_options = dict(
    worksheet='competitions comparison',
    autofit=True,
    float_precision=3,
    freeze_panes=(1, 0),
    header_format={"bold": True},
)

for metric in metrics:
    workbook_path = f"./Tukey_Full_Results/tukey_hsd_full_{metric}.xlsx"
    with xlsxwriter.Workbook(workbook_path) as workbook:
        dataframes[metric].write_excel(workbook=workbook, **full_sheet_options)

In [13]:
# Create the output folder for the per-position workbooks. exist_ok=True makes the cell
# safe to re-run and avoids the check-then-create race of a separate exists() test.
os.makedirs("./Tukey_PerPos_Results/", exist_ok=True)

In [14]:
# Export one workbook per metric, with a separate worksheet for every position.
for metric in metrics:
    workbook_path = f"./Tukey_PerPos_Results/tukey_hsd_{metric}_PerPos.xlsx"
    with xlsxwriter.Workbook(workbook_path) as workbook:
        # the worksheet is named after the position whose comparison table it holds
        for position_name, position_table in tqdm(values_by_metric[metric].items()):
            position_table.write_excel(
                workbook=workbook,
                worksheet=position_name,
                autofit=True,
                float_precision=3,
                freeze_panes=(1, 0),
                header_format={"bold": True},
                autofilter=True,
            )
     



100%|██████████| 9/9 [00:00<00:00, 88.34it/s]
100%|██████████| 9/9 [00:00<00:00, 51.79it/s]
