In [74]:
import polars.selectors as cs
import polars as pl
import plotly as plt
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
from scipy.stats import f_oneway
from scipy.stats import tukey_hsd
from tqdm import tqdm
import xlsxwriter
import openpyxl
from openpyxl import load_workbook
from openpyxl.utils.dataframe import dataframe_to_rows
import statistics


In [3]:
parquet_path = "./parquet4visual.parquet"

In [4]:
df = pl.read_parquet(parquet_path)

In [5]:
pl.Config.set_tbl_cols(250)
df.head

<bound method DataFrame.head of shape: (449_537, 232)
┌─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬───

In [23]:
# select a subset of the dataframe to do statistical testing
df_subset = df.select([
    'competition_region_division', 
    'position_grouped',
    'P90 Distance',
    'P90 Running Distance',
    'P90 HSR Distance', 
    'P90 Sprinting Distance', 
    'PSV-99'
])

In [24]:
positions = df.unique('position_grouped')

In [88]:

dataframes = {}

for position in tqdm(sorted(positions['position_grouped'].to_list())):
    # filter the data frame on the position
    stat_df = df_subset.filter(pl.col("position_grouped") == position)
    
    #onnodig dit stuk herbezien?
    grouped_stats = stat_df.group_by("competition_region_division").agg([
        pl.col("PSV-99").alias("values")
    ])
    valid_groups = {
        row["competition_region_division"]: row["values"]
        for row in grouped_stats.iter_rows(named=True)
        if row["competition_region_division"] is not None and len(row["values"]) > 1
    }
    #print(valid_groups)
    group_names = sorted(valid_groups.keys())
    group_values = [valid_groups[key] for key in group_names]
    result = tukey_hsd(*group_values)

    n_groups = len(group_names)

    rows = []
    # Loop through all pairs
    for i in range(n_groups):
        for j in range(i + 1, n_groups):
            rows.append({
                f"Competitions": f"{group_names[i]} - {group_names[j]}",
                f"Statistic {position}": round(result.statistic[i, j], 4),
                f"P-value {position}": round(result.pvalue[i, j], 4),
                f"Standard Error {position}": round(result._stand_err[i, j], 4),
                #f"Mean {group_names[i]}": statistics.mean(group_values[i]),
                #f"Mean {group_names[j]}": statistics.mean(group_values[j]),
                #f"SD {group_names[i]}":statistics.stdev(group_values[i]),
                #f"SD {group_names[j]}":statistics.stdev(group_values[j]),
            })
    
    dataframes[position] = pl.DataFrame(rows)
    
        

  quad_r = quad(f, low, high, args=args, full_output=self.full_output,
100%|██████████| 9/9 [02:28<00:00, 16.52s/it]


In [91]:
for position in tqdm(positions['position_grouped']):
    if position != 'CB':
        dataframe = dataframe.join(dataframes[position], on = ("Competitions"), how = 'left')

100%|██████████| 9/9 [00:00<00:00, 113.80it/s]


In [93]:
with xlsxwriter.Workbook("tukey_hsd_full_result.xlsx") as wb:
    dataframe.write_excel(
        workbook = wb,
        worksheet = 'competitions comparison',
        autofit = True,
        float_precision = 3,
        freeze_panes = (1,0),
        header_format = {"bold": True}
    )

In [94]:
with xlsxwriter.Workbook("tukey_hsd_position.xlsx") as wb:
    for key, value in tqdm(dataframes.items()):
        value.write_excel(
            workbook = wb,
            worksheet = key,
            autofit = True,
            float_precision = 3,
            freeze_panes = (1,0),
            header_format = {"bold": True},
            autofilter = True
        )
     



100%|██████████| 9/9 [00:00<00:00, 61.44it/s]
