In [1]:
# STEP 0: importing required packages
import polars as pl
import json
import os
import polars.selectors as cs
import xlsxwriter
import zipfile
import io
from pathlib import Path

STEP 1: Defining the path to the required files

In [None]:
# 1.1 JSON file containing the SkillCorner competition-edition information
ce_path = Path('./data/skillcorner-competition-editions.json')

In [None]:
# 1.2 Zip archive containing the JSON files with SkillCorner position data.
# Kept relative to the notebook's working directory, like the other input paths
# (the previous value pointed at the filesystem root: '/data/...').
json_PosDir_path = Path('./data/skillcorner-20250214.zip')

In [None]:
# 1.3 CSV containing the StatsBomb competition ids and info (country & division)
sb_path = "./data//statsbomb-competition-levels.csv"

In [None]:
# 1.4 CSV mapping StatsBomb ids / information to their SkillCorner counterparts
sb_sc_path = "./data//mapping-competition-seasons.csv"

STEP 2: Converting the competitions editions json to a usable DataFrame

In [6]:

# The competition-edition records sit under the top-level "results" key.
# JSON is UTF-8 by specification, so the encoding is set explicitly instead of
# relying on the platform default (which is not UTF-8 on every OS).
with ce_path.open('r', encoding='utf-8') as ce_file:
    ce_data = json.load(ce_file)['results']

# Flatten the (possibly nested) records into one row per competition edition.
# Done after the file is closed: only the parsed data is needed from here on.
ce_df = pl.json_normalize(ce_data)

STEP 3: Combining the json files with the position data into a usable dataframe

In [7]:
# Accumulator for the per-file DataFrames read from the position-data archive
json_list = []

In [8]:
# Read every JSON member of the zip archive into its own DataFrame.
# The list is rebuilt from scratch so that re-running this cell does not
# append the same files a second time.
json_list = []
# The handle is called `archive` rather than `zip` to avoid shadowing the builtin.
with zipfile.ZipFile(json_PosDir_path, "r") as archive:
    for member_name in archive.namelist():
        # Skip directories and any non-JSON members
        if not member_name.endswith(".json"):
            continue
        with archive.open(member_name) as member:
            json_list.append(pl.read_json(io.BytesIO(member.read())))

STEP 4: Combining the data

In [9]:
# Stack the per-file frames vertically, then attach the competition-edition
# record to every row. A left join keeps all position rows and the left key
# column name ("competition_edition_id").
stacked_positions = pl.concat(json_list)
df = stacked_positions.join(
    ce_df,
    how="left",
    left_on="competition_edition_id",
    right_on="id",
)


STEP 5: Combining statsbomb and skillcorner data with the physical data

In [10]:
# Load the two lookup tables used to label each competition edition
sb = pl.read_csv(source=sb_path)        # StatsBomb competition info
sb_sc = pl.read_csv(source=sb_sc_path)  # StatsBomb <-> SkillCorner mapping

In [11]:
# 5.1 Linking competition info (country & division) to skillcorner competition edition
# Build a lookup: SkillCorner competition edition -> "<region> <division level>".
competition_lookup = (
    sb_sc.join(  # add the StatsBomb competition info to the sb - sc mapping
        sb,
        left_on="sb_competition_id",
        right_on="competition_id",
        how="left",
    )
    .select(
        "sc_competition_season_id",
        # The label is null when either part is missing
        pl.concat_str(
            [
                pl.col("competition_region_name"),
                pl.col("competition_division_level").cast(str),
            ],
            separator=" ",
        ).alias("competition_region_division"),
    )
    # Identical mapping rows would otherwise duplicate every matching physical
    # row in the join below and inflate the sample sizes.
    .unique()
)

# Keep every physical-data row (and its "competition_edition_id" key); editions
# without a mapping get a null label.
df = df.join(
    competition_lookup,
    left_on="competition_edition_id",
    right_on="sc_competition_season_id",
    how="left",
)

In [12]:
# Sanity check: list the distinct competition labels produced by the join
df['competition_region_division'].unique()

competition_region_division
str
"""Romania 1"""
"""Spain 3"""
"""Uruguay 1"""
"""Germany 3"""
"""Japan 2"""
…
"""Switzerland 1"""
"""Italy 1"""
"""England 1"""
"""Norway 1"""


STEP 6: Performance metrics have to be normalized to a 90-minute match, based on the minutes played

In [13]:
# 6.1: Column names that must not be normalized: every non-numeric column
# (selected by inverting the numeric selector)
list2skip = df.select(~cs.numeric()).columns

In [14]:
# Numeric columns that must not be normalized either (hardcoded):
# 'PSV-99' is used as-is, the remaining entries are id and season-year columns
hardcoded = [
    'PSV-99',
    'competition_edition_id',
    'competition_id',
    'competition_id_right',
    'match_id',
    'player_id',
    'season_id_right',
    'season_id',
    'season_end_year',
    'season_start_year',
    'team_id'
]

In [15]:
# Full skip list: the non-numeric columns followed by the hardcoded numeric ones
list2skip = [*list2skip, *hardcoded]

In [16]:
# 6.2 hardcoding the time indicators
# Needed to correctly normalize each metric: maps a pattern found in a metric's
# column name to the minutes column the metric has to be divided by.
# The order matters: patterns are tested as substrings and the first hit wins,
# so the more specific patterns (e.g. "OTIP 2") must come before the shorter
# ones they contain (e.g. "TIP 2", "TIP", "2").
time_indicators = {
    "OTIP 2": "Minutes OTIP 2",
    "OTIP 1": "Minutes OTIP 1",
    "TIP 2": "Minutes TIP 2",
    "TIP 1": "Minutes TIP 1",
    "OTIP": "Minutes OTIP",
    "TIP": "Minutes TIP",
    "2": "Minutes 2",
    "1": "Minutes 1"
}

In [17]:
# Match every metric column to the correct time / minute column and normalize
# it to a 90-minute equivalent, stored in a new "P90 <column>" column.
p90_exprs = []
for coln in df.columns:
    if coln in list2skip:
        continue

    # First matching pattern wins (time_indicators lists the most specific
    # patterns first); metrics without any pattern use the overall "Minutes".
    divisor = next(
        (
            indicator
            for pattern, indicator in time_indicators.items()
            if pattern in coln
        ),
        "Minutes",
    )

    minutes = pl.col(divisor)
    p90_exprs.append(
        # Zero (or missing) minutes give null instead of inf / NaN, which would
        # otherwise propagate into the means and standard deviations computed later.
        pl.when(minutes > 0)
        .then(pl.col(coln) / minutes * 90)
        .otherwise(None)
        .alias(f"P90 {coln}")
    )

# Add all normalized columns in a single pass instead of one frame copy per column
df = df.with_columns(p90_exprs)

In [18]:
# Put the columns in alphabetical order for a predictable column layout
ordered_columns = sorted(df.columns)
df = df.select(ordered_columns)

STEP 7: Creating Excel files showing benchmarked data for the needed metrics

In [19]:
# 7.1 Group the symmetrical positions by hardcoding
# Symmetrical (left / right) positions have to be shown in the same Excel sheet,
# so both sides map to one shared "R..|L.." label; central positions map to themselves.
mapping = {
    "RM": "RM|LM",
    "LM": "RM|LM",
    "RCB": "RCB|LCB",
    "LCB": "RCB|LCB",
    "RWB": "RWB|LWB",
    "LWB": "RWB|LWB",
    "RF": "RF|LF",
    "LF": "RF|LF",
    "RW": "RW|LW",
    "LW": "RW|LW",
    "DM": "DM",
    "AM": "AM",
    "CB": "CB",
    "CF": "CF"
}

In [20]:
# Translate each position into its grouped label. replace_strict raises if a
# position is missing from `mapping`, so unknown positions cannot slip through.
grouped_position = pl.col("position").replace_strict(mapping)
df = df.with_columns(grouped_position.alias("position_grouped"))

In [21]:
# 7.2 Hardcoding column names of the metrics wanted in the Excel files
# competition_region_division and position_grouped are fixed columns and
# should not be changed; the metric columns can be adapted to the analysis.
fixed_columns = ['competition_region_division', 'position_grouped']
metric_columns = [
    'P90 Distance',
    'P90 Running Distance',
    'P90 HSR Distance',
    'P90 Sprinting Distance',
    'PSV-99',
]
df_subset = df.select(fixed_columns + metric_columns)

In [22]:
# The performance metrics are the variable columns: everything in df_subset
# except the two fixed grouping columns, in their original order
non_metric_columns = ('competition_region_division', 'position_grouped')
metrics = [
    column for column in df_subset.columns if column not in non_metric_columns
]

In [23]:
# 7.3 Hardcoding the grouped positions in the order wanted for the Excel sheets
# (one worksheet is written per entry, named after the entry)
positions = ['CB', 'RCB|LCB', 'RWB|LWB', 'DM','RM|LM', 'AM', 'RW|LW', 'CF', 'RF|LF']

In [24]:
def position_filter(dataframe, position):
    """Return the rows of `dataframe` that belong to one grouped position.

    Only rows whose "position_grouped" equals `position` and whose
    "competition_region_division" is not null are kept.
    """
    is_position = pl.col("position_grouped") == position
    has_competition = pl.col("competition_region_division").is_not_null()

    return dataframe.filter(is_position & has_competition)

In [25]:
def sample_size_calculator(dataframe):
    """Count the rows of `dataframe` per competition.

    Returns one row per "competition_region_division" with the number of rows
    in that group in a "Sample Size" column.
    """
    group_key = "competition_region_division"
    row_count = pl.col(group_key).len().alias("Sample Size")

    return dataframe.group_by(group_key).agg([row_count])

In [26]:

# 7.4 Create the 1st Excel type
# Statistical measures to loop through: label written to the sheet -> the
# aggregation applied to every metric column
all_metrics = pl.col(metrics)
stats = {
    "Mean": all_metrics.mean(),
    "Median": all_metrics.median(),
    "Std": all_metrics.std(),
    "Q25": all_metrics.quantile(0.25),
    "Q75": all_metrics.quantile(0.75),
}

In [27]:
# One worksheet per grouped position. Each sheet holds one block of rows per
# statistical measure, with one row per competition inside every block.
with xlsxwriter.Workbook("match_benchmarks_stat_grouped.xlsx") as wb:

    for position in positions:
        result_list = []

        # Filter for the specific position
        df_pos = position_filter(df_subset, position)

        # determine the sample size per competition
        ss = sample_size_calculator(df_pos)

        for stat_name, stat_expr in stats.items():
            result = (
                df_pos
                .group_by("competition_region_division")
                .agg([stat_expr])
                .with_columns(pl.lit(stat_name).alias("Statistical Measure"))
                .join(ss, on="competition_region_division")
                # group_by / join give no ordering guarantee; sorting makes the
                # row order of the sheet identical on every run.
                .sort("competition_region_division")
            )
            result_list.append(result)

        # Combine all results (vertical concat keeps the block order) with the
        # columns in the preferred order
        pl.concat(
                result_list
            ).select([
                    "competition_region_division",
                    "Statistical Measure",
                    "Sample Size"
                ]
                + metrics
            ).rename({
                "competition_region_division":"Competition"
            }).write_excel(
                workbook=wb,
                worksheet = position,
                autofit = True,
                float_precision = 1,
                freeze_panes = (1,0),
                header_format = {"bold": True}
            )

In [28]:
# 7.5 Create the 2nd Excel type
# One worksheet per grouped position, one row per competition, and a separate
# column for every (metric, statistical measure) combination.
with xlsxwriter.Workbook("match_benchmarks_stat_sep.xlsx") as wb:

    for position in positions:
        # Filter for the specific position (no further position filter is
        # needed below: df_pos only contains this position)
        df_pos = position_filter(df_subset, position)

        # The sample size per competition is the first block of the sheet
        result_list = [sample_size_calculator(df_pos)]

        for metric in metrics:
            values = pl.col(metric)
            result = (
                df_pos
                .group_by("competition_region_division")
                .agg(
                    [
                        values.mean().alias(f"{metric} Mean"),
                        values.median().alias(f"{metric} Median"),
                        # single space, consistent with the other headers
                        values.std().alias(f"{metric} Std"),
                        values.quantile(0.25).alias(f"{metric} Q25"),
                        values.quantile(0.75).alias(f"{metric} Q75"),
                    ]
                )
            )

            result_list.append(result)

        # Align all results on the shared competition column and write them to
        # the position sheet of the Excel file
        pl.concat(
                result_list, how='align'
            ).rename({
                "competition_region_division":"Competition"
            }).write_excel(
                workbook=wb,
                worksheet = position,
                autofit = True,
                float_precision = 1,
                freeze_panes = (1,0),
                header_format = {"bold": True}
            )

In [None]:
# STEP 8: Write the dataframe to a parquet file that will be used for visualisation
# The entire dataframe (not only the Excel subset) is written to the parquet file
parquet4visual_path = "./data/parquet4visual.parquet"

In [30]:

# Persist the full dataframe, including the added "P90 ..." and "position_grouped" columns
df.write_parquet(parquet4visual_path)