In [2]:
import polars as pl 
from pathlib import Path

In [3]:
# root folder of the Second Spectrum data; the game folder and the parquet file names are joined onto it
base_path: Path = Path("SecondSpectrum/Second Spectrum")

In [4]:
# folder of the game to analyse, joined onto base_path when the file paths are built
game_path: Path = Path("Anderlecht - Club Brugge")

In [5]:
# locations of the parquet files with the floodlight data (distance & velocity),
# both inside the folder of the selected game
game_folder = base_path / game_path
distance_path = game_folder / "distance_HT1.parquet"
velocity_path = game_folder / "velocity_HT1.parquet"

In [6]:
# load the distance parquet file into a polars dataframe
distance_df = pl.read_parquet(source=distance_path)

In [7]:
# dropping ball status and possession data from the distance dataframe
# keeping it in the dataframe would add the extra option of filtering in ball status and possession
status_columns = ["HT1 ball status", "HT1 possession"]
# The result is assigned back to `distance_df`, so only the columns that are
# still present are dropped: this keeps the cell safe to re-run, whereas
# dropping by fixed name fails on the second run because the columns are
# already gone.
distance_df = distance_df.drop(
    [name for name in status_columns if name in distance_df.columns]
)

In [10]:
# load the velocity parquet file into a polars dataframe
velocity_df = pl.read_parquet(source=velocity_path)

In [11]:
# dropping ball status and possession data from the velocity dataframe
# keeping it in the dataframe would add the extra option of filtering in ball status and possession
status_columns = ["HT1 ball status", "HT1 possession"]
# The result is assigned back to `velocity_df`, so only the columns that are
# still present are dropped: this keeps the cell safe to re-run, whereas
# dropping by fixed name fails on the second run because the columns are
# already gone.
velocity_df = velocity_df.drop(
    [name for name in status_columns if name in velocity_df.columns]
)

In [12]:
# high speed running distance per player: one entry per column of distance_df
HSR_distance = {}

# velocity above which a sample counts as high speed running
# NOTE(review): presumably m/s (6.67 m/s is roughly 24 km/h) - confirm the unit of the velocity data
hsr_min_velocity = 6.67

for player in distance_df.columns:
    # put the distance and the velocity of this column side by side
    samples = pl.concat(
        [
            distance_df[player].to_frame(name="distance"),
            velocity_df[player].to_frame(name="velocity"),
        ],
        how="horizontal",
    )

    # keep the rows above the speed threshold; NaN velocities are excluded
    # explicitly because polars orders NaN above every other float, so they
    # would otherwise pass the ">" comparison
    is_fast = pl.col("velocity") > hsr_min_velocity
    is_measured = pl.col("velocity").is_not_nan()
    fast_samples = samples.filter(is_fast & is_measured)

    # sum the remaining distances; .item() turns the 1x1 dataframe into a
    # plain value that is independent of a dataframe
    HSR_distance[player] = fast_samples.select("distance").sum().item()

In [13]:
# turn the dictionary into a dataframe: every key becomes a column name and
# its summed distance the value in that column
HSR_df = pl.from_dict(data=HSR_distance)

Investigating the difference between Opta and floodlight data

In [None]:
# opta data: meters run per player in the first half at more than 24 km/h
opta_data = {
    "C. Coosemans": 1,
    "Amuzu": 105,
    "M. Rits": 117,
    "T. Leoni": 21,
    "K. Dolberg": 162,
    "K. Sardella": 202,
    "A. Dreyer": 215,
    "L. Dendoncker": 80,
    "J. Simić": 119,
    "Amando Lapage": 122,
    "Ali Maamar": 207,
    "L. Vázquez": 0.0,
    "Y. Verschaeren": 0.0,
    "N. Engwanda": 0.0,
    "Mads Kikkenborg": 0.0,
    "S. Edozie": 0.0,
    "Thorgan Hazard": 0.0,
    "T. Degreef": 0.0,
    "Thomas Foket": 0.0,
    "Timon Vanhoutte": 0.0,
    "Anas Tajaouart": 0.0,
    "S. Mignolet": 0.0,
    "Ordoñez": 43,
    "Ferran Jutglà": 186,
    "K. Sabbe": 70,
    "C. Talbi": 246,
    "H. Vanaken": 72,
    "B. Mechele": 66,
    "C. Tzolis": 310,
    "Maxim De Cuyper": 211,
    "Ardon Jashari": 79,
    # NOTE(review): the only non-zero value that is not a whole number - it
    # looks computed rather than copied from opta; confirm against the source
    "Raphael Onyedika": 14.869796048973726,
    "Bjorn Meijer": 0.0,
    "Michal Skoras": 0.0,
    "J. Spileers": 0.0,
    "C. Nielsen": 0.0,
    "Romero": 0.0,
    "H. Vetlesen": 0.0,
    "Siquet": 0.0,
    "Jackers": 0.0,
    "Romeo Vermant": 0.0,
    "Gustaf Nilsson": 0.0,
}

In [15]:
# dataframe with one column per opta_data key, laid out like HSR_df
opta_df = pl.from_dict(data=opta_data)
# show the resulting dataframe as the cell output
opta_df

C. Coosemans,Amuzu,M. Rits,T. Leoni,K. Dolberg,K. Sardella,A. Dreyer,L. Dendoncker,J. Simić,Amando Lapage,Ali Maamar,L. Vázquez,Y. Verschaeren,N. Engwanda,Mads Kikkenborg,S. Edozie,Thorgan Hazard,T. Degreef,Thomas Foket,Timon Vanhoutte,Anas Tajaouart,S. Mignolet,Ordoñez,Ferran Jutglà,K. Sabbe,C. Talbi,H. Vanaken,B. Mechele,C. Tzolis,Maxim De Cuyper,Ardon Jashari,Raphael Onyedika,Bjorn Meijer,Michal Skoras,J. Spileers,C. Nielsen,Romero,H. Vetlesen,Siquet,Jackers,Romeo Vermant,Gustaf Nilsson
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,105,117,21,162,202,215,80,119,122,207,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43,186,70,246,72,66,310,211,79,14.869796,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# comparing opta to floodlight: the floodlight distance as a percentage of
# the opta distance, one line per column (player)
for col in HSR_df.columns:
    # both dataframes hold a single row, so the ratio is a single value;
    # 0 divided by an opta value of 0.0 comes out as nan instead of raising
    result = (HSR_df[col] / opta_df[col] * 100).item()
    # prefix the value with the column name: a bare list of numbers cannot be
    # matched back to players, and the nan lines cannot be told apart
    print(f"{col}: {result}")

0.0
77.18683151971726
76.19579999874799
44.014027005150204
54.0570671175733
71.54327241501005
65.84920483966206
57.687826156616204
59.54741790515035
81.99604847392098
67.85816432197312
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
70.49424814623457
65.30793712985131
70.58125632149833
87.48774954943153
59.56143273247613
54.023771575002954
82.0726800733997
67.77593242048651
45.47957891150366
94.62133608614822
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
