notebook to find out the amount of distance players run above certrain speed tresholds during a  match

In [2]:
import polars as pl
from pathlib import Path
from floodligt_functions import clean_metadata
from floodligt_functions import read_position_data_jsonl
from floodlight.models.kinematics import DistanceModel
from floodlight.models.kinematics import VelocityModel
from floodlight.models.kinetics import MetabolicPowerModel


In [3]:
base_path = Path("SecondSpectrum/Second Spectrum")

In [4]:
game_path = Path("Anderlecht - Club Brugge")

In [5]:
# path to the tracking data file
tracking_file_path = Path(base_path, game_path, "tracking-produced.jsonl")

In [6]:
# path to the origibal meta data file
metadata_path = Path(base_path, game_path, "rsc-bru_metadata.json")

In [7]:
# path to the cleaned metadata file = output of clean_metadata function
cleaned_metadata_path = Path(base_path, game_path, "cleaned_metadata.json")

In [8]:
# modifying the metadata file so it can be used as input for the floodlight function
clean_metadata(metadata_path, cleaned_metadata_path)

In [9]:
# reading in the tracking / position data
position_data = read_position_data_jsonl(
    tracking_file_path,
    cleaned_metadata_path
)

Getting the data

In [10]:
# storing different possibilities for halfs and teams to be used in loops
# only HT1 is used to avoid kernel crashing 
halfs =  ["HT1"] #["HT1", "HT2"]
teams = ["Home", "Away"]

In [11]:
# get list of home players
# this will be used to match player with correct position data
home_players = [player for player in position_data[3]['Home']["player"]]

# get list of away players
away_players = [player for player in position_data[3]['Away']["player"]]

In [12]:
# store a dataframe per half showing the possession status (H = Home or A = Away) 
possession_status = {}

for half in halfs:
    # access the possession data in the position data dictionary
    half_possession = position_data[1][half].code

    # Convert np.nan to None so Polars can handle them as nulls
    half_possession = [x if isinstance(x, str) else None for x in half_possession]

    # Create DataFrame
    possession_status[half] = pl.DataFrame({
        f"{half} possession": half_possession
    }).drop_nulls()

In [13]:
# store a dataframe per half showing the possession status (D = Dead or A = Alive) 
ball_status = {}

for half in halfs:
    # access the ball data in the position data dictionary
    half_ball_status = position_data[2][half].code
    
    # Convert np.nan to None so Polars can handle them as nulls
    half_ball_status = [x if isinstance(x, str) else None for x in half_ball_status]
    
    # Create DataFrame
    ball_status[half] = pl.DataFrame({
        f"{half} ball status": half_ball_status
    }).drop_nulls()

In [14]:
position_data

({'HT1': {'Home': XY(xy=array([[-46.38,  -0.06,  -0.57, ...,    nan,    nan,    nan],
          [-46.36,  -0.06,  -0.56, ...,    nan,    nan,    nan],
          [-46.35,  -0.08,  -0.55, ...,    nan,    nan,    nan],
          ...,
          [   nan,    nan,    nan, ...,    nan,    nan,    nan],
          [   nan,    nan,    nan, ...,    nan,    nan,    nan],
          [   nan,    nan,    nan, ...,    nan,    nan,    nan]],
         shape=(3054601, 42)), framerate=25, direction='lr'),
   'Away': XY(xy=array([[42.22, -0.24,  8.87, ...,   nan,   nan,   nan],
          [42.21, -0.24,  8.88, ...,   nan,   nan,   nan],
          [42.21, -0.23,  8.88, ...,   nan,   nan,   nan],
          ...,
          [  nan,   nan,   nan, ...,   nan,   nan,   nan],
          [  nan,   nan,   nan, ...,   nan,   nan,   nan],
          [  nan,   nan,   nan, ...,   nan,   nan,   nan]],
         shape=(3054601, 42)), framerate=25, direction='rl'),
   'Ball': XY(xy=array([[-0.21,  0.07],
          [ 0.5 , -0.08],

In [15]:
# get the xy coordinates of the position data
# needed as input for the floodlight position - velocity - acceleration data
xy_objects = position_data[0]

In [16]:
xy_objects

{'HT1': {'Home': XY(xy=array([[-46.38,  -0.06,  -0.57, ...,    nan,    nan,    nan],
         [-46.36,  -0.06,  -0.56, ...,    nan,    nan,    nan],
         [-46.35,  -0.08,  -0.55, ...,    nan,    nan,    nan],
         ...,
         [   nan,    nan,    nan, ...,    nan,    nan,    nan],
         [   nan,    nan,    nan, ...,    nan,    nan,    nan],
         [   nan,    nan,    nan, ...,    nan,    nan,    nan]],
        shape=(3054601, 42)), framerate=25, direction='lr'),
  'Away': XY(xy=array([[42.22, -0.24,  8.87, ...,   nan,   nan,   nan],
         [42.21, -0.24,  8.88, ...,   nan,   nan,   nan],
         [42.21, -0.23,  8.88, ...,   nan,   nan,   nan],
         ...,
         [  nan,   nan,   nan, ...,   nan,   nan,   nan],
         [  nan,   nan,   nan, ...,   nan,   nan,   nan],
         [  nan,   nan,   nan, ...,   nan,   nan,   nan]],
        shape=(3054601, 42)), framerate=25, direction='rl'),
  'Ball': XY(xy=array([[-0.21,  0.07],
         [ 0.5 , -0.08],
         [ 1.26, 

In [17]:
# get the player list to use in the creation of the xy dataframe
away_players_xy = [f"{player}_{axis}" for player in away_players for axis in ("x", "y")]

In [18]:
xy_df = (
    pl.from_numpy(
        xy_objects['HT1']['Away'].xy, schema=away_players_xy).filter(~pl.all_horizontal(pl.all().is_nan())
    )
)


In [19]:
# write the velocity data to a parquet file 
xy_path = Path(base_path, game_path, "xy_HT1.parquet")
xy_df.write_parquet(xy_path)

In [20]:
# create a dataframe containing positional and ball status data & write it to parquet
xy_possession_df = pl.concat([xy_df, possession_status["HT1"]], how="horizontal")
xy_possession_path = Path(base_path, game_path, "xy_possession_HT1.parquet")
xy_possession_df.write_parquet(xy_possession_path)
xy_possession_df

S. Mignolet_x,S. Mignolet_y,Ordoñez_x,Ordoñez_y,Ferran Jutglà_x,Ferran Jutglà_y,K. Sabbe_x,K. Sabbe_y,C. Talbi_x,C. Talbi_y,H. Vanaken_x,H. Vanaken_y,B. Mechele_x,B. Mechele_y,C. Tzolis_x,C. Tzolis_y,Maxim De Cuyper_x,Maxim De Cuyper_y,Ardon Jashari_x,Ardon Jashari_y,Raphael Onyedika_x,Raphael Onyedika_y,Bjorn Meijer_x,Bjorn Meijer_y,Michal Skoras_x,Michal Skoras_y,J. Spileers_x,J. Spileers_y,C. Nielsen_x,C. Nielsen_y,Romero_x,Romero_y,H. Vetlesen_x,H. Vetlesen_y,Siquet_x,Siquet_y,Jackers_x,Jackers_y,Romeo Vermant_x,Romeo Vermant_y,Gustaf Nilsson_x,Gustaf Nilsson_y,HT1 possession
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
42.22,-0.24,8.87,5.31,-0.16,-22.24,11.01,12.63,0.09,9.1,0.45,-30.18,25.92,-3.65,0.16,-15.82,6.17,-16.74,-0.2,-0.14,6.25,-7.62,,,,,,,,,,,,,,,,,,,,,"""H"""
42.21,-0.24,8.88,5.31,-0.23,-22.23,11.0,12.63,0.09,9.09,0.45,-30.19,25.92,-3.65,0.15,-15.82,6.17,-16.75,-0.17,-0.15,6.26,-7.61,,,,,,,,,,,,,,,,,,,,,"""H"""
42.21,-0.23,8.88,5.3,-0.31,-22.22,10.99,12.64,0.09,9.09,0.44,-30.19,25.91,-3.65,0.13,-15.82,6.17,-16.75,-0.14,-0.16,6.27,-7.61,,,,,,,,,,,,,,,,,,,,,"""H"""
42.2,-0.22,8.88,5.3,-0.4,-22.2,10.98,12.65,0.09,9.08,0.42,-30.18,25.91,-3.65,0.11,-15.82,6.17,-16.76,-0.12,-0.17,6.27,-7.61,,,,,,,,,,,,,,,,,,,,,"""A"""
42.19,-0.21,8.88,5.29,-0.5,-22.18,10.97,12.66,0.09,9.07,0.38,-30.19,25.91,-3.65,0.08,-15.82,6.18,-16.76,-0.1,-0.18,6.28,-7.61,,,,,,,,,,,,,,,,,,,,,"""A"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
31.92,6.48,19.09,10.75,15.56,17.2,15.43,26.0,16.08,22.29,16.71,20.77,17.41,20.83,0.83,-20.73,18.02,10.19,18.99,9.89,16.12,13.31,,,,,,,,,,,,,,,,,,,,,"""A"""
31.92,6.43,19.09,10.75,15.55,17.2,15.43,25.96,16.07,22.26,16.7,20.73,17.42,20.81,0.83,-20.73,18.01,10.16,18.99,9.87,16.11,13.3,,,,,,,,,,,,,,,,,,,,,"""A"""
31.92,6.38,19.09,10.74,15.55,17.19,15.41,25.92,16.06,22.23,16.69,20.69,17.42,20.79,0.83,-20.73,18.0,10.12,18.99,9.85,16.11,13.29,,,,,,,,,,,,,,,,,,,,,"""A"""
31.92,6.33,19.1,10.73,15.54,17.19,15.4,25.88,16.05,22.21,16.68,20.65,17.42,20.76,0.83,-20.73,17.99,10.08,18.99,9.83,16.09,13.27,,,,,,,,,,,,,,,,,,,,,"""A"""


Metabolic Power 

In [21]:
metabolic_power_model = MetabolicPowerModel()

In [None]:
metabolic_power_model.fit(xy_objects['HT1']['Away'])

In [None]:
metabolic_power_model.metabolic_power()


Distance

In [14]:
# Dictionary to store results per playing half
distance_dfs = {}

dm = DistanceModel()
for half in halfs:                  
    distance_dfs[half] = {}
    
    for team in teams:
        # apply the floodlight function
        dm.fit(xy_objects[half][team])
        distance = dm.distance_covered()
        # extract only the distance data
        np_array = distance.property
        # define the player names that have to be used as column names
        players = home_players if team == "Home" else away_players
        schema_dict = {player:pl.Float32 for player in players}
        # store the data in a dataframe and filter out rows that entirely consist of nan values 
        distance_dfs[half][team] = (
            pl.DataFrame(
                np_array, 
                schema=schema_dict
            ).filter(
                ~pl.all_horizontal(
                    pl.all().is_nan()
                )
            )
        )

In [15]:
# concatenate data from the Home team, Away team, ball status & possession status
distance_df = (
    pl.concat(
        [
            distance_dfs["HT1"]['Home'],
            distance_dfs["HT1"]['Away'],
            ball_status["HT1"], 
            possession_status["HT1"]
        ], 
        how = "horizontal"
    ).drop_nulls()
)

In [16]:
# write the distance data to a parquet file 
distance_path = Path(base_path, game_path, "distance_HT1.parquet")
distance_df.write_parquet(distance_path)

Velocity

In [14]:

# Dictionary to store results
velocity_dfs = {}

vm = VelocityModel()
for half in halfs:
    velocity_dfs[half] = {}
    
    for team in teams:
        # apply the floodlight function
        vm.fit(xy_objects[half][team])
        velocity = vm.velocity()
        # extract only the velocity data
        np_array = velocity.property
        # define the player names that have to be used as column names
        players = home_players if team == "Home" else away_players
        schema_dict = {player:pl.Float32 for player in players}
        # store the data in a dataframe and filter out rows that entirely consist of nan values v
        velocity_dfs[half][team] = (
            pl.DataFrame(
                np_array, 
                schema=schema_dict
            ).filter(
                ~pl.all_horizontal(
                    pl.all().is_nan()
                )
            )
        )

In [15]:
# concatenate data from the Home team, Away team, ball status & possession status
velocity_df = (
    pl.concat(
        [
            velocity_dfs["HT1"]['Home'],
            velocity_dfs["HT1"]['Away'],
            ball_status["HT1"], 
            possession_status["HT1"]
        ], 
        how = "horizontal"
    ).drop_nulls()
)
#velocity_df

In [16]:
# write the velocity data to a parquet file 
velocity_path = Path(base_path, game_path, "velocity_HT1.parquet")
velocity_df.write_parquet(velocity_path)

In [None]:
# example ofhow the dataframes can be filtered to get distance ran above a certain speed treshold
pl.concat(
    [
        distance_dfs['HT2']['Away']["C. Tzolis"].to_frame(name="distance"), 
        velocity_dfs['HT2']['Away']["C. Tzolis"].to_frame(name="velocity")
    ], 
    how="horizontal"
).filter(
    ((pl.col("velocity") > 5.5) & (pl.col("velocity").is_not_nan()))
).select(
    "distance"
).sum()


distance
f64
518.688651
