notebook to find out the amount of distance players run above certrain speed tresholds during a  match

In [1]:
import polars as pl
from pathlib import Path
from floodligt_functions import clean_metadata
from floodligt_functions import read_position_data_jsonl
from floodlight.models.kinematics import DistanceModel
from floodlight.models.kinematics import VelocityModel

In [2]:
base_path = Path("SecondSpectrum/Second Spectrum")

In [3]:
game_path = Path("Anderlecht - Club Brugge")

In [4]:
# path to the tracking data file
tracking_file_path = Path(base_path, game_path, "tracking-produced.jsonl")

In [5]:
# path to the origibal meta data file
metadata_path = Path(base_path, game_path, "rsc-bru_metadata.json")

In [6]:
# path to the cleaned metadata file = output of clean_metadata function
cleaned_metadata_path = Path(base_path, game_path, "cleaned_metadata.json")

In [7]:
# modifying the metadata file so it can be used as input for the floodlight function
clean_metadata(metadata_path, cleaned_metadata_path)

In [8]:
# reading in the tracking / position data
position_data = read_position_data_jsonl(
    tracking_file_path,
    cleaned_metadata_path
)

Getting the data

In [9]:
# storing different possibilities for halfs and teams to be used in loops
# only HT1 is used to avoid kernel crashing 
halfs =  ["HT1"] #["HT1", "HT2"]
teams = ["Home", "Away"]

In [10]:
# get list of home players
# this will be used to match player with correct position data
home_players = [player for player in position_data[3]['Home']["player"]]

# get list of away players
away_players = [player for player in position_data[3]['Away']["player"]]

In [11]:
# store a dataframe per half showing the possession status (H = Home or A = Away) 
possession_status = {}

for half in halfs:
    # access the possession data in the position data dictionary
    half_possession = position_data[1][half].code

    # Convert np.nan to None so Polars can handle them as nulls
    half_possession = [x if isinstance(x, str) else None for x in half_possession]

    # Create DataFrame
    possession_status[half] = pl.DataFrame({
        f"{half} possession": half_possession
    }).drop_nulls()

In [12]:
# store a dataframe per half showing the possession status (D = Dead or A = Alive) 
ball_status = {}

for half in halfs:
    # access the ball data in the position data dictionary
    half_ball_status = position_data[2][half].code
    
    # Convert np.nan to None so Polars can handle them as nulls
    half_ball_status = [x if isinstance(x, str) else None for x in half_ball_status]
    
    # Create DataFrame
    ball_status[half] = pl.DataFrame({
        f"{half} ball status": half_ball_status
    }).drop_nulls()

In [13]:
# get the xy coordinates of the position data
# needed as input for the floodlight position - velocity - acceleration data
xy_objects = position_data[0]

In [18]:
# get the player list to use in the creation of the xy dataframe
home_players_xy = [f"{player}_{axis}" for player in home_players for axis in ("x", "y")]

In [None]:
xy_df = (
    pl.from_numpy(
        xy_objects['HT1']['Home'].xy, schema=home_players_xy).filter(~pl.all_horizontal(pl.all().is_nan())
    )
)


In [None]:
# write the velocity data to a parquet file 
xy_path = Path(base_path, game_path, "xy_HT1.parquet")
xy_df.write_parquet(xy_path)

In [14]:
# Dictionary to store results per playing half
distance_dfs = {}

dm = DistanceModel()
for half in halfs:                  
    distance_dfs[half] = {}
    
    for team in teams:
        # apply the floodlight function
        dm.fit(xy_objects[half][team])
        distance = dm.distance_covered()
        # extract only the distance data
        np_array = distance.property
        # define the player names that have to be used as column names
        players = home_players if team == "Home" else away_players
        schema_dict = {player:pl.Float32 for player in players}
        # store the data in a dataframe and filter out rows that entirely consist of nan values 
        distance_dfs[half][team] = (
            pl.DataFrame(
                np_array, 
                schema=schema_dict
            ).filter(
                ~pl.all_horizontal(
                    pl.all().is_nan()
                )
            )
        )

In [15]:
# concatenate data from the Home team, Away team, ball status & possession status
distance_df = (
    pl.concat(
        [
            distance_dfs["HT1"]['Home'],
            distance_dfs["HT1"]['Away'],
            ball_status["HT1"], 
            possession_status["HT1"]
        ], 
        how = "horizontal"
    ).drop_nulls()
)

In [16]:
# write the distance data to a parquet file 
distance_path = Path(base_path, game_path, "distance_HT1.parquet")
distance_df.write_parquet(distance_path)

In [14]:

# Dictionary to store results
velocity_dfs = {}

vm = VelocityModel()
for half in halfs:
    velocity_dfs[half] = {}
    
    for team in teams:
        # apply the floodlight function
        vm.fit(xy_objects[half][team])
        velocity = vm.velocity()
        # extract only the velocity data
        np_array = velocity.property
        # define the player names that have to be used as column names
        players = home_players if team == "Home" else away_players
        schema_dict = {player:pl.Float32 for player in players}
        # store the data in a dataframe and filter out rows that entirely consist of nan values v
        velocity_dfs[half][team] = (
            pl.DataFrame(
                np_array, 
                schema=schema_dict
            ).filter(
                ~pl.all_horizontal(
                    pl.all().is_nan()
                )
            )
        )

In [15]:
# concatenate data from the Home team, Away team, ball status & possession status
velocity_df = (
    pl.concat(
        [
            velocity_dfs["HT1"]['Home'],
            velocity_dfs["HT1"]['Away'],
            ball_status["HT1"], 
            possession_status["HT1"]
        ], 
        how = "horizontal"
    ).drop_nulls()
)
#velocity_df

In [16]:
# write the velocity data to a parquet file 
velocity_path = Path(base_path, game_path, "velocity_HT1.parquet")
velocity_df.write_parquet(velocity_path)

In [None]:
# example ofhow the dataframes can be filtered to get distance ran above a certain speed treshold
pl.concat(
    [
        distance_dfs['HT2']['Away']["C. Tzolis"].to_frame(name="distance"), 
        velocity_dfs['HT2']['Away']["C. Tzolis"].to_frame(name="velocity")
    ], 
    how="horizontal"
).filter(
    ((pl.col("velocity") > 5.5) & (pl.col("velocity").is_not_nan()))
).select(
    "distance"
).sum()


distance
f64
518.688651
