Notebook that counts the number of high-speed runs per player

In [1]:
from pathlib import Path
import polars as pl

In [2]:
# root directory holding the parquet files with the floodlight data (velocity)
base_path = Path("SecondSpectrum/Second Spectrum")

In [3]:
# sub-directory of the game to analyse (joined onto base_path when building the file path)
game_path = Path("Anderlecht - Club Brugge")

In [4]:
# full path to the velocity parquet file of the selected game
velocity_path = base_path / game_path / "velocity_HT1.parquet"

In [5]:
# read the velocity parquet file (velocity_HT1) into a polars dataframe
velocity_df = pl.read_parquet(velocity_path)

In [6]:
# Remove the ball status and possession columns so that every remaining column
# of the velocity dataframe can be iterated over as a player.
# Keeping them in the dataframe would add the extra option of filtering on
# ball status and possession.
# The complement is selected instead of calling `drop`, which keeps this cell
# idempotent: re-running it once the columns are already gone does not raise.
non_player_columns = ["HT1 ball status", "HT1 possession"]
velocity_df = velocity_df.select(
    [column for column in velocity_df.columns if column not in non_player_columns]
)

In [7]:
# target speed (m/s) a player has to reach to start a high speed run (6.67 m/s is roughly 24 km/h)
target_speed = 6.67

# fraction of the target speed that has to be lost to define the end of a high speed run
drop_off = 0.05

# determine the cutoff speed based on the target speed and the drop-off fraction
cutoff_speed = target_speed - (target_speed * drop_off)

In [None]:
def count_high_speed_runs(speeds, target_speed: float, cutoff_speed: float) -> int:
    """Count the high speed runs in a sequence of per-frame speeds.

    A run starts on the first frame whose speed is at or above ``target_speed``
    and ends on the first later frame whose speed is at or below
    ``cutoff_speed``. Frames between the two values leave the current state
    untouched, so a short dip just below ``target_speed`` does not split one
    run into several.

    Args:
        speeds: Iterable of speed values, one per frame. Missing frames (None)
            and NaN values are ignored and neither start nor end a run.
        target_speed: Speed that has to be reached to start a run.
        cutoff_speed: Speed at or below which an ongoing run ends.

    Returns:
        The number of distinct high speed runs found in ``speeds``.
    """
    run_counter = 0

    # True while the frames being processed belong to an ongoing run
    in_run = False

    for speed in speeds:
        # a null frame cannot be compared against the thresholds, skip it
        if speed is None:
            continue

        if speed >= target_speed:
            # only the first frame of a run increases the counter
            if not in_run:
                run_counter += 1
                in_run = True

        # the run ends once the speed has dropped to the cutoff speed
        elif in_run and speed <= cutoff_speed:
            in_run = False

    return run_counter


# number of high speed runs per player (key = player column name)
player_HSR = {}

for player in velocity_df.columns:
    player_HSR[player] = count_high_speed_runs(
        velocity_df[player], target_speed, cutoff_speed
    )

In [9]:
# convert the dictionary to a one-row dataframe (keys = column names, values = run counts)
HSR_df = pl.from_dict(player_HSR)
HSR_df

C. Coosemans,Amuzu,M. Rits,T. Leoni,K. Dolberg,K. Sardella,A. Dreyer,L. Dendoncker,J. Simić,Amando Lapage,Ali Maamar,L. Vázquez,Y. Verschaeren,N. Engwanda,Mads Kikkenborg,S. Edozie,Thorgan Hazard,T. Degreef,Thomas Foket,Timon Vanhoutte,Anas Tajaouart,S. Mignolet,Ordoñez,Ferran Jutglà,K. Sabbe,C. Talbi,H. Vanaken,B. Mechele,C. Tzolis,Maxim De Cuyper,Ardon Jashari,Raphael Onyedika,Bjorn Meijer,Michal Skoras,J. Spileers,C. Nielsen,Romero,H. Vetlesen,Siquet,Jackers,Romeo Vermant,Gustaf Nilsson
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
0,7,6,1,8,12,12,7,9,6,15,0,0,0,0,0,0,0,0,0,0,0,7,11,6,15,5,7,18,12,8,2,0,0,0,0,0,0,0,0,0,0


In [10]:
# Opta reference data: number of runs faster than 24 km/h per player in the first half
# NOTE(review): the zero counts are written as floats (0.0) while the other counts are
# ints, so the dataframe built from this dictionary mixes f64 and i64 columns.
opta_data = {
    'C. Coosemans': 0,
    'Amuzu': 8,
    'M. Rits': 6,
    'T. Leoni': 1,
    'K. Dolberg': 12,
    'K. Sardella': 14,
    'A. Dreyer': 17,
    'L. Dendoncker': 8,
    'J. Simić': 9,
    'Amando Lapage': 6,
    'Ali Maamar': 22,
    'L. Vázquez': 0.0,
    'Y. Verschaeren': 0.0,
    'N. Engwanda': 0.0,
    'Mads Kikkenborg': 0.0,
    'S. Edozie': 0.0,
    'Thorgan Hazard': 0.0,
    'T. Degreef': 0.0,
    'Thomas Foket': 0.0,
    'Timon Vanhoutte': 0.0,
    'Anas Tajaouart': 0.0,
    'S. Mignolet': 0.0,
    'Ordoñez': 6,
    'Ferran Jutglà': 14,
    'K. Sabbe': 7,
    'C. Talbi': 13,
    'H. Vanaken': 8,
    'B. Mechele': 7,
    'C. Tzolis': 19,
    'Maxim De Cuyper': 16,
    'Ardon Jashari': 11,
    'Raphael Onyedika': 3,
    'Bjorn Meijer': 0.0,
    'Michal Skoras': 0.0,
    'J. Spileers': 0.0,
    'C. Nielsen': 0.0,
    'Romero': 0.0,
    'H. Vetlesen': 0.0,
    'Siquet': 0.0,
    'Jackers': 0.0,
    'Romeo Vermant': 0.0,
    'Gustaf Nilsson': 0.0
}

In [11]:
# convert the Opta dictionary to a one-row dataframe (keys = column names)
opta_df = pl.from_dict(opta_data)
opta_df

C. Coosemans,Amuzu,M. Rits,T. Leoni,K. Dolberg,K. Sardella,A. Dreyer,L. Dendoncker,J. Simić,Amando Lapage,Ali Maamar,L. Vázquez,Y. Verschaeren,N. Engwanda,Mads Kikkenborg,S. Edozie,Thorgan Hazard,T. Degreef,Thomas Foket,Timon Vanhoutte,Anas Tajaouart,S. Mignolet,Ordoñez,Ferran Jutglà,K. Sabbe,C. Talbi,H. Vanaken,B. Mechele,C. Tzolis,Maxim De Cuyper,Ardon Jashari,Raphael Onyedika,Bjorn Meijer,Michal Skoras,J. Spileers,C. Nielsen,Romero,H. Vetlesen,Siquet,Jackers,Romeo Vermant,Gustaf Nilsson
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0,8,6,1,12,14,17,8,9,6,22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,14,7,13,8,7,19,16,11,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# floodlight run count expressed as a percentage of the Opta run count,
# printed one value per player in column order
# (players with 0 runs in both sources give nan because of the 0 / 0 division)
for player_name in HSR_df.columns:
    floodlight_runs = HSR_df[player_name]
    opta_runs = opta_df[player_name]
    percentage = floodlight_runs / opta_runs * 100
    print(percentage.item())

nan
87.5
100.0
100.0
66.66666666666666
85.71428571428571
70.58823529411765
87.5
100.0
100.0
68.18181818181817
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
116.66666666666667
78.57142857142857
85.71428571428571
115.38461538461537
62.5
100.0
94.73684210526315
75.0
72.72727272727273
66.66666666666666
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
