In [1]:
from pathlib import Path
import polars as pl

In [2]:
# locations of the floodlight parquet data for HT1: one path for distance, one for velocity
distance_path = Path("floodlight_parquets") / "distance_HT1"
velocity_path = Path("floodlight_parquets") / "velocity_HT1"

In [3]:
# load the distance parquet data into a Polars dataframe
distance_df = pl.read_parquet(source=distance_path)

In [4]:
#distance_df

In [None]:
# Remove the ball status and possession columns from the distance dataframe.
# (Leaving them in would make it possible to filter on ball status and possession later on.)
columns_to_drop = ["HT1 ball status", "HT1 possession"]
distance_df = distance_df.drop(columns_to_drop)

In [6]:
# load the velocity parquet data into a Polars dataframe
velocity_df = pl.read_parquet(source=velocity_path)

In [7]:
#velocity_df

In [8]:
# Remove the ball status and possession columns from the velocity dataframe.
# (Leaving them in would make it possible to filter on ball status and possession later on.)
columns_to_drop = ["HT1 ball status", "HT1 possession"]
velocity_df = velocity_df.drop(columns_to_drop)

In [None]:
# Minimum speed for a sample to count as high speed running.
# Same unit as the velocity data (presumably m/s, TODO confirm); 6.67 m/s is roughly 24 km/h.
HSR_MIN_SPEED = 6.67


def high_speed_distance(
    distance_df: pl.DataFrame,
    velocity_df: pl.DataFrame,
    min_speed: float = HSR_MIN_SPEED,
) -> dict:
    """Sum, per column, the distance covered while moving faster than ``min_speed``.

    Parameters
    ----------
    distance_df
        Dataframe with one distance column per player.
    velocity_df
        Dataframe with a velocity column for every column name in ``distance_df``;
        rows are paired with ``distance_df`` by position.
    min_speed
        Exclusive lower bound on the velocity; only rows strictly above it are counted.

    Returns
    -------
    dict
        Maps each column name of ``distance_df`` to the summed distance as a single value.
    """
    totals = {}
    for player in distance_df.columns:
        # put the player's distance and velocity samples side by side
        samples = pl.concat(
            [
                distance_df[player].to_frame(name="distance"),
                velocity_df[player].to_frame(name="velocity"),
            ],
            how="horizontal",
        )
        # keep only the rows above the speed threshold; NaN velocities are excluded
        # explicitly because Polars (in recent versions) orders NaN above every number,
        # so `NaN > min_speed` would otherwise pass the filter
        fast_samples = samples.filter(
            (pl.col("velocity") > min_speed) & (pl.col("velocity").is_not_nan())
        )
        # sum the remaining distances; .item() stores the total as a single value
        # independent of a dataframe
        totals[player] = fast_samples.select("distance").sum().item()
    return totals


# store the high speed running data per player in a dictionary
HSR_distance = high_speed_distance(distance_df, velocity_df)

In [12]:
# build a dataframe from the dictionary (each key becomes a column name)
HSR_df = pl.from_dict(data=HSR_distance)
HSR_df

C. Coosemans,Amuzu,M. Rits,T. Leoni,K. Dolberg,K. Sardella,A. Dreyer,L. Dendoncker,J. Simić,Amando Lapage,Ali Maamar,L. Vázquez,Y. Verschaeren,N. Engwanda,Mads Kikkenborg,S. Edozie,Thorgan Hazard,T. Degreef,Thomas Foket,Timon Vanhoutte,Anas Tajaouart,S. Mignolet,Ordoñez,Ferran Jutglà,K. Sabbe,C. Talbi,H. Vanaken,B. Mechele,C. Tzolis,Maxim De Cuyper,Ardon Jashari,Raphael Onyedika,Bjorn Meijer,Michal Skoras,J. Spileers,C. Nielsen,Romero,H. Vetlesen,Siquet,Jackers,Romeo Vermant,Gustaf Nilsson
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.0,81.046163,89.14909,9.242945,87.572455,144.517399,141.575797,46.150263,70.861442,100.035167,140.466396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.312524,121.472748,49.406878,215.219879,42.884234,35.655697,254.425298,143.007219,35.928865,14.069999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Investigating the difference between Opta and floodlight data

In [13]:
# Opta data: metres run in the first half at more than 24 km/h, keyed by player name
# NOTE(review): "Raphael Onyedika" is the only non-zero entry that is not a whole number;
# confirm this value really comes from Opta
opta_data = {
    "C. Coosemans": 1,
    "Amuzu": 105,
    "M. Rits": 117,
    "T. Leoni": 21,
    "K. Dolberg": 162,
    "K. Sardella": 202,
    "A. Dreyer": 215,
    "L. Dendoncker": 80,
    "J. Simić": 119,
    "Amando Lapage": 122,
    "Ali Maamar": 207,
    "L. Vázquez": 0.0,
    "Y. Verschaeren": 0.0,
    "N. Engwanda": 0.0,
    "Mads Kikkenborg": 0.0,
    "S. Edozie": 0.0,
    "Thorgan Hazard": 0.0,
    "T. Degreef": 0.0,
    "Thomas Foket": 0.0,
    "Timon Vanhoutte": 0.0,
    "Anas Tajaouart": 0.0,
    "S. Mignolet": 0.0,
    "Ordoñez": 43,
    "Ferran Jutglà": 186,
    "K. Sabbe": 70,
    "C. Talbi": 246,
    "H. Vanaken": 72,
    "B. Mechele": 66,
    "C. Tzolis": 310,
    "Maxim De Cuyper": 211,
    "Ardon Jashari": 79,
    "Raphael Onyedika": 14.869796048973726,
    "Bjorn Meijer": 0.0,
    "Michal Skoras": 0.0,
    "J. Spileers": 0.0,
    "C. Nielsen": 0.0,
    "Romero": 0.0,
    "H. Vetlesen": 0.0,
    "Siquet": 0.0,
    "Jackers": 0.0,
    "Romeo Vermant": 0.0,
    "Gustaf Nilsson": 0.0,
}

In [14]:
# build a dataframe from the Opta dictionary (each key becomes a column name)
opta_df = pl.from_dict(data=opta_data)
opta_df

C. Coosemans,Amuzu,M. Rits,T. Leoni,K. Dolberg,K. Sardella,A. Dreyer,L. Dendoncker,J. Simić,Amando Lapage,Ali Maamar,L. Vázquez,Y. Verschaeren,N. Engwanda,Mads Kikkenborg,S. Edozie,Thorgan Hazard,T. Degreef,Thomas Foket,Timon Vanhoutte,Anas Tajaouart,S. Mignolet,Ordoñez,Ferran Jutglà,K. Sabbe,C. Talbi,H. Vanaken,B. Mechele,C. Tzolis,Maxim De Cuyper,Ardon Jashari,Raphael Onyedika,Bjorn Meijer,Michal Skoras,J. Spileers,C. Nielsen,Romero,H. Vetlesen,Siquet,Jackers,Romeo Vermant,Gustaf Nilsson
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,105,117,21,162,202,215,80,119,122,207,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43,186,70,246,72,66,310,211,79,14.869796,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# floodlight high speed distance expressed as a percentage of the Opta value,
# printed one line per player in column order
# (players with 0 in both sources give nan, because the division is 0 / 0)
for player in HSR_df.columns:
    floodlight_vs_opta = HSR_df[player] / opta_df[player]
    result = (floodlight_vs_opta * 100).item()
    print(result)

0.0
77.18682175565263
76.19580380155851
44.01402616988578
54.05707119886542
71.54326689744512
65.84920792924618
57.68782908399081
59.547430011012324
81.99603849764418
67.85816239351453
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
70.49424230409281
65.30792877571304
70.58125409352839
87.48775583959856
59.56143561506539
54.023783098708066
82.07267681387526
67.77593303676234
45.47957640291258
94.62133292320733
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan


In [16]:
# display the floodlight high speed running totals per player in dictionary form
HSR_distance

{'C. Coosemans': 0.0,
 'Amuzu': 81.04616284343525,
 'M. Rits': 89.14909044782347,
 'T. Leoni': 9.242945495676013,
 'K. Dolberg': 87.57245534216199,
 'K. Sardella': 144.51739913283916,
 'A. Dreyer': 141.5757970478793,
 'L. Dendoncker': 46.150263267192656,
 'J. Simić': 70.86144171310467,
 'Amando Lapage': 100.03516696712589,
 'Ali Maamar': 140.46639615457508,
 'L. Vázquez': 0.0,
 'Y. Verschaeren': 0.0,
 'N. Engwanda': 0.0,
 'Mads Kikkenborg': 0.0,
 'S. Edozie': 0.0,
 'Thorgan Hazard': 0.0,
 'T. Degreef': 0.0,
 'Thomas Foket': 0.0,
 'Timon Vanhoutte': 0.0,
 'Anas Tajaouart': 0.0,
 'S. Mignolet': 0.0,
 'Ordoñez': 30.312524190759905,
 'Ferran Jutglà': 121.47274752282624,
 'K. Sabbe': 49.40687786546987,
 'C. Talbi': 215.21987936541248,
 'H. Vanaken': 42.884233642847086,
 'B. Mechele': 35.655696845147325,
 'C. Tzolis': 254.42529812301333,
 'Maxim De Cuyper': 143.00721870756854,
 'Ardon Jashari': 35.92886535830094,
 'Raphael Onyedika': 14.069999224501359,
 'Bjorn Meijer': 0.0,
 'Michal Skoras'