In [1]:
%load_ext autoreload
%autoreload 2

from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from tqdm import tqdm

from sync import config
from sync.preprocessor import Preprocessor, find_spadl_event_types
from sync.elastic import ELASTIC

pd.set_option('display.width', 250)
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 30)

### Preprocessing original data

In [None]:
game_ids = ["8qhuektrx8cmkxs11lxsdd4pg", "8t53c07vfe5vmg9jm0w7pq2vo", "9gewka7f25bz12mqrfm7ygjro"]

lineups = pd.read_parquet("data/ajax/lineup/line_up.parquet")
events = pd.read_parquet("data/ajax/event/event.parquet")
events["utc_timestamp"] = pd.to_datetime(events["utc_timestamp"])
events = find_spadl_event_types(events)

data_dict = dict()

for game_id in tqdm(game_ids):
    game_lineup = lineups.loc[lineups["stats_perform_match_id"] == game_id].set_index("player_id")
    game_events = events[
        (events["stats_perform_match_id"] == game_id)
        & (events["spadl_type"].notna())
        & (events["player_id"].notna())
    ].copy()

    traces = pd.read_parquet(f"data/ajax/tracking/{game_id}.parquet")

    proc = Preprocessor(game_lineup, game_events, traces)
    input_events = proc.format_events_for_syncer()
    input_traces = proc.format_traces_for_syncer()
    start_utc = proc.traces.at[0, "utc_timestamp"]

    data_dict[game_id] = {"start_utc": start_utc, "events": input_events, "traces": input_traces}

100%|██████████| 3/3 [00:11<00:00,  3.96s/it]


### Synchronizing the corrected event data with tracking data

In [3]:
def time_str_to_frame(t: str, fps=25) -> float:
    return round((float(t[:2]) * 60 + float(t[3:])) * fps) if isinstance(t, str) else np.nan

def frame_to_utc_timestamp(frame: float, start_utc: datetime, fps=25) -> datetime:
    return start_utc + timedelta(seconds=frame / fps) if not np.isnan(frame) else np.nan

def get_event_attr(event: pd.Series, col="x"):
    return input_traces[input_traces["ball"]].set_index("frame").at[event["frame"], col]

In [4]:
for game_id in game_ids:
    print()
    print(game_id)
    
    start_utc = data_dict[game_id]["start_utc"]
    input_events = data_dict[game_id]["events"]
    input_traces = data_dict[game_id]["traces"]

    data_path = f"data/ajax/event_corrected/{game_id}.csv"
    drop_cols = ["next_player_id", "next_type", "receive_frame", "note"]
    corrected = pd.read_csv(data_path, header=0).drop(drop_cols, axis=1)
    corrected.loc[corrected["spadl_type"] == "ball_touch", "spadl_type"] = "bad_touch"
    corrected = corrected[(corrected["period_id"] == 1) & (corrected["spadl_type"] != "shield_ball_oop")].copy()

    corrected["frame"] = corrected["synced_ts"].apply(time_str_to_frame)
    corrected["receive_frame"] = corrected["receive_ts"].apply(time_str_to_frame)

    period_events = input_events[input_events["period_id"] == 1]
    corrected.loc[corrected["error_type"] != "missing", "utc_timestamp"] = period_events["utc_timestamp"].values
    corrected.loc[corrected["error_type"] != "missing", "start_x"] = period_events["start_x"].values
    corrected.loc[corrected["error_type"] != "missing", "start_y"] = period_events["start_y"].values
    corrected["utc_timestamp"] = pd.to_datetime(corrected["utc_timestamp"])

    missing = corrected[corrected["error_type"] == "missing"].copy()
    corrected.loc[missing.index, "utc_timestamp"] = missing["frame"].apply(frame_to_utc_timestamp, args=(start_utc,))
    corrected.loc[missing.index, "start_x"] = missing.apply(get_event_attr, col="x", axis=1)
    corrected.loc[missing.index, "start_y"] = missing.apply(get_event_attr, col="y", axis=1)

    corrected_input = corrected.loc[corrected["error_type"] != "false_positive", input_events.columns].copy()
    corrected_input.reset_index(drop=True, inplace=True)

    syncer = ELASTIC(corrected_input, input_traces)
    syncer.run()

    data_dict[game_id]["corrected_events"] = corrected
    data_dict[game_id]["synced_events"] = syncer.events


8qhuektrx8cmkxs11lxsdd4pg


Syncing major events in period 1: 100%|██████████| 685/685 [00:20<00:00, 32.96it/s]
Detecting receiving events: 100%|██████████| 613/613 [00:08<00:00, 70.34it/s]
Post-syncing minor events: 100%|██████████| 75/75 [00:01<00:00, 53.46it/s]



8t53c07vfe5vmg9jm0w7pq2vo


Syncing major events in period 1: 100%|██████████| 551/551 [00:16<00:00, 34.33it/s]
Detecting receiving events: 100%|██████████| 498/498 [00:07<00:00, 70.17it/s]
Post-syncing minor events: 100%|██████████| 86/86 [00:02<00:00, 39.26it/s]



9gewka7f25bz12mqrfm7ygjro


Syncing major events in period 1: 100%|██████████| 664/664 [00:20<00:00, 33.01it/s]
Detecting receiving events: 100%|██████████| 609/609 [00:08<00:00, 68.53it/s]
Post-syncing minor events: 100%|██████████| 70/70 [00:01<00:00, 39.77it/s]


### Calculating accuracy

In [5]:
synced = []
corrected = []

for game_id, data in data_dict.items():
    game_events = data["synced_events"]
    game_events["game_id"] = game_id
    synced.append(game_events)
    corrected.append(data["corrected_events"])

synced = pd.concat(synced, ignore_index=True)
corrected = pd.concat(corrected, ignore_index=True)

true_frames_s = corrected.loc[corrected["error_type"] != "false_positive", "frame"].round().values
true_frames_r = corrected.loc[corrected["error_type"] != "false_positive", "receive_frame"].round().values
len(synced), len(true_frames_s)

(2134, 2134)

In [6]:
pass_like_dict = {x: "pass_like" for x in config.PASS_LIKE_OPEN}
set_piece_dict = {x: "set_piece" for x in config.SET_PIECE}
incoming_dict = {x: "incoming" for x in config.INCOMING}
tackle_dict = {"tackle": "tackle"}
minor_dict = {x: "minor" for x in config.MINOR}
event_cats = pass_like_dict | set_piece_dict | incoming_dict | tackle_dict | minor_dict
synced["event_cat"] = synced["spadl_type"].map(event_cats)
synced["event_cat"].value_counts()

event_cat
pass_like    1590
minor         231
incoming      168
set_piece     117
tackle         28
Name: count, dtype: int64

In [10]:
acc_counts = dict()
acc_rates = dict()

for cat in ["pass_like", "set_piece", "incoming", "tackle", "minor"]:
    cat_events = synced[synced["event_cat"] == cat]
    cat_true_frames = true_frames_s[cat_events.index]
    
    cat_acc = dict()
    cat_acc["total"] = len(cat_events)
    cat_acc["mean_diff"] = (cat_events["frame"] - cat_true_frames).abs().mean()
    cat_acc["exact"] = (cat_events["frame"] == cat_true_frames).astype(int).sum()
    cat_acc["within_5"] = ((cat_events["frame"] - cat_true_frames).abs() <= 5).astype(int).sum()
    cat_acc["within_25"] = ((cat_events["frame"] - cat_true_frames).abs() <= 25).astype(int).sum()
    cat_acc["within_50"] = ((cat_events["frame"] - cat_true_frames).abs() <= 50).astype(int).sum()
    cat_acc["valid"] = (cat_events["frame"].notna()).astype(int).sum()
    acc_counts[cat] = cat_acc

acc_counts = pd.DataFrame(acc_counts).T
acc_counts.loc["all_but_receive"] = acc_counts.sum(axis=0)
acc_counts.at["all_but_receive", "mean_diff"] = (synced["frame"] - true_frames_s).abs().mean()

receive_acc = dict()
receive_acc["total"] = len(corrected[(corrected["error_type"] != "false_positive") & (corrected["receive_frame"].notna())])
receive_acc["mean_diff"] = (synced['receive_frame'] - true_frames_r).abs().mean()
receive_acc["exact"] = (synced["receive_frame"] == true_frames_r).astype(int).sum()
receive_acc["within_5"] = ((synced["receive_frame"] - true_frames_r).abs() <= 5).astype(int).sum()
receive_acc["within_25"] = ((synced["receive_frame"] - true_frames_r).abs() <= 25).astype(int).sum()
receive_acc["within_50"] = ((synced["receive_frame"] - true_frames_r).abs() <= 50).astype(int).sum()
receive_acc["valid"] = (synced["receive_frame"].notna()).astype(int).sum()
acc_counts.loc["receive"] = receive_acc

acc_counts.loc["all_with_receive"] = acc_counts.loc["all_but_receive"] + acc_counts.loc["receive"]
sum_diff_s = (synced['frame'] - true_frames_s).abs().sum()
sum_diff_r = (synced['receive_frame'] - true_frames_r).abs().sum()
acc_counts.at["all_with_receive", "mean_diff"] = (sum_diff_s + sum_diff_r) / acc_counts.at["all_with_receive", "total"]

int_cols = ["total", "exact", "within_5", "within_25", "within_50", "valid"]
acc_counts[int_cols] = acc_counts[int_cols].astype(int)
acc_counts

Unnamed: 0,total,mean_diff,exact,within_5,within_25,within_50,valid
pass_like,1590,0.569064,1520,1546,1561,1566,1571
set_piece,117,0.122807,108,113,114,114,114
incoming,168,6.584337,144,147,151,158,166
tackle,28,11.296296,12,18,22,25,27
minor,231,12.883929,101,151,174,210,224
all_but_receive,2134,2.470029,1885,1975,2022,2073,2102
receive,1745,1.721283,1612,1654,1676,1694,1715
all_with_receive,3879,2.09951,3497,3629,3698,3767,3817


In [11]:
acc_rates = acc_counts.drop(["total", "mean_diff"], axis=1).div(acc_counts["total"], axis=0)
acc_rates

Unnamed: 0,exact,within_5,within_25,within_50,valid
pass_like,0.955975,0.972327,0.981761,0.984906,0.98805
set_piece,0.923077,0.965812,0.974359,0.974359,0.974359
incoming,0.857143,0.875,0.89881,0.940476,0.988095
tackle,0.428571,0.642857,0.785714,0.892857,0.964286
minor,0.437229,0.65368,0.753247,0.909091,0.969697
all_but_receive,0.883318,0.925492,0.947516,0.971415,0.985005
receive,0.923782,0.947851,0.960458,0.970774,0.982808
all_with_receive,0.901521,0.93555,0.953338,0.971127,0.984016
