In [None]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from scipy.spatial import ConvexHull
from scipy.spatial.distance import cdist
from pathlib import Path


In [None]:
def get_convex_hull(df, football_location, n_neighbors=3):
    """Return the area of the convex hull around the points nearest a reference location.

    Parameters
    ----------
    df : pandas.DataFrame
        Candidate points; must provide ``x`` and ``y`` columns.
    football_location : pandas.DataFrame
        Reference location(s); must provide ``x`` and ``y`` columns. For every
        row, the ``n_neighbors`` closest rows of ``df`` are selected, and all
        selected rows are pooled into a single hull.
    n_neighbors : int, default 3
        How many nearest rows of ``df`` to take per reference row.

    Returns
    -------
    float
        Enclosed area of the 2-D convex hull of the selected points.

    Raises
    ------
    ValueError, scipy.spatial.QhullError
        Propagated from SciPy when no hull can be built (for example empty
        input, fewer than three points, or collinear points).
    """
    # Euclidean distance from every reference row to every candidate row
    distances = cdist(football_location[['x', 'y']], df[['x', 'y']], metric='euclidean')

    # Positional indices of the closest candidates for each reference row
    nearest_idx = distances.argsort(axis=1)[:, :n_neighbors]
    nearest_points = df.iloc[nearest_idx.flatten()]

    hull = ConvexHull(nearest_points[['x', 'y']].to_numpy())
    # For 2-D input SciPy exposes the enclosed area as `volume`;
    # the `area` attribute is the perimeter of the polygon.
    return hull.volume

In [None]:
class DataProcessor:
    """Stateless helpers that turn play, tackle and tracking tables into one
    row per (play, frame, defender) relative to the ball carrier.

    ``pipeline`` is the entry point; the other classmethods are its stages and
    can also be called individually.
    """

    # Columns that together identify a play
    PLAYIDS = ["gameId", "playId"]
    # Events accepted as the moment the ball carrier receives the ball
    CARRY_START_EVENTS = {
        "run",
        "handoff",
        "pass_outcome_caught",
        "snap_direct"
    }
    # Value of the ``club`` column on the ball's own tracking rows
    FOOTBALL_CLUB = "football"
    # Extents used to mirror left-moving plays (presumably field length and
    # width in yards -- confirm against the dataset documentation)
    FIELD_LENGTH = 120
    FIELD_WIDTH = 160 / 3

    @classmethod
    def change_direction_plays(cls, tracking):
        """Return a copy of ``tracking`` in which left-moving plays are rotated
        by 180 degrees so that they read like right-moving plays.

        For rows with ``playDirection == 'left'``, ``x`` and ``y`` are mirrored
        about the field extents and ``dir`` / ``o`` are advanced by 180 degrees;
        any angle above 360 is then reduced by 360. The ``playDirection``
        column itself is left untouched, and the input frame is not modified
        (so calling this again on the same input cannot flip it twice).
        """
        is_left = tracking["playDirection"] == "left"

        def _half_turn(angle):
            # +180 degrees for left plays, then wrap values above 360
            turned = np.where(is_left, angle + 180, angle)
            return np.where(turned > 360, turned - 360, turned)

        return tracking.assign(
            x=np.where(is_left, cls.FIELD_LENGTH - tracking["x"], tracking["x"]),
            y=np.where(is_left, cls.FIELD_WIDTH - tracking["y"], tracking["y"]),
            dir=_half_turn(tracking["dir"]),
            o=_half_turn(tracking["o"]),
        )

    @classmethod
    def construct_ball_carrier_view(cls, plays, tracking, direction_normalized=False):
        """
        Construct a view of tracking data focused on ball carriers,
        subset to the frames where they have the ball.

        Parameters
        ----------
        plays : pandas.DataFrame
            Must contain the play id columns and ``ballCarrierId``.
        tracking : pandas.DataFrame
            Tracking rows; assumed to be ordered by frame within each play,
            since "last" events are picked by row order.
        direction_normalized : bool, default False
            Pass True when ``tracking`` has already been through
            ``change_direction_plays``. Every play is then treated as moving
            right when computing ``s_downfield``; otherwise ``playDirection``
            decides the downfield heading.

        Returns
        -------
        pandas.DataFrame
            Ball-carrier rows between ``startFrameId`` and ``endFrameId``
            (inclusive), with ``receiptEvent``, ``endEvent``, ``x_final`` and
            ``s_downfield`` added.
        """
        # Use the `plays` argument (not a notebook global) to find each play's carrier
        ball_carriers = tracking.merge(
            plays[cls.PLAYIDS + ["ballCarrierId"]],
            left_on=cls.PLAYIDS + ["nflId"],
            right_on=cls.PLAYIDS + ["ballCarrierId"]
        )

        # take last non-NA event as end of play
        play_end = (
            ball_carriers.dropna(subset=["event"])
            .drop_duplicates(subset=cls.PLAYIDS, keep="last")
            [cls.PLAYIDS + ["frameId", "event", "x"]]
            .rename(columns=dict(frameId="endFrameId", event="endEvent", x="x_final"))
        )

        # last event of the class that is valid for the carrier receiving the ball
        carry_start = (
            ball_carriers[ball_carriers["event"].isin(cls.CARRY_START_EVENTS)]
            .drop_duplicates(subset=cls.PLAYIDS, keep="last")
            [cls.PLAYIDS + ["frameId", "event"]]
            .rename(columns=dict(frameId="startFrameId", event="receiptEvent"))
        )
        carry_window = carry_start.merge(play_end, on=cls.PLAYIDS)

        # attach the window to every carrier row; rows outside it are cut below
        ball_carriers = ball_carriers.merge(carry_window, on=cls.PLAYIDS)

        # Heading (degrees) that counts as straight downfield. Once the data
        # has been rotated to a common direction, applying the 270 branch to
        # left plays would flip the sign of s_downfield.
        if direction_normalized:
            downfield_heading = 90
        else:
            downfield_heading = np.where(ball_carriers["playDirection"] == "right", 90, 270)
        angle_to_downfield = np.pi * (ball_carriers["dir"] - downfield_heading) / 180
        ball_carriers["s_downfield"] = ball_carriers["s"] * np.cos(angle_to_downfield)

        in_window = ball_carriers["frameId"].between(
            ball_carriers["startFrameId"], ball_carriers["endFrameId"]
        )
        return (
            ball_carriers[in_window]
            .reset_index(drop=True)
            .astype({"nflId": "int", "jerseyNumber": "int"})
        )

    @classmethod
    def construct_defender_view(cls, carriers, tracking, nearby_radius=5):
        """
        Given a carrier view, determine where the defenders are relative to the carrier.

        Every carrier row is paired with all tracking rows of the same frame;
        rows of the carrier's own club and of the football are then dropped.
        Overlapping columns get ``_carrier`` / ``_defender`` suffixes, and
        ``dist_to_carrier`` holds the Euclidean distance between the two.

        ``nearby_radius`` is currently unused and only kept so existing calls
        keep working.
        """
        mergeby = cls.PLAYIDS + ["frameId"]

        # At this point "_defender" really means "every other tracked row";
        # same-team rows and the football are filtered out further down.
        others = carriers.merge(
            tracking.drop(columns=["playDirection", "time", "event"]),
            on=mergeby, suffixes=("_carrier", "_defender")
        )
        # the football has no nflId; 0 stands in so the column can be integer
        others["nflId_defender"] = others["nflId_defender"].fillna(0).astype("int")
        others["dist_to_carrier"] = np.hypot(
            others["x_carrier"] - others["x_defender"],
            others["y_carrier"] - others["y_defender"],
        )

        is_opponent = (
            (others["club_defender"] != others["club_carrier"])
            & (others["club_defender"] != cls.FOOTBALL_CLUB)
        )
        return (
            others[is_opponent]
            .reset_index(drop=True)
            .astype({"jerseyNumber_defender": "int"})
        )

    @staticmethod
    def _safe_hull_area(points, ball_carrier):
        """Hull value for ``points`` around ``ball_carrier``, or NaN when SciPy cannot build a hull."""
        try:
            return get_convex_hull(points, ball_carrier)
        except (ValueError, RuntimeError):
            # scipy's QhullError derives from RuntimeError; degenerate frames
            # (too few / collinear points, missing carrier) are expected and
            # recorded as NaN. Anything else is a real bug and should surface.
            return np.nan

    @classmethod
    def get_hull_df(cls, tackling, plays):
        """Compute offense and defense hull values for every (game, play, frame).

        ``tackling`` is a tracking table; it is joined to ``plays`` to obtain
        ``ballCarrierId`` and ``possessionTeam``. Offense rows are those whose
        ``club`` equals ``possessionTeam``; defense rows are all remaining
        rows except the football.

        Returns a frame with ``gameId``, ``playId``, ``frameId``,
        ``hull_area_offense`` and ``hull_area_defense``; a hull that cannot be
        computed is stored as NaN. Empty input yields an empty frame with
        those columns.
        """
        frame_keys = cls.PLAYIDS + ["frameId"]
        merged = tackling.merge(plays, on=cls.PLAYIDS)

        records = []
        # groupby already hands over each frame's rows -- no need to re-filter
        for (game, play, frame), frame_df in merged.groupby(frame_keys):
            ball_carrier = frame_df[frame_df["nflId"] == frame_df["ballCarrierId"]]
            on_offense = frame_df["possessionTeam"] == frame_df["club"]
            # the ball has its own tracking row; it is not a defender
            is_football = frame_df["club"] == cls.FOOTBALL_CLUB
            records.append({
                "gameId": game,
                "playId": play,
                "frameId": frame,
                "hull_area_offense": cls._safe_hull_area(frame_df[on_offense], ball_carrier),
                "hull_area_defense": cls._safe_hull_area(
                    frame_df[~on_offense & ~is_football], ball_carrier
                ),
            })

        # build the frame once instead of concatenating row by row
        return pd.DataFrame(
            records,
            columns=frame_keys + ["hull_area_offense", "hull_area_defense"],
        )

    @classmethod
    def merge_hull_df(cls, defenders, hull_df):
        """Attach the per-frame hull columns to ``defenders`` (inner join on play ids + ``frameId``)."""
        return defenders.merge(hull_df, on=cls.PLAYIDS + ["frameId"])

    @classmethod
    def add_labels(cls, defenders, tackles):
        """Left-join the tackle table onto ``defenders`` by play and defender id.

        Every column of ``tackles`` other than ``nflId`` is filled with 0 where
        no tackle row matched and cast to int.
        """
        labelled = defenders.merge(
            tackles,
            left_on=cls.PLAYIDS + ["nflId_defender"],
            right_on=cls.PLAYIDS + ["nflId"],
            how="left"
        ).drop(columns="nflId")
        map_cols = [c for c in tackles.columns if c != "nflId"]
        labelled[map_cols] = labelled[map_cols].fillna(0).astype("int")
        return labelled

    @classmethod
    def pipeline(cls, plays, players, tackles, tracking_dfs):
        """Run every stage over each tracking source and return one combined table.

        Parameters
        ----------
        plays, tackles : pandas.DataFrame
            Play and tackle tables.
        players : pandas.DataFrame
            Currently unused; kept so existing calls keep working.
        tracking_dfs : iterable of DataFrame, pathlib.Path or str
            Tracking tables, or paths to CSV files that are read one at a time.

        Returns
        -------
        pandas.DataFrame
            All defender rows, sorted by play ids, ``frameId`` and
            ``nflId_defender``.
        """
        frame_keys = cls.PLAYIDS + ["frameId"]
        per_source = []
        for source in tqdm(tracking_dfs):
            if isinstance(source, (str, Path)):
                source = pd.read_csv(source)
            tracking = cls.change_direction_plays(source)
            carriers = cls.construct_ball_carrier_view(
                plays, tracking, direction_normalized=True
            )
            defenders = cls.construct_defender_view(carriers, tracking)
            # Hulls are only kept for frames that survive the inner join in
            # merge_hull_df, so skip the (many) frames outside a carry window.
            carry_frames = carriers[frame_keys].drop_duplicates()
            hull_df = cls.get_hull_df(tracking.merge(carry_frames, on=frame_keys), plays)
            defenders = cls.merge_hull_df(defenders, hull_df)
            defenders = cls.add_labels(defenders, tackles)
            per_source.append(defenders)

        combined = pd.concat(per_source, ignore_index=True)
        return combined.sort_values(
            cls.PLAYIDS + ["frameId", "nflId_defender"]
        ).reset_index(drop=True)

In [None]:
# Location of the input CSV files; an empty string resolves to the current working directory.
PATH = ''
data_dir = Path(PATH)
# Fail fast with a readable message; a regular file would pass `exists()` but cannot be globbed.
assert data_dir.is_dir(), f"Data directory not found: {data_dir.resolve()}"

In [None]:
# Display the CSV files found directly inside data_dir, in sorted order.
sorted(data_dir.glob("*.csv"))

In [None]:
# Tracking files stay as paths here; DataProcessor.pipeline reads them one at a time.
tracking_dfs = sorted(data_dir.glob("tracking_week_*.csv"))

# Lookup tables are loaded up front.
play_df = pd.read_csv(data_dir / "plays.csv")
tackle_df = pd.read_csv(data_dir / "tackles.csv")
players_df = pd.read_csv(data_dir / "players.csv")
games_df = pd.read_csv(data_dir / "games.csv")

In [None]:
# Run the full processing pipeline over every tracking file and collect one combined table.
defenders = DataProcessor.pipeline(play_df, players_df, tackle_df, tracking_dfs)

In [None]:
# Persist the combined table as a pickle; the relative path resolves against the working directory.
defenders.to_pickle('tracking_data_clean.pkl')