In [None]:
import pandas as pd

def load_and_display_heads():
    """Load the five CSV inputs and print the first rows of each.

    Returns:
        tuple: ``(game_data, play_data, player_play_data, player_data,
        tracking_data)`` as pandas DataFrames, in that order.

    Raises:
        FileNotFoundError: If any of the CSV paths does not exist.
        pd.errors.EmptyDataError: If any file has no parsable content.
        Exception: Any other read error; it is reported and re-raised unchanged.
    """
    # File paths (update with your actual file paths)
    sources = {
        "Game Data": '../data/games.csv',
        "Play Data": '../data/plays.csv',
        "Player Play Data": '../data/player_play.csv',
        "Player Data": '../data/players.csv',
        "Tracking Data": '../data/tracking_week_1.csv',  # Example for week 1
    }

    frames = {}
    try:
        for label, path in sources.items():
            frames[label] = pd.read_csv(path)
    except FileNotFoundError as e:
        print(f"Error: {e}")
        # Re-raise: falling through to the return would fail on unbound
        # results and hide the real cause.
        raise
    except pd.errors.EmptyDataError:
        print("Error: One of the files is empty or corrupted.")
        raise
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        raise

    # Display heads of each dataset (blank line between consecutive sections).
    for position, (label, frame) in enumerate(frames.items()):
        separator = "" if position == 0 else "\n"
        print(f"{separator}{label} Head:")
        print(frame.head())

    # Dicts preserve insertion order, so this matches the documented order.
    return tuple(frames.values())

# Load all five datasets once and bind them to module-level names.
game_data, play_data, player_play_data, player_data, tracking_data = load_and_display_heads()


Game Data Head:
       gameId  season  week   gameDate gameTimeEastern homeTeamAbbr  \
0  2022090800    2022     1   9/8/2022        20:20:00           LA   
1  2022091100    2022     1  9/11/2022        13:00:00          ATL   
2  2022091101    2022     1  9/11/2022        13:00:00          CAR   
3  2022091102    2022     1  9/11/2022        13:00:00          CHI   
4  2022091103    2022     1  9/11/2022        13:00:00          CIN   

  visitorTeamAbbr  homeFinalScore  visitorFinalScore  
0             BUF              10                 31  
1              NO              26                 27  
2             CLE              24                 26  
3              SF              19                 10  
4             PIT              20                 23  

Play Data Head:
       gameId  playId                                    playDescription  \
0  2022102302    2655  (1:54) (Shotgun) J.Burrow pass short middle to...   
1  2022091809    3698  (2:13) (Shotgun) J.Burrow pass shor

In [None]:
import pandas as pd
import numpy as np

def parse_features(data):
    """
    Parses features (X) and targets (y) for the ball prediction model.

    Args:
        data (pd.DataFrame): One row per tracked entity per frame, already
            merged with play-level and player-play columns. Columns read here:
            playId (and gameId when present), frameId, frameType, event,
            x, y, s, dir, o, club, down, yardsToGo, preSnapHomeScore,
            preSnapVisitorScore, quarter, gameClock, rushLocationType,
            targetX, hadRushAttempt, wasTargettedReceiver, hadDropback.

    Returns:
        tuple: ``(x_data, y_data)``, two lists of equal length.
            x_data holds one dict per play with "situation" (play-level
            features) and "players" (one dict per row of the snap frame).
            y_data holds one ``(10, 120, 54)`` array per play with a single 1
            marking the ball-holder's grid cell at each sampled frame.

    Plays without a SNAP frame, or that are neither a rush
    (rushLocationType set) nor a targeted pass (targetX set), are skipped
    entirely so no empty targets are emitted.
    """
    n_seconds, grid_x, grid_y = 10, 120, 54  # Field grid: 120x54 yards
    frames_per_second = 10

    x_data, y_data = [], []

    # Include gameId in the key when it is available so that equal playId
    # values from different games are not merged into one play.
    group_keys = ["gameId", "playId"] if "gameId" in data.columns else ["playId"]

    # Iterate through plays
    for _, play_df in data.groupby(group_keys):
        play_info = play_df.iloc[0]

        # Get player tracking data at the snap
        snap_rows = play_df[play_df["frameType"] == "SNAP"]
        if snap_rows.empty:
            continue

        # Decide the play type once; skip plays that don't fit criteria
        is_rush = not pd.isna(play_info["rushLocationType"])
        is_pass = not pd.isna(play_info["targetX"])
        if not (is_rush or is_pass):
            continue

        # Extract situation features
        situation_features = {
            "down": play_info["down"],
            "distance": play_info["yardsToGo"],
            "score_differential": play_info["preSnapHomeScore"] - play_info["preSnapVisitorScore"],
            "quarter": play_info["quarter"],
            "game_clock": play_info["gameClock"],
        }

        player_features = (
            snap_rows[["x", "y", "s", "dir", "o", "club"]]
            .rename(columns={"club": "team"})
            .to_dict("records")
        )

        # Create X (features) for the play
        x_data.append({
            "situation": situation_features,
            "players": player_features,
        })

        # Create y (target) based on play type. Sampling is anchored at the
        # snap frame, so it is unchanged when the snap is frameId 1 and still
        # correct when pre-snap frames are present.
        ball_positions = np.zeros((n_seconds, grid_x, grid_y))
        snap_frame_id = snap_rows["frameId"].min()
        for second in range(n_seconds):
            frame = play_df[play_df["frameId"] == snap_frame_id + second * frames_per_second]
            if is_rush:
                ball_row = frame[frame["hadRushAttempt"] == 1]
            elif "passForward" in frame["event"].values:
                # NOTE(review): this only matches when the event is recorded on
                # the sampled frame itself; later frames fall back to the
                # dropback player. Confirm that is intended.
                ball_row = frame[frame["wasTargettedReceiver"] == 1]
            else:
                ball_row = frame[frame["hadDropback"] == 1]

            if ball_row.empty:
                continue

            raw_x = ball_row["x"].values[0]
            raw_y = ball_row["y"].values[0]
            if pd.isna(raw_x) or pd.isna(raw_y):
                continue

            # Clamp to the grid: negative values would wrap to the far side
            # and values at/after the last yard would raise IndexError.
            cell_x = int(np.clip(raw_x, 0, grid_x - 1))
            cell_y = int(np.clip(raw_y, 0, grid_y - 1))
            ball_positions[second, cell_x, cell_y] = 1

        y_data.append(ball_positions)

    return x_data, y_data

In [None]:
def chunk_by_play_id(data, max_chunk_size):
    """
    Filters unnecessary frames from tracking data and splits it into chunks.

    For every play, only rows whose frameId lies between the snap frame and
    snap frame + 101 (inclusive) are kept. Plays without a SNAP frame are
    dropped. Whole plays are then packed into chunks in group order; a play
    is never split across chunks.

    Args:
        data (pd.DataFrame): Tracking data with multiple frames per play.
            Must contain playId, frameId and frameType; gameId is used as
            part of the play key when present.
        max_chunk_size (int): Approximate maximum number of rows per chunk.
            A single play larger than this becomes a chunk of its own.

    Returns:
        list: A list of DataFrame chunks (empty when no play has a snap).
    """
    # playId values can repeat across games, so key on gameId as well when
    # the column is available.
    group_keys = ["gameId", "playId"] if "gameId" in data.columns else ["playId"]

    chunks = []
    pending = []
    pending_rows = 0

    for _, group in data.groupby(group_keys):
        # Find the frame ID for the snap
        snap_rows = group[group["frameType"] == "SNAP"]
        if snap_rows.empty:
            continue  # Skip plays without a snap frame

        snap_frame_id = snap_rows["frameId"].min()

        # Keep only frames from the snap up to 101 frames after
        window = group[group["frameId"].between(snap_frame_id, snap_frame_id + 101)]
        window_rows = len(window)

        # Close the current chunk only if it already holds something;
        # concatenating an empty list would raise.
        if pending and pending_rows + window_rows > max_chunk_size:
            chunks.append(pd.concat(pending))
            pending, pending_rows = [], 0

        pending.append(window)
        pending_rows += window_rows

    # Add the last chunk
    if pending:
        chunks.append(pd.concat(pending))

    return chunks


In [None]:
def process_tracking_data_in_chunks(tracking_data, play_data, game_data, player_data, player_play_data, result_X=None, result_y=None):
    """Build features and targets from tracking data one chunk at a time.

    The tracking data is split with ``chunk_by_play_id``; each chunk is merged
    with the player-play, player, and play/game tables and passed to
    ``parse_features``.

    Args:
        tracking_data (pd.DataFrame): Frame-level tracking rows.
        play_data (pd.DataFrame): Play-level rows, joined on gameId/playId.
        game_data (pd.DataFrame): Game-level rows, joined on gameId.
        player_data (pd.DataFrame): Player rows, joined on nflId.
        player_play_data (pd.DataFrame): Per-player-per-play rows, joined on
            gameId/playId/nflId.
        result_X (list, optional): List to extend with feature dicts. A new
            list is created when omitted.
        result_y (list, optional): List to extend with target arrays. A new
            list is created when omitted.

    Returns:
        tuple: ``(result_X, result_y)`` — the same list objects that were
        passed in (or the newly created ones), after being extended.
    """
    # Fresh lists per call; default list objects would be shared across calls.
    if result_X is None:
        result_X = []
    if result_y is None:
        result_y = []

    chunks = chunk_by_play_id(tracking_data, 10000)
    plays_with_games = play_data.merge(game_data, on="gameId", how="left")
    print(len(chunks))

    for chunk in chunks:
        # Merge only the current chunk, never the full tracking table.
        enriched = chunk.merge(player_play_data, on=["gameId", "playId", "nflId"], how="left")
        enriched = enriched.merge(player_data, on=["nflId"], how='left')

        data = enriched.merge(plays_with_games, on=["gameId", "playId"], how="left")

        # Process the chunk
        X_chunk, y_chunk = parse_features(data)
        print("Processed chunk")
        result_X.extend(X_chunk)
        result_y.extend(y_chunk)

    return result_X, result_y
# Accumulators that the chunked pipeline extends in place.
x_data, y_data = [], []
process_tracking_data_in_chunks(
    tracking_data,
    play_data,
    game_data,
    player_data,
    player_play_data,
    result_X=x_data,
    result_y=y_data,
)


MemoryError: Unable to allocate 163. MiB for an array with shape (3, 7104700) and data type int64

In [None]:
def ball_grid_probabilities(tracking_data, play_data):
    """
    Converts ball positions into one-hot grids of 1x1 yard squares.

    Args:
        tracking_data (pd.DataFrame): Player tracking data. Columns read:
            playId (and gameId when both frames have it), frameId, event,
            x, y, hadRushAttempt, wasTargettedReceiver, hadDropback.
        play_data (pd.DataFrame): Play-level information with playId
            (and optionally gameId) and playType ("run" or "pass").

    Returns:
        list: One entry per play found in both inputs; each entry is a list
        of ten ``(120, 54)`` arrays, one per sampled frame
        (frameId 1, 11, ..., 91). A grid is all zeros when no ball holder is
        found for that frame or the play type is neither "run" nor "pass".
    """
    grid_shape = (120, 54)
    n_seconds = 10
    # Same sampling stride as parse_features (second * 10 + 1).
    frames_per_second = 10

    def _mark(grid, rows):
        """Set the cell of the first row in ``rows`` to 1, if there is one."""
        if rows.empty:
            return
        raw_x, raw_y = rows["x"].iloc[0], rows["y"].iloc[0]
        if pd.isna(raw_x) or pd.isna(raw_y):
            return
        # Clamp so out-of-range coordinates neither wrap nor raise.
        cell_x = int(np.clip(raw_x, 0, grid_shape[0] - 1))
        cell_y = int(np.clip(raw_y, 0, grid_shape[1] - 1))
        grid[cell_x, cell_y] = 1

    # Match on gameId as well when both frames carry it, since playId alone
    # can repeat across games.
    has_game_id = "gameId" in tracking_data.columns and "gameId" in play_data.columns
    keys = ["gameId", "playId"] if has_game_id else ["playId"]

    grid_probs = []
    for key, play_df in tracking_data.groupby(keys):
        key_values = key if isinstance(key, tuple) else (key,)

        matches = pd.Series(True, index=play_data.index)
        for column, value in zip(keys, key_values):
            matches &= play_data[column] == value
        play_rows = play_data[matches]
        if play_rows.empty:
            continue  # No play-level row for this tracking group
        play_info = play_rows.iloc[0]

        # Generate grid for each second
        play_grids = []
        for second in range(n_seconds):
            second_frame = play_df[play_df["frameId"] == second * frames_per_second + 1]
            grid = np.zeros(grid_shape)

            # Assign ball location based on rules
            if play_info["playType"] == "run":
                _mark(grid, second_frame[second_frame["hadRushAttempt"] == 1])
            elif play_info["playType"] == "pass":
                if "passForward" in second_frame["event"].values:
                    _mark(grid, second_frame[second_frame["wasTargettedReceiver"] == 1])
                else:
                    _mark(grid, second_frame[second_frame["hadDropback"] == 1])

            play_grids.append(grid)
        grid_probs.append(play_grids)

    return grid_probs


[]

In [None]:

# Attach game-level columns to every play, then attach that play context to
# every tracking row.
# NOTE(review): play_data is overwritten with the merged result, so running
# this cell again merges game_data in a second time.
play_data = pd.merge(play_data, game_data, how="left", on="gameId")
data = pd.merge(tracking_data, play_data, how="left", on=["gameId", "playId"])

In [None]:
# Plays that have a rushLocationType value.
play_data[play_data["rushLocationType"].notna()]

Unnamed: 0,targetX
4,
5,
6,
7,
11,
...,...
16111,
16115,
16118,
16121,


In [None]:
# Merge datasets for context.
# The raw frames are left untouched (results go to new names) so this cell
# can be re-run without stacking merges on top of each other.
plays_with_games = play_data.merge(game_data, on="gameId", how="left")

# Per-play player stats are joined on the full (gameId, playId, nflId) key;
# joining on nflId alone would pair every tracking row of a player with every
# play that player appears in.
# NOTE(review): this merges the whole tracking table at once; if memory is
# tight, use process_tracking_data_in_chunks instead.
tracking_enriched = (
    tracking_data
    .merge(player_play_data, on=["gameId", "playId", "nflId"], how="left")
    .merge(player_data, on=["nflId"], how="left")
)
data = tracking_enriched.merge(plays_with_games, on=["gameId", "playId"], how="left")

# Feature/target extraction lives in parse_features; reuse it rather than
# keeping a second copy of the loop here.
X, y = parse_features(data)

KeyError: 'hadRushAttempt'

In [None]:
# Build the fully merged frame without overwriting the raw inputs, so the
# cell can be re-run safely.
# Player-play stats are joined on (gameId, playId, nflId); an nflId-only join
# would multiply each tracking row by the number of plays for that player.
data = (
    tracking_data
    .merge(player_play_data, on=["gameId", "playId", "nflId"], how="left")
    .merge(player_data, on=["nflId"], how="left")
    .merge(
        play_data.merge(game_data, on="gameId", how="left"),
        on=["gameId", "playId"],
        how="left",
    )
)

Index(['gameId', 'playId', 'nflId', 'displayName_x', 'frameId', 'frameType',
       'time', 'jerseyNumber', 'club', 'playDirection', 'x', 'y', 's', 'a',
       'dis', 'o', 'dir', 'event', 'height_x', 'weight_x', 'birthDate_x',
       'collegeName_x', 'position_x', 'displayName_y', 'height_y', 'weight_y',
       'birthDate_y', 'collegeName_y', 'position_y', 'displayName',
       'playDescription', 'quarter', 'down', 'yardsToGo', 'possessionTeam',
       'defensiveTeam', 'yardlineSide', 'yardlineNumber', 'gameClock',
       'preSnapHomeScore', 'preSnapVisitorScore', 'playNullifiedByPenalty',
       'absoluteYardlineNumber', 'preSnapHomeTeamWinProbability',
       'preSnapVisitorTeamWinProbability', 'expectedPoints',
       'offenseFormation', 'receiverAlignment', 'playClockAtSnap',
       'passResult', 'passLength', 'targetX', 'targetY', 'playAction',
       'dropbackType', 'dropbackDistance', 'passLocationType', 'timeToThrow',
       'timeInTackleBox', 'timeToSack', 'passTippedAtLine',


In [None]:
# Join the per-play player rows onto each player and preview the result.
# NOTE(review): player_data is overwritten, so re-running this cell merges
# player_play_data onto the already-merged frame.
player_data = pd.merge(player_data, player_play_data, how="left", on=["nflId"])
player_data

Unnamed: 0,nflId,height,weight,birthDate,collegeName,position,displayName,gameId,playId,teamAbbr,...,wasRunningRoute,routeRan,blockedPlayerNFLId1,blockedPlayerNFLId2,blockedPlayerNFLId3,pressureAllowedAsBlocker,timeToPressureAllowedAsBlocker,pff_defensiveCoverageAssignment,pff_primaryDefensiveCoverageMatchupNflId,pff_secondaryDefensiveCoverageMatchupNflId
0,25511,6-4,225,1977-08-03,Michigan,QB,Tom Brady,2022091113,413,TB,...,,,,,,,,,,
1,25511,6-4,225,1977-08-03,Michigan,QB,Tom Brady,2022091113,444,TB,...,,,,,,,,,,
2,25511,6-4,225,1977-08-03,Michigan,QB,Tom Brady,2022091113,465,TB,...,,,,,,,,,,
3,25511,6-4,225,1977-08-03,Michigan,QB,Tom Brady,2022091113,489,TB,...,,,,,,,,,,
4,25511,6-4,225,1977-08-03,Michigan,QB,Tom Brady,2022091113,510,TB,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
354722,55241,6-2,280,,Coastal Carolina,DT,C.J. Brewer,2022091900,3577,BUF,...,,,,,,,,,,
354723,55241,6-2,280,,Coastal Carolina,DT,C.J. Brewer,2022091900,3598,BUF,...,,,,,,,,,,
354724,55241,6-2,280,,Coastal Carolina,DT,C.J. Brewer,2022091900,3732,BUF,...,,,,,,,,,,
354725,55241,6-2,280,,Coastal Carolina,DT,C.J. Brewer,2022091900,3753,BUF,...,,,,,,,,,,


In [None]:
# Join player rows onto tracking rows using the three-column key.
# NOTE(review): this needs gameId and playId on both frames; the recorded
# KeyError suggests one side lacked gameId when the cell ran — confirm which.
tracking_data = pd.merge(
    tracking_data,
    player_data,
    how="left",
    on=["gameId", "nflId", "playId"],
)
tracking_data

KeyError: 'gameId'

In [None]:
# Count repeated (gameId, playId, nflId) keys, first in tracking_data and then
# in player_play_data, one count per line.
# tracking_data also has a frameId column, so repeated keys there do not by
# themselves indicate bad data; for player_play_data the expected count is 0.
print(
    *(
        frame.duplicated(subset=["gameId", "playId", "nflId"]).sum()
        for frame in (tracking_data, player_play_data)
    ),
    sep="\n",
)


7059804
0


In [None]:
# Inspect which columns tracking_data currently has.
tracking_data.columns

Index(['gameId', 'playId', 'nflId', 'displayName', 'frameId', 'frameType',
       'time', 'jerseyNumber', 'club', 'playDirection', 'x', 'y', 's', 'a',
       'dis', 'o', 'dir', 'event'],
      dtype='object')

In [None]:
# Left-join per-play player rows onto tracking: every tracking row is kept
# and matched on the (gameId, playId, nflId) key.
tracking_data = pd.merge(
    tracking_data,
    player_play_data,
    how="left",
    on=["gameId", "playId", "nflId"],
)

In [None]:
# Left-join player attributes onto tracking by nflId; every tracking row is kept.
tracking_data = pd.merge(
    tracking_data,
    player_data,
    how="left",
    on=["nflId"],
)

Index(['nflId', 'height', 'weight', 'birthDate', 'collegeName', 'position',
       'displayName'],
      dtype='object')