# Pre-Processing Game Logs for ALL 2024 Games to Date
All code & functions were originated and pulled from the 02_Fine_Tuning_Data_II_Drives notebook.

In [36]:
import pandas as pd 
import numpy as np 
import re 
from collections import defaultdict

In [37]:
# Read in training data 
df = pd.read_csv("/Users/laurenmanis/Desktop/Fall 2024/DS 5690 | Gen AI Models/all_game_logs.csv").drop(columns='Unnamed: 0')
players = pd.read_csv('/Users/laurenmanis/Desktop/Fall 2024/DS 5690 | Gen AI Models/nfl_players.csv').drop(columns='Unnamed: 0')
df.head()

Unnamed: 0,game_id,game_log
0,2024_01_ARI_BUF,GAME\n2-T.Bass kicks 65 yards from BUF 35 to e...
1,2024_01_BAL_KC,GAME\n7-H.Butker kicks 65 yards from KC 35 to ...
2,2024_01_CAR_NO,GAME\n4-E.Pineiro kicks 64 yards from CAR 35 t...
3,2024_01_DAL_CLE,GAME\n7-D.Hopkins kicks 66 yards from CLE 35 t...
4,2024_01_DEN_SEA,GAME\n3-W.Lutz kicks 66 yards from DEN 35 to S...


## Prepare Data for Input to the Model

In [38]:
# Filter relevant players (QBs, RBs, Ks)
relevant_positions = ["QB", "RB", "K", "P"]
relevant_players = players[players["position"].isin(relevant_positions)].copy()
relevant_players['full_name'] = relevant_players['full_name'].replace("'", "")

# Remove apostrophes from names and create a `player_key` column
relevant_players["formatted_name"] = (
    relevant_players["jersey_number"].astype(str) + "-" +  
    relevant_players["full_name"].str.split(" ").str[0].str[0] + "." +  
    relevant_players["full_name"].str.split(" ").str[-1] 
)

# Create a mapping of player to team
player_team_map = relevant_players.set_index("formatted_name")["team"].to_dict()


In [39]:
# Function to separate out plays
def parse_plays(game_log):
    timestamp_pattern = r"\(\d{1,2}:\d{2}\)|\(:\d{1,2}\)"
    plays = []

    # Split the game log by timestamps, keeping the delimiters
    segments = re.split(f"({timestamp_pattern})", game_log)
    segments = [seg.strip() for seg in segments if seg.strip()]

    current_play = None

    for i in range(0, len(segments), 2):
        if i + 1 < len(segments):
            # Extract play text and timestamp
            play_text = segments[i]
            timestamp = segments[i + 1]

            # If there's a current play, append it to the list
            if current_play:
                plays.append(current_play)

            # Start a new play
            current_play = {"play": play_text, "timestamp": timestamp}
        else:
            # Append any remaining text to the current play
            if current_play:
                current_play["play"] += f" {segments[i]}"

    # Append the last play if it exists
    if current_play:
        plays.append(current_play)

    return plays

df['plays'] = df['game_log'].apply(parse_plays)

# Flatten the DataFrame into rows for each play
plays_df = df.explode('plays', ignore_index=True)

# Expand the 'plays' column into separate columns
plays_df = pd.concat([plays_df.drop(columns=['plays']), plays_df['plays'].apply(pd.Series)], axis=1)

plays_df.head()


Unnamed: 0,game_id,game_log,play,timestamp
0,2024_01_ARI_BUF,GAME\n2-T.Bass kicks 65 yards from BUF 35 to e...,GAME\n2-T.Bass kicks 65 yards from BUF 35 to e...,(15:00)
1,2024_01_ARI_BUF,GAME\n2-T.Bass kicks 65 yards from BUF 35 to e...,6-J.Conner up the middle to ARI 33 for 3 yards...,(14:27)
2,2024_01_ARI_BUF,GAME\n2-T.Bass kicks 65 yards from BUF 35 to e...,1-K.Murray pass short left to 6-J.Conner to BU...,(13:43)
3,2024_01_ARI_BUF,GAME\n2-T.Bass kicks 65 yards from BUF 35 to e...,(Shotgun) 1-K.Murray pass short middle to 6-J....,(13:02)
4,2024_01_ARI_BUF,GAME\n2-T.Bass kicks 65 yards from BUF 35 to e...,6-J.Conner up the middle to BUF 34 for 2 yards...,(12:26)


In [40]:
# Define a function to extract the first player in the play
def extract_first_player(play_text):
    # Remove apostrophes to avoid confusion
    play_text = play_text.replace("'", "")

    # Match the player format (e.g., "8-A.Rodgers")
    player_match = re.search(r"(\d+-[A-Z]\.[A-Za-z]+)", play_text)
    if player_match:
        return player_match.group(1)  
    return None 

# Define a function to map the extracted player to their team
def parse_and_map_teams(play_text, player_team_map):
    first_player = extract_first_player(play_text)  
    if first_player:
        return player_team_map.get(first_player, "Unknown Team")  
    return "Unknown Team" 

# Apply the function to identify the offense team
plays_df["offense_team"] = plays_df["play"].apply(lambda x: parse_and_map_teams(x, player_team_map))
plays_df.sample(5)


Unnamed: 0,game_id,game_log,play,timestamp,offense_team
7713,2024_04_JAX_HOU,GAME\n15-K.Fairbairn kicks 65 yards from HOU 3...,1-T.Etienne left tackle to JAX 29 for 1 yard (...,(3:14),JAX
24446,2024_12_DAL_WAS,GAME\n3-A.Seibert kicks 68 yards from WAS 35 t...,"(No Huddle, Shotgun) 8-B.Robinson right guard ...",(:52),WAS
25153,2024_12_MIN_CHI,GAME\n96-J.Romo kicks 65 yards from MIN 35 to ...,(Shotgun) 18-C.Williams pass short left to 85-...,(8:39),CHI
14281,2024_07_KC_SF,GAME\n7-H.Butker kicks 65 yards from KC 35 to ...,(Shotgun) 15-P.Mahomes pass incomplete deep mi...,(14:47),KC
7596,2024_04_JAX_HOU,GAME\n15-K.Fairbairn kicks 65 yards from HOU 3...,(Shotgun) 16-T.Lawrence pass short left to 13-...,(5:17),Unknown Team


In [41]:
# Function to extract all the play-level stats
def extract_team_stats(play_text):
    stats = {
        "points": 0,
        "touchdowns": 0,
        "field_goals": 0,
        "missed_field_goals": 0,
        "rushing_yards": 0,
        "passing_yards": 0,
        "sacks": 0,
        "completions": 0,
        "incompletions": 0,
        "receptions": 0,
        "carries": 0,
        "plays": 1,
        "QB": None,
        "ball_carrier": None,
        "turnovers": 0,
        "nullified": 0,
        "new_drive": 0,
        "injury": None,
        "kicker": None
    }

    # Check for nullified plays
    if "- No Play" in play_text or "REVERSED" in play_text:
        stats["nullified"] = 1
        return stats

    # Touchdown detection
    if "TOUCHDOWN" in play_text:
        stats["touchdowns"] += 1
        if "extra point is GOOD" in play_text:
            stats["points"] += 7
        elif "TWO-POINT CONVERSION ATTEMPT" in play_text and "ATTEMPT FAILS" not in play_text:
            stats["points"] += 8
        else:
            stats["points"] += 6

    # Turnover detection
    if "INTERCEPTED" in play_text or "RECOVERED by" in play_text or "turnover" in play_text.lower():
        stats["turnovers"] += 1

    # Missed field goal detection
    if "field goal is No Good" in play_text:
        stats["missed_field_goals"] += 1
        kicker_match = re.search(r"(\d+-[\w.-]+)", play_text)
        stats["kicker"] = kicker_match.group(1) if kicker_match else None

    # Field goal success detection
    if "field goal is GOOD" in play_text:
        stats["field_goals"] += 1
        stats["points"] += 3
        kicker_match = re.search(r"(\d+-[\w.-]+)", play_text)
        stats["kicker"] = kicker_match.group(1) if kicker_match else None

    # Passing stats
    pass_match = re.search(r"pass.*?to\s([\w.-]+).*?for (-?\d+) yards", play_text)
    if pass_match:
        stats["passing_yards"] += int(pass_match.group(2))
        stats["completions"] += 1
        qb_match = re.search(r"(\d+-[\w.-]+) pass", play_text)
        stats["QB"] = qb_match.group(1) if qb_match else None
        stats["ball_carrier"] = pass_match.group(1)

    if "pass incomplete" in play_text:
        stats["incompletions"] += 1
        qb_match = re.search(r"(\d+-[\w.-]+) pass", play_text)
        stats["QB"] = qb_match.group(1) if qb_match else None

    # Rushing stats
    rush_match = re.search(r"(\d+-[\w.-]+).*?(left|right|middle).*?for (-?\d+) yards", play_text)
    if rush_match and "pass" not in play_text:
        stats["rushing_yards"] += int(rush_match.group(3))
        stats["carries"] += 1
        stats["ball_carrier"] = rush_match.group(1)

    # Sacks
    sack_match = re.search(r"sacked.*?for (-\d+) yards", play_text)
    if sack_match:
        stats["sacks"] += 1
        stats["passing_yards"] += int(sack_match.group(1))
        qb_match = re.search(r"(\d+-[\w.-]+) sacked", play_text)
        stats["QB"] = qb_match.group(1) if qb_match else None

    # Injury detection
    injury_match = re.search(r"([\w.-]+) was injured during the play", play_text)
    if injury_match:
        injured_player = injury_match.group(1)
        stats["injury"] = injured_player

    return stats

In [42]:
# Apply the function to extract stats and players
plays_df['team_stats'] = plays_df['play'].apply(extract_team_stats)

# Expand the stats dictionary into separate columns
team_stats_df = pd.json_normalize(plays_df['team_stats'])

# Concatenate the expanded stats back to the original DataFrame
plays_df = pd.concat([plays_df.drop(columns=['team_stats']), team_stats_df], axis=1)
plays_df.head()


Unnamed: 0,game_id,game_log,play,timestamp,offense_team,points,touchdowns,field_goals,missed_field_goals,rushing_yards,...,receptions,carries,plays,QB,ball_carrier,turnovers,nullified,new_drive,injury,kicker
0,2024_01_ARI_BUF,GAME\n2-T.Bass kicks 65 yards from BUF 35 to e...,GAME\n2-T.Bass kicks 65 yards from BUF 35 to e...,(15:00),BUF,0,0,0,0,0,...,0,0,1,,,0,0,0,,
1,2024_01_ARI_BUF,GAME\n2-T.Bass kicks 65 yards from BUF 35 to e...,6-J.Conner up the middle to ARI 33 for 3 yards...,(14:27),ARI,0,0,0,0,3,...,0,1,1,,6-J.Conner,0,0,0,,
2,2024_01_ARI_BUF,GAME\n2-T.Bass kicks 65 yards from BUF 35 to e...,1-K.Murray pass short left to 6-J.Conner to BU...,(13:43),ARI,0,0,0,0,0,...,0,0,1,1-K.Murray,6-J.Conner,0,0,0,,
3,2024_01_ARI_BUF,GAME\n2-T.Bass kicks 65 yards from BUF 35 to e...,(Shotgun) 1-K.Murray pass short middle to 6-J....,(13:02),ARI,0,0,0,0,0,...,0,0,1,1-K.Murray,6-J.Conner,0,0,0,,
4,2024_01_ARI_BUF,GAME\n2-T.Bass kicks 65 yards from BUF 35 to e...,6-J.Conner up the middle to BUF 34 for 2 yards...,(12:26),ARI,0,0,0,0,2,...,0,1,1,,6-J.Conner,0,0,0,,


In [43]:
# Function to group plays into drives
def assign_drive_numbers(plays_df):
    plays_df["drive_number"] = 0
    plays_df["quarter_end"] = False  #
    # Track previous team to identify turnovers on downs
    plays_df["previous_team"] = None  

    # Group by game_id and assign drive numbers within each game
    for game_id, group in plays_df.groupby("game_id"):
        drive_number = 0
        drive_numbers = []
        quarter_ends = []
        previous_team = None
        consecutive_team_plays = 0

        for _, play in group.iterrows():
            current_team = play["offense_team"]

            # Assign the current drive number
            drive_numbers.append(drive_number)

            # Mark if the current play is an "END QUARTER"
            is_quarter_end = "END QUARTER" in play["play"]
            quarter_ends.append(is_quarter_end)

            # Check if the play continues the current drive
            if current_team == previous_team:
                consecutive_team_plays += 1
            else:
                consecutive_team_plays = 1 

            # Check for turnover on downs: 4 consecutive plays with no points scored, followed by a change in team
            if consecutive_team_plays >= 4 and current_team != previous_team:
                drive_number += 1
                consecutive_team_plays = 1 

            # Check for other drive-ending conditions
            if (
                "extra point" in play["play"].lower()
                or "field goal" in play["play"].lower()
                or "two-point conversion attempt" in play["play"].lower()
                or "punt" in play["play"].lower()
                or "turnover" in play["play"].lower()
                or "recovered by" in play["play"].lower()
                or "INTERCEPTED" in play["play"]
                or "Touchback" in play["play"]
                or "END QUARTER 2" in play["play"]
            ):
                drive_number += 1

            # Update the previous team
            previous_team = current_team

        # Assign drive numbers and quarter end markers back to the DataFrame
        plays_df.loc[group.index, "drive_number"] = drive_numbers
        plays_df.loc[group.index, "quarter_end"] = quarter_ends

    # Drop the temporary column used for tracking the previous team
    plays_df.drop(columns=["previous_team"], inplace=True)

    return plays_df

# Apply the function
plays_df = assign_drive_numbers(plays_df)
plays_df.head()


Unnamed: 0,game_id,game_log,play,timestamp,offense_team,points,touchdowns,field_goals,missed_field_goals,rushing_yards,...,plays,QB,ball_carrier,turnovers,nullified,new_drive,injury,kicker,drive_number,quarter_end
0,2024_01_ARI_BUF,GAME\n2-T.Bass kicks 65 yards from BUF 35 to e...,GAME\n2-T.Bass kicks 65 yards from BUF 35 to e...,(15:00),BUF,0,0,0,0,0,...,1,,,0,0,0,,,0,False
1,2024_01_ARI_BUF,GAME\n2-T.Bass kicks 65 yards from BUF 35 to e...,6-J.Conner up the middle to ARI 33 for 3 yards...,(14:27),ARI,0,0,0,0,3,...,1,,6-J.Conner,0,0,0,,,1,False
2,2024_01_ARI_BUF,GAME\n2-T.Bass kicks 65 yards from BUF 35 to e...,1-K.Murray pass short left to 6-J.Conner to BU...,(13:43),ARI,0,0,0,0,0,...,1,1-K.Murray,6-J.Conner,0,0,0,,,1,False
3,2024_01_ARI_BUF,GAME\n2-T.Bass kicks 65 yards from BUF 35 to e...,(Shotgun) 1-K.Murray pass short middle to 6-J....,(13:02),ARI,0,0,0,0,0,...,1,1-K.Murray,6-J.Conner,0,0,0,,,1,False
4,2024_01_ARI_BUF,GAME\n2-T.Bass kicks 65 yards from BUF 35 to e...,6-J.Conner up the middle to BUF 34 for 2 yards...,(12:26),ARI,0,0,0,0,2,...,1,,6-J.Conner,0,0,0,,,1,False


In [44]:
# Fill "Unknown Team" with the nearest non-Unknown team within the same game and drive 
def fill_unknown_offense_team(df):
    # Sort the DataFrame to ensure logical order
    df = df.sort_values(by=['game_id', 'drive_number']).reset_index(drop=True)

    # Replace 'Unknown Team' with NaN for filling
    df['offense_team'] = df['offense_team'].replace('Unknown Team', None)

    # Forward fill and backward fill within each game_id and drive_number
    df['offense_team'] = (
        df.groupby(['game_id', 'drive_number'])['offense_team']
        .apply(lambda group: group.ffill().bfill())
        .reset_index(level=[0, 1], drop=True)  
    )

    # Fill any remaining NaN values with 'Unknown Team' (Should just be kickoffs)
    df['offense_team'] = df['offense_team'].fillna('Unknown Team')

    return df

# Apply the function
plays_df = fill_unknown_offense_team(plays_df)
plays_df.head()


Unnamed: 0,game_id,game_log,play,timestamp,offense_team,points,touchdowns,field_goals,missed_field_goals,rushing_yards,...,plays,QB,ball_carrier,turnovers,nullified,new_drive,injury,kicker,drive_number,quarter_end
0,2024_01_ARI_BUF,GAME\n2-T.Bass kicks 65 yards from BUF 35 to e...,GAME\n2-T.Bass kicks 65 yards from BUF 35 to e...,(15:00),BUF,0,0,0,0,0,...,1,,,0,0,0,,,0,False
1,2024_01_ARI_BUF,GAME\n2-T.Bass kicks 65 yards from BUF 35 to e...,6-J.Conner up the middle to ARI 33 for 3 yards...,(14:27),ARI,0,0,0,0,3,...,1,,6-J.Conner,0,0,0,,,1,False
2,2024_01_ARI_BUF,GAME\n2-T.Bass kicks 65 yards from BUF 35 to e...,1-K.Murray pass short left to 6-J.Conner to BU...,(13:43),ARI,0,0,0,0,0,...,1,1-K.Murray,6-J.Conner,0,0,0,,,1,False
3,2024_01_ARI_BUF,GAME\n2-T.Bass kicks 65 yards from BUF 35 to e...,(Shotgun) 1-K.Murray pass short middle to 6-J....,(13:02),ARI,0,0,0,0,0,...,1,1-K.Murray,6-J.Conner,0,0,0,,,1,False
4,2024_01_ARI_BUF,GAME\n2-T.Bass kicks 65 yards from BUF 35 to e...,6-J.Conner up the middle to BUF 34 for 2 yards...,(12:26),ARI,0,0,0,0,2,...,1,,6-J.Conner,0,0,0,,,1,False


In [45]:
# Ensure all values in the `play` column are strings
plays_df['play'] = plays_df['play'].astype(str)

# Create the play log for each drive
plays = plays_df[['game_id', 'drive_number', 'offense_team', 'play']].copy()

plays['play_log'] = plays.groupby(['game_id', 'drive_number'])['play'].transform(
    lambda x: f"Offense Team: {plays.loc[x.index[0], 'offense_team']}. " + " ".join(x)
)

# Drop duplicates to keep only one row per drive
plays = plays.drop_duplicates(subset=['game_id', 'drive_number'])
plays.head()

Unnamed: 0,game_id,drive_number,offense_team,play,play_log
0,2024_01_ARI_BUF,0,BUF,GAME\n2-T.Bass kicks 65 yards from BUF 35 to e...,Offense Team: BUF. GAME\n2-T.Bass kicks 65 yar...
1,2024_01_ARI_BUF,1,ARI,6-J.Conner up the middle to ARI 33 for 3 yards...,Offense Team: ARI. 6-J.Conner up the middle to...
14,2024_01_ARI_BUF,2,BUF,4-J.Cook left guard to ARI 36 for 9 yards (34-...,Offense Team: BUF. 4-J.Cook left guard to ARI ...
18,2024_01_ARI_BUF,3,ARI,6-J.Conner left end to ARI 33 for 4 yards (40-...,Offense Team: ARI. 6-J.Conner left end to ARI ...
28,2024_01_ARI_BUF,4,BUF,(Shotgun) 17-J.Allen pass incomplete short rig...,Offense Team: BUF. (Shotgun) 17-J.Allen pass i...


In [46]:
# Function to consolidate and create one row per drive
def consolidate_drive_stats(df):
    # Group by `game_id` and `drive_number`
    grouped = df.groupby(['game_id', 'drive_number'])

    # Initialize a list to store the consolidated rows
    consolidated_rows = []

    for (game_id, drive_number), group in grouped:
        # Initialize drive-level stats
        drive_stats = {
            "game_id": game_id,
            "drive_number": drive_number,
            "offense_team": group.iloc[0]['offense_team'],
            "points": group['points'].sum(),
            "touchdowns": group['touchdowns'].sum(),
            "field_goals": group['field_goals'].sum(),
            "missed_field_goals": group['missed_field_goals'].sum(),
            "rushing_yards": group['rushing_yards'].sum(),
            "passing_yards": group['passing_yards'].sum(),
            "sacks": group['sacks'].sum(),
            "completions": group['completions'].sum(),
            "incompletions": group['incompletions'].sum(),
            "turnovers": group['turnovers'].sum(),
            "plays": group['plays'].sum(),
            "QB": group['QB'].mode()[0] if not group['QB'].isna().all() else None,
            "quarter_end": group['quarter_end'].any(),  
        }

        # Collect unique ball carriers and their stats
        ball_carrier_data = group[['ball_carrier', 'rushing_yards', 'carries', 'receptions', 'plays']]
        unique_ball_carriers = ball_carrier_data.drop_duplicates(subset=['ball_carrier'])

        for i, row in enumerate(unique_ball_carriers.itertuples(), start=1):
            if pd.notna(row.ball_carrier):
                drive_stats[f'ball_carrier_{i}'] = row.ball_carrier
                drive_stats[f'rushing_yards_{i}'] = row.rushing_yards
                drive_stats[f'carries_{i}'] = row.carries
                drive_stats[f'receptions_{i}'] = row.receptions

        # Collect unique kickers and their stats
        kicker_data = group[['kicker', 'field_goals', 'missed_field_goals']]
        unique_kickers = kicker_data.drop_duplicates(subset=['kicker'])

        for i, row in enumerate(unique_kickers.itertuples(), start=1):
            if pd.notna(row.kicker):
                drive_stats[f'kicker_{i}'] = row.kicker
                drive_stats[f'field_goals_{i}'] = row.field_goals
                drive_stats[f'missed_field_goals_{i}'] = row.missed_field_goals

        # Consolidate injuries
        injuries = group['injury'].dropna().unique()
        drive_stats['injuries'] = "; ".join(injuries) if len(injuries) > 0 else None

        # Append consolidated row
        consolidated_rows.append(drive_stats)

    # Create a new DataFrame from the consolidated rows
    return pd.DataFrame(consolidated_rows)

# Apply the function to consolidate drive stats
consolidated_game_drives = consolidate_drive_stats(plays_df)
consolidated_game_drives.head()


Unnamed: 0,game_id,drive_number,offense_team,points,touchdowns,field_goals,missed_field_goals,rushing_yards,passing_yards,sacks,...,carries_12,receptions_12,ball_carrier_13,rushing_yards_13,carries_13,receptions_13,ball_carrier_14,rushing_yards_14,carries_14,receptions_14
0,2024_01_ARI_BUF,0,BUF,0,0,0,0,0,0,0,...,,,,,,,,,,
1,2024_01_ARI_BUF,1,ARI,7,1,0,0,16,44,0,...,,,,,,,,,,
2,2024_01_ARI_BUF,2,BUF,0,0,0,0,12,3,1,...,,,,,,,,,,
3,2024_01_ARI_BUF,3,ARI,3,0,1,0,15,44,0,...,,,,,,,,,,
4,2024_01_ARI_BUF,4,BUF,3,0,1,0,37,23,0,...,,,,,,,,,,


In [47]:
# Generate drive summaries
def generate_drive_summary(row):
    # Extract values from the row's columns
    team_name = row["offense_team"]
    points_scored = row["points"]
    total_plays = row["plays"]
    total_rushing_yards = row["rushing_yards"]
    total_passing_yards = row["passing_yards"]
    total_turnovers = row["turnovers"]
    total_field_goals = row["field_goals"]
    total_missed_field_goals = row["missed_field_goals"]
    total_yards = total_rushing_yards + total_passing_yards
    injury = row.get("injuries", None)
    quarter_end = row.get("quarter_end", False)

    # Initialize player stats text
    player_stats = []

    # Add QB stats
    if not pd.isna(row["QB"]):
        player_stats.append(f"{row['QB']}: {row['passing_yards']} passing yards on {row['completions']} completions")

    # Dynamically extract ball carrier stats, excluding duplicates
    i = 1
    ball_carriers_set = set()
    while f"ball_carrier_{i}" in row:
        ball_carrier = row[f"ball_carrier_{i}"]
        if pd.notna(ball_carrier) and ball_carrier not in ball_carriers_set:
            ball_carriers_set.add(ball_carrier)
            rushing_yards = row.get(f"rushing_yards_{i}", 0)
            carries = row.get(f"carries_{i}", 0)
            receptions = row.get(f"receptions_{i}", 0)
            receiving_yards = row.get(f"receiving_yards_{i}", 0)
            touchdowns = row.get(f"player_touchdowns_{i}", 0)

            player_summary = f"{ball_carrier}: {rushing_yards} rushing yards on {carries} carries"
            if receptions > 0:
                player_summary += f", {receptions} receptions for {receiving_yards} receiving yards"
            if touchdowns > 0:
                player_summary += f", {touchdowns} touchdown(s)"

            player_stats.append(player_summary)
        i += 1

    player_stats_text = "; ".join(player_stats)

    # Handle field goal text
    field_goal_text = ""
    if total_field_goals > 0:
        field_goal_text = f"{team_name} made {total_field_goals} field goal(s)"
    if total_missed_field_goals > 0:
        missed_text = f"{team_name} missed {total_missed_field_goals} field goal(s)"
        field_goal_text = f"{field_goal_text}, and {missed_text}" if field_goal_text else missed_text

    # Handle injuries
    injury_text = f"Injury Report: {injury}" if injury else ""

    # Handle quarter end
    quarter_end_text = " This drive marked the end of the quarter." if quarter_end else ""

    # Generate the summary text
    output = f"""
    Drive Summary: There were {total_plays} plays in the drive, for a total of {total_yards} net yards gained. {team_name} scored {points_scored} points.
    Team Stats: {team_name} had {total_rushing_yards} rushing yards and {total_passing_yards} passing yards. They committed {total_turnovers} turnovers.
    {field_goal_text}
    Individual Stats: {player_stats_text}
    {injury_text}
    {quarter_end_text}
    """
    return output.strip()

# Apply the updated function to generate summaries
consolidated_game_drives["drive_summary"] = consolidated_game_drives.apply(generate_drive_summary, axis=1)
consolidated_game_drives["drive_summary"].head()


0    Drive Summary: There were 1 plays in the drive...
1    Drive Summary: There were 13 plays in the driv...
2    Drive Summary: There were 4 plays in the drive...
3    Drive Summary: There were 10 plays in the driv...
4    Drive Summary: There were 11 plays in the driv...
Name: drive_summary, dtype: object

In [48]:
# Merge play logs with drive summaries
game_drives = consolidated_game_drives[['game_id', 'drive_number', 'offense_team', 'drive_summary']].merge(
    plays[['game_id', 'drive_number', 'play_log']],  
    on=['game_id', 'drive_number'], 
    how='left'
)

# Combine play log and drive summary into a single field
game_drives['combined_summary'] = (
    "Play Log: " + game_drives['play_log'].fillna("No play log available") + "\n\n" + 
    "Drive Summary: " + game_drives['drive_summary']
)

# Drop unnecessary columns
game_drives = game_drives[['game_id', 'drive_number', 'offense_team', 'combined_summary']]
game_drives.head()

Unnamed: 0,game_id,drive_number,offense_team,combined_summary
0,2024_01_ARI_BUF,0,BUF,Play Log: Offense Team: BUF. GAME\n2-T.Bass ki...
1,2024_01_ARI_BUF,1,ARI,Play Log: Offense Team: ARI. 6-J.Conner up the...
2,2024_01_ARI_BUF,2,BUF,Play Log: Offense Team: BUF. 4-J.Cook left gua...
3,2024_01_ARI_BUF,3,ARI,Play Log: Offense Team: ARI. 6-J.Conner left e...
4,2024_01_ARI_BUF,4,BUF,Play Log: Offense Team: BUF. (Shotgun) 17-J.Al...


In [51]:
# Save as a csv
game_drives.to_csv("/Users/laurenmanis/Desktop/Fall 2024/DS 5690 | Gen AI Models/all_games_data.csv", index=False)

## Aggregate Stats to Game-Level

In [54]:
# Group by game_id and drive_number to compute drive-level totals
drive_level_totals = plays_df.groupby(['game_id', 'drive_number','offense_team']).agg({
    'points': 'sum',
    'touchdowns': 'sum',
    'field_goals': 'sum',
    'missed_field_goals': 'sum',
    'rushing_yards': 'sum',
    'passing_yards': 'sum',
    'sacks': 'sum',
    'completions': 'sum',
    'incompletions': 'sum',
    'carries': 'sum',
    'plays': 'sum',
    'turnovers': 'sum',
}).reset_index()

drive_level_totals.head()


Unnamed: 0,game_id,drive_number,offense_team,points,touchdowns,field_goals,missed_field_goals,rushing_yards,passing_yards,sacks,completions,incompletions,carries,plays,turnovers
0,2024_01_ARI_BUF,0,BUF,0,0,0,0,0,0,0,0,0,0,1,0
1,2024_01_ARI_BUF,1,ARI,7,1,0,0,16,44,0,4,1,7,13,0
2,2024_01_ARI_BUF,2,BUF,0,0,0,0,12,3,1,1,0,2,4,1
3,2024_01_ARI_BUF,3,ARI,3,0,1,0,15,44,0,6,0,2,10,0
4,2024_01_ARI_BUF,4,BUF,3,0,1,0,37,23,0,1,0,5,11,0


In [64]:
# Group by game_id to compute game-level totals
game_level_totals = drive_level_totals.groupby(['game_id','offense_team']).agg({
    'points': 'sum',
    'touchdowns': 'sum',
    'field_goals': 'sum',
    'missed_field_goals': 'sum',
    'rushing_yards': 'sum',
    'passing_yards': 'sum',
    'sacks': 'sum',
    'completions': 'sum',
    'incompletions': 'sum',
    'carries': 'sum',
    'plays': 'sum',
    'turnovers': 'sum',
}).reset_index()

# Inspect the game-level totals
game_level_totals = game_level_totals[game_level_totals['offense_team'] != 'Unknown Team'].rename(columns={'offense_team':'team_player'})
game_level_totals.head()


Unnamed: 0,game_id,team_player,points,touchdowns,field_goals,missed_field_goals,rushing_yards,passing_yards,sacks,completions,incompletions,carries,plays,turnovers
0,2024_01_ARI_BUF,ARI,20,2,2,0,124,143,3,22,10,22,64,1
1,2024_01_ARI_BUF,BUF,34,4,2,0,140,221,1,18,5,24,70,1
2,2024_01_BAL_KC,BAL,20,2,2,1,182,264,1,24,13,29,85,1
3,2024_01_BAL_KC,KC,27,3,2,0,71,279,1,18,7,16,61,1
5,2024_01_CAR_NO,CAR,3,0,1,0,50,137,3,15,16,13,65,4


In [56]:
# Player-level stats grouped by game_id, drive_number, and player
player_level_stats = plays_df.groupby(['game_id', 'drive_number', 'ball_carrier']).agg({
    'rushing_yards': 'sum',
    'carries': 'sum',
    'completions': 'sum',
    'passing_yards': 'sum',
    'touchdowns':'sum'
}).reset_index().rename(columns={"passing_yards":"receiving_yards","completions":"receptions"})

player_level_stats.head()


Unnamed: 0,game_id,drive_number,ball_carrier,rushing_yards,carries,receptions,receiving_yards,touchdowns
0,2024_01_ARI_BUF,1,1-K.Murray,6,1,0,0,0
1,2024_01_ARI_BUF,1,14-Mi.Wilson,0,0,1,5,1
2,2024_01_ARI_BUF,1,4-G.Dortch,0,0,1,8,0
3,2024_01_ARI_BUF,1,6-J.Conner,10,6,2,31,0
4,2024_01_ARI_BUF,2,13-M.Hollins,0,0,1,14,0


In [65]:
# Aggregate player stats for the entire game
game_player_totals = player_level_stats.groupby(['game_id', 'ball_carrier']).agg({
    'rushing_yards': 'sum',
    'carries': 'sum',
    'receptions': 'sum',
    'receiving_yards': 'sum',
    'touchdowns':'sum'
}).reset_index().rename(columns={'ball_carrier':'team_player'})

game_player_totals.head()


Unnamed: 0,game_id,team_player,rushing_yards,carries,receptions,receiving_yards,touchdowns
0,2024_01_ARI_BUF,0-K.Coleman,0,0,4,51,0
1,2024_01_ARI_BUF,1-C.Samuel,0,0,2,15,0
2,2024_01_ARI_BUF,1-K.Murray,58,4,0,0,0
3,2024_01_ARI_BUF,10-K.Shakir,0,0,3,42,1
4,2024_01_ARI_BUF,13-M.Hollins,0,0,2,25,1


In [58]:
# QB-level stats grouped by game_id and drive_number
qb_level_stats = plays_df.groupby(['game_id', 'drive_number', 'QB']).agg({
    'passing_yards': 'sum',
    'completions': 'sum',
    'incompletions': 'sum',
    'sacks': 'sum',
    'turnovers': 'sum',  
    'touchdowns':'sum'
}).reset_index()

# Inspect the QB-level stats
qb_level_stats.head()


Unnamed: 0,game_id,drive_number,QB,passing_yards,completions,incompletions,sacks,turnovers,touchdowns
0,2024_01_ARI_BUF,1,1-K.Murray,44,4,1,0,0,1
1,2024_01_ARI_BUF,2,17-J.Allen,3,1,0,1,1,0
2,2024_01_ARI_BUF,3,1-K.Murray,44,6,0,0,0,0
3,2024_01_ARI_BUF,4,17-J.Allen,23,1,0,0,0,0
4,2024_01_ARI_BUF,5,1-K.Murray,43,6,2,0,0,0


In [66]:
# Aggregate QB stats for the entire game
game_qb_totals = qb_level_stats.groupby(['game_id', 'QB']).agg({
    'passing_yards': 'sum',
    'completions': 'sum',
    'incompletions': 'sum',
    'sacks': 'sum',
    'turnovers': 'sum',  
    'touchdowns':'sum'
}).reset_index().rename(columns={'QB':'team_player'})

game_qb_totals.head()


Unnamed: 0,game_id,team_player,passing_yards,completions,incompletions,sacks,turnovers,touchdowns
0,2024_01_ARI_BUF,1-K.Murray,143,22,10,3,1,1
1,2024_01_ARI_BUF,17-J.Allen,221,18,5,1,1,2
2,2024_01_BAL_KC,15-P.Mahomes,279,18,7,1,0,1
3,2024_01_BAL_KC,8-L.Jackson,264,24,13,1,1,1
4,2024_01_CAR_NO,14-A.Dalton,0,0,1,0,0,0


In [60]:
# Kicker-level stats grouped by game_id and drive_number
kicker_level_stats = plays_df.groupby(['game_id', 'drive_number', 'kicker']).agg({
    'field_goals': 'sum',
    'missed_field_goals': 'sum',
    'points': 'sum',  
    'plays': 'count'  
}).reset_index().rename(columns={"plays":"attempts"})

# Inspect the kicker-level stats
kicker_level_stats.head()


Unnamed: 0,game_id,drive_number,kicker,field_goals,missed_field_goals,points,attempts
0,2024_01_ARI_BUF,3,5-M.Prater,1,0,3,1
1,2024_01_ARI_BUF,4,2-T.Bass,1,0,3,1
2,2024_01_ARI_BUF,13,5-M.Prater,1,0,3,1
3,2024_01_ARI_BUF,17,2-T.Bass,1,0,3,1
4,2024_01_BAL_KC,6,7-H.Butker,1,0,3,1


In [67]:
# Aggregate kicker stats for the entire game
game_kicker_totals = kicker_level_stats.groupby(['game_id', 'kicker']).agg({
    'field_goals': 'sum',
    'missed_field_goals': 'sum',
    'points': 'sum',  
    'attempts': 'count'  
}).reset_index().rename(columns={'kicker':'team_player'})

game_kicker_totals.head()


Unnamed: 0,game_id,team_player,field_goals,missed_field_goals,points,attempts
0,2024_01_ARI_BUF,2-T.Bass,2,0,6,2
1,2024_01_ARI_BUF,5-M.Prater,2,0,6,2
2,2024_01_BAL_KC,7-H.Butker,2,0,6,2
3,2024_01_BAL_KC,9-J.Tucker,2,1,6,3
4,2024_01_CAR_NO,19-B.Grupe,4,0,12,4


In [75]:
# Merge data frames 
team_results = game_level_totals
player_results = pd.concat([game_player_totals, game_qb_totals, game_kicker_totals], axis=0, ignore_index=True)
player_results.head()

Unnamed: 0,game_id,team_player,rushing_yards,carries,receptions,receiving_yards,touchdowns,passing_yards,completions,incompletions,sacks,turnovers,field_goals,missed_field_goals,points,attempts
0,2024_01_ARI_BUF,0-K.Coleman,0.0,0.0,4.0,51.0,0.0,,,,,,,,,
1,2024_01_ARI_BUF,1-C.Samuel,0.0,0.0,2.0,15.0,0.0,,,,,,,,,
2,2024_01_ARI_BUF,1-K.Murray,58.0,4.0,0.0,0.0,0.0,,,,,,,,,
3,2024_01_ARI_BUF,10-K.Shakir,0.0,0.0,3.0,42.0,1.0,,,,,,,,,
4,2024_01_ARI_BUF,13-M.Hollins,0.0,0.0,2.0,25.0,1.0,,,,,,,,,


In [79]:
def create_team_numeric_summary(df):

    # Group by game_id and create a summary
    summaries = []
    for game_id, group in team_results.groupby("game_id"):
        summary = f"Game {game_id} Team Summary:\n"
        for _, team in group.iterrows():
            team_summary = (
                f"- {team['team_player']}: {team['points']} points, {team['touchdowns']} touchdowns, "
                f"{team['field_goals']} field goals, {team['rushing_yards']} rushing yards, "
                f"{team['passing_yards']} passing yards, {team['plays']} total plays, "
                f"{team['turnovers']} turnovers."
            )
            summary += team_summary + "\n"
        summaries.append({"game_id": game_id, "team_summary": summary.strip()})

    return pd.DataFrame(summaries)


team_numeric_summaries = create_team_numeric_summary(team_results)
team_numeric_summaries[0:3]

Unnamed: 0,game_id,team_summary
0,2024_01_ARI_BUF,Game 2024_01_ARI_BUF Team Summary:\n- ARI: 20 ...
1,2024_01_BAL_KC,Game 2024_01_BAL_KC Team Summary:\n- BAL: 20 p...
2,2024_01_CAR_NO,Game 2024_01_CAR_NO Team Summary:\n- CAR: 3 po...


In [81]:
def create_player_numeric_summary(df):

    # Group by game_id and create a summary
    summaries = []
    for game_id, group in player_results.groupby("game_id"):
        summary = f"Game {game_id} Player Summary:\n"
        for _, player in group.iterrows():
            player_summary = f"- {player['team_player']}: "
            if player["passing_yards"] > 0:
                player_summary += f"{player['passing_yards']} passing yards, {player['completions']} completions; "
            if player["rushing_yards"] > 0:
                player_summary += f"{player['rushing_yards']} rushing yards on {player['carries']} carries; "
            if player["receptions"] > 0:
                player_summary += f"{player['receptions']} receptions for {player['receiving_yards']} yards; "
            if player["points"] > 0:
                player_summary += f"{player['points']} points; "
            summary += player_summary.strip("; ") + ".\n"
        summaries.append({"game_id": game_id, "player_summary": summary.strip()})

    return pd.DataFrame(summaries)


player_numeric_summaries = create_player_numeric_summary(player_results)
player_numeric_summaries[0:3]


Unnamed: 0,game_id,player_summary
0,2024_01_ARI_BUF,Game 2024_01_ARI_BUF Player Summary:\n- 0-K.Co...
1,2024_01_BAL_KC,Game 2024_01_BAL_KC Player Summary:\n- 1-X.Wor...
2,2024_01_CAR_NO,Game 2024_01_CAR_NO Player Summary:\n- 0-J.San...


In [82]:
# Merge summaries back into the drive logs dataframe
merged_drives = game_drives.merge(team_numeric_summaries, on="game_id", how="left").merge(player_numeric_summaries, on="game_id", how="left")

# Create a combined summary column
merged_drives["numeric_summary"] = (
    "Team Stats:\n" + merged_drives["team_summary"].fillna("") + "\n\n" +
    "Player Stats:\n" + merged_drives["player_summary"].fillna("")
)

merged_drives.head()

Unnamed: 0,game_id,drive_number,offense_team,combined_summary,team_summary,player_summary,numeric_summary
0,2024_01_ARI_BUF,0,BUF,Play Log: Offense Team: BUF. GAME\n2-T.Bass ki...,Game 2024_01_ARI_BUF Team Summary:\n- ARI: 20 ...,Game 2024_01_ARI_BUF Player Summary:\n- 0-K.Co...,Team Stats:\nGame 2024_01_ARI_BUF Team Summary...
1,2024_01_ARI_BUF,1,ARI,Play Log: Offense Team: ARI. 6-J.Conner up the...,Game 2024_01_ARI_BUF Team Summary:\n- ARI: 20 ...,Game 2024_01_ARI_BUF Player Summary:\n- 0-K.Co...,Team Stats:\nGame 2024_01_ARI_BUF Team Summary...
2,2024_01_ARI_BUF,2,BUF,Play Log: Offense Team: BUF. 4-J.Cook left gua...,Game 2024_01_ARI_BUF Team Summary:\n- ARI: 20 ...,Game 2024_01_ARI_BUF Player Summary:\n- 0-K.Co...,Team Stats:\nGame 2024_01_ARI_BUF Team Summary...
3,2024_01_ARI_BUF,3,ARI,Play Log: Offense Team: ARI. 6-J.Conner left e...,Game 2024_01_ARI_BUF Team Summary:\n- ARI: 20 ...,Game 2024_01_ARI_BUF Player Summary:\n- 0-K.Co...,Team Stats:\nGame 2024_01_ARI_BUF Team Summary...
4,2024_01_ARI_BUF,4,BUF,Play Log: Offense Team: BUF. (Shotgun) 17-J.Al...,Game 2024_01_ARI_BUF Team Summary:\n- ARI: 20 ...,Game 2024_01_ARI_BUF Player Summary:\n- 0-K.Co...,Team Stats:\nGame 2024_01_ARI_BUF Team Summary...


In [84]:
# Save as a csv
merged_drives[['game_id','drive_number','offense_team','combined_summary','numeric_summary']].to_csv("/Users/laurenmanis/Desktop/Fall 2024/DS 5690 | Gen AI Models/all_game_stats.csv", index=False)