# Fine-Tuning Stage II: Data Pre-Processing
## Numeric Reasoning & Stat Aggregation

In [24]:
import pandas as pd
import numpy as np 
import re
import random
import json

In [2]:
# Load the play-by-play table: one row per play, carrying the team- and player-level stat columns.
# NOTE(review): this is an absolute, machine-specific path; the notebook only runs where it exists.
plays_df = pd.read_csv(
    "/Users/laurenmanis/Desktop/Fall 2024/DS 5690 | Gen AI Models/player_plays.csv"
)
plays_df.head()

Unnamed: 0,game_log,game_summary,game_id,play,timestamp,offense_team,points,touchdowns,field_goals,missed_field_goals,...,carries,plays,QB,ball_carrier,turnovers,nullified,new_drive,injury,kicker,drive_number
0,7-H.Butker kicks 65 yards from KC 35 to end zo...,Headline: Chiefs hold off Ravens 27-20 when re...,1,7-H.Butker kicks 65 yards from KC 35 to end zo...,(15:00),Unknown Team,0,0,0,0,...,0,1,,,0,0,0,,,0
1,7-H.Butker kicks 65 yards from KC 35 to end zo...,Headline: Chiefs hold off Ravens 27-20 when re...,1,(Shotgun) 22-D.Henry left end to BAL 32 for 2 ...,(14:19),BAL,0,0,0,0,...,1,1,,22-D.Henry,0,0,0,,,1
2,7-H.Butker kicks 65 yards from KC 35 to end zo...,Headline: Chiefs hold off Ravens 27-20 when re...,1,(Shotgun) 8-L.Jackson pass short right to 4-Z....,(13:55),BAL,0,0,0,0,...,0,1,,,0,1,0,,,1
3,7-H.Butker kicks 65 yards from KC 35 to end zo...,Headline: Chiefs hold off Ravens 27-20 when re...,1,(Shotgun) 8-L.Jackson pass short right to 43-J...,(13:20),BAL,0,0,0,0,...,0,1,8-L.Jackson,43-J.Hill,0,0,0,,,1
4,7-H.Butker kicks 65 yards from KC 35 to end zo...,Headline: Chiefs hold off Ravens 27-20 when re...,1,(Shotgun) 8-L.Jackson pass short right to 43-J...,(12:43),BAL,0,0,0,0,...,0,1,8-L.Jackson,43-J.Hill,0,0,0,,,1


## Aggregating Team and Player Stats by Drive and by Game

### Team Stats by Game

In [3]:
# Sum every per-play stat within each (game_id, drive_number, offense_team) group
# to obtain one row of totals per drive.
drive_level_totals = (
    plays_df
    .groupby(['game_id', 'drive_number', 'offense_team'])
    .agg(dict.fromkeys(
        [
            'points', 'touchdowns', 'field_goals', 'missed_field_goals',
            'rushing_yards', 'passing_yards', 'sacks', 'completions',
            'incompletions', 'carries', 'plays', 'turnovers',
        ],
        'sum',
    ))
    .reset_index()
)

drive_level_totals.head()


Unnamed: 0,game_id,drive_number,offense_team,points,touchdowns,field_goals,missed_field_goals,rushing_yards,passing_yards,sacks,completions,incompletions,carries,plays,turnovers
0,1,0,Unknown Team,0,0,0,0,0,0,0,0,0,0,1,0
1,1,1,BAL,7,1,0,0,35,39,0,3,1,6,13,0
2,1,2,KC,7,1,0,0,25,27,0,2,0,2,5,0
3,1,3,BAL,0,0,0,0,0,18,0,2,0,0,5,0
4,1,4,KC,0,0,0,0,0,32,1,2,0,0,6,0


In [5]:
# Roll the drive totals up to one row per (game_id, offense_team), then drop the
# 'Unknown Team' placeholder rows so only real teams remain. The filter runs after
# reset_index, so the surviving rows keep their original integer labels.
game_level_totals = (
    drive_level_totals
    .groupby(['game_id', 'offense_team'])
    .agg(dict.fromkeys(
        [
            'points', 'touchdowns', 'field_goals', 'missed_field_goals',
            'rushing_yards', 'passing_yards', 'sacks', 'completions',
            'incompletions', 'carries', 'plays', 'turnovers',
        ],
        'sum',
    ))
    .reset_index()
    .loc[lambda totals: totals['offense_team'] != 'Unknown Team']
)

game_level_totals.head()


Unnamed: 0,game_id,offense_team,points,touchdowns,field_goals,missed_field_goals,rushing_yards,passing_yards,sacks,completions,incompletions,carries,plays,turnovers
0,1,BAL,20,2,2,1,182,264,1,24,13,29,85,1
1,1,KC,27,3,2,0,71,279,1,18,7,16,61,1
3,2,DET,19,2,2,0,156,199,2,17,9,26,69,1
4,2,LA,20,2,2,0,81,303,2,31,14,20,85,2
5,3,GB,16,1,3,1,129,181,2,15,17,16,73,1


### Player Stats by Game

In [12]:
# Per-drive stats for each ball carrier. Plays are grouped by the ball carrier, so the
# completions / passing yards on those plays are exposed under the names
# "receptions" / "receiving_yards". Plays with no ball_carrier fall out of the groupby.
player_level_stats = (
    plays_df
    .groupby(['game_id', 'drive_number', 'ball_carrier'])
    .agg(
        rushing_yards=('rushing_yards', 'sum'),
        carries=('carries', 'sum'),
        receptions=('completions', 'sum'),
        receiving_yards=('passing_yards', 'sum'),
        touchdowns=('touchdowns', 'sum'),
    )
    .reset_index()
)

player_level_stats.head()


Unnamed: 0,game_id,drive_number,ball_carrier,rushing_yards,carries,receptions,receiving_yards,touchdowns
0,1,1,22-D.Henry,16,4,0,0,1
1,1,1,4-Z.Flowers,0,0,1,19,0
2,1,1,43-J.Hill,0,0,2,20,0
3,1,1,8-L.Jackson,19,2,0,0,0
4,1,2,1-X.Worthy,21,1,0,0,1


In [14]:
# Collapse the per-drive player rows into one row of game totals per (game_id, ball_carrier)
game_player_totals = (
    player_level_stats
    .groupby(['game_id', 'ball_carrier'])
    .agg(dict.fromkeys(
        ['rushing_yards', 'carries', 'receptions', 'receiving_yards', 'touchdowns'],
        'sum',
    ))
    .reset_index()
)

game_player_totals.head()


Unnamed: 0,game_id,ball_carrier,rushing_yards,carries,receptions,receiving_yards,touchdowns
0,1,1-X.Worthy,21,1,2,47,2
1,1,10-I.Pacheco,43,12,2,33,0
2,1,15-N.Agholor,0,0,1,6,0
3,1,15-P.Mahomes,4,1,1,2,0
4,1,22-D.Henry,43,10,0,0,1


### QB Stats by Game

In [17]:
# Per-drive passing stats for each quarterback: sum the passing columns over the
# plays in every (game_id, drive_number, QB) group.
qb_level_stats = (
    plays_df
    .groupby(['game_id', 'drive_number', 'QB'])
    .agg(dict.fromkeys(
        ['passing_yards', 'completions', 'incompletions', 'sacks', 'turnovers', 'touchdowns'],
        'sum',
    ))
    .reset_index()
)

# Preview the drive-level QB rows
qb_level_stats.head()


Unnamed: 0,game_id,drive_number,QB,passing_yards,completions,incompletions,sacks,turnovers,touchdowns
0,1,1,8-L.Jackson,39,3,1,0,0,0
1,1,2,15-P.Mahomes,27,2,0,0,0,0
2,1,3,8-L.Jackson,18,2,0,0,0,0
3,1,4,15-P.Mahomes,32,2,0,1,0,0
4,1,5,8-L.Jackson,-7,0,0,1,1,0


In [19]:
# Collapse the per-drive QB rows into one row of game totals per (game_id, QB)
qb_stat_sums = {
    stat: 'sum'
    for stat in ('passing_yards', 'completions', 'incompletions', 'sacks', 'turnovers', 'touchdowns')
}
game_qb_totals = qb_level_stats.groupby(['game_id', 'QB']).agg(qb_stat_sums).reset_index()

game_qb_totals.head()


Unnamed: 0,game_id,QB,passing_yards,completions,incompletions,sacks,turnovers,touchdowns
0,1,15-P.Mahomes,279,18,7,1,0,1
1,1,8-L.Jackson,264,24,13,1,1,1
2,2,16-J.Goff,199,17,9,2,0,1
3,2,9-M.Stafford,303,31,14,2,1,1
4,3,1-J.Hurts,299,21,11,2,2,2


### Kicking Stats by Game (Field Goals)

In [21]:
# Per-drive field-goal stats for each kicker. "attempts" is the number of play rows
# (non-null `plays` values) attributed to the kicker within the drive.
kicker_level_stats = (
    plays_df
    .groupby(['game_id', 'drive_number', 'kicker'])
    .agg(
        field_goals=('field_goals', 'sum'),
        missed_field_goals=('missed_field_goals', 'sum'),
        points=('points', 'sum'),
        attempts=('plays', 'count'),
    )
    .reset_index()
)

# Preview the drive-level kicker rows
kicker_level_stats.head()


Unnamed: 0,game_id,drive_number,kicker,field_goals,missed_field_goals,points,attempts
0,1,6,7-H.Butker,1,0,3,1
1,1,7,7-H.Butker,1,0,3,1
2,1,8,9-J.Tucker,0,1,0,1
3,1,10,9-J.Tucker,1,0,3,1
4,1,17,9-J.Tucker,1,0,3,1


In [22]:
# Aggregate kicker stats for the entire game
game_kicker_totals = kicker_level_stats.groupby(['game_id', 'kicker']).agg({
    'field_goals': 'sum',
    'missed_field_goals': 'sum',
    'points': 'sum',
    # `attempts` is already a per-drive count, so the game total is its sum.
    # Using 'count' here would only count drive rows and under-report any drive
    # on which the kicker has more than one attempt.
    'attempts': 'sum'
}).reset_index()

game_kicker_totals.head()


Unnamed: 0,game_id,kicker,field_goals,missed_field_goals,points,attempts
0,1,7-H.Butker,2,0,6,2
1,1,9-J.Tucker,2,1,6,3
2,2,16-J.Karty,2,0,6,2
3,2,39-J.Bates,2,0,6,2
4,3,4-J.Elliott,2,0,6,2


## Generating Training Examples

In [30]:
# Generate training examples for players/qbs/kickers
def generate_stat_examples(df, entity_type, num_examples, seed=None):
    """Build chat-format examples that ask the model to total a player's drive stats.

    Each example picks a random game, then a random entity (ball carrier, QB or
    kicker) within that game, samples up to three of that entity's drive rows
    without replacement (fewer when the entity has fewer rows, so a single-drive
    example is possible), lists the per-drive stats in the user prompt and puts
    the summed statline in the assistant reply.

    Args:
        df: Drive-level DataFrame with a 'game_id' column, the entity column for
            `entity_type` ('ball_carrier', 'QB' or 'kicker') and the stat columns
            quoted in that entity's prompt.
        entity_type: One of 'player', 'qb' or 'kicker'.
        num_examples: Number of examples to generate.
        seed: Optional integer seed. When given, every random draw (including the
            drive sampling) comes from a private generator, so the same inputs
            yield the same examples. When None, the global `random` / NumPy
            states are used, as before.

    Returns:
        A list of dicts of the form {"messages": [system, user, assistant]}.

    Raises:
        ValueError: If `entity_type` is not supported. This is checked before any
            example is built, so it is raised even when `num_examples` is 0.
    """
    # Column of `df` that names the entity for each supported entity_type.
    entity_columns = {"player": "ball_carrier", "qb": "QB", "kicker": "kicker"}
    if entity_type not in entity_columns:
        raise ValueError("Invalid entity_type. Must be 'player', 'qb', or 'kicker'.")
    entity_col = entity_columns[entity_type]

    seeded = seed is not None
    rng = random.Random(seed) if seeded else random

    examples = []

    for _ in range(num_examples):
        # Randomly select a game. Choose from a plain list rather than relying on
        # random.choice accepting a NumPy array.
        game_id = rng.choice(list(df['game_id'].unique()))
        game_df = df[df['game_id'] == game_id]

        # Randomly select an entity that appears in that game
        player = rng.choice(list(game_df[entity_col].dropna().unique()))
        player_df = game_df[game_df[entity_col] == player]

        # Sample up to 3 of the entity's drive rows (all of them if there are fewer)
        sampled_drives = player_df.sample(
            n=min(3, len(player_df)),
            replace=False,
            random_state=rng.randrange(2**32) if seeded else None,
        )

        # Construct the per-drive prompt lines and the summed completion
        if entity_type == "player":
            stats_text = "\n".join(
                f"On a drive, {player} had the following stats: {row['carries']} carries, {row['rushing_yards']} rushing yards, {row['receptions']} receptions, {row['receiving_yards']} receiving yards."
                for _, row in sampled_drives.iterrows()
            )
            total_carries = sampled_drives['carries'].sum()
            total_rushing_yards = sampled_drives['rushing_yards'].sum()
            total_receptions = sampled_drives['receptions'].sum()
            total_receiving_yards = sampled_drives['receiving_yards'].sum()
            completion = (
                f"{player}'s final statline: {total_carries} carries for {total_rushing_yards} rushing yards, {total_receptions} receptions for {total_receiving_yards} receiving yards."
            )

        elif entity_type == "qb":
            stats_text = "\n".join(
                f"On a drive, {player} completed {row['completions']} passes for {row['passing_yards']} yards."
                for _, row in sampled_drives.iterrows()
            )
            total_completions = sampled_drives['completions'].sum()
            total_passing_yards = sampled_drives['passing_yards'].sum()
            completion = (
                f"{player}'s final statline: {total_completions} completions for {total_passing_yards} passing yards."
            )

        else:  # entity_type == "kicker" (validated above)
            stats_text = "\n".join(
                f"On a drive, {player} made {row['field_goals']} field goals, missed {row['missed_field_goals']} field goals, "
                f"and scored {row['points']} points."
                for _, row in sampled_drives.iterrows()
            )
            total_field_goals = sampled_drives['field_goals'].sum()
            total_missed_field_goals = sampled_drives['missed_field_goals'].sum()
            total_points = sampled_drives['points'].sum()
            completion = (
                f"{player}'s final statline: {total_field_goals} field goals made, {total_missed_field_goals} missed, and {total_points} points."
            )

        prompt = f"{stats_text}\nWhat was {player}'s final statline?"

        examples.append({
            "messages": [
                {"role": "system", "content": "You are a sports statistician specializing in NFL Football."},
                {"role": "user", "content": prompt},
                {"role": "assistant", "content": completion}
            ]
        })

    return examples

In [41]:
# Seed both random sources used by the generator so a fresh Restart & Run All
# reproduces the same dataset: `random` drives the game/entity choice, and
# DataFrame.sample falls back to NumPy's global state when no random_state is passed.
random.seed(42)
np.random.seed(42)

# Generate training examples for players, QBs, and kickers
player_examples = generate_stat_examples(player_level_stats, "player", 100)
qb_examples = generate_stat_examples(qb_level_stats, "qb", 100)
kicker_examples = generate_stat_examples(kicker_level_stats, "kicker", 50)

player_examples[0:2]

[{'messages': [{'role': 'system',
    'content': 'You are a sports statistician specializing in NFL Football.'},
   {'role': 'user',
    'content': "On a drive, 2-D.Carlson had the following stats: 1 carries, 29 rushing yards, 0 receptions, 0 receiving yards.\nWhat was 2-D.Carlson's final statline?"},
   {'role': 'assistant',
    'content': "2-D.Carlson's final statline: 1 carries for 29 rushing yards, 0 receptions for 0 receiving yards."}]},
 {'messages': [{'role': 'system',
    'content': 'You are a sports statistician specializing in NFL Football.'},
   {'role': 'user',
    'content': "On a drive, 85-N.Brown had the following stats: 0 carries, 0 rushing yards, 2 receptions, 19 receiving yards.\nOn a drive, 85-N.Brown had the following stats: 0 carries, 0 rushing yards, 1 receptions, 31 receiving yards.\nOn a drive, 85-N.Brown had the following stats: 0 carries, 0 rushing yards, 2 receptions, 10 receiving yards.\nWhat was 85-N.Brown's final statline?"},
   {'role': 'assistant',
    '

In [37]:
# Function to generate team-level examples
def generate_team_stat_examples(df, num_examples, seed=None, exclude_teams=("Unknown Team",)):
    """Build chat-format examples that ask the model to total a team's drive stats.

    Each example picks a random game, then a random offense within that game,
    samples up to three of that team's drive rows without replacement (fewer when
    the team has fewer rows), lists the per-drive stats in the user prompt and
    puts the summed statline in the assistant reply.

    Args:
        df: Drive-level DataFrame with 'game_id', 'offense_team', 'plays',
            'rushing_yards', 'passing_yards', 'points', 'touchdowns',
            'field_goals' and 'turnovers' columns.
        num_examples: Number of examples to generate.
        seed: Optional integer seed. When given, every random draw (including the
            drive sampling) comes from a private generator, so the same inputs
            yield the same examples. When None, the global `random` / NumPy
            states are used.
        exclude_teams: `offense_team` values that must never be chosen. Defaults
            to the 'Unknown Team' placeholder, which is not a real team. Pass
            an empty tuple or None to keep every row.

    Returns:
        A list of dicts of the form {"messages": [system, user, assistant]}.
    """
    # Drop placeholder offenses up front so neither the game draw nor the team
    # draw can land on them.
    if exclude_teams:
        df = df[~df['offense_team'].isin(list(exclude_teams))]

    seeded = seed is not None
    rng = random.Random(seed) if seeded else random

    examples = []

    for _ in range(num_examples):
        # Randomly select a game. Choose from a plain list rather than relying on
        # random.choice accepting a NumPy array.
        game_id = rng.choice(list(df['game_id'].unique()))
        game_df = df[df['game_id'] == game_id]

        # Randomly select a team within the game
        team = rng.choice(list(game_df['offense_team'].dropna().unique()))
        team_df = game_df[game_df['offense_team'] == team]

        # Sample up to 3 of the team's drive rows (all of them if there are fewer)
        sampled_drives = team_df.sample(
            n=min(3, len(team_df)),
            replace=False,
            random_state=rng.randrange(2**32) if seeded else None,
        )

        # Construct the prompt and completion
        stats_text = "\n".join(
            f"On a drive, {team} had {row['plays']} plays, {row['rushing_yards']} rushing yards, {row['passing_yards']} passing yards, scored {row['points']} points, with {row['touchdowns']} touchdowns, {row['field_goals']} field goals, and committed {row['turnovers']} turnovers."
            for _, row in sampled_drives.iterrows()
        )
        total_plays = sampled_drives['plays'].sum()
        total_rushing_yards = sampled_drives['rushing_yards'].sum()
        total_passing_yards = sampled_drives['passing_yards'].sum()
        total_points = sampled_drives['points'].sum()
        total_touchdowns = sampled_drives['touchdowns'].sum()
        total_field_goals = sampled_drives['field_goals'].sum()
        total_turnovers = sampled_drives['turnovers'].sum()

        completion = (
            f"{team}'s final statline: {total_plays} plays, {total_rushing_yards} rushing yards, "
            f"{total_passing_yards} passing yards, {total_points} points, {total_touchdowns} touchdowns, "
            f"{total_field_goals} field goals, and {total_turnovers} turnovers."
        )

        prompt = f"{stats_text}\nWhat was {team}'s final statline?"

        # NOTE(review): this system prompt says "NFL football" while
        # generate_stat_examples says "NFL Football" - confirm which is intended.
        examples.append({
            "messages": [
                {"role": "system", "content": "You are a sports statistician specializing in NFL football."},
                {"role": "user", "content": prompt},
                {"role": "assistant", "content": completion}
            ]
        })

    return examples


In [43]:
# Build 150 team-level training examples from the drive totals, then preview the first two
team_examples = generate_team_stat_examples(drive_level_totals, num_examples=150)

team_examples[:2]

[{'messages': [{'role': 'system',
    'content': 'You are a sports statistician specializing in NFL football.'},
   {'role': 'user',
    'content': "On a drive, SEA had 7 plays, 5 rushing yards, 0 passing yards, scored 0 points, with 0 touchdowns, 0 field goals, and committed 0 turnovers.\nOn a drive, SEA had 5 plays, 22 rushing yards, 33 passing yards, scored 7 points, with 1 touchdowns, 0 field goals, and committed 0 turnovers.\nOn a drive, SEA had 11 plays, 25 rushing yards, 17 passing yards, scored 7 points, with 1 touchdowns, 0 field goals, and committed 0 turnovers.\nWhat was SEA's final statline?"},
   {'role': 'assistant',
    'content': "SEA's final statline: 23 plays, 52 rushing yards, 50 passing yards, 14 points, 2 touchdowns, 0 field goals, and 0 turnovers."}]},
 {'messages': [{'role': 'system',
    'content': 'You are a sports statistician specializing in NFL football.'},
   {'role': 'user',
    'content': "On a drive, SF had 8 plays, 16 rushing yards, 41 passing yards, sc

In [44]:
# Pool the player, QB, kicker, and team examples into a single training list
all_examples = [*player_examples, *qb_examples, *kicker_examples, *team_examples]

# Destination of the fine-tuning file (JSONL: one JSON object per line)
output_file = "/Users/laurenmanis/Desktop/Fall 2024/DS 5690 | Gen AI Models/stat_reasoning_fine_tuning.jsonl"

# Serialize each example onto its own line
with open(output_file, "w") as f:
    for example in all_examples:
        f.write(json.dumps(example) + "\n")

print(f"Fine-tuning examples saved to {output_file}")

Fine-tuning examples saved to /Users/laurenmanis/Desktop/Fall 2024/DS 5690 | Gen AI Models/stat_reasoning_fine_tuning.jsonl


In [40]:
# Create a test set
player_test = generate_stat_examples(player_level_stats, "player", 15)
qb_test = generate_stat_examples(qb_level_stats, "qb", 15)
kicker_test = generate_stat_examples(kicker_level_stats, "kicker", 5)
team_test = generate_team_stat_examples(drive_level_totals, 15)


def _example_key(example):
    """Order-insensitive key for an example: the sorted lines of its user prompt."""
    return tuple(sorted(example["messages"][1]["content"].split("\n")))


# The test examples are drawn from the same drive tables as the training examples,
# so the same question can be generated twice. Keep only test examples whose prompt
# lines do not already appear in the training set (or earlier in the test set);
# otherwise the evaluation would partly measure memorised training rows.
seen_keys = {_example_key(example) for example in all_examples}
all_test = []
for example in player_test + qb_test + kicker_test + team_test:
    key = _example_key(example)
    if key not in seen_keys:
        seen_keys.add(key)
        all_test.append(example)

# Create a JSONL file for evaluation
output_file = "/Users/laurenmanis/Desktop/Fall 2024/DS 5690 | Gen AI Models/stat_reasoning_test.jsonl"

# Save to the JSONL format
with open(output_file, "w") as f:
    for example in all_test:
        json.dump(example, f)
        f.write("\n")

print(f"Test examples saved to {output_file}")

Test examples saved to /Users/laurenmanis/Desktop/Fall 2024/DS 5690 | Gen AI Models/stat_reasoning_test.jsonl
