# Fine-Tuning Stage I: Data Pre-Processing
## Active NFL Players, Recent Team Performance, and Fantasy Football Basics

In [64]:
import pandas as pd
import numpy as np 
import json
import random

In [46]:
# Load the active-player roster export, then discard the 'Unnamed: 0' column
# (presumably an index saved with the CSV -- confirm against the file).
# NOTE(review): the path below is machine-specific.
players = pd.read_csv(
    '/Users/laurenmanis/Desktop/Fall 2024/DS 5690 | Gen AI Models/nfl_players.csv'
)
players = players.drop(columns=['Unnamed: 0'])
players.head(n=5)

Unnamed: 0,team,team_name,position,jersey_number,full_name,years_exp,rookie_year,all_pro,rookie
0,NYJ,New York Jets,QB,8,Aaron Rodgers,19,2005,1,0
1,CHI,Chicago Bears,TE,84,Marcedes Lewis,18,2006,0,0
2,TEN,Tennessee Titans,K,6,Nick Folk,17,2007,0,0
3,CAR,Carolina Panthers,LS,44,J.J. Jansen,16,2008,0,0
4,IND,Indianapolis Colts,QB,15,Joe Flacco,16,2008,0,0


In [47]:
# Create plain language player descriptions
def create_player_description(row):
    """Build a short plain-language summary of one player.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'full_name', 'position', 'team_name', 'team',
        'years_exp', 'rookie' and 'all_pro'.

    Returns
    -------
    str
        Two sentences: role/team/experience, then all-pro status.
    """
    # rookie == 1 marks a rookie; every other value is labelled a veteran
    rookie_status = "rookie" if row['rookie'] == 1 else "veteran"
    # all_pro == 1 marks an all-pro
    all_pro_status = "an all-pro" if row['all_pro'] == 1 else "not an all-pro"
    # Use the singular for exactly one year so the text never reads "1 years"
    years_exp = row['years_exp']
    year_word = "year" if years_exp == 1 else "years"
    return (
        f"{row['full_name']} is a {rookie_status} {row['position']} for the {row['team_name']} ({row['team']}) "
        f"with {years_exp} {year_word} of experience. He is {all_pro_status}."
    )

# Generate one summary per roster row and keep it next to the player data
players['player_descriptions'] = players.apply(func=create_player_description, axis='columns')
players.loc[:, 'player_descriptions'].sample(n=5)

569     Devin Bush is a veteran LB for the Cleveland B...
603     Dre'Mont Jones is a veteran DL for the Seattle...
1348    Drake Thomas is a veteran LB for the Seattle S...
633     Blake Gillikin is a veteran P for the Arizona ...
171     Mike Hilton is a veteran DB for the Cincinnati...
Name: player_descriptions, dtype: object

In [48]:
# Rookies and second-year players get extra descriptions:
# keep everyone whose rookie_year is 2023 or later, as an independent copy
rookies = players.loc[players['rookie_year'].ge(2023)].copy()

# Create plain language player descriptions
def create_rookie_description(row):
    """Describe a rookie or second-year player and the year he entered the league.

    `row` must provide 'full_name', 'position', 'team_name', 'team',
    'rookie' and 'rookie_year'. Returns the two-sentence description as a str.
    """
    # rookie == 1 -> "rookie"; any other value is treated as a second-year player
    if row['rookie'] == 1:
        rookie_status = "rookie"
    else:
        rookie_status = "second-year"
    role_sentence = f"{row['full_name']} is a {rookie_status} {row['position']} for the {row['team_name']} ({row['team']})."
    debut_sentence = f"He entered the league in {row['rookie_year']}."
    return f"{role_sentence} {debut_sentence}"

# Build the rookie / second-year summaries row by row
rookies['player_descriptions'] = rookies.apply(func=create_rookie_description, axis='columns')
rookies.loc[:, 'player_descriptions'].sample(n=5)

1540    Winston Reid is a rookie LB for the Cleveland ...
1242    O'Cyrus Torrence is a second-year OL for the B...
1529    James Williams is a rookie LB for the Tennesse...
1247    DeMarvion Overshown is a second-year LB for th...
1636    Edgerrin Cooper is a rookie LB for the Green B...
Name: player_descriptions, dtype: object

In [49]:
# ...And more additional descriptions for rookies and second-year players 
def create_rookie_description2(row):
    """Build an alternate one-sentence description centred on the player's debut year.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'full_name', 'rookie_year', 'position', 'team_name'
        and 'team'. The sentence does not mention rookie/second-year status,
        so the 'rookie' field is not read.

    Returns
    -------
    str
    """
    return (
        f"{row['full_name']} began his NFL career in {row['rookie_year']} as a {row['position']} for the {row['team_name']} ({row['team']})."
    )

# Store the debut-year phrasing as a second description column
rookies['player_descriptions2'] = rookies.apply(func=create_rookie_description2, axis='columns')
rookies.loc[:, 'player_descriptions2'].sample(n=5)

1531    Jaylen Harrell began his NFL career in 2024 as...
1611    Anthony Gould began his NFL career in 2024 as ...
1686    Rome Odunze began his NFL career in 2024 as a ...
1444    Beau Brade began his NFL career in 2024 as a D...
1488    Xavier Thomas began his NFL career in 2024 as ...
Name: player_descriptions2, dtype: object

In [50]:
# Load the per-team records table
# NOTE(review): the path below is machine-specific.
teams = pd.read_csv(
    filepath_or_buffer='/Users/laurenmanis/Desktop/Fall 2024/DS 5690 | Gen AI Models/team_records.csv'
)
teams.head(n=5)

Unnamed: 0,Tm,W,L,Playoffs,Won_Division,Won_Conference,Won_SuperBowl,Pre_Season_Rank
0,Buffalo Bills,11,6,1,1,0,0,4
1,Miami Dolphins,11,6,1,0,0,0,9
2,New York Jets,7,10,0,0,0,0,14
3,New England Patriots,4,13,0,0,0,0,28
4,Baltimore Ravens,13,4,1,1,0,0,3


In [61]:
# Append each playoff team's exit round
# Exit_Round coding as consumed by create_team_description2:
#   1 = wildcard, 2 = divisional, 3 = conference championship,
#   4 = lost the Super Bowl, 5 = won the Super Bowl
Tm = ['Cleveland Browns','Miami Dolphins','Los Angeles Rams','Dallas Cowboys','Pittsburgh Steelers','Philadelphia Eagles',
    'Houston Texans','Green Bay Packers', 'Tampa Bay Buccaneers', 'Buffalo Bills', 
    'Baltimore Ravens', 'Detroit Lions', 'San Francisco 49ers', 'Kansas City Chiefs']
playoff_exit = [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 5]

playoffs = pd.DataFrame({
    'Tm': Tm,
    'Exit_Round': playoff_exit
})

# Drop any Exit_Round left over from a previous run of this cell first; otherwise
# re-running the merge yields Exit_Round_x / Exit_Round_y and breaks later lookups.
# Teams absent from `playoffs` get NaN (left join).
teams = teams.drop(columns='Exit_Round', errors='ignore').merge(playoffs, on='Tm', how='left')
teams.head()

Unnamed: 0,Tm,W,L,Playoffs,Won_Division,Won_Conference,Won_SuperBowl,Pre_Season_Rank,Exit_Round
0,Buffalo Bills,11,6,1,1,0,0,4,2.0
1,Miami Dolphins,11,6,1,0,0,0,9,1.0
2,New York Jets,7,10,0,0,0,0,14,
3,New England Patriots,4,13,0,0,0,0,28,
4,Baltimore Ravens,13,4,1,1,0,0,3,3.0


In [60]:
# Function to generate descriptions
def _ordinal(n):
    """Return n with its English ordinal suffix, e.g. 1 -> '1st', 12 -> '12th', 23 -> '23rd'."""
    n = int(n)
    # 11, 12 and 13 take "th" even though they end in 1, 2, 3
    if 10 <= n % 100 <= 20:
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(n % 10, "th")
    return f"{n}{suffix}"


def create_team_description(row):
    """Summarise a team's 2023 record, postseason result and 2024 preseason rank.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'Tm', 'W', 'L', 'Playoffs', 'Won_Division',
        'Won_Conference', 'Won_SuperBowl' and 'Pre_Season_Rank'.
        The four flag fields are treated as set when equal to 1.

    Returns
    -------
    str
        Record sentence, playoff sentence, preseason-rank sentence.
    """
    # Start with the season record
    description = f"The {row['Tm']} finished the 2023 season with a record of {row['W']}-{row['L']}."

    # Add playoffs information as a list of clauses so the sentence stays
    # grammatical for every combination of flags
    if row["Playoffs"] == 1:
        achievements = ["made the playoffs"]
        if row["Won_Division"] == 1:
            achievements.append("won their division")
        if row["Won_Conference"] == 1:
            achievements.append("won their conference")
        if row["Won_SuperBowl"] == 1:
            achievements.append("went on to win the Super Bowl")
        if len(achievements) <= 2:
            joined = " and ".join(achievements)
        else:
            joined = ", ".join(achievements[:-1]) + ", and " + achievements[-1]
        description += f" They {joined}."
    else:
        description += " They missed the playoffs."

    # Add preseason ranking with the correct ordinal suffix (1st, 2nd, 3rd, 4th, ...)
    description += f" Going into the 2024 season, PFF had them ranked {_ordinal(row['Pre_Season_Rank'])} overall."

    return description

# One record/playoff/ranking summary per team row
team_descriptions = teams.apply(func=create_team_description, axis='columns')
team_descriptions.sample(n=5)

8     The Houston Texans finished the 2023 season wi...
23    The Chicago Bears finished the 2023 season wit...
19    The Washington Commanders finished the 2023 se...
20    The Detroit Lions finished the 2023 season wit...
25    The New Orleans Saints finished the 2023 seaso...
dtype: object

In [62]:
# ...And more team descriptions 
def create_team_description2(row):
    """Build an alternate team summary: win count plus how the postseason ended.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'Tm', 'W', 'L' and 'Playoffs'; 'Exit_Round' is read only
        when Playoffs == 1. Exit_Round values: 1 wildcard, 2 divisional,
        3 conference championship, 5 won the Super Bowl; any other non-missing
        value is reported as a Super Bowl loss.

    Returns
    -------
    str
    """
    # Base description with team name and record; a winning record counts as "good"
    outlook = 'good' if row['W'] > row['L'] else 'struggling'
    description = f"The {row['Tm']} are a {outlook} team and won {row['W']} games in 2023."

    if row['Playoffs'] != 1:
        return description + " They did not make the playoffs."

    exit_round = row['Exit_Round']
    if pd.isna(exit_round):
        # Playoff team with no recorded exit round: state only what is known
        return description + " They made the playoffs."
    if exit_round == 5:
        # Must be tested before the "lost in ..." wording, otherwise the champion
        # is described as having lost the Super Bowl
        return description + " They won the Super Bowl and are the reigning champions."

    exit_text = {
        1: "in the wildcard round",
        2: "in the divisional round",
        3: "in the conference championship round",
    }.get(exit_round, "in the Super Bowl")
    return description + f" They made the playoffs but lost {exit_text}."

# Second phrasing of each team's season, driven by the playoff exit round
team_descriptions2 = teams.apply(func=create_team_description2, axis='columns')
team_descriptions2.sample(n=5)

16    The Dallas Cowboys are a good team and won 12 ...
9     The Jacksonville Jaguars are a good team and w...
5     The Cleveland Browns are a good team and won 1...
8     The Houston Texans are a good team and won 10 ...
28    The San Francisco 49ers are a good team and wo...
dtype: object

In [11]:
# Context on fantasy football
# Hand-written background statements covering roster layout, scoring, relative
# player value, lineup changes, and waivers/trades. Each string is later used
# verbatim as one assistant reply when the fine-tuning examples are assembled.
fantasy_rules = [
    "Rosters: In a standard fantasy football league, managers set a weekly lineup consisting of one quarterback (QB), 2 running backs (RB), 2 wide receivers (WR), 1 tight end (TE), 1 flex player (RB/WR/TE), 1 kicker (K), 1 defense/special teams (D/ST), and 7 bench players.",
    "Scoring System: Passing: 1 point per 25 passing yards, 4 points per passing touchdown. Rushing: 1 point per 10 rushing yards, 6 points per rushing touchdown. Receiving: 1 point per 10 receiving yards, 6 points per receiving touchdown, 1 point per reception (PPR). Conversions: 2 points per 2-point conversion. Kicking: 1 point per extra point, 3 points per field goal. Defense/Special Teams (D/ST): Starts at 10 points. Points can decrease based on yards or points allowed. Points increase for sacks, turnovers, forced fumbles, and return touchdowns.",
    "Wide Receivers, Running Backs, and Quarterbacks are the most valuable players, as they have the potential to earn the most points.",
    "Managers can adjust lineups all the way until a player's game kicks off.",
    "During a season, managers can make changes to their roster, adding or dropping players, through the waiver wire or by making trades. Every player that is not on a roster in the fantasy league is on waivers and eligible to be added during certain points of the week."
]

In [65]:
# Convert descriptions into fine-tuning compatible format:
# one {"messages": [system, user, assistant]} record per JSONL line
def _chat_example(system_prompt, user_prompt, assistant_reply):
    """Wrap one system/user/assistant exchange in the {"messages": [...]} record layout."""
    return {
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
            {"role": "assistant", "content": assistant_reply},
        ]
    }


all_descriptions = []

# Process player descriptions
for _, row in players.iterrows():
    all_descriptions.append(_chat_example(
        "You are a football knowledge assistant. Please provide brief summaries on active NFL players.",
        f"Tell me about {row['full_name']}.",
        row['player_descriptions'],
    ))

# Process additional rookie descriptions
for _, row in rookies.iterrows():
    all_descriptions.append(_chat_example(
        "You are a National Football League (NFL) knowledge assistant.",
        f"Tell me about this player: {row['full_name']}.",
        row['player_descriptions'],
    ))
    # The debut-year phrasing was generated earlier but never exported; include it
    # so the second set of rookie descriptions actually reaches the training file.
    all_descriptions.append(_chat_example(
        "You are a National Football League (NFL) knowledge assistant.",
        f"When did {row['full_name']} begin his NFL career?",
        row['player_descriptions2'],
    ))

# Process team descriptions. The team name goes into the user prompt: without it
# every team shares one identical prompt and the example cannot teach which
# answer belongs to which team. team_descriptions / team_descriptions2 were
# produced by teams.apply(..., axis=1), so they line up row-for-row with teams['Tm'].
for team_name, description in zip(teams['Tm'], team_descriptions):
    all_descriptions.append(_chat_example(
        "You are a football knowledge assistant.",
        f"Tell me about the {team_name} as of recent.",
        description,
    ))

# Process additional team descriptions
for team_name, description in zip(teams['Tm'], team_descriptions2):
    all_descriptions.append(_chat_example(
        "You are a football knowledge assistant. Provide brief but informative summaries about NFL teams.",
        f"Summarize the performance of the {team_name} during the 2023 season.",
        description,
    ))

# Process fantasy football rules
# NOTE(review): all rules share the same generic user prompt; consider giving each
# rule its own question so the prompt identifies which concept is being asked about.
for rule in fantasy_rules:
    all_descriptions.append(_chat_example(
        "You are a football knowledge assistant. Learn about fantasy football concepts.",
        "Explain this fantasy football concept.",
        rule,
    ))

# Shuffle the descriptions with a dedicated, seeded generator so the output file
# is reproducible across runs and the global `random` state is left untouched
random.Random(42).shuffle(all_descriptions)

# Save to JSONL file
# NOTE(review): the path below is machine-specific.
output_file = "/Users/laurenmanis/Desktop/Fall 2024/DS 5690 | Gen AI Models/all_descriptions.jsonl"
with open(output_file, "w", encoding="utf-8") as f:
    for item in all_descriptions:
        f.write(json.dumps(item) + "\n")

print(f"Shuffled descriptions saved to {output_file}")

Shuffled descriptions saved to /Users/laurenmanis/Desktop/Fall 2024/DS 5690 | Gen AI Models/all_descriptions.jsonl
