Football Injury Datasets

https://nflreadr.nflverse.com/reference/load_injuries.html

https://nflreadr.nflverse.com/reference/load_rosters.html

https://nflreadr.nflverse.com/reference/load_players.html

https://nflreadr.nflverse.com/reference/load_combine.html

https://nflreadr.nflverse.com/articles/dictionary_injuries.html

https://nflreadr.nflverse.com/articles/dictionary_rosters.html

https://nflreadr.nflverse.com/articles/dictionary_combine.html

In [None]:
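# Exploratory cell, left disabled: it prints the column names of each dataset.
# NOTE: the loaders that take seasons are given the two-element list
# [2009, 2024] here, whereas the active cell that follows passes every season
# from 2009 through 2024.
# import nfl_data_py as nfl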

# combined = nfl.import_combine_data([2009, 2024])
# cleancombined = nfl.clean_nfl_data(combined)
# print("COMBINE DATA")
# print("====================================")
# print(cleancombined.columns.tolist())

# rosters = nfl.import_seasonal_rosters([2009, 2024])
# cleanrosters = nfl.clean_nfl_data(rosters)
# print("ROSTER DATA")
# print("====================================")
# print(cleanrosters.columns.tolist())

# ids = nfl.import_ids()
# cleanids = nfl.clean_nfl_data(ids)
# print("ID DATA")
# print("====================================")
# print(cleanids.columns.tolist())

# injuries = nfl.import_injuries([2009, 2024])
# cleaninjuries = nfl.clean_nfl_data(injuries)
# print("INJURY DATA")
# print("====================================")
# print(cleaninjuries.columns.tolist())




In [None]:
import pandas as pd
import nfl_data_py as nfl

def normalize_player_name(names: pd.Series) -> pd.Series:
    """Return *names* lower-cased and stripped of surrounding whitespace.

    The result is the name-based join key (``player_key``) shared by the
    roster, combine and injury tables. Missing names stay missing.
    """
    return names.str.lower().str.strip()


def prepare_injuries(injuries: pd.DataFrame) -> pd.DataFrame:
    """Rename, order and number the injury-report rows.

    ``season``/``week`` become ``season_injury``/``week_injury`` so they
    cannot collide with the roster ``season`` after merging. Rows are then
    sorted by season and week, and ``prev_injury_count`` records how many
    earlier rows the same ``player_key`` has. It counts rows, so an injury
    that is listed in several rows is counted once per row.

    Args:
        injuries: Injury rows with ``player_key``, ``season`` and ``week``.

    Returns:
        A new, sorted frame with the renamed columns and
        ``prev_injury_count`` added; *injuries* itself is not modified.
    """
    renamed = injuries.rename(
        columns={'season': 'season_injury', 'week': 'week_injury'}
    )
    ordered = renamed.sort_values(by=['season_injury', 'week_injury'])
    ordered['prev_injury_count'] = ordered.groupby('player_key').cumcount()
    return ordered


def aggregate_injuries(injuries: pd.DataFrame) -> pd.DataFrame:
    """Collapse injury rows to one row per ``(player_key, season_injury)``.

    Args:
        injuries: Output of :func:`prepare_injuries`.

    Returns:
        A frame with ``player_key``, ``season_injury`` and, per group:
        ``injury_count`` (non-null ``report_primary_injury`` values),
        ``max_prev_injury`` (largest ``prev_injury_count``) and
        ``injury_reports`` (list of the non-null ``report_primary_injury``
        values).
    """
    per_player_season = injuries.groupby(['player_key', 'season_injury'])
    return per_player_season.agg(
        injury_count=('report_primary_injury', 'count'),
        max_prev_injury=('prev_injury_count', 'max'),
        injury_reports=('report_primary_injury', lambda s: list(s.dropna())),
    ).reset_index()


def attach_injuries(rosters: pd.DataFrame, injuries_agg: pd.DataFrame) -> pd.DataFrame:
    """Left-join the per-season injury aggregates onto the roster rows.

    Rows are matched on ``player_key`` and on roster ``season`` equal to
    ``season_injury``. Roster rows without a match get ``injury_count`` 0
    instead of NaN, so the column is an integer count throughout;
    ``season_injury`` stays NaN on those rows, which keeps them
    distinguishable from matched rows.

    Returns:
        One row per roster row, provided *injuries_agg* is unique per
        ``(player_key, season_injury)`` as :func:`aggregate_injuries` yields.
    """
    merged = pd.merge(
        rosters,
        injuries_agg,
        left_on=['player_key', 'season'],
        right_on=['player_key', 'season_injury'],
        how='left',
    )
    merged['injury_count'] = merged['injury_count'].fillna(0).astype(int)
    return merged


def attach_combine(players: pd.DataFrame, combine: pd.DataFrame) -> pd.DataFrame:
    """Left-join combine measurements onto *players* without adding rows.

    The join key is a player name, which is not a unique identifier. Two
    kinds of combine rows are therefore left out before merging:

    * rows with a missing ``player_key`` -- pandas matches null keys to each
      other, which would pair unnamed players with unrelated measurements;
    * rows whose ``player_key`` occurs more than once -- they would multiply
      the matching roster rows, and there is no way to tell from the name
      alone which record belongs to whom.

    Players affected by either rule simply get NaN combine columns.
    Overlapping column names from *combine* receive a ``_combine`` suffix.
    """
    keyed = combine.dropna(subset=['player_key'])
    unambiguous = keyed.drop_duplicates(subset='player_key', keep=False)
    return pd.merge(
        players,
        unambiguous,
        on='player_key',
        how='left',
        suffixes=('', '_combine'),
    )


# A notebook cell and `python <file>` both run with __name__ == "__main__",
# so the pipeline below still executes there and still leaves its variables
# at top level; the guard only lets the helpers above be imported on their own.
if __name__ == "__main__":
    # Every season from 2009 through 2024 (range() excludes its end value).
    years = list(range(2009, 2025))

    # Import and clean data for the specified years.
    combine_df = nfl.clean_nfl_data(nfl.import_combine_data(years=years))
    rosters_df = nfl.clean_nfl_data(nfl.import_seasonal_rosters(years=years))
    injuries_df = nfl.clean_nfl_data(nfl.import_injuries(years=years))

    # Standardized name key: 'player_name' for combine/roster, 'full_name'
    # for injuries.
    # NOTE(review): namesakes share a key, so their injury rows are pooled.
    # If these tables expose a common player id, joining on it would be more
    # reliable -- confirm against the data dictionaries.
    combine_df['player_key'] = normalize_player_name(combine_df['player_name'])
    rosters_df['player_key'] = normalize_player_name(rosters_df['player_name'])
    injuries_df['player_key'] = normalize_player_name(injuries_df['full_name'])

    injuries_df = prepare_injuries(injuries_df)
    injuries_agg = aggregate_injuries(injuries_df)

    merged_r_inj = attach_injuries(rosters_df, injuries_agg)

    # The combine 'season' is dropped so it is not confused with the roster
    # season.
    final_df = attach_combine(merged_r_inj, combine_df.drop(columns=['season']))

    # Order the final dataset by player_key and roster season.
    final_df = final_df.sort_values(by=['player_key', 'season'])

    # Export the final merged dataset to a CSV file.
    final_df.to_csv("injury_data.csv", index=False)
    print("Data exported to injury_data.csv")

    # Display all column names in the final merged dataframe.
    print("All column names:")
    print(final_df.columns.tolist())

    print(final_df.head(100))
