Football Injury Datasets

https://nflreadr.nflverse.com/reference/load_injuries.html

https://nflreadr.nflverse.com/reference/load_rosters.html

https://nflreadr.nflverse.com/reference/load_players.html

https://nflreadr.nflverse.com/reference/load_combine.html

https://nflreadr.nflverse.com/articles/dictionary_injuries.html

https://nflreadr.nflverse.com/articles/dictionary_rosters.html

https://nflreadr.nflverse.com/articles/dictionary_combine.html

In [None]:
# import nfl_data_py as nfl

# combined = nfl.import_combine_data([2009, 2024])
# cleancombined = nfl.clean_nfl_data(combined)
# print("COMBINE DATA")
# print("====================================")
# print(cleancombined.columns.tolist())

# rosters = nfl.import_seasonal_rosters([2009, 2024])
# cleanrosters = nfl.clean_nfl_data(rosters)
# print("ROSTER DATA")
# print("====================================")
# print(cleanrosters.columns.tolist())

# ids = nfl.import_ids()
# cleanids = nfl.clean_nfl_data(ids)
# print("ID DATA")
# print("====================================")
# print(cleanids.columns.tolist())

# injuries = nfl.import_injuries([2009, 2024])
# cleaninjuries = nfl.clean_nfl_data(injuries)
# print("INJURY DATA")
# print("====================================")
# print(cleaninjuries.columns.tolist())

In [None]:
import pandas as pd
import nfl_data_py as nfl

# ---------------------------
# 1. Import Data and Standardize Keys
# ---------------------------
# Seasons 2009 through 2024 inclusive (range() excludes its stop value).
years = list(range(2009, 2025))

# Load each dataset for those seasons and pass it through nfl_data_py's cleaner.
combine_df = nfl.clean_nfl_data(nfl.import_combine_data(years=years))
rosters_df = nfl.clean_nfl_data(nfl.import_seasonal_rosters(years=years))
injuries_df = nfl.clean_nfl_data(nfl.import_injuries(years=years))

# Give every frame the same 'player_key' join column: the player's name,
# lower-cased and stripped of surrounding whitespace. Combine and roster data
# hold the name in 'player_name'; injury data holds it in 'full_name'.
# NOTE(review): names are presumably not unique across players -- confirm
# whether a stable player ID would be a safer join key.
for frame, name_column in (
    (combine_df, 'player_name'),
    (rosters_df, 'player_name'),
    (injuries_df, 'full_name'),
):
    frame['player_key'] = frame[name_column].str.lower().str.strip()

# ---------------------------
# 2. Prepare Injuries Data
# ---------------------------
# Suffix the injury-side 'season', 'week' and 'position' columns with '_injury'
# so they stay distinguishable from same-named columns in the other frames.
injuries_df = injuries_df.rename(
    columns={name: f"{name}_injury" for name in ('season', 'week', 'position')}
)

# Injury keywords to look for (all lowercase). Matching below is by substring,
# so e.g. "quadricep" also matches "quadriceps".
valid_injury_types = [
    "knee", "ankle", "hamstring", "shoulder", "foot", "concussion", "groin",
    "back", "calf", "hip", "neck", "toe", "quadricep", "elbow", "hand", "rib",
    "wrist", "thumb", "abdomen", "head", "finger", "achilles", "shin",
    "pectoral", "forearm", "heel", "biceps", "fibula",
]

# One indicator column name per keyword, in the same order as the keywords.
injury_type_cols = [f"injury_{inj}" for inj in valid_injury_types]

# Lower-case the two practice-report description columns once, up front;
# they do not change while the indicator columns are being added.
primary_desc = injuries_df['practice_primary_injury'].str.lower()
secondary_desc = injuries_df['practice_secondary_injury'].str.lower()

# injury_<type> is 1 when the keyword appears in either description, else 0.
# Missing descriptions count as "no match" (na=False).
for inj, col_name in zip(valid_injury_types, injury_type_cols):
    in_primary = primary_desc.str.contains(inj, na=False)
    in_secondary = secondary_desc.str.contains(inj, na=False)
    injuries_df[col_name] = (in_primary | in_secondary).astype(int)

# Keep only the rows where at least one indicator is set.
has_valid_injury = injuries_df[injury_type_cols].any(axis=1)
injuries_df = injuries_df[has_valid_injury]

# Order rows chronologically so the running totals below accumulate
# from the earliest report to the latest.
injuries_df = injuries_df.sort_values(by=['season_injury', 'week_injury'])

# cum_injury_<type>: per-player running total of the matching indicator.
# The total is inclusive, i.e. it already counts the current row's own flag.
by_player = injuries_df.groupby('player_key')
running_totals = {
    f"cum_{col_name}": by_player[col_name].cumsum()
    for col_name in injury_type_cols
}
for cum_col, totals in running_totals.items():
    injuries_df[cum_col] = totals

# ---------------------------
# 3. Merge Datasets
# ---------------------------
# Step 1: full outer join of rosters with injury reports. A roster row pairs
# with an injury row when both player_key and season agree (roster 'season'
# against injuries 'season_injury'); unmatched rows from either side are kept.
# Any other column name present in both frames gets a '_roster' / '_injury' suffix.
roster_injury_df = rosters_df.merge(
    injuries_df,
    how='outer',
    left_on=['player_key', 'season'],
    right_on=['player_key', 'season_injury'],
    suffixes=('_roster', '_injury'),
)

# Step 2: full outer join with the combine results on player_key alone.
# The combine frame's own 'season' column is removed first so it cannot clash
# with the roster 'season'; remaining name clashes get a '_combine' suffix on
# the combine side only.
combine_without_season = combine_df.drop(columns=['season'])
merged_df = roster_injury_df.merge(
    combine_without_season,
    how='outer',
    on='player_key',
    suffixes=('', '_combine'),
)

# ---------------------------
# 4. Cleanup and One-Hot Encode Position
# ---------------------------
# Keep only rows that have every required roster attribute AND a primary
# practice injury. This has to run before the drop below, because
# 'practice_primary_injury' is itself one of the columns being removed.
required_columns = ['height', 'weight', 'years_exp', 'age', 'practice_primary_injury']
merged_df = merged_df.dropna(subset=required_columns)

# Drop columns we no longer need. errors='ignore' skips any name that is not
# present in the merged frame instead of raising.
cols_to_drop = ['report_status', 'practice_primary_injury', 'practice_secondary_injury', 'prev_injury_count', 'position_injury']
merged_df = merged_df.drop(columns=cols_to_drop, errors='ignore')

# One-hot encode the player's position (the unsuffixed 'position' column).
# dtype=int pins the dummies to 0/1 integers: without it pandas >= 2.0 emits
# booleans (older versions emit uint8), so the exported CSV would contain
# True/False or 0/1 depending on the installed pandas version.
position_dummies = pd.get_dummies(merged_df['position'], prefix='pos', dtype=int)
merged_df = pd.concat([merged_df, position_dummies], axis=1)

# ---------------------------
# 5. Subset the Final Columns and Drop Incomplete Rows
# ---------------------------
# No rows are removed in this step: it only selects columns. (Rows with missing
# required values were already removed by the dropna in section 4.)
#
# Columns kept, in this order:
#   1. Roster details: 'height', 'weight', 'years_exp', 'age'
#   2. Injury indicator columns, one per type (injury_*)
#   3. Cumulative injury count columns, one per type (cum_injury_*)
#   4. One-hot encoded position columns (the columns of position_dummies)
cumulative_injury_cols = [f"cum_injury_{inj}" for inj in valid_injury_types]
desired_columns = [
    'height', 'weight', 'years_exp', 'age',
    *injury_type_cols,
    *cumulative_injury_cols,
    *position_dummies.columns,
]

final_subset = merged_df.loc[:, desired_columns]

# ---------------------------
# 6. Export and Display
# ---------------------------
# Write the final table to train.csv (relative path) without the index column.
output_path = "train.csv"
final_subset.to_csv(output_path, index=False)
print("Data exported to train.csv")

# Echo the exported column names so the schema can be checked at a glance.
column_names = final_subset.columns.tolist()
print("Final subset column names:")
print(column_names)