In [243]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import os
import glob

In [244]:
# Ensuring all columns and rows are visible
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_info_columns', 115)

In [245]:
# Loading in the CSV files containing player statistics for each year
raw_data_path = "../data/raw"

csv_files = glob.glob(os.path.join(raw_data_path, 'stats_player_reg_*.csv'))

dfs = []
for file in csv_files:
    print(f'Loading {file}')
    df = pd.read_csv(file)
    
    year = int(os.path.basename(file).split('_')[-1].split('.')[0])
    df['season'] = year
    
    dfs.append(df)

all_seasons = pd.concat(dfs, ignore_index=True)

# Filtering to only include skill position players
skill_positions = ["QB", "RB", "WR", "TE"]
all_seasons = all_seasons[all_seasons['position'].isin(skill_positions)].reset_index(drop = True).copy()

print(f'\nCombined dataset shape: {all_seasons.shape}')

Loading ../data/raw/stats_player_reg_2017.csv
Loading ../data/raw/stats_player_reg_2016.csv
Loading ../data/raw/stats_player_reg_2015.csv
Loading ../data/raw/stats_player_reg_2022.csv
Loading ../data/raw/stats_player_reg_2023.csv
Loading ../data/raw/stats_player_reg_2021.csv
Loading ../data/raw/stats_player_reg_2020.csv
Loading ../data/raw/stats_player_reg_2018.csv
Loading ../data/raw/stats_player_reg_2024.csv
Loading ../data/raw/stats_player_reg_2019.csv

Combined dataset shape: (5815, 113)


In [246]:
# Previewing the full dataframe
all_seasons.head()

Unnamed: 0,player_id,player_name,player_display_name,position,position_group,headshot_url,season,season_type,recent_team,games,completions,attempts,passing_yards,passing_tds,passing_interceptions,sacks_suffered,sack_yards_lost,sack_fumbles,sack_fumbles_lost,passing_air_yards,passing_yards_after_catch,passing_first_downs,passing_epa,passing_cpoe,passing_2pt_conversions,pacr,carries,rushing_yards,rushing_tds,rushing_fumbles,rushing_fumbles_lost,rushing_first_downs,rushing_epa,rushing_2pt_conversions,receptions,targets,receiving_yards,receiving_tds,receiving_fumbles,receiving_fumbles_lost,receiving_air_yards,receiving_yards_after_catch,receiving_first_downs,receiving_epa,receiving_2pt_conversions,racr,target_share,air_yards_share,wopr,special_teams_tds,def_tackles_solo,def_tackles_with_assist,def_tackle_assists,def_tackles_for_loss,def_tackles_for_loss_yards,def_fumbles_forced,def_sacks,def_sack_yards,def_qb_hits,def_interceptions,def_interception_yards,def_pass_defended,def_tds,def_fumbles,def_safeties,misc_yards,fumble_recovery_own,fumble_recovery_yards_own,fumble_recovery_opp,fumble_recovery_yards_opp,fumble_recovery_tds,penalties,penalty_yards,punt_returns,punt_return_yards,kickoff_returns,kickoff_return_yards,fg_made,fg_att,fg_missed,fg_blocked,fg_long,fg_pct,fg_made_0_19,fg_made_20_29,fg_made_30_39,fg_made_40_49,fg_made_50_59,fg_made_60_,fg_missed_0_19,fg_missed_20_29,fg_missed_30_39,fg_missed_40_49,fg_missed_50_59,fg_missed_60_,fg_made_list,fg_missed_list,fg_blocked_list,fg_made_distance,fg_missed_distance,fg_blocked_distance,pat_made,pat_att,pat_missed,pat_blocked,pat_pct,gwfg_made,gwfg_att,gwfg_missed,gwfg_blocked,gwfg_distance_list,fantasy_points,fantasy_points_ppr
0,00-0019596,T.Brady,Tom Brady,QB,QB,https://static.www.nfl.com/image/private/{form...,2017,REG,NE,16,385,581,4577,32,8,35,-201,6,2,5308,1910,230,140.614184,3.735653,2,0.862283,25,28,0,1,1,9,-12.203785,0,0,0,0,0,0,0,0,0,0,,0,,0.0,0.0,0.0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0.0,0,0,2,3,0,0,0,2,15,0,0,0,0,0,0,0,0,,,0,0,0,0,0,0,0,0,0,0,0,0,,,,0,0,0,0,0,0,0,,0,0,0,0,,295.88,295.88
1,00-0020531,D.Brees,Drew Brees,QB,QB,https://static.www.nfl.com/image/private/{form...,2017,REG,NO,16,386,536,4334,23,8,20,-145,2,0,3417,2369,201,106.646449,4.681357,0,1.268364,33,12,2,2,0,10,-8.335352,0,0,0,0,0,0,0,0,0,0,,0,,0.0,0.0,0.0,0,1,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0.0,0,0,3,5,0,0,0,5,32,0,0,0,0,0,0,0,0,,,0,0,0,0,0,0,0,0,0,0,0,0,,,,0,0,0,0,0,0,0,,0,0,0,0,,262.56,262.56
2,00-0021206,J.McCown,Josh McCown,QB,QB,https://static.www.nfl.com/image/private/{form...,2017,REG,NYJ,13,267,397,2926,18,9,39,-264,6,3,3036,1268,135,13.373195,5.534142,0,0.963768,37,124,5,4,1,17,6.621054,0,0,0,0,0,0,0,0,0,0,,0,,0.0,0.0,0.0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0.0,0,0,6,9,0,0,0,4,20,0,0,0,0,0,0,0,0,,,0,0,0,0,0,0,0,0,0,0,0,0,,,,0,0,0,0,0,0,0,,0,0,0,0,,205.44,205.44
3,00-0021429,C.Palmer,Carson Palmer,QB,QB,https://static.www.nfl.com/image/private/{form...,2017,REG,ARI,7,164,267,1978,9,7,22,-150,0,0,2499,743,95,-4.198268,-0.034985,0,0.791517,14,12,0,2,0,6,0.243019,0,0,0,0,0,0,0,0,0,0,,0,,0.0,0.0,0.0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0.0,0,0,2,3,0,0,0,1,5,0,0,0,0,0,0,0,0,,,0,0,0,0,0,0,0,0,0,0,0,0,,,,0,0,0,0,0,0,0,,0,0,0,0,,102.32,102.32
4,00-0021547,A.Gates,Antonio Gates,TE,TE,https://static.www.nfl.com/image/private/{form...,2017,REG,LAC,16,0,0,0,0,0,0,0,0,0,0,0,0,,,0,,0,0,0,0,0,0,,0,30,52,316,3,0,0,432,66,16,3.038502,0,0.731481,0.09075,0.090377,0.199389,0,1,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,1,5,0,0,0,0,0,0,0,0,,,0,0,0,0,0,0,0,0,0,0,0,0,,,,0,0,0,0,0,0,0,,0,0,0,0,,49.6,79.6


In [247]:
# Loading the players dataset which has player info
players = pd.read_csv('../data/raw/players.csv')

players_filtered = players[['gsis_id', 'rookie_season']]

# Merging rookie_season into all_seasons and sorting the updated dataframe
all_seasons = all_seasons.merge(players_filtered, left_on = 'player_id', right_on = 'gsis_id', how = 'inner')

all_seasons = all_seasons.sort_values(['player_id', 'season'])

# Getting next year's team for each player and merging in that info
all_seasons['team_next_year'] = all_seasons.groupby('player_id')['recent_team'].shift(-1)

players_latest_team = players[['gsis_id', 'latest_team']]
all_seasons = all_seasons.merge(players_latest_team, left_on = 'player_id', right_on = 'gsis_id', how = 'left')

# Setting the next_years_team column to latest_team for the 2024 observations
all_seasons.loc[all_seasons['season'] == 2024, 'team_next_year'] = all_seasons.loc[all_seasons['season'] == 2024, 'latest_team']

# If team_next_year is NaN (player retired) then assign a 0
all_seasons['new_team_next_year'] = (
    (all_seasons['team_next_year'].notna()) &
    (all_seasons['recent_team'] != all_seasons['team_next_year'])
).astype(int)

# Dropping unneeded columns and renaming duplicate columns
all_seasons = all_seasons.drop(columns = ['latest_team', 'gsis_id_y', 'team_next_year'])

all_seasons = all_seasons.rename(columns = {'gsis_id_x': 'gsis_id'})

In [248]:
# Creating a new column to show the years of experience for a player in that season
all_seasons['years_of_experience'] = all_seasons['season'] - all_seasons['rookie_season']

all_seasons[['season', 'rookie_season', 'years_of_experience']].head()

Unnamed: 0,season,rookie_season,years_of_experience
0,2015,1999,16
1,2015,1998,17
2,2015,2000,15
3,2016,2000,16
4,2017,2000,17


In [249]:
# Creating the QB only dataframes which contains player info and stats specific to the position
qb_all = all_seasons[all_seasons['position'] == 'QB']

qb_filtered = qb_all[['player_id', 'player_display_name', 'position', 'years_of_experience', 'season', 'recent_team', 'new_team_next_year', 
                      'games', 'completions', 'attempts', 'passing_yards', 'passing_tds', 'passing_interceptions', 'sacks_suffered', 
                      'sack_fumbles', 'passing_first_downs', 'passing_epa', 'passing_cpoe', 'pacr', 'carries', 'rushing_yards', 
                      'rushing_tds', 'rushing_fumbles', 'rushing_first_downs', 'rushing_epa', 'fantasy_points_ppr'
                     ]].copy()

qb_filtered.sort_values(by = 'fantasy_points_ppr', ascending = False).head()

Unnamed: 0,player_id,player_display_name,position,years_of_experience,season,recent_team,new_team_next_year,games,completions,attempts,passing_yards,passing_tds,passing_interceptions,sacks_suffered,sack_fumbles,passing_first_downs,passing_epa,passing_cpoe,pacr,carries,rushing_yards,rushing_tds,rushing_fumbles,rushing_first_downs,rushing_epa,fantasy_points_ppr
3971,00-0034796,Lamar Jackson,QB,6,2024,BAL,0,17,316,474,4172,41,4,23,2,198,172.278612,4.551942,1.015332,139,915,4,6,53,11.652933,430.38
3337,00-0033873,Patrick Mahomes,QB,5,2022,KC,0,17,435,648,5250,41,12,26,0,273,193.131205,3.572168,1.113468,61,358,4,4,28,24.309493,417.4
3333,00-0033873,Patrick Mahomes,QB,1,2018,KC,0,16,383,580,5097,50,12,26,3,237,221.761404,4.47219,0.969933,60,272,2,2,22,15.273858,417.08
3966,00-0034796,Lamar Jackson,QB,1,2019,BAL,0,15,265,401,3127,36,6,23,0,161,144.002652,3.81266,0.882087,176,1206,7,7,76,54.889087,415.68
4054,00-0034857,Josh Allen,QB,3,2021,BUF,0,17,409,646,4407,36,15,26,3,240,79.992858,2.097556,0.832295,122,763,6,5,56,55.646718,402.58


In [250]:
# Transforming the count stats into per-game stats to make the values more robust to injuries 
qb_count_stats = ['completions', 'attempts', 'passing_yards', 'passing_tds', 'passing_interceptions',
               'sacks_suffered', 'sack_fumbles', 'passing_first_downs', 'passing_epa', 'passing_cpoe', 'pacr',
               'carries', 'rushing_yards', 'rushing_tds', 'rushing_fumbles', 'rushing_first_downs', 'rushing_epa',
               'fantasy_points_ppr']

qb_per_game = qb_filtered.copy()

for stat in qb_count_stats:
    qb_per_game[stat + '_per_game'] = (qb_per_game[stat] / qb_per_game['games'].replace(0, pd.NA))

qb_per_game = qb_per_game[['player_id', 'player_display_name', 'position', 'years_of_experience', 'season', 'recent_team', 
                           'new_team_next_year', 'games'] + [col for col in qb_per_game.columns if col.endswith('_per_game')]]

# Filtering to only players with >= 8 games played
qb_per_game_filtered = qb_per_game[qb_per_game['games'] >= 8].round(2)

qb_per_game_filtered = qb_per_game_filtered.sort_values(by = ['player_id', 'season'])

# Creating the target variable which is predicted fantasy PPG for the following year
qb_per_game_filtered['fantasy_points_ppr_per_game_next_year'] = (
    qb_per_game_filtered.groupby('player_id')['fantasy_points_ppr_per_game'].shift(-1)
)

qb_per_game_filtered.head()

Unnamed: 0,player_id,player_display_name,position,years_of_experience,season,recent_team,new_team_next_year,games,completions_per_game,attempts_per_game,passing_yards_per_game,passing_tds_per_game,passing_interceptions_per_game,sacks_suffered_per_game,sack_fumbles_per_game,passing_first_downs_per_game,passing_epa_per_game,passing_cpoe_per_game,pacr_per_game,carries_per_game,rushing_yards_per_game,rushing_tds_per_game,rushing_fumbles_per_game,rushing_first_downs_per_game,rushing_epa_per_game,fantasy_points_ppr_per_game,fantasy_points_ppr_per_game_next_year
0,00-0007091,Matt Hasselbeck,QB,16,2015,IND,0,8,19.5,32.0,211.25,1.12,0.62,2.0,0.38,10.62,-0.01,-0.28,0.11,2.0,1.88,0.0,0.0,0.25,-0.73,11.39,
1,00-0010346,Peyton Manning,QB,17,2015,DEN,0,10,19.8,33.1,224.9,0.9,1.7,1.6,0.1,11.0,-3.2,-0.24,0.07,0.6,-0.6,0.0,0.0,0.0,-0.36,9.14,
2,00-0019596,Tom Brady,QB,15,2015,NE,0,16,25.12,39.0,298.12,2.25,0.44,2.38,0.31,14.25,7.97,-0.01,0.06,2.12,3.31,0.19,0.06,0.88,0.42,21.54,21.55
3,00-0019596,Tom Brady,QB,16,2016,NE,0,12,24.25,36.0,296.17,2.33,0.17,1.25,0.17,13.58,12.2,0.3,0.09,2.33,5.33,0.0,0.08,0.92,-0.48,21.55,18.49
4,00-0019596,Tom Brady,QB,17,2017,NE,0,16,24.06,36.31,286.06,2.0,0.5,2.19,0.38,14.38,8.79,0.23,0.05,1.56,1.75,0.0,0.06,0.56,-0.76,18.49,17.58


In [251]:
# Observing potential missing values (NaN values are expected in the 2024 season for the target variable)
qb_per_game_filtered_no_2024 = qb_per_game_filtered[qb_per_game_filtered['season'] != 2024]

print(f'There are {qb_per_game_filtered_no_2024.isna().any(axis = 1).sum()} unexpected rows with NaN values\n')

qb_na_obs = qb_per_game_filtered_no_2024[qb_per_game_filtered_no_2024.isna().any(axis = 1)]

qb_na_obs

# Missing values appear to be due to lack of playing time or retirement in the following year 

There are 61 unexpected rows with NaN values



Unnamed: 0,player_id,player_display_name,position,years_of_experience,season,recent_team,new_team_next_year,games,completions_per_game,attempts_per_game,passing_yards_per_game,passing_tds_per_game,passing_interceptions_per_game,sacks_suffered_per_game,sack_fumbles_per_game,passing_first_downs_per_game,passing_epa_per_game,passing_cpoe_per_game,pacr_per_game,carries_per_game,rushing_yards_per_game,rushing_tds_per_game,rushing_fumbles_per_game,rushing_first_downs_per_game,rushing_epa_per_game,fantasy_points_ppr_per_game,fantasy_points_ppr_per_game_next_year
0,00-0007091,Matt Hasselbeck,QB,16,2015,IND,0,8,19.5,32.0,211.25,1.12,0.62,2.0,0.38,10.62,-0.01,-0.28,0.11,2.0,1.88,0.0,0.0,0.25,-0.73,11.39,
1,00-0010346,Peyton Manning,QB,17,2015,DEN,0,10,19.8,33.1,224.9,0.9,1.7,1.6,0.1,11.0,-3.2,-0.24,0.07,0.6,-0.6,0.0,0.0,0.0,-0.36,9.14,
9,00-0019596,Tom Brady,QB,22,2022,TB,0,17,28.82,43.12,276.12,1.47,0.53,1.29,0.18,13.94,3.54,0.03,0.05,1.71,-0.06,0.06,0.12,0.41,-1.2,15.98,
18,00-0020531,Drew Brees,QB,19,2020,NO,0,12,22.92,32.5,245.17,2.0,0.5,1.08,0.42,12.5,5.73,0.13,0.1,1.5,-0.17,0.17,0.0,0.33,-0.36,17.46,
23,00-0021206,Josh McCown,QB,15,2017,NYJ,0,13,20.54,30.54,225.08,1.38,0.69,3.0,0.46,10.38,1.03,0.43,0.07,2.85,9.54,0.38,0.31,1.31,0.51,15.8,
27,00-0021429,Carson Palmer,QB,13,2016,ARI,0,15,24.27,39.8,282.2,1.73,0.93,2.67,0.53,14.67,2.2,0.03,0.05,0.93,2.53,0.0,0.33,0.53,-0.45,16.21,
55,00-0022803,Eli Manning,QB,14,2018,NYG,0,16,23.75,36.0,268.69,1.31,0.69,2.94,0.44,12.94,1.82,0.03,0.06,0.94,1.25,0.06,0.0,0.31,0.42,15.0,
71,00-0022924,Ben Roethlisberger,QB,17,2021,PIT,0,16,24.38,37.81,233.75,1.38,0.62,2.38,0.56,11.75,-2.83,-0.15,0.06,1.25,0.31,0.06,0.0,0.19,-0.12,13.63,
77,00-0022942,Philip Rivers,QB,16,2020,IND,0,16,23.06,33.94,260.56,1.5,0.69,1.19,0.12,12.5,6.53,0.05,0.07,1.12,-0.5,0.0,0.0,0.0,-0.63,15.0,
88,00-0023436,Alex Smith,QB,15,2020,WAS,0,8,21.0,31.5,197.75,0.75,1.0,2.75,0.12,9.25,-4.09,-0.11,0.15,1.25,0.38,0.0,0.12,0.12,-0.8,8.95,


In [252]:
# Dropping rows with NaN values from years < 2024
qb_per_game_filtered = qb_per_game_filtered.drop(index = qb_na_obs.index)


qb_per_game_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 291 entries, 2 to 5811
Data columns (total 27 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   player_id                              291 non-null    object 
 1   player_display_name                    291 non-null    object 
 2   position                               291 non-null    object 
 3   years_of_experience                    291 non-null    int64  
 4   season                                 291 non-null    int64  
 5   recent_team                            291 non-null    object 
 6   new_team_next_year                     291 non-null    int64  
 7   games                                  291 non-null    int64  
 8   completions_per_game                   291 non-null    float64
 9   attempts_per_game                      291 non-null    float64
 10  passing_yards_per_game                 291 non-null    float64
 11  passing_td

In [253]:
# Creating the RB only dataframes which contains player info and stats specific to the position
rb_all = all_seasons[all_seasons['position'] == 'RB']

rb_filtered = rb_all[['player_id', 'player_display_name', 'position', 'years_of_experience', 'season', 'recent_team', 'new_team_next_year', 
                      'games', 'carries', 'rushing_yards', 'rushing_tds', 'rushing_fumbles', 'rushing_first_downs', 'rushing_epa',
                      'receptions', 'targets', 'receiving_yards', 'receiving_tds', 'receiving_fumbles', 'receiving_air_yards',
                      'receiving_yards_after_catch', 'receiving_first_downs', 'receiving_epa', 'racr', 'target_share', 'air_yards_share',
                      'wopr', 'fantasy_points_ppr'
             ]].copy()

rb_filtered.sort_values(by = 'fantasy_points_ppr', ascending = False).head()

Unnamed: 0,player_id,player_display_name,position,years_of_experience,season,recent_team,new_team_next_year,games,carries,rushing_yards,rushing_tds,rushing_fumbles,rushing_first_downs,rushing_epa,receptions,targets,receiving_yards,receiving_tds,receiving_fumbles,receiving_air_yards,receiving_yards_after_catch,receiving_first_downs,receiving_epa,racr,target_share,air_yards_share,wopr,fantasy_points_ppr
2942,00-0033280,Christian McCaffrey,RB,2,2019,CAR,0,16,287,1387,15,1,58,-14.251966,116,142,1005,4,0,111,987,58,50.483007,9.054054,0.236667,0.020947,0.369663,471.2
2304,00-0032187,David Johnson,RB,1,2016,ARI,0,16,293,1239,16,5,76,-18.82608,80,120,879,4,0,529,661,42,23.366134,1.661626,0.188679,0.085268,0.342706,407.8
2946,00-0033280,Christian McCaffrey,RB,6,2023,SF,0,16,272,1459,14,2,86,14.388324,67,83,564,7,1,153,461,31,6.454787,3.686275,0.176221,0.038079,0.290986,391.3
4025,00-0034844,Saquon Barkley,RB,0,2018,NYG,0,16,261,1307,11,0,52,-7.855426,91,121,721,4,0,18,768,30,2.5663,40.055556,0.210801,0.004116,0.319083,385.8
2941,00-0033280,Christian McCaffrey,RB,1,2018,CAR,0,16,219,1098,7,2,56,7.454052,107,124,867,6,2,84,859,42,33.349967,10.321429,0.227941,0.020187,0.356043,385.5


In [254]:
# Transforming the count stats into per-game stats to make the values more robust to injuries 
rb_count_stats = ['carries', 'rushing_yards', 'rushing_tds', 'rushing_fumbles', 'rushing_first_downs', 'rushing_epa', 'receptions', 
                  'targets', 'receiving_yards', 'receiving_tds', 'receiving_fumbles', 'receiving_air_yards', 
                  'receiving_yards_after_catch', 'receiving_first_downs', 'receiving_epa', 'fantasy_points_ppr']


rb_per_game = rb_filtered.copy()

for stat in rb_count_stats:
    rb_per_game[stat + '_per_game'] = (rb_per_game[stat] / rb_per_game['games'].replace(0, pd.NA))

rb_per_game = rb_per_game[['player_id', 'player_display_name', 'position', 'years_of_experience', 'season', 'recent_team', 'new_team_next_year', 
                           'games', 'racr', 'target_share', 'air_yards_share', 'wopr'] + [
                               col for col in rb_per_game.columns if col.endswith('_per_game')]]

rushing_cols = ['carries_per_game', 'rushing_yards_per_game', 'rushing_tds_per_game',
                'rushing_fumbles_per_game', 'rushing_first_downs_per_game', 'rushing_epa_per_game']

receiving_cols = ['receptions_per_game', 'targets_per_game', 'receiving_yards_per_game',
                  'receiving_tds_per_game', 'receiving_fumbles_per_game', 'receiving_air_yards_per_game',
                  'receiving_yards_after_catch_per_game', 'receiving_first_downs_per_game',
                  'receiving_epa_per_game', 'racr', 'target_share', 'air_yards_share', 'wopr']

fantasy_cols = ['fantasy_points_ppr_per_game']

# Reordering the columns and filtering to >= 8 games played
final_cols = ['player_id', 'player_display_name', 'position', 'years_of_experience', 'season', 'recent_team', 'new_team_next_year', 
              'games'] + rushing_cols + receiving_cols + fantasy_cols

rb_per_game = rb_per_game[final_cols]

rb_per_game_filtered = rb_per_game[rb_per_game['games'] >= 8].round(2)

rb_per_game_filtered = rb_per_game_filtered.sort_values(by = ['player_id', 'season'])

# Creating the target variable
rb_per_game_filtered['fantasy_points_ppr_per_game_next_year'] = (
    rb_per_game_filtered.groupby('player_id')['fantasy_points_ppr_per_game'].shift(-1)
)

rb_per_game_filtered.head()

Unnamed: 0,player_id,player_display_name,position,years_of_experience,season,recent_team,new_team_next_year,games,carries_per_game,rushing_yards_per_game,rushing_tds_per_game,rushing_fumbles_per_game,rushing_first_downs_per_game,rushing_epa_per_game,receptions_per_game,targets_per_game,receiving_yards_per_game,receiving_tds_per_game,receiving_fumbles_per_game,receiving_air_yards_per_game,receiving_yards_after_catch_per_game,receiving_first_downs_per_game,receiving_epa_per_game,racr,target_share,air_yards_share,wopr,fantasy_points_ppr_per_game,fantasy_points_ppr_per_game_next_year
103,00-0023500,Frank Gore,RB,10,2015,IND,0,16,16.25,60.44,0.38,0.25,3.06,-3.11,2.12,3.62,16.69,0.06,0.0,-0.81,17.06,0.62,-0.03,-20.54,0.1,-0.0,0.14,12.09,13.39
104,00-0023500,Frank Gore,RB,11,2016,IND,0,16,16.44,64.06,0.25,0.12,3.12,-1.53,2.38,2.94,17.31,0.25,0.0,0.06,17.88,0.88,0.63,277.0,0.08,0.0,0.12,13.39,10.85
105,00-0023500,Frank Gore,RB,12,2017,IND,1,16,16.31,60.06,0.19,0.19,3.12,-1.5,1.81,2.38,15.31,0.06,0.0,-1.62,17.19,0.56,0.3,-9.42,0.08,-0.01,0.12,10.85,7.33
106,00-0023500,Frank Gore,RB,13,2018,MIA,1,14,11.14,51.57,0.0,0.07,1.79,-0.28,0.86,1.14,8.86,0.07,0.0,1.5,8.21,0.57,0.49,5.9,0.04,0.01,0.06,7.33,6.33
107,00-0023500,Frank Gore,RB,14,2019,BUF,1,15,11.07,39.93,0.13,0.0,2.2,-2.19,0.87,1.07,6.67,0.0,0.0,0.93,5.87,0.33,0.07,7.14,0.03,0.0,0.05,6.33,6.68


In [255]:
# Observing potential missing values (NaN values are expected in the 2024 season for the target variable)
rb_per_game_filtered_no_2024 = rb_per_game_filtered[rb_per_game_filtered['season'] != 2024]

print(f'There are {rb_per_game_filtered_no_2024.isna().any(axis = 1).sum()} unexpected rows with NaN values\n')

rb_na_obs = rb_per_game_filtered_no_2024[rb_per_game_filtered_no_2024.isna().any(axis = 1)]

rb_na_obs

# Missing values mostly appear to be due to lack of playing time or retirement in the following year 

There are 235 unexpected rows with NaN values



Unnamed: 0,player_id,player_display_name,position,years_of_experience,season,recent_team,new_team_next_year,games,carries_per_game,rushing_yards_per_game,rushing_tds_per_game,rushing_fumbles_per_game,rushing_first_downs_per_game,rushing_epa_per_game,receptions_per_game,targets_per_game,receiving_yards_per_game,receiving_tds_per_game,receiving_fumbles_per_game,receiving_air_yards_per_game,receiving_yards_after_catch_per_game,receiving_first_downs_per_game,receiving_epa_per_game,racr,target_share,air_yards_share,wopr,fantasy_points_ppr_per_game,fantasy_points_ppr_per_game_next_year
108,00-0023500,Frank Gore,RB,15,2020,NYJ,0,15,12.47,43.53,0.13,0.07,2.2,-2.75,1.07,1.27,5.93,0.0,0.0,2.27,4.87,0.2,0.03,2.62,0.04,0.01,0.07,6.68,
111,00-0023564,Darren Sproles,RB,11,2016,PHI,0,15,6.27,29.2,0.13,0.0,1.47,-0.18,3.47,4.73,28.47,0.13,0.0,2.33,27.13,1.27,0.13,12.2,0.12,0.01,0.18,10.83,
133,00-0024204,Fred Jackson,RB,9,2015,SEA,0,15,1.73,6.67,0.0,0.0,0.33,-0.32,2.13,2.73,17.13,0.13,0.07,4.73,17.07,0.8,0.61,3.62,0.09,0.02,0.14,5.18,
135,00-0024217,Reggie Bush,RB,10,2016,BUF,0,10,1.2,-0.3,0.1,0.1,0.4,-0.53,0.7,1.0,9.0,0.0,0.0,7.4,4.0,0.6,0.56,1.22,0.02,0.02,0.04,2.37,
145,00-0024242,DeAngelo Williams,RB,10,2016,PIT,0,8,12.25,42.88,0.5,0.0,2.75,-0.83,2.25,3.38,14.75,0.25,0.0,-2.5,16.5,0.75,0.1,-5.9,0.05,-0.0,0.07,12.51,
190,00-0025394,Adrian Peterson,RB,13,2020,DET,1,16,9.75,37.75,0.44,0.0,2.19,-0.76,0.75,1.12,6.31,0.0,0.0,-0.38,6.44,0.25,0.11,-16.83,0.03,-0.0,0.05,7.78,
199,00-0025399,Marshawn Lynch,RB,10,2017,LV,0,15,13.8,59.4,0.47,0.07,3.07,-1.02,1.33,2.07,10.07,0.0,0.0,1.0,10.93,0.4,-0.2,10.07,0.06,0.0,0.09,10.95,
228,00-0026019,Danny Woodhead,RB,9,2017,BAL,0,8,1.75,7.0,0.0,0.0,0.25,-0.35,4.12,4.88,25.0,0.0,0.0,13.5,14.75,1.12,0.06,1.85,0.07,0.03,0.12,7.32,
244,00-0026144,Darren McFadden,RB,7,2015,DAL,0,16,14.94,68.06,0.19,0.19,3.31,-0.82,2.5,3.31,20.5,0.0,0.0,-0.25,19.81,0.75,-0.57,-82.0,0.1,-0.0,0.15,12.23,
249,00-0026153,Jonathan Stewart,RB,9,2017,CAR,1,15,13.2,45.33,0.4,0.2,2.87,-2.61,0.53,1.0,3.47,0.07,0.0,-1.13,4.6,0.13,-0.26,-3.06,0.03,-0.0,0.04,7.81,


In [256]:
# Dropping rows with NaN values from years < 2024 and from the specified columns
rb_per_game_filtered = rb_per_game_filtered.drop(index = rb_na_obs.index).dropna(subset = ['rushing_epa_per_game', 
                                                                                  'receiving_epa_per_game', 'racr'])

rb_per_game_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 727 entries, 103 to 5814
Data columns (total 29 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   player_id                              727 non-null    object 
 1   player_display_name                    727 non-null    object 
 2   position                               727 non-null    object 
 3   years_of_experience                    727 non-null    int64  
 4   season                                 727 non-null    int64  
 5   recent_team                            727 non-null    object 
 6   new_team_next_year                     727 non-null    int64  
 7   games                                  727 non-null    int64  
 8   carries_per_game                       727 non-null    float64
 9   rushing_yards_per_game                 727 non-null    float64
 10  rushing_tds_per_game                   727 non-null    float64
 11  rushing_

In [257]:
# Creating the WR only dataframes which contains player info and stats specific to the position
wr_all = all_seasons[all_seasons['position'] == 'WR']

wr_filtered = wr_all[['player_id', 'player_display_name', 'position', 'years_of_experience', 'season', 'recent_team', 
                      'new_team_next_year', 'games', 'carries', 'rushing_yards', 'rushing_tds', 'rushing_fumbles', 'rushing_first_downs',
                      'receptions', 'targets', 'receiving_yards', 'receiving_tds', 'receiving_fumbles', 'receiving_air_yards',
                      'receiving_yards_after_catch', 'receiving_first_downs', 'receiving_epa', 'racr', 'target_share', 'air_yards_share',
                      'wopr', 'fantasy_points_ppr'
             ]].copy()

wr_filtered.sort_values(by = 'fantasy_points_ppr', ascending = False).head()

Unnamed: 0,player_id,player_display_name,position,years_of_experience,season,recent_team,new_team_next_year,games,carries,rushing_yards,rushing_tds,rushing_fumbles,rushing_first_downs,receptions,targets,receiving_yards,receiving_tds,receiving_fumbles,receiving_air_yards,receiving_yards_after_catch,receiving_first_downs,receiving_epa,racr,target_share,air_yards_share,wopr,fantasy_points_ppr
3413,00-0033908,Cooper Kupp,WR,4,2021,LA,0,17,4,18,0,0,1,145,191,1947,16,0,1641,846,89,112.935848,1.186472,0.317276,0.319634,0.699657,439.5
4850,00-0036358,CeeDee Lamb,WR,3,2023,DAL,0,17,14,113,2,0,6,135,181,1749,12,3,1722,680,81,103.519985,1.015679,0.299174,0.359199,0.7002,403.2
5142,00-0036900,Ja'Marr Chase,WR,3,2024,CIN,0,17,3,32,0,0,2,127,175,1708,17,0,1526,787,75,77.010441,1.119266,0.278662,0.330733,0.649506,403.0
517,00-0027793,Antonio Brown,WR,5,2015,PIT,0,16,3,28,0,0,1,136,193,1834,10,1,2110,593,84,58.203196,0.869194,0.330479,0.358234,0.746483,390.2
2781,00-0033040,Tyreek Hill,WR,7,2023,MIA,0,16,6,15,0,0,2,119,171,1799,13,1,1847,652,83,91.748831,0.974012,0.311475,0.428936,0.767469,376.4


In [258]:
# Transforming the count stats into per-game stats to make the values more robust to injuries 
wr_count_stats = ['carries', 'rushing_yards', 'rushing_tds', 'rushing_fumbles', 'rushing_first_downs', 'receptions', 
                  'targets', 'receiving_yards', 'receiving_tds', 'receiving_fumbles', 'receiving_air_yards', 
                  'receiving_yards_after_catch', 'receiving_first_downs', 'receiving_epa', 'fantasy_points_ppr']


wr_per_game = wr_filtered.copy()

for stat in wr_count_stats:
    wr_per_game[stat + '_per_game'] = (wr_per_game[stat] / wr_per_game['games'].replace(0, pd.NA))

wr_per_game = wr_per_game[['player_id', 'player_display_name', 'position', 'years_of_experience', 'season', 'recent_team', 
                           'new_team_next_year', 'games', 'racr', 'target_share', 'air_yards_share', 'wopr'] + [
                               col for col in wr_per_game.columns if col.endswith('_per_game')]]

rushing_cols = ['carries_per_game', 'rushing_yards_per_game', 'rushing_tds_per_game',
                'rushing_fumbles_per_game', 'rushing_first_downs_per_game']

receiving_cols = ['receptions_per_game', 'targets_per_game', 'receiving_yards_per_game',
                  'receiving_tds_per_game', 'receiving_fumbles_per_game', 'receiving_air_yards_per_game',
                  'receiving_yards_after_catch_per_game', 'receiving_first_downs_per_game',
                  'receiving_epa_per_game', 'racr', 'target_share', 'air_yards_share', 'wopr']

fantasy_cols = ['fantasy_points_ppr_per_game']

# Ordering the final columns and filtering to only include players with >= 8 games played
final_cols = ['player_id', 'player_display_name', 'position', 'years_of_experience', 'season', 'recent_team', 'new_team_next_year', 
              'games'] + receiving_cols + rushing_cols + fantasy_cols

wr_per_game = wr_per_game[final_cols]

wr_per_game_filtered = wr_per_game[wr_per_game['games'] >= 8].round(2)

wr_per_game_filtered = wr_per_game_filtered.sort_values(by = ['player_id', 'season'])

# Creating the target variable
wr_per_game_filtered['fantasy_points_ppr_per_game_next_year'] = (
    wr_per_game_filtered.groupby('player_id')['fantasy_points_ppr_per_game'].shift(-1)
)

wr_per_game_filtered.head()

Unnamed: 0,player_id,player_display_name,position,years_of_experience,season,recent_team,new_team_next_year,games,receptions_per_game,targets_per_game,receiving_yards_per_game,receiving_tds_per_game,receiving_fumbles_per_game,receiving_air_yards_per_game,receiving_yards_after_catch_per_game,receiving_first_downs_per_game,receiving_epa_per_game,racr,target_share,air_yards_share,wopr,carries_per_game,rushing_yards_per_game,rushing_tds_per_game,rushing_fumbles_per_game,rushing_first_downs_per_game,fantasy_points_ppr_per_game,fantasy_points_ppr_per_game_next_year
12,00-0020337,Steve Smith,WR,15,2016,BAL,0,14,5.0,7.21,57.07,0.36,0.0,62.36,20.07,2.79,2.71,0.92,0.15,0.17,0.35,0.0,0.0,0.0,0.0,0.0,13.14,
35,00-0022044,Andre Johnson,WR,12,2015,IND,1,16,2.56,4.81,31.44,0.25,0.0,49.5,9.19,1.94,0.83,0.64,0.13,0.15,0.3,0.0,0.0,0.0,0.0,0.0,7.21,
37,00-0022084,Anquan Boldin,WR,12,2015,SF,1,14,4.93,7.93,56.36,0.29,0.07,63.86,20.14,2.5,0.02,0.88,0.21,0.23,0.48,0.0,0.0,0.0,0.0,0.0,12.14,10.84
38,00-0022084,Anquan Boldin,WR,13,2016,DET,0,16,4.19,5.94,36.5,0.5,0.0,34.12,14.44,2.56,1.75,1.07,0.16,0.12,0.33,0.0,0.0,0.0,0.0,0.0,10.84,
44,00-0022414,Malcom Floyd,WR,11,2015,LAC,0,15,2.0,4.6,37.4,0.2,0.07,92.47,5.73,1.27,-0.11,0.4,0.1,0.29,0.36,0.0,0.0,0.0,0.0,0.0,6.81,


In [259]:
# Observing potential missing values (NaN values are expected in the 2024 season for the target variable)
wr_per_game_filtered_no_2024 = wr_per_game_filtered[wr_per_game_filtered['season'] != 2024]

print(f'There are {wr_per_game_filtered_no_2024.isna().any(axis = 1).sum()} unexpected rows with NaN values\n')

wr_na_obs = wr_per_game_filtered_no_2024[wr_per_game_filtered_no_2024.isna().any(axis = 1)]

wr_na_obs

# Missing values mostly appear to be due to lack of playing time or retirement in the following year 

There are 340 unexpected rows with NaN values



Unnamed: 0,player_id,player_display_name,position,years_of_experience,season,recent_team,new_team_next_year,games,receptions_per_game,targets_per_game,receiving_yards_per_game,receiving_tds_per_game,receiving_fumbles_per_game,receiving_air_yards_per_game,receiving_yards_after_catch_per_game,receiving_first_downs_per_game,receiving_epa_per_game,racr,target_share,air_yards_share,wopr,carries_per_game,rushing_yards_per_game,rushing_tds_per_game,rushing_fumbles_per_game,rushing_first_downs_per_game,fantasy_points_ppr_per_game,fantasy_points_ppr_per_game_next_year
12,00-0020337,Steve Smith,WR,15,2016,BAL,0,14,5.0,7.21,57.07,0.36,0.0,62.36,20.07,2.79,2.71,0.92,0.15,0.17,0.35,0.0,0.0,0.0,0.0,0.0,13.14,
35,00-0022044,Andre Johnson,WR,12,2015,IND,1,16,2.56,4.81,31.44,0.25,0.0,49.5,9.19,1.94,0.83,0.64,0.13,0.15,0.3,0.0,0.0,0.0,0.0,0.0,7.21,
38,00-0022084,Anquan Boldin,WR,13,2016,DET,0,16,4.19,5.94,36.5,0.5,0.0,34.12,14.44,2.56,1.75,1.07,0.16,0.12,0.33,0.0,0.0,0.0,0.0,0.0,10.84,
44,00-0022414,Malcom Floyd,WR,11,2015,LAC,0,15,2.0,4.6,37.4,0.2,0.07,92.47,5.73,1.27,-0.11,0.4,0.1,0.29,0.36,0.0,0.0,0.0,0.0,0.0,6.81,
57,00-0022811,Jerricho Cotchery,WR,11,2015,CAR,0,14,2.79,3.86,34.64,0.21,0.0,32.21,14.0,1.86,2.18,1.08,0.11,0.09,0.22,0.07,1.14,0.0,0.0,0.07,7.65,
64,00-0022921,Larry Fitzgerald,WR,16,2020,ARI,0,13,4.15,5.54,31.46,0.08,0.0,33.23,14.23,1.92,-0.14,0.95,0.13,0.1,0.27,0.0,0.0,0.0,0.0,0.0,7.76,
82,00-0023310,Lance Moore,WR,10,2015,DET,0,13,2.23,3.31,25.92,0.31,0.08,32.38,4.54,1.54,0.71,0.8,0.07,0.1,0.18,0.0,0.0,0.0,0.0,0.0,6.52,
83,00-0023367,Nate Washington,WR,10,2015,HOU,0,13,3.62,7.23,50.62,0.31,0.0,82.08,12.46,2.54,1.05,0.62,0.15,0.18,0.36,0.0,0.0,0.0,0.0,0.0,10.52,
99,00-0023462,Roddy White,WR,10,2015,ATL,0,15,2.87,4.67,33.73,0.07,0.0,46.93,7.0,1.8,0.21,0.72,0.11,0.15,0.27,0.0,0.0,0.0,0.0,0.0,6.64,
101,00-0023496,Vincent Jackson,WR,10,2015,TB,0,10,3.3,6.2,54.3,0.3,0.0,83.1,10.3,2.8,2.35,0.65,0.12,0.16,0.29,0.0,0.0,0.0,0.0,0.0,10.73,


In [260]:
# Dropping rows with NaN values from years < 2024 and from the specified columns
wr_per_game_filtered = wr_per_game_filtered.drop(index = wr_na_obs.index).dropna(subset = ['receiving_epa_per_game', 'racr'])

wr_per_game_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1199 entries, 37 to 5812
Data columns (total 28 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   player_id                              1199 non-null   object 
 1   player_display_name                    1199 non-null   object 
 2   position                               1199 non-null   object 
 3   years_of_experience                    1199 non-null   int64  
 4   season                                 1199 non-null   int64  
 5   recent_team                            1199 non-null   object 
 6   new_team_next_year                     1199 non-null   int64  
 7   games                                  1199 non-null   int64  
 8   receptions_per_game                    1199 non-null   float64
 9   targets_per_game                       1199 non-null   float64
 10  receiving_yards_per_game               1199 non-null   float64
 11  receivin

In [261]:
# Creating the TE only dataframes which contains player info and stats specific to the position
te_all = all_seasons[all_seasons['position'] == 'TE']

te_filtered = te_all[['player_id', 'player_display_name', 'position', 'years_of_experience', 'season', 'recent_team', 
                      'new_team_next_year', 'games', 'receptions', 'targets', 'receiving_yards', 'receiving_tds', 
                      'receiving_fumbles', 'receiving_air_yards', 'receiving_yards_after_catch', 'receiving_first_downs', 
                      'receiving_epa', 'racr', 'target_share', 'air_yards_share', 'wopr', 'fantasy_points_ppr'
             ]].copy()

te_filtered.sort_values(by = 'fantasy_points_ppr', ascending = False).head()

Unnamed: 0,player_id,player_display_name,position,years_of_experience,season,recent_team,new_team_next_year,games,receptions,targets,receiving_yards,receiving_tds,receiving_fumbles,receiving_air_yards,receiving_yards_after_catch,receiving_first_downs,receiving_epa,racr,target_share,air_yards_share,wopr,fantasy_points_ppr
1329,00-0030506,Travis Kelce,TE,9,2022,KC,0,17,110,152,1338,12,1,1074,657,78,66.036887,1.24581,0.247557,0.227929,0.530886,316.3
1327,00-0030506,Travis Kelce,TE,7,2020,KC,0,15,105,145,1416,11,1,1243,587,79,75.219912,1.139179,0.237316,0.243582,0.526481,312.76
3915,00-0034753,Mark Andrews,TE,3,2021,BAL,0,17,107,153,1361,9,1,1574,450,75,56.071777,0.864676,0.258883,0.300841,0.598914,301.1
1325,00-0030506,Travis Kelce,TE,5,2018,KC,0,16,103,150,1336,10,2,1375,568,68,69.233599,0.971636,0.265957,0.260762,0.58147,294.6
1120,00-0030061,Zach Ertz,TE,5,2018,PHI,0,16,116,156,1163,8,1,1130,353,66,19.195038,1.029204,0.263514,0.253534,0.572744,280.3


In [262]:
# Transforming the count stats into per-game stats to make the values more robust to injuries 
te_count_stats = ['receptions', 'targets', 'receiving_yards', 'receiving_tds', 'receiving_fumbles', 'receiving_air_yards', 
                  'receiving_yards_after_catch', 'receiving_first_downs', 'receiving_epa', 'fantasy_points_ppr']


te_per_game = te_filtered.copy()

for stat in te_count_stats:
    te_per_game[stat + '_per_game'] = (te_per_game[stat] / te_per_game['games'].replace(0, pd.NA))

te_per_game = te_per_game[['player_id', 'player_display_name', 'position', 'years_of_experience', 'season', 'recent_team', 
                           'new_team_next_year', 'games', 'racr', 'target_share', 'air_yards_share', 'wopr'] + [
                               col for col in te_per_game.columns if col.endswith('_per_game')]]

# Filtering to only include players with >= 8 games played
te_per_game_filtered = te_per_game[te_per_game['games'] >= 8].round(2)

te_per_game_filtered = te_per_game_filtered.sort_values(by = ['player_id', 'season'])

# Creating the target variable
te_per_game_filtered['fantasy_points_ppr_per_game_next_year'] = (
    te_per_game_filtered.groupby('player_id')['fantasy_points_ppr_per_game'].shift(-1)
)

te_per_game_filtered.head()

Unnamed: 0,player_id,player_display_name,position,years_of_experience,season,recent_team,new_team_next_year,games,racr,target_share,air_yards_share,wopr,receptions_per_game,targets_per_game,receiving_yards_per_game,receiving_tds_per_game,receiving_fumbles_per_game,receiving_air_yards_per_game,receiving_yards_after_catch_per_game,receiving_first_downs_per_game,receiving_epa_per_game,fantasy_points_ppr_per_game,fantasy_points_ppr_per_game_next_year
29,00-0021547,Antonio Gates,TE,12,2015,LAC,0,11,0.91,0.13,0.14,0.29,5.09,7.73,57.27,0.45,0.0,63.27,21.73,3.0,2.03,13.55,11.37
30,00-0021547,Antonio Gates,TE,13,2016,LAC,0,13,0.71,0.16,0.16,0.36,4.08,7.15,42.15,0.54,0.08,59.15,14.46,2.62,0.91,11.37,4.97
31,00-0021547,Antonio Gates,TE,14,2017,LAC,0,16,0.73,0.09,0.09,0.2,1.88,3.25,19.75,0.19,0.0,27.0,4.12,1.0,0.19,4.97,4.71
32,00-0021547,Antonio Gates,TE,15,2018,LAC,0,16,0.94,0.09,0.09,0.2,1.75,2.81,20.81,0.12,0.06,22.25,7.88,1.19,0.36,4.71,
39,00-0022127,Jason Witten,TE,12,2015,DAL,0,16,0.98,0.2,0.18,0.43,4.81,6.5,44.56,0.19,0.06,45.44,14.38,2.25,-0.16,10.27,10.15


In [263]:
# Observing potential missing values (NaN values are expected in the 2024 season for the target variable)
te_per_game_filtered_no_2024 = te_per_game_filtered[te_per_game_filtered['season'] != 2024]

print(f'There are {te_per_game_filtered_no_2024.isna().any(axis = 1).sum()} unexpected rows with NaN values\n')

te_na_obs = te_per_game_filtered_no_2024[te_per_game_filtered_no_2024.isna().any(axis = 1)]

te_na_obs

# Missing values appear to be due to lack of playing time or retirement in the following year 

There are 170 unexpected rows with NaN values



Unnamed: 0,player_id,player_display_name,position,years_of_experience,season,recent_team,new_team_next_year,games,racr,target_share,air_yards_share,wopr,receptions_per_game,targets_per_game,receiving_yards_per_game,receiving_tds_per_game,receiving_fumbles_per_game,receiving_air_yards_per_game,receiving_yards_after_catch_per_game,receiving_first_downs_per_game,receiving_epa_per_game,fantasy_points_ppr_per_game,fantasy_points_ppr_per_game_next_year
32,00-0021547,Antonio Gates,TE,15,2018,LAC,0,16,0.94,0.09,0.09,0.2,1.75,2.81,20.81,0.12,0.06,22.25,7.88,1.19,0.36,4.71,
43,00-0022127,Jason Witten,TE,17,2020,LV,0,10,0.76,0.03,0.02,0.06,1.3,1.7,6.9,0.2,0.0,9.1,2.0,0.8,0.24,3.19,
81,00-0022943,Benjamin Watson,TE,15,2019,NE,0,9,0.81,0.04,0.05,0.09,1.89,2.67,19.22,0.0,0.0,23.78,9.22,1.0,1.13,3.81,
100,00-0023465,Heath Miller,TE,10,2015,PIT,0,15,1.15,0.14,0.08,0.26,4.0,5.4,35.67,0.13,0.0,31.0,14.13,1.93,1.33,8.51,
139,00-0024221,Vernon Davis,TE,12,2018,WAS,0,13,0.88,0.07,0.1,0.18,1.92,2.77,28.23,0.15,0.0,32.0,12.92,1.23,1.46,5.82,
152,00-0024243,Marcedes Lewis,TE,15,2021,GB,0,13,2.08,0.05,0.02,0.09,1.77,2.15,16.46,0.0,0.08,7.92,11.08,0.77,0.66,3.42,
162,00-0024268,Anthony Fasano,TE,11,2017,MIA,0,11,0.9,0.03,0.02,0.06,1.09,1.45,9.73,0.09,0.09,10.82,3.36,0.55,-0.68,2.61,
168,00-0024313,Owen Daniels,TE,9,2015,DEN,0,16,0.98,0.13,0.1,0.26,2.88,4.81,32.31,0.19,0.0,33.0,16.56,1.69,0.23,7.23,
176,00-0024389,Delanie Walker,TE,11,2017,TEN,0,16,0.8,0.23,0.23,0.51,4.62,6.94,50.44,0.19,0.12,63.19,15.5,2.69,0.09,10.91,
208,00-0025418,Greg Olsen,TE,13,2020,SEA,0,11,0.83,0.07,0.06,0.15,2.18,3.36,21.73,0.09,0.0,26.27,4.45,1.45,0.19,4.9,


In [264]:
# Dropping rows with NaN values from years < 2024
te_per_game_filtered = te_per_game_filtered.drop(index = te_na_obs.index)

te_per_game_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 610 entries, 29 to 5792
Data columns (total 23 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   player_id                              610 non-null    object 
 1   player_display_name                    610 non-null    object 
 2   position                               610 non-null    object 
 3   years_of_experience                    610 non-null    int64  
 4   season                                 610 non-null    int64  
 5   recent_team                            610 non-null    object 
 6   new_team_next_year                     610 non-null    int64  
 7   games                                  610 non-null    int64  
 8   racr                                   610 non-null    float64
 9   target_share                           610 non-null    float64
 10  air_yards_share                        610 non-null    float64
 11  wopr     

In [265]:
# Saving the cleaned datasets to CSV files
qb_per_game_filtered.to_csv('../data/processed/qb_per_game_filtered.csv', index = False)
rb_per_game_filtered.to_csv('../data/processed/rb_per_game_filtered.csv', index = False)
wr_per_game_filtered.to_csv('../data/processed/wr_per_game_filtered.csv', index = False)
te_per_game_filtered.to_csv('../data/processed/te_per_game_filtered.csv', index = False)