# Import Data

In [8]:
# clear memory
# %reset -f

# helper.py
from helper import *

- use volume


In [59]:
# load data and sort
df = pd.read_csv('./data/clean/fantasy_data.csv').sort_values(by=['Key', 'Exp']).reset_index(drop=True)

# get injury-related cols
injury_cols = ['Player', 'Tm', 'Pos', 'Key', 'Year', 'Age', 'Exp', 'games_played_pct', 'games_started_pct', 'Pass_Att_per_game', 'Rush_Att_per_game', 'Rec_Tgt_per_game', 'Rec_Rec_per_game']
df = df[injury_cols]

# target is games played in next season
df['target'] = df.groupby('Key')['games_played_pct'].shift(-1)

# show shape and nulls
show_shape_and_nulls(df)

Shape: (29369, 14)
Null values:


Unnamed: 0,Player,Tm,Pos,Key,Year,Age,Exp,games_played_pct,games_started_pct,Pass_Att_per_game,Rush_Att_per_game,Rec_Tgt_per_game,Rec_Rec_per_game,target
0,0,0,0,0,0,0,0,0,0,0,0,0,0,6959


- The only null values are held in the "Target" column (indicating players in their final season). We cannot use rows without a target to train, so they will be dropped here.

In [60]:
# drop rows with null target values
df = df.dropna(subset='target')

# check
df.isna().sum().sum()

0

# Create Features

In [None]:
def create_features(df, target_col):
    """
    Create features for each player.

    Args:
    - df (pd.dataframe): Player data.

    Returns:
    - (pl.dataframe): Dataframe with new features added.
    """

    # convert to polars dataframe and sort
    df = pl.from_pandas(df).sort(["Key", "Year"])

    # convert to a lazy frame for efficiency
    lazy_df = df.lazy()

    # define cols to aggregate
    non_agg_cols = ['Player', 'Tm', 'Pos', 'Key', 'Year', 'Age', 'Exp'] + [target_col]
    agg_cols = [col for col in df.columns if col not in non_agg_cols]

    # list of expressions for original columns
    base_exprs = [pl.col('*')]

    # expressions that rely on prior aliases
    post_exprs = []

    # iterate through each column to be aggregated
    for col in agg_cols:
        # rolling stats (n years)
        for n in [2, 3]:
            base_exprs.extend([
                pl.col(col)
                .rolling_mean(window_size=n, min_samples=1)
                .over('Key')
                .alias(f'{col}_{n}y_mean'),
                pl.col(col)
                .rolling_std(window_size=n, min_samples=1)
                .over('Key')
                .alias(f'{col}_{n}y_std')])

        # cumulative career mean
        cum_sum = pl.col(col).cum_sum().over('Key')
        cum_count = (pl.col('Exp') + 1)
        cum_mean = (cum_sum / cum_count).alias(f'{col}_career_mean')
        base_exprs.extend([cum_mean])

    # add the new columns to df
    lazy_df = lazy_df.with_columns(base_exprs)

    # collect results back into a pandas df
    df_pandas = lazy_df.collect().to_pandas()

    # fill nulls and infs with 0
    non_target_cols = [col for col in df_pandas.columns if col != target_col]
    df_pandas[non_target_cols] = df_pandas[non_target_cols].replace([np.inf, -np.inf], np.nan).fillna(0)

    # sort columns 
    return df_pandas[sorted(df_pandas.columns)]

In [62]:
# aggregate data by player
features = create_features(df, target_col='target')

# show shape and nulls
show_shape_and_nulls(features)

Shape: (22410, 56)
Null values:


Unnamed: 0,Age,Exp,Key,Pass_Att_per_game,Pass_Att_per_game_2y_mean,Pass_Att_per_game_2y_std,Pass_Att_per_game_3y_mean,Pass_Att_per_game_3y_std,Pass_Att_per_game_4y_mean,Pass_Att_per_game_4y_std,Pass_Att_per_game_career_mean,Player,Pos,Rec_Rec_per_game,Rec_Rec_per_game_2y_mean,Rec_Rec_per_game_2y_std,Rec_Rec_per_game_3y_mean,Rec_Rec_per_game_3y_std,Rec_Rec_per_game_4y_mean,Rec_Rec_per_game_4y_std,Rec_Rec_per_game_career_mean,Rec_Tgt_per_game,Rec_Tgt_per_game_2y_mean,Rec_Tgt_per_game_2y_std,Rec_Tgt_per_game_3y_mean,Rec_Tgt_per_game_3y_std,Rec_Tgt_per_game_4y_mean,Rec_Tgt_per_game_4y_std,Rec_Tgt_per_game_career_mean,Rush_Att_per_game,Rush_Att_per_game_2y_mean,Rush_Att_per_game_2y_std,Rush_Att_per_game_3y_mean,Rush_Att_per_game_3y_std,Rush_Att_per_game_4y_mean,Rush_Att_per_game_4y_std,Rush_Att_per_game_career_mean,Tm,Year,games_played_pct,games_played_pct_2y_mean,games_played_pct_2y_std,games_played_pct_3y_mean,games_played_pct_3y_std,games_played_pct_4y_mean,games_played_pct_4y_std,games_played_pct_career_mean,games_started_pct,games_started_pct_2y_mean,games_started_pct_2y_std,games_started_pct_3y_mean,games_started_pct_3y_std,games_started_pct_4y_mean,games_started_pct_4y_std,games_started_pct_career_mean,target
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [63]:
# get 2024 features
features_2024 = features[features['Year'] == 2024].copy()

# drop players with null target values
features = features.dropna(subset='target')

# check
features.isna().sum().sum()

0

# Positional Subsets

In [64]:
# get positional subsets
qb, rb, wr_te = get_pos_subsets(features)

# show shapes
qb.shape, rb.shape, wr_te.shape

((3458, 40), (7089, 48), (11863, 48))

# Baseline
- Baseline RMSE scores for the 3 positional subsets using an XGBoost with 1000 trees.

In [58]:
# added 2yr stats
for pos, data in zip(['QB', 'RB', 'WR/TE'], [qb, rb, wr_te]):
    # cross validate
    model = XGBRegressor(n_jobs=-1, random_state=SEED, n_estimators=1000)
    summary = cross_val(df=data, target_col='target', estimator=model)

    # show results
    print(f'--- {pos} ---')
    display(summary)

--- QB ---


Unnamed: 0,train_rmse,train_r2,val_rmse,val_r2
mean,0.023505,0.99501,0.309306,0.148888
std,0.003312,0.001375,0.008444,0.031412


--- RB ---


Unnamed: 0,train_rmse,train_r2,val_rmse,val_r2
mean,0.044403,0.974904,0.300237,-0.147688
std,0.002047,0.002082,0.00459,0.020959


--- WR/TE ---


Unnamed: 0,train_rmse,train_r2,val_rmse,val_r2
mean,0.060292,0.954274,0.29932,-0.128511
std,0.000771,0.000846,0.006014,0.039049


In [65]:
# added 4yr stats
for pos, data in zip(['QB', 'RB', 'WR/TE'], [qb, rb, wr_te]):
    # cross validate
    model = XGBRegressor(n_jobs=-1, random_state=SEED, n_estimators=1000)
    summary = cross_val(df=data, target_col='target', estimator=model)

    # show results
    print(f'--- {pos} ---')
    display(summary)

--- QB ---


Unnamed: 0,train_rmse,train_r2,val_rmse,val_r2
mean,0.023517,0.995005,0.309643,0.147072
std,0.00332,0.00138,0.007586,0.026319


--- RB ---


Unnamed: 0,train_rmse,train_r2,val_rmse,val_r2
mean,0.04438,0.974929,0.301191,-0.154836
std,0.002052,0.002088,0.005707,0.018107


--- WR/TE ---


Unnamed: 0,train_rmse,train_r2,val_rmse,val_r2
mean,0.060207,0.954403,0.298371,-0.121539
std,0.000794,0.000857,0.003093,0.035686


- The train metrics are almost perfect, with much worse validation scores.
- We will run 100 iterations of bayesian optimization on each positional group to regularize and bring the validation scores down.