In [113]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression as LR, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor as RFR
from xgboost import XGBRegressor as XGBR
from scipy.stats import loguniform, randint
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
import joblib

In [114]:
# Defining a function to use for finding the most appropriate model for each dataset
def run_search(X_train, y_train, name, n_iter = 50, n_splits = 5, random_state = 23):
    '''
    Runs RandomizedSearchCV over a scaler + regressor pipeline, trying linear regression,
    Lasso, Ridge, random forest and XGBoost candidates, and reports the candidate with
    the lowest cross-validated RMSE.

    Parameters
    ----------
    X_train : feature data passed to the search's fit method
    y_train : target values passed to the search's fit method
    name : label used as a prefix in the printed summary (e.g. 'QB')
    n_iter : number of parameter settings sampled by the search (default 50)
    n_splits : number of shuffled KFold splits (default 5)
    random_state : seed shared by the tree-based regressors, the KFold splitter and
        the parameter sampler (default 23)

    Returns
    -------
    search : the fitted RandomizedSearchCV object
    best_model : search.best_estimator_, the best-scoring pipeline
    '''
    # Lasso and Ridge are searched over the same grid of regularisation strengths
    alpha_grid = [.001, .01, .1, 1, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

    # One dict per model family; the 'reg' entry swaps the pipeline's final step
    param_dist = [
        {
            'reg': [LR()]
        },
        {
            'reg': [Lasso(max_iter = 5000)],
            'reg__alpha': alpha_grid
        },
        {
            'reg': [Ridge(max_iter = 5000)],
            'reg__alpha': alpha_grid
        },
        {
            'reg': [RFR(random_state = random_state)],
            'reg__max_depth': [None] + list(np.arange(3, 31, 3)),
            'reg__n_estimators': np.arange(100, 1001, 100),
            'reg__min_samples_split': randint(2, 15),
            'reg__min_samples_leaf': randint(1, 10),
            # 'log2' is the valid string option; 'log' is rejected by the random forest,
            # so every candidate that sampled it failed to fit and was scored as NaN
            'reg__max_features': ['sqrt', 'log2', .5, .75]
        },
        {
            'reg': [XGBR(random_state = random_state)],
            'reg__max_depth': np.arange(2, 15),
            'reg__n_estimators': np.arange(100, 1001, 100),
            # Listed once: a repeated key in a dict literal silently keeps only the last value
            'reg__learning_rate': loguniform(0.01, 0.3),
            'reg__subsample': [.6, .8, 1.0],
            'reg__colsample_bytree': [.6, .8, 1.0],
            'reg__gamma': [0, 1, 5],
            'reg__reg_alpha': loguniform(1e-3, 10),
            'reg__reg_lambda': loguniform(1e-3, 10),
            'reg__min_child_weight': np.arange(1, 10),
            'reg__max_delta_step': [0, 1, 5]
        }
    ]

    # The LR() step is only a placeholder; each candidate replaces it through the 'reg' parameter
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('reg', LR())
    ])

    cv = KFold(n_splits = n_splits, shuffle = True, random_state = random_state)

    search = RandomizedSearchCV(pipe, param_dist, scoring = 'neg_root_mean_squared_error', cv = cv,
                                n_iter = n_iter, n_jobs = -1, verbose = 1, random_state = random_state)

    search.fit(X_train, y_train)

    best_params = search.best_params_
    # The scorer is negated RMSE, so flip the sign to report a positive error
    best_score = -search.best_score_
    best_model = search.best_estimator_

    print(f'\n{name} Best Parameters: {best_params}')
    print(f'\n{name} Best RMSE: {best_score:.3f}')
    print(f'\n{name} Best Model: {best_model}')

    return search, best_model

In [115]:
# Lift pandas' display limits so no columns or rows are truncated when frames are shown
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [116]:
# Paths of the cleaned per-game CSV files, keyed by position
processed_csvs = {
    'qb': '../data/processed/qb_per_game_filtered.csv',
    'rb': '../data/processed/rb_per_game_filtered.csv',
    'wr': '../data/processed/wr_per_game_filtered.csv',
    'te': '../data/processed/te_per_game_filtered.csv',
}

# Read each position's file into its own DataFrame
qb = pd.read_csv(processed_csvs['qb'])
rb = pd.read_csv(processed_csvs['rb'])
wr = pd.read_csv(processed_csvs['wr'])
te = pd.read_csv(processed_csvs['te'])

In [117]:
# Columns removed before modelling: the target itself plus the columns not used as features
non_feature_cols_qb = ['fantasy_points_ppr_per_game_next_year', 'fantasy_points_ppr_per_game',
                       'player_id', 'player_display_name', 'position', 'season', 'recent_team']

# Rows from the 2024 season form the test set (target column removed); all other rows are for training
is_2024_qb = qb['season'] == 2024
train_qb = qb[~is_2024_qb]
test_qb = qb[is_2024_qb].drop(columns = 'fantasy_points_ppr_per_game_next_year')

# Feature matrix and target vector built from the training rows
X_qb = train_qb.drop(columns = non_feature_cols_qb)
y_qb = train_qb['fantasy_points_ppr_per_game_next_year']

# Hold out 20% of the training rows as a validation set
X_train_qb, X_val_qb, y_train_qb, y_val_qb = train_test_split(
    X_qb, y_qb, test_size = .2, random_state = 23
)

X_train_qb.info()

<class 'pandas.core.frame.DataFrame'>
Index: 201 entries, 18 to 85
Data columns (total 20 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   years_of_experience             201 non-null    int64  
 1   new_team_next_year              201 non-null    int64  
 2   games                           201 non-null    int64  
 3   completions_per_game            201 non-null    float64
 4   attempts_per_game               201 non-null    float64
 5   passing_yards_per_game          201 non-null    float64
 6   passing_tds_per_game            201 non-null    float64
 7   passing_interceptions_per_game  201 non-null    float64
 8   sacks_suffered_per_game         201 non-null    float64
 9   sack_fumbles_per_game           201 non-null    float64
 10  passing_first_downs_per_game    201 non-null    float64
 11  passing_epa_per_game            201 non-null    float64
 12  passing_cpoe_per_game           201 non-n

In [118]:
# Search the candidate models on the QB training data, keeping the fitted search and its best pipeline
search_qb, best_model_qb = run_search(X_train = X_train_qb, y_train = y_train_qb, name = 'QB')

Fitting 5 folds for each of 50 candidates, totalling 250 fits

QB Best Parameters: {'reg': Ridge(max_iter=5000), 'reg__alpha': 80}

QB Best RMSE: 3.690

QB Best Model: Pipeline(steps=[('scaler', StandardScaler()),
                ('reg', Ridge(alpha=80, max_iter=5000))])


In [119]:
# Columns removed before modelling: the target itself plus the columns not used as features
non_feature_cols_rb = ['fantasy_points_ppr_per_game_next_year', 'fantasy_points_ppr_per_game',
                       'player_id', 'player_display_name', 'position', 'season', 'recent_team']

# Rows from the 2024 season form the test set (target column removed); all other rows are for training
is_2024_rb = rb['season'] == 2024
train_rb = rb[~is_2024_rb]
test_rb = rb[is_2024_rb].drop(columns = 'fantasy_points_ppr_per_game_next_year')

# Feature matrix and target vector built from the training rows
X_rb = train_rb.drop(columns = non_feature_cols_rb)
y_rb = train_rb['fantasy_points_ppr_per_game_next_year']

# Hold out 20% of the training rows as a validation set
X_train_rb, X_val_rb, y_train_rb, y_val_rb = train_test_split(
    X_rb, y_rb, test_size = .2, random_state = 23
)

X_train_rb.info()

<class 'pandas.core.frame.DataFrame'>
Index: 505 entries, 425 to 644
Data columns (total 22 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   years_of_experience                   505 non-null    int64  
 1   new_team_next_year                    505 non-null    int64  
 2   games                                 505 non-null    int64  
 3   carries_per_game                      505 non-null    float64
 4   rushing_yards_per_game                505 non-null    float64
 5   rushing_tds_per_game                  505 non-null    float64
 6   rushing_fumbles_per_game              505 non-null    float64
 7   rushing_first_downs_per_game          505 non-null    float64
 8   rushing_epa_per_game                  505 non-null    float64
 9   receptions_per_game                   505 non-null    float64
 10  targets_per_game                      505 non-null    float64
 11  receiving_yards_per_ga

In [120]:
# Search the candidate models on the RB training data, keeping the fitted search and its best pipeline
search_rb, best_model_rb = run_search(X_train = X_train_rb, y_train = y_train_rb, name = 'RB')

Fitting 5 folds for each of 50 candidates, totalling 250 fits

RB Best Parameters: {'reg': Ridge(max_iter=5000), 'reg__alpha': 80}

RB Best RMSE: 3.963

RB Best Model: Pipeline(steps=[('scaler', StandardScaler()),
                ('reg', Ridge(alpha=80, max_iter=5000))])


In [121]:
# Columns removed before modelling: the target itself plus the columns not used as features
non_feature_cols_wr = ['fantasy_points_ppr_per_game_next_year', 'fantasy_points_ppr_per_game',
                       'player_id', 'player_display_name', 'position', 'season', 'recent_team']

# Rows from the 2024 season form the test set (target column removed); all other rows are for training
is_2024_wr = wr['season'] == 2024
train_wr = wr[~is_2024_wr]
test_wr = wr[is_2024_wr].drop(columns = 'fantasy_points_ppr_per_game_next_year')

# Feature matrix and target vector built from the training rows
X_wr = train_wr.drop(columns = non_feature_cols_wr)
y_wr = train_wr['fantasy_points_ppr_per_game_next_year']

# Hold out 20% of the training rows as a validation set
X_train_wr, X_val_wr, y_train_wr, y_val_wr = train_test_split(
    X_wr, y_wr, test_size = .2, random_state = 23
)

X_train_wr.info()

<class 'pandas.core.frame.DataFrame'>
Index: 833 entries, 76 to 617
Data columns (total 21 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   years_of_experience                   833 non-null    int64  
 1   new_team_next_year                    833 non-null    int64  
 2   games                                 833 non-null    int64  
 3   receptions_per_game                   833 non-null    float64
 4   targets_per_game                      833 non-null    float64
 5   receiving_yards_per_game              833 non-null    float64
 6   receiving_tds_per_game                833 non-null    float64
 7   receiving_fumbles_per_game            833 non-null    float64
 8   receiving_air_yards_per_game          833 non-null    float64
 9   receiving_yards_after_catch_per_game  833 non-null    float64
 10  receiving_first_downs_per_game        833 non-null    float64
 11  receiving_epa_per_game 

In [122]:
# Search the candidate models on the WR training data, keeping the fitted search and its best pipeline
search_wr, best_model_wr = run_search(X_train = X_train_wr, y_train = y_train_wr, name = 'WR')

Fitting 5 folds for each of 50 candidates, totalling 250 fits

WR Best Parameters: {'reg': Ridge(max_iter=5000), 'reg__alpha': 50}

WR Best RMSE: 3.291

WR Best Model: Pipeline(steps=[('scaler', StandardScaler()),
                ('reg', Ridge(alpha=50, max_iter=5000))])


In [123]:
# Columns removed before modelling: the target itself plus the columns not used as features
non_feature_cols_te = ['fantasy_points_ppr_per_game_next_year', 'fantasy_points_ppr_per_game',
                       'player_id', 'player_display_name', 'position', 'season', 'recent_team']

# Rows from the 2024 season form the test set (target column removed); all other rows are for training
is_2024_te = te['season'] == 2024
train_te = te[~is_2024_te]
test_te = te[is_2024_te].drop(columns = 'fantasy_points_ppr_per_game_next_year')

# Feature matrix and target data built from the training rows
X_te = train_te.drop(columns = non_feature_cols_te)
y_te = train_te['fantasy_points_ppr_per_game_next_year']

# Hold out 20% of the training rows as a validation set
X_train_te, X_val_te, y_train_te, y_val_te = train_test_split(
    X_te, y_te, test_size = .2, random_state = 23
)

X_train_te.info()

<class 'pandas.core.frame.DataFrame'>
Index: 424 entries, 287 to 40
Data columns (total 16 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   years_of_experience                   424 non-null    int64  
 1   new_team_next_year                    424 non-null    int64  
 2   games                                 424 non-null    int64  
 3   racr                                  424 non-null    float64
 4   target_share                          424 non-null    float64
 5   air_yards_share                       424 non-null    float64
 6   wopr                                  424 non-null    float64
 7   receptions_per_game                   424 non-null    float64
 8   targets_per_game                      424 non-null    float64
 9   receiving_yards_per_game              424 non-null    float64
 10  receiving_tds_per_game                424 non-null    float64
 11  receiving_fumbles_per_g

In [124]:
# Search the candidate models on the TE training data, keeping the fitted search and its best pipeline
search_te, best_model_te = run_search(X_train = X_train_te, y_train = y_train_te, name = 'TE')

Fitting 5 folds for each of 50 candidates, totalling 250 fits

TE Best Parameters: {'reg': Lasso(max_iter=5000), 'reg__alpha': 0.1}

TE Best RMSE: 2.650

TE Best Model: Pipeline(steps=[('scaler', StandardScaler()),
                ('reg', Lasso(alpha=0.1, max_iter=5000))])


In [125]:
# Every object to persist, paired with its destination pkl file, in the order it is written:
# best pipelines first, then validation features, validation targets, and the test sets
artifacts = [
    (best_model_qb, '../models/best_model_qb.pkl'),
    (best_model_rb, '../models/best_model_rb.pkl'),
    (best_model_wr, '../models/best_model_wr.pkl'),
    (best_model_te, '../models/best_model_te.pkl'),

    (X_val_qb, '../data/processed/X_val_qb.pkl'),
    (X_val_rb, '../data/processed/X_val_rb.pkl'),
    (X_val_wr, '../data/processed/X_val_wr.pkl'),
    (X_val_te, '../data/processed/X_val_te.pkl'),

    (y_val_qb, '../data/processed/y_val_qb.pkl'),
    (y_val_rb, '../data/processed/y_val_rb.pkl'),
    (y_val_wr, '../data/processed/y_val_wr.pkl'),
    (y_val_te, '../data/processed/y_val_te.pkl'),

    (test_qb, '../data/processed/test_qb.pkl'),
    (test_rb, '../data/processed/test_rb.pkl'),
    (test_wr, '../data/processed/test_wr.pkl'),
    (test_te, '../data/processed/test_te.pkl'),
]

for artifact, pkl_path in artifacts:
    last_written = joblib.dump(artifact, pkl_path)

# Display the return value of the final dump as the cell output
last_written

['../data/processed/test_te.pkl']