In [27]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import loguniform, randint
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.linear_model import LinearRegression as LR, Lasso, Ridge
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor as XGBR

In [2]:
def run_search(X_train, y_train, name, n_iter = 50, random_state = 23):
    """Run a randomized hyperparameter search across several regressor families.

    A two-step pipeline (``StandardScaler`` followed by a regressor) is tuned
    with ``RandomizedSearchCV``. The ``reg`` step is itself part of the search
    space, so linear regression, Lasso, Ridge, random forest and XGBoost all
    compete in the same search. Candidates are scored with 5-fold shuffled
    cross-validation on root mean squared error.

    Parameters
    ----------
    X_train : array-like
        Training features passed to ``search.fit``.
    y_train : array-like
        Training target passed to ``search.fit``.
    name : str
        Label used as a prefix in the printed summary (e.g. ``'QB'``).
    n_iter : int, default 50
        Number of parameter settings sampled by the search.
    random_state : int, default 23
        Seed shared by the stochastic estimators, the CV splitter and the
        parameter sampler.

    Returns
    -------
    tuple
        ``(search, best_model)``: the fitted ``RandomizedSearchCV`` object and
        its ``best_estimator_`` pipeline.
    """
    # Lasso and Ridge are tuned over the same regularisation grid.
    alphas = [.001, .01, .1, 1, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

    param_dist = [
        {
            'reg': [LR()]
        },
        {
            'reg': [Lasso(max_iter = 5000)],
            'reg__alpha': alphas
        },
        {
            'reg': [Ridge(max_iter = 5000)],
            'reg__alpha': alphas
        },
        {
            'reg': [RFR(random_state = random_state)],
            'reg__max_depth': [None] + list(np.arange(3, 31, 3)),
            'reg__n_estimators': np.arange(100, 1001, 100),
            'reg__min_samples_split': randint(2, 15),
            'reg__min_samples_leaf': randint(1, 10),
            # 'log2' (not 'log') is the valid string option; an invalid value
            # makes every candidate that samples it fail to fit and score NaN.
            'reg__max_features': ['sqrt', 'log2', .5, .75]
        },
        {
            'reg': [XGBR(random_state = random_state)],
            'reg__max_depth': np.arange(2, 15),
            'reg__n_estimators': np.arange(100, 1001, 100),
            'reg__learning_rate': loguniform(.1, .3),
            'reg__subsample': [.6, .8, 1.0],
            'reg__colsample_bytree': [.6, .8, 1.0],
            'reg__gamma': [0, 1, 5]
        }
    ]

    # The 'reg' step is a placeholder; the search swaps in each candidate estimator.
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('reg', LR())
    ])

    cv = KFold(n_splits = 5, shuffle = True, random_state = random_state)

    search = RandomizedSearchCV(pipe, param_dist, scoring = 'neg_root_mean_squared_error', cv = cv,
                                n_iter = n_iter, n_jobs = -1, verbose = 1, random_state = random_state)

    search.fit(X_train, y_train)

    best_params = search.best_params_
    # The scorer returns negated RMSE (higher is better); flip the sign for reporting.
    best_score = -search.best_score_
    best_model = search.best_estimator_

    print(f'\n{name} Best Parameters: {best_params}')
    print(f'\n{name} Best RMSE: {best_score:.3f}')
    print(f'\n{name} Best Model: {best_model}')

    return search, best_model

In [3]:
# Remove pandas' column/row display limits so frames are rendered in full.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [4]:
# Load the per-game, filtered data set for each position (QB, RB, WR, TE).
qb, rb, wr, te = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed/qb_per_game_filtered.csv',
        '../data/processed/rb_per_game_filtered.csv',
        '../data/processed/wr_per_game_filtered.csv',
        '../data/processed/te_per_game_filtered.csv',
    )
)

In [5]:
# Preview the first five rows of the QB frame.
qb.head(n = 5)

Unnamed: 0,player_id,player_display_name,position,season,recent_team,games,completions_per_game,attempts_per_game,passing_yards_per_game,passing_tds_per_game,passing_interceptions_per_game,sacks_suffered_per_game,sack_fumbles_per_game,passing_first_downs_per_game,passing_epa_per_game,passing_cpoe_per_game,pacr_per_game,carries_per_game,rushing_yards_per_game,rushing_tds_per_game,rushing_fumbles_per_game,rushing_first_downs_per_game,rushing_epa_per_game,fantasy_points_ppr_per_game,fantasy_points_ppr_per_game_next_year
0,00-0007091,Matt Hasselbeck,QB,2015,IND,8,19.5,32.0,211.2,1.1,0.6,2.0,0.4,10.6,-0.0,-0.3,0.1,2.0,1.9,0.0,0.0,0.2,-0.7,11.4,
1,00-0010346,Peyton Manning,QB,2015,DEN,10,19.8,33.1,224.9,0.9,1.7,1.6,0.1,11.0,-3.2,-0.2,0.1,0.6,-0.6,0.0,0.0,0.0,-0.4,9.1,
2,00-0019596,Tom Brady,QB,2015,NE,16,25.1,39.0,298.1,2.2,0.4,2.4,0.3,14.2,8.0,-0.0,0.1,2.1,3.3,0.2,0.1,0.9,0.4,21.5,21.5
3,00-0019596,Tom Brady,QB,2016,NE,12,24.2,36.0,296.2,2.3,0.2,1.2,0.2,13.6,12.2,0.3,0.1,2.3,5.3,0.0,0.1,0.9,-0.5,21.5,18.5
4,00-0019596,Tom Brady,QB,2017,NE,16,24.1,36.3,286.1,2.0,0.5,2.2,0.4,14.4,8.8,0.2,0.1,1.6,1.8,0.0,0.1,0.6,-0.8,18.5,17.6


In [6]:
# Modelling rows: every season other than 2024, keeping only rows with no missing values.
train_qb = qb.loc[qb['season'].ne(2024)].dropna()

# 2024 rows are set aside without the next-year target column.
test_qb = qb.loc[qb['season'].eq(2024)].drop(columns = 'fantasy_points_ppr_per_game_next_year')

# Target column.
y_qb = train_qb['fantasy_points_ppr_per_game_next_year']

# Features: everything except the target, the current-season PPR column and the
# identifier / bookkeeping columns.
X_qb = train_qb.drop(columns = [
    'fantasy_points_ppr_per_game_next_year', 'fantasy_points_ppr_per_game',
    'player_id', 'player_display_name', 'position', 'season', 'recent_team',
    'games',
])

# Seeded random 80/20 train/validation split.
X_train_qb, X_val_qb, y_train_qb, y_val_qb = train_test_split(
    X_qb, y_qb, random_state = 23, test_size = .2
)

train_qb.info()

<class 'pandas.core.frame.DataFrame'>
Index: 385 entries, 2 to 523
Data columns (total 25 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   player_id                              385 non-null    object 
 1   player_display_name                    385 non-null    object 
 2   position                               385 non-null    object 
 3   season                                 385 non-null    int64  
 4   recent_team                            385 non-null    object 
 5   games                                  385 non-null    int64  
 6   completions_per_game                   385 non-null    float64
 7   attempts_per_game                      385 non-null    float64
 8   passing_yards_per_game                 385 non-null    float64
 9   passing_tds_per_game                   385 non-null    float64
 10  passing_interceptions_per_game         385 non-null    float64
 11  sacks_suffe

In [7]:
# Tune and fit the QB model on the training split only.
search_qb, best_model_qb = run_search(X_train = X_train_qb, y_train = y_train_qb, name = 'QB')

Fitting 5 folds for each of 50 candidates, totalling 250 fits

QB Best Parameters: {'reg': RandomForestRegressor(random_state=23), 'reg__max_depth': 27, 'reg__max_features': 'sqrt', 'reg__min_samples_leaf': 8, 'reg__min_samples_split': 9, 'reg__n_estimators': 400}

QB Best RMSE: 4.887

QB Best Model: Pipeline(steps=[('scaler', StandardScaler()),
                ('reg',
                 RandomForestRegressor(max_depth=27, max_features='sqrt',
                                       min_samples_leaf=8, min_samples_split=9,
                                       n_estimators=400, random_state=23))])


In [8]:
# Modelling rows: every season other than 2024, keeping only rows with no missing values.
train_rb = rb.loc[rb['season'].ne(2024)].dropna()

# 2024 rows are set aside without the next-year target column.
test_rb = rb.loc[rb['season'].eq(2024)].drop(columns = 'fantasy_points_ppr_per_game_next_year')

# Target column.
y_rb = train_rb['fantasy_points_ppr_per_game_next_year']

# Features: everything except the target, the current-season PPR column and the
# identifier / bookkeeping columns.
X_rb = train_rb.drop(columns = [
    'fantasy_points_ppr_per_game_next_year', 'fantasy_points_ppr_per_game',
    'player_id', 'player_display_name', 'position', 'season', 'recent_team',
    'games',
])

# Seeded random 80/20 train/validation split.
X_train_rb, X_val_rb, y_train_rb, y_val_rb = train_test_split(
    X_rb, y_rb, random_state = 23, test_size = .2
)

train_rb.info()

<class 'pandas.core.frame.DataFrame'>
Index: 815 entries, 0 to 1144
Data columns (total 27 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   player_id                              815 non-null    object 
 1   player_display_name                    815 non-null    object 
 2   position                               815 non-null    object 
 3   season                                 815 non-null    int64  
 4   recent_team                            815 non-null    object 
 5   games                                  815 non-null    int64  
 6   carries_per_game                       815 non-null    float64
 7   rushing_yards_per_game                 815 non-null    float64
 8   rushing_tds_per_game                   815 non-null    float64
 9   rushing_fumbles_per_game               815 non-null    float64
 10  rushing_first_downs_per_game           815 non-null    float64
 11  rushing_ep

In [9]:
# Tune and fit the RB model on the training split only.
search_rb, best_model_rb = run_search(X_train = X_train_rb, y_train = y_train_rb, name = 'RB')

Fitting 5 folds for each of 50 candidates, totalling 250 fits

RB Best Parameters: {'reg': Ridge(max_iter=5000), 'reg__alpha': 80}

RB Best RMSE: 3.898

RB Best Model: Pipeline(steps=[('scaler', StandardScaler()),
                ('reg', Ridge(alpha=80, max_iter=5000))])


In [10]:
# Modelling rows: every season other than 2024, keeping only rows with no missing values.
train_wr = wr.loc[wr['season'].ne(2024)].dropna()

# 2024 rows are set aside without the next-year target column.
test_wr = wr.loc[wr['season'].eq(2024)].drop(columns = 'fantasy_points_ppr_per_game_next_year')

# Target column.
y_wr = train_wr['fantasy_points_ppr_per_game_next_year']

# Features: everything except the target, the current-season PPR column and the
# identifier / bookkeeping columns.
X_wr = train_wr.drop(columns = [
    'fantasy_points_ppr_per_game_next_year', 'fantasy_points_ppr_per_game',
    'player_id', 'player_display_name', 'position', 'season', 'recent_team',
    'games',
])

# Seeded random 80/20 train/validation split.
X_train_wr, X_val_wr, y_train_wr, y_val_wr = train_test_split(
    X_wr, y_wr, random_state = 23, test_size = .2
)

train_wr.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1301 entries, 0 to 1829
Data columns (total 26 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   player_id                              1301 non-null   object 
 1   player_display_name                    1301 non-null   object 
 2   position                               1301 non-null   object 
 3   season                                 1301 non-null   int64  
 4   recent_team                            1301 non-null   object 
 5   games                                  1301 non-null   int64  
 6   receptions_per_game                    1301 non-null   float64
 7   targets_per_game                       1301 non-null   float64
 8   receiving_yards_per_game               1301 non-null   float64
 9   receiving_tds_per_game                 1301 non-null   float64
 10  receiving_fumbles_per_game             1301 non-null   float64
 11  receiving

In [11]:
# Tune and fit the WR model on the training split only.
search_wr, best_model_wr = run_search(X_train = X_train_wr, y_train = y_train_wr, name = 'WR')

Fitting 5 folds for each of 50 candidates, totalling 250 fits

WR Best Parameters: {'reg': Ridge(max_iter=5000), 'reg__alpha': 80}

WR Best RMSE: 3.335

WR Best Model: Pipeline(steps=[('scaler', StandardScaler()),
                ('reg', Ridge(alpha=80, max_iter=5000))])


In [12]:
# Modelling rows: every season other than 2024, keeping only rows with no missing values.
train_te = te.loc[te['season'].ne(2024)].dropna()

# 2024 rows are set aside without the next-year target column.
test_te = te.loc[te['season'].eq(2024)].drop(columns = 'fantasy_points_ppr_per_game_next_year')

# Target column.
y_te = train_te['fantasy_points_ppr_per_game_next_year']

# Features: everything except the target, the current-season PPR column and the
# identifier / bookkeeping columns.
X_te = train_te.drop(columns = [
    'fantasy_points_ppr_per_game_next_year', 'fantasy_points_ppr_per_game',
    'player_id', 'player_display_name', 'position', 'season', 'recent_team',
    'games',
])

# Seeded random 80/20 train/validation split.
X_train_te, X_val_te, y_train_te, y_val_te = train_test_split(
    X_te, y_te, random_state = 23, test_size = .2
)

train_te.info()

<class 'pandas.core.frame.DataFrame'>
Index: 722 entries, 0 to 1006
Data columns (total 21 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   player_id                              722 non-null    object 
 1   player_display_name                    722 non-null    object 
 2   position                               722 non-null    object 
 3   season                                 722 non-null    int64  
 4   recent_team                            722 non-null    object 
 5   games                                  722 non-null    int64  
 6   racr                                   722 non-null    float64
 7   target_share                           722 non-null    float64
 8   air_yards_share                        722 non-null    float64
 9   wopr                                   722 non-null    float64
 10  receptions_per_game                    722 non-null    float64
 11  targets_pe

In [13]:
# Tune and fit the TE model on the training split only.
search_te, best_model_te = run_search(X_train = X_train_te, y_train = y_train_te, name = 'TE')

Fitting 5 folds for each of 50 candidates, totalling 250 fits

TE Best Parameters: {'reg': Ridge(max_iter=5000), 'reg__alpha': 10}

TE Best RMSE: 2.555

TE Best Model: Pipeline(steps=[('scaler', StandardScaler()),
                ('reg', Ridge(alpha=10, max_iter=5000))])


In [35]:
# Display the best RB pipeline returned by run_search.
best_model_rb

In [31]:
# Persist the fitted pipelines, the validation splits and the 2024 frames to disk
# (presumably for a later evaluation / prediction step - confirm against the consumer).
# Keys are destination paths; insertion order matches the original save order.
artifacts = {
    '../models/best_model_qb.pkl': best_model_qb,
    '../models/best_model_rb.pkl': best_model_rb,
    '../models/best_model_wr.pkl': best_model_wr,
    '../models/best_model_te.pkl': best_model_te,

    '../data/processed/X_val_qb.pkl': X_val_qb,
    '../data/processed/X_val_rb.pkl': X_val_rb,
    '../data/processed/X_val_wr.pkl': X_val_wr,
    '../data/processed/X_val_te.pkl': X_val_te,

    '../data/processed/y_val_qb.pkl': y_val_qb,
    '../data/processed/y_val_rb.pkl': y_val_rb,
    '../data/processed/y_val_wr.pkl': y_val_wr,
    '../data/processed/y_val_te.pkl': y_val_te,

    '../data/processed/test_qb.pkl': test_qb,
    '../data/processed/test_rb.pkl': test_rb,
    '../data/processed/test_wr.pkl': test_wr,
    '../data/processed/test_te.pkl': test_te,
}

for artifact_path, artifact in artifacts.items():
    # joblib.dump does not create missing directories, so make sure the target
    # folder exists (e.g. on a fresh checkout without ../models).
    Path(artifact_path).parent.mkdir(parents = True, exist_ok = True)
    joblib.dump(artifact, artifact_path)

<class 'pandas.core.frame.DataFrame'>
Index: 77 entries, 404 to 203
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   completions_per_game            77 non-null     float64
 1   attempts_per_game               77 non-null     float64
 2   passing_yards_per_game          77 non-null     float64
 3   passing_tds_per_game            77 non-null     float64
 4   passing_interceptions_per_game  77 non-null     float64
 5   sacks_suffered_per_game         77 non-null     float64
 6   sack_fumbles_per_game           77 non-null     float64
 7   passing_first_downs_per_game    77 non-null     float64
 8   passing_epa_per_game            77 non-null     float64
 9   passing_cpoe_per_game           77 non-null     float64
 10  pacr_per_game                   77 non-null     float64
 11  carries_per_game                77 non-null     float64
 12  rushing_yards_per_game          77 non-n