In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set working directory
import os
current_dir = os.getcwd()
os.chdir(current_dir.replace('\code', '\data'))

In [2]:
# Load data
passing = pd.read_csv('gross_passing_for_ml.csv')
rushing = pd.read_csv('gross_rushing_for_ml.csv')
receiving = pd.read_csv('gross_receiving_for_ml.csv')

passing.head()

Unnamed: 0,Player,Age,Tm,Pos,G_x,GS,Cmp_x,Att,Cmp%,Yds_x,...,TD_Rush,YdsPer_Rush,1stD_Rush,Pen,Yds_Pen,1stPy,Sc%,TO%,EXP,Future_Passing_Points
0,A.J. McCarron,27.0,CIN,QB,3.0,0.0,7.0,14.0,50.0,66.0,...,6,3.6,72,110,1027,30,26.5,11.4,-108.45,0.32
1,A.J. McCarron,28.0,OAK,QB,2.0,0.0,1.0,3.0,33.3,8.0,...,9,4.2,86,110,965,17,31.3,13.6,-17.92,7.0
2,A.J. McCarron,29.0,HOU,QB,2.0,1.0,21.0,37.0,56.8,225.0,...,17,4.6,112,111,892,31,37.9,12.4,118.67,0.8
3,Aaron Rodgers,29.0,GNB,QB,16.0,16.0,371.0,552.0,67.2,4295.0,...,9,3.9,85,103,923,43,37.6,8.1,25.77,157.44
4,Aaron Rodgers,30.0,GNB,QB,9.0,9.0,193.0,290.0,66.6,2536.0,...,17,4.7,119,86,801,31,40.9,12.4,113.82,317.24


In [3]:
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

def xgboost_model(X, y, df):
    # Initialize the model
    model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=200)
    scaler = StandardScaler()
    kf = KFold(n_splits=3, shuffle=True)

    # Initialize lists to store results
    r2_scores = []
    mse_scores = []
    predictions = np.zeros(len(y))

    # Loop through each fold
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        r2_scores.append(r2)
        mse_scores.append(mse)
        predictions[test_index] = y_pred

    # Add predictions to the original DataFrame
    df['XGBoost'] = predictions

    print('R2:', np.mean(r2_scores))
    print('MSE:', np.mean(mse_scores))
    return df

# Load data
passing = pd.read_csv('gross_passing_for_ml.csv')
passing = passing.dropna(subset=['Future_Passing_Points']).reset_index(drop=True)
passing = passing.fillna(-1)
passing = pd.get_dummies(passing, columns=['Pos'])
columns_to_drop = ['Player', 'Tm', 'Year', 'Future_Passing_Points']
X = passing.drop(columns=columns_to_drop)
y = passing['Future_Passing_Points']

# Run the function
xgboost_model(X, y, passing).head(10)

R2: 0.4879061430368707
MSE: 5984.8952080164845


Unnamed: 0,Player,Age,Tm,G_x,GS,Cmp_x,Att,Cmp%,Yds_x,TD,...,1stPy,Sc%,TO%,EXP,Future_Passing_Points,Pos_QB,Pos_RB,Pos_WR,Pos_WR/QB,XGBoost
0,A.J. McCarron,27.0,CIN,3.0,0.0,7.0,14.0,50.0,66.0,0.0,...,30,26.5,11.4,-108.45,0.32,True,False,False,False,24.163828
1,A.J. McCarron,28.0,OAK,2.0,0.0,1.0,3.0,33.3,8.0,0.0,...,17,31.3,13.6,-17.92,7.0,True,False,False,False,9.717576
2,A.J. McCarron,29.0,HOU,2.0,1.0,21.0,37.0,56.8,225.0,0.0,...,31,37.9,12.4,118.67,0.8,True,False,False,False,19.357555
3,Aaron Rodgers,29.0,GNB,16.0,16.0,371.0,552.0,67.2,4295.0,39.0,...,43,37.6,8.1,25.77,157.44,True,False,False,False,278.819
4,Aaron Rodgers,30.0,GNB,9.0,9.0,193.0,290.0,66.6,2536.0,17.0,...,31,40.9,12.4,113.82,317.24,True,False,False,False,233.142548
5,Aaron Rodgers,31.0,GNB,16.0,16.0,341.0,520.0,65.6,4381.0,38.0,...,30,46.7,7.1,172.28,260.84,True,False,False,False,267.861267
6,Aaron Rodgers,32.0,GNB,16.0,16.0,347.0,572.0,60.7,3821.0,31.0,...,44,33.3,8.9,40.61,323.12,True,False,False,False,269.601379
7,Aaron Rodgers,33.0,GNB,16.0,16.0,401.0,610.0,65.7,4428.0,40.0,...,36,43.8,9.1,115.16,119.0,True,False,False,False,226.648041
8,Aaron Rodgers,34.0,GNB,7.0,7.0,154.0,238.0,64.7,1675.0,16.0,...,32,30.5,13.2,-61.83,273.68,True,False,False,False,235.052872
9,Aaron Rodgers,35.0,GNB,16.0,16.0,372.0,597.0,62.3,4442.0,25.0,...,26,38.8,6.7,108.88,256.08,True,False,False,False,268.157166


In [4]:
# Load data
rushing = pd.read_csv('gross_rushing_for_ml.csv')
rushing = rushing.dropna(subset=['Future_Rushing_Points']).reset_index(drop=True)
rushing = rushing.fillna(-1)
rushing = pd.get_dummies(rushing, columns=['Pos'])
columns_to_drop = ['Player', 'Tm', 'Year', 'Future_Rushing_Points', 'Player-additional']
X = rushing.drop(columns=columns_to_drop)
y = rushing['Future_Rushing_Points']

# Run the function
xgboost_model(X, y, rushing).head(10)

R2: 0.5026481801447402
MSE: 1266.9386882910376


Unnamed: 0,Player,Tm,Age,G_x,GS,Att,Yds_x,TD,1D,Lng,...,TO%,EXP,Future_Rushing_Points,Pos_DB,Pos_FB,Pos_QB,Pos_RB,Pos_TE,Pos_WR,XGBoost
0,A.J. Jenkins,KAN,24.0,16.0,1.0,2.0,6.0,0.0,0.0,5.0,...,9.1,38.85,-2.2,False,False,False,False,False,True,1.492403
1,A.J. McCarron,OAK,28.0,2.0,0.0,3.0,-2.0,0.0,0.0,0.0,...,13.6,-17.92,9.9,False,False,True,False,False,False,0.239005
2,AJ Dillon,GNB,22.0,11.0,0.0,46.0,242.0,2.0,11.0,30.0,...,5.6,290.75,106.3,False,False,False,True,False,False,60.368179
3,AJ Dillon,GNB,23.0,17.0,2.0,187.0,803.0,5.0,50.0,36.0,...,6.4,180.42,117.0,False,False,False,True,False,False,94.472313
4,AJ Dillon,GNB,24.0,17.0,3.0,186.0,770.0,7.0,47.0,27.0,...,11.3,68.9,73.3,False,False,False,True,False,False,70.98526
5,Aaron Jones,GNB,23.0,12.0,4.0,81.0,448.0,4.0,23.0,46.0,...,13.2,-61.83,118.8,False,False,False,True,False,False,83.905495
6,Aaron Jones,GNB,24.0,12.0,8.0,133.0,728.0,8.0,41.0,67.0,...,6.7,108.88,198.4,False,False,False,True,False,False,86.750267
7,Aaron Jones,GNB,25.0,16.0,16.0,236.0,1084.0,16.0,55.0,56.0,...,6.8,113.4,160.4,False,False,False,True,False,False,111.257843
8,Aaron Jones,GNB,26.0,14.0,14.0,201.0,1104.0,9.0,52.0,77.0,...,5.6,290.75,99.9,False,False,False,True,False,False,164.754852
9,Aaron Jones,GNB,27.0,15.0,15.0,171.0,799.0,4.0,37.0,57.0,...,6.4,180.42,114.1,False,False,False,True,False,False,127.949318


In [5]:
# Load data
receiving = pd.read_csv('gross_receiving_for_ml.csv')
receiving = receiving.dropna(subset=['Future_Receiving_Points']).reset_index(drop=True)
receiving = receiving.fillna(-1)
receiving = pd.get_dummies(receiving, columns=['Pos'])
columns_to_drop = ['Player', 'Tm', 'Year', 'Future_Receiving_Points', 'Player-additional']
X = receiving.drop(columns=columns_to_drop)
y = receiving['Future_Receiving_Points']

# Run the function
xgboost_model(X, y, receiving).head(10)

R2: 0.41304693020265565
MSE: 2169.526262916142


Unnamed: 0,Player,Tm,Age,G_x,GS,Tgt,Rec,Ctch%,Yds_x,Y/R,...,1stPy,Sc%,TO%,EXP,Future_Receiving_Points,Pos_FB,Pos_RB,Pos_TE,Pos_WR,XGBoost
0,A.J. Brown,TEN,22.0,16.0,11.0,84.0,52.0,61.9,1051.0,20.2,...,36,31.4,8.6,115.88,208.5,False,False,False,True,178.340439
1,A.J. Brown,TEN,23.0,14.0,12.0,106.0,70.0,66.0,1075.0,15.4,...,36,47.9,7.2,245.01,148.4,False,False,False,True,178.790512
2,A.J. Brown,TEN,24.0,13.0,13.0,105.0,63.0,60.0,869.0,13.8,...,34,38.9,13.0,79.19,259.6,False,False,False,True,208.68837
3,A.J. Brown,PHI,25.0,17.0,16.0,145.0,88.0,60.7,1496.0,17.0,...,29,42.5,10.2,226.96,240.6,False,False,False,True,195.891098
4,A.J. Green,CIN,24.0,16.0,16.0,164.0,97.0,59.1,1350.0,13.9,...,24,35.4,12.8,-90.48,257.6,False,False,False,True,191.50885
5,A.J. Green,CIN,25.0,16.0,16.0,178.0,98.0,55.1,1426.0,14.6,...,24,32.3,14.9,33.5,174.6,False,False,False,True,242.504517
6,A.J. Green,CIN,26.0,13.0,13.0,117.0,69.0,59.0,1041.0,15.1,...,27,34.0,13.1,13.84,232.7,False,False,False,True,165.958237
7,A.J. Green,CIN,27.0,16.0,16.0,132.0,86.0,65.2,1297.0,15.1,...,34,39.6,9.3,82.59,153.4,False,False,False,True,213.127701
8,A.J. Green,CIN,28.0,10.0,10.0,100.0,66.0,66.0,964.0,14.6,...,37,35.8,8.5,30.41,193.3,False,False,False,True,193.642471
9,A.J. Green,CIN,29.0,16.0,16.0,143.0,75.0,52.4,1078.0,14.4,...,30,26.5,11.4,-108.45,128.4,False,False,False,True,183.856644


In [23]:
# LSTM
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import KFold

def lstm_model(X, y, df):
    # Initialize the model
    model = Sequential()
    model.add(LSTM(10, activation='relu', input_shape=(X.shape[1], 1)))
    model.add(Dropout(0.1))
    model.add(Dense(10, activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(10))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mse')
    scaler = StandardScaler()
    kf = KFold(n_splits=3, shuffle=True)

    # Initialize lists to store results
    r2_scores = []
    mse_scores = []
    predictions = np.zeros(len(y))

    # Loop through each fold
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
        X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)
        model.fit(X_train, y_train, epochs=50, batch_size=10, verbose=0, callbacks=[EarlyStopping(patience=5)])
        y_pred = model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        r2_scores.append(r2)
        mse_scores.append(mse)
        predictions[test_index] = y_pred.reshape(-1)

    # Add predictions to the original DataFrame
    df['LSTM'] = predictions

    print('R2:', np.mean(r2_scores))
    print('MSE:', np.mean(mse_scores))
    return df

In [24]:
# LSTM for passing
passing = pd.read_csv('gross_passing_for_ml.csv')
passing = passing.dropna(subset=['Future_Passing_Points']).reset_index(drop=True)
passing = passing.fillna(-1)
passing = pd.get_dummies(passing, columns=['Pos'])
columns_to_drop = ['Player', 'Tm', 'Year', 'Future_Passing_Points']
X = passing.drop(columns=columns_to_drop)
y = passing['Future_Passing_Points']

# Run the function
lstm_model(X, y, passing).head(10)

  super().__init__(**kwargs)
  current = self.get_monitor_value(logs)


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step


  current = self.get_monitor_value(logs)


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


  current = self.get_monitor_value(logs)


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
R2: -0.09663156784125611
MSE: 12818.328289730567


Unnamed: 0,Player,Age,Tm,G_x,GS,Cmp_x,Att,Cmp%,Yds_x,TD,...,1stPy,Sc%,TO%,EXP,Future_Passing_Points,Pos_QB,Pos_RB,Pos_WR,Pos_WR/QB,LSTM
0,A.J. McCarron,27.0,CIN,3.0,0.0,7.0,14.0,50.0,66.0,0.0,...,30,26.5,11.4,-108.45,0.32,True,False,False,False,111.845161
1,A.J. McCarron,28.0,OAK,2.0,0.0,1.0,3.0,33.3,8.0,0.0,...,17,31.3,13.6,-17.92,7.0,True,False,False,False,82.292999
2,A.J. McCarron,29.0,HOU,2.0,1.0,21.0,37.0,56.8,225.0,0.0,...,31,37.9,12.4,118.67,0.8,True,False,False,False,91.641891
3,Aaron Rodgers,29.0,GNB,16.0,16.0,371.0,552.0,67.2,4295.0,39.0,...,43,37.6,8.1,25.77,157.44,True,False,False,False,99.779152
4,Aaron Rodgers,30.0,GNB,9.0,9.0,193.0,290.0,66.6,2536.0,17.0,...,31,40.9,12.4,113.82,317.24,True,False,False,False,94.730019
5,Aaron Rodgers,31.0,GNB,16.0,16.0,341.0,520.0,65.6,4381.0,38.0,...,30,46.7,7.1,172.28,260.84,True,False,False,False,75.049355
6,Aaron Rodgers,32.0,GNB,16.0,16.0,347.0,572.0,60.7,3821.0,31.0,...,44,33.3,8.9,40.61,323.12,True,False,False,False,123.605293
7,Aaron Rodgers,33.0,GNB,16.0,16.0,401.0,610.0,65.7,4428.0,40.0,...,36,43.8,9.1,115.16,119.0,True,False,False,False,99.190712
8,Aaron Rodgers,34.0,GNB,7.0,7.0,154.0,238.0,64.7,1675.0,16.0,...,32,30.5,13.2,-61.83,273.68,True,False,False,False,99.625351
9,Aaron Rodgers,35.0,GNB,16.0,16.0,372.0,597.0,62.3,4442.0,25.0,...,26,38.8,6.7,108.88,256.08,True,False,False,False,74.881752


In [4]:
receiving.columns

Index(['Player', 'Tm', 'Age', 'G_x', 'GS', 'Tgt', 'Rec', 'Ctch%', 'Yds_x',
       'Y/R', 'TD', '1D', 'Succ%', 'Lng', 'Y/Tgt', 'R/G', 'Y/G', 'Fmb',
       'Player-additional', 'AllPro', 'ProBowl', 'Receiving_Points', 'Year',
       'Career_Passing_Points', 'Career_Rushing_Points',
       'Career_Receiving_Points', 'Pick', 'Kmeans', 'PF', 'Yds_y', 'Ply',
       'Y/P', 'TO', 'FL', '1stD', 'Cmp', 'Att_Pass', 'Yds_Pass', 'TD_Pass',
       'Int', 'NY/A', '1stD_Pass', 'Att_Rush', 'Yds_Rush', 'TD_Rush',
       'YdsPer_Rush', '1stD_Rush', 'Pen', 'Yds_Pen', '1stPy', 'Sc%', 'TO%',
       'EXP', 'Future_Receiving_Points', 'Pos_FB', 'Pos_RB', 'Pos_TE',
       'Pos_WR', 'Predicted_Future_Receiving_Points'],
      dtype='object')

In [8]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Load and preprocess data
receiving = pd.read_csv('gross_receiving_for_ml.csv')
receiving = receiving.dropna(subset=['Future_Receiving_Points']).reset_index(drop=True)
receiving = receiving.fillna(-1)
receiving = pd.get_dummies(receiving, columns=['Pos'])
columns_to_drop = ['Player', 'Tm', 'Year', 'Future_Receiving_Points']
X = receiving.drop(columns=columns_to_drop)
y = receiving['Future_Receiving_Points']

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.33, random_state=42)

# Define the MLPRegressor
mlp = MLPRegressor(max_iter=10000, learning_rate='constant', solver='adam', activation='relu',
                   random_state=42)

# Define the grid of hyperparameters
param_grid = {
    'hidden_layer_sizes': [(100, 100), (50, 50, 50), (20, 20, 20, 20), (5, 5, 5, 5, 5)],
    'alpha': [10, 0.0001],
    'tol': [10, 0.0001]
}

# Perform grid search
grid_search = GridSearchCV(estimator=mlp, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error')
grid_result = grid_search.fit(X_train, y_train)

# Print the top 5 performing hyperparameters
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Evaluate the best model on the test set
best_model = grid_result.best_estimator_
y_pred = best_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f"Test R2: {r2}")
print(f"Test MSE: {mse}")

# Add predictions to the original DataFrame
receiving['Predicted_Future_Receiving_Points'] = np.nan
receiving.loc[y_test.index, 'Predicted_Future_Receiving_Points'] = y_pred

Best: -2079.809929 using {'alpha': 0.0001, 'hidden_layer_sizes': (5, 5, 5, 5, 5), 'tol': 0.0001}
-2291.180149 (231.224257) with: {'alpha': 10, 'hidden_layer_sizes': (100, 100), 'tol': 10}
-2112.581691 (103.267615) with: {'alpha': 10, 'hidden_layer_sizes': (100, 100), 'tol': 0.0001}
-2222.588588 (223.422379) with: {'alpha': 10, 'hidden_layer_sizes': (50, 50, 50), 'tol': 10}
-2185.382865 (143.659448) with: {'alpha': 10, 'hidden_layer_sizes': (50, 50, 50), 'tol': 0.0001}
-2261.022221 (191.145130) with: {'alpha': 10, 'hidden_layer_sizes': (20, 20, 20, 20), 'tol': 10}
-2135.951929 (245.200244) with: {'alpha': 10, 'hidden_layer_sizes': (20, 20, 20, 20), 'tol': 0.0001}
-15510.336002 (1248.131803) with: {'alpha': 10, 'hidden_layer_sizes': (5, 5, 5, 5, 5), 'tol': 10}
-2096.210546 (133.764180) with: {'alpha': 10, 'hidden_layer_sizes': (5, 5, 5, 5, 5), 'tol': 0.0001}
-2297.663279 (232.794021) with: {'alpha': 0.0001, 'hidden_layer_sizes': (100, 100), 'tol': 10}
-2123.401392 (121.376390) with: {'al