NFL passing stats with k-fold cross-validation using a time series split (continuation of my final project)

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab runtime so files stored on Drive can be
# read through ordinary file paths under the mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Read the cleaned passing table from Drive and hold it as a plain NumPy array.
csv_path = '/content/drive/My Drive/KSU-MachineLearning/FinalProject/passing_cleaned.csv'
passing_data = np.array(pd.read_csv(csv_path))

# Target: column 10 holds passing touchdowns.
y = passing_data[:, 10]
print(f'The y matrix : {y}')

# Features: every column except 0 (data point number), 10 (passing touchdowns)
# and 11 (touchdown %), which would leak the target.
indices_to_delete = [0, 10, 11]
X = np.delete(passing_data, indices_to_delete, axis=1)

# Column positions inside X: 0-1 are the text columns (player name, team),
# 2-23 are treated as numeric.
categorical_indeces = np.array([0, 1])
numerical_indeces = np.arange(2, 24)

# Dropping the "other" group of columns leaves the group we want.
categorical_data = np.delete(X, numerical_indeces, axis=1)
numerical_data = np.delete(X, categorical_indeces, axis=1)
print(categorical_data)
print(numerical_data)

# Label-encode each text column independently (names first, then teams).
# reshape(-1, 1) turns each encoded vector into a single column so the pieces
# can be stacked side by side.
name_codes = LabelEncoder().fit_transform(categorical_data[:, 0])
encoded_categorical1 = name_codes.reshape(-1, 1)
print(encoded_categorical1)

team_codes = LabelEncoder().fit_transform(categorical_data[:, 1])
encoded_categorical2 = team_codes.reshape(-1, 1)

encoded_categorical = np.hstack((encoded_categorical1, encoded_categorical2))

# Reassemble the feature matrix: encoded text columns followed by the numeric ones.
X_transform = np.hstack((encoded_categorical, numerical_data))
column_names = [
    'Player', 'Tm', 'Age', 'G', 'GS', 'Cmp', 'Att', 'Cmp%', 'Yds', 'Int', 'Int%', '1D',
    'Lng', 'Y/A', 'AY/A', 'Y/C', 'Y/G', 'Rate', 'Sk', 'Yds-s', 'Sk%', 'NY/A', 'ANY/A', 'Yea',
]
print(len(column_names))
print(X_transform)
print(X_transform.shape)


The y matrix : [36 26 32 ... 0 0 0]
[['Kurt Warner' 'STL']
 ['Peyton Manning' 'IND']
 ['Brett Favre' 'GNB']
 ...
 ['Garrett Wilson' 'NYJ']
 ['Christian Kirk' 'JAX']
 ["Ja'Marr Chase" 'CIN']]
[[30 16 16 ... 7.87 7.41 2001]
 [25 16 16 ... 6.77 5.88 2001]
 [32 16 16 ... 7.09 7.02 2001]
 ...
 [23 17 17 ... 0.0 0.0 2023]
 [27 12 12 ... -0.5 -0.5 2023]
 [23 16 16 ... -7.0 -7.0 2023]]
[[423]
 [544]
 [ 74]
 ...
 [270]
 [143]
 [290]]
24
[[423 33 30 ... 7.87 7.41 2001]
 [544 15 25 ... 6.77 5.88 2001]
 [74 13 32 ... 7.09 7.02 2001]
 ...
 [270 26 23 ... 0.0 0.0 2023]
 [143 16 27 ... -0.5 -0.5 2023]
 [290 8 23 ... -7.0 -7.0 2023]]
(2350, 24)


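Now we will use a time series split to divide the data into chronologically ordered folds: each fold trains on the earlier rows and tests on the rows that follow (the data is ordered by year).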

In [None]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error

# Random forest regressor that predicts passing touchdowns.
# random_state is pinned so that re-running the notebook reproduces the same
# bootstrap samples and therefore the same scores.
clf = RandomForestRegressor(oob_score=True, n_estimators=175, max_depth = 20, criterion = 'absolute_error', random_state=42)

# Spot-check rows of X_transform (Peyton Manning seasons), each paired with the
# message prefix printed in front of its prediction error.
manning_spot_checks = [
  (344, 'Models Manning TD Error in 2004: '),
  (560, 'Models Manning TD Error in 2006: '),
  (871, 'Models Manning TD Error in 2009:'),
  (1187, 'Models Manning TD Error in 2012:'),
]

# Expanding-window cross validation: every fold trains on all rows before its
# test window, so the model never sees rows that come after the ones it is scored on.
ts_split = TimeSeriesSplit(n_splits=8, max_train_size=None, test_size=None, gap =0)
for fold, (train_index, test_index) in enumerate(ts_split.split(X_transform, y)):
  Xtrain, Xtest = X_transform[train_index], X_transform[test_index]
  ytrain, ytest = y[train_index], y[test_index]
  print(f'Fold X: {Xtrain}')
  print("end x fold")
  print(Xtest)
  print("done printing Xtest")

  clf.fit(Xtrain, ytrain)

  # Score a spot-check row only in the fold where it is held out. These checks
  # used to be tied to fixed fold numbers (1-4); with this dataset's split each
  # of those folds already contained the row in its training window, so the
  # printed value was a training error rather than a prediction error.
  held_out_rows = set(test_index.tolist())
  for row, message in manning_spot_checks:
    if row in held_out_rows:
      y_manninghat = clf.predict(X_transform[row].reshape(1,-1))
      print(f'{message}{y[row] - y_manninghat}')

  # Predictions for this fold's held-out window.
  y_hat = clf.predict(Xtest)

  # Out-of-bag score is computed on the training rows left out of each tree's
  # bootstrap sample, so it is independent of the test window.
  oob_score_def = clf.oob_score_
  print(f'The Out Of Bag score is {oob_score_def}')

  mse_def = mean_squared_error(ytest, y_hat)
  print(f'The mean squared error is {mse_def}')

  rmse_def = root_mean_squared_error(ytest, y_hat)
  print(f'The root mean squared error is {rmse_def}')

  r2_def = r2_score(ytest, y_hat)
  print(f'The r2 score is {r2_def}')



Fold X: [[423 33 30 ... 7.87 7.41 2001]
 [544 15 25 ... 6.77 5.88 2001]
 [74 13 32 ... 7.09 7.02 2001]
 ...
 [49 35 27 ... 6.67 7.13 2003]
 [343 25 25 ... 3.31 2.4 2003]
 [463 16 33 ... 4.81 5.25 2003]]
end x fold
[[183 11 30 ... 3.98 2.22 2003]
 [564 7 23 ... 5.21 5.14 2003]
 [422 3 23 ... 3.03 1.1 2003]
 ...
 [142 6 33 ... 4.92 6.46 2005]
 [176 12 22 ... 3.33 3.33 2005]
 [327 7 35 ... 6.11 8.33 2005]]
done printing Xtest
The Out Of Bag score is 0.9381650857468886
The mean squared error is 4.841823316912973
The root mean squared error is 2.2004143511877423
The r2 score is 0.9347332723787573
Fold X: [[423 33 30 ... 7.87 7.41 2001]
 [544 15 25 ... 6.77 5.88 2001]
 [74 13 32 ... 7.09 7.02 2001]
 ...
 [142 6 33 ... 4.92 6.46 2005]
 [176 12 22 ... 3.33 3.33 2005]
 [327 7 35 ... 6.11 8.33 2005]]
end x fold
[[432 30 26 ... 11.75 26.75 2005]
 [236 24 43 ... 1.45 1.45 2005]
 [35 32 25 ... 13.5 13.5 2005]
 ...
 [216 9 25 ... 5.14 4.54 2008]
 [614 31 28 ... 5.69 6.02 2008]
 [598 14 30 ... 7.5 5.

An RMSE of 1.403 — very nice :)