In [225]:
# import all necessary dependencies
import pandas as pd
import pathlib
from pathlib import Path
from sqlalchemy import create_engine, inspect, text
from sqlalchemy.orm import Session
import matplotlib.pyplot as plt
from sqlalchemy.types import String, Float, Integer
import psycopg2
import sklearn as skl
import tensorflow as tf
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

In [226]:
# read in the raw data files
season_2018_df = pd.read_excel('wr_season_2018.xlsx')
season_2019_df = pd.read_excel('wr_season_2019.xlsx')
season_2020_df = pd.read_excel('wr_season_2020.xlsx')
season_2021_df = pd.read_excel('wr_season_2021.xlsx')
season_2022_df = pd.read_excel('wr_season_2022.xlsx')

## Data Cleaning

In [227]:
# preview new dataframe
season_2018_df.head()

Unnamed: 0,Rank,Name,Team,Pos,GMS,TGTS,REC,PCT,YDS,TD,...,Y/T,Y/R,ATT,YDS.1,AVG,TD.1,FUM,LST,FPTS/G,FPTS
0,1,Tyreek Hill,KC,WR,16,137,87,63.5,1479,12,...,10.8,17.0,22,151,6.9,1,0,0,15.1,241.0
1,2,Antonio Brown,PIT,WR,15,168,104,61.9,1297,15,...,7.7,12.5,0,0,0.0,0,0,0,14.6,219.7
2,3,Davante Adams,GB,WR,15,169,111,65.7,1386,13,...,8.2,12.5,0,0,0.0,0,0,0,14.6,218.6
3,4,DeAndre Hopkins,HOU,WR,16,163,115,70.6,1572,11,...,9.6,13.7,1,-7,-7.0,0,2,2,13.7,218.5
4,5,Julio Jones,ATL,WR,16,170,113,66.5,1677,8,...,9.9,14.8,2,12,6.0,0,2,2,13.3,212.9


In [228]:
# drop rank column from all dataframes since we don't need this value for the model
season_2018_df = season_2018_df.drop(columns=['Rank'])
season_2019_df = season_2019_df.drop(columns=['Rank'])
season_2020_df = season_2020_df.drop(columns=['Rank'])
season_2021_df = season_2021_df.drop(columns=['Rank'])
season_2022_df = season_2022_df.drop(columns=['Rank'])

In [229]:
# rename columns to be clear terminology
season_2018_df = season_2018_df.rename(columns={'Pos': 'Position', 'GMS': 'Games', 'REC': 'Receptions', 'TGTS': 'Targets', 'PCT': 'Percentage',
                               'YDS': 'Yards', 'TD': 'Touchdowns', 'LNG': 'Long', 'Y/T': 'Yards_per_target', 'Y/R': 'Yards_per_reception', 'ATT': 'Attempts',
                               'TD.1': 'Rushing_touchdown', 'YDS.1': 'Rushing_yards', 'AVG': 'Average_rushing_yards', 'FUM': 'Fumbles', 'LST': 'Lost_yards',
                               'FPTS/G': 'Fantasy_points_per_game', 'FPTS': 'Fantasy_points'})

In [230]:
# rename columns to be clear terminology
season_2019_df = season_2019_df.rename(columns={'Pos': 'Position', 'GMS': 'Games', 'REC': 'Receptions', 'TGTS': 'Targets', 'PCT': 'Percentage',
                               'YDS': 'Yards', 'TD': 'Touchdowns', 'LNG': 'Long', 'Y/T': 'Yards_per_target', 'Y/R': 'Yards_per_reception', 'ATT': 'Attempts',
                               'TD.1': 'Rushing_touchdown', 'YDS.1': 'Rushing_yards', 'AVG': 'Average_rushing_yards', 'FUM': 'Fumbles', 'LST': 'Lost_yards',
                               'FPTS/G': 'Fantasy_points_per_game', 'FPTS': 'Fantasy_points'})

In [231]:
# rename columns to be clear terminology
season_2020_df = season_2020_df.rename(columns={'Pos': 'Position', 'GMS': 'Games', 'TGTS': 'Targets', 'REC': 'Receptions', 'PCT': 'Percentage',
                               'YDS': 'Yards', 'TD': 'Touchdowns', 'LNG': 'Long', 'Y/T': 'Yards_per_target', 'Y/R': 'Yards_per_reception', 'ATT': 'Attempts',
                               'TD.1': 'Rushing_touchdown', 'YDS.1': 'Rushing_yards', 'AVG': 'Average_rushing_yards', 'FUM': 'Fumbles', 'LST': 'Lost_yards',
                               'FPTS/G': 'Fantasy_points_per_game', 'FPTS': 'Fantasy_points'})

In [232]:
# rename columns to be clear terminology
season_2021_df = season_2021_df.rename(columns={'Pos': 'Position', 'GMS': 'Games', 'TGTS': 'Targets', 'REC': 'Receptions', 'PCT': 'Percentage',
                               'YDS': 'Yards', 'TD': 'Touchdowns', 'LNG': 'Long', 'Y/T': 'Yards_per_target', 'Y/R': 'Yards_per_reception', 'ATT': 'Attempts',
                               'TD.1': 'Rushing_touchdown', 'YDS.1': 'Rushing_yards', 'AVG': 'Average_rushing_yards', 'FUM': 'Fumbles', 'LST': 'Lost_yards',
                               'FPTS/G': 'Fantasy_points_per_game', 'FPTS': 'Fantasy_points'})

In [233]:
# rename columns to be clear terminology
season_2022_df = season_2022_df.rename(columns={'REC': 'Receptions', 'TGTS': 'Targets', 'PCT': 'Percentage',
                               'YDS': 'Yards', 'TD': 'Touchdowns', 'LNG': 'Long', 'Y/T': 'Yards_per_target', 'Y/R': 'Yards_per_reception', 'ATT': 'Attempts',
                               'TD.1': 'Rushing_touchdown', 'YDS.1': 'Rushing_yards', 'AVG': 'Average_rushing_yards', 'FUM': 'Fumbles', 'LST': 'Lost_yards',
                               'FPTS/G': 'Fantasy_points_per_game', 'FPTS': 'Fantasy_points'})

In [234]:
# check to make sure columns have been renamed
season_2018_df.head()

Unnamed: 0,Name,Team,Position,Games,Targets,Receptions,Percentage,Yards,Touchdowns,Long,Yards_per_target,Yards_per_reception,Attempts,Rushing_yards,Average_rushing_yards,Rushing_touchdown,Fumbles,Lost_yards,Fantasy_points_per_game,Fantasy_points
0,Tyreek Hill,KC,WR,16,137,87,63.5,1479,12,75,10.8,17.0,22,151,6.9,1,0,0,15.1,241.0
1,Antonio Brown,PIT,WR,15,168,104,61.9,1297,15,78,7.7,12.5,0,0,0.0,0,0,0,14.6,219.7
2,Davante Adams,GB,WR,15,169,111,65.7,1386,13,57,8.2,12.5,0,0,0.0,0,0,0,14.6,218.6
3,DeAndre Hopkins,HOU,WR,16,163,115,70.6,1572,11,49,9.6,13.7,1,-7,-7.0,0,2,2,13.7,218.5
4,Julio Jones,ATL,WR,16,170,113,66.5,1677,8,58,9.9,14.8,2,12,6.0,0,2,2,13.3,212.9


In [235]:
# add column to each dataframe that contains the year of play
season_2018_df['Year'] = '2018'
season_2019_df['Year'] = '2019'
season_2020_df['Year'] = '2020'
season_2021_df['Year'] = '2021'

In [236]:
# concat all dataframes that will be used to train the model
all_dfs = [season_2018_df, season_2019_df, season_2020_df, season_2021_df]
results = pd.concat(all_dfs)

In [237]:
# write the clean dataframes to csv for future use
season_2018_df.to_csv('wr_season_2018_clean.csv', index=False)

In [238]:
# write the clean dataframes to csv for future use
season_2019_df.to_csv('wr_season_2019_clean.csv', index=False)

In [239]:
# write the clean dataframes to csv for future use
season_2020_df.to_csv('wr_season_2020_clean.csv', index=False)

In [240]:
# write the clean dataframes to csv for future use
season_2021_df.to_csv('wr_season_2021_clean.csv', index=False)

In [241]:
# write the clean dataframes to csv for future use
season_2022_df.to_csv('wr_season_2022_clean.csv', index=False)

## Data Preprocessing

In [242]:
# drop unnecessary columns before fitting model
results = results.drop(columns=['Team', 'Position', 'Year', 'Games'])

In [243]:
# set name column as index
results = results.set_index('Name')

In [244]:
# create bins and labels for touchdown column
touchdown_bins = [-1, 3, 6, 9, 12, 15, 100]
bin_labels = [1,2,3,4,5,6]

In [245]:
# view new df containing converted data
results

Unnamed: 0_level_0,Targets,Receptions,Percentage,Yards,Touchdowns,Long,Yards_per_target,Yards_per_reception,Attempts,Rushing_yards,Average_rushing_yards,Rushing_touchdown,Fumbles,Lost_yards,Fantasy_points_per_game,Fantasy_points
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Tyreek Hill,137,87,63.5,1479,12,75,10.8,17.0,22,151,6.9,1,0,0,15.1,241.0
Antonio Brown,168,104,61.9,1297,15,78,7.7,12.5,0,0,0.0,0,0,0,14.6,219.7
Davante Adams,169,111,65.7,1386,13,57,8.2,12.5,0,0,0.0,0,0,0,14.6,218.6
DeAndre Hopkins,163,115,70.6,1572,11,49,9.6,13.7,1,-7,-7.0,0,2,2,13.7,218.5
Julio Jones,170,113,66.5,1677,8,58,9.9,14.8,2,12,6.0,0,2,2,13.3,212.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Malik Taylor,3,2,66.7,14,0,7,4.7,7.0,0,0,0.0,0,1,1,-0.1,-0.6
Racey McMath,6,2,33.3,8,0,9,1.3,4.0,0,0,0.0,0,1,1,-0.1,-1.2
Travis Benjamin,5,0,0.0,0,0,0,0.0,0.0,0,0,0.0,0,1,1,-0.2,-2.0
J.J. Koski,0,0,0.0,0,0,0,0.0,0.0,0,0,0.0,0,1,1,-0.4,-2.0


In [246]:
results.shape

(990, 16)

In [247]:
# create the bins in new columbs for number of touchdowns
results['Touchdown_bins'] = pd.cut(results['Touchdowns'], bins = touchdown_bins, labels = bin_labels)


In [248]:
# y is the target and x is the features
# for this case we're training on touchdown performance first
y = results['Touchdown_bins']
X = results.drop(columns=['Touchdown_bins', 'Touchdowns'])

# create the training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=.20)

In [249]:
# create a StandardScaler instance
scaler = StandardScaler()

# fit the StandardScaler
X_scaler = scaler.fit(X_train)

# scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [250]:
X_test.shape

(198, 15)

## Model Creation

In [251]:
# define the model
nn = Sequential()

# first hidden layer
nn.add(Dense(units=70, activation = 'relu', input_dim = X_test.shape[1]))

# second hidden layer
nn.add(Dense(units=40, activation='relu'))

# third hidden layer
nn.add(Dense(units=20, activation='relu'))

# output layer
nn.add(Dense(units=1, activation='relu'))

# check the structure of the model
nn.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_16 (Dense)            (None, 70)                1120      
                                                                 
 dense_17 (Dense)            (None, 40)                2840      
                                                                 
 dense_18 (Dense)            (None, 20)                820       
                                                                 
 dense_19 (Dense)            (None, 1)                 21        
                                                                 
Total params: 4801 (18.75 KB)
Trainable params: 4801 (18.75 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [252]:
# compile the model (mae or mse loss functions)
nn.compile(loss="mse", optimizer="adam", metrics=["accuracy"])

In [253]:
# train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=60)

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60


In [254]:
# evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

7/7 - 0s - loss: 0.0578 - accuracy: 0.7475 - 41ms/epoch - 6ms/step
Loss: 0.057782191783189774, Accuracy: 0.747474730014801


In [255]:
# set name column as index
season_2022_df = season_2022_df.set_index('Name')

In [256]:
season_2022_df = season_2022_df.drop(columns='Touchdowns')

In [257]:
season_2022_df.head()

Unnamed: 0_level_0,Targets,Receptions,Percentage,Yards,Long,Yards_per_target,Yards_per_reception,Attempts,Rushing_yards,Average_rushing_yards,Rushing_touchdown,Fumbles,Lost_yards,Fantasy_points_per_game,Fantasy_points
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Justin Jefferson,184,128,69.6,1809,64,9.8,14.1,4,24,6.0,1,0,0,14.2,240.66
Davante Adams,180,100,55.6,1516,60,8.4,15.2,3,-1,-0.3,0,1,0,13.9,235.5
Tyreek Hill,170,119,70.0,1710,64,10.1,14.4,7,32,4.6,2,1,0,13.4,228.2
A.J. Brown,146,88,60.3,1496,78,10.2,17.0,0,0,0.0,0,2,2,12.4,211.6
Stefon Diggs,154,108,70.1,1429,53,9.3,13.2,1,-3,-3.0,0,1,0,13.0,208.6


In [258]:
selected_player = season_2022_df.loc['Tyreek Hill']
selected_player

Targets                     170.0
Receptions                  119.0
Percentage                   70.0
Yards                      1710.0
Long                         64.0
Yards_per_target             10.1
Yards_per_reception          14.4
Attempts                      7.0
Rushing_yards                32.0
Average_rushing_yards         4.6
Rushing_touchdown             2.0
Fumbles                       1.0
Lost_yards                    0.0
Fantasy_points_per_game      13.4
Fantasy_points              228.2
Name: Tyreek Hill, dtype: float64

In [259]:
# convert data to array containing the values of features we want to provide model for predictions
player_array = np.array(selected_player)

In [260]:
player_array

array([1.700e+02, 1.190e+02, 7.000e+01, 1.710e+03, 6.400e+01, 1.010e+01,
       1.440e+01, 7.000e+00, 3.200e+01, 4.600e+00, 2.000e+00, 1.000e+00,
       0.000e+00, 1.340e+01, 2.282e+02])

In [261]:
#converted_array = player_array.reshape(1, -1)

In [262]:
# scale the data using same X_scaler as used for the model
record_to_predict = X_scaler.transform(converted_array)



In [263]:
print(record_to_predict)

[[ 3.02698733  3.2969246   0.63941433  3.72382924  1.30501458  0.8182841
   0.5350154   1.37333357  0.84990255  0.5036347   5.08639031  0.52528141
  -0.45919392  2.74946518  3.41649728]]


In [264]:
# confusion matrix
# confusion_matrix(y_predictions,y_test)

In [265]:
# print classification report to compare testing data to the model predictions
# print(classification_report(y_test, y_predictions))

In [266]:
## convert the dataframe row to an array for predictions?
## transform new prediction data using same scaler
## np.array([values of features you want to give the model for predictions])
## first convert data to array THEN scale it
## record_to_predict = X_scaler.transform(np.array([values of features]))