# 

In [1]:
# Load all libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import MeanSquaredError, MeanAbsoluteError, BinaryAccuracy
from tensorflow.keras.losses import BinaryCrossentropy


In [2]:
# Load datasets for active and retired player pool selection

# Define data types for IDs
dtype_dict = {
    'playerId': 'int32',
    'playId': 'int32',
    'gameId': 'int32'
}

# Load each dataset with specified columns and data types
players = pd.read_csv('players.csv', usecols=['playerId', 'nameFull', 'position', 'dob'], dtype=dtype_dict)
plays = pd.read_csv('plays.csv', usecols=['playId', 'gameId', 'safety'], dtype=dtype_dict)
games = pd.read_csv('games.csv', usecols=['gameId', 'gameDate'], dtype=dtype_dict)

# Offense datasets
passer = pd.read_csv('passer.csv', usecols=['playId', 'playerId'], dtype=dtype_dict)
rusher = pd.read_csv('rusher.csv', usecols=['playId', 'playerId'], dtype=dtype_dict)
receiver = pd.read_csv('receiver.csv', usecols=['playId', 'playerId'], dtype=dtype_dict)

# Special Teams datasets
kicks = pd.read_csv('kicks.csv', usecols=['playId', 'playerId'], dtype=dtype_dict)
kickReturns = pd.read_csv('kickReturns.csv', usecols=['playId', 'playerId'], dtype=dtype_dict)

# Defense datasets
tackles = pd.read_csv('tackles.csv', usecols=['playId', 'playerId'], dtype=dtype_dict)
sacks = pd.read_csv('sacks.csv', usecols=['playId', 'playerId'], dtype=dtype_dict)
fumblForced = pd.read_csv('fumblForced.csv', usecols=['playId', 'playerId'], dtype=dtype_dict)

# Remaining datasets
#fumbles = pd.read_csv('fumbles.csv', usecols=['playId', 'playerId'], dtype=dtype_dict)
#interceptions = pd.read_csv('interceptions.csv', usecols=['playId', 'playerId'], dtype=dtype_dict)
#passDef = pd.read_csv('passDef.csv', usecols=['playId', 'playerId'], dtype=dtype_dict)
#penalties = pd.read_csv('penalties.csv', usecols=['playId', 'playerId'], dtype=dtype_dict)
#qhHits = pd.read_csv('qhHits.csv', usecols=['playId', 'playerId'], dtype=dtype_dict)
##combine = pd.read_csv('combine.csv', usecols=['playerId'], dtype={'playerId': 'int32'})
#draft = pd.read_csv('draft.csv', usecols=['playerId'], dtype={'playerId': 'int32'})
#officials = pd.read_csv('officials.csv', usecols=['gameId'], dtype={'gameId': 'int32'})


In [3]:
# Merged dataset for Offense pool of active and retired players

# Initialize the Offense dataset with players
Offense_Active_And_Retired_Player_Status = players

# Randomly select specified number of unique 'playerId's and filter the dataframe
Offense_Active_And_Retired_Player_Status = Offense_Active_And_Retired_Player_Status[Offense_Active_And_Retired_Player_Status['playerId'].isin(Offense_Active_And_Retired_Player_Status['playerId'].drop_duplicates().sample(n=3000, random_state=42))]

# Rename 'playId' in datasets to ensure it gets a suffix when merging
passer = passer.rename(columns={'playId': 'playId_passer'})
rusher = rusher.rename(columns={'playId': 'playId_rusher'})
receiver = receiver.rename(columns={'playId': 'playId_receiver'})

# Merge Offense datasets along with player data to determine "optimal team" based on player characteristics
Offense_Active_And_Retired_Player_Status = pd.merge(Offense_Active_And_Retired_Player_Status, passer, on='playerId', how='left')
Offense_Active_And_Retired_Player_Status = pd.merge(Offense_Active_And_Retired_Player_Status, rusher, on='playerId', how='left')
Offense_Active_And_Retired_Player_Status = pd.merge(Offense_Active_And_Retired_Player_Status, receiver, on='playerId', how='left')

# Combine 'playId_suffix' variants into a single 'playId' column
Offense_Active_And_Retired_Player_Status['playId'] = Offense_Active_And_Retired_Player_Status['playId_passer'].combine_first(Offense_Active_And_Retired_Player_Status['playId_rusher']).combine_first(Offense_Active_And_Retired_Player_Status['playId_receiver'])

# Merge plays and games to Offense datasets to complete the profile
Offense_Active_And_Retired_Player_Status = pd.merge(Offense_Active_And_Retired_Player_Status, plays, on='playId', how='left')
Offense_Active_And_Retired_Player_Status = pd.merge(Offense_Active_And_Retired_Player_Status, games, on='gameId', how='left')

# Drop rows where 'gameDate' is NaN
Offense_Active_And_Retired_Player_Status = Offense_Active_And_Retired_Player_Status.dropna(subset=['gameDate'])

# Convert dates to datetime format
Offense_Active_And_Retired_Player_Status['gameDate'] = pd.to_datetime(Offense_Active_And_Retired_Player_Status['gameDate'])
Offense_Active_And_Retired_Player_Status['dob'] = pd.to_datetime(Offense_Active_And_Retired_Player_Status['dob'])
active_or_retired_game_date_threshold = pd.to_datetime('2017-08-03') #start of the 2017-2018 season
definitive_retired_game_date_threshold = pd.to_datetime('2010-08-12') #start of the 2010-2011 season
dataset_end_date = pd.to_datetime('2020-02-02')

# Find the index of the most recent gameDate for each playerId
idx = Offense_Active_And_Retired_Player_Status.groupby('playerId')['gameDate'].idxmax()

# Filter the dataframe to keep only rows with the most recent gameDate per playerId
Offense_Active_And_Retired_Player_Status = Offense_Active_And_Retired_Player_Status.loc[idx]

# Vectorized calclation of player age and append to end of dataset
Offense_Active_And_Retired_Player_Status['ageAtDatasetEnd'] = (dataset_end_date - Offense_Active_And_Retired_Player_Status['dob']).dt.days / 365.25
Offense_Active_And_Retired_Player_Status['ageAtDatasetEnd'] = Offense_Active_And_Retired_Player_Status['ageAtDatasetEnd'].round(2)

# Drop rows where 'ageAtDatasetEnd' is NaN
Offense_Active_And_Retired_Player_Status = Offense_Active_And_Retired_Player_Status.dropna(subset=['ageAtDatasetEnd'])

# Vectorized determination of conditions for active/retired
conditions_active = [
    (Offense_Active_And_Retired_Player_Status['gameDate'] >= active_or_retired_game_date_threshold) & (Offense_Active_And_Retired_Player_Status['ageAtDatasetEnd'] < 49),
    (Offense_Active_And_Retired_Player_Status['gameDate'] >= active_or_retired_game_date_threshold) & (Offense_Active_And_Retired_Player_Status['ageAtDatasetEnd'] >= 49)
]
choices_active = ['Yes-Active-O', 'Inconclusive-Active-O']

conditions_retired = [
    (Offense_Active_And_Retired_Player_Status['gameDate'] < active_or_retired_game_date_threshold) & (Offense_Active_And_Retired_Player_Status['ageAtDatasetEnd'] >= 49),
    ((Offense_Active_And_Retired_Player_Status['gameDate'] >= definitive_retired_game_date_threshold) & (Offense_Active_And_Retired_Player_Status['gameDate'] < active_or_retired_game_date_threshold) & (Offense_Active_And_Retired_Player_Status['ageAtDatasetEnd'] < 49)),
    (Offense_Active_And_Retired_Player_Status['gameDate'] < definitive_retired_game_date_threshold)
]
choices_retired = ['Yes-Retired-O', 'Inconclusive-Retired-O', 'Yes-Retired-O']

# Append vectorized determination of conditions for active/retired
Offense_Active_And_Retired_Player_Status['ActivePlayer'] = np.select(conditions_active, choices_active, default='')
Offense_Active_And_Retired_Player_Status['RetiredPlayer'] = np.select(conditions_retired, choices_retired, default='')


In [4]:
# Display a preview of the dataframe

Offense_Active_And_Retired_Player_Status


Unnamed: 0,playerId,nameFull,position,dob,playId_passer,playId_rusher,playId_receiver,playId,gameId,safety,gameDate,ageAtDatasetEnd,ActivePlayer,RetiredPlayer
699,19910154,Fred McAfee,RB,1968-06-20,,2031971.0,2032246.0,2031971.0,29108.0,0.0,2006-12-31,51.62,,Yes-Retired-O
733,19910326,Keenan McCardell,WR,1970-01-06,1043733.0,2043301.0,3049040.0,1043733.0,28715.0,0.0,2005-12-04,50.07,,Yes-Retired-O
3042,19910407,Doug Pederson,QB,1968-01-31,19315.0,19548.0,,19315.0,26958.0,0.0,2004-10-03,52.01,,Yes-Retired-O
3196,19930070,Jason Elam,K,1970-03-08,,2016277.0,,2016277.0,29049.0,0.0,2006-12-03,49.91,,Yes-Retired-O
3211,19930200,Craig Hentrich,P,1971-05-18,2050077.0,3050360.0,,2050077.0,29159.0,0.0,2006-08-26,48.71,,Yes-Retired-O
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64623601,20190618,Ventell Bryant,WR,1996-08-24,,,15066503.0,15066503.0,58077.0,0.0,2019-11-28,23.44,Yes-Active-O,
64623642,20190625,Darrin Hall,RB,1996-09-06,,15010300.0,15010301.0,15010300.0,57879.0,0.0,2019-08-22,23.41,Yes-Active-O,
64623725,20190676,Kelvin McKnight,WR,1997-04-25,,,15013502.0,15013502.0,57890.0,0.0,2019-08-29,22.77,Yes-Active-O,
64623766,20190764,John Lovett,QB,1996-04-25,,,15001446.0,15001446.0,57843.0,0.0,2019-08-10,23.77,Yes-Active-O,


In [5]:
# Merged dataset for Special Teams pool of active and retired players

# Initialize the Special Teams dataset with players
Special_Teams_Active_And_Retired_Player_Status = players

# Randomly select specified number of unique 'playerId's and filter the dataframe
Special_Teams_Active_And_Retired_Player_Status = Special_Teams_Active_And_Retired_Player_Status[Special_Teams_Active_And_Retired_Player_Status['playerId'].isin(Special_Teams_Active_And_Retired_Player_Status['playerId'].drop_duplicates().sample(n=15000, random_state=52))]

# Rename 'playId' in datasets to ensure it gets a suffix when merging
kicks = kicks.rename(columns={'playId': 'playId_kicks'})
kickReturns = kickReturns.rename(columns={'playId': 'playId_kickReturns'})

# Merge Special Teams datasets along with player data to determine "optimal team" based on player characteristics
Special_Teams_Active_And_Retired_Player_Status = pd.merge(Special_Teams_Active_And_Retired_Player_Status, kicks, on='playerId', how='left')
Special_Teams_Active_And_Retired_Player_Status = pd.merge(Special_Teams_Active_And_Retired_Player_Status, kickReturns, on='playerId', how='left')

# Combine 'playId_suffix' variants into a single 'playId' column
Special_Teams_Active_And_Retired_Player_Status['playId'] = Special_Teams_Active_And_Retired_Player_Status['playId_kicks'].combine_first(Special_Teams_Active_And_Retired_Player_Status['playId_kickReturns'])

# Merge plays and games to Special Teams datasets to complete the profile
Special_Teams_Active_And_Retired_Player_Status = pd.merge(Special_Teams_Active_And_Retired_Player_Status, plays, on='playId', how='left')
Special_Teams_Active_And_Retired_Player_Status = pd.merge(Special_Teams_Active_And_Retired_Player_Status, games, on='gameId', how='left')

# Drop rows where 'gameDate' is NaN
Special_Teams_Active_And_Retired_Player_Status = Special_Teams_Active_And_Retired_Player_Status.dropna(subset=['gameDate'])

# Convert dates to datetime format
Special_Teams_Active_And_Retired_Player_Status['gameDate'] = pd.to_datetime(Special_Teams_Active_And_Retired_Player_Status['gameDate'])
Special_Teams_Active_And_Retired_Player_Status['dob'] = pd.to_datetime(Special_Teams_Active_And_Retired_Player_Status['dob'])

# Find the index of the most recent gameDate for each playerId
idx = Special_Teams_Active_And_Retired_Player_Status.groupby('playerId')['gameDate'].idxmax()

# Filter the dataframe to keep only rows with the most recent gameDate per playerId
Special_Teams_Active_And_Retired_Player_Status = Special_Teams_Active_And_Retired_Player_Status.loc[idx]

# Vectorized calclation of player age and append to end of dataset
Special_Teams_Active_And_Retired_Player_Status['ageAtDatasetEnd'] = (dataset_end_date - Special_Teams_Active_And_Retired_Player_Status['dob']).dt.days / 365.25
Special_Teams_Active_And_Retired_Player_Status['ageAtDatasetEnd'] = Special_Teams_Active_And_Retired_Player_Status['ageAtDatasetEnd'].round(2)

# Drop rows where 'ageAtDatasetEnd' is NaN
Special_Teams_Active_And_Retired_Player_Status = Special_Teams_Active_And_Retired_Player_Status.dropna(subset=['ageAtDatasetEnd'])

# Vectorized determination of conditions for active/retired
conditions_active = [
    (Special_Teams_Active_And_Retired_Player_Status['gameDate'] >= active_or_retired_game_date_threshold) & (Special_Teams_Active_And_Retired_Player_Status['ageAtDatasetEnd'] < 49),
    (Special_Teams_Active_And_Retired_Player_Status['gameDate'] >= active_or_retired_game_date_threshold) & (Special_Teams_Active_And_Retired_Player_Status['ageAtDatasetEnd'] >= 49)
]
choices_active = ['Yes-Active-ST', 'Inconclusive-Active-ST']

conditions_retired = [
    (Special_Teams_Active_And_Retired_Player_Status['gameDate'] < active_or_retired_game_date_threshold) & (Special_Teams_Active_And_Retired_Player_Status['ageAtDatasetEnd'] >= 49),
    ((Special_Teams_Active_And_Retired_Player_Status['gameDate'] >= definitive_retired_game_date_threshold) & (Special_Teams_Active_And_Retired_Player_Status['gameDate'] < active_or_retired_game_date_threshold) & (Special_Teams_Active_And_Retired_Player_Status['ageAtDatasetEnd'] < 49)),
    (Special_Teams_Active_And_Retired_Player_Status['gameDate'] < definitive_retired_game_date_threshold)
]
choices_retired = ['Yes-Retired-ST', 'Inconclusive-Retired-ST', 'Yes-Retired-ST']

# Append vectorized determination of conditions for active/retired
Special_Teams_Active_And_Retired_Player_Status['ActivePlayer'] = np.select(conditions_active, choices_active, default='')
Special_Teams_Active_And_Retired_Player_Status['RetiredPlayer'] = np.select(conditions_retired, choices_retired, default='')


In [6]:
# Display a preview of the dataframe

Special_Teams_Active_And_Retired_Player_Status


Unnamed: 0,playerId,nameFull,position,dob,playId_kicks,playId_kickReturns,playId,gameId,safety,gameDate,ageAtDatasetEnd,ActivePlayer,RetiredPlayer
1281,19820086,Morten Andersen,K,1960-08-19,3002884.0,,3002884.0,29436.0,0.0,2007-12-30,59.46,,Yes-Retired-ST
1407,19820171,Gary Anderson,K,1959-07-16,51314.0,,51314.0,27157.0,0.0,2005-01-02,60.55,,Yes-Retired-ST
1830,19830400,Sean Landeta,P,1962-01-06,1040686.0,,1040686.0,28779.0,0.0,2006-01-01,58.07,,Yes-Retired-ST
2242,19850285,Doug Flutie,QB,1962-10-23,1031198.0,,1031198.0,28772.0,0.0,2006-01-01,57.28,,Yes-Retired-ST
3381,19870400,John Carney,K,1964-04-20,6000526.0,,6000526.0,54934.0,0.0,2010-10-10,55.79,,Yes-Retired-ST
...,...,...,...,...,...,...,...,...,...,...,...,...,...
154845,20190951,Terry Wright,WR,1997-01-28,,15009330.0,15009330.0,57873.0,0.0,2019-08-24,23.01,Yes-Active-ST,
154871,20190973,Spencer Schnell,WR,1994-12-07,,15079939.0,15079939.0,58128.0,0.0,2019-12-21,25.16,Yes-Active-ST,
155042,20191044,D'Ernest Johnson,RB,1996-02-27,,15082317.0,15082317.0,58143.0,0.0,2019-12-29,23.93,Yes-Active-ST,
155071,20191070,Malik Taylor,WR,1995-12-21,,15004730.0,15004730.0,57854.0,0.0,2019-08-15,24.12,Yes-Active-ST,


In [7]:
# Merged dataset for Defense pool of active and retired players

# Initialize the Defense dataset with players
Defense_Active_And_Retired_Player_Status = players

# Randomly select specified number of unique 'playerId's and filter the dataframe
Defense_Active_And_Retired_Player_Status = Defense_Active_And_Retired_Player_Status[Defense_Active_And_Retired_Player_Status['playerId'].isin(Defense_Active_And_Retired_Player_Status['playerId'].drop_duplicates().sample(n=19000, random_state=62))]

# Rename 'playId' in datasets to ensure it gets a suffix when merging
tackles = tackles.rename(columns={'playId': 'playId_tackles'})
sacks = sacks.rename(columns={'playId': 'playId_sacks'})
fumblForced = fumblForced.rename(columns={'playId': 'playId_fumblForced'})

# Merge Defense datasets along with player data to determine "optimal team" based on player characteristics
Defense_Active_And_Retired_Player_Status = pd.merge(Defense_Active_And_Retired_Player_Status, tackles, on='playerId', how='left')
Defense_Active_And_Retired_Player_Status = pd.merge(Defense_Active_And_Retired_Player_Status, sacks, on='playerId', how='left')
Defense_Active_And_Retired_Player_Status = pd.merge(Defense_Active_And_Retired_Player_Status, fumblForced, on='playerId', how='left')

# Combine 'playId_suffix' variants into a single 'playId' column
Defense_Active_And_Retired_Player_Status['playId'] = Defense_Active_And_Retired_Player_Status['playId_tackles'].combine_first(Defense_Active_And_Retired_Player_Status['playId_sacks']).combine_first(Defense_Active_And_Retired_Player_Status['playId_fumblForced'])

# Merge plays and games to Defense datasets to complete the profile
Defense_Active_And_Retired_Player_Status = pd.merge(Defense_Active_And_Retired_Player_Status, plays, on='playId', how='left')
Defense_Active_And_Retired_Player_Status = pd.merge(Defense_Active_And_Retired_Player_Status, games, on='gameId', how='left')

# Drop rows where 'gameDate' is NaN
Defense_Active_And_Retired_Player_Status = Defense_Active_And_Retired_Player_Status.dropna(subset=['gameDate'])

# Convert dates to datetime format
Defense_Active_And_Retired_Player_Status['gameDate'] = pd.to_datetime(Defense_Active_And_Retired_Player_Status['gameDate'])
Defense_Active_And_Retired_Player_Status['dob'] = pd.to_datetime(Defense_Active_And_Retired_Player_Status['dob'])

# Find the index of the most recent gameDate for each playerId
idx = Defense_Active_And_Retired_Player_Status.groupby('playerId')['gameDate'].idxmax()

# Filter the dataframe to keep only rows with the most recent gameDate per playerId
Defense_Active_And_Retired_Player_Status = Defense_Active_And_Retired_Player_Status.loc[idx]

# Vectorized calculation of player age and append to end of dataset
Defense_Active_And_Retired_Player_Status['ageAtDatasetEnd'] = (dataset_end_date - Defense_Active_And_Retired_Player_Status['dob']).dt.days / 365.25
Defense_Active_And_Retired_Player_Status['ageAtDatasetEnd'] = Defense_Active_And_Retired_Player_Status['ageAtDatasetEnd'].round(2)

# Drop rows where 'ageAtDatasetEnd' is NaN
Defense_Active_And_Retired_Player_Status = Defense_Active_And_Retired_Player_Status.dropna(subset=['ageAtDatasetEnd'])

# Vectorized determination of conditions for active/retired
conditions_active = [
    (Defense_Active_And_Retired_Player_Status['gameDate'] >= active_or_retired_game_date_threshold) & (Defense_Active_And_Retired_Player_Status['ageAtDatasetEnd'] < 49),
    (Defense_Active_And_Retired_Player_Status['gameDate'] >= active_or_retired_game_date_threshold) & (Defense_Active_And_Retired_Player_Status['ageAtDatasetEnd'] >= 49)
]
choices_active = ['Yes-Active-D', 'Inconclusive-Active-D']

conditions_retired = [
    (Defense_Active_And_Retired_Player_Status['gameDate'] < active_or_retired_game_date_threshold) & (Defense_Active_And_Retired_Player_Status['ageAtDatasetEnd'] >= 49),
    ((Defense_Active_And_Retired_Player_Status['gameDate'] >= definitive_retired_game_date_threshold) & (Defense_Active_And_Retired_Player_Status['gameDate'] < active_or_retired_game_date_threshold) & (Defense_Active_And_Retired_Player_Status['ageAtDatasetEnd'] < 49)),
    (Defense_Active_And_Retired_Player_Status['gameDate'] < definitive_retired_game_date_threshold)
]
choices_retired = ['Yes-Retired-D', 'Inconclusive-Retired-D', 'Yes-Retired-D']

# Append vectorized determination of conditions for active/retired
Defense_Active_And_Retired_Player_Status['ActivePlayer'] = np.select(conditions_active, choices_active, default='')
Defense_Active_And_Retired_Player_Status['RetiredPlayer'] = np.select(conditions_retired, choices_retired, default='')


In [8]:
# Display a preview of the dataframe

Defense_Active_And_Retired_Player_Status


Unnamed: 0,playerId,nameFull,position,dob,playId_tackles,playId_sacks,playId_fumblForced,playId,gameId,safety,gameDate,ageAtDatasetEnd,ActivePlayer,RetiredPlayer
1412,19820086,Morten Andersen,K,1960-08-19,28610.0,,,28610.0,26920.0,0.0,2004-09-12,59.46,,Yes-Retired-D
2173,19850016,Jerry Rice,WR,1962-10-13,44316.0,,,44316.0,27082.0,0.0,2004-11-28,57.31,,Yes-Retired-D
2606,19860201,Ray Brown,OG,1962-12-12,53499.0,,,53499.0,27158.0,0.0,2005-01-02,57.14,,Yes-Retired-D
3098,19880068,Tom Tupa,P,1966-02-06,41951.0,,,41951.0,27078.0,0.0,2004-11-28,53.99,,Yes-Retired-D
3315,19880400,Jeff Feagles,P,1966-03-07,1034602.0,,,1034602.0,28709.0,0.0,2005-12-04,53.91,,Yes-Retired-D
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118547601,20191131,Joey Slye,K,1996-04-10,15049747.0,,,15049747.0,58024.0,0.0,2019-11-03,23.81,Yes-Active-D,
118547679,20191176,Joe Walker,WR,1996-01-12,15000863.0,,,15000863.0,57839.0,0.0,2019-08-08,24.06,Yes-Active-D,
118547731,20191179,John Battle,DB,1995-02-21,15013447.0,,,15013447.0,57889.0,0.0,2019-08-29,24.95,Yes-Active-D,
118547767,20191195,Quart'e Sapp,LB,1997-03-08,15012954.0,,,15012954.0,57886.0,0.0,2019-08-29,22.90,Yes-Active-D,


In [9]:
# Selecting 200 active and 200 retired players randomly from each category for Offense, Special Teams, and Defense

# Sampling for active offense players
qb_active = Offense_Active_And_Retired_Player_Status[
    (Offense_Active_And_Retired_Player_Status['ActivePlayer'] == 'Yes-Active-O') & 
    (Offense_Active_And_Retired_Player_Status['playId_passer'] == Offense_Active_And_Retired_Player_Status['playId'])
].sample(22, random_state=42)

rb_active = Offense_Active_And_Retired_Player_Status[
    (Offense_Active_And_Retired_Player_Status['ActivePlayer'] == 'Yes-Active-O') & 
    (Offense_Active_And_Retired_Player_Status['playId_rusher'] == Offense_Active_And_Retired_Player_Status['playId'])
].sample(23, random_state=42)

wr_active = Offense_Active_And_Retired_Player_Status[
    (Offense_Active_And_Retired_Player_Status['ActivePlayer'] == 'Yes-Active-O') & 
    (Offense_Active_And_Retired_Player_Status['playId_receiver'] == Offense_Active_And_Retired_Player_Status['playId'])
].sample(22, random_state=42)

active_offense = pd.concat([qb_active, rb_active, wr_active])

# Sampling for retired offense players
qb_retired = Offense_Active_And_Retired_Player_Status[
    (Offense_Active_And_Retired_Player_Status['RetiredPlayer'] == 'Yes-Retired-O') & 
    (Offense_Active_And_Retired_Player_Status['playId_passer'] == Offense_Active_And_Retired_Player_Status['playId'])
].sample(22, random_state=42)

rb_retired = Offense_Active_And_Retired_Player_Status[
    (Offense_Active_And_Retired_Player_Status['RetiredPlayer'] == 'Yes-Retired-O') & 
    (Offense_Active_And_Retired_Player_Status['playId_rusher'] == Offense_Active_And_Retired_Player_Status['playId'])
].sample(23, random_state=42)

wr_retired = Offense_Active_And_Retired_Player_Status[
    (Offense_Active_And_Retired_Player_Status['RetiredPlayer'] == 'Yes-Retired-O') & 
    (Offense_Active_And_Retired_Player_Status['playId_receiver'] == Offense_Active_And_Retired_Player_Status['playId'])
].sample(22, random_state=42)

retired_offense = pd.concat([qb_retired, rb_retired, wr_retired])



# Sampling for active special teams players
kicks_active = Special_Teams_Active_And_Retired_Player_Status[
    (Special_Teams_Active_And_Retired_Player_Status['ActivePlayer'] == 'Yes-Active-ST') &
    (Special_Teams_Active_And_Retired_Player_Status['playId_kicks'] == Special_Teams_Active_And_Retired_Player_Status['playId'])
].sample(33, random_state=42) 

kickReturns_active = Special_Teams_Active_And_Retired_Player_Status[
    (Special_Teams_Active_And_Retired_Player_Status['ActivePlayer'] == 'Yes-Active-ST') &
    (Special_Teams_Active_And_Retired_Player_Status['playId_kickReturns'] == Special_Teams_Active_And_Retired_Player_Status['playId'])
].sample(33, random_state=42)

active_special_teams = pd.concat([kicks_active, kickReturns_active])

# Sampling for retired special teams players
kicks_retired = Special_Teams_Active_And_Retired_Player_Status[
    (Special_Teams_Active_And_Retired_Player_Status['RetiredPlayer'] == 'Yes-Retired-ST') &
    (Special_Teams_Active_And_Retired_Player_Status['playId_kicks'] == Special_Teams_Active_And_Retired_Player_Status['playId'])
].sample(33, random_state=42)

kickReturns_retired = Special_Teams_Active_And_Retired_Player_Status[
    (Special_Teams_Active_And_Retired_Player_Status['RetiredPlayer'] == 'Yes-Retired-ST') &
    (Special_Teams_Active_And_Retired_Player_Status['playId_kickReturns'] == Special_Teams_Active_And_Retired_Player_Status['playId'])
].sample(33, random_state=42)

retired_special_teams = pd.concat([kicks_retired, kickReturns_retired])



# Sampling for active defense players including safeties
tackles_active = Defense_Active_And_Retired_Player_Status[
    (Defense_Active_And_Retired_Player_Status['ActivePlayer'] == 'Yes-Active-D') &
    (Defense_Active_And_Retired_Player_Status['playId_tackles'] == Defense_Active_And_Retired_Player_Status['playId'])
].sample(22, random_state=42)

sacks_active = Defense_Active_And_Retired_Player_Status[
    (Defense_Active_And_Retired_Player_Status['ActivePlayer'] == 'Yes-Active-D') &
    (Defense_Active_And_Retired_Player_Status['playId_sacks'] == Defense_Active_And_Retired_Player_Status['playId'])
].sample(23, random_state=42)

fumblForced_active = Defense_Active_And_Retired_Player_Status[
    (Defense_Active_And_Retired_Player_Status['ActivePlayer'] == 'Yes-Active-D') &
    (Defense_Active_And_Retired_Player_Status['playId_fumblForced'] == Defense_Active_And_Retired_Player_Status['playId'])
].sample(22, random_state=42)

active_defense = pd.concat([tackles_active, sacks_active, fumblForced_active])

# Sampling for retired defense players including safeties
tackles_retired = Defense_Active_And_Retired_Player_Status[
    (Defense_Active_And_Retired_Player_Status['RetiredPlayer'] == 'Yes-Retired-D') &
    (Defense_Active_And_Retired_Player_Status['playId_tackles'] == Defense_Active_And_Retired_Player_Status['playId'])
].sample(22, random_state=42)

sacks_retired = Defense_Active_And_Retired_Player_Status[
    (Defense_Active_And_Retired_Player_Status['RetiredPlayer'] == 'Yes-Retired-D') &
    (Defense_Active_And_Retired_Player_Status['playId_sacks'] == Defense_Active_And_Retired_Player_Status['playId'])
].sample(23, random_state=42)

fumblForced_retired = Defense_Active_And_Retired_Player_Status[
    (Defense_Active_And_Retired_Player_Status['RetiredPlayer'] == 'Yes-Retired-D') &
    (Defense_Active_And_Retired_Player_Status['playId_fumblForced'] == Defense_Active_And_Retired_Player_Status['playId'])
].sample(22, random_state=42)

retired_defense = pd.concat([tackles_retired, sacks_retired, fumblForced_retired])

     
# Combining all active and retired selections into two pools of 200 for each
Active_Players_Pool = pd.concat([active_offense, active_special_teams, active_defense])
Retired_Players_Pool = pd.concat([retired_offense, retired_special_teams, retired_defense])


In [10]:
# Display a preview of the player pool dataframe

Active_Players_Pool


Unnamed: 0,playerId,nameFull,position,dob,playId_passer,playId_rusher,playId_receiver,playId,gameId,safety,gameDate,ageAtDatasetEnd,ActivePlayer,RetiredPlayer,playId_kicks,playId_kickReturns,playId_tackles,playId_sacks,playId_fumblForced
58027087,20130039,Geno Smith,QB,1990-10-10,15014583.0,15014621.0,10030521.0,15014583.0,57898.0,0.0,2019-08-29,29.31,Yes-Active-O,,,,,,
63350079,20160207,Jeff Driskel,QB,1993-04-23,15063563.0,15063584.0,15053420.0,15063563.0,58070.0,0.0,2019-11-24,26.78,Yes-Active-O,,,,,,
2711276,20000199,Tom Brady,QB,1977-08-03,15086099.0,15079206.0,13088510.0,15086099.0,58157.0,0.0,2020-01-04,42.50,Yes-Active-O,,,,,,
63869766,20170067,Alvin Kamara,RB,1995-07-25,15034052.0,15086252.0,15086256.0,15034052.0,57967.0,0.0,2019-10-06,24.53,Yes-Active-O,,,,,,
60513430,20140191,Pat O'Donnell,P,1991-02-22,13032991.0,10019104.0,,13032991.0,57311.0,0.0,2017-10-09,28.94,Yes-Active-O,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108119758,20130055,Vance McDonald,TE,1990-06-13,,,,13025285.0,57270.0,0.0,2017-09-24,29.64,Yes-Active-D,,,,13025285.0,,13025285.0
118480335,20180857,Pharoah McKever,TE,1994-02-04,,,,15013752.0,57891.0,0.0,2019-08-29,25.99,Yes-Active-D,,,,15013752.0,,15013752.0
108418069,20130106,Dion Sims,TE,1991-02-18,,,,13063425.0,57403.0,0.0,2017-11-26,28.96,Yes-Active-D,,,,13063425.0,,13063425.0
118474816,20180434,Jacob Tuioti-Mariner,DT,1996-07-25,,,,15084271.0,58152.0,0.0,2019-12-29,23.52,Yes-Active-D,,,,15084271.0,15013986.0,15084271.0


In [11]:
# Display a preview of the player pool dataframe

Retired_Players_Pool


Unnamed: 0,playerId,nameFull,position,dob,playId_passer,playId_rusher,playId_receiver,playId,gameId,safety,gameDate,ageAtDatasetEnd,ActivePlayer,RetiredPlayer,playId_kicks,playId_kickReturns,playId_tackles,playId_sacks,playId_fumblForced
52716534,20060452,Brett Basanez,QB,1983-05-11,5008442.0,3033374.0,,5008442.0,54778.0,0.0,2009-09-03,36.73,,Yes-Retired-O,,,,,
49784667,20040078,Bernard Berrian,WR,1980-12-27,5028773.0,6028973.0,6030356.0,5028773.0,54634.0,0.0,2009-11-29,39.10,,Yes-Retired-O,,,,,
50187348,20050488,Brian Wrobel,QB,1982-04-04,2042258.0,,,2042258.0,29131.0,0.0,2006-08-12,37.83,,Yes-Retired-O,,,,,
49795051,20040202,John Navarre,QB,1980-09-09,2000458.0,2001113.0,,2000458.0,29176.0,0.0,2006-08-31,39.40,,Yes-Retired-O,,,,,
44137721,20010059,Marques Tuiasosopo,QB,1979-03-22,4036684.0,3039433.0,,4036684.0,29667.0,0.0,2008-11-09,40.87,,Yes-Retired-O,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72409895,20080718,Charleston Hughes,LB,1983-12-14,,,,5023059.0,54740.0,0.0,2009-08-20,36.14,,Yes-Retired-D,,,,,5023059.0
77137104,20090146,Scott McKillop,LB,1986-03-04,,,,5046725.0,54752.0,0.0,2009-08-22,33.92,,Yes-Retired-D,,,,,5046725.0
40405494,20030544,Cliff Washburn,DE,1980-01-25,,,,9615.0,27195.0,0.0,2004-08-21,40.02,,Yes-Retired-D,,,9615.0,,9615.0
24026094,20020470,Justin Smith,LB,1979-06-05,,,,1008299.0,28795.0,0.0,2005-08-13,40.66,,Yes-Retired-D,,,1008299.0,,1008299.0


In [40]:
# Active_Players_Pool is a dataFrame containing all the unique playerIds
active_player_ids = Active_Players_Pool['playerId'].unique()

# Retired_Players_Pool is a dataFrame containing all the unique playerIds
retired_player_ids = Retired_Players_Pool['playerId'].unique()

# Load and filter datasets with specific columns for active players
players_active = pd.read_csv('players.csv', usecols=['playerId', 'nameFull', 'position'])[players['playerId'].isin(active_player_ids)]

# Load and filter datasets with specific columns for retired players
players_retired = pd.read_csv('players.csv', usecols=['playerId', 'nameFull', 'position'])[players['playerId'].isin(retired_player_ids)]


# Data types specifications for each dataset

# Offense datasets
dtype_dict_passer = {
    'playerId': 'int32',
    'playId': 'int32',
    'passOutcomes': 'category',
    'passDepth': 'category',
    'passLength': 'float16',
    'passComp': 'float16',
    'passTd': 'float16',
    'passInt': 'float16',
    'passIntTd': 'float16',
    'passSack': 'float16',
    'passSackYds': 'float16'
}

dtype_dict_rusher = {
    'playerId': 'int32',
    'playId': 'int32',
    'rushType': 'category',
    'rushYards': 'float16',
    'rushPrimary': 'float16',
    'rushTd': 'float32'
}

dtype_dict_receiver = {
    'playerId': 'int32',
    'playId': 'int32',
    'recYards': 'float16',
    'rec': 'float16',
    'recYac': 'float16',
    'rec1down': 'float16',
    'recFumble': 'float16',
    'recEnd': 'category'
}




# Special Teams datasets
dtype_dict_kicks = {
    'playerId': 'int32',
    'playId': 'int32',
    'kickType': 'category',
    'kickOutcome': 'category',
    'kickInside20': 'float16',
    'kickOnside': 'float16',
    'kickOwnRecovery': 'float16',
    'kickLength': 'float32',
    'kickReturnYds': 'float16',
    'kickNetYds': 'float32',
    'kickReturnTd': 'float16'
}

dtype_dict_kickReturns = {
    'playerId': 'int32',
    'playId': 'int32',
    'kickRetOutcome': 'category',
    'kickRetPrimary': 'float16',
    'kickRetYds': 'float16',
    'kickRetTd': 'float32'
}




# Defense datasets
dtype_dict_tackles = {
    'playerId': 'int32',
    'playId': 'int32',
    'tackleType': 'category',
    'tackleYdsScrim': 'float32'
}

dtype_dict_sacks = {
    'playerId': 'int32',
    'playId': 'int32',
    'sackType': 'category',
    'sackYards': 'float16'
}


dtype_dict_fumblForced = {
    'playerId': 'int32',
    'playId': 'int32',
    'fumForcedTurnover': 'float32'
}




# Load and filter datasets for active players with specific columns and data types

#Starting Offense - Passing - Load the data, Filter by active player IDs, Scale & OHE specific columns 
passer_active = pd.read_csv('passer.csv', usecols=dtype_dict_passer.keys(), dtype=dtype_dict_passer)
passer_active = passer_active[passer_active['playerId'].isin(active_player_ids)]
passer_active[['passLength', 'passSackYds']] = StandardScaler().fit_transform(passer_active[['passLength', 'passSackYds']])
passer_active = pd.get_dummies(passer_active, columns=['passOutcomes', 'passDepth'])
#Starting Offense - Rushing - Load the data, Filter by active player IDs, Scale & OHE specific columns 
rusher_active = pd.read_csv('rusher.csv', usecols=dtype_dict_rusher.keys(), dtype=dtype_dict_rusher)
rusher_active = rusher_active[rusher_active['playerId'].isin(active_player_ids)]
rusher_active[['rushYards']] = StandardScaler().fit_transform(rusher_active[['rushYards']])
rusher_active = pd.get_dummies(rusher_active, columns=['rushType'])
#Starting Offense - Receiving - Load the data, Filter by active player IDs, Scale & OHE specific columns 
receiver_active = pd.read_csv('receiver.csv', usecols=dtype_dict_receiver.keys(), dtype=dtype_dict_receiver)
receiver_active = receiver_active[receiver_active['playerId'].isin(active_player_ids)]
receiver_active[['recYards', 'recYac']] = StandardScaler().fit_transform(receiver_active[['recYards', 'recYac']])
receiver_active = pd.get_dummies(receiver_active, columns=['recEnd'])


#Starting Special Teams - Punting - Load the data, Filter by active player IDs, Scale & OHE specific columns 
kicks_active = pd.read_csv('kicks.csv', usecols=dtype_dict_kicks.keys(), dtype=dtype_dict_kicks)
kicks_active = kicks_active[kicks_active['playerId'].isin(active_player_ids)]
kicks_active[['kickLength', 'kickReturnYds', 'kickNetYds']] = StandardScaler().fit_transform(kicks_active[['kickLength', 'kickReturnYds', 'kickNetYds']])
kicks_active = pd.get_dummies(kicks_active, columns=['kickType', 'kickOutcome'])
#Starting Special Teams - Punt Returns and Starting Special Teams - Kick Returns - Load the data, Filter by retired player IDs, Scale & OHE specific columns 
kickReturns_active = pd.read_csv('kickReturns.csv', usecols=dtype_dict_kickReturns.keys(), dtype=dtype_dict_kickReturns)
kickReturns_active = kickReturns_active[kickReturns_active['playerId'].isin(active_player_ids)]
kickReturns_active[['kickRetYds']] = StandardScaler().fit_transform(kickReturns_active[['kickRetYds']])
kickReturns_active = pd.get_dummies(kickReturns_active, columns=['kickRetOutcome'])


#Starting Defense - Tackles - Load the data, Filter by active player IDs, Scale & OHE specific columns 
tackles_active = pd.read_csv('tackles.csv', usecols=dtype_dict_tackles.keys(), dtype=dtype_dict_tackles)
tackles_active = tackles_active[tackles_active['playerId'].isin(active_player_ids)]
tackles_active[['tackleYdsScrim']] = StandardScaler().fit_transform(tackles_active[['tackleYdsScrim']])
tackles_active = pd.get_dummies(tackles_active, columns=['tackleType'])
#Starting Defense - Sacks & Safeties - Load the data, Filter by active player IDs, Scale & OHE specific columns 
sacks_active = pd.read_csv('sacks.csv', usecols=dtype_dict_sacks.keys(), dtype=dtype_dict_sacks)
sacks_active = sacks_active[sacks_active['playerId'].isin(active_player_ids)]
sacks_active[['sackYards']] = StandardScaler().fit_transform(sacks_active[['sackYards']])
sacks_active = pd.get_dummies(sacks_active, columns=['sackType'])
#Starting Defense - Fumbles - Load the data, Filter by active player IDs, Scale & OHE specific columns
fumblForced_active = pd.read_csv('fumblForced.csv', usecols=dtype_dict_fumblForced.keys(), dtype=dtype_dict_fumblForced)
fumblForced_active = fumblForced_active[fumblForced_active['playerId'].isin(active_player_ids)]




# Load and filter datasets for retired players with specific columns and data types

#Starting Offense - Passing - Load the data, Filter by retired player IDs, Scale & OHE specific columns  
passer_retired = pd.read_csv('passer.csv', usecols=dtype_dict_passer.keys(), dtype=dtype_dict_passer)
passer_retired = passer_retired[passer_retired['playerId'].isin(retired_player_ids)]
passer_retired[['passLength', 'passSackYds']] = StandardScaler().fit_transform(passer_retired[['passLength', 'passSackYds']])
passer_retired = pd.get_dummies(passer_retired, columns=['passOutcomes', 'passDepth'])
#Starting Offense - Rushing - Load the data, Filter by retired player IDs, Scale & OHE specific columns
rusher_retired = pd.read_csv('rusher.csv', usecols=dtype_dict_rusher.keys(), dtype=dtype_dict_rusher)
rusher_retired = rusher_retired[rusher_retired['playerId'].isin(retired_player_ids)]
rusher_retired[['rushYards']] = StandardScaler().fit_transform(rusher_retired[['rushYards']])
rusher_retired = pd.get_dummies(rusher_retired, columns=['rushType'])
#Starting Offense - Receiving - Load the data, Filter by retired player IDs, Scale & OHE specific columns 
receiver_retired = pd.read_csv('receiver.csv', usecols=dtype_dict_receiver.keys(), dtype=dtype_dict_receiver)
receiver_retired = receiver_retired[receiver_retired['playerId'].isin(retired_player_ids)]
receiver_retired[['recYards', 'recYac']] = StandardScaler().fit_transform(receiver_retired[['recYards', 'recYac']])
receiver_retired = pd.get_dummies(receiver_retired, columns=['recEnd'])


#Starting Special Teams - Punting - Load the data, Filter by retired player IDs, Scale & OHE specific columns 
kicks_retired = pd.read_csv('kicks.csv', usecols=dtype_dict_kicks.keys(), dtype=dtype_dict_kicks)
kicks_retired = kicks_retired[kicks_retired['playerId'].isin(retired_player_ids)]
kicks_retired[['kickLength', 'kickReturnYds', 'kickNetYds']] = StandardScaler().fit_transform(kicks_retired[['kickLength', 'kickReturnYds', 'kickNetYds']])
kicks_retired = pd.get_dummies(kicks_retired, columns=['kickType', 'kickOutcome'])
#Starting Special Teams - Punt Returns and Starting Special Teams - Kick Returns - Load the data, Filter by retired player IDs, Scale & OHE specific columns 
kickReturns_retired = pd.read_csv('kickReturns.csv', usecols=dtype_dict_kickReturns.keys(), dtype=dtype_dict_kickReturns)
kickReturns_retired = kickReturns_retired[kickReturns_retired['playerId'].isin(retired_player_ids)]
kickReturns_retired[['kickRetYds']] = StandardScaler().fit_transform(kickReturns_retired[['kickRetYds']])
kickReturns_retired = pd.get_dummies(kickReturns_retired, columns=['kickRetOutcome'])


#Starting Defense - Tackles - Load the data, Filter by retired player IDs, Scale & OHE specific columns
tackles_retired = pd.read_csv('tackles.csv', usecols=dtype_dict_tackles.keys(), dtype=dtype_dict_tackles)
tackles_retired = tackles_retired[tackles_retired['playerId'].isin(retired_player_ids)]
tackles_retired[['tackleYdsScrim']] = StandardScaler().fit_transform(tackles_retired[['tackleYdsScrim']])
tackles_retired = pd.get_dummies(tackles_retired, columns=['tackleType'])
#Starting Defense - Sacks & Safeties - Load the data, Filter by active player IDs, Scale & OHE specific columns 
sacks_retired = pd.read_csv('sacks.csv', usecols=dtype_dict_sacks.keys(), dtype=dtype_dict_sacks)
sacks_retired = sacks_retired[sacks_retired['playerId'].isin(retired_player_ids)]
sacks_retired[['sackYards']] = StandardScaler().fit_transform(sacks_retired[['sackYards']])
sacks_retired = pd.get_dummies(sacks_retired, columns=['sackType'])
#Starting Defense - Fumbles - Load the data, Filter by retired player IDs, Scale & OHE specific columns
fumblForced_retired = pd.read_csv('fumblForced.csv', usecols=dtype_dict_fumblForced.keys(), dtype=dtype_dict_fumblForced)
fumblForced_retired = fumblForced_retired[fumblForced_retired['playerId'].isin(retired_player_ids)]


In [41]:
# Passing - Offense - Optimal Team Selection 'RB', 'DE', 'OT', 'DT', 'DB', 'LB', 'OG', 'QB', 'TE', 'C', 'WR', 'P', 'FB', 'K', 'OL', 'LS', 'OLB', 'KR', 'S', 'DL', 'PK'
# P (Punter), K (Kicker), WR (Wide Receiver), DB (Defensive Back), RB (Running Back), DE (Defensive End), OT (Offensive Tackle), DT (Defensive Tackle), LB (Linebacker), 
# OG (Offensive Guard), QB (Quarterback), TE (Tight End), C (Center), FB (Fullback), OL (Offensive Lineman), LS (Long Snapper), OLB (Outside Linebacker), KR (Kick Returner), 
# S (Safety), DL (Defensive Lineman), PK (Placekicker)

# Set random seeds for reproducibility
def set_seeds(seed=42):
    np.random.seed(seed)
    random.seed(seed)
    tf.random.set_seed(seed)

def process_passer_data(passer_data, players_data, score_label):
    # Merge to include full names and exclude specific positions
    passer_data = pd.merge(passer_data, players_data[['playerId', 'nameFull', 'position']], on='playerId', how='left')
    passer_data = passer_data[~passer_data['position'].isin(['P', 'K', 'RB', 'DB', 'WR'])]


     # Calculate the performance score for each play
    passer_data[score_label] = (
        passer_data['passComp'] * 0.40 +  # Accurate completion rate is crucial
        passer_data['passTd'] * 0.30 +  # Touchdowns directly contribute to scoring
        passer_data['passInt'] * -0.20 +  # Interceptions result in turnovers
        passer_data['passIntTd'] * -0.10 +  # Interception touchdowns are very damaging
        passer_data['passSack'] * -0.10 +  # Sacks disrupt the offensive flow
        passer_data['passSackYds'] * -0.05 +  # Loss of yardage due to sacks is negative
        passer_data['passLength'] * 0.15 +  # Longer passes indicate greater yardage gains
        passer_data['passOutcomes_complete'] * 0.05 +  # Successful completions are beneficial
        passer_data['passOutcomes_incomplete'] * -0.05 +  # Incompletions halt the drive
        passer_data['passOutcomes_interception'] * -0.15 +  # Interceptions are turnovers
        passer_data['passOutcomes_sack'] * -0.10 +  # Sacks are negative plays
        passer_data['passDepth_deep'] * 0.10 +  # Ability to make deep passes
        passer_data['passDepth_short'] * 0.10  # Reliability in short passes
    )

    
    # Aggregate data by playerId to capture overall performance characteristics
    features = passer_data.columns.difference(['playerId', 'nameFull', 'position', 'playId', score_label])
    aggregated_data = passer_data.groupby('playerId').agg({**{feat: 'mean' for feat in features},
                                                           'nameFull': 'first',  
                                                           'position': 'first',
                                                           'playId': 'first',
                                                           score_label: 'mean'}).reset_index()  

   
    # Define target and features for the model
    y = aggregated_data[score_label]  # Target score_labels
    X = aggregated_data[features]  # All features after aggregation

    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    
    # Build and train the model, 
    model = Sequential([
        Input(shape=(X_train.shape[1],)),  #Input layer num of input features based on the training data
        Dense(256, activation='relu'),  # A fully connected hidden layer w/ 256 neurons
        Dense(128, activation='relu'),  # A fully connected hidden layer w/ 128 neurons
        Dense(64, activation='relu'),  #A third fully connected hidden layer with 64 neurons
        Dense(1)  #The fully connected output layer with 1 neuron for the regression task
    ])
        
    
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=[MeanSquaredError(), MeanAbsoluteError()])
    model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

    # Predict performance scores
    predictions = model.predict(X_test)
    
    # Calculate the 40th percentile of the predicted scores
    threshold = np.percentile(predictions, 40)

    # Add the threshold value as a repeating value in the DataFrame
    aggregated_data['Threshold Value (A %ile of Predicted)'] = threshold
    
    # Label players based on the actual performance score compared to the threshold
    aggregated_data['Predicted Class Label (Via Threshold)'] = aggregated_data[score_label].apply(lambda x: 'Optimal Player' if x >= threshold else 'Not Optimal Player')

    # Sort and select the top players based on their actual performance scores
    top_indices = aggregated_data[score_label].argsort()[-3:][::-1]
    top_3_passing_players = aggregated_data.iloc[top_indices]

    # Sort and print the top 3 players
    top_3_passing_players = top_3_passing_players.sort_values(by=score_label, ascending=False)    
    return top_3_passing_players[['playerId', 'nameFull', 'position', score_label, 'Threshold Value (A %ile of Predicted)', 'Predicted Class Label (Via Threshold)']]

# Output active and retired passers dataframes
#process_passer_data(passer_active, players_active, 'Active_Passer_Performance_Score')
#process_passer_data(passer_retired, players_retired, 'Retired_Passer_Performance_Score')


In [42]:
# Define the global variable and store the data in a DataFrame
global optimal_passers_active
optimal_passers_active = pd.DataFrame(process_passer_data(passer_active, players_active, 'Active_Passer_Performance_Score'))
optimal_passers_active

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - loss: 0.0492 - mean_absolute_error: 0.2126 - mean_squared_error: 0.0492 - val_loss: 0.0119 - val_mean_absolute_error: 0.1079 - val_mean_squared_error: 0.0119
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 355ms/step - loss: 0.0206 - mean_absolute_error: 0.1413 - mean_squared_error: 0.0206 - val_loss: 0.0015 - val_mean_absolute_error: 0.0352 - val_mean_squared_error: 0.0015
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 133ms/step - loss: 0.0056 - mean_absolute_error: 0.0670 - mean_squared_error: 0.0056 - val_loss: 0.0011 - val_mean_absolute_error: 0.0295 - val_mean_squared_error: 0.0011
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 159ms/step - loss: 0.0013 - mean_absolute_error: 0.0288 - mean_squared_error: 0.0013 - val_loss: 0.0058 - val_mean_absolute_error: 0.0744 - val_mean_squared_error: 0.0058
Epoch 5/10
[1m1/1[0m [32

Unnamed: 0,playerId,nameFull,position,Active_Passer_Performance_Score,Threshold Value (A %ile of Predicted),Predicted Class Label (Via Threshold)
7,20150001,Jameis Winston,QB,0.349675,0.301055,Optimal Player
8,20160135,Dak Prescott,QB,0.346751,0.301055,Optimal Player
1,20040004,Philip Rivers,QB,0.339104,0.301055,Optimal Player


In [43]:
# Define the global variable and store the data in a DataFrame
global optimal_passers_retired
optimal_passers_retired = pd.DataFrame(process_passer_data(passer_retired, players_retired, 'Retired_Passer_Performance_Score'))
optimal_passers_retired

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - loss: 0.0523 - mean_absolute_error: 0.2123 - mean_squared_error: 0.0523 - val_loss: 0.0357 - val_mean_absolute_error: 0.1611 - val_mean_squared_error: 0.0357
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/step - loss: 0.0263 - mean_absolute_error: 0.1377 - mean_squared_error: 0.0263 - val_loss: 0.0186 - val_mean_absolute_error: 0.0990 - val_mean_squared_error: 0.0186
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 163ms/step - loss: 0.0128 - mean_absolute_error: 0.0960 - mean_squared_error: 0.0128 - val_loss: 0.0102 - val_mean_absolute_error: 0.0922 - val_mean_squared_error: 0.0102
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 162ms/step - loss: 0.0063 - mean_absolute_error: 0.0702 - mean_squared_error: 0.0063 - val_loss: 0.0059 - val_mean_absolute_error: 0.0767 - val_mean_squared_error: 0.0059
Epoch 5/10
[1m1/1[0m [32

Unnamed: 0,playerId,nameFull,position,Retired_Passer_Performance_Score,Threshold Value (A %ile of Predicted),Predicted Class Label (Via Threshold)
9,20050488,Brian Wrobel,QB,0.49082,0.270345,Optimal Player
11,20060452,Brett Basanez,QB,0.35157,0.270345,Optimal Player
6,20030110,Seneca Wallace,QB,0.330737,0.270345,Optimal Player


In [44]:
# Rushing - Offense - Optimal Team Selection 'RB', 'DE', 'OT', 'DT', 'DB', 'LB', 'OG', 'QB', 'TE', 'C', 'WR', 'P', 'FB', 'K', 'OL', 'LS', 'OLB', 'KR', 'S', 'DL', 'PK'
# P (Punter), K (Kicker), WR (Wide Receiver), DB (Defensive Back), RB (Running Back), DE (Defensive End), OT (Offensive Tackle), DT (Defensive Tackle), LB (Linebacker), 
# OG (Offensive Guard), QB (Quarterback), TE (Tight End), C (Center), FB (Fullback), OL (Offensive Lineman), LS (Long Snapper), OLB (Outside Linebacker), KR (Kick Returner), 
# S (Safety), DL (Defensive Lineman), PK (Placekicker)

# Set random seeds for reproducibility
def set_seeds(seed=42):
    np.random.seed(seed)
    random.seed(seed)
    tf.random.set_seed(seed)
    
def process_rusher_data(rusher_data, players_data, score_label):
    # Merge to include full names and exclude specific positions
    rusher_data = pd.merge(rusher_data, players_data[['playerId', 'nameFull', 'position']], on='playerId', how='left')
    rusher_data = rusher_data[~rusher_data['position'].isin(['QB', 'P', 'K', 'WR', 'TE', 'FB', 'S', 'DB'])]


     # Calculate the performance score for each play
    rusher_data[score_label] = (
        rusher_data['rushYards'] * 0.35 +  # Yardage gained is crucial
        rusher_data['rushPrimary'] * 0.15 +  # Primary rushing attempts are important
        rusher_data['rushTd'] * 0.25 +  # Touchdowns directly contribute to scoring
        rusher_data['rushType_aborted'] * -0.05 +  # Aborted rush attempts are negative
        rusher_data['rushType_kneel'] * 0.00 +  # Kneel downs are neutral
        rusher_data['rushType_pass'] * -0.10 +  # Pass attempts in rushing context can be negative
        rusher_data['rushType_qb scramble'] * 0.05 +  # QB scrambles indicate versatility
        rusher_data['rushType_rush'] * 0.25  # Successful rush attempts are critical
    )

    
    # Aggregate data by playerId to capture overall performance characteristics
    features = rusher_data.columns.difference(['playerId', 'nameFull', 'position', 'playId', score_label])
    aggregated_data = rusher_data.groupby('playerId').agg({**{feat: 'mean' for feat in features},
                                                           'nameFull': 'first',  
                                                           'position': 'first',
                                                           'playId': 'first',
                                                           score_label: 'mean'}).reset_index()  

   
    # Define target and features for the model
    y = aggregated_data[score_label]  # Target score_labels
    X = aggregated_data[features]  # All features after aggregation

    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    
    # Build and train the model, 
    model = Sequential([
        Input(shape=(X_train.shape[1],)),  #Input layer num of input features based on the training data
        Dense(256, activation='relu'),  # A fully connected hidden layer w/ 256 neurons
        Dense(128, activation='relu'),  # A fully connected hidden layer w/ 128 neurons
        Dense(64, activation='relu'),  #A third fully connected hidden layer with 64 neurons
        Dense(1)  #The fully connected output layer with 1 neuron for the regression task
    ])
        
    
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=[MeanSquaredError(), MeanAbsoluteError()])
    model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

    # Predict performance scores
    predictions = model.predict(X_test)
    
    # Calculate the 35th percentile of the predicted scores
    threshold = np.percentile(predictions, 35)

    # Add the threshold value as a repeating value in the DataFrame
    aggregated_data['Threshold Value (A %ile of Predicted)'] = threshold
    
    # Label players based on the actual performance score compared to the threshold
    aggregated_data['Predicted Class Label (Via Threshold)'] = aggregated_data[score_label].apply(lambda x: 'Optimal Player' if x >= threshold else 'Not Optimal Player')

    # Sort and select the top players based on their actual performance scores
    top_indices = aggregated_data[score_label].argsort()[-4:][::-1]
    top_4_rushing_players = aggregated_data.iloc[top_indices]

    # Sort and print the top 4 players
    top_4_rushing_players = top_4_rushing_players.sort_values(by=score_label, ascending=False)    
    return top_4_rushing_players[['playerId', 'nameFull', 'position', score_label, 'Threshold Value (A %ile of Predicted)', 'Predicted Class Label (Via Threshold)']]

# Output active and passive dataframes
#process_rusher_data(rusher_active, players_active, 'Active_Rusher_Performance_Score')
#process_rusher_data(rusher_retired, players_retired, 'Retired_Rusher_Performance_Score')


In [45]:
# Define the global variable and store the data in a DataFrame
global optimal_rushers_active
optimal_rushers_active = pd.DataFrame(process_rusher_data(rusher_active, players_active, 'Active_Rusher_Performance_Score'))
optimal_rushers_active

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - loss: 0.1382 - mean_absolute_error: 0.3656 - mean_squared_error: 0.1382 - val_loss: 0.0957 - val_mean_absolute_error: 0.3053 - val_mean_squared_error: 0.0957
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 342ms/step - loss: 0.0602 - mean_absolute_error: 0.2347 - mean_squared_error: 0.0602 - val_loss: 0.0429 - val_mean_absolute_error: 0.1997 - val_mean_squared_error: 0.0429
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 120ms/step - loss: 0.0210 - mean_absolute_error: 0.1335 - mean_squared_error: 0.0210 - val_loss: 0.0131 - val_mean_absolute_error: 0.0979 - val_mean_squared_error: 0.0131
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 115ms/step - loss: 0.0062 - mean_absolute_error: 0.0626 - mean_squared_error: 0.0062 - val_loss: 0.0036 - val_mean_absolute_error: 0.0538 - val_mean_squared_error: 0.0036
Epoch 5/10
[1m1/1[0m [32

Unnamed: 0,playerId,nameFull,position,Active_Rusher_Performance_Score,Threshold Value (A %ile of Predicted),Predicted Class Label (Via Threshold)
10,20150905,Raheem Mostert,RB,0.507599,0.39317,Optimal Player
0,20080073,Jamaal Charles,RB,0.477637,0.39317,Optimal Player
17,20170067,Alvin Kamara,RB,0.474016,0.39317,Optimal Player
19,20170121,Joe Williams,RB,0.455529,0.39317,Optimal Player


In [46]:
# Define the global variable and store the data in a DataFrame
global optimal_rushers_retired
optimal_rushers_retired = pd.DataFrame(process_rusher_data(rusher_retired, players_retired, 'Retired_Rusher_Performance_Score'))
optimal_rushers_retired

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - loss: 0.1994 - mean_absolute_error: 0.4404 - mean_squared_error: 0.1994 - val_loss: 0.1146 - val_mean_absolute_error: 0.3354 - val_mean_squared_error: 0.1146
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/step - loss: 0.1110 - mean_absolute_error: 0.3256 - mean_squared_error: 0.1110 - val_loss: 0.0580 - val_mean_absolute_error: 0.2370 - val_mean_squared_error: 0.0580
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 112ms/step - loss: 0.0561 - mean_absolute_error: 0.2271 - mean_squared_error: 0.0561 - val_loss: 0.0236 - val_mean_absolute_error: 0.1473 - val_mean_squared_error: 0.0236
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 118ms/step - loss: 0.0233 - mean_absolute_error: 0.1399 - mean_squared_error: 0.0233 - val_loss: 0.0058 - val_mean_absolute_error: 0.0631 - val_mean_squared_error: 0.0058
Epoch 5/10
[1m1/1[0m [32

Unnamed: 0,playerId,nameFull,position,Retired_Rusher_Performance_Score,Threshold Value (A %ile of Predicted),Predicted Class Label (Via Threshold)
0,19910154,Fred McAfee,RB,0.762634,0.496001,Optimal Player
3,20000166,Chad Morton,RB,0.513293,0.496001,Optimal Player
19,20070416,Selvin Young,RB,0.458506,0.496001,Not Optimal Player
11,20050412,Ryan Grant,RB,0.453792,0.496001,Not Optimal Player


In [47]:
# Receiving - Offense - Optimal Team Selection 'RB', 'DE', 'OT', 'DT', 'DB', 'LB', 'OG', 'QB', 'TE', 'C', 'WR', 'P', 'FB', 'K', 'OL', 'LS', 'OLB', 'KR', 'S', 'DL', 'PK'
# P (Punter), K (Kicker), WR (Wide Receiver), DB (Defensive Back), RB (Running Back), DE (Defensive End), OT (Offensive Tackle), DT (Defensive Tackle), LB (Linebacker), 
# OG (Offensive Guard), QB (Quarterback), TE (Tight End), C (Center), FB (Fullback), OL (Offensive Lineman), LS (Long Snapper), OLB (Outside Linebacker), KR (Kick Returner), 
# S (Safety), DL (Defensive Lineman), PK (Placekicker)

# Set random seeds for reproducibility
def set_seeds(seed=42):
    np.random.seed(seed)
    random.seed(seed)
    tf.random.set_seed(seed)
    
def process_receiver_data(receiver_data, players_data, score_label):
    # Merge to include full names and exclude specific positions
    receiver_data = pd.merge(receiver_data, players_data[['playerId', 'nameFull', 'position']], on='playerId', how='left')
    receiver_data = receiver_data[~receiver_data['position'].isin(['FB', 'QB', 'OG', 'RB', 'DB'])]


     # Calculate the performance score for each play
    receiver_data[score_label] = (
        receiver_data['recYards'] * 0.30 +  # Yardage gained is crucial
        receiver_data['rec'] * 0.20 +  # Receptions are crucial
        receiver_data['recYac'] * 0.15 +  # Yards after catch are important
        receiver_data['rec1down'] * 0.20 +  # First downs are important
        receiver_data['recFumble'] * -0.25 +  # Fumbles are highly negative
        receiver_data['recEnd_in bounds'] * 0.05 +  # Staying in bounds is positive
        receiver_data['recEnd_pushed out of bounds'] * 0.05 +  # Pushed out of bounds is positive
        receiver_data['recEnd_ran out of bounds'] * 0.05  # Running out of bounds is positive
    )

    
    # Aggregate data by playerId to capture overall performance characteristics
    features = receiver_data.columns.difference(['playerId', 'nameFull', 'position', 'playId', score_label])
    aggregated_data = receiver_data.groupby('playerId').agg({**{feat: 'mean' for feat in features},
                                                           'nameFull': 'first',  
                                                           'position': 'first',
                                                           'playId': 'first',
                                                           score_label: 'mean'}).reset_index()  

   
    # Define target and features for the model
    y = aggregated_data[score_label]  # Target score_labels
    X = aggregated_data[features]  # All features after aggregation

    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    
    # Build and train the model, 
    model = Sequential([
        Input(shape=(X_train.shape[1],)),  #Input layer num of input features based on the training data
        Dense(256, activation='relu'),  # A fully connected hidden layer w/ 256 neurons
        Dense(128, activation='relu'),  # A fully connected hidden layer w/ 128 neurons
        Dense(64, activation='relu'),  #A third fully connected hidden layer with 64 neurons
        Dense(1)  #The fully connected output layer with 1 neuron for the regression task
    ])
        
    
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=[MeanSquaredError(), MeanAbsoluteError()])
    model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

    # Predict performance scores
    predictions = model.predict(X_test)
    
    # Calculate the 65th percentile of the predicted scores
    threshold = np.percentile(predictions, 65)

    # Add the threshold value as a repeating value in the DataFrame
    aggregated_data['Threshold Value (A %ile of Predicted)'] = threshold
    
    # Label players based on the actual performance score compared to the threshold
    aggregated_data['Predicted Class Label (Via Threshold)'] = aggregated_data[score_label].apply(lambda x: 'Optimal Player' if x >= threshold else 'Not Optimal Player')

    # Sort and select the top players based on their actual performance scores
    top_indices = aggregated_data[score_label].argsort()[-4:][::-1]
    top_4_receiving_players = aggregated_data.iloc[top_indices]

    # Sort and print the top 4 players
    top_4_receiving_players = top_4_receiving_players.sort_values(by=score_label, ascending=False)    
    return top_4_receiving_players[['playerId', 'nameFull', 'position', score_label, 'Threshold Value (A %ile of Predicted)', 'Predicted Class Label (Via Threshold)']]

# Output active and passive dataframes
#process_receiver_data(receiver_active, players_active, 'Active_Receiver_Performance_Score')
#process_receiver_data(receiver_retired, players_retired, 'Retired_Receiver_Performance_Score')


In [48]:
# Define the global variable and store the data in a DataFrame
global optimal_receivers_active
optimal_receivers_active = pd.DataFrame(process_receiver_data(receiver_active, players_active, 'Active_Receiver_Performance_Score'))
optimal_receivers_active

Epoch 1/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 349ms/step - loss: 0.0270 - mean_absolute_error: 0.1491 - mean_squared_error: 0.0270 - val_loss: 0.0136 - val_mean_absolute_error: 0.0764 - val_mean_squared_error: 0.0136
Epoch 2/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step - loss: 0.0063 - mean_absolute_error: 0.0593 - mean_squared_error: 0.0063 - val_loss: 0.0181 - val_mean_absolute_error: 0.1126 - val_mean_squared_error: 0.0181
Epoch 3/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step - loss: 0.0086 - mean_absolute_error: 0.0745 - mean_squared_error: 0.0086 - val_loss: 0.0107 - val_mean_absolute_error: 0.0816 - val_mean_squared_error: 0.0107
Epoch 4/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step - loss: 0.0042 - mean_absolute_error: 0.0515 - mean_squared_error: 0.0042 - val_loss: 0.0045 - val_mean_absolute_error: 0.0439 - val_mean_squared_error: 0.0045
Epoch 5/10
[1m2/2[0m [32

Unnamed: 0,playerId,nameFull,position,Active_Receiver_Performance_Score,Threshold Value (A %ile of Predicted),Predicted Class Label (Via Threshold)
12,20120760,Sean McGrath,TE,0.38897,0.214641,Optimal Player
41,20170582,Chris Thompson,WR,0.261886,0.214641,Optimal Player
13,20121011,Eric Wallace,TE,0.25813,0.214641,Optimal Player
56,20190608,Jesper Horsted,WR,0.25419,0.214641,Optimal Player


In [49]:
# Define the global variable and store the data in a DataFrame
global optimal_receivers_retired
optimal_receivers_retired = pd.DataFrame(process_receiver_data(receiver_retired, players_retired, 'Retired_Receiver_Performance_Score'))
optimal_receivers_retired

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - loss: 0.0372 - mean_absolute_error: 0.1367 - mean_squared_error: 0.0372 - val_loss: 0.0042 - val_mean_absolute_error: 0.0535 - val_mean_squared_error: 0.0042
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 140ms/step - loss: 0.0202 - mean_absolute_error: 0.0926 - mean_squared_error: 0.0202 - val_loss: 0.0027 - val_mean_absolute_error: 0.0463 - val_mean_squared_error: 0.0027
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 105ms/step - loss: 0.0118 - mean_absolute_error: 0.0719 - mean_squared_error: 0.0118 - val_loss: 0.0031 - val_mean_absolute_error: 0.0470 - val_mean_squared_error: 0.0031
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 99ms/step - loss: 0.0075 - mean_absolute_error: 0.0674 - mean_squared_error: 0.0075 - val_loss: 0.0021 - val_mean_absolute_error: 0.0422 - val_mean_squared_error: 0.0021
Epoch 5/10
[1m1/1[0m [32m

Unnamed: 0,playerId,nameFull,position,Retired_Receiver_Performance_Score,Threshold Value (A %ile of Predicted),Predicted Class Label (Via Threshold)
29,20030409,Terrence Edwards,WR,0.810742,0.250052,Optimal Player
48,20060629,Rick Gatewood,WR,0.510612,0.250052,Optimal Player
3,19940181,Bill Schroeder,WR,0.376343,0.250052,Optimal Player
26,20030226,Walter Young,WR,0.315848,0.250052,Optimal Player


In [50]:
# Punting - Special Teams - Optimal Team Selection 'RB', 'DE', 'OT', 'DT', 'DB', 'LB', 'OG', 'QB', 'TE', 'C', 'WR', 'P', 'FB', 'K', 'OL', 'LS', 'OLB', 'KR', 'S', 'DL', 'PK'
# P (Punter), K (Kicker), WR (Wide Receiver), DB (Defensive Back), RB (Running Back), DE (Defensive End), OT (Offensive Tackle), DT (Defensive Tackle), LB (Linebacker), 
# OG (Offensive Guard), QB (Quarterback), TE (Tight End), C (Center), FB (Fullback), OL (Offensive Lineman), LS (Long Snapper), OLB (Outside Linebacker), KR (Kick Returner), 
# S (Safety), DL (Defensive Lineman), PK (Placekicker)

# Set random seeds for reproducibility
def set_seeds(seed=42):
    np.random.seed(seed)
    random.seed(seed)
    tf.random.set_seed(seed)
    
def process_kicks_data(kicks_data, players_data, score_label):
    # Merge to include full names and exclude specific positions
    kicks_data = pd.merge(kicks_data, players_data[['playerId', 'nameFull', 'position']], on='playerId', how='left')
    kicks_data = kicks_data[~kicks_data['position'].isin(['QB', 'DE' ])]


     # Calculate the performance score for each play
    kicks_data[score_label] = (
        kicks_data['kickInside20'] * 0.20 +  # Placing the ball inside the 20-yard line improves field position significantly
        kicks_data['kickOnside'] * 0.10 +  # Onside kicks can potentially change possession, thus important
        kicks_data['kickOwnRecovery'] * 0.15 +  # Recovering one's own kick is highly advantageous
        kicks_data['kickLength'] * 0.10 +  # The distance of the kick indicates overall effectiveness
        kicks_data['kickReturnYds'] * -0.05 +  # Yardage gained by the return team is negative for the kicking team
        kicks_data['kickNetYds'] * 0.20 +  # Net yardage combines kick length and return yards, key for overall effectiveness
        kicks_data['kickReturnTd'] * -0.25 +  # Return touchdowns by the opposing team are highly detrimental
        kicks_data['kickType_aborted'] * -0.10 +  # Aborted kick attempts are negative plays
        kicks_data['kickType_field goal'] * 0.15 +  # Successful field goals contribute directly to scoring
        kicks_data['kickType_kickoff'] * 0.05 +  # Kickoffs are important but less critical than field goals
        kicks_data['kickType_pass'] * -0.10 +  # Pass attempts in kicking context can be negative if they fail
        kicks_data['kickType_punt'] * 0.05 +  # Punts are useful for changing field position
        kicks_data['kickType_xp'] * 0.10 +  # Extra points directly contribute to the score
        kicks_data['kickOutcome_blocked'] * -0.20 +  # Blocked kicks are highly negative outcomes
        kicks_data['kickOutcome_downed'] * 0.10 +  # Successfully downing the ball is positive
        kicks_data['kickOutcome_fair catch'] * 0.05 +  # Fair catches prevent return yardage, beneficial for the kicking team
        kicks_data['kickOutcome_good'] * 0.25 +  # Successful kicks (field goals and extra points) are highly positive
        kicks_data['kickOutcome_missed'] * -0.20 +  # Missed kicks are highly negative outcomes
        kicks_data['kickOutcome_onside'] * 0.10 +  # Successful onside kicks are positive
        kicks_data['kickOutcome_out of bounds'] * -0.10 +  # Kicks out of bounds are generally negative
        kicks_data['kickOutcome_own recovery'] * 0.15 +  # Recovering one's own kick is highly positive
        kicks_data['kickOutcome_returned'] * -0.05 +  # Returned kicks are generally negative
        kicks_data['kickOutcome_touchback'] * 0.05  # Touchbacks prevent returns, thus slightly positive
    )

    
    # Aggregate data by playerId to capture overall performance characteristics
    features = kicks_data.columns.difference(['playerId', 'nameFull', 'position', 'playId', score_label])
    aggregated_data = kicks_data.groupby('playerId').agg({**{feat: 'mean' for feat in features},
                                                           'nameFull': 'first',  
                                                           'position': 'first',
                                                           'playId': 'first',
                                                           score_label: 'mean'}).reset_index()  

   
    # Define target and features for the model
    y = aggregated_data[score_label]  # Target score_labels
    X = aggregated_data[features]  # All features after aggregation

    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    
    # Build and train the model, 
    model = Sequential([
        Input(shape=(X_train.shape[1],)),  #Input layer num of input features based on the training data
        Dense(256, activation='relu'),  # A fully connected hidden layer w/ 256 neurons
        Dense(128, activation='relu'),  # A fully connected hidden layer w/ 128 neurons
        Dense(64, activation='relu'),  #A third fully connected hidden layer with 64 neurons
        Dense(1)  #The fully connected output layer with 1 neuron for the regression task
    ])
        
    
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=[MeanSquaredError(), MeanAbsoluteError()])
    model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

    # Predict performance scores
    predictions = model.predict(X_test)
    
    # Calculate the 35th percentile of the predicted scores
    threshold = np.percentile(predictions, 35)

    # Add the threshold value as a repeating value in the DataFrame
    aggregated_data['Threshold Value (A %ile of Predicted)'] = threshold
    
    # Label players based on the actual performance score compared to the threshold
    aggregated_data['Predicted Class Label (Via Threshold)'] = aggregated_data[score_label].apply(lambda x: 'Optimal Player' if x >= threshold else 'Not Optimal Player')

    # Sort and select the top players based on their actual performance scores
    top_indices = aggregated_data[score_label].argsort()[-4:][::-1]
    top_4_punting_players = aggregated_data.iloc[top_indices]

    # Sort and print the top 4 players
    top_4_punting_players = top_4_punting_players.sort_values(by=score_label, ascending=False)    
    return top_4_punting_players[['playerId', 'nameFull', 'position', score_label, 'Threshold Value (A %ile of Predicted)', 'Predicted Class Label (Via Threshold)']]

# Output active and passive dataframes
#process_kicks_data(kicks_active, players_active, 'Active_Kicks_Performance_Score')
#process_kicks_data(kicks_retired, players_retired, 'Retired_Kicks_Performance_Score')


In [51]:
# Define the global variable and store the data in a DataFrame
global optimal_punting_active
optimal_punting_active = pd.DataFrame(process_kicks_data(kicks_active, players_active, 'Active_Punting_Performance_Score'))
optimal_punting_active

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - loss: 0.0285 - mean_absolute_error: 0.1596 - mean_squared_error: 0.0285 - val_loss: 0.0095 - val_mean_absolute_error: 0.0852 - val_mean_squared_error: 0.0095
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step - loss: 0.0107 - mean_absolute_error: 0.0864 - mean_squared_error: 0.0107 - val_loss: 0.0044 - val_mean_absolute_error: 0.0438 - val_mean_squared_error: 0.0044
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step - loss: 0.0042 - mean_absolute_error: 0.0405 - mean_squared_error: 0.0042 - val_loss: 0.0038 - val_mean_absolute_error: 0.0586 - val_mean_squared_error: 0.0038
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step - loss: 0.0031 - mean_absolute_error: 0.0491 - mean_squared_error: 0.0031 - val_loss: 0.0041 - val_mean_absolute_error: 0.0587 - val_mean_squared_error: 0.0041
Epoch 5/10
[1m1/1[0m [32

Unnamed: 0,playerId,nameFull,position,Active_Punting_Performance_Score,Threshold Value (A %ile of Predicted),Predicted Class Label (Via Threshold)
24,20170673,Taylor Symmank,P,0.30888,0.109924,Optimal Player
11,20130165,Sam Martin,P,0.249128,0.109924,Optimal Player
32,20190163,Jake Bailey,P,0.247981,0.109924,Optimal Player
8,20110192,Matt Bosher,P,0.246381,0.109924,Optimal Player


In [52]:
# Define the global variable and store the data in a DataFrame
global optimal_punting_retired
optimal_punting_retired = pd.DataFrame(process_kicks_data(kicks_retired, players_retired, 'Retired_Punting_Performance_Score'))
optimal_punting_retired

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - loss: 0.0277 - mean_absolute_error: 0.1503 - mean_squared_error: 0.0277 - val_loss: 0.0053 - val_mean_absolute_error: 0.0584 - val_mean_squared_error: 0.0053
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 342ms/step - loss: 0.0066 - mean_absolute_error: 0.0681 - mean_squared_error: 0.0066 - val_loss: 0.0016 - val_mean_absolute_error: 0.0359 - val_mean_squared_error: 0.0016
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step - loss: 0.0025 - mean_absolute_error: 0.0391 - mean_squared_error: 0.0025 - val_loss: 0.0060 - val_mean_absolute_error: 0.0710 - val_mean_squared_error: 0.0060
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step - loss: 0.0049 - mean_absolute_error: 0.0642 - mean_squared_error: 0.0049 - val_loss: 0.0082 - val_mean_absolute_error: 0.0815 - val_mean_squared_error: 0.0082
Epoch 5/10
[1m1/1[0m [32

Unnamed: 0,playerId,nameFull,position,Retired_Punting_Performance_Score,Threshold Value (A %ile of Predicted),Predicted Class Label (Via Threshold)
33,20080638,Waylon Prather,P,0.391962,0.185195,Optimal Player
31,20070575,Brian Monroe,P,0.282024,0.185195,Optimal Player
30,20070543,Brendan Carney,P,0.266111,0.185195,Optimal Player
6,19930200,Craig Hentrich,P,0.265365,0.185195,Optimal Player


In [53]:
# Punt Returns and Kick Return - Special Teams - Optimal Team Selection 'RB', 'DE', 'OT', 'DT', 'DB', 'LB', 'OG', 'QB', 'TE', 'C', 'WR', 'P', 'FB', 'K', 'OL', 'LS', 'OLB', 'KR', 'S', 'DL', 'PK'
# P (Punter), K (Kicker), WR (Wide Receiver), DB (Defensive Back), RB (Running Back), DE (Defensive End), OT (Offensive Tackle), DT (Defensive Tackle), LB (Linebacker), 
# OG (Offensive Guard), QB (Quarterback), TE (Tight End), C (Center), FB (Fullback), OL (Offensive Lineman), LS (Long Snapper), OLB (Outside Linebacker), KR (Kick Returner), 
# S (Safety), DL (Defensive Lineman), PK (Placekicker)

# Set random seeds for reproducibility
def set_seeds(seed=42):
    np.random.seed(seed)
    random.seed(seed)
    tf.random.set_seed(seed)
    
def process_kickReturns_data(kickReturns_data, players_data, score_label):
    # Merge to include full names and exclude specific positions
    kickReturns_data = pd.merge(kickReturns_data, players_data[['playerId', 'nameFull', 'position']], on='playerId', how='left')
    kickReturns_data = kickReturns_data[~kickReturns_data['position'].isin(['DB'  ])]


     # Calculate the performance score for each play
    kickReturns_data[score_label] = (
        kickReturns_data['kickRetPrimary'] * 0.20 +  # Primary return attempts are crucial
        kickReturns_data['kickRetYds'] * 0.30 +  # Yardage gained is highly important
        kickReturns_data['kickRetTd'] * 0.30 +  # Return touchdowns directly contribute to scoring
        kickReturns_data['kickRetOutcome_fair catch'] * 0.10 +  # Fair catches are strategic plays
        kickReturns_data['kickRetOutcome_returned'] * 0.10  # Successfully returning the kick is positive
    )

    
    # Aggregate data by playerId to capture overall performance characteristics
    features = kickReturns_data.columns.difference(['playerId', 'nameFull', 'position', 'playId', score_label])
    aggregated_data = kickReturns_data.groupby('playerId').agg({**{feat: 'mean' for feat in features},
                                                           'nameFull': 'first',  
                                                           'position': 'first',
                                                           'playId': 'first',
                                                           score_label: 'mean'}).reset_index()  

   
    # Define target and features for the model
    y = aggregated_data[score_label]  # Target score_labels
    X = aggregated_data[features]  # All features after aggregation

    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    
    # Build and train the model, 
    model = Sequential([
        Input(shape=(X_train.shape[1],)),  #Input layer num of input features based on the training data
        Dense(256, activation='relu'),  # A fully connected hidden layer w/ 256 neurons
        Dense(128, activation='relu'),  # A fully connected hidden layer w/ 128 neurons
        Dense(64, activation='relu'),  #A third fully connected hidden layer with 64 neurons
        Dense(1)  #The fully connected output layer with 1 neuron for the regression task
    ])
        
    
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=[MeanSquaredError(), MeanAbsoluteError()])
    model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

    # Predict performance scores
    predictions = model.predict(X_test)
    
    # Calculate the 55th percentile of the predicted scores
    threshold = np.percentile(predictions, 55)

    # Add the threshold value as a repeating value in the DataFrame
    aggregated_data['Threshold Value (A %ile of Predicted)'] = threshold
    
    # Label players based on the actual performance score compared to the threshold
    aggregated_data['Predicted Class Label (Via Threshold)'] = aggregated_data[score_label].apply(lambda x: 'Optimal Player' if x >= threshold else 'Not Optimal Player')

    # Sort and select the top players based on their actual performance scores
    top_indices = aggregated_data[score_label].argsort()[-7:][::-1]
    top_7_kickReturn_players = aggregated_data.iloc[top_indices]

    # Sort and print the top 7 players
    top_7_kickReturn_players = top_7_kickReturn_players.sort_values(by=score_label, ascending=False)    
    return top_7_kickReturn_players[['playerId', 'nameFull', 'position', score_label, 'Threshold Value (A %ile of Predicted)', 'Predicted Class Label (Via Threshold)']]
  
# Output active and passive dataframes
#process_kickReturns_data(kickReturns_active, players_active, 'Active_KickReturns_Performance_Score')
#process_kickReturns_data(kickReturns_retired, players_retired, 'Retired_KickReturns_Performance_Score')


In [54]:
# Define the global variable and store the data in a DataFrame
global optimal_kickReturns_active
optimal_kickReturns_active = pd.DataFrame(process_kickReturns_data(kickReturns_active, players_active, 'Active_KickReturns_Performance_Score'))
optimal_kickReturns_active

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - loss: 0.0846 - mean_absolute_error: 0.2634 - mean_squared_error: 0.0846 - val_loss: 0.0255 - val_mean_absolute_error: 0.1340 - val_mean_squared_error: 0.0255
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 113ms/step - loss: 0.0357 - mean_absolute_error: 0.1623 - mean_squared_error: 0.0357 - val_loss: 0.0119 - val_mean_absolute_error: 0.0853 - val_mean_squared_error: 0.0119
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/step - loss: 0.0167 - mean_absolute_error: 0.1003 - mean_squared_error: 0.0167 - val_loss: 0.0120 - val_mean_absolute_error: 0.0991 - val_mean_squared_error: 0.0120
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 103ms/step - loss: 0.0140 - mean_absolute_error: 0.0928 - mean_squared_error: 0.0140 - val_loss: 0.0120 - val_mean_absolute_error: 0.1011 - val_mean_squared_error: 0.0120
Epoch 5/10
[1m1/1[0m [32

Unnamed: 0,playerId,nameFull,position,Active_KickReturns_Performance_Score,Threshold Value (A %ile of Predicted),Predicted Class Label (Via Threshold)
13,20140222,Storm Johnson,RB,0.721216,0.033517,Optimal Player
34,20170067,Alvin Kamara,RB,0.710837,0.033517,Optimal Player
36,20170121,Joe Williams,RB,0.687402,0.033517,Optimal Player
49,20190927,DeAndre Thompkins,WR,0.664941,0.033517,Optimal Player
43,20180540,Vyncint Smith,WR,0.654857,0.033517,Optimal Player
3,20100087,Eric Decker,WR,0.603225,0.033517,Optimal Player
37,20170128,Josh Malone,WR,0.581323,0.033517,Optimal Player


In [55]:
# Define the global variable and store the data in a DataFrame
global optimal_kickReturns_retired
optimal_kickReturns_retired = pd.DataFrame(process_kickReturns_data(kickReturns_retired, players_retired, 'Retired_KickReturns_Performance_Score'))
optimal_kickReturns_retired

Epoch 1/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 335ms/step - loss: 0.1222 - mean_absolute_error: 0.2988 - mean_squared_error: 0.1222 - val_loss: 0.0488 - val_mean_absolute_error: 0.1750 - val_mean_squared_error: 0.0488
Epoch 2/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step - loss: 0.0486 - mean_absolute_error: 0.1811 - mean_squared_error: 0.0486 - val_loss: 0.0300 - val_mean_absolute_error: 0.1445 - val_mean_squared_error: 0.0300
Epoch 3/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - loss: 0.0278 - mean_absolute_error: 0.1407 - mean_squared_error: 0.0278 - val_loss: 0.0245 - val_mean_absolute_error: 0.1310 - val_mean_squared_error: 0.0245
Epoch 4/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step - loss: 0.0229 - mean_absolute_error: 0.1240 - mean_squared_error: 0.0229 - val_loss: 0.0225 - val_mean_absolute_error: 0.1169 - val_mean_squared_error: 0.0225
Epoch 5/10
[1m2/2[0m [32

Unnamed: 0,playerId,nameFull,position,Retired_KickReturns_Performance_Score,Threshold Value (A %ile of Predicted),Predicted Class Label (Via Threshold)
39,20050412,Ryan Grant,RB,0.628809,0.201175,Optimal Player
48,20060664,Damien Rhodes,RB,0.580225,0.201175,Optimal Player
42,20050589,Tyson Thompson,RB,0.574179,0.201175,Optimal Player
29,20030551,Avon Cobourne,RB,0.555892,0.201175,Optimal Player
30,20030677,ReShard Lee,RB,0.524437,0.201175,Optimal Player
23,20010162,Jonathan Carter,WR,0.514478,0.201175,Optimal Player
45,20060190,Jeff Webb,WR,0.511621,0.201175,Optimal Player


In [56]:
# Tackles - Defense - Optimal Team Selection 'RB', 'DE', 'OT', 'DT', 'DB', 'LB', 'OG', 'QB', 'TE', 'C', 'WR', 'P', 'FB', 'K', 'OL', 'LS', 'OLB', 'KR', 'S', 'DL', 'PK'
# P (Punter), K (Kicker), WR (Wide Receiver), DB (Defensive Back), RB (Running Back), DE (Defensive End), OT (Offensive Tackle), DT (Defensive Tackle), LB (Linebacker), 
# OG (Offensive Guard), QB (Quarterback), TE (Tight End), C (Center), FB (Fullback), OL (Offensive Lineman), LS (Long Snapper), OLB (Outside Linebacker), KR (Kick Returner), 
# S (Safety), DL (Defensive Lineman), PK (Placekicker)

# Set random seeds for reproducibility
def set_seeds(seed=42):
    np.random.seed(seed)
    random.seed(seed)
    tf.random.set_seed(seed)
    
def process_tackles_data(tackles_data, players_data, score_label):
    # Merge to include full names and exclude specific positions
    tackles_data = pd.merge(tackles_data, players_data[['playerId', 'nameFull', 'position']], on='playerId', how='left')
    tackles_data = tackles_data[~tackles_data['position'].isin(['TE', 'FB', 'QB', 'P', 'RB', 'K', 'WR', 'OG', 'OT', 'C'])]


     # Calculate the performance score for each play
    tackles_data[score_label] = (
        tackles_data['tackleYdsScrim'] * 0.30 +  # Yardage gained or lost at the line of scrimmage
        tackles_data['tackleType_assist'] * 0.20 +  # Assisted tackles
        tackles_data['tackleType_for a loss'] * 0.30 +  # Tackles for a loss
        tackles_data['tackleType_solo'] * 0.20  # Solo tackles
    )

    
    # Aggregate data by playerId to capture overall performance characteristics
    features = tackles_data.columns.difference(['playerId', 'nameFull', 'position', 'playId', score_label])
    aggregated_data = tackles_data.groupby('playerId').agg({**{feat: 'mean' for feat in features},
                                                           'nameFull': 'first',  
                                                           'position': 'first',
                                                           'playId': 'first',
                                                           score_label: 'mean'}).reset_index()  

   
    # Define target and features for the model
    y = aggregated_data[score_label]  # Target score_labels
    X = aggregated_data[features]  # All features after aggregation

    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    
    # Build and train the model, 
    model = Sequential([
        Input(shape=(X_train.shape[1],)),  #Input layer num of input features based on the training data
        Dense(256, activation='relu'),  # A fully connected hidden layer w/ 256 neurons
        Dense(128, activation='relu'),  # A fully connected hidden layer w/ 128 neurons
        Dense(64, activation='relu'),  #A third fully connected hidden layer with 64 neurons
        Dense(1)  #The fully connected output layer with 1 neuron for the regression task
    ])
        
    
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=[MeanSquaredError(), MeanAbsoluteError()])
    model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

    # Predict performance scores
    predictions = model.predict(X_test)
    
    # Calculate the 40th percentile of the predicted scores
    threshold = np.percentile(predictions, 40)

    # Add the threshold value as a repeating value in the DataFrame
    aggregated_data['Threshold Value (A %ile of Predicted)'] = threshold
    
    # Label players based on the actual performance score compared to the threshold
    aggregated_data['Predicted Class Label (Via Threshold)'] = aggregated_data[score_label].apply(lambda x: 'Optimal Player' if x >= threshold else 'Not Optimal Player')

    # Sort and select the top players based on their actual performance scores
    top_indices = aggregated_data[score_label].argsort()[-4:][::-1]
    top_4_tackles_players = aggregated_data.iloc[top_indices]

    # Sort and print the top 4 players
    top_4_tackles_players = top_4_tackles_players.sort_values(by=score_label, ascending=False)    
    return top_4_tackles_players[['playerId', 'nameFull', 'position', score_label, 'Threshold Value (A %ile of Predicted)', 'Predicted Class Label (Via Threshold)']]
    
# Output active and passive dataframes
#process_tackles_data(tackles_active, players_active, 'Active_Tackles_Performance_Score')
#process_tackles_data(tackles_retired, players_retired, 'Retired_Tackles_Performance_Score')


In [57]:
# Define the global variable and store the data in a DataFrame
global optimal_tackles_active
optimal_tackles_active = pd.DataFrame(process_tackles_data(tackles_active, players_active, 'Active_Tackles_Performance_Score'))
optimal_tackles_active

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - loss: 0.0634 - mean_absolute_error: 0.1797 - mean_squared_error: 0.0634 - val_loss: nan - val_mean_absolute_error: nan - val_mean_squared_error: nan
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 362ms/step - loss: 0.0429 - mean_absolute_error: 0.1419 - mean_squared_error: 0.0429 - val_loss: nan - val_mean_absolute_error: nan - val_mean_squared_error: nan
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 146ms/step - loss: 0.0292 - mean_absolute_error: 0.1138 - mean_squared_error: 0.0292 - val_loss: nan - val_mean_absolute_error: nan - val_mean_squared_error: nan
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 136ms/step - loss: 0.0188 - mean_absolute_error: 0.0899 - mean_squared_error: 0.0188 - val_loss: nan - val_mean_absolute_error: nan - val_mean_squared_error: nan
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

  top_indices = aggregated_data[score_label].argsort()[-4:][::-1]


Unnamed: 0,playerId,nameFull,position,Active_Tackles_Performance_Score,Threshold Value (A %ile of Predicted),Predicted Class Label (Via Threshold)
5,20130093,Will Davis,DB,0.445969,0.167492,Optimal Player
1,20110005,Patrick Peterson,DB,0.435107,0.167492,Optimal Player
29,20170770,Terrish Webb,DB,0.422913,0.167492,Optimal Player
42,20180894,Rico Gafford,DB,,0.167492,Not Optimal Player


In [58]:
# Define the global variable and store the data in a DataFrame
global optimal_tackles_retired
optimal_tackles_retired = pd.DataFrame(process_tackles_data(tackles_retired, players_retired, 'Retired_Tackles_Performance_Score'))
optimal_tackles_retired

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - loss: 0.0482 - mean_absolute_error: 0.1632 - mean_squared_error: 0.0482 - val_loss: 0.0402 - val_mean_absolute_error: 0.1443 - val_mean_squared_error: 0.0402
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step - loss: 0.0291 - mean_absolute_error: 0.1318 - mean_squared_error: 0.0291 - val_loss: 0.0264 - val_mean_absolute_error: 0.1166 - val_mean_squared_error: 0.0264
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 156ms/step - loss: 0.0174 - mean_absolute_error: 0.1060 - mean_squared_error: 0.0174 - val_loss: 0.0175 - val_mean_absolute_error: 0.0943 - val_mean_squared_error: 0.0175
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/step - loss: 0.0097 - mean_absolute_error: 0.0807 - mean_squared_error: 0.0097 - val_loss: 0.0115 - val_mean_absolute_error: 0.0753 - val_mean_squared_error: 0.0115
Epoch 5/10
[1m1/1[0m [32

Unnamed: 0,playerId,nameFull,position,Retired_Tackles_Performance_Score,Threshold Value (A %ile of Predicted),Predicted Class Label (Via Threshold)
38,20050678,Russell Rabe,LB,0.567317,0.103103,Optimal Player
41,20050799,Antwaun Rogers,DB,0.534,0.103103,Optimal Player
3,19980010,Duane Starks,DB,0.411741,0.103103,Optimal Player
24,20040223,Jacques Reeves,DB,0.383523,0.103103,Optimal Player


In [59]:
# Sacks & Safeties - Defense - Optimal Team Selection 'RB', 'DE', 'OT', 'DT', 'DB', 'LB', 'OG', 'QB', 'TE', 'C', 'WR', 'P', 'FB', 'K', 'OL', 'LS', 'OLB', 'KR', 'S', 'DL', 'PK'
# P (Punter), K (Kicker), WR (Wide Receiver), DB (Defensive Back), RB (Running Back), DE (Defensive End), OT (Offensive Tackle), DT (Defensive Tackle), LB (Linebacker), 
# OG (Offensive Guard), QB (Quarterback), TE (Tight End), C (Center), FB (Fullback), OL (Offensive Lineman), LS (Long Snapper), OLB (Outside Linebacker), KR (Kick Returner), 
# S (Safety), DL (Defensive Lineman), PK (Placekicker)

# Set random seeds for reproducibility
def set_seeds(seed=42):
    np.random.seed(seed)
    random.seed(seed)
    tf.random.set_seed(seed)
    
def process_sacks_safeties_data(sacks_safeties_data, players_data, players_pool, score_label):
    # Merge to include full names, exclude specific positions, and add 'safety' column
    sacks_safeties_data = pd.merge(sacks_safeties_data, players_data[['playerId', 'nameFull', 'position']], on='playerId', how='left')
    sacks_safeties_data = pd.merge(sacks_safeties_data, players_pool[['playerId', 'safety']], on='playerId', how='left')    
    sacks_safeties_data = sacks_safeties_data[~sacks_safeties_data['position'].isin([   ])]


     # Calculate the performance score for each play
    sacks_safeties_data[score_label] = (
        sacks_safeties_data['sackYards'] * 0.30 +  # Yardage lost due to sacks
        sacks_safeties_data['sackType_0.5'] * 0.20 +  # Half sacks
        sacks_safeties_data['sackType_1.0'] * 0.50 +  # Full sacks
        sacks_safeties_data['safety'] * 0.20  # Significant positive defensive play
    )

    
    # Aggregate data by playerId to capture overall performance characteristics
    features = sacks_safeties_data.columns.difference(['playerId', 'nameFull', 'position', 'playId', score_label])
    aggregated_data = sacks_safeties_data.groupby('playerId').agg({**{feat: 'mean' for feat in features},
                                                           'nameFull': 'first',  
                                                           'position': 'first',
                                                           'playId': 'first',
                                                           score_label: 'mean'}).reset_index()  

   
    # Define target and features for the model
    y = aggregated_data[score_label]  # Target score_labels
    X = aggregated_data[features]  # All features after aggregation

    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    
    # Build and train the model, 
    model = Sequential([
        Input(shape=(X_train.shape[1],)),  #Input layer num of input features based on the training data
        Dense(256, activation='relu'),  # A fully connected hidden layer w/ 256 neurons
        Dense(128, activation='relu'),  # A fully connected hidden layer w/ 128 neurons
        Dense(64, activation='relu'),  #A third fully connected hidden layer with 64 neurons
        Dense(1)  #The fully connected output layer with 1 neuron for the regression task
    ])
        
    
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=[MeanSquaredError(), MeanAbsoluteError()])
    model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

    # Predict performance scores
    predictions = model.predict(X_test)
    
    # Calculate the 65th percentile of the predicted scores
    threshold = np.percentile(predictions, 65)

    # Add the threshold value as a repeating value in the DataFrame
    aggregated_data['Threshold Value (A %ile of Predicted)'] = threshold
    
    # Label players based on the actual performance score compared to the threshold
    aggregated_data['Predicted Class Label (Via Threshold)'] = aggregated_data[score_label].apply(lambda x: 'Optimal Player' if x >= threshold else 'Not Optimal Player')

    # Sort and select the top players based on their actual performance scores
    top_indices = aggregated_data[score_label].argsort()[-4:][::-1]
    top_4_sacks_safeties_players = aggregated_data.iloc[top_indices]

    # Sort and print the top 4 players
    top_4_sacks_safeties_players = top_4_sacks_safeties_players.sort_values(by=score_label, ascending=False)
    return top_4_sacks_safeties_players[['playerId', 'nameFull', 'position', score_label, 'Threshold Value (A %ile of Predicted)', 'Predicted Class Label (Via Threshold)']]
    
# Output active and passive dataframes
#process_sacks_safeties_data(sacks_active, players_active, Active_Players_Pool, 'Active_Sacks_Safeties_Performance_Score')
#process_sacks_safeties_data(sacks_retired, players_retired, Retired_Players_Pool, 'Retired_Sacks_Safeties_Performance_Score')


In [60]:
# Define the global variable and store the data in a DataFrame
global optimal_sacks_safeties_active
optimal_sacks_safeties_active = pd.DataFrame(process_sacks_safeties_data(sacks_active, players_active, Active_Players_Pool, 'Active_Sacks_Safeties_Performance_Score'))
optimal_sacks_safeties_active

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - loss: 0.3200 - mean_absolute_error: 0.4782 - mean_squared_error: 0.3200 - val_loss: 0.2505 - val_mean_absolute_error: 0.3440 - val_mean_squared_error: 0.2505
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 335ms/step - loss: 0.2460 - mean_absolute_error: 0.4102 - mean_squared_error: 0.2460 - val_loss: 0.2019 - val_mean_absolute_error: 0.3096 - val_mean_squared_error: 0.2019
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 108ms/step - loss: 0.1932 - mean_absolute_error: 0.3573 - mean_squared_error: 0.1932 - val_loss: 0.1677 - val_mean_absolute_error: 0.2811 - val_mean_squared_error: 0.1677
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step - loss: 0.1530 - mean_absolute_error: 0.3097 - mean_squared_error: 0.1530 - val_loss: 0.1341 - val_mean_absolute_error: 0.2643 - val_mean_squared_error: 0.1341
Epoch 5/10
[1m1/1[0m [32m

Unnamed: 0,playerId,nameFull,position,Active_Sacks_Safeties_Performance_Score,Threshold Value (A %ile of Predicted),Predicted Class Label (Via Threshold)
32,20190565,Terrell Bonds,DB,1.060059,0.50537,Optimal Player
27,20180434,Jacob Tuioti-Mariner,DT,1.060059,0.50537,Optimal Player
16,20170248,Keion Adams,OLB,1.060059,0.50537,Optimal Player
9,20140219,T.J. Carrie,DB,0.791504,0.50537,Optimal Player


In [61]:
# Define the global variable and store the data in a DataFrame
global optimal_sacks_safeties_retired
optimal_sacks_safeties_retired = pd.DataFrame(process_sacks_safeties_data(sacks_retired, players_retired, Retired_Players_Pool, 'Retired_Sacks_Safeties_Performance_Score'))
optimal_sacks_safeties_retired

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - loss: 0.2982 - mean_absolute_error: 0.4623 - mean_squared_error: 0.2982 - val_loss: 0.2040 - val_mean_absolute_error: 0.4254 - val_mean_squared_error: 0.2040
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 356ms/step - loss: 0.2024 - mean_absolute_error: 0.3738 - mean_squared_error: 0.2024 - val_loss: 0.1418 - val_mean_absolute_error: 0.3602 - val_mean_squared_error: 0.1418
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 165ms/step - loss: 0.1339 - mean_absolute_error: 0.2963 - mean_squared_error: 0.1339 - val_loss: 0.0920 - val_mean_absolute_error: 0.2925 - val_mean_squared_error: 0.0920
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 134ms/step - loss: 0.0834 - mean_absolute_error: 0.2350 - mean_squared_error: 0.0834 - val_loss: 0.0524 - val_mean_absolute_error: 0.2199 - val_mean_squared_error: 0.0524
Epoch 5/10
[1m1/1[0m [32

Unnamed: 0,playerId,nameFull,position,Retired_Sacks_Safeties_Performance_Score,Threshold Value (A %ile of Predicted),Predicted Class Label (Via Threshold)
31,20050727,Charles Howard,DL,1.06543,0.683783,Optimal Player
3,19980010,Duane Starks,DB,1.06543,0.683783,Optimal Player
1,19960252,Keith McKenzie,DE,0.979004,0.683783,Optimal Player
12,20010114,Cedric Scott,DE,0.893311,0.683783,Optimal Player


In [62]:
# Fumbles - Defense - Optimal Team Selection 'RB', 'DE', 'OT', 'DT', 'DB', 'LB', 'OG', 'QB', 'TE', 'C', 'WR', 'P', 'FB', 'K', 'OL', 'LS', 'OLB', 'KR', 'S', 'DL', 'PK'
# P (Punter), K (Kicker), WR (Wide Receiver), DB (Defensive Back), RB (Running Back), DE (Defensive End), OT (Offensive Tackle), DT (Defensive Tackle), LB (Linebacker), 
# OG (Offensive Guard), QB (Quarterback), TE (Tight End), C (Center), FB (Fullback), OL (Offensive Lineman), LS (Long Snapper), OLB (Outside Linebacker), KR (Kick Returner), 
# S (Safety), DL (Defensive Lineman), PK (Placekicker)

# Set random seeds for reproducibility
def set_seeds(seed=42):
    np.random.seed(seed)
    random.seed(seed)
    tf.random.set_seed(seed)
    
def process_fumblForced_data(fumblForced_data, players_data, score_label):
    # Merge to include full names and exclude specific positions
    fumblForced_data = pd.merge(fumblForced_data, players_data[['playerId', 'nameFull', 'position']], on='playerId', how='left')
    fumblForced_data = fumblForced_data[~fumblForced_data['position'].isin(['WR', 'QB', 'P', 'TE', 'RB', 'OG', 'FB', 'K', 'C'])]


     # Calculate the performance score for each play
    fumblForced_data[score_label] = (
        fumblForced_data['fumForcedTurnover'] * 1.0   # Fumble turnovers are highly positive for a defensive player        
    )

    
    # Aggregate data by playerId to capture overall performance characteristics
    features = fumblForced_data.columns.difference(['playerId', 'nameFull', 'position', 'playId', score_label])
    aggregated_data = fumblForced_data.groupby('playerId').agg({**{feat: 'mean' for feat in features},
                                                           'nameFull': 'first',  
                                                           'position': 'first',
                                                           'playId': 'first',
                                                           score_label: 'mean'}).reset_index()  

   
    # Define target and features for the model
    y = aggregated_data[score_label]  # Target score_labels
    X = aggregated_data[features]  # All features after aggregation

    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    
    # Build and train the model, 
    model = Sequential([
        Input(shape=(X_train.shape[1],)),  #Input layer num of input features based on the training data
        Dense(256, activation='relu'),  # A fully connected hidden layer w/ 256 neurons
        Dense(128, activation='relu'),  # A fully connected hidden layer w/ 128 neurons
        Dense(64, activation='relu'),  #A third fully connected hidden layer with 64 neurons
        Dense(1)  #The fully connected output layer with 1 neuron for the regression task
    ])
        
    
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=[MeanSquaredError(), MeanAbsoluteError()])
    model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

    # Predict performance scores
    predictions = model.predict(X_test)
    
    # Calculate the 80th percentile of the predicted scores
    threshold = np.percentile(predictions, 80)

    # Add the threshold value as a repeating value in the DataFrame
    aggregated_data['Threshold Value (A %ile of Predicted)'] = threshold
    
    # Label players based on the actual performance score compared to the threshold
    aggregated_data['Predicted Class Label (Via Threshold)'] = aggregated_data[score_label].apply(lambda x: 'Optimal Player' if x >= threshold else 'Not Optimal Player')

    # Sort and select the top players based on their actual performance scores
    top_indices = aggregated_data[score_label].argsort()[-3:][::-1]
    top_3_fumblForced_players = aggregated_data.iloc[top_indices]

    # Sort and print the top 3 players
    top_3_fumblForced_players = top_3_fumblForced_players.sort_values(by=score_label, ascending=False)
    return top_3_fumblForced_players[['playerId', 'nameFull', 'position', score_label, 'Threshold Value (A %ile of Predicted)', 'Predicted Class Label (Via Threshold)']]

# Output active and passive dataframes
#process_fumblForced_data(fumblForced_active, players_active, 'Active_FumblForced_Performance_Score')
#process_fumblForced_data(fumblForced_retired, players_retired, 'Retired_FumblForced_Performance_Score')


In [63]:
# Define the global variable and store the data in a DataFrame
global optimal_fumblForced_active
optimal_fumblForced_active = pd.DataFrame(process_fumblForced_data(fumblForced_active, players_active, 'Active_FumblForced_Performance_Score'))
optimal_fumblForced_active

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - loss: 0.5580 - mean_absolute_error: 0.6393 - mean_squared_error: 0.5580 - val_loss: 0.3699 - val_mean_absolute_error: 0.4282 - val_mean_squared_error: 0.3699
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step - loss: 0.4875 - mean_absolute_error: 0.5976 - mean_squared_error: 0.4875 - val_loss: 0.3270 - val_mean_absolute_error: 0.4049 - val_mean_squared_error: 0.3270
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 141ms/step - loss: 0.4304 - mean_absolute_error: 0.5617 - mean_squared_error: 0.4304 - val_loss: 0.2893 - val_mean_absolute_error: 0.3835 - val_mean_squared_error: 0.2893
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 139ms/step - loss: 0.3802 - mean_absolute_error: 0.5283 - mean_squared_error: 0.3802 - val_loss: 0.2561 - val_mean_absolute_error: 0.3641 - val_mean_squared_error: 0.2561
Epoch 5/10
[1m1/1[0m [32

Unnamed: 0,playerId,nameFull,position,Active_FumblForced_Performance_Score,Threshold Value (A %ile of Predicted),Predicted Class Label (Via Threshold)
24,20180860,Matthew Thomas,LB,1.0,0.483187,Optimal Player
8,20140620,Anthony Johnson,DT,1.0,0.483187,Optimal Player
21,20180435,Jon Cunningham,DT,1.0,0.483187,Optimal Player


In [64]:
# Define the global variable and store the data in a DataFrame
global optimal_fumblForced_retired
optimal_fumblForced_retired = pd.DataFrame(process_fumblForced_data(fumblForced_retired, players_retired, 'Retired_FumblForced_Performance_Score'))
optimal_fumblForced_retired

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - loss: 0.4575 - mean_absolute_error: 0.5039 - mean_squared_error: 0.4575 - val_loss: 0.2817 - val_mean_absolute_error: 0.3632 - val_mean_squared_error: 0.2817
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step - loss: 0.3961 - mean_absolute_error: 0.4700 - mean_squared_error: 0.3961 - val_loss: 0.2454 - val_mean_absolute_error: 0.3413 - val_mean_squared_error: 0.2454
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 138ms/step - loss: 0.3457 - mean_absolute_error: 0.4407 - mean_squared_error: 0.3457 - val_loss: 0.2185 - val_mean_absolute_error: 0.3247 - val_mean_squared_error: 0.2185
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 108ms/step - loss: 0.3083 - mean_absolute_error: 0.4179 - mean_squared_error: 0.3083 - val_loss: 0.1965 - val_mean_absolute_error: 0.3109 - val_mean_squared_error: 0.1965
Epoch 5/10
[1m1/1[0m [32

Unnamed: 0,playerId,nameFull,position,Retired_FumblForced_Performance_Score,Threshold Value (A %ile of Predicted),Predicted Class Label (Via Threshold)
19,20040809,Jack Hunt,S,1.0,0.430834,Optimal Player
15,20040184,Deandre Eiland,S,1.0,0.430834,Optimal Player
2,19980010,Duane Starks,DB,1.0,0.430834,Optimal Player


In [65]:
# Define the category based on the source dataframe
def assign_category(df, category):
    df['Optimal Team Category'] = category
    # Reorder the columns to ensure 'Category' is the first column
    cols = ['Optimal Team Category'] + [col for col in df.columns if col != 'Optimal Team Category']
    return df[cols]

# Combine dataframes for active players with assigned categories
optimal_active_players_team = pd.concat([
    assign_category(optimal_passers_active.rename(columns={'Active_Passer_Performance_Score': 'Performance Score'}), 'Starting Offense - Passing'),
    assign_category(optimal_rushers_active.rename(columns={'Active_Rusher_Performance_Score': 'Performance Score'}), 'Starting Offense - Rushing'),
    assign_category(optimal_receivers_active.rename(columns={'Active_Receiver_Performance_Score': 'Performance Score'}), 'Starting Offense - Receiving'),
    assign_category(optimal_punting_active.rename(columns={'Active_Punting_Performance_Score': 'Performance Score'}), 'Starting Special Teams - Punting'),
    assign_category(optimal_kickReturns_active.rename(columns={'Active_KickReturns_Performance_Score': 'Performance Score'}), 'Starting Special Teams - Returns'),
    assign_category(optimal_tackles_active.rename(columns={'Active_Tackles_Performance_Score': 'Performance Score'}), 'Starting Defense - Tackles'),
    assign_category(optimal_sacks_safeties_active.rename(columns={'Active_Sacks_Safeties_Performance_Score': 'Performance Score'}), 'Starting Defense - Sacks/Safeties'),
    assign_category(optimal_fumblForced_active.rename(columns={'Active_FumblForced_Performance_Score': 'Performance Score'}), 'Starting Defense - Fumbles')
], ignore_index=True)

# Combine dataframes for retired players with assigned categories
optimal_retired_players_team = pd.concat([
    assign_category(optimal_passers_retired.rename(columns={'Retired_Passer_Performance_Score': 'Performance Score'}), 'Starting Offense - Passing'),
    assign_category(optimal_rushers_retired.rename(columns={'Retired_Rusher_Performance_Score': 'Performance Score'}), 'Starting Offense - Rushing'),
    assign_category(optimal_receivers_retired.rename(columns={'Retired_Receiver_Performance_Score': 'Performance Score'}), 'Starting Offense - Receiving'),
    assign_category(optimal_punting_retired.rename(columns={'Retired_Punting_Performance_Score': 'Performance Score'}), 'Starting Special Teams - Punting'),
    assign_category(optimal_kickReturns_retired.rename(columns={'Retired_KickReturns_Performance_Score': 'Performance Score'}), 'Starting Special Teams - Returns'),
    assign_category(optimal_tackles_retired.rename(columns={'Retired_Tackles_Performance_Score': 'Performance Score'}), 'Starting Defense - Tackles'),
    assign_category(optimal_sacks_safeties_retired.rename(columns={'Retired_Sacks_Safeties_Performance_Score': 'Performance Score'}), 'Starting Defense - Sacks/Safeties'),
    assign_category(optimal_fumblForced_retired.rename(columns={'Retired_FumblForced_Performance_Score': 'Performance Score'}), 'Starting Defense - Fumbles')
], ignore_index=True)

# Display the final optimal team dataframe
#optimal_active_players_team
#optimal_retired_players_team


In [66]:
# Display the final optimal active player team dataframe

optimal_active_players_team


Unnamed: 0,Optimal Team Category,playerId,nameFull,position,Performance Score,Threshold Value (A %ile of Predicted),Predicted Class Label (Via Threshold)
0,Starting Offense - Passing,20150001,Jameis Winston,QB,0.349675,0.301055,Optimal Player
1,Starting Offense - Passing,20160135,Dak Prescott,QB,0.346751,0.301055,Optimal Player
2,Starting Offense - Passing,20040004,Philip Rivers,QB,0.339104,0.301055,Optimal Player
3,Starting Offense - Rushing,20150905,Raheem Mostert,RB,0.507599,0.39317,Optimal Player
4,Starting Offense - Rushing,20080073,Jamaal Charles,RB,0.477637,0.39317,Optimal Player
5,Starting Offense - Rushing,20170067,Alvin Kamara,RB,0.474016,0.39317,Optimal Player
6,Starting Offense - Rushing,20170121,Joe Williams,RB,0.455529,0.39317,Optimal Player
7,Starting Offense - Receiving,20120760,Sean McGrath,TE,0.38897,0.214641,Optimal Player
8,Starting Offense - Receiving,20170582,Chris Thompson,WR,0.261886,0.214641,Optimal Player
9,Starting Offense - Receiving,20121011,Eric Wallace,TE,0.25813,0.214641,Optimal Player


In [67]:
# Display the final optimal retired player team dataframe

optimal_retired_players_team


Unnamed: 0,Optimal Team Category,playerId,nameFull,position,Performance Score,Threshold Value (A %ile of Predicted),Predicted Class Label (Via Threshold)
0,Starting Offense - Passing,20050488,Brian Wrobel,QB,0.49082,0.270345,Optimal Player
1,Starting Offense - Passing,20060452,Brett Basanez,QB,0.35157,0.270345,Optimal Player
2,Starting Offense - Passing,20030110,Seneca Wallace,QB,0.330737,0.270345,Optimal Player
3,Starting Offense - Rushing,19910154,Fred McAfee,RB,0.762634,0.496001,Optimal Player
4,Starting Offense - Rushing,20000166,Chad Morton,RB,0.513293,0.496001,Optimal Player
5,Starting Offense - Rushing,20070416,Selvin Young,RB,0.458506,0.496001,Not Optimal Player
6,Starting Offense - Rushing,20050412,Ryan Grant,RB,0.453792,0.496001,Not Optimal Player
7,Starting Offense - Receiving,20030409,Terrence Edwards,WR,0.810742,0.250052,Optimal Player
8,Starting Offense - Receiving,20060629,Rick Gatewood,WR,0.510612,0.250052,Optimal Player
9,Starting Offense - Receiving,19940181,Bill Schroeder,WR,0.376343,0.250052,Optimal Player


In [72]:
Interpretations ='''

Problem Statement
The objective of this assignment is to develop a method for selecting an optimal football team from a pool of 200 active players and 200 
retired players using a deep artificial neural network (MLP). The task involves analyzing player performance across various game 
phases: offense, special teams, and defense. The step-by-step process of completing this assignment is outlined below.


1.	Select from a pool of 200 "Active Players" and 200 "Retired Players." 
Player data for both active and retired players was loaded from the datasets. This data includes player IDs, full names, positions, and 
various performance metrics relevant to their positions and roles. The player data was merged with performance metrics data for different 
categories (e.g., passing, rushing, receiving, punting, tackles, sacks, safeties, and fumbles) to create comprehensive datasets that link 
player information with their respective performance metrics. The oldest player to play a game in NFL history was George Blanda at 48 years 108 days,
so the cutoff for an active player was under 49 years old (age was calculated from their birthday to the end of the dataset) and must have played 
within the last 3 seasons of when the database ended. To build the active and retired player pools players, a set number of players were selected 
from each dataset as follows:

Offense: passer (22), rusher (23), receiver (22) 
Special Teams: kicks (33), kickReturns (33)
Defense: tackles (22), sacks (23), fumbForced (22)


2. Define Optimal Team Based on Player Characteristics
The optimal team was defined based on specific characteristics:

Starting Offense (11 players): Passing, rushing, and receiving
Starting Special Teams (11 players): Punting, punt returns, and kick returns.
Starting Defense (11 players): Tackles, sacks, safeties, and fumbles.

Certain positions that are not relevant to a specific category of performance metrics were excluded. For example, in the receiving category, 
positions like fullbacks (FB), quarterbacks (QB), offensive guards (OG), running backs (RB), and defensive backs (DB) were excluded. Similar 
exclusions were done for the other categories based on relevance. For each player, a performance score was calculated based on the weighted 
contributions of various performance metrics. The weights were chosen to reflect the importance of each metric within the specific performance 
category. 


3. Identify the Optimal Team from Each Pool
The performance data was aggregated by player ID to capture overall performance characteristics. This aggregation involved 
calculating mean values of performance metrics for each player. The aggregated data was split into training and testing sets to train the MLP model 
and evaluate its performance. An MLP model was trained using the training data. The model consisted of an input layer, three hidden layers, and an 
output layer. The training process involved forward propagation, backpropagation, and weight updates over multiple epochs. After training, the 
model's predictions were used to calculate a threshold value (based on a specific percentile of the predicted scores) to classify players as 
"Optimal Player" or "Not Optimal Player". The top players in each category were sorted and selected based on their actual performance scores. 
These top players based on their actual performance scores formed the final optimal active and retired players and teams for each performance 
metric category passing, rushing, etc.


4. Examine the Multilayer Neural Network MLP Architecture of an Artificial Neural Network 
The image depicts a typical MLP architecture with an input layer, hidden layers with neurons and weights and an output layer with neurons
and weights. In this architecture:

The input layer receives the input features (player characteristics).
The hidden layers process the inputs using weighted connections and activation functions.
The output layer generates the final predictions.


5. Build a Deep Artificial Neural Network MLP
The code builds 8 MLP's for 8 datasets corresponding to the components of offense, special teams and defense with:

Input Layer: Defined by the number of features used for each dataset.
Hidden Layers: Three fully connected hidden layers with 256, 128, and 64 neurons, respectively.
Output Layer: A single neuron for the regression task.
The architecture uses ReLU activation functions in the hidden layers and mean squared error as the loss function.

6. Explain the Architecture and Player Characteristics Used as Inputs
The code builds 8 MLP's for 8 datasets corresponding to the components of offense, special teams and defense with:

Input Layer: Takes the performance metrics of players as inputs (e.g., yards, touchdowns, tackles).
Hidden Layers: Three layers with decreasing neurons to capture complex patterns and interactions between input features.
Output Layer: A single neuron providing the performance score prediction.

Player performance features and characteristics used among the 8 datasets:

Passing: Completions (passComp), Passing yards (passYards), Touchdowns (passTd), Interceptions (passInt)
Rushing: Rushing yards (rushYards), Touchdowns (rushTd)
Receiving: Receptions (rec), Receiving yards (recYards), Yards after catch (recYac), First downs (rec1down), Fumbles (recFumble), 
End in bounds (recEnd_in bounds), Pushed out of bounds (recEnd_pushed out of bounds), Ran out of bounds (recEnd_ran out of bounds)

Punting: Inside 20 (kickInside20), Onside (kickOnside), Own recovery (kickOwnRecovery), Kick length (kickLength), 
Return yards (kickReturnYds), Net yards (kickNetYds), Return touchdowns (kickReturnTd), Aborted kick (kickType_aborted), 
Field goal (kickType_field goal), Kickoff (kickType_kickoff), Pass (kickType_pass), Punt (kickType_punt), Extra point (kickType_xp), 
locked (kickOutcome_blocked), Downed (kickOutcome_downed), Fair catch (kickOutcome_fair catch), Good (kickOutcome_good), 
Missed (kickOutcome_missed), Onside (kickOutcome_onside), Out of bounds (kickOutcome_out of bounds), Own recovery (kickOutcome_own recovery), 
Returned (kickOutcome_returned), Touchback (kickOutcome_touchback)

KickReturns: Primary return attempts (kickRetPrimary), Return yards (kickRetYds), Return touchdowns (kickRetTd), 
Fair catch (kickRetOutcome_fair catch), Returned (kickRetOutcome_returned)

Tackles: Metrics Yards at scrimmage (tackleYdsScrim), Assist tackles (tackleType_assist), Tackles for a loss (tackleType_for a loss), 
Solo tackles (tackleType_solo)

Sacks and Safeties: Sack yards (sackYards), Half sacks (sackType_0.5), Full sacks (sackType_1.0), Safety (safety)
Fumbles: Forced turnovers (fumForcedTurnover)




Activate the MLP by performing the following steps:

1. Starting at the input layer, forward propagate the patterns of the training data through the network to generate an output.
Completed: Patterns of training data are propagated through the network.

2. Based on the network's output, calculate the error that we want to minimize using a cost function that we will describe later.
Completed: Using mean squared error.

3. Back propagate the error, find its derivative with respect to each weight in the network, and update the model.
Completed: Error is backpropagated to update the weights.

4. Repeat steps 1 through 3 for multiple epochs and learn the weights of the MLP.
Completed: Repeated for several epochs (10) to accurately learn the model weights.

5. Use forward propagation to calculate the network output and apply a threshold function to obtain the predicted class labels 
in the one-hot representation.
Completed: A threshold function was applied to the network output to obtain predicted values, which were subsequently used at selected percentiles
to select binary class labels for each player as "Optimal Player" or "Not Optimal Player". 

6. Interpret the output of your MLP in the context of selecting an optimal football team.
The MLP model predicts performance scores for players, which are then used at a select percentile threshold to classify players as "Optimal Player" 
or "Not Optimal Player". The top performers in each category sorted by actual score are selected for the optimal team  but the predicted performance
score percentile threshold and binary classification assignments help to validate and predict the selections.

Below is an example from results which these players were identified with the highest performance scores as being the best for their areas of 
passing, rushing, and receiving. As a data scientist these selections are logical, but furthermore as a fan of NFL football these selections are 
logical given that I know the players and have seen them play over the years. It's always a positive when domain knowledge can be partnered with 
the data science to add another layer, so to speak, to the data science layers coded for. 

Examples from Results:
Active Passing: Jameis Winston, Dak Prescott, and Philip Rivers are classified as optimal passers based on their performance scores.
Active Rushing: Raheem Mostert, Jamaal Charles, Alvin Kamara, and Joe Williams are classified as optimal rushers.
Active Receiving: Sean McGrath, Chris Thompson, Eric Wallace, and Jesper Horsted are classified as optimal receivers.

Several metrics were used when running the ANN-MLP and their interpretations are outlined below. 

Mean Absolute Error (MAE): Measures the average magnitude of the errors in a set of predictions, without considering their direction. It is the
average over the test sample of the absolute differences between prediction and actual observation where all individual differences have equal weight.
- Ideal Threshold: We would ideally want the MAE to be as low as possible. A lower MAE indicates that the predictions are close to the actual values. 
  Ideally, values closer to 0 are better.
- Trends: As training progresses through epochs, we want to see a decreasing trend in MAE, indicating that the model is learning and improving its 
  predictions.

Mean Squared Error (MSE): Measures the average of the squares of the errors; the average squared difference between the estimated values and the 
actual value.
 - Ideal Threshold: Similar to MAE, we aim for a low MSE. A lower MSE signifies that the average squared difference between predicted and 
   actual values are small, indicating accurate predictions. Ideally, values closer to 0 are better.
 - Trends: A decreasing trend in MSE over epochs is desirable, showing that the model's predictions are becoming more accurate.

Validation Loss (val_loss): The loss calculated on the validation dataset, which is a separate portion of the dataset not used for training. 
It provides an indication of how well the model generalizes to unseen data.
 - Ideal Threshold: We want the validation loss to be low, indicating good generalization to unseen data. Low validation loss values suggest that 
   the model is not overfitting and performs well on the validation set.
 - Trends: The validation loss should ideally decrease and stabilize at a low value. If it starts increasing after some epochs, it might 
   indicate overfitting.

Validation Mean Absolute Error (val_mean_absolute_error): The MAE calculated on the validation dataset.
 - Ideal Threshold: Similar to MAE, we want the validation MAE to be low, indicating that the model's predictions on the validation set are close 
   to the actual values.
 - Trends: A decreasing trend in validation MAE over epochs is desirable. If it decreases and stabilizes at a low value, it indicates good 
   generalization.

Validation Mean Squared Error (val_mean_squared_error): The MSE calculated on the validation dataset.
 - Ideal Threshold: We aim for a low validation MSE, which would mean the model's average squared error on the validation set is minimal.
 - Trends: A decreasing trend in validation MSE over epochs is ideal. An increase might suggest overfitting.



Below is my interpretation of the actual data for each dataset run and trained on based on the validation metrics outlined above and final selections
of the optimal team members and teams.

Passing:
Epoch 10 MAE: 0.0211 (train), 0.0156 (validation)
Epoch 10 MSE: 0.00056351 (train), 0.00035898 (validation)
Epoch 10 Validation Loss: 0.00035898
Interpretation: Both training and validation MAE/MSE values are low and close to each other, indicating the model is performing well without 
overfitting. The low validation loss further supports this. Active was detailed, but this is true for both the active and retired player selections.

Rushing:
Epoch 10 MAE: 0.0622  (train), 0.0458 (validation)
Epoch 10 MSE: 0.0070 (train), 0.0036 (validation)
Epoch 10 Validation Loss: 0.0036 
Interpretation: The training and validation MAE/MSE values are reasonably low, showing that the model is making accurate predictions. 
The low validation loss indicates good generalization. Active was detailed, but this is true for both the active and retired player selections.

Receiving:
Epoch 10 MAE: 0.0246 (train), 0.0239 (validation)
Epoch 10 MSE: 0.00079793 (train), 0.0006599 (validation)
Epoch 10 Validation Loss: 0.0006599
Interpretation: The close and low MAE/MSE values for training and validation sets indicate effective learning and good generalization. The low 
validation loss further supports this. Active was detailed, but this is true for both the active and retired player selections.

Punting:
Epoch 10 MAE: 0.0280 (train), 0.0317 (validation)
Epoch 10 MSE: 0.0013 (train), 0.0014 (validation)
Epoch 10 Validation Loss: 0.0014
Interpretation: The MAE/MSE values are low and close to each other, indicating accurate predictions and good generalization. The validation loss 
being low supports these findings. Retired was detailed, but this is true for both the active and retired player selections.

Kick Returns:
Epoch 10 MAE: 0.0669 (train), 0.0737 (validation)
Epoch 10 MSE: 0.0055 (train),0.0072 (validation)
Epoch 10 Validation Loss: 0.0072
Interpretation: While the MAE/MSE values are higher compared to other datasets, they are still reasonable. The model seems to have captured the 
complexity of kick return data reasonably well, as indicated by the decreasing validation loss. Active was detailed, but this is true for both the 
active and retired player selections.

Tackles:
Epoch 10 MAE: 0.0547 (train),  0.0683 (validation)
Epoch 10 MSE: 0.0041 (train), 0.0064 (validation)
Epoch 10 Validation Loss: 0.0064
Interpretation: The training metrics indicate good performance for the retired players, but the validation metrics are NaN (for active payers), 
suggesting an issue with the validation dataset or the validation process. Retired was detailed, but this is true for both the active and retired 
player selections.

Sacks and Safeties:
Epoch 10 MAE: 0.1580 (train), 0.1044 (validation)
Epoch 10 MSE: 0.0318 (train), 0.0127 (validation)
Epoch 10 Validation Loss: 0.0127
Interpretation: The MAE/MSE values are reasonable, and the validation metrics are lower than the training metrics, suggesting good generalization. 
The decreasing validation loss indicates the model is learning effectively. Retired was detailed, but this is true for both the active and retired 
player selections.

Fumbles:
Epoch 10 MAE: 0.3477 (train), 0.2569 (validation)
Epoch 10 MSE: 0.1608 (train), 0.1030 (validation)
Epoch 10 Validation Loss: 0.1030
Interpretation: The MAE/MSE values are higher, reflecting the challenging nature of predicting fumbles. However, the decreasing validation loss 
indicates the model is learning and generalizing reasonably well. Active was detailed, but this is true for both the active and retired player 
selections.


Overall, the MLP model effectively identified players with the best predicted performance scores for each aspect of the game, helping to 
form an optimal football team as outlined below.

Passing: Players with the highest predicted performance scores in passing metrics (completions, passing yards, touchdowns, interceptions) were 
selected as optimal quarterbacks.
Rushing: Players with high predicted performance scores in rushing metrics (rushing yards, touchdowns) were selected as optimal running backs.
Receiving: Players with the best scores in receiving metrics (receptions, receiving yards, touchdowns) were selected as optimal receivers.
Punting: Players excelling in punting metrics (net yards, touchbacks) were selected as optimal punters.
Kick Returns: Players with high scores in kick return metrics (return yards, touchdowns) were chosen as optimal returners.
Tackles: Players with strong performance in tackle metrics (solo tackles, assists, tackles for a loss) were selected as optimal tacklers.
Sacks and Safeties: Players with high scores in sack and safety metrics (sack yards, safeties) were chosen as optimal defenders.
Fumbles: Players who excelled in forcing fumbles were selected as optimal defenders.

'''

In [73]:
References ='''

References:

Goodfellow, I., Bengio, Y., & Courville, A. (2016). Deep Learning. MIT Press. Retrieved from http://www.deeplearningbook.org

Steussie, T. (Owner). (n.d.). NFL Play Statistics dataset (primary). Retrieved from 
https://www.kaggle.com/datasets/toddsteussie/nfl-play-statistics-dataset-2004-to-present/data

Stack Overflow. (n.d.). Retrieved January, 2024, from https://stackoverflow.com/

Reddit. (n.d.). Retrieved January, 2024, from https://www.reddit.com/

GitHub. (n.d.). Retrieved January, 2024, from https://www.github.com/

Towards Data Science. (n.d.). Retrieved January, 2024, from https://towardsdatascience.com/

DataCamp Community. (n.d.). Retrieved January, 2024, from https://www.datacamp.com/community

Scikit-learn. (n.d.). Support vector machines. Retrieved from https://scikit-learn.org/stable/modules/svm.html

'''