In [16]:
from pybaseball import batting_stats
from pybaseball import playerid_reverse_lookup
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from tensorflow.keras.callbacks import ModelCheckpoint
import pandas as pd
import tensorflow as tf
import pandas as pd 
import warnings

## Create the DataFrame

#### Define Timeframe

In [24]:
start_year = 2018
end_year = 2018

#### Import Lahman player awards

In [25]:
# download up-to-date data from https://www.seanlahman.com/baseball-archive/statistics/

awards = pd.read_csv('Resources/AwardsPlayers.csv').drop(columns=['tie','notes'])
mvp_awards = awards.sort_values('yearID').loc[
    (awards['yearID'] >= start_year) & (awards['awardID'] == 'Most Valuable Player')]
mvp_awards.head()

Unnamed: 0,playerID,awardID,yearID,lgID
6237,yelicch01,Most Valuable Player,2018,NL
6236,bettsmo01,Most Valuable Player,2018,AL
6312,bellico01,Most Valuable Player,2019,NL
6311,troutmi01,Most Valuable Player,2019,AL
6387,freemfr01,Most Valuable Player,2020,NL


#### MVP names based on ID

In [26]:
def reverseLookup(mvp_awards):
    player_names = playerid_reverse_lookup(mvp_awards['playerID'], key_type='bbref')
    player_names['Name'] = player_names['name_first'].map(
        lambda x: x.title()) + ' ' + player_names['name_last'].map(lambda x: x.title())
    player_names = player_names[['Name', 'key_bbref','mlb_played_first', 'mlb_played_last']]
    return player_names.merge(mvp_awards, left_on= 'key_bbref', right_on='playerID').drop(['key_bbref'], axis=1)

mvp_awards = reverseLookup(mvp_awards)
all_awards = reverseLookup(awards)
mvp_awards.head()

Unnamed: 0,Name,mlb_played_first,mlb_played_last,playerID,awardID,yearID,lgID
0,José Abreu,2014.0,2022.0,abreujo02,Most Valuable Player,2020,AL
1,Cody Bellinger,2017.0,2022.0,bellico01,Most Valuable Player,2019,NL
2,Mookie Betts,2014.0,2022.0,bettsmo01,Most Valuable Player,2018,AL
3,Freddie Freeman,2010.0,2022.0,freemfr01,Most Valuable Player,2020,NL
4,Bryce Harper,2012.0,2022.0,harpebr03,Most Valuable Player,2021,NL


In [27]:
all_awards.head()

Unnamed: 0,Name,mlb_played_first,mlb_played_last,playerID,awardID,yearID,lgID
0,Hank Aaron,1954.0,1976.0,aaronha01,TSN All-Star,1956,ML
1,Hank Aaron,1954.0,1976.0,aaronha01,TSN Player of the Year,1956,NL
2,Hank Aaron,1954.0,1976.0,aaronha01,Most Valuable Player,1957,NL
3,Hank Aaron,1954.0,1976.0,aaronha01,Gold Glove,1958,NL
4,Hank Aaron,1954.0,1976.0,aaronha01,TSN All-Star,1958,ML


### Merge awards and players dataframes

In [28]:
warnings.simplefilter('ignore')


def Merger(start_year, end_year, mvp_awards, all_awards):
    for i in range(end_year-start_year +1):
        current_year = start_year + i
        if i < 1:
            data = getPlayersYearStats(current_year)
            mvpChecker(data, 'Name', mvp_awards, current_year)
            data = addAllAwards(data, all_awards, current_year)
        else:
            new_data = getPlayersYearStats(current_year)
            mvpChecker(new_data, 'Name', mvp_awards, current_year)
            new_data = addAllAwards(new_data, all_awards, current_year)
            data = pd.concat([data, new_data])
    return data



def getPlayersYearStats(current_year):
    return batting_stats_range(str(current_year)+'-04-07', str(current_year)+'-10-05')
def mvpChecker(data, column, mvp_awards, current_year):
    for mvp in mvp_awards[column].loc[mvp_awards['yearID'] == current_year]:
        data['MVP'] = data[column].map(lambda x: categorizer(mvp, x))
def categorizer(mvp, player):
        if mvp == player:
            return 1
        else:
            return 0
def addAllAwards(data, all_awards, current_year):
    data = data.merge(all_awards.loc[all_awards['yearID'] == current_year][
        ['Name','awardID']], how='left', on='Name')
    data['awardID'] = data['awardID'].fillna('No Award')
    return data
        
## 
    
raw_data = Merger(start_year, end_year, mvp_awards, all_awards)
data = raw_data.drop(['Name'], axis=1)
data.head()

Unnamed: 0,Age,#days,Lev,Tm,G,PA,AB,R,H,2B,...,SF,GDP,SB,CS,BA,OBP,SLG,OPS,MVP,awardID
0,31,1421,Maj-AL,Chicago,122,526,474,64,124,35,...,6,13,2,0,0.262,0.323,0.466,0.789,0,Silver Slugger
1,20,1407,Maj-NL,Atlanta,111,487,433,78,127,26,...,3,4,16,4,0.293,0.366,0.552,0.918,0,No Award
2,22,1407,Maj-AL,Tampa Bay,85,323,288,43,80,7,...,2,6,6,5,0.278,0.348,0.406,0.754,0,No Award
3,28,1407,Maj-NL,Atlanta,19,22,19,8,3,1,...,0,1,1,0,0.158,0.273,0.368,0.641,0,No Award
4,29,1409,Maj-NL,"St. Louis,Washington",115,324,296,39,71,9,...,0,6,0,0,0.24,0.306,0.473,0.779,0,No Award


In [29]:
data.
## Filter: PA > 400
## Delete: OPS, gdp, sh
## Add: WAR, WRC+

Index(['Age', '#days', 'Lev', 'Tm', 'G', 'PA', 'AB', 'R', 'H', '2B', '3B',
       'HR', 'RBI', 'BB', 'IBB', 'SO', 'HBP', 'SH', 'SF', 'GDP', 'SB', 'CS',
       'BA', 'OBP', 'SLG', 'OPS', 'MVP', 'awardID'],
      dtype='object')

In [30]:
### Check that MVP's are in dataframe
mvp_check = raw_data.loc[raw_data['MVP'] == 1]
print('There are/is '+str(mvp_check['Name'].nunique())+' unique mvps')
print(mvp_check['Name'].unique())
mvp_check

There are/is 1 unique mvps
['Christian Yelich']


Unnamed: 0,Name,Age,#days,Lev,Tm,G,PA,AB,R,H,...,SF,GDP,SB,CS,BA,OBP,SLG,OPS,MVP,awardID
994,Christian Yelich,26,1406,Maj-NL,Milwaukee,141,624,548,112,177,...,2,13,22,4,0.323,0.402,0.599,1.001,1,Most Valuable Player
995,Christian Yelich,26,1406,Maj-NL,Milwaukee,141,624,548,112,177,...,2,13,22,4,0.323,0.402,0.599,1.001,1,Hank Aaron Award
996,Christian Yelich,26,1406,Maj-NL,Milwaukee,141,624,548,112,177,...,2,13,22,4,0.323,0.402,0.599,1.001,1,Silver Slugger
997,Christian Yelich,26,1406,Maj-NL,Milwaukee,141,624,548,112,177,...,2,13,22,4,0.323,0.402,0.599,1.001,1,TSN All-Star


### Clean Categorical

In [8]:
data_objects = data.dtypes[data.dtypes=='object'].index.to_list()
data[data_objects].nunique()

Lev          3
Tm         339
awardID     27
dtype: int64

In [9]:
data['Tm'] = data['Tm'].map(lambda x: x.split(',')[-1])
data_objects = data.dtypes[data.dtypes=='object'].index.to_list()
data[data_objects].nunique()

Lev         3
Tm         28
awardID    27
dtype: int64

In [10]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encoded_df = pd.DataFrame(enc.fit_transform(data[data_objects]), 
                          columns = enc.get_feature_names(data_objects))

encoded_df.head()

Unnamed: 0,Lev_Maj-AL,"Lev_Maj-AL,Maj-NL",Lev_Maj-NL,Tm_Arizona,Tm_Atlanta,Tm_Baltimore,Tm_Boston,Tm_Chicago,Tm_Cincinnati,Tm_Cleveland,...,awardID_Rolaids Relief Man Award,awardID_Rookie of the Year,awardID_SIlver Slugger,awardID_Silver Slugger,awardID_TSN All-Star,awardID_TSN Major League Player of the Year,awardID_TSN Pitcher of the Year,awardID_TSN Reliever of the Year,awardID_Triple Crown,awardID_World Series MVP
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Merge one-hot encoded features and drop the originals

data = data.merge(encoded_df, left_index=True, right_index=True).drop(columns=data_objects, axis=1)
data.head()

Unnamed: 0,Age,#days,G,PA,AB,R,H,2B,3B,HR,...,awardID_Rolaids Relief Man Award,awardID_Rookie of the Year,awardID_SIlver Slugger,awardID_Silver Slugger,awardID_TSN All-Star,awardID_TSN Major League Player of the Year,awardID_TSN Pitcher of the Year,awardID_TSN Reliever of the Year,awardID_Triple Crown,awardID_World Series MVP
0,26,5153,1,1,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,35,4690,151,663,559,96,164,29,3,15,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,24,4347,1,1,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,37,3966,136,556,479,51,118,29,1,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,26,3610,5,7,7,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Split our preprocessed data into our features and target arrays
y = data.MVP.values
X = data.drop(columns=['MVP'], axis=1)

# # Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state= 0)

In [13]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Compile, Train and Evaluate Neural Net

In [14]:
# Define the model - deep neural net
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 = number_input_features
hidden_nodes_layer2 = hidden_nodes_layer1/4
hidden_nodes_layer3 = hidden_nodes_layer2/5


nn = tf.keras.models.Sequential()
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="sigmoid"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))


nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 82)                6806      
                                                                 
 dense_1 (Dense)             (None, 20)                1660      
                                                                 
 dense_2 (Dense)             (None, 4)                 84        
                                                                 
 dense_3 (Dense)             (None, 1)                 5         
                                                                 
Total params: 8,555
Trainable params: 8,555
Non-trainable params: 0
_________________________________________________________________


2022-08-08 17:26:01.964468: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [15]:
# Compile the Sequential model together and customize metrics
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])


# Train the model 
fit_model = nn.fit(X_train_scaled, y_train, epochs=16) 
# Evaluate the model using the test data 
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=0)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
Loss: nan, Accuracy: 0.9968608021736145
