# Using the Cleaner to import our raw data (similar to what's in the database).
## Creating and cleaning the two dataframes (AL and NL), merging the dataframes, and saving the dataframe as a CSV to potentially be used in the machine learning model.

In [1]:
pip install scikit-learn==1.0 -U

Note: you may need to restart the kernel to use updated packages.


In [2]:
from Cleaner import *
from config import db_password

from sqlalchemy import create_engine
from pathlib import Path
import numpy as np
import pandas as pd
import psycopg2

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [3]:
awards = pd.read_csv('Resources/AwardsPlayers.csv')
all_awards = reverseLookup(awards)

Gathering player lookup table. This may take a moment.


In [4]:
data_NL, data_AL = Merger(all_awards)
data_NL

............Scraping Data.............

1982 Time: 1.2883124351501465 Seconds
1983 Time: 1.223088264465332 Seconds
1984 Time: 1.0875601768493652 Seconds
1985 Time: 1.229494333267212 Seconds
1986 Time: 1.2267870903015137 Seconds

.....Approximately 46-58 Seconds.....

Total Time: 58.760353565216064 Seconds


Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,CSW%,xBA,xSLG,xwOBA,key_fangraphs,playerID,awardID,yearID,lgID,MVP
6,1002015,1982,Gary Carter,MON,28,154,557,653,163,101,...,,,,,1002015.0,cartega01,Gold Glove,1982.0,NL,0
7,1002015,1982,Gary Carter,MON,28,154,557,653,163,101,...,,,,,1002015.0,cartega01,Silver Slugger,1982.0,NL,0
8,1002015,1982,Gary Carter,MON,28,154,557,653,163,101,...,,,,,1002015.0,cartega01,TSN All-Star,1982.0,NL,0
9,1003091,1982,Andre Dawson,MON,27,148,607,659,183,116,...,,,,,1003091.0,dawsoan01,Gold Glove,1982.0,NL,0
12,1011586,1982,Mike Schmidt,PHI,32,148,514,631,144,80,...,,,,,1011586.0,schmimi01,Gold Glove,1982.0,NL,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,3516,2021,Eric Hosmer,SDP,31,151,509,565,137,97,...,0.263,0.268,0.406,0.326,,,No Award,,NL,0
141,7859,2021,Charlie Blackmon,COL,34,150,514,582,139,97,...,0.250,0.291,0.450,0.359,,,No Award,,NL,0
142,19892,2021,Pavin Smith,ARI,25,145,498,545,133,91,...,0.282,0.258,0.402,0.317,,,No Award,,NL,0
144,12552,2021,Eugenio Suarez,CIN,29,145,505,574,100,46,...,0.295,0.215,0.455,0.326,,,No Award,,NL,0


In [5]:
# Clean data_NL
# Dropped all columns with NaN values 

data_NL = data_NL.dropna(axis=1)
data_NL

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,BB%+,K%+,OBP+,SLG+,ISO+,BABIP+,Events,awardID,lgID,MVP
6,1002015,1982,Gary Carter,MON,28,154,557,653,163,101,...,142,78,116,132,181,98,0,Gold Glove,NL,0
7,1002015,1982,Gary Carter,MON,28,154,557,653,163,101,...,142,78,116,132,181,98,0,Silver Slugger,NL,0
8,1002015,1982,Gary Carter,MON,28,154,557,653,163,101,...,142,78,116,132,181,98,0,TSN All-Star,NL,0
9,1003091,1982,Andre Dawson,MON,27,148,607,659,183,116,...,62,114,105,129,164,112,0,Gold Glove,NL,0
12,1011586,1982,Mike Schmidt,PHI,32,148,514,631,144,80,...,202,165,122,142,222,106,0,Gold Glove,NL,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,3516,2021,Eric Hosmer,SDP,31,151,509,565,137,97,...,92,79,103,94,73,106,413,No Award,NL,0
141,7859,2021,Charlie Blackmon,COL,34,150,514,582,139,97,...,101,70,107,98,82,104,426,No Award,NL,0
142,19892,2021,Pavin Smith,ARI,25,145,498,545,133,91,...,84,87,101,96,80,108,393,No Award,NL,0
144,12552,2021,Eugenio Suarez,CIN,29,145,505,574,100,46,...,106,134,88,102,134,76,339,No Award,NL,0


In [6]:
# Make a list of the columns to determine which columns should be kept

list(data_NL.columns)

['IDfg',
 'Season',
 'Name',
 'Team',
 'Age',
 'G',
 'AB',
 'PA',
 'H',
 '1B',
 '2B',
 '3B',
 'HR',
 'R',
 'RBI',
 'BB',
 'IBB',
 'SO',
 'HBP',
 'SF',
 'SH',
 'GDP',
 'SB',
 'CS',
 'AVG',
 'BB%',
 'K%',
 'BB/K',
 'OBP',
 'SLG',
 'OPS',
 'ISO',
 'BABIP',
 'wOBA',
 'wRAA',
 'wRC',
 'Bat',
 'Fld',
 'Rep',
 'Pos',
 'RAR',
 'WAR',
 'Spd',
 'wRC+',
 'WPA',
 '-WPA',
 '+WPA',
 'RE24',
 'REW',
 'pLI',
 'PH',
 'WPA/LI',
 'Clutch',
 'BsR',
 'Def',
 'wSB',
 'Age Rng',
 'Off',
 'Lg',
 'TTO%',
 'AVG+',
 'BB%+',
 'K%+',
 'OBP+',
 'SLG+',
 'ISO+',
 'BABIP+',
 'Events',
 'awardID',
 'lgID',
 'MVP']

In [7]:
# Google each stat to determine it's meaning and relevance. 
# Determined that some stats were fielding, pitching, or otherwise irrelevant to our task at hand.

data_NL = data_NL[[ 'G',
 'AB',
 'PA',
 'H',
 '1B',
 '2B',
 '3B',
 'HR',
 'R',
 'RBI',
 'BB',
 'IBB',
 'SO',
 'HBP',
 'SF',
 'SH',
 'SB',
 'AVG',
 'OBP',
 'SLG', 
 'OPS',
 'ISO',
 'BABIP',
 'wOBA',
 'wRAA',
 'wRC',
 'WAR',
 'Spd',
 'wRC+',
 'WPA',
 '-WPA',
 '+WPA',
 'RE24',
 'REW',
 'pLI',
 'WPA/LI',
 'Clutch',
 'BsR',
 'wSB',
 'Off',
 'AVG+',
 'OBP+',
 'SLG+',
 'ISO+',
 'BABIP+',
 'MVP']]

data_NL

Unnamed: 0,G,AB,PA,H,1B,2B,3B,HR,R,RBI,...,Clutch,BsR,wSB,Off,AVG+,OBP+,SLG+,ISO+,BABIP+,MVP
6,154,557,653,163,101,32,1,29,91,97,...,-1.05,-1.6,-1.6,34.9,110,116,132,181,98,0
7,154,557,653,163,101,32,1,29,91,97,...,-1.05,-1.6,-1.6,34.9,110,116,132,181,98,0
8,154,557,653,163,101,32,1,29,91,97,...,-1.05,-1.6,-1.6,34.9,110,116,132,181,98,0
9,148,607,659,183,116,37,7,23,107,83,...,-0.85,3.9,3.9,30.8,114,105,129,164,112,0
12,148,514,631,144,80,26,3,35,108,87,...,-0.47,0.1,0.1,42.9,105,122,142,222,106,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,151,509,565,137,97,28,0,12,53,65,...,0.32,-5.5,-1.2,-3.9,108,103,94,73,106,0
141,150,514,582,139,97,25,4,13,76,78,...,0.52,1.4,0.0,-2.8,109,107,98,82,104,0
142,145,498,545,133,91,27,4,11,68,49,...,-0.13,-2.4,-0.3,-4.9,107,101,96,80,108,0
144,145,505,574,100,46,23,0,31,71,79,...,-1.37,-4.1,-0.8,-16.1,80,88,102,134,76,0


In [8]:
# Remove duplicate rows that were present due to players winning multiple awards in one year (see original, raw dataframe)

data_NL = data_NL.drop_duplicates()
data_NL

Unnamed: 0,G,AB,PA,H,1B,2B,3B,HR,R,RBI,...,Clutch,BsR,wSB,Off,AVG+,OBP+,SLG+,ISO+,BABIP+,MVP
6,154,557,653,163,101,32,1,29,91,97,...,-1.05,-1.6,-1.6,34.9,110,116,132,181,98,0
9,148,607,659,183,116,37,7,23,107,83,...,-0.85,3.9,3.9,30.8,114,105,129,164,112,0
12,148,514,631,144,80,26,3,35,108,87,...,-0.47,0.1,0.1,42.9,105,122,142,222,106,0
17,150,575,652,175,111,27,5,32,87,100,...,-0.98,2.4,2.4,43.2,115,115,139,192,108,0
19,162,598,698,168,107,23,2,36,113,109,...,0.50,0.3,0.3,34.9,106,115,131,188,106,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,151,509,565,137,97,28,0,12,53,65,...,0.32,-5.5,-1.2,-3.9,108,103,94,73,106,0
141,150,514,582,139,97,25,4,13,76,78,...,0.52,1.4,0.0,-2.8,109,107,98,82,104,0
142,145,498,545,133,91,27,4,11,68,49,...,-0.13,-2.4,-0.3,-4.9,107,101,96,80,108,0
144,145,505,574,100,46,23,0,31,71,79,...,-1.37,-4.1,-0.8,-16.1,80,88,102,134,76,0


In [9]:
# Count instances of MVP winners to make sure our data makes sense 

data_NL['MVP'].value_counts()[1]

38

In [10]:
# Repeat the above cleaning on the data_AL data set
# Merge both data_NL and data_AL afterwards 

data_AL

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,CSW%,xBA,xSLG,xwOBA,key_fangraphs,playerID,awardID,yearID,lgID,MVP
0,1014396,1982,Robin Yount,MIL,26,156,635,704,210,123,...,,,,,1014396.0,yountro01,Gold Glove,1982.0,AL,1
1,1014396,1982,Robin Yount,MIL,26,156,635,704,210,123,...,,,,,1014396.0,yountro01,Most Valuable Player,1982.0,AL,1
2,1014396,1982,Robin Yount,MIL,26,156,635,704,210,123,...,,,,,1014396.0,yountro01,Silver Slugger,1982.0,AL,1
3,1014396,1982,Robin Yount,MIL,26,156,635,704,210,123,...,,,,,1014396.0,yountro01,TSN All-Star,1982.0,AL,1
4,1014396,1982,Robin Yount,MIL,26,156,635,704,210,123,...,,,,,1014396.0,yountro01,TSN Major League Player of the Year,1982.0,AL,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,18373,2021,Ryan Mountcastle,BAL,24,144,534,586,136,79,...,0.282,0.245,0.457,0.326,,,No Award,,AL,0
143,10243,2021,Randal Grichuk,TOR,29,149,511,545,123,75,...,0.249,0.234,0.402,0.297,,,No Award,,AL,0
147,2396,2021,Carlos Santana,KCR,35,158,565,659,121,87,...,0.242,0.244,0.421,0.335,,,No Award,,AL,0
148,1744,2021,Miguel Cabrera,DET,38,130,472,526,121,90,...,0.274,0.234,0.420,0.315,,,No Award,,AL,0


In [11]:
# Dropped all columns with NaN values 

data_AL = data_AL.dropna(axis=1)
data_AL

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,BB%+,K%+,OBP+,SLG+,ISO+,BABIP+,Events,awardID,lgID,MVP
0,1014396,1982,Robin Yount,MIL,26,156,635,704,210,123,...,91,71,115,144,179,116,0,Gold Glove,AL,1
1,1014396,1982,Robin Yount,MIL,26,156,635,704,210,123,...,91,71,115,144,179,116,0,Most Valuable Player,AL,1
2,1014396,1982,Robin Yount,MIL,26,156,635,704,210,123,...,91,71,115,144,179,116,0,Silver Slugger,AL,1
3,1014396,1982,Robin Yount,MIL,26,156,635,704,210,123,...,91,71,115,144,179,116,0,TSN All-Star,AL,1
4,1014396,1982,Robin Yount,MIL,26,156,635,704,210,123,...,91,71,115,144,179,116,0,TSN Major League Player of the Year,AL,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,18373,2021,Ryan Mountcastle,BAL,24,144,534,586,136,79,...,83,120,98,117,137,102,380,No Award,AL,0
143,10243,2021,Randal Grichuk,TOR,29,149,511,545,123,75,...,58,91,89,102,107,91,401,No Award,AL,0
147,2396,2021,Carlos Santana,KCR,35,158,565,659,121,87,...,154,68,101,82,75,78,468,No Award,AL,0
148,1744,2021,Miguel Cabrera,DET,38,130,472,526,121,90,...,90,98,100,93,76,104,363,No Award,AL,0


In [12]:
# Google each stat to determine it's meaning and relevance. 
# Determined that some stats were fielding, pitching, or otherwise irrelevant to our task at hand.

data_AL = data_AL[[ 'G',
 'AB',
 'PA',
 'H',
 '1B',
 '2B',
 '3B',
 'HR',
 'R',
 'RBI',
 'BB',
 'IBB',
 'SO',
 'HBP',
 'SF',
 'SH',
 'SB',
 'AVG',
 'OBP',
 'SLG', 
 'OPS',
 'ISO',
 'BABIP',
 'wOBA',
 'wRAA',
 'wRC',
 'WAR',
 'Spd',
 'wRC+',
 'WPA',
 '-WPA',
 '+WPA',
 'RE24',
 'REW',
 'pLI',
 'WPA/LI',
 'Clutch',
 'BsR',
 'wSB',
 'Off',
 'AVG+',
 'OBP+',
 'SLG+',
 'ISO+',
 'BABIP+',
 'MVP']]

data_AL

Unnamed: 0,G,AB,PA,H,1B,2B,3B,HR,R,RBI,...,Clutch,BsR,wSB,Off,AVG+,OBP+,SLG+,ISO+,BABIP+,MVP
0,156,635,704,210,123,46,12,29,129,114,...,-0.63,1.6,1.6,54.7,125,115,144,179,116,1
1,156,635,704,210,123,46,12,29,129,114,...,-0.63,1.6,1.6,54.7,125,115,144,179,116,1
2,156,635,704,210,123,46,12,29,129,114,...,-0.63,1.6,1.6,54.7,125,115,144,179,116,1
3,156,635,704,210,123,46,12,29,129,114,...,-0.63,1.6,1.6,54.7,125,115,144,179,116,1
4,156,635,704,210,123,46,12,29,129,114,...,-0.63,1.6,1.6,54.7,125,115,144,179,116,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,144,534,586,136,79,23,1,33,77,89,...,-1.03,-1.2,-0.9,6.6,104,98,117,137,102,0
143,149,511,545,123,75,25,1,22,59,81,...,-1.14,-4.9,-1.6,-14.7,98,89,102,107,91,0
147,158,565,659,121,87,15,0,19,66,69,...,0.25,-0.8,-0.2,-15.3,87,101,82,75,78,0
148,130,472,526,121,90,16,0,15,48,75,...,0.53,-6.3,-0.5,-10.7,104,100,93,76,104,0


In [13]:
# Remove duplicate rows that were present due to players winning multiple awards in one year (see original, raw dataframe)

data_AL = data_AL.drop_duplicates()
data_AL

Unnamed: 0,G,AB,PA,H,1B,2B,3B,HR,R,RBI,...,Clutch,BsR,wSB,Off,AVG+,OBP+,SLG+,ISO+,BABIP+,MVP
0,156,635,704,210,123,46,12,29,129,114,...,-0.63,1.6,1.6,54.7,125,115,144,179,116,1
10,153,575,655,173,96,42,5,30,94,97,...,-1.68,-0.6,-0.6,35.7,114,112,136,179,107,0
15,162,609,727,178,102,37,7,32,122,98,...,-0.87,-0.2,-0.2,44.2,111,123,133,175,114,0
24,162,602,708,183,125,29,4,25,100,78,...,0.00,2.2,2.2,37.5,115,121,122,135,106,0
25,151,550,627,174,111,30,1,32,87,110,...,1.72,0.6,0.6,40.0,120,119,136,168,114,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,144,534,586,136,79,23,1,33,77,89,...,-1.03,-1.2,-0.9,6.6,104,98,117,137,102,0
143,149,511,545,123,75,25,1,22,59,81,...,-1.14,-4.9,-1.6,-14.7,98,89,102,107,91,0
147,158,565,659,121,87,15,0,19,66,69,...,0.25,-0.8,-0.2,-15.3,87,101,82,75,78,0
148,130,472,526,121,90,16,0,15,48,75,...,0.53,-6.3,-0.5,-10.7,104,100,93,76,104,0


In [14]:
# Count instances of MVP winners to make sure our data makes sense 

data_AL['MVP'].value_counts()[1]

35

In [15]:
# Going back and adding Lindsay's new metric to each data_NL and data_AL
data_NL['XBH+'] = (((data_NL['R'] + data_NL['2B'] + data_NL['3B'] + data_NL['HR']) - (data_NL['SO']))/data_NL['PA'])
data_NL.head()

Unnamed: 0,G,AB,PA,H,1B,2B,3B,HR,R,RBI,...,BsR,wSB,Off,AVG+,OBP+,SLG+,ISO+,BABIP+,MVP,XBH+
6,154,557,653,163,101,32,1,29,91,97,...,-1.6,-1.6,34.9,110,116,132,181,98,0,0.136294
9,148,607,659,183,116,37,7,23,107,83,...,3.9,3.9,30.8,114,105,129,164,112,0,0.119879
12,148,514,631,144,80,26,3,35,108,87,...,0.1,0.1,42.9,105,122,142,222,106,0,0.064976
17,150,575,652,175,111,27,5,32,87,100,...,2.4,2.4,43.2,115,115,139,192,108,0,0.095092
19,162,598,698,168,107,23,2,36,113,109,...,0.3,0.3,34.9,106,115,131,188,106,1,0.057307


In [16]:
data_AL['XBH+'] = (((data_AL['R'] + data_AL['2B'] + data_AL['3B'] + data_AL['HR']) - (data_AL['SO']))/data_AL['PA'])
data_AL.head()

Unnamed: 0,G,AB,PA,H,1B,2B,3B,HR,R,RBI,...,BsR,wSB,Off,AVG+,OBP+,SLG+,ISO+,BABIP+,MVP,XBH+
0,156,635,704,210,123,46,12,29,129,114,...,1.6,1.6,54.7,125,115,144,179,116,1,0.21733
10,153,575,655,173,96,42,5,30,94,97,...,-0.6,-0.6,35.7,114,112,136,179,107,0,0.138931
15,162,609,727,178,102,37,7,32,122,98,...,-0.2,-0.2,44.2,111,123,133,175,114,0,0.100413
24,162,602,708,183,125,29,4,25,100,78,...,2.2,2.2,37.5,115,121,122,135,106,0,0.149718
25,151,550,627,174,111,30,1,32,87,110,...,0.6,0.6,40.0,120,119,136,168,114,0,0.108453


In [17]:
# Merge the final data_NL and data_AL datasets 
cleaned_merged_df = pd.concat([data_NL, data_AL], axis=0)
cleaned_merged_df

Unnamed: 0,G,AB,PA,H,1B,2B,3B,HR,R,RBI,...,BsR,wSB,Off,AVG+,OBP+,SLG+,ISO+,BABIP+,MVP,XBH+
6,154,557,653,163,101,32,1,29,91,97,...,-1.6,-1.6,34.9,110,116,132,181,98,0,0.136294
9,148,607,659,183,116,37,7,23,107,83,...,3.9,3.9,30.8,114,105,129,164,112,0,0.119879
12,148,514,631,144,80,26,3,35,108,87,...,0.1,0.1,42.9,105,122,142,222,106,0,0.064976
17,150,575,652,175,111,27,5,32,87,100,...,2.4,2.4,43.2,115,115,139,192,108,0,0.095092
19,162,598,698,168,107,23,2,36,113,109,...,0.3,0.3,34.9,106,115,131,188,106,1,0.057307
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,144,534,586,136,79,23,1,33,77,89,...,-1.2,-0.9,6.6,104,98,117,137,102,0,-0.046075
143,149,511,545,123,75,25,1,22,59,81,...,-4.9,-1.6,-14.7,98,89,102,107,91,0,-0.012844
147,158,565,659,121,87,15,0,19,66,69,...,-0.8,-0.2,-15.3,87,101,82,75,78,0,-0.003035
148,130,472,526,121,90,16,0,15,48,75,...,-6.3,-0.5,-10.7,104,100,93,76,104,0,-0.074144


In [18]:
# Count instances of MVP winners to make sure our data makes sense 

cleaned_merged_df['MVP'].value_counts()[1]

73

In [19]:
# Saving cleaned_merged_df as a CSV. I think this dataframe has potential to be used in the machine learning model.

cleaned_merged_df.to_csv('Resources/cleaned_merged_df.csv')

In [20]:
# Build ML model 

y = cleaned_merged_df['MVP'].values
X = cleaned_merged_df.drop(['MVP'],1).values

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [22]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [23]:
# Train the EasyEnsembleClassifier

from imblearn.ensemble import EasyEnsembleClassifier

ees_model = EasyEnsembleClassifier()
ees_model.fit(X_train_scaled, y_train)

EasyEnsembleClassifier()

In [24]:
# Calculated the balanced accuracy score

y_pred = ees_model.predict(X_test_scaled)
balanced_accuracy_score(y_test, y_pred)

0.9407894736842105

In [25]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1072,144
Actual 1,0,16


In [26]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.88      1.00      0.94      0.94      0.87      1216
          1       0.10      1.00      0.88      0.18      0.94      0.89        16

avg / total       0.99      0.88      1.00      0.93      0.94      0.87      1232



In [27]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
acc_score = accuracy_score(y_test, y_pred)

In [28]:

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
acc_score = accuracy_score(y_test, y_pred)
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1072,144
Actual 1,0,16


Accuracy Score : 0.8831168831168831
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.88      0.94      1216
           1       0.10      1.00      0.18        16

    accuracy                           0.88      1232
   macro avg       0.55      0.94      0.56      1232
weighted avg       0.99      0.88      0.93      1232



In [29]:
# Import 2022 stats (AL)

from pybaseball import batting_stats

current_year_AL = batting_stats(2022, league='al')

current_year_AL

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA
0,15640,2022,Aaron Judge,NYY,30,120,448,529,133,65,...,0.256,118.4,193,0.603,320,0.177,0.292,0.297,0.698,0.453
3,13510,2022,Jose Ramirez,CLE,29,118,452,510,129,63,...,0.066,111.6,154,0.378,407,0.156,0.213,0.265,0.421,0.331
1,19556,2022,Yordan Alvarez,HOU,25,106,367,441,109,59,...,0.200,117.4,175,0.614,285,0.167,0.254,0.323,0.654,0.455
2,17350,2022,Rafael Devers,BOS,25,107,428,468,128,70,...,0.121,113.7,178,0.514,346,0.103,0.253,0.286,0.520,0.367
4,19950,2022,Andres Gimenez,CLE,23,109,359,404,111,74,...,0.078,109.9,109,0.385,283,0.150,0.281,0.268,0.437,0.342
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62,9256,2022,A.J. Pollock,CHW,34,103,360,389,85,57,...,0.090,110.8,119,0.413,288,0.129,0.255,0.256,0.409,0.314
67,12282,2022,Rougned Odor,BAL,28,110,346,382,69,39,...,0.069,109.7,88,0.340,259,0.140,0.269,0.202,0.333,0.268
58,19198,2022,Yuli Gurriel,HOU,38,113,419,452,103,62,...,0.022,107.1,125,0.343,364,0.185,0.252,0.231,0.314,0.267
65,18607,2022,Jared Walsh,LAA,28,118,423,454,91,56,...,0.095,112.1,121,0.425,285,0.169,0.294,0.222,0.369,0.280


In [30]:
# Add new columns for MVP and XBH+, then set the columns to match the trained set

current_year_AL['MVP'] = 0
current_year_AL['XBH+'] =  (((current_year_AL['R'] + current_year_AL['2B'] + current_year_AL['3B'] + current_year_AL['HR']) - (current_year_AL['SO']))/current_year_AL['PA'])

current_year_AL = current_year_AL[[ 'G',
 'AB',
 'PA',
 'H',
 '1B',
 '2B',
 '3B',
 'HR',
 'R',
 'RBI',
 'BB',
 'IBB',
 'SO',
 'HBP',
 'SF',
 'SH',
 'SB',
 'AVG',
 'OBP',
 'SLG', 
 'OPS',
 'ISO',
 'BABIP',
 'wOBA',
 'wRAA',
 'wRC',
 'WAR',
 'Spd',
 'wRC+',
 'WPA',
 '-WPA',
 '+WPA',
 'RE24',
 'REW',
 'pLI',
 'WPA/LI',
 'Clutch',
 'BsR',
 'wSB',
 'Off',
 'AVG+',
 'OBP+',
 'SLG+',
 'ISO+',
 'BABIP+',
 'XBH+',
 'MVP']]

current_year_AL

Unnamed: 0,G,AB,PA,H,1B,2B,3B,HR,R,RBI,...,BsR,wSB,Off,AVG+,OBP+,SLG+,ISO+,BABIP+,XBH+,MVP
0,120,448,529,133,65,20,0,48,100,105,...,1.3,1.9,58.0,123,128,170,247,109,0.062382,0
3,118,452,510,129,63,37,4,25,70,103,...,4.2,0.4,33.9,118,115,141,179,94,0.166667,0
1,106,367,441,109,59,17,2,31,78,79,...,-1.0,-0.7,39.8,123,130,156,210,106,0.090703,0
2,107,428,468,128,70,32,1,25,68,64,...,0.9,-0.4,27.2,124,116,142,172,111,0.091880,0
4,109,359,404,111,74,20,2,15,49,59,...,2.3,1.7,25.8,128,122,129,130,126,0.009901,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62,103,360,389,85,57,20,1,7,42,40,...,-1.5,-0.5,-9.3,98,92,91,81,96,-0.012853,0
67,110,346,382,69,39,16,3,11,40,43,...,0.1,-0.1,-9.6,82,86,92,107,81,-0.054974,0
58,113,419,452,103,62,34,0,7,46,37,...,0.3,1.2,-3.8,102,95,97,89,93,0.064159,0
65,118,423,454,91,56,18,2,15,41,44,...,-0.8,-0.3,-12.1,89,87,96,107,97,-0.136564,0


In [31]:
# Create X values with current 2022 data to predict 2022 AL MVP

realX = current_year_AL.drop(['MVP'],1).values

In [32]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(realX)

# Scale the data

X_test_scaled = X_scaler.transform(realX)

pred_mvp_AL = ees_model.predict(X_test_scaled)

pred_mvp_AL

array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0], dtype=int64)

In [33]:
# Import 2022 stats (NL)

from pybaseball import batting_stats

current_year_NL = batting_stats(2022, league='nl')

current_year_NL

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA
0,9218,2022,Paul Goldschmidt,STL,34,117,442,511,148,83,...,0.123,112.3,159,0.466,341,0.194,0.289,0.265,0.499,0.371
2,9777,2022,Nolan Arenado,STL,31,116,440,491,132,72,...,0.086,111.4,151,0.395,382,0.164,0.252,0.268,0.454,0.343
1,5361,2022,Freddie Freeman,LAD,32,123,479,548,156,97,...,0.097,112.3,191,0.473,404,0.121,0.206,0.315,0.540,0.401
5,11493,2022,Manny Machado,SDP,29,115,441,494,131,77,...,0.095,112.4,167,0.483,346,0.130,0.240,0.264,0.444,0.339
21,12916,2022,Francisco Lindor,NYM,28,124,482,546,131,89,...,0.081,110.7,155,0.403,385,0.155,0.252,0.253,0.426,0.333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,11342,2022,Jesus Aguilar,MIA,32,113,415,456,98,65,...,0.072,108.1,113,0.354,319,0.176,0.301,0.240,0.400,0.302
54,11737,2022,Nick Castellanos,PHI,30,123,481,513,129,91,...,0.066,110.1,128,0.353,363,0.107,0.278,0.253,0.402,0.307
60,2434,2022,Nelson Cruz,WSN,41,111,403,454,95,69,...,0.101,113.8,138,0.463,298,0.127,0.279,0.241,0.407,0.322
68,12179,2022,Maikel Franco,WSN,29,103,371,388,85,61,...,0.047,112.1,103,0.344,299,0.161,0.298,0.243,0.351,0.272


In [34]:
# Add new columns for MVP and XBH+, then set the columns to match the trained set

current_year_NL['MVP'] = 0
current_year_NL['XBH+'] =  (((current_year_NL['R'] + current_year_NL['2B'] + current_year_NL['3B'] + current_year_NL['HR']) - (current_year_NL['SO']))/current_year_NL['PA'])

current_year_NL = current_year_NL[[ 'G',
 'AB',
 'PA',
 'H',
 '1B',
 '2B',
 '3B',
 'HR',
 'R',
 'RBI',
 'BB',
 'IBB',
 'SO',
 'HBP',
 'SF',
 'SH',
 'SB',
 'AVG',
 'OBP',
 'SLG', 
 'OPS',
 'ISO',
 'BABIP',
 'wOBA',
 'wRAA',
 'wRC',
 'WAR',
 'Spd',
 'wRC+',
 'WPA',
 '-WPA',
 '+WPA',
 'RE24',
 'REW',
 'pLI',
 'WPA/LI',
 'Clutch',
 'BsR',
 'wSB',
 'Off',
 'AVG+',
 'OBP+',
 'SLG+',
 'ISO+',
 'BABIP+',
 'XBH+',
 'MVP']]

current_year_NL

Unnamed: 0,G,AB,PA,H,1B,2B,3B,HR,R,RBI,...,BsR,wSB,Off,AVG+,OBP+,SLG+,ISO+,BABIP+,XBH+,MVP
0,117,442,511,148,83,34,0,31,87,100,...,2.4,0.4,56.8,137,132,156,185,130,0.090020,0
2,116,440,491,132,72,33,1,26,61,82,...,-1.5,-1.1,31.6,123,116,139,165,102,0.120163,0
1,123,479,548,156,97,41,2,16,87,79,...,5.5,0.6,41.8,133,127,130,125,124,0.118613,0
5,115,441,494,131,77,31,1,22,77,76,...,3.0,0.5,30.9,122,117,130,144,115,0.068826,0
21,124,482,546,131,89,17,4,21,80,84,...,3.7,0.8,22.3,111,110,114,117,103,0.034799,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,113,415,456,98,65,18,0,15,37,49,...,-2.6,-0.2,-8.1,97,91,97,98,95,-0.078947,0
54,123,481,513,129,91,26,0,12,52,59,...,-2.7,0.5,-4.7,110,98,99,83,114,-0.058480,0
60,111,403,454,95,69,16,0,10,46,61,...,-2.9,0.3,-9.4,96,100,87,73,102,-0.081498,0
68,103,371,388,85,61,15,0,9,31,39,...,-3.7,-0.1,-20.3,94,81,86,73,90,-0.051546,0


In [35]:
# Create X values with current 2022 data to predict 2022 NL MVP

realX = current_year_NL.drop(['MVP'],1).values

In [36]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(realX)

# Scale the data

X_test_scaled = X_scaler.transform(realX)

pred_mvp_NL = ees_model.predict(X_test_scaled)

pred_mvp_NL

array([1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0], dtype=int64)