In [1]:
from Cleaner import *
from config import db_password

from sqlalchemy import create_engine
from pathlib import Path
import numpy as np
import pandas as pd
import psycopg2

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


In [2]:
awards = pd.read_csv('Resources/AwardsPlayers.csv')
all_awards = reverseLookup(awards)

Gathering player lookup table. This may take a moment.


In [3]:
data_NL, data_AL = Merger(all_awards)
data_NL

............Scraping Data.............

1982 Time: 0.9484102725982666 Seconds
1983 Time: 1.0020620822906494 Seconds
1984 Time: 3.9717698097229004 Seconds
1985 Time: 0.9840247631072998 Seconds
1986 Time: 1.1053252220153809 Seconds

.....Approximately 43-54 Seconds.....

Total Time: 49.46082806587219 Seconds


Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,CSW%,xBA,xSLG,xwOBA,key_fangraphs,playerID,awardID,yearID,lgID,MVP
6,1002015,1982,Gary Carter,MON,28,154,557,653,163,101,...,,,,,1002015.0,cartega01,Gold Glove,1982.0,NL,0
7,1002015,1982,Gary Carter,MON,28,154,557,653,163,101,...,,,,,1002015.0,cartega01,Silver Slugger,1982.0,NL,0
8,1002015,1982,Gary Carter,MON,28,154,557,653,163,101,...,,,,,1002015.0,cartega01,TSN All-Star,1982.0,NL,0
9,1003091,1982,Andre Dawson,MON,27,148,607,659,183,116,...,,,,,1003091.0,dawsoan01,Gold Glove,1982.0,NL,0
12,1011586,1982,Mike Schmidt,PHI,32,148,514,631,144,80,...,,,,,1011586.0,schmimi01,Gold Glove,1982.0,NL,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,7859,2021,Charlie Blackmon,COL,34,150,514,582,139,97,...,0.250,0.291,0.450,0.359,,,No Award,,NL,0
141,3516,2021,Eric Hosmer,SDP,31,151,509,565,137,97,...,0.263,0.268,0.406,0.326,,,No Award,,NL,0
142,19892,2021,Pavin Smith,ARI,25,145,498,545,133,91,...,0.282,0.258,0.402,0.317,,,No Award,,NL,0
143,12552,2021,Eugenio Suarez,CIN,29,145,505,574,100,46,...,0.295,0.215,0.455,0.326,,,No Award,,NL,0


In [4]:
# Determined that some stats were fielding, pitching, or otherwise irrelevant to our task at hand.

data_NL = data_NL[[ 'G',
 'AB',
 'PA',
 'H',
 '1B',
 '2B',
 '3B',
 'HR',
 'R',
 'RBI',
 'BB',
 'IBB',
 'SO',
 'HBP',
 'SF',
 'SH',
 'SB',
 'AVG',
 'OBP',
 'SLG', 
 'OPS',
 'ISO',
 'BABIP',
 'wOBA',
 'wRAA',
 'wRC',
 'WAR',
 'Spd',
 'wRC+',
 'WPA',
 '-WPA',
 '+WPA',
 'RE24',
 'REW',
 'pLI',
 'WPA/LI',
 'Clutch',
 'BsR',
 'wSB',
 'Off',
 'AVG+',
 'OBP+',
 'SLG+',
 'ISO+',
 'BABIP+',
 'MVP']]

data_NL

Unnamed: 0,G,AB,PA,H,1B,2B,3B,HR,R,RBI,...,Clutch,BsR,wSB,Off,AVG+,OBP+,SLG+,ISO+,BABIP+,MVP
6,154,557,653,163,101,32,1,29,91,97,...,-1.05,-1.6,-1.6,34.9,110,116,132,181,98,0
7,154,557,653,163,101,32,1,29,91,97,...,-1.05,-1.6,-1.6,34.9,110,116,132,181,98,0
8,154,557,653,163,101,32,1,29,91,97,...,-1.05,-1.6,-1.6,34.9,110,116,132,181,98,0
9,148,607,659,183,116,37,7,23,107,83,...,-0.85,3.9,3.9,30.8,114,105,129,164,112,0
12,148,514,631,144,80,26,3,35,108,87,...,-0.47,0.1,0.1,42.9,105,122,142,222,106,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,150,514,582,139,97,25,4,13,76,78,...,0.52,1.4,0.0,-2.8,109,107,98,82,104,0
141,151,509,565,137,97,28,0,12,53,65,...,0.32,-5.5,-1.2,-3.8,108,103,94,73,106,0
142,145,498,545,133,91,27,4,11,68,49,...,-0.13,-2.4,-0.3,-5.2,107,101,96,80,108,0
143,145,505,574,100,46,23,0,31,71,79,...,-1.37,-4.1,-0.8,-15.1,80,88,102,134,76,0


In [5]:
# Remove duplicate rows that were present due to players winning multiple awards in one year (see original, raw dataframe)

data_NL = data_NL.drop_duplicates()
data_NL


Unnamed: 0,G,AB,PA,H,1B,2B,3B,HR,R,RBI,...,Clutch,BsR,wSB,Off,AVG+,OBP+,SLG+,ISO+,BABIP+,MVP
6,154,557,653,163,101,32,1,29,91,97,...,-1.05,-1.6,-1.6,34.9,110,116,132,181,98,0
9,148,607,659,183,116,37,7,23,107,83,...,-0.85,3.9,3.9,30.8,114,105,129,164,112,0
12,148,514,631,144,80,26,3,35,108,87,...,-0.47,0.1,0.1,42.9,105,122,142,222,106,0
17,150,575,652,175,111,27,5,32,87,100,...,-0.98,2.4,2.4,43.2,115,115,139,192,108,0
19,162,598,698,168,107,23,2,36,113,109,...,0.50,0.3,0.3,34.9,106,115,131,188,106,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,150,514,582,139,97,25,4,13,76,78,...,0.52,1.4,0.0,-2.8,109,107,98,82,104,0
141,151,509,565,137,97,28,0,12,53,65,...,0.32,-5.5,-1.2,-3.8,108,103,94,73,106,0
142,145,498,545,133,91,27,4,11,68,49,...,-0.13,-2.4,-0.3,-5.2,107,101,96,80,108,0
143,145,505,574,100,46,23,0,31,71,79,...,-1.37,-4.1,-0.8,-15.1,80,88,102,134,76,0


In [6]:

# Count instances of MVP winners to make sure our data makes sense 

data_NL['MVP'].value_counts()[1]

38

In [7]:
# Repeat the above cleaning on the data_AL data set
# Merge both data_NL and data_AL afterwards 

data_AL

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,CSW%,xBA,xSLG,xwOBA,key_fangraphs,playerID,awardID,yearID,lgID,MVP
0,1014396,1982,Robin Yount,MIL,26,156,635,704,210,123,...,,,,,1014396.0,yountro01,Gold Glove,1982.0,AL,1
1,1014396,1982,Robin Yount,MIL,26,156,635,704,210,123,...,,,,,1014396.0,yountro01,Most Valuable Player,1982.0,AL,1
2,1014396,1982,Robin Yount,MIL,26,156,635,704,210,123,...,,,,,1014396.0,yountro01,Silver Slugger,1982.0,AL,1
3,1014396,1982,Robin Yount,MIL,26,156,635,704,210,123,...,,,,,1014396.0,yountro01,TSN All-Star,1982.0,AL,1
4,1014396,1982,Robin Yount,MIL,26,156,635,704,210,123,...,,,,,1014396.0,yountro01,TSN Major League Player of the Year,1982.0,AL,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,18373,2021,Ryan Mountcastle,BAL,24,144,534,586,136,79,...,0.282,0.245,0.457,0.326,,,No Award,,AL,0
144,10243,2021,Randal Grichuk,TOR,29,149,511,545,123,75,...,0.249,0.234,0.402,0.297,,,No Award,,AL,0
147,2396,2021,Carlos Santana,KCR,35,158,565,659,121,87,...,0.242,0.244,0.421,0.335,,,No Award,,AL,0
148,1744,2021,Miguel Cabrera,DET,38,130,472,526,121,90,...,0.274,0.234,0.420,0.315,,,No Award,,AL,0


In [8]:
# Dropped all columns with NaN values 

data_AL = data_AL.dropna(axis=1)
data_AL

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,BB%+,K%+,OBP+,SLG+,ISO+,BABIP+,Events,awardID,lgID,MVP
0,1014396,1982,Robin Yount,MIL,26,156,635,704,210,123,...,91,71,115,144,179,116,0,Gold Glove,AL,1
1,1014396,1982,Robin Yount,MIL,26,156,635,704,210,123,...,91,71,115,144,179,116,0,Most Valuable Player,AL,1
2,1014396,1982,Robin Yount,MIL,26,156,635,704,210,123,...,91,71,115,144,179,116,0,Silver Slugger,AL,1
3,1014396,1982,Robin Yount,MIL,26,156,635,704,210,123,...,91,71,115,144,179,116,0,TSN All-Star,AL,1
4,1014396,1982,Robin Yount,MIL,26,156,635,704,210,123,...,91,71,115,144,179,116,0,TSN Major League Player of the Year,AL,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,18373,2021,Ryan Mountcastle,BAL,24,144,534,586,136,79,...,83,120,98,117,137,102,380,No Award,AL,0
144,10243,2021,Randal Grichuk,TOR,29,149,511,545,123,75,...,58,91,89,102,107,91,401,No Award,AL,0
147,2396,2021,Carlos Santana,KCR,35,158,565,659,121,87,...,154,68,101,82,75,78,468,No Award,AL,0
148,1744,2021,Miguel Cabrera,DET,38,130,472,526,121,90,...,90,98,100,93,76,104,363,No Award,AL,0


In [9]:
# Determined that some stats were fielding, pitching, or otherwise irrelevant to our task at hand.

data_AL = data_AL[[ 'G',
 'AB',
 'PA',
 'H',
 '1B',
 '2B',
 '3B',
 'HR',
 'R',
 'RBI',
 'BB',
 'IBB',
 'SO',
 'HBP',
 'SF',
 'SH',
 'SB',
 'AVG',
 'OBP',
 'SLG', 
 'OPS',
 'ISO',
 'BABIP',
 'wOBA',
 'wRAA',
 'wRC',
 'WAR',
 'Spd',
 'wRC+',
 'WPA',
 '-WPA',
 '+WPA',
 'RE24',
 'REW',
 'pLI',
 'WPA/LI',
 'Clutch',
 'BsR',
 'wSB',
 'Off',
 'AVG+',
 'OBP+',
 'SLG+',
 'ISO+',
 'BABIP+',
 'MVP']]

data_AL

Unnamed: 0,G,AB,PA,H,1B,2B,3B,HR,R,RBI,...,Clutch,BsR,wSB,Off,AVG+,OBP+,SLG+,ISO+,BABIP+,MVP
0,156,635,704,210,123,46,12,29,129,114,...,-0.63,1.6,1.6,54.7,125,115,144,179,116,1
1,156,635,704,210,123,46,12,29,129,114,...,-0.63,1.6,1.6,54.7,125,115,144,179,116,1
2,156,635,704,210,123,46,12,29,129,114,...,-0.63,1.6,1.6,54.7,125,115,144,179,116,1
3,156,635,704,210,123,46,12,29,129,114,...,-0.63,1.6,1.6,54.7,125,115,144,179,116,1
4,156,635,704,210,123,46,12,29,129,114,...,-0.63,1.6,1.6,54.7,125,115,144,179,116,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,144,534,586,136,79,23,1,33,77,89,...,-1.03,-1.2,-0.9,6.6,104,98,117,137,102,0
144,149,511,545,123,75,25,1,22,59,81,...,-1.14,-4.9,-1.6,-14.7,98,89,102,107,91,0
147,158,565,659,121,87,15,0,19,66,69,...,0.25,-0.8,-0.2,-14.6,87,101,82,75,78,0
148,130,472,526,121,90,16,0,15,48,75,...,0.53,-6.3,-0.5,-11.4,104,100,93,76,104,0


In [10]:
# Remove duplicate rows that were present due to players winning multiple awards in one year (see original, raw dataframe)

data_AL = data_AL.drop_duplicates()
data_AL

Unnamed: 0,G,AB,PA,H,1B,2B,3B,HR,R,RBI,...,Clutch,BsR,wSB,Off,AVG+,OBP+,SLG+,ISO+,BABIP+,MVP
0,156,635,704,210,123,46,12,29,129,114,...,-0.63,1.6,1.6,54.7,125,115,144,179,116,1
10,153,575,655,173,96,42,5,30,94,97,...,-1.68,-0.6,-0.6,35.7,114,112,136,179,107,0
15,162,609,727,178,102,37,7,32,122,98,...,-0.87,-0.2,-0.2,44.2,111,123,133,175,114,0
24,162,602,708,183,125,29,4,25,100,78,...,0.00,2.2,2.2,37.5,115,121,122,135,106,0
25,151,550,627,174,111,30,1,32,87,110,...,1.72,0.6,0.6,40.0,120,119,136,168,114,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,144,534,586,136,79,23,1,33,77,89,...,-1.03,-1.2,-0.9,6.6,104,98,117,137,102,0
144,149,511,545,123,75,25,1,22,59,81,...,-1.14,-4.9,-1.6,-14.7,98,89,102,107,91,0
147,158,565,659,121,87,15,0,19,66,69,...,0.25,-0.8,-0.2,-14.6,87,101,82,75,78,0
148,130,472,526,121,90,16,0,15,48,75,...,0.53,-6.3,-0.5,-11.4,104,100,93,76,104,0


In [11]:
# Count instances of MVP winners to make sure our data makes sense 

data_AL['MVP'].value_counts()[1]

35

In [12]:
# Going back and adding Lindsay's new metric to each data_NL and data_AL
data_NL['XBH+'] = (((data_NL['R'] + data_NL['2B'] + data_NL['3B'] + data_NL['HR']) - (data_NL['SO']))/data_NL['PA'])
data_NL.head()

Unnamed: 0,G,AB,PA,H,1B,2B,3B,HR,R,RBI,...,BsR,wSB,Off,AVG+,OBP+,SLG+,ISO+,BABIP+,MVP,XBH+
6,154,557,653,163,101,32,1,29,91,97,...,-1.6,-1.6,34.9,110,116,132,181,98,0,0.136294
9,148,607,659,183,116,37,7,23,107,83,...,3.9,3.9,30.8,114,105,129,164,112,0,0.119879
12,148,514,631,144,80,26,3,35,108,87,...,0.1,0.1,42.9,105,122,142,222,106,0,0.064976
17,150,575,652,175,111,27,5,32,87,100,...,2.4,2.4,43.2,115,115,139,192,108,0,0.095092
19,162,598,698,168,107,23,2,36,113,109,...,0.3,0.3,34.9,106,115,131,188,106,1,0.057307


In [13]:
data_AL['XBH+'] = (((data_AL['R'] + data_AL['2B'] + data_AL['3B'] + data_AL['HR']) - (data_AL['SO']))/data_AL['PA'])
data_AL.head()

Unnamed: 0,G,AB,PA,H,1B,2B,3B,HR,R,RBI,...,BsR,wSB,Off,AVG+,OBP+,SLG+,ISO+,BABIP+,MVP,XBH+
0,156,635,704,210,123,46,12,29,129,114,...,1.6,1.6,54.7,125,115,144,179,116,1,0.21733
10,153,575,655,173,96,42,5,30,94,97,...,-0.6,-0.6,35.7,114,112,136,179,107,0,0.138931
15,162,609,727,178,102,37,7,32,122,98,...,-0.2,-0.2,44.2,111,123,133,175,114,0,0.100413
24,162,602,708,183,125,29,4,25,100,78,...,2.2,2.2,37.5,115,121,122,135,106,0,0.149718
25,151,550,627,174,111,30,1,32,87,110,...,0.6,0.6,40.0,120,119,136,168,114,0,0.108453


In [14]:
# Merge the final data_NL and data_AL datasets 
cleaned_merged_df = pd.concat([data_NL, data_AL], axis=0)
cleaned_merged_df

Unnamed: 0,G,AB,PA,H,1B,2B,3B,HR,R,RBI,...,BsR,wSB,Off,AVG+,OBP+,SLG+,ISO+,BABIP+,MVP,XBH+
6,154,557,653,163,101,32,1,29,91,97,...,-1.6,-1.6,34.9,110,116,132,181,98,0,0.136294
9,148,607,659,183,116,37,7,23,107,83,...,3.9,3.9,30.8,114,105,129,164,112,0,0.119879
12,148,514,631,144,80,26,3,35,108,87,...,0.1,0.1,42.9,105,122,142,222,106,0,0.064976
17,150,575,652,175,111,27,5,32,87,100,...,2.4,2.4,43.2,115,115,139,192,108,0,0.095092
19,162,598,698,168,107,23,2,36,113,109,...,0.3,0.3,34.9,106,115,131,188,106,1,0.057307
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,144,534,586,136,79,23,1,33,77,89,...,-1.2,-0.9,6.6,104,98,117,137,102,0,-0.046075
144,149,511,545,123,75,25,1,22,59,81,...,-4.9,-1.6,-14.7,98,89,102,107,91,0,-0.012844
147,158,565,659,121,87,15,0,19,66,69,...,-0.8,-0.2,-14.6,87,101,82,75,78,0,-0.003035
148,130,472,526,121,90,16,0,15,48,75,...,-6.3,-0.5,-11.4,104,100,93,76,104,0,-0.074144


In [15]:
# Count instances of MVP winners to make sure our data makes sense 

cleaned_merged_df['MVP'].value_counts()[1]

73

In [17]:
y = cleaned_merged_df['MVP'].values
X = cleaned_merged_df.drop(['MVP'],1).values

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [19]:
classifier = LogisticRegression(solver='lbfgs')
classifier

In [20]:
# Train the data
classifier.fit(X_train, y_train)

In [22]:
y_pred = classifier.predict(X_test)

In [23]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.9821428571428571
