In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.metrics import plot_confusion_matrix, recall_score,\
    accuracy_score, precision_score, f1_score
%matplotlib inline
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline
from sklearn.dummy import DummyClassifier
import xgboost
import pickle
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

### Loading in Data

In [2]:
allstar_df = pd.read_csv("../data/AllstarFull.csv")

In [3]:
apperances_df = pd.read_csv("../data/Appearances.csv")

In [4]:
awards1_df = pd.read_csv("../data/AwardsPlayers.csv")

In [5]:
awards2_df = pd.read_csv("../data/AwardsSharePlayers.csv")

In [6]:
batting_df = pd.read_csv("../data/Batting.csv")

In [9]:
people_df = pd.read_csv("../data/People.csv")

In [12]:
salaries_df = pd.read_csv("../data/Salaries.csv")

In [13]:
playoff_results_df = pd.read_csv("../data/SeriesPost.csv")

In [14]:
teams_df = pd.read_csv("../data/Teams.csv")

### Data Exploration

In [15]:
batting_df.tail(10)

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
110485,zavalse01,2021,1,CHA,AL,37,93,15,17,3,...,15.0,0.0,0.0,6,41.0,0.0,1.0,4.0,0.0,1.0
110486,zerpaan01,2021,1,KCA,AL,1,0,0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
110487,zeuchtj01,2021,1,TOR,AL,5,0,0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
110488,zimmebr01,2021,1,CLE,AL,99,299,44,68,9,...,35.0,15.0,3.0,30,122.0,0.0,15.0,0.0,4.0,3.0
110489,zimmebr02,2021,1,BAL,AL,14,4,0,0,0,...,0.0,0.0,0.0,0,3.0,0.0,0.0,0.0,0.0,0.0
110490,zimmejo02,2021,1,MIL,NL,2,1,0,0,0,...,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0
110491,zimmeky01,2021,1,KCA,AL,52,0,0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
110492,zimmery01,2021,1,WAS,NL,110,255,27,62,16,...,46.0,0.0,0.0,16,77.0,0.0,0.0,0.0,2.0,9.0
110493,zuberty01,2021,1,KCA,AL,31,1,0,0,0,...,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0
110494,zuninmi01,2021,1,TBA,AL,109,333,64,72,11,...,62.0,0.0,0.0,34,132.0,0.0,7.0,0.0,1.0,7.0


In [18]:
batting_df.columns

Index(['playerID', 'yearID', 'stint', 'teamID', 'lgID', 'G', 'AB', 'R', 'H',
       '2B', '3B', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', 'IBB', 'HBP', 'SH',
       'SF', 'GIDP'],
      dtype='object')

Need to add columns for Batting Average (AVG), On Base Percentage (OBP), Slugging Percentage (SLG), On Base + Slugging (OPS), Strikeout percentage, and walk percentage

Can drop AB, H, 2B, 3B, BB, SO, after creating the new columns.

Also want to drop HBP, SH, SF, GIDP

In [56]:
#Creating a Plate Appearances column
batting_df['PA'] = (batting_df['AB'] + batting_df['HBP'] + batting_df['BB'] + batting_df['SF'] + batting_df['SH'] + batting_df['IBB'])

In [47]:
#Creating a Batting Average column
batting_df['AVG'] = batting_df['H'] / batting_df['AB']

In [49]:
#Creating an On Base Percentage column
batting_df['OBP'] = (batting_df['H'] + batting_df['BB'] + batting_df ['HBP']) / batting_df['PA']

In [50]:
#Creating a Slugging Percentage column
batting_df['SLG'] = ((batting_df['H'] - batting_df['2B'] - batting_df['3B'] - batting_df['HR']) + (2 * batting_df['2B']) + (3 * batting_df['3B'] + (4 * batting_df['HR']))) / batting_df['AB']

In [52]:
#Creating an On Base Plus Slugging Percentage column
batting_df['OPS'] = batting_df['OBP'] + batting_df['SLG']

In [58]:
#Creating a Strikeout Percentage column
batting_df['K%'] = batting_df['SO'] / batting_df['PA']

In [59]:
#Creating a Walk Percentage column
batting_df['BB%'] = batting_df['BB'] + batting_df['IBB'] / batting_df['PA']

In [63]:
#Creating a Stolen Base Percentage column
batting_df['SB%'] = batting_df['SB'] / (batting_df['SB'] + batting_df['CS'] )

In [64]:
batting_df.tail(20)

Unnamed: 0,playerID,yearID,teamID,lgID,G,R,HR,RBI,SB,CS,IBB,AVG,OBP,SLG,OPS,K%,BB%,SB%
110475,yanghy01,2021,TEX,AL,12,0,0,0.0,0.0,0.0,0.0,,,,,,,
110476,yarbrry01,2021,TBA,AL,30,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,
110477,yardler01,2021,MIL,NL,17,0,0,0.0,0.0,0.0,0.0,,,,,,,
110478,yastrmi01,2021,SFN,NL,139,75,25,71.0,4.0,0.0,4.0,0.224359,0.310734,0.457265,0.767999,0.244403,51.007463,1.0
110479,yelicch01,2021,MIL,NL,117,70,9,51.0,9.0,3.0,5.0,0.24812,0.362105,0.373434,0.735539,0.235417,70.010417,0.75
110480,ynoahu01,2021,ATL,NL,18,3,2,6.0,0.0,0.0,0.0,0.21875,0.21875,0.4375,0.65625,0.46875,0.0,
110481,youngal01,2021,ARI,NL,30,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,
110482,youngal01,2021,CLE,AL,10,0,0,0.0,0.0,0.0,0.0,,,,,,,
110483,youngan02,2021,ARI,NL,58,13,6,15.0,0.0,0.0,0.0,0.208791,0.298077,0.483516,0.781593,0.432692,6.0,
110484,zamorda01,2021,SEA,AL,4,0,0,0.0,0.0,0.0,0.0,,,,,,,


In [20]:
batting_df['stint'].value_counts(normalize=True)

1    0.924911
2    0.070872
3    0.003937
4    0.000244
5    0.000036
Name: stint, dtype: float64

I can drop stint, I don't know what this column means (maybe how many times they were sent down and came back up? However with 92.5% of the data being 1's I feel safe dropping this column.  

In [61]:
batting_df.drop(['stint', 'PA','AB','H', '2B', '3B', 'BB', 'SO', 'HBP', 'SH', 'SF', 'GIDP'], axis=1, inplace=True) 

In [65]:
batting_df.drop(['IBB', 'CS'], axis=1, inplace=True) 

In [66]:
batting_df.tail(25)

Unnamed: 0,playerID,yearID,teamID,lgID,G,R,HR,RBI,SB,AVG,OBP,SLG,OPS,K%,BB%,SB%
110470,wrighky01,2021,ATL,NL,2,1,0,0.0,0.0,0.333333,0.333333,0.666667,1.0,0.333333,0.0,
110471,wrighmi01,2021,CHA,AL,13,0,0,0.0,0.0,,,,,,,
110472,wynnsau01,2021,BAL,AL,45,14,4,14.0,1.0,0.184615,0.231884,0.307692,0.539576,0.223022,8.0,1.0
110473,yajurmi01,2021,PIT,NL,4,0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,
110474,yamamjo01,2021,NYN,NL,2,0,0,0.0,0.0,0.5,0.5,0.5,1.0,0.5,0.0,
110475,yanghy01,2021,TEX,AL,12,0,0,0.0,0.0,,,,,,,
110476,yarbrry01,2021,TBA,AL,30,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,
110477,yardler01,2021,MIL,NL,17,0,0,0.0,0.0,,,,,,,
110478,yastrmi01,2021,SFN,NL,139,75,25,71.0,4.0,0.224359,0.310734,0.457265,0.767999,0.244403,51.007463,1.0
110479,yelicch01,2021,MIL,NL,117,70,9,51.0,9.0,0.24812,0.362105,0.373434,0.735539,0.235417,70.010417,0.75


In [22]:
people_df.tail(10)

Unnamed: 0,playerID,birthYear,birthMonth,birthDay,birthCountry,birthState,birthCity,deathYear,deathMonth,deathDay,...,nameLast,nameGiven,weight,height,bats,throws,debut,finalGame,retroID,bbrefID
20360,zuberty01,1995.0,6.0,16.0,USA,AR,White Hall,,,,...,Zuber,John Tyler,175.0,71.0,R,R,2020-07-24,2021-09-30,zubet001,zuberty01
20361,zuletju01,1975.0,3.0,28.0,Panama,Panama,Panama,,,,...,Zuleta,Julio Ernesto,230.0,78.0,R,R,2000-04-06,2001-06-25,zulej001,zuletju01
20362,zumayjo01,1984.0,11.0,9.0,USA,CA,Chula Vista,,,,...,Zumaya,Joel Martin,215.0,75.0,R,R,2006-04-03,2010-06-28,zumaj001,zumayjo01
20363,zuninmi01,1991.0,3.0,25.0,USA,FL,Cape Coral,,,,...,Zunino,Michael Accorsi,235.0,74.0,R,R,2013-06-12,2021-10-03,zunim001,zuninmi01
20364,zupcibo01,1966.0,8.0,18.0,USA,PA,Pittsburgh,,,,...,Zupcic,Robert,220.0,76.0,R,R,1991-09-07,1994-08-04,zupcb001,zupcibo01
20365,zupofr01,1939.0,8.0,29.0,USA,CA,San Francisco,2005.0,3.0,25.0,...,Zupo,Frank Joseph,182.0,71.0,L,R,1957-07-01,1961-05-09,zupof101,zupofr01
20366,zuvelpa01,1958.0,10.0,31.0,USA,CA,San Mateo,,,,...,Zuvella,Paul,173.0,72.0,R,R,1982-09-04,1991-05-02,zuvep001,zuvelpa01
20367,zuverge01,1924.0,8.0,20.0,USA,MI,Holland,2014.0,9.0,8.0,...,Zuverink,George,195.0,76.0,R,R,1951-04-21,1959-06-15,zuveg101,zuverge01
20368,zwilldu01,1888.0,11.0,2.0,USA,MO,St. Louis,1978.0,3.0,27.0,...,Zwilling,Edward Harrison,160.0,66.0,L,L,1910-08-14,1916-07-12,zwild101,zwilldu01
20369,zychto01,1990.0,8.0,7.0,USA,IL,Monee,,,,...,Zych,Anthony Aaron,190.0,75.0,R,R,2015-09-04,2017-08-19,zycht001,zychto01


In [21]:
people_df.columns

Index(['playerID', 'birthYear', 'birthMonth', 'birthDay', 'birthCountry',
       'birthState', 'birthCity', 'deathYear', 'deathMonth', 'deathDay',
       'deathCountry', 'deathState', 'deathCity', 'nameFirst', 'nameLast',
       'nameGiven', 'weight', 'height', 'bats', 'throws', 'debut', 'finalGame',
       'retroID', 'bbrefID'],
      dtype='object')

Can drop birth country, birth state, birth city, death year, death month, death day, death country, death state, death city, final game.

Need to do more research to see which id is used, I see playedID, retroID, and bbredID.  At first glance it seems like playerID and bbrefID are the same. Not sure if retroID is used for anything.

In [69]:
people_df.drop(['birthCountry','birthState', 'birthCity', 'deathYear', 'deathMonth', 'deathDay','deathCountry', 'deathState', 'deathCity', 'finalGame'], axis=1, inplace=True) 

In [89]:
people_df.tail(10)

Unnamed: 0,playerID,birthYear,birthMonth,birthDay,nameFirst,nameLast,nameGiven,weight,height,bats,throws,debut,retroID,bbrefID
20360,zuberty01,1995.0,6.0,16.0,Tyler,Zuber,John Tyler,175.0,71.0,R,R,2020-07-24,zubet001,zuberty01
20361,zuletju01,1975.0,3.0,28.0,Julio,Zuleta,Julio Ernesto,230.0,78.0,R,R,2000-04-06,zulej001,zuletju01
20362,zumayjo01,1984.0,11.0,9.0,Joel,Zumaya,Joel Martin,215.0,75.0,R,R,2006-04-03,zumaj001,zumayjo01
20363,zuninmi01,1991.0,3.0,25.0,Mike,Zunino,Michael Accorsi,235.0,74.0,R,R,2013-06-12,zunim001,zuninmi01
20364,zupcibo01,1966.0,8.0,18.0,Bob,Zupcic,Robert,220.0,76.0,R,R,1991-09-07,zupcb001,zupcibo01
20365,zupofr01,1939.0,8.0,29.0,Frank,Zupo,Frank Joseph,182.0,71.0,L,R,1957-07-01,zupof101,zupofr01
20366,zuvelpa01,1958.0,10.0,31.0,Paul,Zuvella,Paul,173.0,72.0,R,R,1982-09-04,zuvep001,zuvelpa01
20367,zuverge01,1924.0,8.0,20.0,George,Zuverink,George,195.0,76.0,R,R,1951-04-21,zuveg101,zuverge01
20368,zwilldu01,1888.0,11.0,2.0,Dutch,Zwilling,Edward Harrison,160.0,66.0,L,L,1910-08-14,zwild101,zwilldu01
20369,zychto01,1990.0,8.0,7.0,Tony,Zych,Anthony Aaron,190.0,75.0,R,R,2015-09-04,zycht001,zychto01


I am dropping birthMonth and birthDay because I am going to pull the year from debut and subtract it from birthYear to have debut age, and then will drop debut and birthYear.

Also dropping nameGiven because it is redundant.

In [90]:
people_df.drop(['birthMonth' , 'birthDay' , 'nameGiven'], axis=1, inplace=True)

In [91]:
people_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20370 entries, 0 to 20369
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   playerID   20370 non-null  object 
 1   birthYear  20259 non-null  float64
 2   nameFirst  20333 non-null  object 
 3   nameLast   20370 non-null  object 
 4   weight     19554 non-null  float64
 5   height     19634 non-null  float64
 6   bats       19189 non-null  object 
 7   throws     19393 non-null  object 
 8   debut      20160 non-null  object 
 9   retroID    20321 non-null  object 
 10  bbrefID    20357 non-null  object 
dtypes: float64(3), object(8)
memory usage: 1.7+ MB


In [93]:
people_df['DebutAge'] = pd.to_datetime(people_df['debut']).dt.year - people_df['birthYear']

In [94]:
people_df.tail(20)

Unnamed: 0,playerID,birthYear,nameFirst,nameLast,weight,height,bats,throws,debut,retroID,bbrefID,DebutAge
20350,ziskri01,1949.0,Richie,Zisk,200.0,73.0,R,R,1971-09-08,ziskr101,ziskri01,22.0
20351,zitoba01,1978.0,Barry,Zito,205.0,74.0,L,L,2000-07-22,zitob001,zitoba01,22.0
20352,zitzmbi01,1895.0,Billy,Zitzmann,175.0,70.0,R,R,1919-04-27,zitzb101,zitzmbi01,24.0
20353,zmiched01,1884.0,Ed,Zmich,180.0,72.0,L,L,1910-07-23,zmice101,zmiched01,26.0
20354,zobribe01,1981.0,Ben,Zobrist,210.0,75.0,B,R,2006-08-01,zobrb001,zobribe01,25.0
20355,zoccope01,1977.0,Peter,Zoccolillo,200.0,74.0,L,R,2003-09-05,zoccp001,zoccope01,26.0
20356,zoldasa01,1918.0,Sam,Zoldak,185.0,71.0,L,L,1944-05-13,zolds101,zoldasa01,26.0
20357,zoskyed01,1968.0,Eddie,Zosky,175.0,72.0,R,R,1991-09-02,zoske001,zoskyed01,23.0
20358,zuberbi01,1913.0,Bill,Zuber,195.0,74.0,R,R,1936-09-16,zubeb101,zuberbi01,23.0
20359,zuberjo01,1969.0,Jon,Zuber,190.0,73.0,L,L,1996-04-19,zubej001,zuberjo01,27.0


In [95]:
people_df.drop(['debut' , 'birthYear'], axis=1, inplace=True)

In [96]:
people_df.tail(10) 

Unnamed: 0,playerID,nameFirst,nameLast,weight,height,bats,throws,retroID,bbrefID,DebutAge
20360,zuberty01,Tyler,Zuber,175.0,71.0,R,R,zubet001,zuberty01,25.0
20361,zuletju01,Julio,Zuleta,230.0,78.0,R,R,zulej001,zuletju01,25.0
20362,zumayjo01,Joel,Zumaya,215.0,75.0,R,R,zumaj001,zumayjo01,22.0
20363,zuninmi01,Mike,Zunino,235.0,74.0,R,R,zunim001,zuninmi01,22.0
20364,zupcibo01,Bob,Zupcic,220.0,76.0,R,R,zupcb001,zupcibo01,25.0
20365,zupofr01,Frank,Zupo,182.0,71.0,L,R,zupof101,zupofr01,18.0
20366,zuvelpa01,Paul,Zuvella,173.0,72.0,R,R,zuvep001,zuvelpa01,24.0
20367,zuverge01,George,Zuverink,195.0,76.0,R,R,zuveg101,zuverge01,27.0
20368,zwilldu01,Dutch,Zwilling,160.0,66.0,L,L,zwild101,zwilldu01,22.0
20369,zychto01,Tony,Zych,190.0,75.0,R,R,zycht001,zychto01,25.0


In [45]:
(people_df['playerID'] == people_df['bbrefID']).sum()

20123

In [43]:
20123/20370

0.9878743249877271

98.8% of our playerID coulumn is a match to our bbrefID column.  Need to keep digging in to find out how to best merge everything.

In [46]:
(people_df['playerID'] == people_df['retroID']).sum()

0

playerID and retroID have no rows in which they are equal.  Can probably drop retroID but I want to dig in more to make sure it doesn't connect to one of the tables.

In [97]:
people_df.drop(['bbrefID' , 'retroID'], axis=1, inplace=True)

In [98]:
allstar_df.tail(10)

Unnamed: 0,playerID,yearID,startingPos
5444,reyesal02,2021,
5445,rogertr01,2021,
5446,schwaky01,2021,
5447,sotoju01,2021,
5448,tayloch03,2021,
5449,turneju01,2021,
5450,turnetr01,2021,
5451,walketa01,2021,
5452,wheelza01,2021,
5453,woodrbr01,2021,


In [22]:
allstar_df.columns

Index(['playerID', 'yearID', 'gameNum', 'gameID', 'teamID', 'lgID', 'GP',
       'startingPos'],
      dtype='object')

In [23]:
allstar_df['gameNum'].value_counts(normalize=True)

0    0.937110
2    0.033187
1    0.029703
Name: gameNum, dtype: float64

Can drop gameNum column because since I am only going to use data from 1990-present.  There hasn't been more than 1 All Star Game since 1962.

In [71]:
#Dropping the gameNum column
allstar_df.drop(['gameNum'], axis=1, inplace=True) 

In [24]:
allstar_df['startingPos'].value_counts(normalize=True)

8.0     0.108708
7.0     0.108708
6.0     0.108708
5.0     0.108708
4.0     0.108708
3.0     0.108708
2.0     0.108708
9.0     0.108124
1.0     0.106371
10.0    0.024547
Name: startingPos, dtype: float64

In [72]:
allstar_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5454 entries, 0 to 5453
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   playerID     5454 non-null   object 
 1   yearID       5454 non-null   int64  
 2   gameID       5405 non-null   object 
 3   teamID       5454 non-null   object 
 4   lgID         5454 non-null   object 
 5   GP           5454 non-null   int64  
 6   startingPos  1711 non-null   float64
dtypes: float64(1), int64(2), object(4)
memory usage: 298.4+ KB


startingPos will be a good way to see if a player started in an all star game or if they came off the bench (the NaN values).

In [26]:
allstar_df['GP'].value_counts(normalize=True)

1    0.779611
0    0.220389
Name: GP, dtype: float64

Whether or not someone actually played in the All Star Game isn't important, what's important is that they were on voted to make the team.  I feel safe dropping GP.

In [73]:
#Dropping the Game Played column
allstar_df.drop(['GP'], axis=1, inplace=True) 

In [85]:
allstar_df.tail()

Unnamed: 0,playerID,yearID,gameID,teamID,lgID,startingPos
5449,turneju01,2021,NLS202107130,LAN,NL,
5450,turnetr01,2021,NLS202107130,WAS,NL,
5451,walketa01,2021,NLS202107130,NYN,NL,
5452,wheelza01,2021,NLS202107130,PHI,NL,
5453,woodrbr01,2021,NLS202107130,MIL,NL,


I am dropping gameID, teamID, and lgID because this data will be found in other dataframes this will be merged on.

In [86]:
allstar_df.drop(['gameID','teamID','lgID'], axis=1, inplace=True)

In [88]:
allstar_df.tail(50)

Unnamed: 0,playerID,yearID,startingPos
5404,pressry01,2021,
5405,ramirjo01,2021,
5406,rodonca01,2021,
5407,rogerta01,2021,
5408,sotogr01,2021,
5409,troutmi01,2021,
5410,walshja01,2021,
5411,wendljo01,2021,
5412,zuninmi01,2021,
5413,scherma01,2021,1.0


In [25]:
apperances_df.tail(10)

Unnamed: 0,yearID,teamID,lgID,playerID,G_all,GS,G_batting,G_defense,G_p,G_c,...,G_2b,G_3b,G_ss,G_lf,G_cf,G_rf,G_of,G_dh,G_ph,G_pr
110413,2021,CHA,AL,zavalse01,37,31.0,37,33.0,0,33,...,0,0,0,0,0,0,0,1.0,3.0,1.0
110414,2021,KCA,AL,zerpaan01,1,1.0,0,1.0,1,0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
110415,2021,TOR,AL,zeuchtj01,5,3.0,0,5.0,5,0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
110416,2021,CLE,AL,zimmebr01,99,84.0,99,97.0,0,0,...,0,0,0,3,54,43,97,1.0,5.0,3.0
110417,2021,BAL,AL,zimmebr02,14,13.0,2,14.0,14,0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
110418,2021,MIL,NL,zimmejo02,2,0.0,2,2.0,2,0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
110419,2021,KCA,AL,zimmeky01,52,2.0,3,52.0,52,0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
110420,2021,WAS,NL,zimmery01,110,48.0,110,54.0,0,0,...,0,0,0,0,0,0,0,3.0,58.0,0.0
110421,2021,KCA,AL,zuberty01,31,0.0,1,31.0,31,0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
110422,2021,TBA,AL,zuninmi01,109,97.0,109,105.0,0,105,...,0,0,0,0,0,0,0,0.0,5.0,0.0


In [27]:
apperances_df.columns

Index(['yearID', 'teamID', 'lgID', 'playerID', 'G_all', 'GS', 'G_batting',
       'G_defense', 'G_p', 'G_c', 'G_1b', 'G_2b', 'G_3b', 'G_ss', 'G_lf',
       'G_cf', 'G_rf', 'G_of', 'G_dh', 'G_ph', 'G_pr'],
      dtype='object')

Appearances will be a great way to see what position a player plays, and see if playing a premium position like Shortstop or Centerfield will have an affect on Salary.

In [26]:
awards1_df.tail(10)

Unnamed: 0,playerID,awardID,yearID,lgID,tie,notes
6521,albieoz01,TSN All-Star,2021,NL,,2B
6522,machama01,TSN All-Star,2021,NL,,3B
6523,tatisfe02,TSN All-Star,2021,NL,,SS
6524,sotoju01,TSN All-Star,2021,NL,,OF
6525,harpebr03,TSN All-Star,2021,NL,,OF
6526,reynobr01,TSN All-Star,2021,NL,,OF
6527,scherma01,TSN All-Star,2021,NL,,SP
6528,haderjo01,TSN All-Star,2021,NL,,RP
6529,friedma01,Silver Slugger,2021,NL,,P
6530,ramirjo01,TSN All-Star,2021,AL,,3B


In [28]:
awards1_df.columns

Index(['playerID', 'awardID', 'yearID', 'lgID', 'tie', 'notes'], dtype='object')

In [27]:
awards2_df.tail(10)

Unnamed: 0,awardID,yearID,lgID,playerID,pointsWon,pointsMax,votesFirst
6869,Rookie of the Year,2016,AL,mazarno01,4.0,150,0.0
6870,Rookie of the Year,2016,AL,anderti01,2.0,150,0.0
6871,Rookie of the Year,2016,NL,seageco01,150.0,150,30.0
6872,Rookie of the Year,2016,NL,turnetr01,42.0,150,0.0
6873,Rookie of the Year,2016,NL,maedake01,37.0,150,0.0
6874,Rookie of the Year,2016,NL,storytr01,24.0,150,0.0
6875,Rookie of the Year,2016,NL,diazal02,14.0,150,0.0
6876,Rookie of the Year,2016,NL,grayjo02,1.0,150,0.0
6877,Rookie of the Year,2016,NL,matzst01,1.0,150,0.0
6878,Rookie of the Year,2016,NL,ohse01,1.0,150,0.0


In [29]:
awards2_df.columns

Index(['awardID', 'yearID', 'lgID', 'playerID', 'pointsWon', 'pointsMax',
       'votesFirst'],
      dtype='object')

In [30]:
people_df.tail(10)

Unnamed: 0,playerID,birthYear,birthMonth,birthDay,birthCountry,birthState,birthCity,deathYear,deathMonth,deathDay,...,nameLast,nameGiven,weight,height,bats,throws,debut,finalGame,retroID,bbrefID
20360,zuberty01,1995.0,6.0,16.0,USA,AR,White Hall,,,,...,Zuber,John Tyler,175.0,71.0,R,R,2020-07-24,2021-09-30,zubet001,zuberty01
20361,zuletju01,1975.0,3.0,28.0,Panama,Panama,Panama,,,,...,Zuleta,Julio Ernesto,230.0,78.0,R,R,2000-04-06,2001-06-25,zulej001,zuletju01
20362,zumayjo01,1984.0,11.0,9.0,USA,CA,Chula Vista,,,,...,Zumaya,Joel Martin,215.0,75.0,R,R,2006-04-03,2010-06-28,zumaj001,zumayjo01
20363,zuninmi01,1991.0,3.0,25.0,USA,FL,Cape Coral,,,,...,Zunino,Michael Accorsi,235.0,74.0,R,R,2013-06-12,2021-10-03,zunim001,zuninmi01
20364,zupcibo01,1966.0,8.0,18.0,USA,PA,Pittsburgh,,,,...,Zupcic,Robert,220.0,76.0,R,R,1991-09-07,1994-08-04,zupcb001,zupcibo01
20365,zupofr01,1939.0,8.0,29.0,USA,CA,San Francisco,2005.0,3.0,25.0,...,Zupo,Frank Joseph,182.0,71.0,L,R,1957-07-01,1961-05-09,zupof101,zupofr01
20366,zuvelpa01,1958.0,10.0,31.0,USA,CA,San Mateo,,,,...,Zuvella,Paul,173.0,72.0,R,R,1982-09-04,1991-05-02,zuvep001,zuvelpa01
20367,zuverge01,1924.0,8.0,20.0,USA,MI,Holland,2014.0,9.0,8.0,...,Zuverink,George,195.0,76.0,R,R,1951-04-21,1959-06-15,zuveg101,zuverge01
20368,zwilldu01,1888.0,11.0,2.0,USA,MO,St. Louis,1978.0,3.0,27.0,...,Zwilling,Edward Harrison,160.0,66.0,L,L,1910-08-14,1916-07-12,zwild101,zwilldu01
20369,zychto01,1990.0,8.0,7.0,USA,IL,Monee,,,,...,Zych,Anthony Aaron,190.0,75.0,R,R,2015-09-04,2017-08-19,zycht001,zychto01


In [33]:
people_df.columns

Index(['playerID', 'birthYear', 'birthMonth', 'birthDay', 'birthCountry',
       'birthState', 'birthCity', 'deathYear', 'deathMonth', 'deathDay',
       'deathCountry', 'deathState', 'deathCity', 'nameFirst', 'nameLast',
       'nameGiven', 'weight', 'height', 'bats', 'throws', 'debut', 'finalGame',
       'retroID', 'bbrefID'],
      dtype='object')

I am going to drop 'birthCountry', 'birthState', 'birthCity', 'deathYear', 'deathMonth', 'deathDay', 'deathCountry', 'deathState', 'deathCity', 'debut', 'finalGame'

In [33]:
salaries_df.tail(10)

Unnamed: 0,yearID,teamID,lgID,playerID,salary
26418,2016,WAS,NL,riverfe01,516100
26419,2016,WAS,NL,roarkta01,543400
26420,2016,WAS,NL,robincl01,534900
26421,2016,WAS,NL,rossjo01,514400
26422,2016,WAS,NL,scherma01,22142857
26423,2016,WAS,NL,strasst01,10400000
26424,2016,WAS,NL,taylomi02,524000
26425,2016,WAS,NL,treinbl01,524900
26426,2016,WAS,NL,werthja01,21733615
26427,2016,WAS,NL,zimmery01,14000000


In [37]:
salaries_df.shape

(26428, 5)

In [34]:
playoff_results_df.tail(10)

Unnamed: 0,yearID,round,teamIDwinner,lgIDwinner,teamIDloser,lgIDloser,wins,losses,ties
357,2020,WS,LAN,NL,TBA,AL,4,2,0
358,2021,ALWC,BOS,AL,NYA,AL,1,0,0
359,2021,ALDS1,HOU,AL,CHA,AL,3,1,0
360,2021,ALDS2,BOS,AL,TBA,AL,3,1,0
361,2021,ALCS,HOU,AL,BOS,AL,4,2,0
362,2021,NLWC,LAN,NL,SLN,NL,1,0,0
363,2021,NLDS1,ATL,NL,MIL,NL,3,1,0
364,2021,NLDS2,LAN,NL,SFN,NL,3,2,0
365,2021,NLCS,ATL,NL,LAN,NL,4,2,0
366,2021,WS,ATL,NL,HOU,AL,4,2,0


In [35]:
teams_df.tail(10)

Unnamed: 0,yearID,lgID,teamID,franchID,divID,Rank,G,Ghome,W,L,...,DP,FP,name,park,attendance,BPF,PPF,teamIDBR,teamIDlahman45,teamIDretro
2975,2021,NL,PHI,PHI,E,2,162,81.0,82,80,...,142,0.984,Philadelphia Phillies,Citizens Bank Park,1515890.0,99,99,PHI,PHI,PHI
2976,2021,NL,PIT,PIT,C,5,162,81.0,61,101,...,139,0.988,Pittsburgh Pirates,PNC Park,859498.0,98,100,PIT,PIT,PIT
2977,2021,NL,SDN,SDP,W,3,162,81.0,79,83,...,139,0.986,San Diego Padres,Petco Park,2191950.0,92,92,SDP,SDN,SDN
2978,2021,AL,SEA,SEA,W,2,162,81.0,90,72,...,135,0.986,Seattle Mariners,T-Mobile Park,1215985.0,95,96,SEA,SEA,SEA
2979,2021,NL,SFN,SFG,W,1,162,81.0,107,55,...,122,0.986,San Francisco Giants,Oracle Park,1679484.0,98,97,SFG,SFN,SFN
2980,2021,NL,SLN,STL,C,2,162,81.0,90,72,...,137,0.986,St. Louis Cardinals,Busch Stadium III,2102530.0,92,92,STL,SLN,SLN
2981,2021,AL,TBA,TBD,E,1,162,81.0,100,62,...,130,0.986,Tampa Bay Rays,Tropicana Field,761072.0,92,91,TBR,TBA,TBA
2982,2021,AL,TEX,TEX,W,5,162,81.0,60,102,...,146,0.986,Texas Rangers,Globe Life Field,2110258.0,99,101,TEX,TEX,TEX
2983,2021,AL,TOR,TOR,E,4,162,80.0,91,71,...,122,0.984,Toronto Blue Jays,Sahlen Field,805901.0,102,101,TOR,TOR,TOR
2984,2021,NL,WAS,WSN,E,5,162,81.0,65,97,...,116,0.983,Washington Nationals,Nationals Park,1465543.0,95,96,WSN,MON,WAS


In [35]:
teams_df.columns

Index(['yearID', 'lgID', 'teamID', 'franchID', 'divID', 'Rank', 'G', 'Ghome',
       'W', 'L', 'DivWin', 'WCWin', 'LgWin', 'WSWin', 'R', 'AB', 'H', '2B',
       '3B', 'HR', 'BB', 'SO', 'SB', 'CS', 'HBP', 'SF', 'RA', 'ER', 'ERA',
       'CG', 'SHO', 'SV', 'IPouts', 'HA', 'HRA', 'BBA', 'SOA', 'E', 'DP', 'FP',
       'name', 'park', 'attendance', 'BPF', 'PPF', 'teamIDBR',
       'teamIDlahman45', 'teamIDretro'],
      dtype='object')

Can drop 'G', 'Ghome', 'R', 'AB', 'H', '2B','3B', 'HR', 'BB', 'SO', 'SB', 'CS', 'HBP', 'SF', 'RA', 'ER', 'ERA', 'CG', 'SHO', 'SV', 'IPouts', 'HA', 'HRA', 'BBA', 'SOA', 'E', 'DP', 'FP', 'name', 'park', 'attendance', 'BPF', 'PPF', 'teamIDBR', 'teamIDlahman45', 'teamIDretro'

I have to decide which of playoff_results_df and teams_df will be better suited to pair to each player to designate whether or not that player was on a winning team.