In [81]:
# First we load the packages
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# I. Building the Dataset: Player Stats

In [82]:
# Step (1)
#  Load the “Salaries” file
Salary = pd.read_csv("../Data/Salaries.csv")
Batting = pd.read_csv("../Data/Batting.csv")

In [83]:
# Step (1) and (2)
#  Drop any missing values, create a variable for the natural log (ln) of player salary, and rename the 
#  column “yearID” as “SalYear”.
#  Create a copy of the “Salaries” dataframe called “Master”. 
master = Salary[Salary['salary']>0].rename({'yearID':'salYear'}, axis=1)
master['lnSalary'] = np.log(master['salary'])
master

Unnamed: 0,salYear,teamID,lgID,playerID,salary,lnSalary
0,1985,ATL,NL,barkele01,870000,13.676248
1,1985,ATL,NL,bedrost01,550000,13.217674
2,1985,ATL,NL,benedbr01,545000,13.208541
3,1985,ATL,NL,campri01,633333,13.358752
4,1985,ATL,NL,ceronri01,625000,13.345507
...,...,...,...,...,...,...
26423,2016,WAS,NL,strasst01,10400000,16.157316
26424,2016,WAS,NL,taylomi02,524000,13.169247
26425,2016,WAS,NL,treinbl01,524900,13.170963
26426,2016,WAS,NL,werthja01,21733615,16.894371


In [84]:
# Step (3)
#  Load the batting data and sum data across stints.
bat = (Batting.groupby(['playerID','yearID'])
              .sum()
              .reset_index()
              .rename({'Doubles':'2B','Triples':'3B'}, axis=1))

In [85]:
# Step (4)
#  Subset batting data to only include batting seasons (yearID) 1998-2006 and players with at least 130AB.
bat = bat[(bat['yearID'].astype('int') >= 1998) &
          (bat['yearID'].astype('int') <= 2006) &
          (bat['AB'].astype('int') >= 130)]
bat

Unnamed: 0,playerID,yearID,stint,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
98,abbotje01,1998,1,89,244,33,68,14,1,12,41.0,3.0,3.0,9,28.0,1.0,0.0,2.0,5.0,2.0
100,abbotje01,2000,1,80,215,31,59,15,1,3,29.0,2.0,1.0,21,38.0,1.0,2.0,2.0,1.0,2.0
117,abbotku01,1998,3,77,194,26,51,13,1,5,24.0,2.0,1.0,12,53.0,0.0,2.0,1.0,3.0,5.0
118,abbotku01,1999,1,96,286,41,78,17,2,8,41.0,3.0,2.0,16,69.0,0.0,0.0,2.0,1.0,4.0
119,abbotku01,2000,1,79,157,22,34,7,1,6,12.0,1.0,1.0,14,51.0,2.0,1.0,0.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94988,zeileto01,2002,1,144,506,61,138,23,0,18,87.0,1.0,1.0,66,92.0,3.0,1.0,0.0,7.0,27.0
94989,zeileto01,2003,3,100,299,40,68,10,2,11,42.0,1.0,0.0,34,54.0,0.0,3.0,0.0,5.0,6.0
94990,zeileto01,2004,1,137,348,30,81,16,0,9,35.0,0.0,0.0,44,83.0,1.0,1.0,1.0,2.0,13.0
95105,zimmery01,2006,1,157,614,84,176,47,3,20,110.0,11.0,8.0,61,120.0,7.0,2.0,1.0,4.0,15.0


In [86]:
# Step (5)
#  Calculate PA, OBP, SLG, and batting average
bat['PA'] = bat['AB'] + bat['BB'] + bat['HBP'] + bat['SH'] + bat['SF']
bat['OBP'] = (bat['H']+bat['BB']+bat['HBP'])/(bat['AB']+bat['BB']+bat['HBP']+bat['SF'])
bat['SLG'] = (bat['H']-bat['2B']-bat['3B']-bat['HR']+2*bat['2B']+3*bat['3B']+4*bat['HR'])/bat['AB']
bat['BA'] = bat['H'] / bat['AB']

In [87]:
# Step (6)
#  Create SalYear variable to create one year lag between batting performance and salary 
bat['salYear'] = bat['yearID'].astype('int').add(1)
bat

Unnamed: 0,playerID,yearID,stint,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,PA,OBP,SLG,BA,salYear
98,abbotje01,1998,1,89,244,33,68,14,1,12,41.0,3.0,3.0,9,28.0,1.0,0.0,2.0,5.0,2.0,260.0,0.298450,0.491803,0.278689,1999
100,abbotje01,2000,1,80,215,31,59,15,1,3,29.0,2.0,1.0,21,38.0,1.0,2.0,2.0,1.0,2.0,241.0,0.343096,0.395349,0.274419,2001
117,abbotku01,1998,3,77,194,26,51,13,1,5,24.0,2.0,1.0,12,53.0,0.0,2.0,1.0,3.0,5.0,212.0,0.308057,0.417526,0.262887,1999
118,abbotku01,1999,1,96,286,41,78,17,2,8,41.0,3.0,2.0,16,69.0,0.0,0.0,2.0,1.0,4.0,305.0,0.310231,0.430070,0.272727,2000
119,abbotku01,2000,1,79,157,22,34,7,1,6,12.0,1.0,1.0,14,51.0,2.0,1.0,0.0,1.0,2.0,173.0,0.283237,0.388535,0.216561,2001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94988,zeileto01,2002,1,144,506,61,138,23,0,18,87.0,1.0,1.0,66,92.0,3.0,1.0,0.0,7.0,27.0,580.0,0.353448,0.424901,0.272727,2003
94989,zeileto01,2003,3,100,299,40,68,10,2,11,42.0,1.0,0.0,34,54.0,0.0,3.0,0.0,5.0,6.0,341.0,0.307918,0.384615,0.227425,2004
94990,zeileto01,2004,1,137,348,30,81,16,0,9,35.0,0.0,0.0,44,83.0,1.0,1.0,1.0,2.0,13.0,396.0,0.318987,0.356322,0.232759,2005
95105,zimmery01,2006,1,157,614,84,176,47,3,20,110.0,11.0,8.0,61,120.0,7.0,2.0,1.0,4.0,15.0,682.0,0.350954,0.470684,0.286645,2007


In [88]:
# Step (7)
#  Merge batting data and master data.
master = pd.merge(bat,master,on=['playerID','salYear'])

In [89]:
# Quiz Q1
master.groupby('salYear')['salary'].mean()[[1999,2006]]
# 1999: $2223975
# 2006: $3689305

salYear
1999    2.223975e+06
2006    3.689305e+06
Name: salary, dtype: float64

In [90]:
# Quiz Q2
display(master.groupby('yearID')['OBP'].mean().sort_values(ascending=False))
display(master.groupby('yearID')['SLG'].mean().sort_values(ascending=False))

yearID
1999    0.349470
2000    0.348212
2006    0.342937
2004    0.341663
1998    0.337745
2003    0.337403
2005    0.335947
2001    0.335789
2002    0.335490
Name: OBP, dtype: float64

yearID
2000    0.443840
2006    0.443275
1999    0.442120
2004    0.440307
2001    0.433420
2003    0.430938
2005    0.428867
2002    0.426230
1998    0.423344
Name: SLG, dtype: float64

In [91]:
# Quiz Q3
master.groupby('playerID')['HR'].sum().sort_values(ascending=False)

playerID
rodrial01    400
sosasa01     367
ramirma02    361
bondsba01    355
delgaca01    340
            ... 
husonje01      0
tynerja01      0
posesc01       0
gathrjo01      0
strando01      0
Name: HR, Length: 786, dtype: int64

# II. Building the Dataset: Player Info

In [92]:
# Read in People data retrieved from Lahman's Database
ppl = pd.read_csv("../Data/People.csv")

# Read in player appearance data retrieved from Lahman's Database
app = pd.read_csv("../Data/Appearances.csv")

In [93]:
# Step (1)
#  Read in “People” data and extract the player’s debut year 
ppl['debutYear'] = ppl['debut'].astype('str').str[:4]
ppl = ppl[['playerID','debutYear']]
ppl

# ppl[ppl['debutYear']=='nan']

Unnamed: 0,playerID,debutYear
0,aardsda01,2004
1,aaronha01,1954
2,aaronto01,1962
3,aasedo01,1977
4,abadan01,2001
...,...,...
19365,zupofr01,1957
19366,zuvelpa01,1982
19367,zuverge01,1951
19368,zwilldu01,1910


In [94]:
# Step (2)
#  Merge debut year into master data and calculate years of experience
master = pd.merge(master, ppl, on='playerID')
master

Unnamed: 0,playerID,yearID,stint,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,PA,OBP,SLG,BA,salYear,teamID,lgID,salary,lnSalary,debutYear
0,abbotje01,1998,1,89,244,33,68,14,1,12,41.0,3.0,3.0,9,28.0,1.0,0.0,2.0,5.0,2.0,260.0,0.298450,0.491803,0.278689,1999,CHA,AL,255000,12.449019,1997
1,abbotje01,2000,1,80,215,31,59,15,1,3,29.0,2.0,1.0,21,38.0,1.0,2.0,2.0,1.0,2.0,241.0,0.343096,0.395349,0.274419,2001,FLO,NL,300000,12.611538,1997
2,abbotku01,1998,3,77,194,26,51,13,1,5,24.0,2.0,1.0,12,53.0,0.0,2.0,1.0,3.0,5.0,212.0,0.308057,0.417526,0.262887,1999,COL,NL,900000,13.710150,1993
3,abbotku01,1999,1,96,286,41,78,17,2,8,41.0,3.0,2.0,16,69.0,0.0,0.0,2.0,1.0,4.0,305.0,0.310231,0.430070,0.272727,2000,NYN,NL,500000,13.122363,1993
4,abbotku01,2000,1,79,157,22,34,7,1,6,12.0,1.0,1.0,14,51.0,2.0,1.0,0.0,1.0,2.0,173.0,0.283237,0.388535,0.216561,2001,ATL,NL,600000,13.304685,1993
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3109,zeileto01,2000,1,153,544,67,146,36,3,22,79.0,3.0,4.0,74,85.0,4.0,2.0,0.0,3.0,15.0,623.0,0.356340,0.466912,0.268382,2001,NYN,NL,6833333,15.737323,1989
3110,zeileto01,2001,1,151,531,66,141,25,1,10,62.0,1.0,0.0,73,102.0,3.0,6.0,0.0,2.0,15.0,612.0,0.359477,0.372881,0.265537,2002,COL,NL,6833333,15.737323,1989
3111,zeileto01,2002,1,144,506,61,138,23,0,18,87.0,1.0,1.0,66,92.0,3.0,1.0,0.0,7.0,27.0,580.0,0.353448,0.424901,0.272727,2003,NYA,AL,1500000,14.220976,1989
3112,zeileto01,2003,3,100,299,40,68,10,2,11,42.0,1.0,0.0,34,54.0,0.0,3.0,0.0,5.0,6.0,341.0,0.307918,0.384615,0.227425,2004,NYN,NL,1000000,13.815511,1989


In [95]:
# Step (3)
#  Based on a player’s years of experience, create indicator variables for arbitration eligible players 
#  (3-6 years) and free agent players (more than 6 years) 
master['debutYear'] = pd.to_numeric(master['debutYear'])
master['exp'] = master['yearID'] - master['debutYear']
master['arb'] = np.where((master['exp']>=3) & (master['exp']<=6), 1, 0)
master['fa'] = np.where(master['exp']>6, 1, 0)
master

Unnamed: 0,playerID,yearID,stint,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,PA,OBP,SLG,BA,salYear,teamID,lgID,salary,lnSalary,debutYear,exp,arb,fa
0,abbotje01,1998,1,89,244,33,68,14,1,12,41.0,3.0,3.0,9,28.0,1.0,0.0,2.0,5.0,2.0,260.0,0.298450,0.491803,0.278689,1999,CHA,AL,255000,12.449019,1997,1,0,0
1,abbotje01,2000,1,80,215,31,59,15,1,3,29.0,2.0,1.0,21,38.0,1.0,2.0,2.0,1.0,2.0,241.0,0.343096,0.395349,0.274419,2001,FLO,NL,300000,12.611538,1997,3,1,0
2,abbotku01,1998,3,77,194,26,51,13,1,5,24.0,2.0,1.0,12,53.0,0.0,2.0,1.0,3.0,5.0,212.0,0.308057,0.417526,0.262887,1999,COL,NL,900000,13.710150,1993,5,1,0
3,abbotku01,1999,1,96,286,41,78,17,2,8,41.0,3.0,2.0,16,69.0,0.0,0.0,2.0,1.0,4.0,305.0,0.310231,0.430070,0.272727,2000,NYN,NL,500000,13.122363,1993,6,1,0
4,abbotku01,2000,1,79,157,22,34,7,1,6,12.0,1.0,1.0,14,51.0,2.0,1.0,0.0,1.0,2.0,173.0,0.283237,0.388535,0.216561,2001,ATL,NL,600000,13.304685,1993,7,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3109,zeileto01,2000,1,153,544,67,146,36,3,22,79.0,3.0,4.0,74,85.0,4.0,2.0,0.0,3.0,15.0,623.0,0.356340,0.466912,0.268382,2001,NYN,NL,6833333,15.737323,1989,11,0,1
3110,zeileto01,2001,1,151,531,66,141,25,1,10,62.0,1.0,0.0,73,102.0,3.0,6.0,0.0,2.0,15.0,612.0,0.359477,0.372881,0.265537,2002,COL,NL,6833333,15.737323,1989,12,0,1
3111,zeileto01,2002,1,144,506,61,138,23,0,18,87.0,1.0,1.0,66,92.0,3.0,1.0,0.0,7.0,27.0,580.0,0.353448,0.424901,0.272727,2003,NYA,AL,1500000,14.220976,1989,13,0,1
3112,zeileto01,2003,3,100,299,40,68,10,2,11,42.0,1.0,0.0,34,54.0,0.0,3.0,0.0,5.0,6.0,341.0,0.307918,0.384615,0.227425,2004,NYN,NL,1000000,13.815511,1989,14,0,1


In [96]:
# Step (4) & (5)
#  Read in the data for player appearances and group by stint.  Then identify the maximum number of 
#  games played at a given position for each year. 
#  Create a function to determine player position. 
pd.set_option('display.max_columns', 50)

pos_cols = ['G_p', 'G_c', 'G_1b', 'G_2b', 'G_3b', 'G_ss', 'G_lf', 'G_cf', 'G_rf', 'G_of', 'G_dh']
player_pos = app.groupby(['playerID','yearID'])[pos_cols].sum().reset_index()
player_pos['pos'] = player_pos[pos_cols].idxmax(axis=1).str.replace('G_','')
player_pos

Unnamed: 0,playerID,yearID,G_p,G_c,G_1b,G_2b,G_3b,G_ss,G_lf,G_cf,G_rf,G_of,G_dh,pos
0,aardsda01,2004,11,0,0,0,0,0,0,0,0,0,0.0,p
1,aardsda01,2006,45,0,0,0,0,0,0,0,0,0,0.0,p
2,aardsda01,2007,25,0,0,0,0,0,0,0,0,0,0.0,p
3,aardsda01,2008,47,0,0,0,0,0,0,0,0,0,0.0,p
4,aardsda01,2009,73,0,0,0,0,0,0,0,0,0,0.0,p
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96607,zwilldu01,1915,0,0,3,0,0,0,0,148,0,148,0.0,cf
96608,zwilldu01,1916,0,0,0,0,0,0,0,5,4,9,0.0,of
96609,zychto01,2015,13,0,0,0,0,0,0,0,0,0,0.0,p
96610,zychto01,2016,12,0,0,0,0,0,0,0,0,0,0.0,p


In [97]:
# Step (6)
#  Exclude non-position players.
inc_pos = ['c','1b','2b','3b','ss','lf','cf','rf','of','dh']
player_pos = player_pos[player_pos['pos'].isin(inc_pos)]

new_col = player_pos['pos'].replace({'rf':'of','cf':'of','lf':'of'})
player_pos['pos'] = new_col
player_pos

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_pos['pos'] = new_col


Unnamed: 0,playerID,yearID,G_p,G_c,G_1b,G_2b,G_3b,G_ss,G_lf,G_cf,G_rf,G_of,G_dh,pos
9,aaronha01,1954,0,0,0,0,0,0,105,0,11,116,0.0,of
10,aaronha01,1955,0,0,0,27,0,0,30,0,104,126,0.0,of
11,aaronha01,1956,0,0,0,0,0,0,0,0,152,152,0.0,of
12,aaronha01,1957,0,0,0,0,0,0,0,69,84,150,0.0,of
13,aaronha01,1958,0,0,0,0,0,0,0,39,119,153,0.0,of
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96596,zuvelpa01,1991,0,0,0,0,2,0,0,0,0,0,0.0,3b
96605,zwilldu01,1910,0,0,0,0,0,0,0,27,0,27,0.0,of
96606,zwilldu01,1914,0,0,0,0,0,0,0,154,0,154,0.0,of
96607,zwilldu01,1915,0,0,3,0,0,0,0,148,0,148,0.0,of


In [98]:
# Step (7)
#  Create an indicator variable for catcher and the infield (2B, SS, 3B) positions separately. Thus, you 
#  should have a separate indicator variable for 2B, SS, and 3B individually as opposed to one infielder 
#  indicator variable combining these positions. 

pos_indicators = pd.get_dummies(player_pos['pos'])
player_pos = (pd.concat([player_pos, pos_indicators], axis=1)
                .loc[:,['playerID','yearID','pos','2b','ss','3b','c']])
player_pos

Unnamed: 0,playerID,yearID,pos,2b,ss,3b,c
9,aaronha01,1954,of,0,0,0,0
10,aaronha01,1955,of,0,0,0,0
11,aaronha01,1956,of,0,0,0,0
12,aaronha01,1957,of,0,0,0,0
13,aaronha01,1958,of,0,0,0,0
...,...,...,...,...,...,...,...
96596,zuvelpa01,1991,3b,0,0,1,0
96605,zwilldu01,1910,of,0,0,0,0
96606,zwilldu01,1914,of,0,0,0,0
96607,zwilldu01,1915,of,0,0,0,0


In [101]:
# Step (8)
#  Merge this into your master data.
master = pd.merge(master, player_pos, on=['playerID','yearID'])

In [102]:
master

Unnamed: 0,playerID,yearID,stint,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,PA,OBP,SLG,BA,salYear,teamID,lgID,salary,lnSalary,debutYear,exp,arb,fa,pos,2b,ss,3b,c
0,abbotje01,1998,1,89,244,33,68,14,1,12,41.0,3.0,3.0,9,28.0,1.0,0.0,2.0,5.0,2.0,260.0,0.298450,0.491803,0.278689,1999,CHA,AL,255000,12.449019,1997,1,0,0,of,0,0,0,0
1,abbotje01,2000,1,80,215,31,59,15,1,3,29.0,2.0,1.0,21,38.0,1.0,2.0,2.0,1.0,2.0,241.0,0.343096,0.395349,0.274419,2001,FLO,NL,300000,12.611538,1997,3,1,0,of,0,0,0,0
2,abbotku01,1998,3,77,194,26,51,13,1,5,24.0,2.0,1.0,12,53.0,0.0,2.0,1.0,3.0,5.0,212.0,0.308057,0.417526,0.262887,1999,COL,NL,900000,13.710150,1993,5,1,0,ss,0,1,0,0
3,abbotku01,1999,1,96,286,41,78,17,2,8,41.0,3.0,2.0,16,69.0,0.0,0.0,2.0,1.0,4.0,305.0,0.310231,0.430070,0.272727,2000,NYN,NL,500000,13.122363,1993,6,1,0,2b,1,0,0,0
4,abbotku01,2000,1,79,157,22,34,7,1,6,12.0,1.0,1.0,14,51.0,2.0,1.0,0.0,1.0,2.0,173.0,0.283237,0.388535,0.216561,2001,ATL,NL,600000,13.304685,1993,7,0,1,ss,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3109,zeileto01,2000,1,153,544,67,146,36,3,22,79.0,3.0,4.0,74,85.0,4.0,2.0,0.0,3.0,15.0,623.0,0.356340,0.466912,0.268382,2001,NYN,NL,6833333,15.737323,1989,11,0,1,1b,0,0,0,0
3110,zeileto01,2001,1,151,531,66,141,25,1,10,62.0,1.0,0.0,73,102.0,3.0,6.0,0.0,2.0,15.0,612.0,0.359477,0.372881,0.265537,2002,COL,NL,6833333,15.737323,1989,12,0,1,1b,0,0,0,0
3111,zeileto01,2002,1,144,506,61,138,23,0,18,87.0,1.0,1.0,66,92.0,3.0,1.0,0.0,7.0,27.0,580.0,0.353448,0.424901,0.272727,2003,NYA,AL,1500000,14.220976,1989,13,0,1,3b,0,0,1,0
3112,zeileto01,2003,3,100,299,40,68,10,2,11,42.0,1.0,0.0,34,54.0,0.0,3.0,0.0,5.0,6.0,341.0,0.307918,0.384615,0.227425,2004,NYN,NL,1000000,13.815511,1989,14,0,1,3b,0,0,1,0


In [105]:
# Quiz Q1
temp = master.groupby(['yearID','pos'])['salary'].mean().reset_index()
display(temp[temp['yearID']==1999].sort_values('salary', ascending=False))
display(temp[temp['yearID']==2004].sort_values('salary', ascending=False))
# 1999: DH, $3214643
# 2004: 1B, $4211004

Unnamed: 0,yearID,pos,salary
7,1999,1b,3392656.0
11,1999,dh,3110811.0
12,1999,of,2905682.0
8,1999,2b,2330329.0
9,1999,3b,2198130.0
13,1999,ss,2065411.0
10,1999,c,1919788.0


Unnamed: 0,yearID,pos,salary
42,2004,1b,5034910.0
46,2004,dh,4418950.0
47,2004,of,4145419.0
44,2004,3b,3853252.0
48,2004,ss,3210046.0
45,2004,c,2316479.0
43,2004,2b,1946476.0


In [108]:
# Quiz Q2
master['arb'].sum()/master['arb'].count() + master['fa'].sum()/master['fa'].count()
# 78.81%

0.788053949903661

In [112]:
# Quiz Q3
temp = master.groupby(['teamID','yearID'])['exp'].sum().reset_index()
temp[temp['yearID']==2002].sort_values('exp', ascending=False)
# Most 114, Fewest 36

Unnamed: 0,teamID,yearID,exp
226,SFN,2002,114
217,SEA,2002,110
10,ARI,2002,107
55,CHN,2002,105
19,ATL,2002,101
235,SLN,2002,99
109,HOU,2002,99
253,TEX,2002,94
163,NYA,2002,86
172,NYN,2002,81


In [109]:
master

Unnamed: 0,playerID,yearID,stint,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,PA,OBP,SLG,BA,salYear,teamID,lgID,salary,lnSalary,debutYear,exp,arb,fa,pos,2b,ss,3b,c
0,abbotje01,1998,1,89,244,33,68,14,1,12,41.0,3.0,3.0,9,28.0,1.0,0.0,2.0,5.0,2.0,260.0,0.298450,0.491803,0.278689,1999,CHA,AL,255000,12.449019,1997,1,0,0,of,0,0,0,0
1,abbotje01,2000,1,80,215,31,59,15,1,3,29.0,2.0,1.0,21,38.0,1.0,2.0,2.0,1.0,2.0,241.0,0.343096,0.395349,0.274419,2001,FLO,NL,300000,12.611538,1997,3,1,0,of,0,0,0,0
2,abbotku01,1998,3,77,194,26,51,13,1,5,24.0,2.0,1.0,12,53.0,0.0,2.0,1.0,3.0,5.0,212.0,0.308057,0.417526,0.262887,1999,COL,NL,900000,13.710150,1993,5,1,0,ss,0,1,0,0
3,abbotku01,1999,1,96,286,41,78,17,2,8,41.0,3.0,2.0,16,69.0,0.0,0.0,2.0,1.0,4.0,305.0,0.310231,0.430070,0.272727,2000,NYN,NL,500000,13.122363,1993,6,1,0,2b,1,0,0,0
4,abbotku01,2000,1,79,157,22,34,7,1,6,12.0,1.0,1.0,14,51.0,2.0,1.0,0.0,1.0,2.0,173.0,0.283237,0.388535,0.216561,2001,ATL,NL,600000,13.304685,1993,7,0,1,ss,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3109,zeileto01,2000,1,153,544,67,146,36,3,22,79.0,3.0,4.0,74,85.0,4.0,2.0,0.0,3.0,15.0,623.0,0.356340,0.466912,0.268382,2001,NYN,NL,6833333,15.737323,1989,11,0,1,1b,0,0,0,0
3110,zeileto01,2001,1,151,531,66,141,25,1,10,62.0,1.0,0.0,73,102.0,3.0,6.0,0.0,2.0,15.0,612.0,0.359477,0.372881,0.265537,2002,COL,NL,6833333,15.737323,1989,12,0,1,1b,0,0,0,0
3111,zeileto01,2002,1,144,506,61,138,23,0,18,87.0,1.0,1.0,66,92.0,3.0,1.0,0.0,7.0,27.0,580.0,0.353448,0.424901,0.272727,2003,NYA,AL,1500000,14.220976,1989,13,0,1,3b,0,0,1,0
3112,zeileto01,2003,3,100,299,40,68,10,2,11,42.0,1.0,0.0,34,54.0,0.0,3.0,0.0,5.0,6.0,341.0,0.307918,0.384615,0.227425,2004,NYN,NL,1000000,13.815511,1989,14,0,1,3b,0,0,1,0


# III. Running Regressions

In [125]:
# Step (1)
#  lnSal on OBP, SLG, batting average, plate appearances, arbitration (dummy), free agent (dummy), 
#  and all positional dummy variables during the seasons prior to the publication of Moneyball 
#  (1999-2003) combined. 
import statsmodels.formula.api as smf

df = (master[(master['yearID']>=1999) & (master['yearID']<=2003)]
      .rename({'2b':'sB','3b':'tB'}, axis=1))
smf.ols(formula='lnSalary ~ OBP+SLG+BA+PA+arb+fa+sB+ss+tB+c', data=df).fit().summary()

0,1,2,3
Dep. Variable:,lnSalary,R-squared:,0.665
Model:,OLS,Adj. R-squared:,0.663
Method:,Least Squares,F-statistic:,344.1
Date:,"Wed, 15 Dec 2021",Prob (F-statistic):,0.0
Time:,20:14:56,Log-Likelihood:,-1920.7
No. Observations:,1741,AIC:,3863.0
Df Residuals:,1730,BIC:,3923.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,10.2778,0.176,58.557,0.000,9.934,10.622
OBP,2.8105,0.740,3.798,0.000,1.359,4.262
SLG,2.7213,0.340,8.001,0.000,2.054,3.388
BA,-2.8814,0.939,-3.068,0.002,-4.723,-1.039
PA,0.0031,0.000,26.007,0.000,0.003,0.003
arb,1.2062,0.049,24.640,0.000,1.110,1.302
fa,1.7960,0.049,36.902,0.000,1.701,1.891
sB,-0.1179,0.059,-2.008,0.045,-0.233,-0.003
ss,0.0618,0.061,1.012,0.312,-0.058,0.182

0,1,2,3
Omnibus:,5.895,Durbin-Watson:,1.483
Prob(Omnibus):,0.052,Jarque-Bera (JB):,7.169
Skew:,0.01,Prob(JB):,0.0278
Kurtosis:,3.314,Cond. No.,29200.0


In [126]:
# Step (2)
#  Repeat step 2) but run the regression for the seasons 2004-2006 (all years combined).

df = (master[(master['yearID']>=2004) & (master['yearID']<=2006)]
      .rename({'2b':'sB','3b':'tB'}, axis=1))
smf.ols(formula='lnSalary ~ OBP+SLG+BA+PA+arb+fa+sB+ss+tB+c', data=df).fit().summary()

0,1,2,3
Dep. Variable:,lnSalary,R-squared:,0.638
Model:,OLS,Adj. R-squared:,0.634
Method:,Least Squares,F-statistic:,177.3
Date:,"Wed, 15 Dec 2021",Prob (F-statistic):,4.32e-214
Time:,20:15:47,Log-Likelihood:,-1149.9
No. Observations:,1018,AIC:,2322.0
Df Residuals:,1007,BIC:,2376.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,10.1597,0.254,39.982,0.000,9.661,10.658
OBP,3.6287,1.074,3.378,0.001,1.521,5.736
SLG,2.7628,0.462,5.982,0.000,1.857,3.669
BA,-2.5067,1.259,-1.991,0.047,-4.978,-0.036
PA,0.0029,0.000,17.847,0.000,0.003,0.003
arb,1.1742,0.066,17.731,0.000,1.044,1.304
fa,1.7871,0.063,28.395,0.000,1.664,1.911
sB,-0.1093,0.077,-1.429,0.153,-0.260,0.041
ss,0.0180,0.085,0.211,0.833,-0.150,0.186

0,1,2,3
Omnibus:,16.332,Durbin-Watson:,1.451
Prob(Omnibus):,0.0,Jarque-Bera (JB):,27.901
Skew:,0.063,Prob(JB):,8.74e-07
Kurtosis:,3.801,Cond. No.,30500.0


In [129]:
# Step (3)
#  Run the same regression model as in steps 2) and 3) separately for each season.  It may be 
#  easiest to read output if you display your results in a couple of tables (one for Pre-Moneyball 
#  and one for Post-Moneyball).
from statsmodels.iolib.summary2 import summary_col

ols_mods = []
years = list(range(1999,2007))
for year in years:
    df = master[master['yearID']==year].rename({'2b':'sB','3b':'tB'}, axis=1)
    ols_mods.append(smf.ols(formula='lnSalary ~ OBP+SLG+BA+PA+arb+fa+sB+ss+tB+c', data=df).fit())
    
tbl = summary_col(ols_mods, stars=True, float_format='%.3f', model_names=years)
print(tbl)


                 1999      2000      2001      2002      2003     2004      2005      2006  
--------------------------------------------------------------------------------------------
Intercept      9.990*** 10.234*** 10.335*** 10.219*** 10.155*** 9.894*** 10.072*** 10.601***
               (0.363)  (0.347)   (0.407)   (0.427)   (0.459)   (0.451)  (0.470)   (0.418)  
OBP            3.160**  0.527     0.754     3.256*    7.782***  4.778**  3.300*    2.736    
               (1.459)  (1.475)   (1.838)   (1.808)   (1.851)   (1.932)  (1.883)   (1.807)  
SLG            3.047*** 3.516***  2.251***  1.866**   3.158***  3.666*** 2.070**   2.503*** 
               (0.681)  (0.671)   (0.775)   (0.872)   (0.850)   (0.828)  (0.819)   (0.766)  
BA             -2.353   -1.477    -0.192    -2.166    -8.430*** -4.133*  -1.259    -2.476   
               (1.882)  (1.993)   (2.130)   (2.189)   (2.361)   (2.284)  (2.386)   (1.964)  
PA             0.002*** 0.003***  0.003***  0.003***  0.003***  0.003

In [None]:
#Uncomment this cell once the assignment is complete in order to export your Master dataset
#Master.to_csv("../Data/Master.csv")