In [1]:
# First we load the packages
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# I. Building the Dataset: Player Stats

In [2]:
# Read in salary data retrieved from Lahman's Database
Salary = pd.read_csv("../Data/Salaries.csv")

# Read in batting
Batting = pd.read_csv("../Data/Batting.csv")

In [3]:
#preview the dataframe
Salary=Salary[Salary.salary > 0]
Salary.head()

Unnamed: 0,yearID,teamID,lgID,playerID,salary
0,1985,ATL,NL,barkele01,870000
1,1985,ATL,NL,bedrost01,550000
2,1985,ATL,NL,benedbr01,545000
3,1985,ATL,NL,campri01,633333
4,1985,ATL,NL,ceronri01,625000


In [4]:
# create natural log of player salary
Salary['lnSal']= np.log(Salary.salary)
#rename column
Salary= Salary.rename(columns={'yearID': 'SalYear'})
Salary.head()

Unnamed: 0,SalYear,teamID,lgID,playerID,salary,lnSal
0,1985,ATL,NL,barkele01,870000,13.676248
1,1985,ATL,NL,bedrost01,550000,13.217674
2,1985,ATL,NL,benedbr01,545000,13.208541
3,1985,ATL,NL,campri01,633333,13.358752
4,1985,ATL,NL,ceronri01,625000,13.345507


In [5]:
# create copy of df
Master= Salary.copy()

In [6]:
# view df
Batting.head()

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,Doubles,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
0,abercda01,1871,1,TRO,,1,4,0,0,0,...,0.0,0.0,0.0,0,0.0,,,,,
1,addybo01,1871,1,RC1,,25,118,30,32,6,...,13.0,8.0,1.0,4,0.0,,,,,
2,allisar01,1871,1,CL1,,29,137,28,40,4,...,19.0,3.0,1.0,2,5.0,,,,,
3,allisdo01,1871,1,WS3,,27,133,28,44,10,...,27.0,1.0,1.0,0,2.0,,,,,
4,ansonca01,1871,1,RC1,,25,120,29,39,11,...,16.0,6.0,2.0,2,1.0,,,,,


In [7]:
# sum across stints
Batting= Batting.groupby(['playerID', 'yearID']).sum()
Batting= Batting.reset_index()
Batting.head()

Unnamed: 0,playerID,yearID,stint,G,AB,R,H,Doubles,Triples,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
0,aardsda01,2004,1,11,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,aardsda01,2006,1,45,2,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0
2,aardsda01,2007,1,25,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,aardsda01,2008,1,47,1,0,0,0,0,0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0
4,aardsda01,2009,1,73,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# subset for year 1998-2006 and at least 130 AB
Batting= Batting[(Batting.yearID >= 1998) & (Batting.yearID <= 2006) & (Batting.AB >= 130)]
Batting.head()

Unnamed: 0,playerID,yearID,stint,G,AB,R,H,Doubles,Triples,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
98,abbotje01,1998,1,89,244,33,68,14,1,12,41.0,3.0,3.0,9,28.0,1.0,0.0,2.0,5.0,2.0
100,abbotje01,2000,1,80,215,31,59,15,1,3,29.0,2.0,1.0,21,38.0,1.0,2.0,2.0,1.0,2.0
117,abbotku01,1998,3,77,194,26,51,13,1,5,24.0,2.0,1.0,12,53.0,0.0,2.0,1.0,3.0,5.0
118,abbotku01,1999,1,96,286,41,78,17,2,8,41.0,3.0,2.0,16,69.0,0.0,0.0,2.0,1.0,4.0
119,abbotku01,2000,1,79,157,22,34,7,1,6,12.0,1.0,1.0,14,51.0,2.0,1.0,0.0,1.0,2.0


In [9]:
# calcuate PA, OBP, SLG and Batting AVG

Batting['PA'] = Batting['AB'] + Batting['BB'] + Batting['HBP'] + Batting['SH'] + Batting['SF']

Batting['OBP'] = (Batting['H'] + Batting['BB'] + Batting['HBP'])/(Batting['AB'] + Batting['BB']
                                                                  + Batting['HBP'] + Batting['SF'])

Batting['SLG'] = ((Batting['H'] - Batting['Doubles'] - Batting['Triples'] - Batting['HR']) 
                  + 2*Batting['Doubles'] + 3*Batting['Triples'] + 4*Batting['HR'])/Batting['AB']

Batting['AVG']= Batting.H /Batting.AB

Batting.sort_values(by='AVG', ascending=False).head()

Unnamed: 0,playerID,yearID,stint,G,AB,R,H,Doubles,Triples,HR,...,SO,IBB,HBP,SH,SF,GIDP,PA,OBP,SLG,AVG
89325,walkela01,1999,1,127,438,108,166,26,4,37,...,52.0,8.0,12.0,0.0,6.0,12.0,513.0,0.45809,0.710046,0.378995
36925,heltoto01,2000,1,160,580,138,216,59,2,42,...,61.0,22.0,4.0,0.0,10.0,12.0,697.0,0.463415,0.698276,0.372414
29430,garcino01,2000,1,140,529,104,197,51,3,21,...,50.0,20.0,2.0,0.0,7.0,8.0,599.0,0.434057,0.599244,0.372401
83727,suzukic01,2004,1,161,704,101,262,24,5,8,...,63.0,19.0,4.0,2.0,3.0,6.0,762.0,0.414474,0.454545,0.372159
7778,bondsba01,2002,1,143,403,117,149,31,2,46,...,47.0,68.0,9.0,0.0,2.0,4.0,612.0,0.581699,0.799007,0.369727


In [10]:
# create SalYear variable to create one year lag between batting performace and salary
Batting['SalYear']= Batting.yearID + 1


In [11]:
# merge both datasets
Master= pd.merge(Batting, Master, on=['SalYear', 'playerID'])
Master.head()

Unnamed: 0,playerID,yearID,stint,G,AB,R,H,Doubles,Triples,HR,...,GIDP,PA,OBP,SLG,AVG,SalYear,teamID,lgID,salary,lnSal
0,abbotje01,1998,1,89,244,33,68,14,1,12,...,2.0,260.0,0.29845,0.491803,0.278689,1999,CHA,AL,255000,12.449019
1,abbotje01,2000,1,80,215,31,59,15,1,3,...,2.0,241.0,0.343096,0.395349,0.274419,2001,FLO,NL,300000,12.611538
2,abbotku01,1998,3,77,194,26,51,13,1,5,...,5.0,212.0,0.308057,0.417526,0.262887,1999,COL,NL,900000,13.71015
3,abbotku01,1999,1,96,286,41,78,17,2,8,...,4.0,305.0,0.310231,0.43007,0.272727,2000,NYN,NL,500000,13.122363
4,abbotku01,2000,1,79,157,22,34,7,1,6,...,2.0,173.0,0.283237,0.388535,0.216561,2001,ATL,NL,600000,13.304685


In [12]:
# Calculate average salary by season
MeanSal= Master.groupby('SalYear')['salary'].mean()
MeanSal

SalYear
1999    2.223975e+06
2000    2.590626e+06
2001    3.029559e+06
2002    3.153773e+06
2003    3.472972e+06
2004    3.322968e+06
2005    3.575228e+06
2006    3.689305e+06
2007    3.942908e+06
Name: salary, dtype: float64

In [13]:
# Cal average player OBP and SLG by season
Master.groupby(['yearID'])['OBP', 'SLG'].mean()

Unnamed: 0_level_0,OBP,SLG
yearID,Unnamed: 1_level_1,Unnamed: 2_level_1
1998,0.337745,0.423344
1999,0.34947,0.44212
2000,0.348212,0.44384
2001,0.335789,0.43342
2002,0.33549,0.42623
2003,0.337403,0.430938
2004,0.341663,0.440307
2005,0.335947,0.428867
2006,0.342937,0.443275


In [14]:
# Cal aggregate player HR across timeframe
Master.groupby('playerID')['HR'].sum().sort_values(ascending=False).head()

playerID
rodrial01    400
sosasa01     367
ramirma02    361
bondsba01    355
delgaca01    340
Name: HR, dtype: int64

# II. Building the Dataset: Player Info

In [15]:
# Read in People data retrieved from Lahman's Database
People = pd.read_csv("../Data/People.csv")

# Read in player appearance data retrieved from Lahman's Database
Appearances = pd.read_csv("../Data/Appearances.csv")

In [16]:
#view df
People.head()

Unnamed: 0,playerID,birthYear,birthMonth,birthDay,birthCountry,birthState,birthCity,deathYear,deathMonth,deathDay,...,nameLast,nameGiven,weight,height,bats,throws,debut,finalGame,retroID,bbrefID
0,aardsda01,1981.0,12.0,27.0,USA,CO,Denver,,,,...,Aardsma,David Allan,215.0,75.0,R,R,2004-04-06,2015-08-23,aardd001,aardsda01
1,aaronha01,1934.0,2.0,5.0,USA,AL,Mobile,,,,...,Aaron,Henry Louis,180.0,72.0,R,R,1954-04-13,1976-10-03,aaroh101,aaronha01
2,aaronto01,1939.0,8.0,5.0,USA,AL,Mobile,1984.0,8.0,16.0,...,Aaron,Tommie Lee,190.0,75.0,R,R,1962-04-10,1971-09-26,aarot101,aaronto01
3,aasedo01,1954.0,9.0,8.0,USA,CA,Orange,,,,...,Aase,Donald William,190.0,75.0,R,R,1977-07-26,1990-10-03,aased001,aasedo01
4,abadan01,1972.0,8.0,25.0,USA,FL,Palm Beach,,,,...,Abad,Fausto Andres,184.0,73.0,L,L,2001-09-10,2006-04-13,abada001,abadan01


In [17]:
# extract debut year 
Debut= People[['playerID', 'debut']].copy()
Debut.head(1)

Unnamed: 0,playerID,debut
0,aardsda01,2004-04-06


In [18]:
# extract year
Debut['debut_yr']= Debut.debut.astype(str).str[0:4]
Debut.head(2)

Unnamed: 0,playerID,debut,debut_yr
0,aardsda01,2004-04-06,2004
1,aaronha01,1954-04-13,1954


In [19]:
Debut= Debut[['playerID', 'debut_yr']]

In [20]:
# merge datasets
Master= pd.merge(Master, Debut, on='playerID', how='left')
Master.head()

Unnamed: 0,playerID,yearID,stint,G,AB,R,H,Doubles,Triples,HR,...,PA,OBP,SLG,AVG,SalYear,teamID,lgID,salary,lnSal,debut_yr
0,abbotje01,1998,1,89,244,33,68,14,1,12,...,260.0,0.29845,0.491803,0.278689,1999,CHA,AL,255000,12.449019,1997
1,abbotje01,2000,1,80,215,31,59,15,1,3,...,241.0,0.343096,0.395349,0.274419,2001,FLO,NL,300000,12.611538,1997
2,abbotku01,1998,3,77,194,26,51,13,1,5,...,212.0,0.308057,0.417526,0.262887,1999,COL,NL,900000,13.71015,1993
3,abbotku01,1999,1,96,286,41,78,17,2,8,...,305.0,0.310231,0.43007,0.272727,2000,NYN,NL,500000,13.122363,1993
4,abbotku01,2000,1,79,157,22,34,7,1,6,...,173.0,0.283237,0.388535,0.216561,2001,ATL,NL,600000,13.304685,1993


In [21]:
Master['Exp']= Master.yearID - Master.debut_yr.astype(int)
Master.head()

Unnamed: 0,playerID,yearID,stint,G,AB,R,H,Doubles,Triples,HR,...,OBP,SLG,AVG,SalYear,teamID,lgID,salary,lnSal,debut_yr,Exp
0,abbotje01,1998,1,89,244,33,68,14,1,12,...,0.29845,0.491803,0.278689,1999,CHA,AL,255000,12.449019,1997,1
1,abbotje01,2000,1,80,215,31,59,15,1,3,...,0.343096,0.395349,0.274419,2001,FLO,NL,300000,12.611538,1997,3
2,abbotku01,1998,3,77,194,26,51,13,1,5,...,0.308057,0.417526,0.262887,1999,COL,NL,900000,13.71015,1993,5
3,abbotku01,1999,1,96,286,41,78,17,2,8,...,0.310231,0.43007,0.272727,2000,NYN,NL,500000,13.122363,1993,6
4,abbotku01,2000,1,79,157,22,34,7,1,6,...,0.283237,0.388535,0.216561,2001,ATL,NL,600000,13.304685,1993,7


In [22]:
Master['Rookie']= np.where(Master.Exp < 3, 1, 0)
Master['Arb']= np.where((Master.Exp >=3) & (Master.Exp <=6), 1, 0)
Master['Free']= np.where(Master.Exp>6, 1, 0)
Master.head()

Unnamed: 0,playerID,yearID,stint,G,AB,R,H,Doubles,Triples,HR,...,SalYear,teamID,lgID,salary,lnSal,debut_yr,Exp,Rookie,Arb,Free
0,abbotje01,1998,1,89,244,33,68,14,1,12,...,1999,CHA,AL,255000,12.449019,1997,1,1,0,0
1,abbotje01,2000,1,80,215,31,59,15,1,3,...,2001,FLO,NL,300000,12.611538,1997,3,0,1,0
2,abbotku01,1998,3,77,194,26,51,13,1,5,...,1999,COL,NL,900000,13.71015,1993,5,0,1,0
3,abbotku01,1999,1,96,286,41,78,17,2,8,...,2000,NYN,NL,500000,13.122363,1993,6,0,1,0
4,abbotku01,2000,1,79,157,22,34,7,1,6,...,2001,ATL,NL,600000,13.304685,1993,7,0,0,1


In [23]:
Appearances.head()

Unnamed: 0,yearID,teamID,lgID,playerID,G_all,GS,G_batting,G_defense,G_p,G_c,...,G_2b,G_3b,G_ss,G_lf,G_cf,G_rf,G_of,G_dh,G_ph,G_pr
0,1871,TRO,,abercda01,1,1.0,1,1.0,0,0,...,0,0,1,0,0,0,0,0.0,0.0,0.0
1,1871,RC1,,addybo01,25,25.0,25,25.0,0,0,...,22,0,3,0,0,0,0,0.0,0.0,0.0
2,1871,CL1,,allisar01,29,29.0,29,29.0,0,0,...,2,0,0,0,29,0,29,0.0,0.0,0.0
3,1871,WS3,,allisdo01,27,27.0,27,27.0,0,27,...,0,0,0,0,0,0,0,0.0,0.0,0.0
4,1871,RC1,,ansonca01,25,25.0,25,25.0,0,5,...,2,20,0,1,0,0,1,0.0,0.0,0.0


In [24]:
Appearances = Appearances.groupby(['playerID','yearID'])['G_c','G_1b','G_2b','G_3b','G_ss','G_of','G_dh'].sum()                                       
Appearances.reset_index(inplace=True)
display(Appearances[0:10])

Unnamed: 0,playerID,yearID,G_c,G_1b,G_2b,G_3b,G_ss,G_of,G_dh
0,aardsda01,2004,0,0,0,0,0,0,0.0
1,aardsda01,2006,0,0,0,0,0,0,0.0
2,aardsda01,2007,0,0,0,0,0,0,0.0
3,aardsda01,2008,0,0,0,0,0,0,0.0
4,aardsda01,2009,0,0,0,0,0,0,0.0
5,aardsda01,2010,0,0,0,0,0,0,0.0
6,aardsda01,2012,0,0,0,0,0,0,0.0
7,aardsda01,2013,0,0,0,0,0,0,0.0
8,aardsda01,2015,0,0,0,0,0,0,0.0
9,aaronha01,1954,0,0,0,0,0,116,0.0


In [25]:
# create a max column
Appearances['Max_G']= Appearances[['G_c','G_1b','G_2b','G_3b','G_ss','G_of','G_dh']].max(axis=1)
Appearances.head()

Unnamed: 0,playerID,yearID,G_c,G_1b,G_2b,G_3b,G_ss,G_of,G_dh,Max_G
0,aardsda01,2004,0,0,0,0,0,0,0.0,0.0
1,aardsda01,2006,0,0,0,0,0,0,0.0,0.0
2,aardsda01,2007,0,0,0,0,0,0,0.0,0.0
3,aardsda01,2008,0,0,0,0,0,0,0.0,0.0
4,aardsda01,2009,0,0,0,0,0,0,0.0,0.0


In [26]:
# (ii) Assign each player season to a primary position 

# Create definiton to position classification process
def Position(df):
    if (df['Max_G'] == df['G_c']): return "C"
    elif (df['Max_G'] == df['G_1b']): return "1B"
    elif (df['Max_G'] == df['G_2b']): return "2B"
    elif (df['Max_G'] == df['G_3b']): return "3B"
    elif (df['Max_G'] == df['G_ss']): return "SS"
    elif (df['Max_G'] == df['G_of']): return "OF"
    elif (df['Max_G'] == df['G_dh']): return "DH"

# Apply newly created definition to appearance data as variable 'POS'
Appearances['POS'] = Appearances.apply(Position, axis = 1)    

display(Appearances[0:10])

Unnamed: 0,playerID,yearID,G_c,G_1b,G_2b,G_3b,G_ss,G_of,G_dh,Max_G,POS
0,aardsda01,2004,0,0,0,0,0,0,0.0,0.0,C
1,aardsda01,2006,0,0,0,0,0,0,0.0,0.0,C
2,aardsda01,2007,0,0,0,0,0,0,0.0,0.0,C
3,aardsda01,2008,0,0,0,0,0,0,0.0,0.0,C
4,aardsda01,2009,0,0,0,0,0,0,0.0,0.0,C
5,aardsda01,2010,0,0,0,0,0,0,0.0,0.0,C
6,aardsda01,2012,0,0,0,0,0,0,0.0,0.0,C
7,aardsda01,2013,0,0,0,0,0,0,0.0,0.0,C
8,aardsda01,2015,0,0,0,0,0,0,0.0,0.0,C
9,aaronha01,1954,0,0,0,0,0,116,0.0,116.0,OF


In [27]:
# Exclude non-position players (designated hitters) and keep necessary variables

Appearances = Appearances[Appearances['Max_G'] > 0] 
Appearances = Appearances[['playerID','yearID','POS']]
display(Appearances[0:5])

Unnamed: 0,playerID,yearID,POS
9,aaronha01,1954,OF
10,aaronha01,1955,OF
11,aaronha01,1956,OF
12,aaronha01,1957,OF
13,aaronha01,1958,OF


In [28]:
Appearances['Catch'] = np.where(Appearances['POS'] == "C", 1, 0)
Appearances['Infld'] = np.where((Appearances['POS'] == "2B") | (Appearances['POS'] == "3B") | 
                                (Appearances['POS'] == "SS"), 1, 0)
Appearances['Outfld']= np.where(Appearances.POS == 'OF', 1, 0)

display(Appearances[0:5])

Unnamed: 0,playerID,yearID,POS,Catch,Infld,Outfld
9,aaronha01,1954,OF,0,0,1
10,aaronha01,1955,OF,0,0,1
11,aaronha01,1956,OF,0,0,1
12,aaronha01,1957,OF,0,0,1
13,aaronha01,1958,OF,0,0,1


In [34]:
Master= pd.merge(Master, Appearances, on=['playerID', 'yearID'], how= 'left')
Master.head()

Unnamed: 0,playerID,yearID,stint,G,AB,R,H,Doubles,Triples,HR,...,lnSal,debut_yr,Exp,Rookie,Arb,Free,POS,Catch,Infld,Outfld
0,abbotje01,1998,1,89,244,33,68,14,1,12,...,12.449019,1997,1,1,0,0,OF,0,0,1
1,abbotje01,2000,1,80,215,31,59,15,1,3,...,12.611538,1997,3,0,1,0,OF,0,0,1
2,abbotku01,1998,3,77,194,26,51,13,1,5,...,13.71015,1993,5,0,1,0,SS,0,1,0
3,abbotku01,1999,1,96,286,41,78,17,2,8,...,13.122363,1993,6,0,1,0,2B,0,1,0
4,abbotku01,2000,1,79,157,22,34,7,1,6,...,13.304685,1993,7,0,0,1,SS,0,1,0


In [47]:
POSSal= Master.groupby(['SalYear', 'POS'])['salary'].mean().reset_index()
POSSal= POSSal.sort_values(by='salary', ascending=False)
POSSal= POSSal[(POSSal.SalYear == 1999) |( POSSal.SalYear==2004)]
POSSal

Unnamed: 0,SalYear,POS,salary
35,2004,1B,4211004.0
40,2004,OF,4067223.0
41,2004,SS,3367722.0
4,1999,DH,3214643.0
39,2004,DH,3167862.0
0,1999,1B,3014788.0
38,2004,C,2471140.0
2,1999,3B,2431312.0
37,2004,3B,2429081.0
5,1999,OF,2393861.0


In [51]:
((sum(Master.Arb) + sum(Master.Free)) / (len(Master.playerID)))

0.7880539499036608

In [53]:
# sum years of exp by team in 2002
df2002= Master[Master.yearID ==2002]
df2002= df2002.groupby(['teamID'])['Exp'].sum().reset_index()
df2002.sort_values(by='Exp', ascending=False).head()

Unnamed: 0,teamID,Exp
25,SFN,114
24,SEA,110
1,ARI,107
6,CHN,105
2,ATL,101


# III. Running Regressions

In [54]:
#Dataframe for before and after MB
MB= Master[(Master.yearID>= 1999) & (Master.yearID <=2003)]
MA= Master[(Master.yearID >=2004) & (Master.yearID <= 2006)]


In [71]:
# Regression for labor market valuation of OBP and SLG for seasons 2000-2004 (all) = Table 3 Column 1

import statsmodels.formula.api as smf
MB_Data_Pre = Master[(Master.SalYear >= 1999) & (Master.SalYear <= 2003)]
Pre_MB_lm = smf.ols(formula = 'lnSal ~ OBP + SLG + AVG + PA + Arb + Free+ Infld', data=MB_Data_Pre).fit()
Pre_MB_lm.summary()

0,1,2,3
Dep. Variable:,lnSal,R-squared:,0.69
Model:,OLS,Adj. R-squared:,0.689
Method:,Least Squares,F-statistic:,554.3
Date:,"Thu, 29 Dec 2022",Prob (F-statistic):,0.0
Time:,16:41:48,Log-Likelihood:,-1860.7
No. Observations:,1753,AIC:,3737.0
Df Residuals:,1745,BIC:,3781.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,10.4577,0.159,65.599,0.000,10.145,10.770
OBP,1.2977,0.700,1.854,0.064,-0.075,2.670
SLG,2.9004,0.313,9.276,0.000,2.287,3.514
AVG,-2.1126,0.887,-2.383,0.017,-3.851,-0.374
PA,0.0031,0.000,27.444,0.000,0.003,0.003
Arb,1.2575,0.046,27.144,0.000,1.167,1.348
Free,1.8285,0.046,39.435,0.000,1.738,1.919
Infld,-0.0192,0.037,-0.520,0.603,-0.092,0.053

0,1,2,3
Omnibus:,10.812,Durbin-Watson:,1.391
Prob(Omnibus):,0.004,Jarque-Bera (JB):,14.762
Skew:,-0.032,Prob(JB):,0.000623
Kurtosis:,3.445,Cond. No.,29000.0


In [64]:

import statsmodels.formula.api as smf
Val_Pre_lm = smf.ols(formula = 'lnSal ~ OBP + SLG + PA +AVG+ Arb + Free + Catch + Infld ', data=MA).fit()
Val_Pre_lm.summary()

0,1,2,3
Dep. Variable:,lnSal,R-squared:,0.636
Model:,OLS,Adj. R-squared:,0.634
Method:,Least Squares,F-statistic:,220.7
Date:,"Thu, 29 Dec 2022",Prob (F-statistic):,1.22e-215
Time:,16:37:15,Log-Likelihood:,-1151.7
No. Observations:,1018,AIC:,2321.0
Df Residuals:,1009,BIC:,2366.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,10.1646,0.254,40.059,0.000,9.667,10.663
OBP,3.6469,1.069,3.413,0.001,1.550,5.744
SLG,2.8461,0.458,6.218,0.000,1.948,3.744
PA,0.0029,0.000,18.098,0.000,0.003,0.003
AVG,-2.7290,1.249,-2.185,0.029,-5.180,-0.278
Arb,1.1827,0.066,17.905,0.000,1.053,1.312
Free,1.7922,0.063,28.489,0.000,1.669,1.916
Catch,0.1189,0.075,1.583,0.114,-0.029,0.266
Infld,-0.0088,0.055,-0.160,0.873,-0.116,0.099

0,1,2,3
Omnibus:,15.252,Durbin-Watson:,1.454
Prob(Omnibus):,0.0,Jarque-Bera (JB):,25.403
Skew:,0.06,Prob(JB):,3.05e-06
Kurtosis:,3.764,Cond. No.,30200.0


In [60]:
# Here we just run each regression by taking a subset for the relevant year, but we don't print out the results. 

MB_Data_2000 = Master[(Master.SalYear == 2000)]
Val_2000_lm = smf.ols(formula = 'lnSal ~ OBP + SLG + PA + AVG+ Rookie+ Arb + Free + Catch + Infld + Outfld', data=MB_Data_2000).fit()
MB_Data_2001 = Master[(Master.SalYear == 2001)]
Val_2001_lm = smf.ols(formula = 'lnSal ~ OBP + SLG + PA + AVG+ Rookie+ Arb + Free + Catch + Infld + Outfld', data=MB_Data_2001).fit()
MB_Data_2002 = Master[(Master.SalYear == 2002)]
Val_2002_lm = smf.ols(formula = 'lnSal ~ OBP + SLG + PA + AVG+ Rookie+ Arb + Free + Catch + Infld + Outfld', data=MB_Data_2002).fit()
MB_Data_2003 = Master[(Master.SalYear == 2003)]
Val_2003_lm = smf.ols(formula = 'lnSal ~ OBP + SLG + PA + AVG+ Rookie+ Arb + Free + Catch + Infld + Outfld', data=MB_Data_2003).fit()
MB_Data_2004 = Master[(Master.SalYear == 2004)]
Val_2004_lm = smf.ols(formula = 'lnSal ~ OBP + SLG + PA + AVG+ Rookie+ Arb + Free + Catch + Infld + Outfld', data=MB_Data_2004).fit()

from statsmodels.iolib.summary2 import summary_col
Header = ['All years','2000-2003','2000','2001','2002','2003','2004']
Table_3 = summary_col([Val_All_lm,Val_Pre_lm,Val_2000_lm,Val_2001_lm,Val_2002_lm,Val_2003_lm,Val_2004_lm,],\
                      regressor_order=['OBP','SLG','PA','AVG', 'Rookie', 'Arb','Free','Catch','Infld', 'Outfld', 'Intercept'],stars=True, \
                      float_format="'%.3f'",model_names = Header)
print(Table_3)


           All years  2000-2003     2000       2001       2002       2003        2004   
----------------------------------------------------------------------------------------
OBP       '2.810'***  '3.718'*** '3.019'**  '0.772'    '0.821'    '3.043'*   '8.221'*** 
          ('0.744')   ('1.070')  ('1.468')  ('1.472')  ('1.846')  ('1.825')  ('1.883')  
SLG       '2.766'***  '2.910'*** '2.833'*** '3.437'*** '2.365'*** '2.211'**  '3.046'*** 
          ('0.335')   ('0.461')  ('0.670')  ('0.655')  ('0.765')  ('0.872')  ('0.844')  
PA        '0.003'***  '0.003'*** '0.002'*** '0.003'*** '0.003'*** '0.003'*** '0.003'*** 
          ('0.000')   ('0.000')  ('0.000')  ('0.000')  ('0.000')  ('0.000')  ('0.000')  
AVG       '-3.029'*** '-2.841'** '-1.883'   '-1.470'   '-0.409'   '-2.749'   '-8.808'***
          ('0.938')   ('1.253')  ('1.858')  ('2.001')  ('2.132')  ('2.194')  ('2.357')  
Rookie    '1.808'***  '1.778'*** '1.692'*** '1.807'*** '1.764'*** '1.800'*** '1.857'*** 
          ('0.052') 

In [61]:
#Uncomment this cell once the assignment is complete in order to export your Master dataset
#Master.to_csv("../Data/Master.csv")