In [1]:
import pandas as pd
import numpy as np
from numpy.random import (
    beta, binomial
)
import scipy.stats as stats
from IPython.display import display
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)
import plotly.express as px
import matplotlib.pyplot as plt


IPython.utils.traitlets has moved to a top-level traitlets package.



In [2]:
# Load the three tables used below. Each one is indexed by `playerID` so that
# the later index-based filtering, grouping and joining line up on the player.
Batting = pd.read_csv(
    'data/baseballdatabank-master/core/Batting.csv',
    index_col=['playerID'],
)
Pitching = pd.read_csv(
    'data/baseballdatabank-master/core/Pitching.csv',
    index_col=['playerID'],
)
Master = pd.read_csv(
    'data/baseballdatabank-master/core/Master.csv',
    index_col='playerID',
)

In [3]:
# Preview the first five batting rows (one row per player / year / stint).
Batting.head(n=5)

Unnamed: 0_level_0,yearID,stint,teamID,lgID,G,AB,R,H,2B,3B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
abercda01,1871,1,TRO,,1,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,,,,
addybo01,1871,1,RC1,,25,118.0,30.0,32.0,6.0,0.0,...,13.0,8.0,1.0,4.0,0.0,,,,,
allisar01,1871,1,CL1,,29,137.0,28.0,40.0,4.0,5.0,...,19.0,3.0,1.0,2.0,5.0,,,,,
allisdo01,1871,1,WS3,,27,133.0,28.0,44.0,10.0,2.0,...,27.0,1.0,1.0,0.0,2.0,,,,,
ansonca01,1871,1,RC1,,25,120.0,29.0,39.0,11.0,3.0,...,16.0,6.0,2.0,2.0,1.0,,,,,


In [4]:
# Preview the first five pitching rows.
Pitching.head(n=5)

Unnamed: 0_level_0,yearID,stint,teamID,lgID,W,L,G,GS,CG,SHO,...,IBB,WP,HBP,BK,BFP,GF,R,SH,SF,GIDP
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bechtge01,1871,1,PH1,,1,2,3,3,2,0,...,,,,0,,,42,,,
brainas01,1871,1,WS3,,12,15,30,30,30,0,...,,,,0,,,292,,,
fergubo01,1871,1,NY2,,0,0,1,0,0,0,...,,,,0,,,9,,,
fishech01,1871,1,RC1,,4,16,24,24,22,1,...,,,,0,,,257,,,
fleetfr01,1871,1,NY2,,0,1,1,1,1,0,...,,,,0,,,21,,,


In [5]:
# Drop every player who also appears in the Pitching table, so only
# non-pitchers remain. Re-running this cell is harmless: filtering an already
# filtered frame removes nothing further.
Batting = Batting[~Batting.index.isin(Pitching.index)]

# Career totals per player: keep only rows with at least one at-bat, sum hits
# (H) and at-bats (AB) over all of a player's rows, then derive the raw
# batting average H / AB.
# The string alias 'sum' is used instead of `np.sum`: passing the NumPy
# callable to groupby.agg is deprecated in recent pandas and emits a
# FutureWarning, while the alias yields the same result.
career = (
    Batting[Batting["AB"] > 0]
    .groupby(level=0)  # level 0 of the index is playerID
    .agg({'H': 'sum', 'AB': 'sum'})
    .assign(average=lambda r: r['H'] / r['AB'])
)
career.head()

Unnamed: 0_level_0,H,AB,average
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
aaronha01,3771.0,12364.0,0.304998
aaronto01,216.0,944.0,0.228814
abadan01,2.0,21.0,0.095238
abadijo01,11.0,49.0,0.22449
abbated01,772.0,3044.0,0.253614


In [6]:
# Attach a human-readable name to each career row. `join` aligns on the index
# (playerID in both frames); afterwards only the display name and the batting
# columns are kept, in this order.
career = (
    career
    .join(Master)
    .assign(name=lambda joined: joined['nameFirst'] + ' ' + joined['nameLast'])
    .loc[:, ['name', 'H', 'AB', 'average']]
)
career.head()

Unnamed: 0_level_0,name,H,AB,average
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
aaronha01,Hank Aaron,3771.0,12364.0,0.304998
aaronto01,Tommie Aaron,216.0,944.0,0.228814
abadan01,Andy Abad,2.0,21.0,0.095238
abadijo01,John Abadie,11.0,49.0,0.22449
abbated01,Ed Abbaticchio,772.0,3044.0,0.253614


In [7]:
# Distribution of raw career batting averages across all players.
# The x-axis is clipped to [0.15, 0.375] so the bulk of the distribution is
# visible rather than the extreme averages outside that window.
fig = (
    px.histogram(career, x="average", nbins=100)
    .update_layout(xaxis_title_text='average')
    .update_xaxes(range=[0.15, 0.375])
)
fig.show()

In [8]:
# Estimate the prior from players with more than 500 career at-bats only.
career_500 = career.loc[career['AB'] > 500]

# Fit a beta distribution to those averages. `floc=0` and `fscale=1` hold the
# location and scale fixed, so only the two shape parameters are estimated.
alpha0, beta0, loc0, scale0 = stats.beta.fit(
    career_500['average'],
    floc=0,
    fscale=1,
)

print(alpha0, beta0, loc0, scale0)

79.00652097208157 226.11806031430336 0 1


In [9]:
# Visual check of the fitted prior: draw as many samples from
# Beta(alpha0, beta0) as there are players in `career_500` and overlay their
# histogram on the observed averages.
# A seeded generator is used so the figure is identical on every
# Restart & Run All; the seed value itself is arbitrary.
rng = np.random.default_rng(0)
fitted = rng.beta(alpha0, beta0, size=len(career_500))

fig = go.Figure()
# With barmode='overlay' the traces are drawn on top of each other, so both
# are made semi-transparent; otherwise the second trace hides the first.
fig.add_trace(go.Histogram(x=fitted, name='Fitted', opacity=0.6))
fig.add_trace(go.Histogram(x=career_500['average'], name='True', opacity=0.6))
fig.update_layout(
    xaxis_title_text='average',
    barmode='overlay'
)
fig.show()

In [10]:
# Shrunken estimate of each player's average: add the prior's alpha0 to the
# hits and (alpha0 + beta0) to the at-bats before dividing.
# NOTE(review): despite its name, `career_500` is rebound here to a frame built
# from `career` (every player), not from the AB > 500 subset it held before.
# Re-running earlier cells that read `career_500` after this one would see the
# unfiltered frame -- consider a distinct name if that is not intended.
career_500 = career.assign(
    estimated_average=lambda players: (
        (players['H'] + alpha0) / (players['AB'] + alpha0 + beta0)
    )
)
career_500.head()

Unnamed: 0_level_0,name,H,AB,average,estimated_average
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
aaronha01,Hank Aaron,3771.0,12364.0,0.304998,0.303889
aaronto01,Tommie Aaron,216.0,944.0,0.228814,0.236171
abadan01,Andy Abad,2.0,21.0,0.095238,0.248391
abadijo01,John Abadie,11.0,49.0,0.22449,0.254166
abbated01,Ed Abbaticchio,772.0,3044.0,0.253614,0.254098


In [11]:
# Mean of the fitted Beta(alpha0, beta0) prior; drawn as a reference line.
prior = alpha0 / (alpha0 + beta0)

# Raw career average (x) against the shrunken estimate (y), one marker per
# player.
data = [
    go.Scatter(x=career_500['average'], y=career_500['estimated_average'],
        mode='markers',
    )
]
fig = go.Figure(data=data)
# Label both axes so the figure can be read on its own.
fig.update_layout(
    xaxis_title_text='average',
    yaxis_title_text='estimated_average',
)
fig.update_yaxes(range=[0.15, 0.4])
# Horizontal dash-dot line at the prior mean, spanning x from 0 to 1.
fig.add_shape(type='line',
    x0=0, x1=1, y0=prior, y1=prior,
    line=dict(
        color="red",
        width=1,
        dash="dashdot",
    )
)
fig.show()