# ERA

In [1]:
# Import dependencies.
import plotly.plotly as py
import plotly.graph_objs as go

import numpy as np
import pandas as pd
import statistics

In [2]:
import plotly
plotly.tools.set_credentials_file(username='', api_key='')

### Open up the 1905 and 1969 csv files and inspect.

In [3]:
# Open up the csv for World Series Winners ERA from 1905 onwards.
df2a = pd.read_csv("../clean_data/wswin_1905ERA.csv")
df2a = df2a.drop("Unnamed: 0", axis=1)
df2a

Unnamed: 0,ERA,Count,Frequency
0,2.4,2,0.018182
1,2.1,3,0.027273
2,1.7,1,0.009091
3,1.8,1,0.009091
4,3.0,9,0.081818
5,2.8,3,0.027273
6,3.2,7,0.063636
7,2.7,4,0.036364
8,2.5,1,0.009091
9,2.2,2,0.018182


In [4]:
# Open up the csv for all teams ERA from 1905 onwards.
df2b = pd.read_csv("../clean_data/nowswin_1905ERA.csv")
df2b = df2b.drop("Unnamed: 0", axis=1)
df2b

Unnamed: 0,ERA,Count,Frequency
0,2.8,33,0.014973
1,3.8,163,0.073956
2,3.5,87,0.039474
3,2.0,5,0.002269
4,3.0,43,0.01951
5,2.9,34,0.015426
6,2.2,11,0.004991
7,2.7,26,0.011797
8,3.6,138,0.062613
9,3.4,113,0.05127


In [5]:
# Open up the csv for World Series Winners ERA from 1969 onwards.
df3a = pd.read_csv("../clean_data/wswin_1969ERA.csv")
df3a = df3a.drop("Unnamed: 0", axis=1)
df3a

Unnamed: 0,ERA,Count,Frequency
0,3.0,4,0.086957
1,3.2,2,0.043478
2,3.3,2,0.043478
3,2.6,1,0.021739
4,3.4,7,0.152174
5,3.5,4,0.086957
6,3.6,3,0.065217
7,3.1,2,0.043478
8,4.6,2,0.043478
9,3.7,5,0.108696


In [6]:
# Open up the csv for all teams ERA from 1969 onwards.
df3b = pd.read_csv("../clean_data/nowswin_1969ERA.csv")
df3b = df3b.drop("Unnamed: 0", axis=1)
df3b

Unnamed: 0,ERA,Count,Frequency
0,3.5,50,0.041186
1,2.8,4,0.003295
2,3.9,70,0.057661
3,4.2,82,0.067545
4,3.3,38,0.031301
5,4.1,71,0.058484
6,3.6,84,0.069193
7,3.7,78,0.06425
8,3.1,20,0.016474
9,3.2,31,0.025535


### Pull data into lists from tables.

In [7]:
ERA2a = list(df2a["ERA"])
frequency2a = list(df2a["Count"])

In [8]:
ERA2b = list(df2b["ERA"])
frequency2b = list(df2b["Count"])

In [9]:
ERA3a = list(df3a["ERA"])
frequency3a = list(df3a["Count"])

In [10]:
ERA3b = list(df3b["ERA"])
frequency3b = list(df3b["Count"])

### Graph the data.

In [11]:
trace1 = go.Bar(
    x = ERA2a,
    y = frequency2a,
    marker = dict(
        color = "rgb(28, 74, 175)"
    ),
    name = "World Series Winners"
)

trace2 = go.Bar(
    x = ERA2b, 
    y = frequency2b,
    marker = dict(
        color = "rgb(192, 57, 43)"
    ),
    name = "Non-World Series Winning Teams"
)

data = [trace1, trace2]
layout = go.Layout(
    xaxis = {"title": "ERA"},
    yaxis = {"title": "Frequency"},
    barmode = "group",
    title = "World Series ERA (1905-2015)"
)

fig = go.Figure(data = data, layout = layout)
py.iplot(fig, filename="1905ERA")

    # Of the 1905 data, both groups have a discernable normal distribution.

In [12]:
trace1 = go.Bar(
    x = ERA3a,
    y = frequency3a,
    marker = dict(
        color = "rgb(28, 74, 175)"
    ),
    name = "World Series Winners"
)

trace2 = go.Bar(
    x = ERA3b, 
    y = frequency3b,
    marker = dict(
        color = "rgb(192, 57, 43)"
    ),
    name = "Non-World Series Winning Teams"
)

data = [trace1, trace2]
layout = go.Layout(
    xaxis = {"title": "ERA"},
    yaxis = {"title": "Frequency"},
    barmode = "group",
    title = "World Series ERA (1969-2015)"
)

fig = go.Figure(data = data, layout = layout)
py.iplot(fig, filename="1969ERA")

    # Both data sets have a somewhat normal distribution.  
    # Interestingly, the data for the World Series winners appears to be more spread compared to that of the 1905 dataset.

## Perform basic stats analysis.
    # Calculate mean and standard deviation of ERAs of the World Series winning population.
    # Calculate mean and standard deviation of ERAs of the World Series non-winning population.
    # Calculate confidence interval.

### 1905.

In [13]:
# Calculate mean of ERAs of World Series winning teams for 1905 onwards.
wswin_mean1905 = round((np.mean(df2a["ERA"])), 2)
wswin_mean1905

3.25

In [14]:
# Calculate standard deviation of World Series winning teams for 1905 onwards.
wswin_dev1905 = round((statistics.stdev(df2a["ERA"])), 2)
wswin_dev1905

0.87

In [34]:
# Calculate number of entries in World Series winning teams for 1905 onwards.
wswin_len1905 = df2a["Count"].sum()
wswin_len1905

110

In [16]:
# Calculate mean of ERAs of non-World Series winning teams for 1905 onwards.
nowswin_mean1905 = round((np.mean(df2b["ERA"])), 2)
nowswin_mean1905

4.07

In [17]:
# Caluclate standard deviation of non-World Series winning teams for 1905 onwards.
nowswin_dev1905 = round((statistics.stdev(df2b["ERA"])), 2)
nowswin_dev1905

1.38

In [33]:
# Calculate total number of non-World Series winning teams for 1905 onwards.
nowswin_len1905 = df2b["Count"].sum()
nowswin_len1905

2204

### Determine significance of 1905 winners and non-winners.

In [35]:
# Calculate variance for each sample group.
wswin_variance1905 = (wswin_dev1905)**2
nowswin_variance1905 = (nowswin_dev1905)**2
print (wswin_variance1905)
print (nowswin_variance1905)

0.7569
1.9043999999999996


    # The hypothesis I'm testing is that World Series winners tend to have lower ERAs than non-World Series teams.
    # That means:
        # null hypothesis is:  u1 < u2....  or u1 - u2 >= 0
        # hypothesis to test is:  u1 - u2 < 0
    # This stands to be a left tail test.

In [36]:
# Calculate Z.
num_1905 = wswin_mean1905 - nowswin_mean1905
dem_1905 = ((wswin_variance1905/wswin_len1905) + (nowswin_variance1905/nowswin_len1905)) ** 0.5
Z_1905 = num_1905/dem_1905
Z_1905

-9.317595620609078

### Look up the Z value on table.  P-Value is < 0.00001
### This means that I can say that if the confidence level is 99.99% (alpha = 0.0001) that World Series teams definitely has a lower ERA than all other teams.... because p<= alpha (reject null hypothesis; accept test hypothesis).

### 1969.

In [22]:
# Calculate mean of ERAs of World Series winning teams for 1969 onwards.
wswin_mean1969 = round((np.mean(df3a["ERA"])), 2)
wswin_mean1969

3.76

In [23]:
# Calculate standard deviation of World Series winning teams for 1969 onwards.
wswin_dev1969 = round((statistics.stdev(df3a["ERA"])), 2)
wswin_dev1969

0.6

In [38]:
# Calculate number of World Series winning teams for 1969 onwards.
wswin_len1969 = df3a["Count"].sum()
wswin_len1969

46

In [24]:
# Calculate mean of ERAs of non-World Series winning teams for 1969 onwards.
nowswin_mean1969 = round((np.mean(df3b["ERA"])), 2)
nowswin_mean1969

4.31

In [25]:
# Calculate standard deviation of ERAs of non-World Series winning teams for 1969 onwards.
nowswin_dev1969 = round((statistics.stdev(df3b["ERA"])), 2)
nowswin_dev1969

1.06

In [39]:
# Calculate number of non-World Series winning teams for 1969 onwards.
nowswin_len1969 = df3b["Count"].sum()
nowswin_len1969

1214

### Determine significance of 1969 winners and non-winners.

In [40]:
# Calculate variance for each sample group.
wswin_variance1969 = (wswin_dev1969)**2
nowswin_variance1969 = (nowswin_dev1969)**2
print (wswin_variance1969)
print (nowswin_variance1969)

0.36
1.1236000000000002


    # The hypothesis I'm testing is that World Series winners tend to have lower ERAs than non-World Series teams.
    # That means:
        # null hypothesis is:  u1 < u2....  or u1 - u2 >= 0
        # hypothesis to test is:  u1 - u2 < 0
    # This stands to be a left tail test.

In [41]:
# Calculate Z.
num_1969 = wswin_mean1969 - nowswin_mean1969
dem_1969 = ((wswin_variance1969/wswin_len1969) + (nowswin_variance1969/nowswin_len1969)) ** 0.5
Z_1969 = num_1969/dem_1969
Z_1969

-5.879202302743572

### Look up the Z value on table.  P-Value is < 0.00001
### This means that I can say that if the confidence level is 99.99% (alpha = 0.0001) that World Series teams definitely has a lower ERA than all other teams.... because p<= alpha (reject null hypothesis; accept test hypothesis).

# CONCLUSION:
### -- ERAs from 1905 onwards, or from 1969 onwards, is a good indicator of being a World Series winning team.
### -- This is one of the factors that lead to a winning team.  However, what is pitching if your offense is bad?

### -- Next step: Check out runs per team.