# Data Analysis

In this notebook, we examine the data that we retrieved with various calculations and plots. 

In [1]:
import numpy as np
import pandas as pd
import requests
import plotly.offline as py
import matplotlib.pyplot as plt
from plotly.graph_objs import *
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.cluster import KMeans
from bs4 import BeautifulSoup
%matplotlib inline
pd.set_option("max_r", 15)
py.init_notebook_mode(connected=True)

# Predicting Wins

We wanted to see how the teams would have performed if the swaps we made had actually happened. The way to measure performance is with number of wins, which leaves us with the question: given various data (such as runs, at bats, strike outs, etc.), can we predict the number of wins a team would have had?

After research, we concluded that the Pythagorean expectation could give us a good estimate of how many wins a team would have.

Invented by Bill James, the Pythagorean expectation uses the basic formula: 

$${\displaystyle \mathrm {Win\ Ratio} ={\frac {{\text{runs scored}}^{2}}{{\text{runs scored}}^{2}+{\text{runs allowed}}^{2}}}={\frac {1}{1+({\text{runs allowed}}/{\text{runs scored}})^{2}}}}$$

When applying this formula for our data, we faced a limitation, where we were not able to account for differences in runs allowed with the new player swapped in. This is the case because runs allowed are dependent on the defense as a whole, not the individual players.

Before proceeding, we researched Sweeney and Jeter and discovered that both players are Golden Glove award-winners (strong defensively). We therefore decided that it is safe to assume that they make an equivalent defensive contribution. Thus we've concluded the best way to go about our analyses is to focus on changes in offensive performances.

In [2]:
# read in CSVs
dataNYY = pd.read_csv("nyy.csv")
dataKCR = pd.read_csv("kcr.csv")
nyy_links = pd.read_csv("nyy_links.csv")
kcr_links = pd.read_csv("kcr_links.csv")
jeter_in_kcr = pd.read_csv("jeter_in_KCR.csv")
sweeney_in_nyy = pd.read_csv("sweeney_in_NYY.csv")

In [3]:
# function that performs the pythagorean expectation
def calc_win_ratio(r, ra):
    return (r**2) / (r**2 + ra**2)

In [4]:
# Calculate the win ratio for each year.
runs_allowed_nyy = dataNYY.groupby("year")['runs_allowed'].mean()
runs_allowed_kcr = dataKCR.groupby("year")['runs_allowed'].mean()
runs_scored_nyy = dataNYY.groupby("year")['R'].sum()
runs_scored_kcr = dataKCR.groupby("year")['R'].sum()

# we multiplied by 162 because 162 games are played in the season
expected_wins_nyy = calc_win_ratio(runs_scored_nyy, runs_allowed_nyy) * 162
expected_wins_nyy = pd.DataFrame(expected_wins_nyy).reset_index()
expected_wins_kcr = calc_win_ratio(runs_scored_kcr, runs_allowed_kcr) * 162
expected_wins_kcr = pd.DataFrame(expected_wins_kcr).reset_index()

In [5]:
# we chose >= 1995 because thats when both players joined the league (and thats when they were supposed to be traded)
nyy_yw = nyy_links[["year","wins"]]
nyy_yw = nyy_yw[nyy_yw.year >= 1995]
kcr_yw = kcr_links[["year","wins"]]
kcr_yw = kcr_yw[kcr_yw.year >= 1995]

Now, we will examine the efficacy of the Pythagorean Expectation in predicting the number of wins. We will do so by plotting the wins that the Pythagorean Expectation predicted for each year as well as the actual wins for that year.

In [6]:
# Plotly: show how close Pythagorean Expectation is to real-life win results.
trace0 = Scatter(
    x = nyy_yw['year'],
    y = nyy_yw['wins'],
    name = "Actual Wins",
    line = dict(
        color = ('rgb(0,0,255)'),
    )
)
trace1 = Scatter(
    x = expected_wins_nyy['year'],
    y = expected_wins_nyy[0],
    name = "Predicted Wins",
    line = dict(
        color = ('rgb(255,0,0)')
    )
)

traces = [trace0,trace1]

layout = dict(
    title = "Efficacy of Pythagorean Expectation (NYY)",
    xaxis = dict(title = "Year"),
    yaxis = dict(title = "Number of Wins")
)

fig = dict(data = traces, layout = layout)
py.iplot(fig)

In [7]:
# Plotly: show how close Pythagorean Expectation is to real-life results for other team.
trace0 = Scatter(
    x = kcr_yw['year'],
    y = kcr_yw['wins'],
    name = "Actual Wins",
    line = dict(
        color = ('rgb(0,0,0)'),
    )
)
trace1 = Scatter(
    x = expected_wins_kcr['year'],
    y = expected_wins_kcr[0],
    name = "Predicted Wins",
    line = dict(
        color = ('rgb(51,204,255)')
    )
)

traces = [trace0,trace1]

layout = dict(
    title = "Efficacy of Pythagorean Expectation (KCR)",
    xaxis = dict(title = "Year"),
    yaxis = dict(title = "Number of Wins")
)

fig = dict(data = traces, layout = layout)
py.iplot(fig)

Examining the difference (residuals) in actual and predicted number of wins. The error in the Pythagorean expectation is also known as the Pythagorean Luck.

In [8]:
# Data on the error at each year.
diffs = abs(expected_wins_nyy[0] - nyy_yw['wins'])
expected_wins_nyy["diff"] = diffs
diffs = abs(expected_wins_kcr[0] - kcr_yw['wins'])
expected_wins_kcr['diff'] = diffs

trace0 = Bar(
    x = expected_wins_nyy['year'],
    y = expected_wins_nyy['diff'],
    name = 'NYY'
)

trace1 = Bar(
    x = expected_wins_kcr['year'],
    y = expected_wins_kcr['diff'],
    name = 'KCR',
    marker=dict(
        color = ('rgb(51,204,255)')
    )
)

traces = [trace0, trace1]
layout = Layout(
    title = "Error in Pythagorean Expectation (Pythagorean Luck)",
    barmode = "group",
    xaxis = dict(title = "Year"),
    yaxis = dict(title = "Difference in Actual & Predicted Number of Wins")
)

fig = Figure(data=traces, layout=layout)
py.iplot(fig)

Mean differences between actual and predicted number of wins for each team:

In [9]:
# Mean differences between actual and predicted number of wins for each team.
expected_wins_nyy['diff'].mean(), expected_wins_kcr['diff'].mean()

(8.719003371722819, 8.598454533799156)

In [10]:
jeter_in_kcr = jeter_in_kcr.sort_values(by="year")
sweeney_in_nyy = sweeney_in_nyy.sort_values(by="year")

Now that we have examined the accuracy of the Pythagorean Expectation on the actual data, we have concluded that it will provide relatively accurate data for predicting how the teams would have performed with the trades. 

In [11]:
# Mike Sweeney was on the Royals from 1995-2007.
dfj = jeter_in_kcr[jeter_in_kcr.year <= 2007]
dfs = sweeney_in_nyy[sweeney_in_nyy.year <= 2007]

In [12]:
# Calculate the win ratio for each year if the players were traded.
# Runs-allowed limitation applies here.
# When we swapped the players, each player's "runs_allowed" column still had the value from their old team
# therefore, we are not going to recalculate the "runs_allowed" during the swap, because we want each team
# to keep their original runs allowed despite the switch.
runs_allowed_dfj = dataKCR.groupby("year")['runs_allowed'].mean()
runs_allowed_dfs = dataNYY.groupby("year")['runs_allowed'].mean()
runs_scored_dfj = dfj.groupby("year")['R'].sum()
runs_scored_dfs = dfs.groupby("year")['R'].sum()

In [13]:
expected_wins_dfj = calc_win_ratio(runs_scored_dfj, runs_allowed_dfj) * 162
expected_wins_dfj = pd.DataFrame(expected_wins_dfj).reset_index()
expected_wins_dfs = calc_win_ratio(runs_scored_dfs, runs_allowed_dfs) * 162
expected_wins_dfs = pd.DataFrame(expected_wins_dfs).reset_index()

Now we will plot graphs of the performances (wins) of the teams with the trade in place and with the trade not in place. That we could visually examine how these teams may have performed if the trade were to have actually happened.

In [14]:
# Plot expected wins of KCR with Jeter vs. expected wins of KCR with Sweeney (their actual wins).
trace0 = Scatter(
    x = kcr_yw[kcr_yw['year'] <= 2007]['year'],
    y = kcr_yw[kcr_yw['year'] <= 2007]['wins'],
    name = "Wins with Sweeney (without trade)",
    line = dict(
        color = ('rgb(51,204,255)'),
    )
)
trace1 = Scatter(
    x = expected_wins_dfj[expected_wins_dfj['year'] <= 2007]['year'],
    y = expected_wins_dfj[expected_wins_dfj['year'] <= 2007][0],
    name = "Predicted Wins with Jeter (with trade)",
    line = dict(
        color = ('rgb(0,0,255)')
    )
)

traces = [trace0,trace1]

layout = dict(
    title = "KCR Wins With and Without The Trade",
    xaxis = dict(title = "Year"),
    yaxis = dict(title = "Number of Wins")
)

fig = dict(data = traces, layout = layout)
py.iplot(fig)

In [15]:
# Plot expected wins of NYY with Sweeney vs. expected wins of NYY with Jeter (their actual wins).
trace0 = Scatter(
    x = nyy_yw[nyy_yw['year'] <= 2007]['year'],
    y = nyy_yw[nyy_yw['year'] <= 2007]['wins'],
    name = "Wins with Jeter (without trade)",
    line = dict(
        color = ('rgb(0,0,255)'),
    )
)
trace1 = Scatter(
    x = expected_wins_dfs[expected_wins_dfs['year'] <= 2007]['year'],
    y = expected_wins_dfs[expected_wins_dfs['year'] <= 2007][0],
    name = "Predicted Wins with Sweeney (with trade)",
    line = dict(
        color = ('rgb(51,204,255)')
    )
)

traces = [trace0,trace1]

layout = dict(
    title = "NYY Wins With and Without The Trade",
    xaxis = dict(title = "Year"),
    yaxis = dict(title = "Number of Wins")
)

fig = dict(data = traces, layout = layout)
py.iplot(fig)

In [16]:
# export to csvs
expected_wins_dfj.to_csv("expected_wins_dfj.csv", index=False)
expected_wins_dfs.to_csv("expected_wins_dfs.csv", index=False)
nyy_yw.to_csv("nyy_yw.csv", index=False)
kcr_yw.to_csv("kcr_yw.csv", index=False)