# Web Scraping to Obtain Team and Player Data

In this notebook, we retrieved data from an html website on two teams: the New York Yankees and the Kansas City Royals.

Not only did we acquire data on the overall team statistics, but for every year that we looked at, we would perform a second webscrape in order to get a deeper look at the data.

The second webscrape gave us information and statistics on each specific player of the team.

In [1]:
import numpy as np
import pandas as pd
import requests
import plotly.offline as py
import matplotlib.pyplot as plt
from plotly.graph_objs import *
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.cluster import KMeans
from bs4 import BeautifulSoup
pd.set_option("max_r", 15)
py.init_notebook_mode(connected=True)

In [2]:
# website with overall team stats
req = requests.get("http://www.baseball-reference.com/teams/NYY/index.shtml")
soup = BeautifulSoup(req.text, "html.parser")

In [3]:
tbody = soup.find("tbody")

In [4]:
# Get all links to NYY's yearly rosters & player stats.
url = "http://www.baseball-reference.com"
year = 2017
nyy_data = {
    "year": [],
    "link": [],
    "wins": [],
    'runs': [],
    'runs_allowed': []
}
for row in tbody.find_all("tr"):
    nyy_data["year"].append(year)
    year = year - 1
    for td in row.find_all("td"):
        statname = td.get("data-stat")
        if statname == "team_name":
            link = url + td.find('a')["href"]
            nyy_data["link"].append(link)
        elif statname == 'W':
            wins = int(td.text)
            nyy_data['wins'].append(wins)
        elif statname == 'R':
            runs = int(td.text)
            nyy_data['runs'].append(runs)
        elif statname == 'RA':
            runs_allowed = int(td.text)
            nyy_data['runs_allowed'].append(runs_allowed)

In [5]:
# clean data
nyy = pd.DataFrame(nyy_data)
nyy = nyy.drop(0, axis=0)
nyy.head()

Unnamed: 0,link,runs,runs_allowed,wins,year
1,http://www.baseball-reference.com/teams/NYY/20...,680,702,84,2016
2,http://www.baseball-reference.com/teams/NYY/20...,764,698,87,2015
3,http://www.baseball-reference.com/teams/NYY/20...,633,664,84,2014
4,http://www.baseball-reference.com/teams/NYY/20...,650,671,85,2013
5,http://www.baseball-reference.com/teams/NYY/20...,804,668,95,2012


In [6]:
# repeat scraping for the second team
req = requests.get("http://www.baseball-reference.com/teams/KCR/index.shtml")
soup = BeautifulSoup(req.text, "html.parser")

In [7]:
tbody = soup.find("tbody")

In [8]:
# Get all links to KCR's yearly rosters & player stats.
url = "http://www.baseball-reference.com"
year = 2017
kcr_data = {
    "year": [],
    "link": [],
    "wins": [],
    'runs': [],
    'runs_allowed': []
}
for row in tbody.find_all("tr"):
    kcr_data["year"].append(year)
    year = year - 1
    for td in row.find_all("td"):
        statname = td.get("data-stat")
        if statname == "team_name":
            link = url + td.find('a')["href"]
            kcr_data["link"].append(link)
        elif statname == 'W':
            wins = int(td.text)
            kcr_data['wins'].append(wins)
        elif statname == 'R':
            runs = int(td.text)
            kcr_data['runs'].append(runs)
        elif statname == 'RA':
            runs_allowed = int(td.text)
            kcr_data['runs_allowed'].append(runs_allowed)

In [9]:
# clean up data
kcr = pd.DataFrame(kcr_data)
kcr = kcr.drop(0, axis=0)
kcr.head()

Unnamed: 0,link,runs,runs_allowed,wins,year
1,http://www.baseball-reference.com/teams/KCR/20...,675,712,81,2016
2,http://www.baseball-reference.com/teams/KCR/20...,724,641,95,2015
3,http://www.baseball-reference.com/teams/KCR/20...,651,624,89,2014
4,http://www.baseball-reference.com/teams/KCR/20...,648,601,86,2013
5,http://www.baseball-reference.com/teams/KCR/20...,676,746,72,2012


In [10]:
# Get NYY data on all players for all necessary years.
runs_all = list(nyy.runs_allowed.values)
dic = {
    "year": [],
    "runs_allowed": []
}
year = 2017
for i in range(1, 23):
    year = year - 1
    ra = runs_all[i-1]
    req = requests.get(nyy.loc[i, "link"])
    soup = BeautifulSoup(req.text, "html.parser")
    tbody = soup.find("tbody")
    for td in tbody.find_all("td"):
        statname = td.get("data-stat")
        if statname == 'G':
            dic["year"].append(year)
            dic["runs_allowed"].append(ra)
        if statname not in dic.keys():
            dic[statname] = []
        dic[statname].append(td.text)

In [11]:
# data had extra symbols we needed to remove
def clean_name(name):
        return name.strip("*#")

dataNYY = pd.DataFrame(dic)
dataNYY["player"] = dataNYY["player"].apply(clean_name)
dataNYY.head()

Unnamed: 0,2B,3B,AB,BB,CS,G,GIDP,H,HBP,HR,...,age,batting_avg,onbase_perc,onbase_plus_slugging,onbase_plus_slugging_plus,player,pos,runs_allowed,slugging_perc,year
0,13,0,429,54,0,130,15,104,7,20,...,32,0.242,0.335,0.748,97,Brian McCann,C,702,0.413,2016
1,16,0,387,47,0,116,7,79,2,15,...,36,0.204,0.292,0.654,73,Mark Teixeira,1B,702,0.362,2016
2,29,1,577,24,0,151,15,156,3,21,...,26,0.27,0.3,0.734,92,Starlin Castro,2B,702,0.433,2016
3,32,2,562,19,1,153,9,155,6,20,...,26,0.276,0.304,0.751,96,Didi Gregorius,SS,702,0.447,2016
4,18,1,467,51,2,140,7,118,6,14,...,32,0.253,0.331,0.716,90,Chase Headley,3B,702,0.385,2016


In [12]:
# Get KCR data on all players for all necessary years.
runs_all = list(kcr.runs_allowed.values)
dic = {
    "year": [],
    "runs_allowed": []
}
year = 2017
for i in range(1, 23):
    year = year - 1
    ra = runs_all[i-1]
    req = requests.get(kcr.loc[i, "link"])
    soup = BeautifulSoup(req.text, "html.parser")
    tbody = soup.find("tbody")
    for td in tbody.find_all("td"):
        statname = td.get("data-stat")
        if statname == 'G':
            dic["year"].append(year)
            dic["runs_allowed"].append(ra)
        if statname not in dic.keys():
            dic[statname] = []
        dic[statname].append(td.text)

In [13]:
dataKCR = pd.DataFrame(dic)
dataKCR["player"] = dataKCR["player"].apply(clean_name)
dataKCR.head()

Unnamed: 0,2B,3B,AB,BB,CS,G,GIDP,H,HBP,HR,...,age,batting_avg,onbase_perc,onbase_plus_slugging,onbase_plus_slugging_plus,player,pos,runs_allowed,slugging_perc,year
0,28,2,514,22,0,139,12,127,8,22,...,26,0.247,0.288,0.725,90,Salvador Perez,C,712,0.438,2016
1,24,1,605,57,3,158,18,161,1,25,...,26,0.266,0.328,0.761,102,Eric Hosmer,1B,712,0.433,2016
2,22,3,311,19,3,81,1,88,0,2,...,27,0.283,0.323,0.716,91,Whit Merrifield,2B,712,0.392,2016
3,24,6,637,27,4,162,16,166,3,7,...,29,0.261,0.292,0.642,71,Alcides Escobar,SS,712,0.35,2016
4,28,1,475,32,0,128,14,130,0,12,...,23,0.274,0.318,0.731,94,Cheslor Cuthbert,3B,712,0.413,2016


In [15]:
# Convert to numeric values.
to_numbers = dataNYY.columns.values
to_numbers = np.delete(to_numbers, [24,25])
for i in range(0, len(to_numbers)):
    dataNYY[to_numbers[i]] = pd.to_numeric(dataNYY[to_numbers[i]])
dataNYY.head()

Unnamed: 0,2B,3B,AB,BB,CS,G,GIDP,H,HBP,HR,...,age,batting_avg,onbase_perc,onbase_plus_slugging,onbase_plus_slugging_plus,player,pos,runs_allowed,slugging_perc,year
0,13,0,429,54,0,130,15,104,7,20,...,32,0.242,0.335,0.748,97.0,Brian McCann,C,702,0.413,2016
1,16,0,387,47,0,116,7,79,2,15,...,36,0.204,0.292,0.654,73.0,Mark Teixeira,1B,702,0.362,2016
2,29,1,577,24,0,151,15,156,3,21,...,26,0.27,0.3,0.734,92.0,Starlin Castro,2B,702,0.433,2016
3,32,2,562,19,1,153,9,155,6,20,...,26,0.276,0.304,0.751,96.0,Didi Gregorius,SS,702,0.447,2016
4,18,1,467,51,2,140,7,118,6,14,...,32,0.253,0.331,0.716,90.0,Chase Headley,3B,702,0.385,2016


In [16]:
dataNYY.dtypes

2B                             int64
3B                             int64
AB                             int64
BB                             int64
CS                             int64
G                              int64
GIDP                           int64
                              ...   
onbase_plus_slugging         float64
onbase_plus_slugging_plus    float64
player                        object
pos                           object
runs_allowed                   int64
slugging_perc                float64
year                           int64
dtype: object

In [17]:
# Convert to numeric values.
to_numbers = dataKCR.columns.values
to_numbers = np.delete(to_numbers, [24,25])
for i in range(0, len(to_numbers)):
    dataKCR[to_numbers[i]] = pd.to_numeric(dataKCR[to_numbers[i]])
dataKCR.head()

Unnamed: 0,2B,3B,AB,BB,CS,G,GIDP,H,HBP,HR,...,age,batting_avg,onbase_perc,onbase_plus_slugging,onbase_plus_slugging_plus,player,pos,runs_allowed,slugging_perc,year
0,28,2,514,22,0,139,12,127,8,22,...,26,0.247,0.288,0.725,90.0,Salvador Perez,C,712,0.438,2016
1,24,1,605,57,3,158,18,161,1,25,...,26,0.266,0.328,0.761,102.0,Eric Hosmer,1B,712,0.433,2016
2,22,3,311,19,3,81,1,88,0,2,...,27,0.283,0.323,0.716,91.0,Whit Merrifield,2B,712,0.392,2016
3,24,6,637,27,4,162,16,166,3,7,...,29,0.261,0.292,0.642,71.0,Alcides Escobar,SS,712,0.35,2016
4,28,1,475,32,0,128,14,130,0,12,...,23,0.274,0.318,0.731,94.0,Cheslor Cuthbert,3B,712,0.413,2016


In [18]:
dataKCR.dtypes

2B                             int64
3B                             int64
AB                             int64
BB                             int64
CS                             int64
G                              int64
GIDP                           int64
                              ...   
onbase_plus_slugging         float64
onbase_plus_slugging_plus    float64
player                        object
pos                           object
runs_allowed                   int64
slugging_perc                float64
year                           int64
dtype: object

In [19]:
# AL pitchers don't bat unless they're visiting an NL team (interleague play).
dataNYY = dataNYY.fillna(0)
dataKCR = dataKCR.fillna(0)

# Player Swap: Derek Jeter for Mike Sweeney

Now that we have team and player data over the course of several years for both the New York Yankees and the Kansas City Royals, we want to perform a single player swap to see how these teams would have performed if the swap had actually happened.

To do that, we go into our player data and remove Sweeney from his true team, the Kansas City Royals, and instead place Derek Jeter's data in with the KCR players. We do the same thing for the New York Yankees, removing Derek Jeter's data from them and replacing it with Mike Sweeney's.

In [20]:
# we needed data less than or equal to 2007 because in 2007, Mike Sweeney left his team
df = dataNYY[dataNYY["year"] <= 2007]
# get all of Jeters statistics
jeter = df[df["player"] == "Derek Jeter"]
jeter.head()

Unnamed: 0,2B,3B,AB,BB,CS,G,GIDP,H,HBP,HR,...,age,batting_avg,onbase_perc,onbase_plus_slugging,onbase_plus_slugging_plus,player,pos,runs_allowed,slugging_perc,year
458,39,4,639,56,8,156,21,206,14,12,...,33,0.322,0.388,0.84,121.0,Derek Jeter,SS,777,0.452,2007
507,39,3,623,69,5,154,13,214,12,14,...,32,0.343,0.417,0.9,132.0,Derek Jeter,SS,767,0.483,2006
556,25,5,654,77,5,159,15,202,11,19,...,31,0.309,0.389,0.839,125.0,Derek Jeter,SS,789,0.45,2005
607,44,1,643,46,4,154,19,188,14,23,...,30,0.292,0.352,0.823,114.0,Derek Jeter,SS,808,0.471,2004
650,25,3,482,43,5,119,10,156,13,10,...,29,0.324,0.393,0.844,125.0,Derek Jeter,SS,716,0.45,2003


In [21]:
# like before, we want to swap these two players so we can only go up to 2007 because we want data for both in our swap
df = dataKCR[dataKCR["year"] <= 2007]
# retrieve all of Sweeney's statistics
sweeney = df[df["player"] == "Mike Sweeney"]
sweeney.head()

Unnamed: 0,2B,3B,AB,BB,CS,G,GIDP,H,HBP,HR,...,age,batting_avg,onbase_perc,onbase_plus_slugging,onbase_plus_slugging_plus,player,pos,runs_allowed,slugging_perc,year
408,15,1,265,17,0,74,9,69,5,7,...,33,0.26,0.315,0.719,88.0,Mike Sweeney,DH,778,0.404,2007
451,15,0,217,28,0,60,5,56,4,8,...,32,0.258,0.349,0.787,102.0,Mike Sweeney,DH,971,0.438,2006
506,39,0,470,33,0,122,16,141,4,21,...,31,0.3,0.347,0.864,127.0,Mike Sweeney,DH,935,0.517,2005
552,23,0,411,33,2,106,7,118,6,22,...,30,0.287,0.347,0.851,118.0,Mike Sweeney,DH,905,0.504,2004
610,18,1,392,64,2,108,13,115,2,16,...,29,0.293,0.391,0.858,120.0,Mike Sweeney,DH,867,0.467,2003


In [22]:
# remove Derek Jeter from his real team (the New York Yankees)
no_jeter = dataNYY[dataNYY["player"] != "Derek Jeter"]
# add Sweeney into the NYY player stats (perform the swap)
sweeney_in_NYY = pd.concat([no_jeter, sweeney], axis=0)
sweeney_in_NYY.groupby("year")['R'].sum()

year
1995    745
1996    790
1997    805
1998    870
1999    867
2000    857
2001    791
       ... 
2010    748
2011    783
2012    705
2013    642
2014    586
2015    764
2016    680
Name: R, dtype: int64

In [23]:
# remove Sweeney from his real team the Kansas City Royals
no_sweeney = dataKCR[dataKCR["player"] != "Mike Sweeney"]
# add Jeter into the KCR player stats
jeter_in_KCR = pd.concat([no_sweeney, jeter], axis=0)
jeter_in_KCR.groupby("year")['R'].sum()

year
1995    633
1996    827
1997    833
1998    809
1999    889
2000    893
2001    742
       ... 
2010    676
2011    730
2012    676
2013    648
2014    651
2015    724
2016    675
Name: R, dtype: int64

In [24]:
# create CSVs for later use
sweeney_in_NYY.to_csv("sweeney_in_NYY.csv", index=False)
jeter_in_KCR.to_csv("jeter_in_KCR.csv", index=False)
dataNYY.to_csv("nyy.csv", index=False)
dataKCR.to_csv("kcr.csv", index=False)

In [25]:
nyy.to_csv("nyy_links.csv", index=False)
kcr.to_csv("kcr_links.csv", index=False)