## Scrape Batter Data

This notebook scrapes batting data from Baseball Reference. Beautiful Soup is used to extract the specific information that is needed. The data for both American League and National League teams is combined and written to a CSV file that can be used for further analysis.

In [None]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from urllib.request import urlopen
from bs4 import Comment
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

### Get All AL Players Batter Data

In [None]:
# Request the 2020 American League standard-batting page and parse its HTML
# with the built-in 'html.parser' backend.
url = "https://www.baseball-reference.com/leagues/AL/2020-standard-batting.shtml"
response = requests.get(url=url)
soup = BeautifulSoup(markup=response.text, features='html.parser')

In [None]:
# Collect the player IDs listed on the AL standard-batting page.
# The table rows of interest are read from inside HTML comments, so every
# comment node is re-parsed as its own document and scanned for <tr> rows.
ids = []
# Use the response as a context manager so the HTTP connection is closed
# as soon as the page has been read (it was previously left open).
with urlopen("https://www.baseball-reference.com/leagues/AL/2020-standard-batting.shtml") as content:
    soup = BeautifulSoup(content.read(), "lxml")
for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
    sauce = BeautifulSoup(comment, "lxml")
    for tags in sauce.find_all('tr'):
        # One entry per row: a 0- or 1-element list holding the
        # "data-append-csv" attribute of the row's first <td> (None when the
        # attribute is absent). Empty lists come from rows without <td> cells.
        name = [item.get("data-append-csv") for item in tags.find_all("td")[:1]]
        ids.append(name)

In [None]:
# Flatten the per-row lists into unique IDs, ignoring rows that produced no
# entry. A None ID can still be present; later cells skip it explicitly.
cleaned_ids = list({entry[0] for entry in ids if entry})

In [None]:
# Map each AL player ID to a display name taken from the <title> of that
# player's 2020 batting game-log page.
player_id_name = {}
for player in cleaned_ids:
    if player is None:
        continue
    url = "https://www.baseball-reference.com/players/gl.fcgi?id=" + player + "&t=b&year=2020"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Only the first two space-separated words of the title are kept.
    # NOTE(review): presumably the title starts with "First Last"; names with
    # more than two words would be truncated - confirm against real pages.
    title_words = soup.find('title').text.split(' ')
    player_name = " ".join(title_words[:2])
    player_id_name[player] = player_name

In [None]:
# Scrape the 2020 batting game log of every AL player ID into one DataFrame
# per player; the frames are collected in df_list.
df_list = []
for player in cleaned_ids:
    if player is None:
        # Rows without a "data-append-csv" attribute yield a None ID.
        continue
    url = "https://www.baseball-reference.com/players/gl.fcgi?id=" + player + "&t=b&year=2020"
    response = requests.get(url)
    # Fail with a clear HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Take header and body from the same (first) table so the column names
    # always belong to the rows being read.
    table = soup.find('table')
    table_head = table.find('thead') if table is not None else None
    table_body = table.find('tbody') if table is not None else None
    if table_head is None or table_body is None:
        # Nothing to parse on this page; report it rather than crashing with
        # an AttributeError on None.
        print("No game log table found for " + player + "; skipping")
        continue

    table_headers = [x.text for x in table_head.find_all('th')]

    data = []
    for row in table_body.find_all('tr'):
        cols = row.find_all('td')
        if not cols:
            # A <tr> without <td> cells carries no game data and would
            # otherwise become an all-empty record in the DataFrame.
            continue
        data.append([ele.text.strip() for ele in cols])

    # The first header is dropped because only <td> cells are collected per
    # row (presumably the leading column is rendered as <th> - TODO confirm).
    df = pd.DataFrame(data, columns=table_headers[1:])
    df['player_id'] = player
    df_list.append(df)

In [None]:
# Stack the per-player AL frames row-wise; each frame keeps its own index.
al_df = pd.concat(objs=df_list, axis=0)

### Get All NL Players Batter Data 

In [None]:
# Request the 2020 National League standard-batting page and parse its HTML
# with the built-in 'html.parser' backend.
url = "https://www.baseball-reference.com/leagues/NL/2020-standard-batting.shtml"
response = requests.get(url=url)
soup = BeautifulSoup(markup=response.text, features='html.parser')

In [None]:
# Collect the player IDs listed on the NL standard-batting page.
# The table rows of interest are read from inside HTML comments, so every
# comment node is re-parsed as its own document and scanned for <tr> rows.
ids = []
# Use the response as a context manager so the HTTP connection is closed
# as soon as the page has been read (it was previously left open).
with urlopen("https://www.baseball-reference.com/leagues/NL/2020-standard-batting.shtml") as content:
    soup = BeautifulSoup(content.read(), "lxml")
for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
    sauce = BeautifulSoup(comment, "lxml")
    for tags in sauce.find_all('tr'):
        # One entry per row: a 0- or 1-element list holding the
        # "data-append-csv" attribute of the row's first <td> (None when the
        # attribute is absent). Empty lists come from rows without <td> cells.
        name = [item.get("data-append-csv") for item in tags.find_all("td")[:1]]
        ids.append(name)

In [None]:
# Flatten the per-row lists into unique IDs, ignoring rows that produced no
# entry. A None ID can still be present; later cells skip it explicitly.
cleaned_ids = list({entry[0] for entry in ids if entry})

In [None]:
# Map each NL player ID to a display name taken from the <title> of that
# player's 2020 batting game-log page.
# NOTE(review): this rebinds player_id_name to a fresh dict, so the mapping
# built from the AL IDs earlier in the notebook is discarded - confirm that
# is intended.
player_id_name = {}
for player in cleaned_ids:
    if player is None:
        continue
    url = "https://www.baseball-reference.com/players/gl.fcgi?id=" + player + "&t=b&year=2020"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Only the first two space-separated words of the title are kept.
    # NOTE(review): presumably the title starts with "First Last"; names with
    # more than two words would be truncated - confirm against real pages.
    title_words = soup.find('title').text.split(' ')
    player_name = " ".join(title_words[:2])
    player_id_name[player] = player_name

In [None]:
# Scrape the 2020 batting game log of every NL player ID into one DataFrame
# per player; the frames are collected in df_list.
df_list = []
for player in cleaned_ids:
    if player is None:
        # Rows without a "data-append-csv" attribute yield a None ID.
        continue
    url = "https://www.baseball-reference.com/players/gl.fcgi?id=" + player + "&t=b&year=2020"
    response = requests.get(url)
    # Fail with a clear HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Take header and body from the same (first) table so the column names
    # always belong to the rows being read.
    table = soup.find('table')
    table_head = table.find('thead') if table is not None else None
    table_body = table.find('tbody') if table is not None else None
    if table_head is None or table_body is None:
        # Nothing to parse on this page; report it rather than crashing with
        # an AttributeError on None.
        print("No game log table found for " + player + "; skipping")
        continue

    table_headers = [x.text for x in table_head.find_all('th')]

    data = []
    for row in table_body.find_all('tr'):
        cols = row.find_all('td')
        if not cols:
            # A <tr> without <td> cells carries no game data and would
            # otherwise become an all-empty record in the DataFrame.
            continue
        data.append([ele.text.strip() for ele in cols])

    # The first header is dropped because only <td> cells are collected per
    # row (presumably the leading column is rendered as <th> - TODO confirm).
    df = pd.DataFrame(data, columns=table_headers[1:])
    df['player_id'] = player
    df_list.append(df)

In [None]:
# Stack the per-player NL frames row-wise; each frame keeps its own index.
nl_df = pd.concat(objs=df_list, axis=0)

### Merge the AL & NL Dataframes Together & Export 

In [None]:
# Combine both leagues into a single frame.
# The per-player game-log URL is not league-specific, so a player ID that was
# collected from both league pages had the same game log scraped into al_df
# and nl_df. Keep those rows only once (from al_df) so that no game is
# counted twice in the combined data.
already_in_al = nl_df['player_id'].isin(al_df['player_id']).to_numpy()
final_df = pd.concat([al_df, nl_df[~already_in_al]])

In [None]:
# Display the (rows, columns) dimensions of the combined frame as a quick check.
final_df.shape

In [None]:
# Write the combined game logs to disk for the downstream analysis.
final_df.to_csv(path_or_buf='../data/all_batters_game_data.csv')