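Notebook that defines a function to scrape per-season MLB player appearance data from www.baseball-reference.com, then loops over several seasons to collect the results into one DataFrame.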

In [None]:
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
import numpy as np
import requests
import os
import re

In [None]:
def get_appear(mlb_year, wd_location='/Users/jonathanarmitage/Documents/web_drivers/chromedriver'):

    """
    Scrape player appearance data from baseball-reference.com

    Parameters:
    -----------
    mlb_year: int
        mlb season of player appearances to scrape
    wd_location: str, optional
        filesystem path of the chromedriver executable handed to Selenium;
        the default is the path this notebook was originally written against

    Returns:
    --------
    Dataframe
        the season's appearances table with lower-cased column names, the
        'Rk' column removed, and three added columns: 'bbrefid' (id parsed
        from each player link), 'max_pos' (label of the position column
        holding the row's largest value) and 'year'

    Raises:
    -------
    ValueError
        if the number of player links found differs from the number of
        table rows, so ids cannot be lined up with players
    """

    # Local import: read_html is fed a file-like object because newer pandas
    # deprecates passing literal HTML strings.
    from io import StringIO

    url = 'https://www.baseball-reference.com/leagues/MLB/{}-appearances-fielding.shtml'.format(mlb_year)

    # URL prefix up to and including the one-letter directory; what remains
    # after stripping it (and the '.shtml' suffix) is the player id.
    id_prefix = re.compile(r'https://www\.baseball-reference\.com/players/\w/')

    # NOTE(review): `executable_path=` and `find_element(s)_by_*` are the
    # spellings the original notebook used; newer Selenium releases may no
    # longer accept them -- confirm against the installed version.
    browser = webdriver.Chrome(executable_path=wd_location)
    try:
        # use browser instance to navigate to url
        browser.get(url)

        page_html = str(browser.page_source)
        player_table = browser.find_element_by_xpath('//*[@id="players_players_appearances_fielding"]')
        hrefs = [a.get_attribute('href') for a in player_table.find_elements_by_tag_name('a')]
    finally:
        # Always end the driver session, even when navigation or element
        # lookup fails, so repeated calls do not pile up browser processes.
        browser.quit()

    tables = pd.read_html(StringIO(page_html), match='Age')[0]
    # Rows whose Name cell is the literal 'Name' are repeated header rows.
    tables = tables.loc[tables['Name'] != 'Name'].reset_index(drop=True)
    tables = tables.drop('Rk', axis=1)

    # Anchors without an href yield None; skip those and any non-player links.
    player_ids = [
        id_prefix.sub('', href).replace('.shtml', '')
        for href in hrefs
        if href and '/players/' in href
    ]
    if len(player_ids) != len(tables):
        raise ValueError(
            'found {} player links but {} table rows for {}; '
            'cannot align ids with players'.format(len(player_ids), len(tables), mlb_year)
        )

    # regex=False: '\xa0' and '*' are literal characters to remove, not patterns.
    tables['Name'] = (
        tables['Name']
        .str.replace('\xa0', ' ', regex=False)
        .str.replace('*', '', regex=False)
        .str.strip()
    )

    tables['bbrefID'] = player_ids

    tables.columns = tables.columns.str.lower()

    # Seasons before 1973 are treated as having no 'dh' column.
    position_cols = ['p', 'c', '1b', '2b', '3b', 'ss', 'lf', 'cf', 'rf']
    if mlb_year >= 1973:
        position_cols.append('dh')

    tables = tables.astype({col: int for col in position_cols})
    tables['max_pos'] = tables.loc[:, position_cols].idxmax(axis=1)
    tables['year'] = mlb_year

    return tables

Check that the function works for a single MLB season

In [None]:
# Smoke test: scrape one season and keep the resulting DataFrame in `A`.
A = get_appear(1997)

Loop through years and collect data into a dataframe

In [None]:
# Scrape each season, then combine the per-season frames with one concat.
# DataFrame.append re-copied the accumulated frame on every iteration and is
# no longer available in pandas 2.x. As before, each season's rows keep their
# own 0-based index in the combined frame.
season_frames = []
for yr in range(1973, 1980):
    print(yr)
    tmp_df = get_appear(mlb_year=yr)
    season_frames.append(tmp_df)

hold_df = pd.concat(season_frames)

In [None]:
# Uncomment to write the combined 1973-1979 frame to CSV (the path below is machine-specific).
# hold_df.to_csv('/Users/jonathanarmitage/Desktop/mlb_appear/mlb_appear_1973to1979.csv', index=False)