# Scraping data from Retrosheet.org: Part 1 - retrieving the split page

The goal is to scrape all batter and pitcher split data from the website Retrosheet.org.

In [None]:
from bs4 import BeautifulSoup;
import requests;
import re;

# Browser-like request headers passed to the requests.get calls in this notebook.
# The User-Agent is assembled from adjacent string literals: a backslash
# continuation placed inside the quotes would embed the next line's
# indentation in the header value.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/88.0.4324.104 Safari/537.36'
    ),
}

# Two-letter codes used to build the URLs of the Retrosheet player index pages.
# The list holds 271 entries.
# NOTE: hard-coding the list might not be a good implementation, since new
# initials will be added in the future; fetching it automatically would be better.
initials = [
    "AA", "AB", "AC", "AD", "AF", "AG", "AH", "AI", "AK", "AL", "AM", "AN", "AO",
    "AP", "AQ", "AR", "AS", "AT", "AU", "AV", "AX", "AY", "AZ", "BA", "BE", "BI",
    "BJ", "BL", "BO", "BR", "BU", "BY", "CA", "CE", "CH", "CI", "CL", "CO", "CR",
    "CU", "CV", "CY", "CZ", "DA", "DE", "DI", "DL", "DO", "DR", "DU", "DW", "DY",
    "EA", "EB", "EC", "ED", "EE", "EF", "EG", "EH", "EI", "EK", "EL", "EM", "EN",
    "EO", "EP", "ER", "ES", "ET", "EU", "EV", "EW", "EX", "EY", "EZ", "FA", "FE",
    "FI", "FL", "FO", "FR", "FU", "FY", "GA", "GE", "GH", "GI", "GL", "GO", "GR",
    "GS", "GU", "GW", "GY", "HA", "HE", "HI", "HO", "HR", "HU", "HW", "HY", "IA",
    "IB", "IG", "IL", "IM", "IN", "IO", "IR", "IS", "IV", "IW", "IZ", "JA", "JE",
    "JI", "JO", "JU", "KA", "KE", "KH", "KI", "KL", "KM", "KN", "KO", "KR", "KU",
    "KV", "KW", "KY", "LA", "LE", "LI", "LL", "LO", "LU", "LY", "MA", "MC", "ME",
    "MI", "ML", "MM", "MO", "MR", "MU", "MY", "NA", "NE", "NG", "NI", "NO", "NU",
    "NY", "OA", "OB", "OC", "OD", "OE", "OF", "OG", "OH", "OJ", "OK", "OL", "OM",
    "ON", "OQ", "OR", "OS", "OT", "OU", "OV", "OW", "OX", "OY", "OZ", "PA", "PE",
    "PF", "PH", "PI", "PL", "PO", "PR", "PU", "PY", "QU", "RA", "RE", "RH", "RI",
    "RL", "RO", "RU", "RY", "RZ", "SA", "SB", "SC", "SE", "SH", "SI", "SK", "SL",
    "SM", "SN", "SO", "SP", "SQ", "ST", "SU", "SV", "SW", "SY", "SZ", "TA", "TE",
    "TH", "TI", "TO", "TR", "TS", "TU", "TW", "TY", "UC", "UE", "UG", "UH", "UJ",
    "UL", "UM", "UN", "UP", "UR", "US", "UT", "VA", "VE", "VI", "VO", "VU", "WA",
    "WE", "WH", "WI", "WO", "WR", "WU", "WY", "YA", "YD", "YE", "YI", "YN", "YO",
    "YU", "YV", "ZA", "ZD", "ZE", "ZI", "ZM", "ZO", "ZU", "ZW", "ZY",
]

# Build the URL of every index page; the player pages are listed on these
# index pages grouped by initials.
player_ini_page = [
    f"https://www.retrosheet.org/boxesetc/MISC/PLD_{init}.htm"
    for init in initials
]
retro_page_affix = "https://www.retrosheet.org/boxesetc/"

# Rows of [start_season, end_season, name, url], one per player link found.
player_urls = []
for url in player_ini_page:
    # A timeout keeps one stalled connection from hanging the whole scrape.
    response = requests.get(url, headers=headers, timeout=30)

    # Check the status before parsing, so an error page is never parsed.
    if response.status_code != 200:
        raise Exception(f"The status code is not 200! It is {response.status_code}.")

    text = BeautifulSoup(response.text, 'html.parser')

    # The player links are the anchors inside the first <pre> element.
    player_list = text.find_all("pre")[0].find_all('a')
    for tag in player_list:
        info = tag.getText()
        # The link text is sliced at fixed offsets: characters 0-3 and 5-8 are
        # parsed as integers (stored as start/end season), and everything
        # from offset 10 up to the first comma is kept as the name.
        # href[3:] drops the first three characters of the link target
        # (presumably a leading "../" -- confirm against the page source)
        # before it is joined onto the site prefix.
        player_urls.append([
            int(info[0:4]),
            int(info[5:9]),
            info[10:].split(',')[0],
            retro_page_affix + tag.get("href")[3:],
        ])



## Saving the player URL data to a pandas DataFrame and a .csv file

In [None]:
import pandas as pd;
import numpy as np;

# Build the frame directly from the list of rows. Routing the rows through
# np.array would coerce the integer seasons to strings (a NumPy array holds a
# single dtype) and would fail outright on an empty list.
df_player_urls = pd.DataFrame(
    player_urls,
    columns=["start_season", "end_season", "Name", "Url"],
)
df_player_urls

In [None]:
# Persist the player URL table to disk.
df_player_urls.to_csv("players_url.csv")

## Parsing the player pages

In [None]:
# Empty result tables that the parsing step fills in:
# per-season split links, and a single split link per player.
df_player_yearly_stats = pd.DataFrame(
    columns=["ID", "Name", "Season", "Team", "split_url"],
)
df_player_career_stats = pd.DataFrame(
    columns=["ID", "Name", "split_url"],
)

In [None]:
# Visit every player page and collect the batting "Splits" links:
# one row per season goes into df_player_yearly_stats, and one row per player
# (the last "Splits" link seen) goes into df_player_career_stats.
for ind, player_name, url in zip(
    df_player_urls.index, df_player_urls["Name"], df_player_urls["Url"]
):
    # A timeout keeps one stalled connection from hanging the whole scrape.
    response = requests.get(url, headers=headers, timeout=30)

    # Check the status before parsing, so an error page is never parsed.
    if response.status_code != 200:
        raise Exception(f"The status code is not 200! It is {response.status_code}.")

    text = BeautifulSoup(response.text, 'html.parser')

    # Use the first <pre> block that mentions "Batting Record". There are other
    # batting records, but the first one is the most informative and contains
    # all the links analyzed in this project.
    batting_block = next(
        (pre for pre in text.find_all("pre") if "Batting Record" in pre.get_text()),
        None,
    )
    if batting_block is None:
        # No batting record on this page: skip the player instead of reading
        # links from an unrelated <pre> block.
        continue
    print("Found batting record for " + player_name + ".")

    # The scan below relies on the structure of the player page: on each stats
    # line there is (usually) a "Splits" link followed by a year link and a
    # team link. The final "Splits" link is the cumulated split data.
    awaiting_team = False  # True after a "Splits" link until its team link is seen
    split_year = None      # Year belonging to the current "Splits" link, if any
    split_link = ""        # Most recent "Splits" URL; after the loop, the last one
    career_list = []

    for link in batting_block.find_all("a"):
        # get_text() always returns a plain str. The .string attribute is None
        # for anchors with several children, and otherwise a NavigableString
        # that keeps a reference to the whole parse tree.
        label = link.get_text()
        if label == "Splits":
            awaiting_team = True
            split_year = None  # never reuse the year of a previous line
            split_link = retro_page_affix + link.get("href")[3:]
        elif not awaiting_team:
            continue
        elif label.isnumeric():
            split_year = int(label)
            print(split_year)
        else:
            # First non-numeric link after "Splits": treated as the team link.
            awaiting_team = False
            if split_year is not None:
                career_list.append([ind, player_name, split_year, label, split_link])

    if career_list:
        season_frame = pd.DataFrame(career_list, columns=df_player_yearly_stats.columns)
        # DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
        # Each player's rows keep their own 0..n-1 index, as before.
        if df_player_yearly_stats.empty:
            df_player_yearly_stats = season_frame
        else:
            df_player_yearly_stats = pd.concat([df_player_yearly_stats, season_frame])

    # Two special cases:
    # 1. There is no cumulated split data, only one seasonal split link.
    #    This implies the player only batted in MLB for one season, so the
    #    seasonal link is used.
    # 2. There is no split link at all. This implies there is no batting
    #    record (a pitcher or a 19th-century player); the player is ignored.
    if split_link != "":
        df_player_career_stats.loc[len(df_player_career_stats)] = [ind, player_name, split_link]


In [291]:
# Write the per-player split links to disk, then display the table.
df_player_career_stats.to_csv("player_career_split_url.csv")
df_player_career_stats

Unnamed: 0,ID,Name,split_url
0,0,David Aardsma,https://www.retrosheet.org/boxesetc/A/Jaardd00...
1,1,Hank Aaron,https://www.retrosheet.org/boxesetc/A/Jaaroh10...
2,2,Tommie Aaron,https://www.retrosheet.org/boxesetc/A/Jaarot10...
3,3,Don Aase,https://www.retrosheet.org/boxesetc/A/Jaased00...
4,4,Andy Abad,https://www.retrosheet.org/boxesetc/A/Jabada00...
...,...,...,...
16004,19913,Bob Zupcic,https://www.retrosheet.org/boxesetc/Z/Jzupcb00...
16005,19914,Frank Zupo,https://www.retrosheet.org/boxesetc/Z/Jzupof10...
16006,19915,Paul Zuvella,https://www.retrosheet.org/boxesetc/Z/Jzuvep00...
16007,19916,George Zuverink,https://www.retrosheet.org/boxesetc/Z/Jzuveg10...


In [292]:
# Write the per-season split links to disk, then display the table.
df_player_yearly_stats.to_csv("player_season_split_url.csv")
df_player_yearly_stats

Unnamed: 0,ID,Name,Season,Team,split_url
0,0,David Aardsma,2006,CHI N,https://www.retrosheet.org/boxesetc/2006/Jaard...
1,0,David Aardsma,2008,BOS A,https://www.retrosheet.org/boxesetc/2008/Jaard...
2,0,David Aardsma,2015,ATL N,https://www.retrosheet.org/boxesetc/2015/Jaard...
0,1,Hank Aaron,1954,MIL N,https://www.retrosheet.org/boxesetc/1954/Jaaro...
1,1,Hank Aaron,1955,MIL N,https://www.retrosheet.org/boxesetc/1955/Jaaro...
...,...,...,...,...,...
8,19916,George Zuverink,1959,BAL A,https://www.retrosheet.org/boxesetc/1959/Jzuve...
0,19917,Dutch Zwilling,1910,CHI A,https://www.retrosheet.org/boxesetc/1910/Jzwil...
1,19917,Dutch Zwilling,1914,CHI F,https://www.retrosheet.org/boxesetc/1914/Jzwil...
2,19917,Dutch Zwilling,1915,CHI F,https://www.retrosheet.org/boxesetc/1915/Jzwil...
