# Scraping data from Retrosheet.org

The goal is to scrape all of the batter and pitcher split data from the Retrosheet.org website.

In [48]:
from bs4 import BeautifulSoup;
import requests;
import re;

# Browser-like User-Agent sent with every request.
# The value is assembled from adjacent string literals rather than a
# backslash continuation inside the quotes: a continuation inside a string
# keeps the next line's leading indentation as part of the value, which put
# a run of extra spaces in the middle of the header.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/88.0.4324.104 Safari/537.36'
    )
}

# Two-letter codes used to build the player index page URLs (one page per code).
# There are 271 codes, listed in alphabetical order.
# NOTE: this list is hard-coded, so codes added to the site later will be
# missed; fetching the list automatically would be more robust.
initials = """
    AA AB AC AD AF AG AH AI AK AL AM AN AO
    AP AQ AR AS AT AU AV AX AY AZ BA BE BI
    BJ BL BO BR BU BY CA CE CH CI CL CO CR
    CU CV CY CZ DA DE DI DL DO DR DU DW DY
    EA EB EC ED EE EF EG EH EI EK EL EM EN
    EO EP ER ES ET EU EV EW EX EY EZ FA FE
    FI FL FO FR FU FY GA GE GH GI GL GO GR
    GS GU GW GY HA HE HI HO HR HU HW HY IA
    IB IG IL IM IN IO IR IS IV IW IZ JA JE
    JI JO JU KA KE KH KI KL KM KN KO KR KU
    KV KW KY LA LE LI LL LO LU LY MA MC ME
    MI ML MM MO MR MU MY NA NE NG NI NO NU
    NY OA OB OC OD OE OF OG OH OJ OK OL OM
    ON OQ OR OS OT OU OV OW OX OY OZ PA PE
    PF PH PI PL PO PR PU PY QU RA RE RH RI
    RL RO RU RY RZ SA SB SC SE SH SI SK SL
    SM SN SO SP SQ ST SU SV SW SY SZ TA TE
    TH TI TO TR TS TU TW TY UC UE UG UH UJ
    UL UM UN UP UR US UT VA VE VI VO VU WA
    WE WH WI WO WR WU WY YA YD YE YI YN YO
    YU YV ZA ZD ZE ZI ZM ZO ZU ZW ZY
""".split()

# One index page per two-letter code; each page links to individual player pages.
player_ini_page = [
    f"https://www.retrosheet.org/boxesetc/MISC/PLD_{init}.htm" for init in initials
]
# Prefix that the (relative) player links are re-rooted on.
retro_page_affix = "https://www.retrosheet.org/boxesetc/"

# Accumulates one record per player: [start_season, end_season, name, url].
player_urls = []

# A single Session reuses the underlying connection across all index pages
# instead of opening a new one for every request.
with requests.Session() as session:
    for url in player_ini_page:
        # The timeout keeps one stalled request from hanging the whole cell.
        response = session.get(url, headers=headers, timeout=30)

        # Validate the response before spending any effort parsing its body.
        if response.status_code != 200:
            raise Exception(f"The status code is not 200! It is {response.status_code}.")

        soup = BeautifulSoup(response.text, 'html.parser')

        # The player links are the <a> tags inside the first <pre> element.
        player_list = soup.find_all("pre")[0].find_all('a')

        # The link text is sliced at fixed offsets: characters 0-3 and 5-8 are
        # parsed as integers (first and last season) and the text from offset
        # 10 up to the first comma is kept as the name.
        # The first three characters of each href are dropped before the
        # prefix is prepended -- presumably a leading "../"; verify against
        # the page markup if the site layout changes.
        page_records = [
            [
                int(tag.getText()[0:4]),
                int(tag.getText()[5:9]),
                tag.getText()[10:].split(',')[0],
                retro_page_affix + tag.get("href")[3:],
            ]
            for tag in player_list
        ]

        # Extend only after the whole page parsed cleanly, so a failure never
        # leaves a partially-recorded page behind.
        player_urls.extend(page_records)



In [54]:
# Only the last expression of a cell is displayed, so the record count is
# printed explicitly; otherwise its value is computed and silently dropped.
print(len(player_urls))
# Spot-check a single scraped record: [start_season, end_season, name, url].
player_urls[2347]

[1902,
 1902,
 'John Burke',
 'https://www.retrosheet.org/boxesetc/B/Pburkj103.htm']

## Saving the player URL data to a pandas DataFrame and a .csv file

In [58]:
import pandas as pd;
import numpy as np;

# Build the frame straight from the list of [start, end, name, url] rows.
# Routing the rows through np.array() would coerce the mixed int/str values
# to a single string dtype, turning the season columns into text; passing the
# list directly lets pandas infer integer columns for the seasons.
df_player_urls = pd.DataFrame(
    player_urls,
    columns=["start_season", "end_season", "Name", "Url"],
)
df_player_urls

Unnamed: 0,start_season,end_season,Name,Url
0,2004,2015,David Aardsma,https://www.retrosheet.org/boxesetc/A/Paardd00...
1,1954,1976,Hank Aaron,https://www.retrosheet.org/boxesetc/A/Paaroh10...
2,1962,1971,Tommie Aaron,https://www.retrosheet.org/boxesetc/A/Paarot10...
3,1977,1990,Don Aase,https://www.retrosheet.org/boxesetc/A/Paased00...
4,2001,2006,Andy Abad,https://www.retrosheet.org/boxesetc/A/Pabada00...
...,...,...,...,...
19914,1957,1961,Frank Zupo,https://www.retrosheet.org/boxesetc/Z/Pzupof10...
19915,1982,1991,Paul Zuvella,https://www.retrosheet.org/boxesetc/Z/Pzuvep00...
19916,1951,1959,George Zuverink,https://www.retrosheet.org/boxesetc/Z/Pzuveg10...
19917,1910,1916,Dutch Zwilling,https://www.retrosheet.org/boxesetc/Z/Pzwild10...


In [59]:
# Persist the player table to disk; the DataFrame index is written as the
# first (unnamed) column of the CSV.
df_player_urls.to_csv(path_or_buf="players_url.csv")