# Scraping data from Retrosheet.org: Part 2 - parsing the split page

In [112]:
from bs4 import BeautifulSoup;
import requests;
import re;
import pandas as pd;
import numpy as np;
from IPython.display import clear_output;

# Browser-like request headers used when fetching pages from Retrosheet.
# The User-Agent is built from adjacent string literals (joined at compile
# time) rather than a backslash continuation inside the literal, which would
# embed the next source line's indentation into the header value.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/88.0.4324.104 Safari/537.36'
    )
}
# Base URL under which the Retrosheet pages referenced by this notebook live.
retro_page_affix = "https://www.retrosheet.org/boxesetc/"


In [19]:
# Load the career split-page URL table, keeping only the ID, Name and
# split_url columns, then display it.
df_player_career_stats = pd.read_csv(
    "player_career_split_url.csv",
    usecols=["ID", "Name", "split_url"],
)
df_player_career_stats

Unnamed: 0,ID,Name,split_url
0,0,David Aardsma,https://www.retrosheet.org/boxesetc/A/Jaardd00...
1,1,Hank Aaron,https://www.retrosheet.org/boxesetc/A/Jaaroh10...
2,2,Tommie Aaron,https://www.retrosheet.org/boxesetc/A/Jaarot10...
3,3,Don Aase,https://www.retrosheet.org/boxesetc/A/Jaased00...
4,4,Andy Abad,https://www.retrosheet.org/boxesetc/A/Jabada00...
...,...,...,...
16004,19913,Bob Zupcic,https://www.retrosheet.org/boxesetc/Z/Jzupcb00...
16005,19914,Frank Zupo,https://www.retrosheet.org/boxesetc/Z/Jzupof10...
16006,19915,Paul Zuvella,https://www.retrosheet.org/boxesetc/Z/Jzuvep00...
16007,19916,George Zuverink,https://www.retrosheet.org/boxesetc/Z/Jzuveg10...


In [12]:
# Season-level split-page URLs. Process these only if there is an opportunity:
# doing so carries a larger chance of being blocked by Retrosheet.
df_player_yearly_stats = pd.read_csv(
    "player_season_split_url.csv",
    usecols=["ID", "Name", "Season", "Team", "split_url"],
)

In [71]:
# Column names of the career-split data frame (25 entries).
columns_career = [
    "ID", "Name", "Situation",
    "G", "AB", "R", "H", "2B", "3B", "HR", "RBI", "BB", "IBB", "SO", "HBP",
    "SH", "SF", "XI", "ROE", "GDP", "SB", "CS",
    "AVG", "OBP", "SLG",
]

# Situation tags looked for at the start of each line (48 entries).
# The order matters: `situations_no_g` below refers to positions in this list.
situations = (
    ["Total", "Home", "Away", "vs RHP", "vs LHP", "Day", "Night"]
    + ["None On", "Men On", "RISP", "Close & Late", "Bases Loaded"]
    + ["January", "February", "March", "April", "May", "June",
       "July", "August", "September", "October", "November", "December"]
    + ["1st", "2nd", "3rd", "4th", "5th", "6th", "7th", "8th", "9th"]
    + ["AT P", "AT C", "AT 1B", "AT 2B", "AT 3B", "AT SS", "AT LF", "AT CF",
       "AT RF", "AT OF", "AT DH", "AT PH", "AT PR", "AT H", "AT >1"]
)

# Character length of every situation tag, aligned index-for-index with `situations`.
situations_len = list(map(len, situations))

# Indices into `situations` of the tags for which no G column is provided.
situations_no_g = [3, 4, 7, 8, 9, 10, 11]

In [110]:
def scrape_player_split(entry):
    ''' Scrape the career batting splits of one player from their split page.

    The player is looked up in the module-level ``df_player_career_stats``
    frame. The page at its ``split_url`` is downloaded and the first ``<pre>``
    chunk whose text contains "Total" is parsed line by line.

    Arguments:
        entry: the row label (index) of ``df_player_career_stats``.
            Not the value of the "ID" column!

    Returns:
        A nested list that can be fed to a data frame. Each inner list is
        ``[ID, name, situation, *fields]`` where ``fields`` is the
        whitespace-split remainder of the line after the situation tag.
        For situations listed in ``situations_no_g`` an ``np.nan`` is
        inserted before the fields in place of the missing G value.

    Raises:
        Exception: if the HTTP status code of the response is not 200.
        ValueError: if no ``<pre>`` chunk containing "Total" is found.
    '''
    ID = df_player_career_stats.loc[entry, "ID"]
    name = df_player_career_stats.loc[entry, "Name"]
    url = df_player_career_stats.loc[entry, "split_url"]

    # A timeout keeps one stalled connection from hanging a long scraping run.
    response = requests.get(url, headers=headers, timeout=30)

    # Validate the response before spending time parsing its body.
    if response.status_code != 200:
        raise Exception(f"The status code is not 200! It is {response.status_code}.")

    soup = BeautifulSoup(response.text, 'html.parser')

    # Locate the batting record chunk: the first <pre> mentioning "Total".
    batting_chunk = None
    for pre_tag in soup.find_all("pre"):
        if "Total" in pre_tag.get_text():
            batting_chunk = pre_tag
            break

    # Fail loudly instead of silently parsing an unrelated <pre> chunk.
    if batting_chunk is None:
        raise ValueError(f"No batting record chunk found for {name} at {url}.")

    clear_output()
    print("Found batting record chunk for " + name + '.')

    # Only the first child node of the chunk is parsed.
    # NOTE(review): assumes the table is one text node with no nested tags -- confirm.
    lines = batting_chunk.contents[0].splitlines()

    player_career_split = []
    for status in lines:
        matching = [i for i, tag in enumerate(situations) if status.startswith(tag)]
        if not matching:
            continue  # not a situation line (header, blank line, ...)

        # Several tags can match the same line ("AT P" is a prefix of "AT PH"
        # and "AT PR", "AT C" of "AT CF"); the longest one is the real tag.
        tag_num = max(matching, key=lambda i: situations_len[i])

        row = [ID, name, situations[tag_num]]
        if tag_num in situations_no_g:
            # On tags where no G column is available, pad with np.nan.
            row.append(np.nan)
        row.extend(status[situations_len[tag_num]:].split())
        player_career_split.append(row)

    return player_career_split


In [113]:
# Smoke-test the scraper on a single player (row label 3).
# One call only: each call issues an HTTP request to Retrosheet, and a result
# that is immediately overwritten would be a wasted request.
psp = scrape_player_split(3)

Found batting record chunk for Don Aase.


In [114]:
psp

[[3,
  'Don Aase',
  'Total',
  '448',
  '5',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '3',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '.000',
  '.000',
  '.000'],
 [3,
  'Don Aase',
  'Home',
  '232',
  '4',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '2',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '.000',
  '.000',
  '.000'],
 [3,
  'Don Aase',
  'Away',
  '216',
  '1',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '1',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '.000',
  '.000',
  '.000'],
 [3,
  'Don Aase',
  'vs RHP',
  nan,
  '4',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '2',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '.000',
  '.000',
  '.000'],
 [3,
  'Don Aase',
  'vs LHP',
  nan,
  '1',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '1',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '.000',
  '.000',
  '.000'],
 [3,
  'Don Aase',
  'Da