# Scraping data from Retrosheet.org: Part 3 - parsing the seasonal split page

This is pretty much the same as part 2, which parses the career page.

The only improvement I will attempt is to handle 404 error pages more gracefully.
- In part 2, a 404 raised an error, and I had to handle everything manually before moving on.
- In part 3, if the problem is merely a 404, I will just write a line to the log file and carry on as if nothing happened.

It is a huge scrape, consisting of 84k+ pages. Let's keep our fingers crossed and hope things go well. Also, I have only about 28 hours left. It is a race against time.

In [None]:
from bs4 import BeautifulSoup;
import requests;
import re;
import pandas as pd;
import numpy as np;
from IPython.display import clear_output;

# Browser-like request headers sent with every page fetch.
# The User-Agent is built from adjacent string literals: continuing a string
# literal with a backslash would embed the next line's indentation in the value.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/88.0.4324.104 Safari/537.36'
    ),
}
# Base URL of the Retrosheet box-score pages (not referenced by the cells shown here).
retro_page_affix = "https://www.retrosheet.org/boxesetc/"


In [None]:
# Load the scrape worklist, keeping only the ID, Name, Season, Team and
# split_url columns.
# Process when there is a chance: a run this size is more likely to get
# blocked by Retrosheet.
df_player_yearly_stats = pd.read_csv(
    "player_season_split_url.csv",
    usecols=["ID", "Name", "Season", "Team", "split_url"],
)
df_player_yearly_stats  # 84231 rows

In [None]:
# Column layout of the output dataframe: five identifying columns followed by
# the 22 statistic columns (27 in total).
columns_career = [
    "ID", "Name", "Season", "Team", "Situation",
    "G", "AB", "R", "H", "2B", "3B", "HR", "RBI",
    "BB", "IBB", "SO", "HBP", "SH", "SF", "XI", "ROE", "GDP", "SB", "CS",
    "AVG", "OBP", "SLG",
]

# Row labels recognised on a split page (48 in total). A line is matched by
# prefix and the first matching label wins, so the order of this list matters.
situations = [
    "Total", "Home", "Away", "vs RHP", "vs LHP", "Day", "Night",
    "None On", "Men On", "RISP", "Close & Late", "Bases Loaded",
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
    "1st", "2nd", "3rd", "4th", "5th", "6th", "7th", "8th", "9th",
    "AT P ", "AT C ", "AT 1B", "AT 2B", "AT 3B", "AT SS", "AT LF", "AT CF",
    "AT RF", "AT OF", "AT DH", "AT PH", "AT PR", "AT H", "AT >1",
]

# Number of leading characters to cut from a line before reading its numbers.
# "AT P " and "AT C " carry a trailing space for matching, but only their first
# four characters are cut.
situations_len = [
    4 if label in ("AT P ", "AT C ") else len(label)
    for label in situations
]

# Positions in `situations` whose rows come without a G column.
situations_no_g = [3, 4, 7, 8, 9, 10, 11]

### Now, we write a function to scrape the link in the i-th row, but the rules of the game are really the same as in part 2

In [None]:
def _log_season_split_issue(ID, name, season, team, url, reason):
    '''Append one comma-separated line about a skipped page to the discrepancy log.'''
    with open("Seasonal_split_descrepency.log", 'a') as f:
        f.write(f"{ID}, {name}, {season}, {team}, {url}, {reason}\n")


def scrape_player_season_split(entry):
    '''Scrape the season split page of one row of df_player_yearly_stats.

    Arguments:
        entry: the index label of the row in df_player_yearly_stats. Not the
            player ID!

    Returns:
        A nested list with one inner list per recognised split line, laid out
        as [ID, name, season, team, situation] followed by the statistic
        values (still strings, or np.nan where a value is not provided), ready
        to be fed to a dataframe with columns_career as columns.
        The number -1 if the page had to be skipped: either the server
        answered 404, or no <pre> block containing "Total" was found. In both
        cases a line is appended to Seasonal_split_descrepency.log.

    Raises:
        Exception: if the status code is neither 200 nor 404.
        requests.exceptions.RequestException: if the request itself fails or
            times out.
    '''
    ID = df_player_yearly_stats.loc[entry, "ID"]
    name = df_player_yearly_stats.loc[entry, "Name"]
    season = df_player_yearly_stats.loc[entry, "Season"]
    team = df_player_yearly_stats.loc[entry, "Team"]
    url = df_player_yearly_stats.loc[entry, "split_url"]

    # Without a timeout a single stalled connection would hang the whole run.
    response = requests.get(url, headers=headers, timeout=60)

    # Check the status before parsing, so error pages are never parsed.
    if response.status_code == 404:
        print("404 on " + name + " in " + str(season) + " at " + team)
        _log_season_split_issue(ID, name, season, team, url, "season split missing")
        return -1
    if response.status_code != 200:
        raise Exception(f"The status code is not 200! It is {response.status_code}.")

    text = BeautifulSoup(response.text, 'html.parser')

    # The batting record is the first <pre> block that mentions "Total".
    batting_block = next(
        (pre for pre in text.findAll("pre") if "Total" in pre.get_text()),
        None,
    )
    if batting_block is None:
        # Do not fall back to some unrelated <pre> block: record the page and skip it.
        print("No batting record chunk for " + name + " in " + str(season) + " at " + team + '.')
        _log_season_split_issue(ID, name, season, team, url, "batting record chunk not found")
        return -1

    clear_output()
    print("Found batting record chunk for " + name + " in " + str(season) + " at " + team + '.')

    ltemp = batting_block.contents[0].splitlines()

    # The first line is the glossary (column header) line. When it lists fewer
    # than the full set of statistics, it tells us which columns are missing.
    n_stats = len(columns_career) - 5  # statistic columns after the 5 id columns
    glossary = ltemp[0].split()
    glossary_complete = (len(glossary) == n_stats)
    if not glossary_complete:
        # Position of each available statistic within the full statistic list.
        glossary_index = [columns_career.index(x) - 5 for x in glossary]

    player_career_split = []
    for line in ltemp:
        # First situation label the line starts with, or -1 if it has none.
        tag_num = next(
            (k for k, label in enumerate(situations) if line.startswith(label)),
            -1,
        )
        if tag_num < 0:
            continue

        # Cut the label off, blank out every 'i' character and split the rest.
        values = line[situations_len[tag_num]:].replace('i', ' ').split()
        if tag_num in situations_no_g:
            # These rows have no G column: pad it so the values line up.
            values = [np.nan] + values

        if glossary_complete:
            stats = values
        else:
            # Scatter the available values into their slots; the rest stay NaN.
            stats = [np.nan] * n_stats
            for j, dest in enumerate(glossary_index):
                stats[dest] = values[j]

        player_career_split.append([ID, name, season, team, situations[tag_num]] + stats)

    print([len(x) for x in player_career_split])
    return player_career_split


## Scraping the split pages and extending the dataframe.
Now this is where the scraping happens. Let's keep our fingers crossed and hope that we won't get blocked :)

In [None]:
# Start from an empty table that already has the final column layout.
df_player_season_split = pd.DataFrame(data=None, columns=columns_career)

In [None]:
# Scrape the remaining split pages. The start index is hard-coded so that the
# run can be done in pieces; use range(df_player_yearly_stats.shape[0]) for a
# full pass, or range(20) for a quick trial.
scraped_frames = []
try:
    for i in range(43521, df_player_yearly_stats.shape[0]):
        psp = scrape_player_season_split(i)
        if psp != -1:  # -1 is the scraper's marker for a skipped page
            scraped_frames.append(pd.DataFrame(psp, columns=columns_career))
finally:
    # Fold in whatever was scraped even if the loop was interrupted, so the run
    # can be resumed from the last value of i. A single concat also avoids
    # re-copying the growing table on every iteration.
    if scraped_frames:
        df_player_season_split = pd.concat([df_player_season_split] + scraped_frames)

In [None]:
# Columns 6-23 (AB through CS): strip every 'i' character, then cast to float.
for i in range(6, 24):
    stat_name = columns_career[i]
    without_marker = df_player_season_split[stat_name].str.replace('i', '')
    df_player_season_split[stat_name] = without_marker.astype(float)

# Columns 24-26 (AVG, OBP, SLG): remove every '-' character; values stay strings.
for i in range(24, 27):
    stat_name = columns_career[i]
    df_player_season_split[stat_name] = df_player_season_split[stat_name].str.replace('-', '')

In [None]:
# Write the combined split table to disk, then report its dimensions.
df_player_season_split.to_csv(path_or_buf="player_season_split_data.csv")
table_shape = df_player_season_split.shape
print(table_shape)  # (413971, 25)

In [None]:
# Inspection cell: show the worklist rows around index 43521, the index at
# which the scraping loop starts.
# df_player_season_split.iloc[90:100]
df_player_yearly_stats.iloc[43519:43525]

In [None]:
# Scratch cell: ad-hoc experiments made while debugging. None of the names
# defined here (a, b, c) are used by the scraping or cleaning cells.
a = [1,2,3];
b = [];
c = None;
#a.append(c)
#a.extend(c)
a != max  # compares the list with the builtin `max`; the result is discarded

# list(range(10)) #0-9
#list(range(10,20)) #10-19
i  # last expression of the cell: the current value of the loop index i

In [None]:
# Scratch cell: look at the state left behind by the scraping loop (the last
# index i and the last scraped result psp) and try out the 'i'-stripping
# expression used when parsing a split line.
#df_player_season_split.iloc[80:90]
print(i)
df_player_season_split.shape
[len(x) for x in psp]
psp[-1]
# Every 'i' becomes a space before splitting, so this yields ['s', 'djo201', 'wd'].
"si djo201iwd".replace('i', ' ').split()