In [2]:
import requests
from bs4 import BeautifulSoup
import bs4
import time
import pandas as pd

Objective 1: Scrap the Data to calculate the PF for NYG and SFG

The code below finds the year, the total home runs, and total at bats that were taken home and away by the New York Giants from 1951 to 1957.

In [3]:
# Initialize empty DataFrames to hold the data
columns = ['Year', 'HR', 'AB']
polo_bat_home = pd.DataFrame(columns=columns)
polo_bat_away = pd.DataFrame(columns=columns)

year = 1951
while year != 1958:
    # for each value of year, the code will fetch the corresponding page for Willie Mays
    url = requests.get(f"https://www.baseball-reference.com/teams/split.cgi?t=b&team=NYG&year={year}")
    time.sleep(5)
    soup = BeautifulSoup(url.text, 'html.parser')

    block = [i for i in soup.find_all(string = lambda text: isinstance(text, bs4.Comment)) if 'id="div_hmvis"' in i][0]

    # The selected comment is parsed to find the embedded tables  
    new_soup = BeautifulSoup(str(block), 'html.parser')
    table = new_soup.find('table').find('tbody')
    rows = table.find_all('tr')

    # For each row, the cells are found, the values for AB and HR are extracted, home vs away data are accumulated separately
    for row in rows:
        cols = row.find_all('td')
        AB = int(cols[4].text)
        HR = int(cols[9].text)
        if cols[0].text == "Home":
            polo_bat_home.loc[len(polo_bat_home.index)] = [year, HR, AB]
        elif cols[0].text == "Away":
            polo_bat_away.loc[len(polo_bat_away.index)] = [year, HR, AB]
    year += 1

KeyboardInterrupt: 

The code below finds the year, total home runs, and total at bats for the SFG play at home vs away from 1951 to 1957.

In [None]:
# Initialize empty DataFrames to hold the data
columns = ['Year', 'HR', 'AB']
candlestick_bat_home = pd.DataFrame(columns=columns)
candlestick_bat_away = pd.DataFrame(columns=columns)

year = 1960
while year != 1971:
    # for each value of year, the code will fetch the corresponding page for Willie Mays
    url = requests.get(f"https://www.baseball-reference.com/teams/split.cgi?team=SFG&t=b&year={year}")
    time.sleep(5)
    soup = BeautifulSoup(url.text, 'html.parser')

    block = [i for i in soup.find_all(string = lambda text: isinstance(text, bs4.Comment)) if 'id="div_hmvis"' in i][0]

    # The selected comment is parsed to find the embedded tables  
    new_soup = BeautifulSoup(str(block), 'html.parser')
    table = new_soup.find('table').find('tbody')
    rows = table.find_all('tr')

    # For each row, the cells are found, the values for AB and HR are extracted, home vs away data are accumulated separately
    for row in rows:
        cols = row.find_all('td')
        AB = int(cols[4].text)
        HR = int(cols[9].text)
        if cols[0].text == "Home":
            candlestick_bat_home.loc[len(candlestick_bat_home.index)] = [year, HR, AB]
        elif cols[0].text == "Away":
            candlestick_bat_away.loc[len(candlestick_bat_away.index)] = [year, HR, AB]
    year += 1

Objective 2: Determine PF for Candlestick (1960-71) and Polo Grounds (1951-57)

In [None]:
candlestick_home_ratio = (candlestick_bat_home['HR'].sum()/candlestick_bat_home['AB'].sum())
candlestick_away_ratio = (candlestick_bat_away['HR'].sum()/candlestick_bat_away['AB'].sum())
candlestick_pf = candlestick_home_ratio/candlestick_away_ratio
candlestick_pf

np.float64(1.1077558928923663)

In [None]:
polo_home_ratio = (polo_bat_home['HR'].sum()/polo_bat_home['AB'].sum())
polo_away_ratio = (polo_bat_away['HR'].sum()/polo_bat_away['AB'].sum())
polo_pf = polo_home_ratio/polo_away_ratio
polo_pf

np.float64(1.8151915506166918)

Objective 3: Scrap for Willie Mays Data

In [None]:
columns = ['Year', 'Team', 'HR', 'AB']
mays_batting_stats = pd.DataFrame(columns=columns)

url = 'https://www.baseball-reference.com/players/m/mayswi01.shtml'
time.sleep(5)
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
table = soup.find('table').find('tbody')
rows = table.find_all('tr')

for row in rows: 
    cols = row.find_all('td')
    year = row.find('th').text
    if year not in ['1948', '1950', '1952', '1953', '1972', '1973', '1958', '1959']:
        team = cols[1].text
        hr = cols[10].text
        ab = cols[5].text
        mays_batting_stats.loc[len(mays_batting_stats.index)] = [year, team, hr, ab]

# drop the rows that are empty or from minor leagues
mays_batting_stats = mays_batting_stats[(mays_batting_stats['Team']=='NYG')|(mays_batting_stats['Team']=='SFG')]
mays_batting_stats

Unnamed: 0,Year,Team,HR,AB
2,1951,NYG,20,464
3,1954,NYG,41,565
4,1955,NYG,51,580
5,1956,NYG,36,578
6,1957,NYG,35,585
7,1960,SFG,29,595
8,1961,SFG,40,572
9,1962,SFG,49,621
10,1963,SFG,38,596
11,1964,SFG,47,578


Objective 4: Find total HRs for Mays if he played in Candlestick during 1951 to 1957.

In [None]:
# find adjustment factor for Park Grounds
pg_adjustment_factor = (1+polo_pf)/2
pg_adjustment_factor

np.float64(1.407595775308346)

In [None]:
# find the adjustment factor for Candlestick Park
candlestick_adjustment_factor = (1+candlestick_pf)/2
candlestick_adjustment_factor

np.float64(1.0538779464461832)

In [None]:
# Write function to adjust the number of home runs Mays hits in Park Grounds to the amount he would have hit in Candlestick
def adjust_pg_to_candlestick(row):
    if row['Team'] == 'NYG':
        adj_hr = int(row['HR'])/pg_adjustment_factor
        return adj_hr/candlestick_adjustment_factor
    else:
        return int(row['HR'])

In [None]:
# find the Candlestick Adjusted Hrs and update the dataframe to include these
mays_batting_stats['Candlestick Adj HRs'] = mays_batting_stats.apply(adjust_pg_to_candlestick, axis=1)
mays_batting_stats

Unnamed: 0,Year,Team,HR,AB,Candlestick Adj HRs
2,1951,NYG,20,464,13.48223
3,1954,NYG,41,565,27.638571
4,1955,NYG,51,580,34.379686
5,1956,NYG,36,578,24.268014
6,1957,NYG,35,585,23.593902
7,1960,SFG,29,595,29.0
8,1961,SFG,40,572,40.0
9,1962,SFG,49,621,49.0
10,1963,SFG,38,596,38.0
11,1964,SFG,47,578,47.0


In [None]:
total_candlestick_hrs = mays_batting_stats['Candlestick Adj HRs'].sum()
total_candlestick_hrs

np.float64(519.3624024385942)

Conclusion: Mays would have hit about 519 home runs if he played in Candlestick from 1951-1957 in his career.

Objective 5: Find Park Adjusted Home Runs for all stadiums for analysis

In [None]:
def find_adj_hrs(row):
    if row['Team']=='NYG':
        return int(row['HR'])/pg_adjustment_factor
    elif row['Team']=='SFG':
        return int(row['HR'])/candlestick_adjustment_factor
    
mays_batting_stats['Adj HRs'] = mays_batting_stats.apply(find_adj_hrs, axis=1)
total_adj_hrs = mays_batting_stats['Adj HRs'].sum()
total_adj_hrs

np.float64(505.7640028684866)