In [1]:
import requests
from bs4 import BeautifulSoup, Comment
import time
import pandas as pd
import numpy as np

In [14]:
# League index page on Baseball-Reference (query string: code=FRON, class=Ind);
# presumably the Frontier League, going by the CSV names used later in the notebook.
url = "https://www.baseball-reference.com/register/league.cgi?code=FRON&class=Ind"
# Fetched once when this cell runs; get_teams_by_year() reads this module-level
# `response`, so re-run this cell to refresh the page.
response = requests.get(url)
# Site root that the scraping functions prepend to the relative hrefs they find.
base_url = "https://www.baseball-reference.com"

def get_teams_by_year():
    """Map each season listed on the league index page to its teams.

    Reads the module-level ``response`` (the league index page fetched when the
    setup cell ran) and ``base_url``.

    Returns:
        dict | None: ``{year_text: [(team_url, team_name), ...]}`` where
        ``team_url`` is ``base_url`` plus the link's ``href`` and ``team_name``
        is the link text. Returns ``None`` (after printing a message) when the
        response status is not 200 or the league table is not in the page.
    """
    if response.status_code != 200:
        print('BREF BLOCKED')
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', {'class': 'suppress_all sortable stats_table'})
    if table is None:
        # soup.find returns None when nothing matches; calling .find_all on it
        # would raise AttributeError, so report it like the blocked case.
        print('LEAGUE TABLE NOT FOUND')
        return None

    teams_by_year = {}
    for body in table.find_all('tbody'):
        for row in body.find_all('tr'):
            year_cell = row.find('th', {'data-stat': 'year_ID'})
            year_link = year_cell.find('a') if year_cell is not None else None
            team_cell = row.find('td', {'data-stat': 'team_ID'})
            if year_link is None or team_cell is None:
                # Rows without a linked year or a team cell carry no team data.
                continue

            # Look the anchors up once and derive both URL and name from them.
            team_links = team_cell.find_all('a')
            teams_by_year.setdefault(year_link.text, []).extend(
                (base_url + link['href'], link.text) for link in team_links
            )
    return teams_by_year

def get_batting_df(year, teams_by_year):
    """Scrape the ``team_batting`` table of every team in one season.

    Args:
        year: Key into ``teams_by_year`` (the notebook passes strings such as
            ``'2023'``).
        teams_by_year: Mapping of year to a list of ``(team_url, team_name)``
            pairs, as returned by ``get_teams_by_year``.

    Returns:
        pandas.DataFrame | None: One row per ``<tr>`` of each team's batting
        table (all values are the cell text), with a ``Team`` column holding the
        team name. ``None`` when no team page yielded a batting table.

    Raises:
        KeyError: If ``year`` is not a key of ``teams_by_year``.
    """
    batting_data = []

    for team_url, team_name in teams_by_year[year]:
        # Local name chosen so the module-level `response` is not shadowed.
        team_response = requests.get(team_url)

        table = None
        if team_response.status_code == 200:
            soup = BeautifulSoup(team_response.content, 'html.parser')
            table = soup.find('table', {'class': 'sortable stats_table', 'id': 'team_batting'})
        else:
            # Same signal get_teams_by_year uses, with enough detail to retry.
            print(f'BREF BLOCKED ({team_response.status_code}): {team_url}')

        if table is not None:
            data = []
            for row in table.find('tbody').find_all('tr'):
                row_data = [td.get_text(strip=True) for td in row.find_all(['td', 'th'])]
                data.append(row_data)

            columns = [th.get_text(strip=True) for th in table.find('thead').find_all('th')]
            batting_df = pd.DataFrame(data, columns=columns)
            batting_df['Team'] = team_name
            batting_data.append(batting_df)

        # Pause after every request, including ones that were blocked or had no
        # table; previously those were followed immediately by the next request.
        time.sleep(10)

    if batting_data:
        return pd.concat(batting_data, ignore_index=True)
    return None

def _clean_position_df(positions_data):
    """Turn raw ``{'Name', 'Position'}`` records into the tidied positions frame."""
    # Naming the columns keeps them present even when nothing was scraped;
    # without this an empty list gives a column-less frame and a KeyError below.
    position_df = pd.DataFrame(positions_data, columns=['Name', 'Position'])
    if position_df.empty:
        return position_df

    # regex=False: every pattern here is a literal, and '*' / '?' are regex
    # metacharacters, so do not depend on the pandas-version-specific default.
    position = position_df['Position']
    position = position.str.replace('\nPosition:\n', '', regex=False)
    position = position.str.replace('\nPositions:\n ', '', regex=False).str.strip()
    # Keep only the first whitespace-separated token once commas are removed.
    position_df['Position'] = position.str.replace(',', '', regex=False).str.split().str[0]

    excluded = ['Pitcher', np.nan, 'Relief', 'Starting', 'Name']
    # .copy() so the Name assignment below writes to an independent frame
    # instead of a slice (avoids SettingWithCopyWarning).
    position_df = position_df[~position_df.Position.isin(excluded)].copy()

    name = position_df['Name']
    for marker in ('*', '#', '?'):
        name = name.str.replace(marker, '', regex=False)
    position_df['Name'] = name

    return position_df


def get_pos_df(year, teams_by_year, player_delay=0):
    """Scrape the listed position of every linked player on a season's team pages.

    For each team page, follows every player link in the ``team_batting`` table,
    reads the text of the first ``<p>`` inside the player's ``div.players`` block
    and reduces it to its first word.

    Args:
        year: Key into ``teams_by_year`` (the notebook passes ``'2023'``).
        teams_by_year: Mapping of year to a list of ``(team_url, team_name)``
            pairs, as returned by ``get_teams_by_year``.
        player_delay: Seconds to sleep after each player-page request. The
            default of 0 keeps the original behaviour (no pause between player
            pages); pass a positive number to spread the requests out.

    Returns:
        pandas.DataFrame: Columns ``Name`` and ``Position``. Rows whose position
        is ``'Pitcher'``, ``'Relief'``, ``'Starting'``, ``'Name'`` or missing
        are dropped, and ``*``, ``#`` and ``?`` are stripped from names. The
        frame is empty (columns still present) when nothing was scraped.

    Raises:
        KeyError: If ``year`` is not a key of ``teams_by_year``.
    """
    positions_data = []

    for team_url, team_name in teams_by_year[year]:
        team_response = requests.get(team_url)
        soup = BeautifulSoup(team_response.content, 'html.parser')
        table = soup.find('table', {'class': 'sortable stats_table', 'id': 'team_batting'})

        if table is not None:
            for row in table.find('tbody').find_all('tr'):
                player_name_tag = row.find('td', {'data-stat': 'player'})
                if player_name_tag is None:
                    # Row has no player cell; nothing to look up.
                    continue

                player_link = player_name_tag.find('a')
                if player_link is None:
                    continue

                player_name = player_name_tag.get_text(strip=True)
                player_page_url = f"{base_url}{player_link['href']}"
                player_page_response = requests.get(player_page_url)
                player_page_soup = BeautifulSoup(player_page_response.content, 'html.parser')

                positions_tag = player_page_soup.find('div', {'class': 'players'})
                paragraph = positions_tag.find('p') if positions_tag else None
                if paragraph is not None:
                    positions_data.append({'Name': player_name, 'Position': paragraph.text})

                if player_delay:
                    time.sleep(player_delay)

        # Pause between team pages.
        time.sleep(10)

    return _clean_position_df(positions_data)

In [3]:
# Build the year -> [(team_url, team_name), ...] mapping, then scrape each
# season's team batting tables. get_batting_df sleeps 10 s per scraped team
# page, so this cell takes a while to run.
tby = get_teams_by_year()
batting_23 = get_batting_df('2023', tby)
batting_22 = get_batting_df('2022', tby)
batting_21 = get_batting_df('2021', tby)


KeyboardInterrupt



In [None]:
# NOTE(review): get_pitching_df is not defined in the cells visible here —
# confirm it is defined elsewhere before running this cell on a fresh kernel.
# Also relies on `tby` from an earlier cell.
pitching_23 = get_pitching_df('2023', tby)
pitching_22 = get_pitching_df('2022', tby)
pitching_21 = get_pitching_df('2021', tby)

In [15]:
# Rebuild the year -> teams mapping and scrape player positions for 2023.
# get_pos_df requests every linked player page on every team page.
tby = get_teams_by_year()
positions_df = get_pos_df('2023', tby)

In [None]:
# Write the scraped season tables to CSV. Paths are relative to the notebook's
# working directory. get_batting_df returns None when no batting table was
# found, in which case the corresponding .to_csv call fails.
batting_23.to_csv('../../../Desktop/Joliet Slammers/Baseball Reference/2023 Frontier League Hitting.csv')
batting_22.to_csv('../../../Desktop/Joliet Slammers/Baseball Reference/2022 Frontier League Hitting.csv')
batting_21.to_csv('../../../Desktop/Joliet Slammers/Baseball Reference/2021 Frontier League Hitting.csv')

# pitching_* come from get_pitching_df, which is not defined in the visible cells.
pitching_23.to_csv('../../../Desktop/Joliet Slammers/Baseball Reference/2023 Frontier League Pitching.csv')
pitching_22.to_csv('../../../Desktop/Joliet Slammers/Baseball Reference/2022 Frontier League Pitching.csv')
pitching_21.to_csv('../../../Desktop/Joliet Slammers/Baseball Reference/2021 Frontier League Pitching.csv')

In [17]:
# Write the 2023 player positions (Name, Position) produced by get_pos_df to CSV.
positions_df.to_csv('../../../Desktop/Joliet Slammers/Baseball Reference/2023 Frontier League Positions.csv')