In [None]:

# 1. Import required libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
from random import randint
from time import sleep
from numpy import savetxt
import csv

# 2. Load complete list of schools
with open("all_teams.json") as f:
    all_schools_and_genders = json.load(f)
    f.close()

def parse_athlete_performance(html_content, team):
    """Parses HTML content to extract athlete performance details
    
    Parameters:
    - html_content (str): HTML content of the web page
    - team (str): The name of the team to include in the performance details
    
    Returns:
    - List[Dict]: A list of dictionaries where each dictionary contains performance details of an athlete """
    
    soup = BeautifulSoup(html_content, 'html.parser')
    performances = []

    table = soup.find('table', {'class': 'tablesaw'})
    if not table:
        return performances

    for row in table.find_all('tr')[1:]:
        columns = row.find_all('td')
        if len(columns) == 4:
            performance = {
                'Event Name': columns[0].text.strip(),
                'Athlete Name': columns[1].text.strip(),
                'Team': team,
                'Mark': columns[3].text.strip(),
                'Event Type': 'Individual' if len(columns[1].find_all('a')) == 1 else 'Relay'
            }

            performances.append(performance)

    return performances

# initialize back to empty list
all_performances = []


# add already parsed data from csv file
with open('performance_data.csv', 'r') as f:
    reader = csv.DictReader(f)
    for row in reader:
        all_performances.append(row)
    f.close()

# standard formatting of links in tfrrs
base_url = 'https://www.tfrrs.org'

# user agent so we don't get caught :)
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'}

restart_url = ""

fields = ['Event Name', 'Athlete Name', 'Team', 'Mark', 'Event Type']
            
# load to csv file while parsing
with open("performance_data.csv", "a") as f:
    writer = csv.DictWriter(f, fieldnames=fields)
    writer.writeheader()

    # loop through each school and gender to fetch and parse performances
    for school in all_schools_and_genders:
        team_name = school["team"]
        print(f"Getting data for {team_name}")
        url = base_url + school["url"]

        if (restart_url == "") or (restart_url == url):
                try:
                    restart_url = ""
                    response = requests.get(url, headers=headers, timeout=5)
                    if response.status_code == 200:
                        performances = parse_athlete_performance(response.text, team_name)
                        all_performances.extend(performances)
                        writer.writerows(performances)
                    else:
                        print(f"No data for {team_name}")
            
                    # sleep between each call so we don't get caught ;)
                    random_delay = randint(50, 175)/100
                    sleep(random_delay)
                except requests.exceptions.Timeout:
                    restart_url = url
                    print(restart_url)

    f.close()

# convert it into a dataframe
df = pd.DataFrame(all_performances)
df.head()

In [19]:
import requests
from bs4 import BeautifulSoup
import json
import csv
from random import randint
from time import sleep

# Load JSON list of team names and TFRRS URLs
with open("all_teams.json") as f:
    all_schools_and_genders = json.load(f)

# Function to scrape athlete IDs for a given TFRRS team URL
def scrape_athlete_ids(tfrrs_url, writer):
    response = requests.get(tfrrs_url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Extract athlete names and URLs
        athletes = soup.select('.tablesaw a')
        for athlete in athletes:
            athlete_name = athlete.text.strip()
            athlete_url = athlete['href']
            if athlete_url.startswith("/athletes"):
                print(athlete_url)
                writer.writerow({'Team': team_name, 'Athlete Name': athlete_name, 'Athlete URL': athlete_url})

    else:
        print("Failed to retrieve data")
    
    # Sleep between each call
    random_delay = randint(50, 175) / 100
    sleep(random_delay)

# Standard formatting of links in TFRRS
base_url = 'https://www.tfrrs.org'

# Load to CSV file while parsing
with open("all_athletes.csv", "a", newline='') as f:
    fields = ['Team', 'Athlete Name', 'Athlete URL']
    writer = csv.DictWriter(f, fieldnames=fields)
    writer.writeheader()

    # Scrape athlete IDs for each team
    for school in all_schools_and_genders:
        team_url = school["url"]
        team_name = school["team"]
        url = base_url + team_url
        scrape_athlete_ids(url, writer)
        print(f"Getting data for {team_name}")

/athletes/7791928/Tufts/Sarah_Pillone
/athletes/8020992/Tufts/Annika_Kral
/athletes/8020992/Tufts/Annika_Kral
/athletes/8629627/Tufts/Kaya_West
/athletes/8255094/Tufts/Alexandra_Dean
/athletes/8255101/Tufts/Grace_Ogden
/athletes/7791928/Tufts/Sarah_Pillone
/athletes/7927553/Tufts/Emma_Smith
/athletes/7791932/Tufts/Sydney_Spivy
/athletes/8021004/Tufts/Sarah_Firth
/athletes/7791931/Tufts/Madelyn_Silveira
/athletes/7791928/Tufts/Sarah_Pillone
/athletes/7791932/Tufts/Sydney_Spivy
/athletes/8700973/Tufts/Cecilia_LaBarge
/athletes/8020981/Tufts/Elizabeth_Graham
/athletes/7791931/Tufts/Madelyn_Silveira
/athletes/7791931/Tufts/Madelyn_Silveira
/athletes/8020978/Tufts/Jordan_Andrew.html
/athletes/8629619/Tufts/Sophia_Carpenter.html
/athletes/8632038/Tufts/Mia_Corso.html
/athletes/8020979/Tufts/Lily_Davis.html
/athletes/7408515/Tufts/Meghan_Davis.html
/athletes/8255094/Tufts/Alexandra_Dean.html
/athletes/8255095/Tufts/Julia_Dolce.html
/athletes/8255096/Tufts/Elizabeth_Donahue.html
/athletes/8371

KeyboardInterrupt: 

In [27]:
import requests
from bs4 import BeautifulSoup
import json
import csv
from random import randint
from time import sleep

# Load JSON list of team names and TFRRS URLs
with open("all_teams.json") as f:
    all_schools_and_genders = json.load(f)

# Function to scrape athlete URLS for a given TFRRS team URL
def scrape_athlete_urls(html_content, team):
    
    soup = BeautifulSoup(html_content, 'html.parser')
    athletes = soup.select(".tablesaw a")
    for athlete in athletes:
            athlete_name = athlete.text.strip()
            athlete_url = athlete['href']
            if athlete_url.startswith("/athletes"):
                athlete = {'Team': team_name, 'Athlete Name': athlete_name, 'Athlete URL': athlete_url}
                athletes.append(athlete)
    
    return athletes

# initialize back to empty list
all_athletes = []


# add already parsed data from csv file
with open('all_athletes.csv', 'r') as f:
    reader = csv.DictReader(f)
    for row in reader:
        all_athletes.append(row)
    f.close()

# standard formatting of links in tfrrs
base_url = 'https://www.tfrrs.org'

# user agent so we don't get caught :)
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'}

restart_url = ""
fields = ['Team', 'Athlete Name', 'Athlete URL']
            
# load to csv file while parsing
with open("all_athletes.csv", "a") as f:
    writer = csv.DictWriter(f, fieldnames=fields)
    writer.writeheader()

    # loop through each school and gender to fetch and parse performances
    for school in all_schools_and_genders:
        team_name = school["team"]
        print(f"Getting data for {team_name}")
        url = base_url + school["url"]

        if (restart_url == "") or (restart_url == url):
                try:
                    restart_url = ""
                    response = requests.get(url, headers=headers, timeout=5)
                    if response.status_code == 200:
                        athletes = scrape_athlete_urls(url, team_name)
                        all_athletes.extend(athletes)
                        writer.writerows(athletes)
                    else:
                        print(f"No data for {team_name}")
            
                    # sleep between each call so we don't get caught ;)
                    random_delay = randint(50, 175)/100
                    sleep(random_delay)
                except requests.exceptions.Timeout:
                    restart_url = url
                    print(restart_url)

    f.close()

# convert it into a dataframe
df_athletes = pd.DataFrame(all_athletes)
df_athletes.head()

Getting data for Tufts (F)


  soup = BeautifulSoup(html_content, 'html.parser')


Getting data for Tufts (M)
Getting data for Barton County CC (M)
https://www.tfrrs.org/teams/tf/KS_jcollege_m_Barton_County_CC.html
Getting data for Barton County CC (F)
Getting data for Bronx CC (M)
Getting data for Bronx CC (F)
Getting data for Butler CC (M)
Getting data for Butler CC (F)
Getting data for Central Arizona (M)
Getting data for Central Arizona (F)
Getting data for Cloud County CC (M)
Getting data for Cloud County CC (F)
Getting data for Coffeyville CC (M)
Getting data for Coffeyville CC (F)
Getting data for Colby CC (M)
Getting data for Colby CC (F)
Getting data for Garden City CC (M)
Getting data for Garden City CC (F)
Getting data for Glendale CC (M)
Getting data for Glendale CC (F)
Getting data for RC - Gloucester CC (M)
Getting data for RC - Gloucester CC (F)
Getting data for Highland CC (M)
Getting data for Highland CC (F)
Getting data for Hinds CC (M)
Getting data for Hinds CC (F)
Getting data for Lansing CC (M)
Getting data for Lansing CC (F)
Getting data for Par

Unnamed: 0,Event Name,Athlete Name,Team,Mark,Event Type
0,Event Name,Athlete Name,Team,Mark,Event Type
1,Team,Athlete Name,Athlete URL,,
2,Team,Athlete Name,Athlete URL,,
3,Team,Athlete Name,Athlete URL,,
4,Team,Athlete Name,Athlete URL,,


In [28]:
df_athletes

(636, 5)