In [3]:
import os
import time
from urllib.parse import quote

import requests
from requests.auth import HTTPProxyAuth
from dotenv import load_dotenv

# One shared HTTP session, reused by the cells that follow.
session = requests.Session()

# Load the environment variables from .env file
load_dotenv('/home/wjones/CC/Capstone/tbd2/Track/.env', override=True)

proxy_host = os.getenv('proxy_host')
proxy_port = os.getenv('proxy_port')
proxy_user = os.getenv('proxy_user')
proxy_pass = os.getenv('proxy_pass')

# Fail early with a readable message instead of building "http://...@None:None/".
if not proxy_host or not proxy_port:
    raise RuntimeError("proxy_host and proxy_port must be set in the .env file")

# Percent-encode the credentials: characters such as '@', ':' or '/' in a
# username/password would otherwise be parsed as URL delimiters.
if proxy_user:
    proxy_credentials = f"{quote(proxy_user, safe='')}:{quote(proxy_pass or '', safe='')}@"
else:
    proxy_credentials = ""  # no username configured -> send no credentials

proxy_url = f"http://{proxy_credentials}{proxy_host}:{proxy_port}/"

# The same HTTP proxy endpoint carries both plain and TLS traffic.
proxies = {
    "http": proxy_url,
    "https": proxy_url
}


url = 'https://www.tfrrs.org/results_search.html'

# timeout: requests waits forever by default, which would hang the kernel.
response = session.get(url, proxies=proxies, timeout=30)
response.raise_for_status()  # don't hand an error page to the parser
html = response.content


In [2]:
from bs4 import BeautifulSoup

# Parse the raw bytes fetched for the results-search landing page.
soup = BeautifulSoup(markup=html, features='html.parser')

In [3]:
import csv
from urllib.parse import urljoin

# FILEPATH: /home/wjones/CC/Capstone/tbd2/experiment_notebooks/web_scraping.ipynb
base_url = 'https://www.tfrrs.org/results_search_page.html'
params = {
    'page': 1,  # Start with page 1
    'search_query': '',  # No free-text search filter
    'with_month': '',  # No month filter
    'with_sports': 'track',  # Restrict the search to track meets
    'with_states': '',  # No state filter
    'with_year': '2023'  # Set the year you want to filter by
}

# Open the CSV file in append mode. Rows are [meet_name, meet_url]; this cell
# writes no header row, and re-running it appends the same meets again.
with open('csv/races_urls.csv', 'a', newline='') as csvfile:
    writer = csv.writer(csvfile)

    while True:
        # Pass the proxy explicitly, as the first request does, and bound the
        # wait so a stalled connection cannot hang the kernel.
        response = session.get(base_url, params=params, proxies=proxies, timeout=30)
        if response.status_code != 200:
            # Stop rather than parse an error page as if it were results.
            print(f"Could not retrieve page {params['page']} of {base_url}. Code: {response.status_code}")
            break
        soup = BeautifulSoup(response.content, 'html.parser')

        rows = soup.find_all('tr')
        for row in rows:
            a_tag = row.find('a')  # first link in the row, if any
            if a_tag and 'href' in a_tag.attrs:
                meet_name = a_tag.text.strip()
                # urljoin resolves site-relative hrefs against the search page
                # and leaves already-absolute hrefs untouched.
                meet_url = urljoin(base_url, a_tag.attrs['href'])

                # Write the meet name and URL to the CSV file
                writer.writerow([meet_name, meet_url])

        next_page_link = soup.find('a', rel='next')  # Adjust if the website uses different attributes
        if next_page_link and 'href' in next_page_link.attrs:
            params['page'] += 1  # Increment the page number
        else:
            break

        time.sleep(.5)  # server rest between pages


In [4]:
# Every table row in whatever page `soup` currently holds.
rows = soup.find_all(name='tr')

In [5]:
# Build one {'meet_name', 'tfrrs_url'} record for each table row that links somewhere.
meets = []
for row in rows:
    a_tag = row.find('a')  # first anchor inside the row, or None
    if a_tag is None or not a_tag.has_attr('href'):
        continue  # nothing to record for rows without a link target

    url = a_tag['href']
    meet_name = a_tag.get_text().strip()

    # Site-relative links get the TFRRS origin prepended; anything else is kept verbatim.
    if url.startswith('/'):
        full_url = f"https://www.tfrrs.org{url}"
    else:
        full_url = url

    meets.append(dict(meet_name=meet_name, tfrrs_url=full_url))


In [6]:
# Dump the collected records, one dict per line (prints nothing when the list is empty).
if meets:
    print(*meets, sep='\n')

{'meet_name': 'LSU Purple Tiger', 'tfrrs_url': 'https://www.tfrrs.org/results/76417/LSU_Purple_Tiger'}
{'meet_name': 'Bearson-Gathje Classic', 'tfrrs_url': 'https://www.tfrrs.org/results/76704/Bearson-Gathje_Classic'}
{'meet_name': '2023 Rod McCravy Memorial Track & Field', 'tfrrs_url': 'https://www.tfrrs.org/results/76211/2023_Rod_McCravy_Memorial_Track__Field_'}
{'meet_name': 'Potts Invitational', 'tfrrs_url': 'https://www.tfrrs.org/results/75281/Potts_Invitational_'}
{'meet_name': '2023 Graduate Classic', 'tfrrs_url': 'https://www.tfrrs.org/results/75972/2023_Graduate_Classic'}
{'meet_name': 'Dutch Early Bird Pentathlon', 'tfrrs_url': 'https://www.tfrrs.org/results/75985/Dutch_Early_Bird_Pentathlon'}
{'meet_name': 'Jimmy Carnes Invitational (College/College-Age)', 'tfrrs_url': 'https://www.tfrrs.org/results/75574/Jimmy_Carnes_Invitational_College_College-Age'}
{'meet_name': 'Monmouth Midwest Indoor Invitational', 'tfrrs_url': 'https://www.tfrrs.org/results/76732/_Monmouth_Midwest_In

In [7]:

from urllib.parse import urljoin

# Distance events to keep; matched as substrings of the link text on each meet page.
events_we_care_about = ['800 Meters', '1500 Meters', '3000 Steeplechase', '5000 Meters', '10,000 Meters']

# Open the CSV written above
with open('csv/races_urls.csv', 'r', newline='') as file:
    reader = csv.reader(file)
    # The URL is in the second column. Skip blank/short rows and anything that
    # is not an http(s) URL (e.g. a header row) instead of crashing on it.
    meet_urls = [row[1] for row in reader if len(row) > 1 and row[1].startswith('http')]
results = []

for meet_url in meet_urls:
    time.sleep(.5) # server rest
    # Same proxy as the first request; timeout keeps a stalled fetch from hanging the run.
    response = session.get(meet_url, proxies=proxies, timeout=30)
    if response.status_code != 200:
        print(f"Could not retrieve {meet_url}. Code: {response.status_code}")
        continue
    soup = BeautifulSoup(response.content, 'html.parser')

    tables = soup.find_all('table', class_='tablesaw')
    for table in tables:
        rows = table.find_all('tr')
        for row in rows:
            a_tag = row.find('a')  # first link in the row, if any
            if a_tag and 'href' in a_tag.attrs:
                event_name = a_tag.text.strip()
                if any(event in event_name for event in events_we_care_about): # check if the event is one we care about
                    # Store an absolute URL: the results cell fetches this value
                    # directly, so a relative href would not be requestable.
                    event_url = urljoin(meet_url, a_tag.attrs['href'])
                    results.append({
                        'meet_url': meet_url,
                        'event_name': event_name,
                        'event_url': event_url
                    })

# One row per matching event link, with a header row.
with open('csv/sections.csv', 'w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=['meet_url', 'event_name', 'event_url'])
    writer.writeheader()
    writer.writerows(results)


In [8]:
import pandas as pd

# Read the CSV file
df = pd.read_csv('csv/sections.csv')
results = []

# The third column holds the event URL; drop empty cells so NaN is never requested.
for url in df.iloc[:, 2].dropna():
    # Pass the proxy explicitly and bound the wait so one stalled page cannot hang the run.
    response = session.get(url, proxies=proxies, timeout=30)
    if response.status_code != 200:
        print(f"Could not retrieve {url}. Code: {response.status_code}")
        time.sleep(.5)  # server rest
        continue
    soup = BeautifulSoup(response.text, 'html.parser')

    # string split the url on '/', the meet id is the third last element
    # NOTE the meet id in track races is the second numeric element and is 7 digits long
    # NOTE the meet id TFRRS uses is not the same as the DB meet ID which is a primary key
    meet_id = url.split('/')[-3]

    table = soup.find('table', class_='tablesaw')
    if table:
        rows = table.find_all('tr')[1:]  # first skip the header row
        for row in rows:
            cols = row.find_all('td')
            if len(cols) > 4:  # Ensure there are enough columns
                place = cols[0].text.strip()
                athlete_link = cols[1].find('a')
                athlete_url = athlete_link.get('href') if athlete_link else None
                # Deliberately NOT named `time`: that would overwrite the `time`
                # module and break every later time.sleep() call in the notebook.
                finish_time = cols[4].text.strip()

                results.append({
                    'meet_id': meet_id,
                    'athlete_url': athlete_url,
                    'place': place,
                    'time': finish_time
                })

    time.sleep(.5)  # server rest

# write results to put in the raceResults table.
with open('csv/raceResults.csv', 'w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=['meet_id', 'athlete_url', 'place', 'time'])
    writer.writeheader()
    writer.writerows(results)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [6]:
import pandas as pd

# Load the data from the CSV files
df_sections = pd.read_csv('csv/sections.csv')

# The scraping cell appends bare [meet_name, meet_url] rows with no header, so
# name the columns explicitly rather than letting pandas consume the first
# meet as a header. If the file does carry a header line, it becomes an
# ordinary row whose 'meet_url' value matches nothing in the inner merge below.
# Re-running the scraper appends duplicates; keep one row per URL so the merge
# does not multiply section rows.
df_meet = pd.read_csv(
    'csv/races_urls.csv',
    header=None,
    names=['meet_name', 'meet_url'],
).drop_duplicates(subset='meet_url')

# Merge the dataframes on the 'meet_url' column
df_merged = pd.merge(df_sections, df_meet, on='meet_url')

# Write the merged dataframe to 'races.csv'
df_merged.to_csv('csv/races.csv', index=False)

In [11]:
import csv
from bs4 import BeautifulSoup
import re
import time  # re-imported so `time` is the module again even if an earlier cell rebound the name

months = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December'
]

# A date is a month name followed by a day number. Requiring whitespace and
# digits after the month stops names like "Augustana" from matching "August".
date_pattern = re.compile(r'\b(?:' + '|'.join(months) + r')\s+\d{1,2}\b')

with open('csv/races_urls.csv', 'r', newline='') as file:
    reader = csv.reader(file)
    meet_data = [row for row in reader]

updated_meet_data = []

for row in meet_data:
    # Skip blank/short rows and any row whose second column is not a URL
    # (e.g. a header line) rather than blindly dropping the first row.
    if len(row) < 2 or not row[1].startswith('http'):
        continue
    meet_name, meet_url = row[0], row[1]

    # get the HTML
    response = session.get(meet_url, proxies=proxies, timeout=30)

    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')

        # Date and location are located in 'panel-heading-normal-text'
        elements = soup.find_all(class_='panel-heading-normal-text')

        date = None
        location = None
        for element in elements:
            text = element.get_text(strip=True)
            # A date needs a month name followed by a day number, plus a comma
            if date_pattern.search(text) and ',' in text:
                date = text
            elif '-' in text:  # locations should always be dashed
                location = text
        updated_meet_data.append([meet_name, meet_url, date, location])
    else:
        print(f"Could not retrieve {meet_url}. Code: {response.status_code}")

    time.sleep(.5) # Server rest

with open('csv/updated_race_urls.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['meet_name', 'meet_url', 'date', 'location'])  # Write header
    writer.writerows(updated_meet_data)