In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [4]:
#from athletic net inspect element
#this website: https://www.athletic.net/CrossCountry/rankings/list/63892/m/5000
paid_table_logic = {
    'year_codes' : {'2024': '73596','2023': '68585','2022': '63892','2010': '11527'},
    'sports' : {'CrossCountry'},
    'gender' : {'m', 'f'},
    'distance' : {'5000'}, #tons of legnths.
    'high_school_only' : 'restrict=true'

}


In [5]:
team_url_parameters = {
    'sport': ['cross-country'],
    'country': ['usa'],
    'level': ['club', 'college', 'high-school', 'middle-school'],
    'state': ['alabama', 'alaska', 'arizona', 'arkansas', 'california', 'colorado', 
               'connecticut', 'delaware', 'florida', 'georgia', 'hawaii', 'idaho',
               'illinois', 'indiana', 'iowa', 'kansas', 'kentucky', 'louisiana',
               'maine', 'maryland', 'massachusetts', 'michigan', 'minnesota',
               'mississippi', 'missouri', 'montana', 'nebraska', 'nevada',
               'new-hampshire', 'new-jersey', 'new-mexico', 'new-york',
               'north-carolina', 'north-dakota', 'ohio', 'oklahoma', 'oregon',
               'pennsylvania', 'rhode-island', 'south-carolina', 'south-dakota',
               'tennessee', 'texas', 'utah', 'vermont', 'virginia', 'washington',
               'west-virginia', 'wisconsin', 'wyoming'],
    'year': list(range(2005, 2025))
}

In [6]:
def construct_state_urls(team_url_parameters, select_params=None):
    base_url = "https://www.athletic.net"
    urls = []

    # Helper function to get the selected values or the entire list
    def get_values(param, select_params):
        if select_params and param in select_params:
            selected = select_params[param]
            if isinstance(selected, list):
                return selected
            return [selected]
        return team_url_parameters[param]

    countries = get_values("country", select_params)
    sports = get_values("sport", select_params)
    levels = get_values("level", select_params)
    states = get_values("state", select_params)
    years = get_values("year", select_params)

    for country in countries:
        for sport in sports:
            for level in levels:
                for state in states:
                    for year in years:
                        url = f"{base_url}/{sport}/{country}/{level}/{state}/{year}"
                        urls.append(url)

    return urls

In [7]:
kurtis_params = {'year': [2011], 
                 'state': 'oregon',
                 'level': 'high-school',
                 'country': 'usa',
                 'sport': 'cross-country'}
state_year_urls = construct_state_urls(team_url_parameters, select_params= kurtis_params)

In [8]:
# Generate GET requests to each URL
responses = []
for url in state_year_urls:
    response = requests.get(url, verify=False)
    responses.append(response)
    print(f"URL: {url} - Status Code: {response.status_code}")

# Optionally, process the responses
for response in responses:
    if response.status_code == 200:
        # Process the response content
        print(response.content)
    else:
        print(f"Failed to retrieve {response.url}")

URL: https://www.athletic.net/cross-country/usa/high-school/oregon/2011 - Status Code: 200
b'\r\n\r\n<!DOCTYPE html>\r\n<html id="html" lang="en" xmlns="http://www.w3.org/1999/xhtml">\r\n\r\n<head id="Head1">\r\n    <base href="/" />\r\n    <script type=\'text/javascript\'>\r\n        var googletag = googletag || {};\r\n        googletag.cmd = googletag.cmd || [];\r\n\r\n        var anetadslots = anetadslots || [];\r\n    </script>\r\n\r\n    <!-- Quantcast Tag, part 1 - part 2 in footer-->\r\n    <script type="text/javascript">\r\n        var _qevents = _qevents || []; (function () { var a = document.createElement("script"); a.src = ("https:" == document.location.protocol ? "https://secure" : "http://edge") + ".quantserve.com/quant.js"; a.async = !0; a.type = "text/javascript"; var b = document.getElementsByTagName("script")[0]; b.parentNode.insertBefore(a, b) })();\r\n    </script>\r\n\r\n    <title>\r\n\tTrack & Field, Cross Country Results, Statistics\r\n</title><meta name="applica



In [9]:
# Generate GET requests to each URL and find the href with class 'team-link me-1'
data = []
for url in state_year_urls:
    response = requests.get(url, verify=False)
    print(f"URL: {url} - Status Code: {response.status_code}")

    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        team_link = soup.find('a', class_='team-link me-1')
        if team_link and 'href' in team_link.attrs:
            href = team_link['href']
            if href.startswith("/team/") and "/cross-country" in href:
                data.append({"url": url, "team_link": href})
        else:
            data.append({"url": url, "team_link": None})
    else:
        data.append({"url": url, "team_link": None})

# Create a DataFrame from the collected data
df = pd.DataFrame(data)
print(df)



URL: https://www.athletic.net/cross-country/usa/high-school/oregon/2011 - Status Code: 200
                                                 url team_link
0  https://www.athletic.net/cross-country/usa/hig...      None


In [11]:
# Save response content to a text file

if response.status_code == 200:
    with open(f'response_2011_oregon.txt', 'w', encoding='utf-8') as file:
        file.write(response.text)
else:
    print(f"Failed to retrieve {response.url}")