## Web Scraper
We are scrapping links from direct atheletics (https://www.directathletics.com/rankings.html) to scrap data from the main athletic database (https://www.tfrrs.org/) 

In [2]:
# Download all necessary libraries
%pip install requests beautifulsoup4 pandas numpy matplotlib seaborn 



You should consider upgrading via the 'd:\Python\python.exe -m pip install --upgrade pip' command.





## Scrapping Team Links 

In [3]:
import requests 
from bs4 import BeautifulSoup
import pandas as pd 
import csv
import random
from random import randint
from time import sleep

# URL Link for NCCA Teams
ncaa_url = 'https://www.directathletics.com/leagues/track/48.html'
naia_url = 'https://www.directathletics.com/leagues/track/53.html'
tfrrs_url = 'https://www.tfrrs.org'

# User_Agents to avoid web scraping detection
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 15_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Mobile/15E148 Safari/604.1',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:94.0) Gecko/20100101 Firefox/94.0',
]

# Get all the team links from the page
def get_team_links(html_content):
    """Parses HTML content to extract team links
    
    Parameters:
    - html (str): HTML content of the web page
    
    Returns:
    - List[String]: A list of strings where each string is a team link """
    
    soup = BeautifulSoup(html_content, 'html.parser')
    team_links = []
    
    links = soup.findAll('a', {'class': 'pLinks'})
    
    for team in links:
        if 'teams' in team['href']:
            team_links.append(team['href'])
    
    return team_links

# Get all the atheltes links 
def get_all_atheltes_links(urls):
    """Extracts all athlete links from a list of team URLs
    
    Parameters:
    - urls (List[String]): A list of strings where each string is a team URL
    
    Returns:
    - List[String]: A list of strings where each string is an athlete link """
    
    athlete_links = []
    
    for url in urls:
        html = make_request(url)
        soup = BeautifulSoup(html, 'html.parser')
        link = soup.find('script').contents[0].split()[2][1:-2]
        athlete_links.append(get_athelte_links(link))
    
    return athlete_links

# Get all the athlete links from one team page
def get_athelte_links(html):
    """Parses HTML content to extract athlete links
    
    Parameters:
    - html (str): HTML content of the web page
    
    Returns:
    - List[String]: A list of strings where each string is an athlete link """
    
    # Make Request to the URL
    html_content = make_request(html)
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Get if the team is male or female 
    gender = 'M' if 'm' in html.split('_') else 'F'
    
    athlete_links = []
    athlete_info = []
    
    links = soup.findAll('a', {'data-turbo-frame': '_top', 'data-turbo': 'false'})
    
    for athlete in links:
        athelte_link = athlete['href']
        if 'athletes' in athelte_link and 'html' in athelte_link and 'tfrrs' not in athelte_link:
            athlete_links.append(tfrrs_url + athlete['href'])
            
    print(athlete_links)
            
    for athlete in athlete_links:
        athlete_info.append(get_athelte_info(athlete, gender))
    
    return athlete_info

# Function to rotate user-agents
def get_random_user_agent():
    return random.choice(USER_AGENTS)

# Function to make a request to a URL
def make_request(url):
    headers = {
        'User-Agent': get_random_user_agent()
    }
    
     # Delay Each Call
    random_delay = randint(50, 175)/100
    sleep(random_delay)
    
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(e)
        return None

# Get all the athlete information 
def get_athelte_info(html, gender):
    """Parses HTML content to extract athlete info
    
    Parameters:
    - html (str): HTML of the web page
    
    Returns:
    - Dict[]: A list of strings where each string is an athlete link """
    # Make Request to the URL
    html_content = make_request(html)
    soup = BeautifulSoup(html_content, 'html.parser')
    
    team = html.split('/')[-2]
    athlete_name = html.split('/')[-1].replace('_', ' ')[:-5]
    performances = []
    
    event_elements = soup.find_all(class_='table table-hover >')
    
    for event in event_elements:
        event_header = event.find('th')
        # event_name = event_header.find('a').text.strip()
        event_date = event_header.find('span').text.strip()
        event_month = event_date.split()[0]
        event_day = event_date.split()[1][:-1]
        if '-' in event_day:
            event_day = event_day.split('-')[0]
        event_year = event_date.split()[-1]
        
        date_string = f"{event_year}-{event_month}-{event_day}"
        
        for row in event.find_all('tr'):
            columns = row.find_all('td')
            if len(columns) == 3:
                performance = {
                    'Date': date_string,
                    'Gender': gender,
                    'Event Name': columns[0].text.strip(),
                    'Athlete Name': athlete_name,
                    'Team': team,
                    'Mark': columns[1].find('a').text.strip(),
                    'Event Type': 'Individual',
                }

                performances.append(performance)
    
    return performances

# Load data into csv file 
def csv_load(athlete_data):
    fields = ['Date', 'Gender', 'Event Name', 'Athlete Name', 'Team', 'Mark', 'Event Type']
    with open("all_athletes.csv", "a", newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fields)
        writer.writeheader()
        
        # loop through teh data and write in csv
        writer.writerows(athlete_data)
            

In [6]:
# Get the HTML content of the NCAA and NAIA pages
ncaa_response = make_request(ncaa_url)
naia_response = make_request(naia_url)

# Get the team links from the HTML content
ncaa_teams = get_team_links(ncaa_response)
naia_teams = get_team_links(naia_response)    

all_ncaa_teams = ncaa_teams.append(naia_teams)

print(all_ncaa_teams)

# Get the athlete links from the team links
# athlete_data = get_all_atheltes_links(all_ncaa_teams)

# # TESTING
# print(all_ncaa_teams)


HTTPSConnectionPool(host='www.directathletics.com', port=443): Max retries exceeded with url: /leagues/track/48.html (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x00000294DB4E3B20>, 'Connection to www.directathletics.com timed out. (connect timeout=None)'))


KeyboardInterrupt: 