# CS109 EDA for supershoes project
This notebook scrapes NCAA D1 indoor performances from tfrrs.org and saves them to CSV files. It scrapes the top 50 performances for each gender in the 5000m, 3000m, and mile for the past 7 seasons (2016-2017 through 2022-2023). Each year/event/gender combination is saved to its own CSV file in the ./data directory.

In [5]:
import os
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [6]:
# Lookup tables that translate a readable event name / season label
# into the numeric code that is substituted into the url and the
# table selector when a performance list is fetched.
event_dict = dict(
    [
        ("mile", "57"),
        ("3000m", "60"),
        ("5000m", "62"),
    ]
)

# Ordered from the most recent season to the oldest one.
year_dict = dict(
    [
        ("2022_2023", "3901"),
        ("2021_2022", "3492"),
        ("2020_2021", "3157"),
        ("2019_2020", "2770"),
        ("2018_2019", "2324"),
        ("2017_2018", "2124"),
        ("2016_2017", "1797"),
    ]
)

In [7]:
# helper function to convert html row into dictionaries
# for easy transformation into dataframe
def row_to_dictionary(row, column_names):
    """
    Description:
        turn one html table row into a {column name: cell text} mapping
        so that a list of rows can be handed straight to pd.DataFrame

    args:
        row: html row from beautiful soup eg. ds.select("tbody tr")[0]
        column_names: list of column names eg. ["Rank", "Name", "Year", "School", "Performance"]

    returns:
        dict pairing each column name with the whitespace-stripped text of
        the <td> cell in the same position; pairing stops at whichever of
        the two sequences is shorter
    """
    # read every cell up front, trimming surrounding whitespace
    cell_texts = [cell.text.strip() for cell in row.select("td")]

    record = {}
    for name, text in zip(column_names, cell_texts):
        record[name] = text
    return record


def fetch_table(event, year_range, gender, timeout=30):
    """
    Description:
        fetches the html table from tfrrs.org and converts it into a dataframe
        for the given event, year, and gender.
        Note: the url points at the indoor qualifying list, so only indoor
        performances are scraped, not outdoor. The list is expected to hold
        the top 50 performances per event (not enforced here).
    args:
        event: string, key of event_dict eg. "mile" or "3000m"
        year_range: string, key of year_dict eg. "2022_2023"
        gender: string, "m" or "f"
        timeout: seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole scrape
    returns:
        pandas DataFrame with one row per <tr> of the event table; columns
        are the table headers with spaces replaced by underscores, and the
        first column renamed to "Rank"
    raises:
        KeyError: event or year_range is not a key of its lookup table
        requests.HTTPError: the page did not answer with status code 200
        requests.RequestException: connection problems or timeout
        ValueError: the page holds no table (or no header) for this
            event/gender combination
    """
    # modify url and table tag for event, gender, and year
    list_id = year_dict[year_range]
    url = f"https://tf.tfrrs.org/lists/{list_id}/{year_range}_NCAA_Div_I_Indoor_Qualifying_FINAL?gender={gender}"
    table_tag = f"div.row.gender_{gender}.standard_event_hnd_{event_dict[event]}"

    # a browser-like User-Agent is sent instead of the default one of requests
    # NOTE(review): presumably the site rejects the default agent - confirm
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers, timeout=timeout)

    # check that the page is valid; an explicit raise (unlike assert)
    # still runs when python is started with -O
    if response.status_code != 200:
        raise requests.HTTPError(
            f"Error: status code {response.status_code} for {url}",
            response=response,
        )

    # fetch the html and make it soupy
    performance_list = BeautifulSoup(response.text, "html.parser")

    # select the event tag; select_one returns None when nothing matches
    ds = performance_list.select_one(table_tag)
    if ds is None:
        raise ValueError(f"Error: no table matching '{table_tag}' found at {url}")

    # access column names
    header = ds.select_one("thead")
    columns = (
        [item.text.replace(" ", "_") for item in header.select("th")]
        if header is not None
        else []
    )
    if not columns:
        raise ValueError(f"Error: no column headers found for '{table_tag}' at {url}")

    # put in new column name for rank
    columns[0] = "Rank"

    # create dataframe of performances
    return pd.DataFrame(
        [row_to_dictionary(row, columns) for row in ds.select("tbody tr")]
    )
    
def time_to_sec(t):
    """
    Description: convert time string to seconds
        Surrounding whitespace and the marker symbols @ and # are removed
        before parsing. The string is split on ":"; the last field is the
        (possibly fractional) seconds and every field before it is a whole
        number in base 60, so "ss.xx", "m:ss.xx" and "h:mm:ss.xx" all work.
    args:
        t: string, eg. "4:01.23", "13:45.60@" or "59.12"
    returns:
        float, total number of seconds
    raises:
        ValueError: if a field is empty or not numeric
    """
    cleaned = t.strip(" \t\r\n@#")
    *leading, seconds = cleaned.split(":")

    # fold hours/minutes into a running count of minutes
    whole = 0
    for field in leading:
        whole = whole * 60 + int(field)

    return whole * 60 + float(seconds)

def preprocess(df):
    """
    Description: preprocess dataframe
        - adds a "Time_seconds" column holding each value of the "Time"
          column converted to seconds with time_to_sec
    args:
        df: dataframe with a "Time" column of time strings
    returns:
        the same dataframe object, modified in place
    """
    seconds = df["Time"].apply(time_to_sec)
    df["Time_seconds"] = seconds
    return df

In [9]:
# data directory: every year/event/gender table is written here as a csv
save_dir = "data/"

# Create the directory if it doesnt exist. exist_ok=True makes the call
# a no-op when the directory is already there, so no separate check is needed.
os.makedirs(save_dir, exist_ok=True)

# loop through events, genders, and years
for gender in ["m", "f"]:
    for event in event_dict:
        for year_range in year_dict:
            file = os.path.join(save_dir, f"{year_range}_{event}_{gender}.csv")
            # skip if file already exists, so re-running the cell only
            # downloads the tables that are still missing
            if os.path.exists(file):
                continue
            df = preprocess(fetch_table(event, year_range, gender))
            df.to_csv(file, index=False)
            print(f"Saved {file}")
            # pause between requests to avoid hammering the server
            time.sleep(1)