In [10]:
# config.py
import os
import requests
import pandas as pd
import concurrent.futures
import json

# Read the API key from the environment so it is not stored in the notebook.
# The literal fallback is only a placeholder, not a working key.
API_KEY = os.getenv("TRI_API_KEY", "your_api_key_here")

# Sent with every request; the key travels in the "apikey" header.
HEADERS = {"apikey": API_KEY}
# Baked into RANKING_URL below as the `limit` query parameter.
NUMBER_OF_ATHLETES = 10

BASE_URL = "https://api.triathlon.org/v1"
ATHLETE_SEARCH_URL = f"{BASE_URL}/search/athletes"
# Doubled braces survive the f-string so callers can fill the placeholder later with .format().
ATHLETE_RESULTS_URL = f"{BASE_URL}/athletes/{{athlete_id}}/results"
# Query parameter was previously misspelled as "ouput".
# NOTE(review): `output=basic` presumably trims the payload -- confirm it still
# includes `categories`, which get_athlete_data reads.
ATHLETE_DATA_URL = f"{BASE_URL}/athletes/{{athlete_id}}?output=basic"
RANKING_URL = f"{BASE_URL}/rankings/{{ranking_id}}?limit={NUMBER_OF_ATHLETES}"

In [27]:
def get_athlete_id(athlete_name):
    """
    Look up an athlete by name and return the ID of the first search hit.

    Parameters:
        athlete_name (str): Free-text name sent as the ``query`` parameter.
    Returns:
        The ``athlete_id`` value of the first entry in the response's ``data`` list.
    Raises:
        ValueError: If the search succeeds but returns no entries.
        Exception: If the search endpoint answers with a non-200 status.
    """
    search_response = requests.get(
        ATHLETE_SEARCH_URL,
        params={"query": athlete_name},
        headers=HEADERS,
    )

    # Guard clauses first: fail fast on transport-level and empty-result problems.
    if search_response.status_code != 200:
        raise Exception(f"Search API failed with status code {search_response.status_code}")

    matches = search_response.json().get("data", [])
    if not matches:
        raise ValueError(f"Athlete '{athlete_name}' not found.")

    # Only the first hit is used; later matches are ignored.
    return matches[0]["athlete_id"]

In [11]:
def get_athlete_data(athlete_id, timeout=30):
    """
    Retrieve athlete details and return them as a one-row DataFrame.

    Parameters:
        athlete_id (int): The athlete's ID.
        timeout (float | tuple): Seconds passed to ``requests.get`` so a stalled
            connection raises instead of blocking indefinitely.

    Returns:
        pd.DataFrame: A single row with athlete_id, full_name, gender,
        country_name, age and one boolean-like ``category_*`` column per
        category flag (``False`` when the flag is absent).

    Raises:
        Exception: If the endpoint answers with a non-200 status.
    """
    url = ATHLETE_DATA_URL.format(athlete_id=athlete_id)
    response = requests.get(url, headers=HEADERS, timeout=timeout)

    # Fail loudly here; previously a non-200 response fell through to the
    # return statement and crashed with UnboundLocalError on `athlete_info`.
    if response.status_code != 200:
        raise Exception(f"Athlete Data API failed with status code {response.status_code}")

    # `or {}` also covers an explicit null, which a .get() default would not.
    data = response.json().get("data") or {}

    # The original code treated `categories` as a JSON-encoded string; also
    # accept an already-decoded dict or a missing/null value.
    categories = data.get("categories") or {}
    if isinstance(categories, str):
        categories = json.loads(categories)
    if not isinstance(categories, dict):
        categories = {}

    # Extract relevant athlete details
    athlete_info = {
        "athlete_id": data.get("athlete_id"),
        "full_name": data.get("athlete_full_name"),
        "gender": data.get("athlete_gender"),
        "country_name": data.get("athlete_country_name"),
        "age": data.get("athlete_age"),
        "category_to": categories.get("to", False),
        "category_coach": categories.get("coach", False),
        "category_athlete": categories.get("athlete", False),
        "category_medical": categories.get("medical", False),
        "category_paratriathlete": categories.get("paratriathlete", False),
    }

    return pd.DataFrame([athlete_info])

In [12]:
 # Example: fetch the details of a single athlete by ID
get_athlete_data(5595)

Unnamed: 0,athlete_id,full_name,gender,country_name,age,category_to,category_coach,category_athlete,category_medical,category_paratriathlete
0,5595,Trent Chapman,male,Australia,48,False,False,True,False,False


In [35]:
def process_race_data(page_data):
    """Convert a list of race events (from one page) into a DataFrame.

    Parameters:
        page_data (list[dict]): Event records. Every field is read with
            ``.get``, so missing keys become ``None`` instead of raising.

    Returns:
        pd.DataFrame: One row per event with the columns EventID, EventName,
        Venue, EventDate, Country, Position, CategoryName, TotalTime,
        SwimTime, T1, BikeTime, T2, RunTime (in that order). An empty
        ``page_data`` yields an empty DataFrame with no columns.
    """
    # Splits are positional: index 0..4 maps onto these columns in order.
    split_columns = ("SwimTime", "T1", "BikeTime", "T2", "RunTime")

    records = []
    for event in page_data:
        # `or []` handles an explicit null as well as a missing key.
        splits = event.get("splits") or []
        categories = event.get("event_categories") or []

        # Skip categories that carry no name; str.join raises TypeError on None.
        category_names = [
            name
            for name in (cat.get("cat_name") for cat in categories)
            if name is not None
        ]

        record = {
            "EventID": event.get("event_id"),
            "EventName": event.get("event_title"),
            "Venue": event.get("event_venue"),
            "EventDate": event.get("event_date"),
            "Country": event.get("event_country"),
            "Position": event.get("position"),
            "CategoryName": ", ".join(category_names) if category_names else "Unknown",
            "TotalTime": event.get("total_time"),
        }
        # Any split the event does not provide is recorded as None.
        for position, column in enumerate(split_columns):
            record[column] = splits[position] if position < len(splits) else None

        records.append(record)
    return pd.DataFrame(records)

def get_athlete_results(athlete_id, timeout=30):
    """Retrieve and process all race results for an athlete using pagination.

    Parameters:
        athlete_id (int): The athlete whose results are requested.
        timeout (float | tuple): Seconds passed to every ``requests.get`` call.
            Without it a stalled connection would block forever, which matters
            most when this function runs inside a thread pool.

    Returns:
        pd.DataFrame: All pages concatenated with a fresh index, or an empty
        DataFrame when no page contained any events.

    Raises:
        Exception: If any page request answers with a non-200 status.
    """
    url = ATHLETE_RESULTS_URL.format(athlete_id=athlete_id)
    page_frames = []  # One DataFrame per non-empty page

    while url:
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        if response.status_code != 200:
            raise Exception(f"Athlete Results API failed: {response.status_code}")

        payload = response.json()
        # `or []` covers a null "data" field, which would otherwise break iteration.
        df_page = process_race_data(payload.get("data") or [])
        # Empty pages add nothing to the result, so keep them out of pd.concat.
        if not df_page.empty:
            page_frames.append(df_page)

        # The response advertises the following page; a missing/None value ends the loop.
        url = payload.get("next_page_url")

    if not page_frames:
        return pd.DataFrame()
    return pd.concat(page_frames, ignore_index=True)

# Example usage: this performs a live, paginated API fetch as soon as the cell runs.
athlete_id = 51201  # Replace with your actual athlete ID
df_final = get_athlete_results(athlete_id)

In [None]:
def get_ranking_data(rank):
    """Fetch the raw JSON payload of one ranking.

    Parameters:
        rank: Ranking ID substituted into RANKING_URL.

    Returns:
        The decoded JSON body of the response.

    Raises:
        Exception: If the endpoint answers with a non-200 status.
    """
    ranking_url = RANKING_URL.format(ranking_id=rank)
    # HEADERS already carries the API key; extend the headers here if a call needs more.
    resp = requests.get(ranking_url, headers=HEADERS)

    if resp.status_code != 200:
        raise Exception(f"Ranking API failed with status: {resp.status_code}")

    return resp.json()

def get_top_athlete_ids(rank):
    """Return the athlete IDs listed in a ranking, in the order the API returns them.

    Parameters:
        rank: Ranking ID forwarded to get_ranking_data.

    Returns:
        list: The ``athlete_id`` of every ranking entry; empty when the payload
        has no ``data``/``rankings`` section.
    """
    payload = get_ranking_data(rank)

    # Entries are expected at payload["data"]["rankings"]; adjust if the API differs.
    data_section = payload.get("data", {})
    ranking_entries = data_section.get("rankings", [])

    ids = []
    for entry in ranking_entries:
        ids.append(entry["athlete_id"])
    return ids


#male_ids = get_top_athlete_ids(13)
#female_ids = get_top_athlete_ids(14)

Top Male Athlete IDs: [104034, 96742, 30249, 63162, 94446, 80795, 86042, 55495, 105480, 74419, 11143, 98121, 132889, 54985, 125064, 164894, 49411, 49390, 92397, 128063, 103632, 96040, 167968, 63428, 98098, 63564, 92430, 39264, 63429, 110784, 111852, 104128, 74423, 76434, 67535, 98547, 83434, 106885, 104983, 96667, 162549, 33705, 135049, 79706, 56153, 56550, 106595, 116279, 110356, 123392, 123538, 125294, 69143, 103634, 80601, 165479, 116270, 155563, 90031, 103124, 98130, 158362, 127901, 93364, 50700, 94517, 63402, 86080, 61357, 83242, 41322, 83081, 127536, 111851, 94563, 144208, 51605, 144053, 115699, 103258, 54025, 56027, 113561, 123682, 115570, 139770, 47611, 40311, 21622, 41870, 125302, 63651, 103530, 104247, 156380, 171947, 173756, 163852, 93402, 58566, 109797, 106994, 71738, 163850, 176644, 122645, 39974, 16392, 102726, 124975, 127186, 55960, 49084, 73078, 144247, 80663, 125287, 132948, 165278, 11001, 95497, 123620, 165596, 107730, 42996, 106946, 106582, 85863, 23656, 164818, 4183

In [None]:
# master_data_import.py
import pandas as pd
import concurrent.futures
from ranking import get_top_athlete_ids
from race_results import get_athlete_results

def fetch_all_athlete_results(athlete_ids, max_workers=10):
    """
    Fetch race results for many athletes in parallel threads.

    Parameters:
      athlete_ids (list): Athlete IDs to fetch results for.
      max_workers (int): Upper bound on worker threads.

    Returns:
      list: The non-empty result DataFrames, in completion order (not input
      order). Athletes whose fetch raised are reported on stdout and skipped.
    """
    collected = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Remember which athlete each future belongs to, for error reporting.
        pending = {}
        for one_id in athlete_ids:
            pending[pool.submit(get_athlete_results, one_id)] = one_id

        for done in concurrent.futures.as_completed(pending):
            owner_id = pending[done]
            try:
                frame = done.result()
                if frame.empty:
                    continue
                collected.append(frame)
            except Exception as exc:
                # Best-effort: one failing athlete must not abort the whole batch.
                print(f"Error fetching data for athlete {owner_id}: {exc}")
    return collected

def get_all_top_race_data(male_ranking_id=13, female_ranking_id=14, max_workers=10):
    """
    Retrieve race data for the top male and female athletes concurrently.

    Parameters:
      male_ranking_id: Ranking ID queried for the male list (default 13,
        the value this function previously hard-coded).
      female_ranking_id: Ranking ID queried for the female list (default 14,
        likewise the previous hard-coded value).
      max_workers (int): Thread count forwarded to fetch_all_athlete_results.

    Returns:
      pd.DataFrame: Consolidated DataFrame containing race results from top
      athletes, or an empty DataFrame when nothing was collected.
    """
    print("Fetching top male athlete IDs...")
    male_ids = get_top_athlete_ids(male_ranking_id)
    print(f"Found {len(male_ids)} male athletes.")

    print("Fetching top female athlete IDs...")
    female_ids = get_top_athlete_ids(female_ranking_id)
    print(f"Found {len(female_ids)} female athletes.")

    all_ids = male_ids + female_ids
    print(f"Fetching race results for {len(all_ids)} athletes concurrently...")
    df_list = fetch_all_athlete_results(all_ids, max_workers=max_workers)

    # pd.concat raises on an empty list, so handle "nothing fetched" explicitly.
    if not df_list:
        return pd.DataFrame()
    return pd.concat(df_list, ignore_index=True)

def save_to_database(df, db_filename="triathlon_results.db"):
    """
    Save the final DataFrame to a SQLite database using SQLAlchemy.

    Rows are appended to the ``race_results`` table, so calling this twice
    with the same data stores the rows twice.

    Parameters:
      df (pd.DataFrame): The DataFrame with race results.
      db_filename (str): The SQLite database file name.
    """
    from sqlalchemy import create_engine

    engine = create_engine(f"sqlite:///{db_filename}")
    try:
        df.to_sql("race_results", engine, if_exists="append", index=False)
    finally:
        # Release the engine's pooled connections even when the write fails.
        engine.dispose()
    print(f"Data saved to database '{db_filename}' in table 'race_results'.")

In [None]:
def main():
    """Interactive entry point: look up one athlete or import the top athletes.

    Option 1 asks for an athlete name, resolves it to an ID, fetches that
    athlete's results and writes them to ``<name>_past_races.csv`` in the
    current directory. Option 2 fetches results for the top ranked athletes
    and appends them to the SQLite database. Any other input exits with a
    message. Errors in option 1 are printed rather than raised.
    """
    print("Choose your option:")
    print("1. Search by athlete name (user input lookup)")
    print("2. Import top athlete data (top male and female athletes)")
    choice = input("Enter 1 or 2: ").strip()

    if choice == "1":
        # Strip so stray whitespace ends up in neither the query nor the filename.
        athlete_name = input("Enter athlete name: ").strip()
        try:
            athlete_id = get_athlete_id(athlete_name)
            print(f"Found athlete ID: {athlete_id} for '{athlete_name}'.")
            df = get_athlete_results(athlete_id)
            # The name is user input: replace path separators and other
            # characters that are unsafe in a filename so the CSV always
            # lands in the current directory.
            safe_name = "".join(
                ch if ch.isalnum() or ch in " -_.'" else "_"
                for ch in athlete_name
            )
            filename = f"{safe_name}_past_races.csv"
            df.to_csv(filename, index=False)
            print(f"Data saved as '{filename}'.")
        except Exception as e:
            print("Error:", e)
    elif choice == "2":
        final_df = get_all_top_race_data()
        if not final_df.empty:
            print("Import complete. Saving data to database...")
            save_to_database(final_df)
        else:
            print("No data collected.")
    else:
        print("Invalid choice. Exiting.")

# Start the interactive menu only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()

Choose your option:
1. Search by athlete name (user input lookup)
2. Import top athlete data (top male and female athletes)
Fetching top male athlete IDs...
Fetching top male athlete IDs...
Found 1000 male athletes.
Fetching top female athlete IDs...
Found 1000 male athletes.
Fetching top female athlete IDs...
Found 917 female athletes.
Fetching race data for athlete 104034 (1/1917)...
Found 917 female athletes.
Fetching race data for athlete 104034 (1/1917)...
Fetching race data for athlete 96742 (2/1917)...
Fetching race data for athlete 96742 (2/1917)...
Fetching race data for athlete 30249 (3/1917)...
Fetching race data for athlete 30249 (3/1917)...
Fetching race data for athlete 63162 (4/1917)...
Fetching race data for athlete 63162 (4/1917)...
Fetching race data for athlete 94446 (5/1917)...
Fetching race data for athlete 94446 (5/1917)...
Fetching race data for athlete 80795 (6/1917)...
Fetching race data for athlete 80795 (6/1917)...
Fetching race data for athlete 86042 (7/1917

KeyboardInterrupt: 