In [1]:
import requests
import json
import html
import pandas as pd
import time
import re
import unicodedata



In [2]:
# Race identifiers and edition years; combined as '<race>_<year>' to build
# file names and dictionary keys throughout the notebook.
races = ['TOR330', 'TOR450']
years = [str(edition) for edition in range(2021, 2025)]

# Pause (seconds) passed to time.sleep() between downloads in the scraping cell.
delay_seconds = 15

## Scraping TORX Data from the website

In [3]:
# for race in races:
#     for year in years:
#         # URL of the JSON file
#         url = f'https://100x100trail.com/json/{race}{year}.json'

#         # Send a GET request to fetch the JSON data
#         response = requests.get(url)

#         # Check if the request was successful (status code 200)
#         if response.status_code == 200:
#             # Parse the JSON data
#             data = response.json()

#             # Optionally, save the data to a JSON file
#             with open(f'{race} Data/100x100trail/JSON/{race}_{year}.json', 'w') as f:
#                 json.dump(data, f, indent=4)

#             print(f"Data saved to '{race}_{year}.json'")
#         else:
#             print(f"Failed to retrieve data. Status code: {response.status_code}")

#         time.sleep(delay_seconds)

## Extracting Data from JSON data

In [4]:
# Module-level accumulators filled by parse_data(); the driver loop rebinds
# them to fresh lists before parsing each race/year file.
rows = []
aid_station_list = []

# Function to parse each entry
def parse_data(entries, race, year):
    """Flatten athlete entries into the global ``rows`` and ``aid_station_list``.

    For every entry one summary row is appended to ``rows``:
        [bib, name, team, sex, nationality, race, year, finisher_status]
    and, for every event listed under the entry's relationships, one row is
    appended to ``aid_station_list``:
        [name, bib, sex, nationality, race, year, category, place, time,
         finisher_status]

    Parameters
    ----------
    entries : iterable of dict
        Each item must provide ``entry["data"]["attributes"]`` (keys ``name``,
        ``team``, ``pettorale``, ``sesso``, ``nazionalita``, ``categoria``,
        ``finisher``) and ``entry["data"]["relationships"]["events"]["data"]``
        (items with ``title`` and ``start_date``).
    race, year :
        Race identifier and edition; both are stored in the rows as strings.

    Returns
    -------
    None
        Results are delivered only through the two module-level lists.

    Raises
    ------
    KeyError
        If an entry lacks one of the keys listed above.
    """
    # Loop-invariant: the rows always carry race/year as strings.
    race = f'{race}'
    year = f'{year}'

    for entry in entries:
        attributes = entry["data"]["attributes"]
        # Names arrive HTML-escaped; decode entities, then title-case.
        name = html.unescape(attributes["name"]).title()
        team = attributes["team"]
        bib_number = attributes["pettorale"]
        sex = attributes["sesso"]
        nationality = attributes["nazionalita"]
        category = attributes["categoria"]
        finisher_status = attributes["finisher"]

        # "Bib", "Name", "Team", "Sex", "Nationality", "Race", "Year", "Status"
        rows.append([bib_number, name, team, sex, nationality,
                     race, year, finisher_status])

        # One row per recorded event (place + timestamp) for this athlete.
        for event in entry["data"]["relationships"]["events"]["data"]:
            place = event['title']
            # Not called `time`, so the imported `time` module is not shadowed.
            event_time = event['start_date']

            aid_station_list.append([name, bib_number, sex, nationality, race, year,
                                     category, place, event_time, finisher_status])



In [5]:
# Function to remove special characters
def normalize_name(name):
    """Return ``name`` reduced to plain ASCII.

    The string is NFD-decomposed so accented letters split into a base letter
    plus combining marks; every non-ASCII code point is then discarded.
    Characters that have no decomposition (for example "ø" or "ß") are dropped
    entirely rather than transliterated.
    """
    decomposed = unicodedata.normalize('NFD', name)
    # Keep only code points in the ASCII range (0-127).
    return ''.join(ch for ch in decomposed if ord(ch) < 128)

## Looping through scraped data

In [8]:
# Per-edition tables keyed by '<race>_<year>' (e.g. 'TOR330_2021').
TORX_df = {}
aid_station_TORX_df = {}


for race in races:
    for year in years:
        # parse_data() appends to these module-level lists, so they are reset
        # before every file to keep editions from mixing.
        rows = []
        aid_station_list = []

        # Load the JSON data from a file. The handle is only needed for the
        # load itself, so it is closed before any DataFrame / Excel work.
        with open(f'{race} Data/100x100trail/JSON/{race}_{year}.json', 'r',
                  encoding='utf-8') as file:
            data = json.load(file)

        # parse data (fills `rows` and `aid_station_list`)
        parse_data(data, race, year)

        # One row per athlete:
        # bib_number, name, team, sex, nationality, race, year, finisher_status
        df = pd.DataFrame(rows, columns=[
            "Bib", "Name", "Team", "Sex", 'Nationality', 'Race', 'Year', 'Status'])

        # One row per athlete per recorded event:
        # name, bib_number, sex, nationality, race, year,
        # category, place, time, finisher_status
        aid_station_df = pd.DataFrame(aid_station_list, columns=[
            "Name", "Bib", 'Sex', 'Nationality', 'Race', 'Year',
            "Category", 'Place', 'Time', 'Status'])

        # Apply the SAME name clean-up to both tables so an athlete is spelled
        # identically in each export (previously only `df` had commas and
        # apostrophes handled, leaving 'Name' inconsistent between the two).
        for frame in (df, aid_station_df):
            frame['Name'] = (
                frame['Name']
                .str.strip()
                .str.replace(',', '', regex=False)
                .str.replace('\'', ' ', regex=False)
                .apply(normalize_name)
            )

        df.to_excel(f'{race} Data/100x100trail/{race}_{year}.xlsx', index=False)
        aid_station_df.to_excel(
            f'{race} Data/100x100trail/aid_station_{race}_{year}.xlsx', index=False)

        TORX_df[f'{race}_{year}'] = df
        aid_station_TORX_df[f'{race}_{year}'] = aid_station_df

        # The message previously contained a stray "f'" prefix in its text.
        print(f"Data saved to '{race}_{year}.xlsx'")

        print('*' * 30)



Data saved to f'TOR330_2021.xlsx'
******************************
Data saved to f'TOR330_2022.xlsx'
******************************
Data saved to f'TOR330_2023.xlsx'
******************************
Data saved to f'TOR330_2024.xlsx'
******************************
Data saved to f'TOR450_2021.xlsx'
******************************
Data saved to f'TOR450_2022.xlsx'
******************************
Data saved to f'TOR450_2023.xlsx'
******************************
Data saved to f'TOR450_2024.xlsx'
******************************


## Making sense of abbreviated nationalities

In [None]:
# Preview the last race/year table. The key is built from the configuration
# lists instead of relying on `race` / `year` left over from an earlier loop,
# and .head() keeps the rendered output short.
TORX_df[f'{races[-1]}_{years[-1]}'].head()

In [None]:
# Gather every distinct nationality code found across all race/year tables
unique_nationalities = set()

for race in races:
    for year in years:
        # Codes present in this edition, merged into the running set
        df_nationalities = TORX_df[f'{race}_{year}']['Nationality'].unique()
        unique_nationalities |= set(df_nationalities)

# Convert the set to a sorted list so the printout is stable and easy to scan
unique_nationalities_list = sorted(unique_nationalities)

# First line: number of distinct codes; second line: the codes themselves
print(len(unique_nationalities_list), unique_nationalities_list, sep='\n')


In [None]:
# Dictionary for updated nationality abbreviations
# Lookup from the two-letter nationality code found in the race data to a
# readable country/territory name. Entries are grouped by initial letter and
# kept in alphabetical key order.
# NOTE(review): the codes look like ISO 3166-1 alpha-2 — confirm against the data source.
country_mapping = {
    # A
    'AD': 'Andorra', 'AE': 'United Arab Emirates', 'AI': 'Anguilla', 'AQ': 'Antarctica',
    'AR': 'Argentina', 'AT': 'Austria', 'AU': 'Australia',
    # B
    'BE': 'Belgium', 'BG': 'Bulgaria', 'BN': 'Brunei', 'BO': 'Bolivia', 'BR': 'Brazil',
    # C
    'CA': 'Canada', 'CH': 'Switzerland', 'CL': 'Chile', 'CN': 'China', 'CO': 'Colombia',
    'CR': 'Costa Rica', 'CY': 'Cyprus', 'CZ': 'Czech Republic',
    # D
    'DE': 'Germany', 'DK': 'Denmark',
    # E
    'EC': 'Ecuador', 'EE': 'Estonia', 'ES': 'Spain',
    # F
    'FI': 'Finland', 'FR': 'France',
    # G
    'GB': 'United Kingdom', 'GF': 'French Guiana', 'GP': 'Guadeloupe', 'GR': 'Greece',
    'GT': 'Guatemala',
    # H
    'HK': 'Hong Kong', 'HR': 'Croatia', 'HU': 'Hungary',
    # I
    'ID': 'Indonesia', 'IE': 'Ireland', 'IL': 'Israel', 'IM': 'Isle of Man',
    'IR': 'Iran', 'IS': 'Iceland', 'IT': 'Italy',
    # J
    'JO': 'Jordan', 'JP': 'Japan',
    # K
    'KR': 'South Korea', 'KZ': 'Kazakhstan',
    # L
    'LT': 'Lithuania', 'LV': 'Latvia',
    # M
    'MA': 'Morocco', 'MC': 'Monaco', 'ME': 'Montenegro', 'MO': 'Macau', 'MQ': 'Martinique',
    'MT': 'Malta', 'MU': 'Mauritius', 'MX': 'Mexico', 'MY': 'Malaysia',
    # N
    'NC': 'New Caledonia', 'NL': 'Netherlands', 'NO': 'Norway', 'NZ': 'New Zealand',
    # P
    'PE': 'Peru', 'PF': 'French Polynesia', 'PH': 'Philippines', 'PL': 'Poland',
    'PT': 'Portugal',
    # Q
    'QA': 'Qatar',
    # R
    'RE': 'Réunion', 'RO': 'Romania', 'RS': 'Serbia', 'RU': 'Russia',
    # S
    'SA': 'Saudi Arabia', 'SE': 'Sweden', 'SG': 'Singapore', 'SI': 'Slovenia',
    'SK': 'Slovakia', 'SM': 'San Marino',
    # T
    'TH': 'Thailand', 'TR': 'Turkey', 'TW': 'Taiwan',
    # U
    'UA': 'Ukraine', 'US': 'United States', 'UY': 'Uruguay',
    # V
    'VE': 'Venezuela', 'VN': 'Vietnam',
    # Z
    'ZA': 'South Africa',
}

In [None]:
# Turn the code -> name mapping into a two-column lookup table
# (one row per code, in the dictionary's insertion order).
country_df = pd.DataFrame({
    'Nationality': list(country_mapping.keys()),
    'Nationality Name': list(country_mapping.values()),
})

print(country_df)

# Export the lookup table as a single-sheet workbook, without the index column.
country_df.to_excel(
    'Database Data/TORX_100x100trail_nationality_table.xlsx',
    sheet_name='Nationality Code',
    index=False,
)