# TORX Data

In this code, the 100x100 database is scraped. The raw data is then saved to a folder with race and year. No cleaning is done.


In [1]:
import requests
import json
import html
import pandas as pd
import time
import re
import unicodedata



In [2]:
years = ['2021','2022', '2023',  '2024']
races = ['TOR330' ,
#          'TOR450',
#          'TOR130'
        ]
delay_seconds = 15

## Scraping TORX Data from the website

In [3]:
for race in races:
    for year in years:
        # URL of the JSON file
        url = f'https://100x100trail.com/json/{race}{year}.json'
        print(url)

        # Send a GET request to fetch the JSON data
        response = requests.get(url)

        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            # Parse the JSON data
            data = response.json()

            # Optionally, save the data to a JSON file
            with open(f'{race} Data/1. 100x100trail/JSON/{race}_{year}.json', 'w') as f:
                json.dump(data, f, indent=4)

            print(f"Data saved to '{race}_{year}.json'")
        else:
            print(f"Failed to retrieve data. Status code: {response.status_code}")

        time.sleep(delay_seconds)

https://100x100trail.com/json/TOR3302021.json
Data saved to 'TOR330_2021.json'
https://100x100trail.com/json/TOR3302022.json
Data saved to 'TOR330_2022.json'
https://100x100trail.com/json/TOR3302023.json
Data saved to 'TOR330_2023.json'
https://100x100trail.com/json/TOR3302024.json
Data saved to 'TOR330_2024.json'


## Extracting Data from JSON data

In [4]:
# Prepare a list to hold the rows for the DataFrame
rows = []
aid_station_list = []

# Function to parse each entry
def parse_data(entries, race, year):
    n = 0
    for entry in entries:
        
        attributes = entry["data"]["attributes"]
        name = html.unescape(attributes["name"]).title()
        team = attributes["team"]
        bib_number = attributes["pettorale"]
        sex = attributes["sesso"]
        nationality = attributes["nazionalita"]
        category = attributes["categoria"]
        finisher_status = attributes["finisher"]
        
            # Extracting times and places
        events = entry["data"]["relationships"]["events"]["data"]

        for event in events:
            place = event['title']
            time = event['start_date']
        
            race = f'{race}'
            year = f'{year}'

            #"Bib", "Name", "Team", "Sex",  'Nationality','Race', 'Status' 
            all_info = [name, bib_number,  sex, nationality,race,year,
                                    category, place, time, finisher_status]
            # Append the row to the list
            rows.append(all_info)


In [5]:
# Function to remove special characters
def normalize_name(name):
    # Normalize to remove accents and special characters
    name = unicodedata.normalize('NFD', name)
    # Encode to ASCII and ignore errors, then decode back to string
    name = name.encode('ascii', 'ignore').decode('utf-8')
    return name

## Looping through scraped data

In [None]:
TORX_df = {}


for race in races:
    for year in years:
        rows=[]

        # Load the JSON data from a file
        with open(f'{race} Data/1. 100x100trail/JSON/{race}_{year}.json', 'r') as file:
            data = json.load(file)

            # parse data
            parse_data(data, race, year)

            # Create a DataFrame
            df =  pd.DataFrame(rows, columns=[
               "Name", "Bib", 'Sex', 'Nationality','Race', 'Year',
                "Category",  'Place', 'Time', 'Status' ])    

            df['Name'] = df['Name'].str.strip().str.title()
            df['Name'] = df['Name'].str.replace(',', '')
            df['Name'] = df['Name'].str.replace('\'', ' ')
            df['Name'] = df['Name'].str.replace('-', ' ')
            df['Name'] = df['Name'].str.replace(r"\s+", " ")
            df['Name'] = df['Name'].apply(normalize_name)
                
            
            df.to_excel(f'{race} Data/1. 100x100trail/{race}_{year}.xlsx' , index = False)
#             aid_station_df.to_excel(f'{race} Data/1. 100x100trail/aid_station_{race}_{year}.xlsx' , index = False)
            
            TORX_df[f'{race}_{year}'] = df
#             aid_station_TORX_df[f'{race}_{year}'] = aid_station_df
                
            print(f"Data saved to f'{race}_{year}.xlsx'")

            print('*'*30)




  df['Name'] = df['Name'].str.replace(r"\s+", " ")


Data saved to f'TOR330_2021.xlsx'
******************************


  df['Name'] = df['Name'].str.replace(r"\s+", " ")


Data saved to f'TOR330_2022.xlsx'
******************************


  df['Name'] = df['Name'].str.replace(r"\s+", " ")


Data saved to f'TOR330_2023.xlsx'
******************************


  df['Name'] = df['Name'].str.replace(r"\s+", " ")


In [None]:
TORX_df['TOR330_2022'].head(5)

### Exploring Data

In [None]:
# print(tor_des_glacier_df[f'tor_des_glacier_2024'])
df = TORX_df[f'TOR330_2024'][TORX_df[f'TOR330_2024']['Name'] == 'D Haene Francois']

df