# TORX Data

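This notebook scrapes race results from the 100x100trail JSON feed. The raw JSON is saved in a folder per race, one file per year, and is then flattened into one Excel file per race and year. Apart from normalising runner names, no cleaning is done.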


In [5]:
import requests
import json
import html
import pandas as pd
import time
import re
import unicodedata



In [3]:
# Editions to process; kept as strings because they are interpolated into
# URLs and file names.
years = [str(edition) for edition in range(2021, 2025)]

# Race codes to process. 'TOR330' and 'TOR130' are currently switched off;
# add them to this list to include them in a run.
races = ['TOR450']

# Pause between consecutive HTTP requests, in seconds.
delay_seconds = 15

## Scraping TORX Data from the website

In [3]:
# Download the JSON results for every configured race/year combination and
# store each payload on disk without modifying its content.

# Without a timeout, requests waits indefinitely on a stalled connection.
request_timeout_seconds = 30

for race in races:
    for year in years:
        # URL of the JSON file
        url = f'https://100x100trail.com/json/{race}{year}.json'
        print(url)

        # Send a GET request to fetch the JSON data. A network-level failure
        # is reported and the loop moves on instead of aborting the whole run.
        try:
            response = requests.get(url, timeout=request_timeout_seconds)
        except requests.RequestException as exc:
            print(f"Failed to retrieve data. Error: {exc}")
            response = None

        if response is not None:
            # Check if the request was successful (status code 200)
            if response.status_code == 200:
                # Parse the JSON data
                data = response.json()

                # Save the data to a JSON file
                with open(f'{race} Data/1. 100x100trail/JSON/{race}_{year}.json', 'w') as f:
                    json.dump(data, f, indent=4)

                print(f"Data saved to '{race}_{year}.json'")
            else:
                print(f"Failed to retrieve data. Status code: {response.status_code}")

        # Be polite to the server between requests, whatever the outcome.
        time.sleep(delay_seconds)

https://100x100trail.com/json/TOR3302021.json
Data saved to 'TOR330_2021.json'
https://100x100trail.com/json/TOR3302022.json
Data saved to 'TOR330_2022.json'
https://100x100trail.com/json/TOR3302023.json
Data saved to 'TOR330_2023.json'
https://100x100trail.com/json/TOR3302024.json
Data saved to 'TOR330_2024.json'


## Extracting data from the saved JSON files

In [6]:
# Prepare a list to hold the rows for the DataFrame.
# parse_data() appends to this module-level list by default, so it must be
# reset before each race/year is parsed.
rows = []
# NOTE(review): not referenced by any other visible cell in this notebook.
aid_station_list = []

# Function to parse each entry
def parse_data(entries, race, year, out=None):
    """Flatten scraped runner entries into one row per timing event.

    Parameters
    ----------
    entries : iterable of dict
        Each entry is read through ``entry["data"]["attributes"]`` (runner
        details) and ``entry["data"]["relationships"]["events"]["data"]``
        (the runner's timing events).
    race, year
        Labels copied into every row; both are converted to strings.
    out : list, optional
        List that receives the rows. When omitted, rows are appended to the
        module-level ``rows`` list, which is the original behaviour.

    Returns
    -------
    list
        The list that received the rows. Every row has the layout
        ``[name, bib, sex, nationality, race, year, category, place, time,
        finisher_status]``. A runner without events contributes no rows.
    """
    target = rows if out is None else out

    # The labels do not change per event, so format them once.
    race_label = f'{race}'
    year_label = f'{year}'

    for entry in entries:
        attributes = entry["data"]["attributes"]
        # Decode HTML entities in the name before title-casing it.
        name = html.unescape(attributes["name"]).title()
        bib_number = attributes["pettorale"]
        sex = attributes["sesso"]
        nationality = attributes["nazionalita"]
        category = attributes["categoria"]
        finisher_status = attributes["finisher"]

        # Extracting times and places
        events = entry["data"]["relationships"]["events"]["data"]

        for event in events:
            # `timestamp` rather than `time`, to avoid shadowing the time module.
            place = event['title']
            timestamp = event['start_date']

            target.append([name, bib_number, sex, nationality, race_label, year_label,
                           category, place, timestamp, finisher_status])

    return target


In [2]:
# Function to remove special characters
def normalize_name(name):
    """Return `name` reduced to plain ASCII.

    The text is first decomposed (NFD), which splits an accented letter into
    its base letter plus combining marks. Every code point that is not ASCII
    is then discarded, so only the base letters survive.
    """
    decomposed = unicodedata.normalize('NFD', name)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8')

## Looping through scraped data

In [7]:
# Build one DataFrame per race/year from the saved JSON files, tidy the runner
# names, export each frame to Excel and keep it in TORX_df under the key
# '<race>_<year>'.
TORX_df = {}


for race in races:
    for year in years:
        # parse_data() appends to the module-level `rows`, so start from a
        # fresh list for every race/year.
        rows = []

        # Load the JSON data from a file; the handle is only needed for the read.
        with open(f'{race} Data/1. 100x100trail/JSON/{race}_{year}.json', 'r') as file:
            data = json.load(file)

        # parse data
        parse_data(data, race, year)

        # Create a DataFrame
        df = pd.DataFrame(rows, columns=[
            "Name", "Bib", 'Sex', 'Nationality', 'Race', 'Year',
            "Category", 'Place', 'Time', 'Status'])

        # Tidy the names. `regex` is passed explicitly: the whitespace pattern
        # must be treated as a regular expression, the others as literal text.
        # Stripping happens last because the replacements can introduce
        # leading or trailing spaces.
        df['Name'] = (
            df['Name']
            .str.title()
            .str.replace(',', '', regex=False)
            .str.replace('\'', ' ', regex=False)
            .str.replace('-', ' ', regex=False)
            .str.replace(r"\s+", " ", regex=True)
            .str.strip()
            .apply(normalize_name)
        )

        df.to_excel(f'{race} Data/1. 100x100trail/{race}_{year}.xlsx', index=False)

        TORX_df[f'{race}_{year}'] = df

        print(f"Data saved to '{race}_{year}.xlsx'")

        print('*'*30)




  df['Name'] = df['Name'].str.replace(r"\s+", " ")


Data saved to f'TOR450_2021.xlsx'
******************************


  df['Name'] = df['Name'].str.replace(r"\s+", " ")


Data saved to f'TOR450_2022.xlsx'
******************************


  df['Name'] = df['Name'].str.replace(r"\s+", " ")


Data saved to f'TOR450_2023.xlsx'
******************************


  df['Name'] = df['Name'].str.replace(r"\s+", " ")


Data saved to f'TOR450_2024.xlsx'
******************************


In [9]:
# Preview the first rows of the 2022 results.
# NOTE: `race` is the value left over from the last iteration of the loop above.
TORX_df[f'{race}_2022'].head()

Unnamed: 0,Name,Bib,Sex,Nationality,Race,Year,Category,Place,Time,Status
0,Raichon Sebastien,4067,M,FR,TOR450,2022,V2,660 - FINISH|UHF,2022-09-14T23:57:18+00:00,True
1,Raichon Sebastien,4067,M,FR,TOR450,2022,V2,650 - Mont de la Saxe 330|344.5|30828,2022-09-14T23:10:42+00:00,True
2,Raichon Sebastien,4067,M,FR,TOR450,2022,V2,640 - Pas Entre Deux Sauts 330|335.6|30486,2022-09-14T21:34:35+00:00,True
3,Raichon Sebastien,4067,M,FR,TOR450,2022,V2,630 - Rif. Frassati 330|329.7|29862,2022-09-14T19:55:45+00:00,True
4,Raichon Sebastien,4067,M,FR,TOR450,2022,V2,600 - Hotel Italia - GSB|417|35131,2022-09-14T17:06:48+00:00,True


### Exploring Data

In [12]:
# All recorded checkpoints for one runner in the 2024 results.
# NOTE: `race` is the value left over from the last iteration of the loop above.
df = TORX_df[f'{race}_2024'].loc[lambda results: results['Name'] == 'Tierney Paul']

df

Unnamed: 0,Name,Bib,Sex,Nationality,Race,Year,Category,Place,Time,Status
4510,Tierney Paul,4104,M,IE,TOR450,2024,V1,060 - Rif. Deffeyes 450,2024-09-07T03:31:19+02:00,False
4511,Tierney Paul,4104,M,IE,TOR450,2024,V1,050 - Rif. Elisabetta,2024-09-06T22:32:53+02:00,False
4512,Tierney Paul,4104,M,IE,TOR450,2024,V1,040 - Rif. Maison Vieille,2024-09-06T21:00:21+02:00,False
4513,Tierney Paul,4104,M,IE,TOR450,2024,V1,010 - START,2024-09-06T19:48:30+02:00,False
4514,Tierney Paul,4104,M,IE,TOR450,2024,V1,000 - PETTORALI,2024-08-17T18:33:00+02:00,False
