# TORX Data

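This notebook scrapes race results from the 100x100trail.com JSON endpoints. The raw JSON is saved to a folder per race and year, then flattened into one row per runner per checkpoint and exported to Excel. Apart from normalising runner names, no cleaning is done.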


In [1]:
import html
import json
import re
import time
import unicodedata
from pathlib import Path

import pandas as pd
import requests



In [2]:
# Editions to process, kept as strings because they are spliced into URLs and file names.
years = [str(season) for season in range(2021, 2025)]

# Race codes to process. 'TOR450' and 'TOR130' are currently switched off;
# add them to this list to include them.
races = ['TOR330']

# Pause between two consecutive downloads, in seconds.
delay_seconds = 15

## Scraping TORX Data from the website

In [3]:
# Download the raw JSON for every configured race/year and store it unchanged.
for race in races:
    # Create the destination folder up front so the cell also works on a fresh checkout.
    json_dir = Path(f'{race} Data/1. 100x100trail/JSON')
    json_dir.mkdir(parents=True, exist_ok=True)

    for year in years:
        # URL of the JSON file
        url = f'https://100x100trail.com/json/{race}{year}.json'
        print(url)

        try:
            # Without a timeout a stalled connection would block the notebook indefinitely.
            response = requests.get(url, timeout=30)
        except requests.RequestException as exc:
            # Report and move on, mirroring how non-200 responses are handled below.
            print(f"Failed to retrieve data. Error: {exc}")
            time.sleep(delay_seconds)
            continue

        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            try:
                data = response.json()
            except ValueError as exc:
                # The server answered 200 but the body is not valid JSON.
                print(f"Failed to parse data. Error: {exc}")
            else:
                with open(json_dir / f'{race}_{year}.json', 'w') as f:
                    json.dump(data, f, indent=4)

                print(f"Data saved to '{race}_{year}.json'")
        else:
            print(f"Failed to retrieve data. Status code: {response.status_code}")

        # Be polite to the server between requests.
        time.sleep(delay_seconds)

https://100x100trail.com/json/TOR3302021.json
Data saved to 'TOR330_2021.json'
https://100x100trail.com/json/TOR3302022.json
Data saved to 'TOR330_2022.json'
https://100x100trail.com/json/TOR3302023.json
Data saved to 'TOR330_2023.json'
https://100x100trail.com/json/TOR3302024.json
Data saved to 'TOR330_2024.json'


## Extracting data from the saved JSON

In [4]:
# Notebook-level accumulators. parse_data() appends its output rows to `rows`.
rows = list()
aid_station_list = list()

# Function to parse each entry
def parse_data(entries, race, year, out=None):
    """Flatten runner entries into one row per runner per recorded event.

    Runner details are read from ``entry["data"]["attributes"]`` and the
    events from ``entry["data"]["relationships"]["events"]["data"]``.
    Each produced row is
    ``[name, bib, sex, nationality, race, year, category, place, time, status]``.

    Parameters
    ----------
    entries : iterable of dict
        Parsed JSON entries, one per runner.
    race : str
        Race code stored in every row (converted to text).
    year : str or int
        Edition stored in every row (converted to text).
    out : list, optional
        List that receives the rows. When omitted, rows are appended to the
        notebook-level ``rows`` list, which is what existing calls rely on.

    Returns
    -------
    list
        The list the rows were appended to.

    Raises
    ------
    KeyError
        If an entry is missing one of the fields read below.
    """
    # Fall back to the notebook-level accumulator so three-argument calls keep working.
    target = rows if out is None else out

    # Loop-invariant: race and year are stored as text in every row.
    race_label = str(race)
    year_label = str(year)

    for entry in entries:
        attributes = entry["data"]["attributes"]
        # Names arrive HTML-escaped (e.g. "&#39;"); decode before title-casing.
        name = html.unescape(attributes["name"]).title()
        bib_number = attributes["pettorale"]
        sex = attributes["sesso"]
        nationality = attributes["nazionalita"]
        category = attributes["categoria"]
        finisher_status = attributes["finisher"]

        # One output row per event recorded for this runner.
        for event in entry["data"]["relationships"]["events"]["data"]:
            target.append([
                name, bib_number, sex, nationality, race_label, year_label,
                category, event['title'], event['start_date'], finisher_status,
            ])

    return target


In [5]:
# Function to remove special characters
def normalize_name(name):
    """Return ``name`` reduced to plain ASCII.

    The text is decomposed with Unicode NFD, which splits an accented letter
    into its base letter plus combining marks. Anything that cannot be
    encoded as ASCII is then discarded, so the base letters survive while
    the marks (and characters with no ASCII base letter) are dropped.
    """
    ascii_only = unicodedata.normalize('NFD', name).encode('ascii', 'ignore')
    return ascii_only.decode('utf-8')

## Looping through scraped data

In [6]:
# Maps '<race>_<year>' to the DataFrame built from that saved JSON file.
TORX_df = {}

for race in races:
    for year in years:
        # parse_data() appends to the notebook-level `rows` list, so rebind it
        # to a fresh list for every race/year.
        rows = []

        # Load the JSON data from a file; it is closed again before any processing.
        with open(f'{race} Data/1. 100x100trail/JSON/{race}_{year}.json', 'r') as file:
            data = json.load(file)

        # Flatten the entries into `rows`
        parse_data(data, race, year)

        # Create a DataFrame
        df = pd.DataFrame(rows, columns=[
            "Name", "Bib", 'Sex', 'Nationality', 'Race', 'Year',
            "Category", 'Place', 'Time', 'Status'])

        # Name clean-up. `regex` is spelled out on every replace: the implicit
        # default differs between pandas versions, and r"\s+" must be treated
        # as a pattern or runs of whitespace are never collapsed.
        df['Name'] = (
            df['Name']
            .str.strip()
            .str.title()
            .str.replace(',', '', regex=False)
            .str.replace('\'', ' ', regex=False)
            .str.replace('-', ' ', regex=False)
            .str.replace(r"\s+", " ", regex=True)
            .apply(normalize_name)
        )

        df.to_excel(f'{race} Data/1. 100x100trail/{race}_{year}.xlsx', index=False)
        # TODO: the aid-station export (aid_station_df) is not produced by this cell yet.

        TORX_df[f'{race}_{year}'] = df

        print(f"Data saved to '{race}_{year}.xlsx'")

        print('*'*30)




  df['Name'] = df['Name'].str.replace(r"\s+", " ")


Data saved to f'TOR330_2021.xlsx'
******************************


  df['Name'] = df['Name'].str.replace(r"\s+", " ")


Data saved to f'TOR330_2022.xlsx'
******************************


  df['Name'] = df['Name'].str.replace(r"\s+", " ")


Data saved to f'TOR330_2023.xlsx'
******************************


  df['Name'] = df['Name'].str.replace(r"\s+", " ")


Data saved to f'TOR330_2024.xlsx'
******************************


In [7]:
# Preview the first five rows of the TOR330 2022 table.
TORX_df['TOR330_2022'].head()

Unnamed: 0,Name,Bib,Sex,Nationality,Race,Year,Category,Place,Time,Status
0,Russi Jonas,3,M,CH,TOR330,2022,SEN,660 - FINISH 330|349.3|30879,2022-09-14T08:31:36+00:00,True
1,Russi Jonas,3,M,CH,TOR330,2022,SEN,650 - Mont de la Saxe 330|344.5|30828,2022-09-14T07:33:56+00:00,True
2,Russi Jonas,3,M,CH,TOR330,2022,SEN,640 - Pas Entre Deux Sauts 330|335.6|30486,2022-09-14T05:30:00+00:00,True
3,Russi Jonas,3,M,CH,TOR330,2022,SEN,630 - Rif. Frassati 330|329.7|29862,2022-09-14T03:58:37+00:00,True
4,Russi Jonas,3,M,CH,TOR330,2022,SEN,620 - Bosses 330|320.6|28650,2022-09-14T00:54:36+00:00,True


### Exploring Data

In [8]:
# Rows recorded for a single runner in the TOR330 2024 table.
df = TORX_df['TOR330_2024'].loc[lambda frame: frame['Name'] == 'D Haene Francois']

df

Unnamed: 0,Name,Bib,Sex,Nationality,Race,Year,Category,Place,Time,Status
0,D Haene Francois,10,M,FR,TOR330,2024,SEN,660 - FINISH 330,2024-09-11T07:08:32+02:00,True
1,D Haene Francois,10,M,FR,TOR330,2024,SEN,650 - Monte de la Saxe 450,2024-09-11T06:25:42+02:00,True
2,D Haene Francois,10,M,FR,TOR330,2024,SEN,640 - Pas Entre Deux Sauts 450,2024-09-11T05:01:52+02:00,True
3,D Haene Francois,10,M,FR,TOR330,2024,SEN,630 - Rif. Frassati 450,2024-09-11T03:24:39+02:00,True
4,D Haene Francois,10,M,FR,TOR330,2024,SEN,620 - Bosses 330,2024-09-11T00:52:13+02:00,True
5,D Haene Francois,10,M,FR,TOR330,2024,SEN,610 - Ponteille Desot 450,2024-09-10T23:27:47+02:00,True
6,D Haene Francois,10,M,FR,TOR330,2024,SEN,590 - Rif. Champillon 450,2024-09-10T21:51:36+02:00,True
7,D Haene Francois,10,M,FR,TOR330,2024,SEN,581 - Ollomont OUT 330,2024-09-10T20:33:07+02:00,True
8,D Haene Francois,10,M,FR,TOR330,2024,SEN,580 - Ollomont IN 330,2024-09-10T20:14:49+02:00,True
9,D Haene Francois,10,M,FR,TOR330,2024,SEN,570 - Berio Damon 330,2024-09-10T19:52:34+02:00,True
