In [1]:
import requests
import json
import html
import pandas as pd
import time
import numpy as np
import matplotlib.pyplot as plt
import unicodedata
import re



In [2]:
# Function to remove special characters
def normalize_name(name):
    # Normalize to remove accents and special characters
    name = unicodedata.normalize('NFD', name)
    # Encode to ASCII and ignore errors, then decode back to string
    name = name.encode('ascii', 'ignore').decode('utf-8')
    return name

In [3]:

# # URL of the JSON file
# url = "https://itra.run/Races/RaceResults/TOR330.Tor.des.G%C3%A9ants%C2%AE/2023/80226"

# # Send a GET request to fetch the JSON data
# response = requests.get(url)
# response

In [4]:
years = [ '2024', '2023', 
         '2022', 
         '2022_HOTEL_ITALIA',  # bad weather at the end 
         '2022_RIFUGIO_FRASSATI',
         '2022_BOSSES',
         '2021', 
         '2019',
#          '2020', # cancelled due to covid
         '2018', '2017', '2016', 
         #'2015', # having issues with 2015
         '2014', '2013', '2012', '2011', '2010']

races = [#'TOR130', 'TOR330', 
         'TOR450'
        ]

#### Loading ITRA 

In [5]:

TORX_itra_df = {}
# races = ['TOR450']

for race in races:
    for year in years:
        try:

            file = f'{race} Data/3. ITRA/TXT/{race}_{year}.txt'
            df = pd.read_csv(file, delimiter='\t', 
                         dtype={'Time': 'string'})    

            # renaming columns so it is easier later to compare
            df = df.rename(columns={"Runner": "Name",
                                    "Gender": "Sex",
                                    "Nationality": "ITRA_Nationality",
                                   })

            # change the name to a title case
            df['Name'] = df['Name'].str.title()
            df['Name'] = df['Name'].apply(normalize_name)

            # Convert 'Time' to timedelta format
            df['Performance'] = pd.to_timedelta(df['Time'], errors='coerce')
            # Calculate total seconds and create 'Performance_Seconds' column
            df['Performance_Seconds'] = df['Performance'].dt.total_seconds()

            if year == '2022_HOTEL_ITALIA':
            # Create the 'Status' column
                df['Status'] =  'Finished at Hotel Italia'
                df['Year'] = '2022'    
            elif year == '2022_RIFUGIO_FRASSATI':
                df['Status'] = 'Finished at Rifugio Frassati'
                df['Year'] = '2022'
            elif year == '2022_BOSSES':
                df['Status'] =  'Finished at Bosses'
                df['Year'] = '2022'
            else:
                df['Status'] = np.where(df['Performance'].isna(), 'DNF', 'Finished')
                df['Year'] = f'{year}'

            df['Race'] = f'{race}'

    #             # Drop the intermediate columns if they're no longer needed
            df = df.drop(columns=['Time', '#'])

            df['Name'] = df['Name'].str.strip()
            df['Name'] = df['Name'].str.replace(',', '')
            df['Name'] = df['Name'].str.replace('\'', '')
            df['Name'] = df['Name'].str.replace('-', '')
            df['Name'] = df['Name'].str.replace(r"\s+", " ")

    #             print(race, year, '\n', df.head())

            # Store the DataFrame in the dictionary with a key like 'TOR330_2021'
            TORX_itra_df[f'{race}_{year}'] = df
            print('* ',f'{race}_{year} {df.shape}', '\n')
        except:
            
            if file == 'TOR330 Data/3. ITRA/TXT/TOR330_2022_HOTEL_ITALIA.txt':
                pass
            elif file == 'TOR450 Data/3. ITRA/TXT/TOR450_2022_RIFUGIO_FRASSATI.txt':
                pass
            elif file == 'TOR450 Data/3. ITRA/TXT/TOR330_2022_BOSSES.txt':
                pass
            
            else:
                print(f'issues with {race}_{year}')
                print(f'{file}')
                print('\n')
            


*  TOR450_2024 (168, 9) 

*  TOR450_2023 (173, 9) 

*  TOR450_2022 (46, 9) 

*  TOR450_2022_HOTEL_ITALIA (11, 9) 

issues with TOR450_2022_BOSSES
TOR450 Data/3. ITRA/TXT/TOR450_2022_BOSSES.txt


*  TOR450_2021 (56, 9) 

*  TOR450_2019 (100, 9) 

issues with TOR450_2018
TOR450 Data/3. ITRA/TXT/TOR450_2018.txt


issues with TOR450_2017
TOR450 Data/3. ITRA/TXT/TOR450_2017.txt


issues with TOR450_2016
TOR450 Data/3. ITRA/TXT/TOR450_2016.txt


issues with TOR450_2014
TOR450 Data/3. ITRA/TXT/TOR450_2014.txt


issues with TOR450_2013
TOR450 Data/3. ITRA/TXT/TOR450_2013.txt


issues with TOR450_2012
TOR450 Data/3. ITRA/TXT/TOR450_2012.txt


issues with TOR450_2011
TOR450 Data/3. ITRA/TXT/TOR450_2011.txt


issues with TOR450_2010
TOR450 Data/3. ITRA/TXT/TOR450_2010.txt




  df['Name'] = df['Name'].str.replace(r"\s+", " ")
  df['Name'] = df['Name'].str.replace(r"\s+", " ")
  df['Name'] = df['Name'].str.replace(r"\s+", " ")
  df['Name'] = df['Name'].str.replace(r"\s+", " ")
  df['Name'] = df['Name'].str.replace(r"\s+", " ")
  df['Name'] = df['Name'].str.replace(r"\s+", " ")


In [6]:
for race in races:
    for year in years:
        try:
            tor_year = TORX_itra_df[f'{race}_{year}']
            # Count the occurrences of each status
            status_counts = tor_year['Status'].value_counts()
            print(race, year, '\n', status_counts, '\n', '*'*40)
        except:
            pass

TOR450 2024 
 DNF         100
Finished     68
Name: Status, dtype: int64 
 ****************************************
TOR450 2023 
 DNF         88
Finished    85
Name: Status, dtype: int64 
 ****************************************
TOR450 2022 
 Finished    46
Name: Status, dtype: int64 
 ****************************************
TOR450 2022_HOTEL_ITALIA 
 Finished at Hotel Italia    11
Name: Status, dtype: int64 
 ****************************************
TOR450 2021 
 Finished    31
DNF         25
Name: Status, dtype: int64 
 ****************************************
TOR450 2019 
 DNF         60
Finished    40
Name: Status, dtype: int64 
 ****************************************


In [7]:
TORX_itra_df

{'TOR450_2024':                     Name  Age Sex ITRA_Nationality     Performance  \
 0            Erwee Tiaan   37   M              RSA 5 days 02:09:41   
 1     Fohrmeister Volker   39   M              GER 5 days 18:32:15   
 2     Roncato Alessandro   44   M              ITA 5 days 23:01:51   
 3           Bero Stephan   53   M              BEL 6 days 06:11:04   
 4           Galve Javier   42   M              ESP 6 days 08:17:34   
 ..                   ...  ...  ..              ...             ...   
 163     Trabucchi Pietro   61   M              ITA             NaT   
 164     Trygub Ievgeniia   41   F              UKR             NaT   
 165  Van Gheluwe Yannick   52   M              FRA             NaT   
 166        Vuillen Loris   63   M              ITA             NaT   
 167           Yvin Erwan   49   M              FRA             NaT   
 
      Performance_Seconds    Status  Year    Race  
 0               439781.0  Finished  2024  TOR450  
 1               498735.0  

In [8]:
# Append using pd.concat
TORX_itra_df = pd.concat(TORX_itra_df)
TORX_itra_df = TORX_itra_df.reset_index(drop = True)
TORX_itra_df.isna().sum()

Name                     0
Age                      0
Sex                      0
ITRA_Nationality         0
Performance            273
Performance_Seconds    273
Status                   0
Year                     0
Race                     0
dtype: int64

In [9]:
years_list = TORX_itra_df['Year'].unique()
race_list = TORX_itra_df['Race'].unique()

for race in races:
    for year in years_list:
        year_df = TORX_itra_df[(TORX_itra_df['Year'] == year) &
                              (TORX_itra_df['Race'] == race)]
        status_df = year_df['Status'].value_counts()
        
        if race == 'TOR450' and year in [ 
            #'2020', # cancelled due to covid
         '2018', '2017', '2016', 
         '2015', 
         '2014', '2013', '2012', '2011', '2010']:
            pass
        else:
            print(race, year,'\n',status_df, '\n', '*'*40)
    

TOR450 2024 
 DNF         100
Finished     68
Name: Status, dtype: int64 
 ****************************************
TOR450 2023 
 DNF         88
Finished    85
Name: Status, dtype: int64 
 ****************************************
TOR450 2022 
 Finished                    46
Finished at Hotel Italia    11
Name: Status, dtype: int64 
 ****************************************
TOR450 2021 
 Finished    31
DNF         25
Name: Status, dtype: int64 
 ****************************************
TOR450 2019 
 DNF         60
Finished    40
Name: Status, dtype: int64 
 ****************************************


## EDA of ITRA 

In [10]:
TORX_itra_df[TORX_itra_df['Sex'].isna()]

Unnamed: 0,Name,Age,Sex,ITRA_Nationality,Performance,Performance_Seconds,Status,Year,Race


In [11]:
TORX_itra_df[TORX_itra_df['Name'] == 'D Haene Francois']

Unnamed: 0,Name,Age,Sex,ITRA_Nationality,Performance,Performance_Seconds,Status,Year,Race


In [12]:
TORX_itra_df[TORX_itra_df['Name'] == 'Tierney Paul']

Unnamed: 0,Name,Age,Sex,ITRA_Nationality,Performance,Performance_Seconds,Status,Year,Race
162,Tierney Paul,42,M,IRL,NaT,,DNF,2024,TOR450
174,Tierney Paul,41,M,IRL,5 days 15:22:00,487320.0,Finished,2023,TOR450
344,Tierney Paul,40,M,IRL,5 days 16:23:20,491000.0,Finished,2022,TOR450


In [13]:
TORX_itra_df[TORX_itra_df['Name'].str.contains('Girolami Paolo')]

Unnamed: 0,Name,Age,Sex,ITRA_Nationality,Performance,Performance_Seconds,Status,Year,Race


In [14]:
# grabbing all finishers 
TORX_itra_df['Performance'] = TORX_itra_df['Performance'].astype('str')



for race in ['TOR130', 'TOR330', 'TOR450']:
    df = TORX_itra_df[TORX_itra_df['Race'] == race]
    print(df.head)
    # # Save the result
    df.to_excel(f'{race} Data/5. Clean Data for Data Visualisation/{race}_itra_including_DNF.xlsx' , index = False)



<bound method NDFrame.head of Empty DataFrame
Columns: [Name, Age, Sex, ITRA_Nationality, Performance, Performance_Seconds, Status, Year, Race]
Index: []>
<bound method NDFrame.head of Empty DataFrame
Columns: [Name, Age, Sex, ITRA_Nationality, Performance, Performance_Seconds, Status, Year, Race]
Index: []>
<bound method NDFrame.head of                    Name Age Sex ITRA_Nationality      Performance  \
0           Erwee Tiaan  37   M              RSA  5 days 02:09:41   
1    Fohrmeister Volker  39   M              GER  5 days 18:32:15   
2    Roncato Alessandro  44   M              ITA  5 days 23:01:51   
3          Bero Stephan  53   M              BEL  6 days 06:11:04   
4          Galve Javier  42   M              ESP  6 days 08:17:34   
..                  ...  ..  ..              ...              ...   
549     Thiebat Orlando  65   M              ITA              NaT   
550        Thierry Ador  48   M              FRA              NaT   
551    Tribolo Philippe  44   M        

### Extracting only Finishers

In [30]:
# TORX_itra_df
TORX_itra_no_DNF = TORX_itra_df[(TORX_itra_df['Status'] != 'DNF') &
                                (TORX_itra_df['Race'] == race) 
                                 # & (TORX_itra_df['Year'] != year)
                               ].reset_index(drop = True)
TORX_itra_no_DNF = TORX_itra_no_DNF[[
    'Name', 'ITRA_Nationality', 'Sex',  'Age', 
    'Performance','Performance_Seconds','Status', 
    'Race', 'Year'
]]

In [31]:
TORX_itra_no_DNF['Status'].unique()

array(['Finished', 'Finished at Hotel Italia'], dtype=object)

In [32]:
TORX_itra_no_DNF.isna().sum()

Name                   0
ITRA_Nationality       0
Sex                    0
Age                    0
Performance            0
Performance_Seconds    0
Status                 0
Race                   0
Year                   0
dtype: int64

In [33]:
TORX_itra_no_DNF.head()

Unnamed: 0,Name,ITRA_Nationality,Sex,Age,Performance,Performance_Seconds,Status,Race,Year
0,Erwee Tiaan,RSA,M,37,5 days 02:09:41,439781.0,Finished,TOR450,2024
1,Fohrmeister Volker,GER,M,39,5 days 18:32:15,498735.0,Finished,TOR450,2024
2,Roncato Alessandro,ITA,M,44,5 days 23:01:51,514911.0,Finished,TOR450,2024
3,Bero Stephan,BEL,M,53,6 days 06:11:04,540664.0,Finished,TOR450,2024
4,Galve Javier,ESP,M,42,6 days 08:17:34,548254.0,Finished,TOR450,2024


### This data might be used later data Visualisations

In [34]:
# grabbing all finishers 
TORX_itra_no_DNF['Performance'] = TORX_itra_no_DNF['Performance'].astype('str')

for race in ['TOR130', 'TOR330', 'TOR450']:
    df = TORX_itra_no_DNF[TORX_itra_no_DNF['Race'] == race]
    print(df.head)
    # # Save the result
    df.to_excel(f'{race} Data/5. Clean Data for Data Visualisation/{race}_itra_no_DNF.xlsx' , index = False)



<bound method NDFrame.head of Empty DataFrame
Columns: [Name, ITRA_Nationality, Sex, Age, Performance, Performance_Seconds, Status, Race, Year]
Index: []>
<bound method NDFrame.head of Empty DataFrame
Columns: [Name, ITRA_Nationality, Sex, Age, Performance, Performance_Seconds, Status, Race, Year]
Index: []>
<bound method NDFrame.head of                    Name ITRA_Nationality Sex Age      Performance  \
0           Erwee Tiaan              RSA   M  37  5 days 02:09:41   
1    Fohrmeister Volker              GER   M  39  5 days 18:32:15   
2    Roncato Alessandro              ITA   M  44  5 days 23:01:51   
3          Bero Stephan              BEL   M  53  6 days 06:11:04   
4          Galve Javier              ESP   M  42  6 days 08:17:34   
..                  ...              ...  ..  ..              ...   
276       Plavan Marina              ITA   F  58  7 days 19:09:19   
277    Isernia Ruggiero              ITA   M  50  7 days 19:43:50   
278    Gallizia Michele              IT