# SwissPeaks Data

This notebook scrapes the Swiss Peaks 360 results from the RaceResult website. The raw data is then saved to the `Data/` folder in files named after the race and year. No cleaning is done.


In [32]:
import datetime
import html
import json
import os
import random
import re
import time
import unicodedata
from datetime import timedelta

import numpy as np
import pandas as pd
import requests

# Wall-clock reference point for the whole run.
start_time = time.time()

# Race edition being scraped, kept as a string.
year = '2020'

# The men's and women's result lists share one endpoint and query string;
# only the group selector inside the `name` parameter differs.
_RESULTS_URL_HEAD = (
    'https://my1.raceresult.com/156560/RRPublish/data/list'
    '?key=2049a5fb34f5e180b9d6cf54d6150d67'
    '&listname=Online%7CFinal&page=results&contest=0&r=group'
    '&name=%231_Swiss%20Peaks%20360%0C'
)
_RESULTS_URL_TAIL = '&f=Swiss%20Peaks%20360%0C%0C%3CIgnore%3E'

male_url = _RESULTS_URL_HEAD + '%232_Male' + _RESULTS_URL_TAIL
female_url = _RESULTS_URL_HEAD + '%231_Female' + _RESULTS_URL_TAIL

# Pause, in seconds, between consecutive requests to the server.
delay_seconds = 3

## Scraping Swiss Peaks 2020 Data from the website

In [33]:
def reformat_name(name_str):
    """Turn a ``"Last, First"`` string into ``"First Last"``.

    The swap only happens when the input contains exactly one comma; both
    halves are stripped of surrounding whitespace. Any other input (no comma,
    or more than one comma) is returned unchanged.
    """
    last, comma, first = name_str.partition(',')
    # No comma at all, or a second comma in the remainder: leave untouched.
    if not comma or ',' in first:
        return name_str
    return f"{first.strip()} {last.strip()}"


# Seconds to wait for the results server before giving up on a request;
# without a timeout a stalled connection would block the cell forever.
REQUEST_TIMEOUT_SECONDS = 30

# One parsed row per participant, men first, then women.
runner_list = []

# Pair each list URL with its gender code up front, rather than re-deriving
# the gender from URL substrings for every participant.
for gender, url in (('M', male_url), ('F', female_url)):
    print(f"Fetching from: {url}")

    # Send GET request; a network failure is reported and the loop moves on
    # to the next list instead of aborting the cell.
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        response = None
        print(f"Failed to retrieve data. Request error: {exc!r}")

    if response is not None and response.status_code == 200:
        # The participant rows live under the 'data' key of the JSON payload.
        race_participants = response.json()['data']

        for participant in race_participants:
            try:
                # Positional fields of each participant row.
                bib = participant[0]
                bib_2 = participant[1]
                rank = participant[2]

                # "Last, First" -> "First Last", then title-case.
                formatted_name = reformat_name(participant[3]).title()

                # The nationality arrives as a flag image tag, e.g.
                # "[img:flags/CH.gif]"; keep only the two-letter code.
                match = re.search(r'\[img:flags/([A-Z]{2})\.gif\]', participant[4])
                country_code = match.group(1) if match else ""

                birth_year = participant[5]
                club = participant[6]

                # Convert the chip time text to a Timedelta.
                formatted_chip_time = pd.to_timedelta(participant[7])

                runner_info = [bib, bib_2, rank,
                               formatted_name, country_code,
                               birth_year, gender,
                               club, formatted_chip_time]
                print(runner_info, '\n', '*'*20)
                runner_list.append(runner_info)
            except Exception as exc:
                # Best effort: report the malformed row and keep going.
                # `Exception` (not a bare except) lets Ctrl-C interrupt the cell.
                print('Issue', participant, repr(exc))
    elif response is not None:
        print(f"Failed to retrieve data. Status code: {response.status_code}")

    # Be polite to the server between the two list requests.
    time.sleep(delay_seconds)
    print('******* delay! ******* ')

Fetching from: https://my1.raceresult.com/156560/RRPublish/data/list?key=2049a5fb34f5e180b9d6cf54d6150d67&listname=Online%7CFinal&page=results&contest=0&r=group&name=%231_Swiss%20Peaks%20360%0C%232_Male&f=Swiss%20Peaks%20360%0C%0C%3CIgnore%3E
['2', '2', '1.', 'Franco Collé', 'IT', '1978', 'M', 'HOKA ONE ONE/ Karpos', Timedelta('2 days 14:45:34')] 
 ********************
['8', '8', '2.', 'Jonas Russi', 'CH', '1985', 'M', '', Timedelta('2 days 14:45:35')] 
 ********************
['1', '1', '3.', 'Andrea Mattiato', 'IT', '1978', 'M', '', Timedelta('2 days 22:32:18')] 
 ********************
['3', '3', '4.', 'Michael Nançoz', 'CH', '1988', 'M', '', Timedelta('3 days 05:08:16')] 
 ********************
['250', '250', '5.', 'Javier Galve Sainz De Varanda', 'ES', '1982', 'M', '', Timedelta('3 days 08:19:35')] 
 ********************
['13', '13', '6.', 'Rainer Hilz', 'DE', '1985', 'M', '', Timedelta('3 days 08:53:25')] 
 ********************
['93', '93', '7.', 'Matthieu Moreau', 'FR', '1980', 'M', 

******* delay! ******* 
Fetching from: https://my1.raceresult.com/156560/RRPublish/data/list?key=2049a5fb34f5e180b9d6cf54d6150d67&listname=Online%7CFinal&page=results&contest=0&r=group&name=%231_Swiss%20Peaks%20360%0C%231_Female&f=Swiss%20Peaks%20360%0C%0C%3CIgnore%3E
['10', '10', '1.', 'Anita Lehmann', 'CH', '1980', 'F', '', Timedelta('3 days 13:10:45')] 
 ********************
['18', '18', '2.', 'Claire Bannwarth', 'FR', '1989', 'F', '', Timedelta('3 days 14:15:28')] 
 ********************
['48', '48', '3.', 'Emily Vaudan', 'CH', '1983', 'F', '', Timedelta('3 days 17:05:29')] 
 ********************
['24', '24', '4.', 'Denise Zimmermann', 'CH', '1975', 'F', '', Timedelta('3 days 20:57:41')] 
 ********************
['163', '163', '5.', 'Katja Fink', 'CH', '1975', 'F', '', Timedelta('3 days 22:05:22')] 
 ********************
['61', '61', '6.', 'Daphné Derouch', 'FR', '1989', 'F', '', Timedelta('4 days 04:41:34')] 
 ********************
['77', '77', '7.', 'Catherine Rion', 'CH', '1978', 'F

In [34]:

# One row per scraped runner; column order mirrors the per-runner lists
# collected in `runner_list`.
SP_dem_df = pd.DataFrame(
    runner_list,
    columns=['Bib', 'Bib_2', 'Rank',
             'Name', 'Nationality',
             'Birth Year', 'Gender',
             'Club', 'Duration'],
)

# Rank holds 'DNF' / 'DNS' for those runners; every other rank value is
# labelled 'Finished'.
non_finisher_mask = SP_dem_df['Rank'].isin(['DNF', 'DNS'])
SP_dem_df['Status'] = np.where(non_finisher_mask, SP_dem_df['Rank'], 'Finished')

# Tag every row with the edition and race, then build the primary key
# '<race>_<year>_<bib>' from those columns.
SP_dem_df = SP_dem_df.assign(
    Year=year,
    Race='SP360',
    PK=lambda frame: frame['Race'] + '_' + frame['Year'] + '_' + frame['Bib'],
)

SP_dem_df.head()

Unnamed: 0,Bib,Bib_2,Rank,Name,Nationality,Birth Year,Gender,Club,Duration,Status,Year,Race,PK
0,2,2,1.0,Franco Collé,IT,1978,M,HOKA ONE ONE/ Karpos,2 days 14:45:34,Finished,2020,SP360,SP360_2020_2
1,8,8,2.0,Jonas Russi,CH,1985,M,,2 days 14:45:35,Finished,2020,SP360,SP360_2020_8
2,1,1,3.0,Andrea Mattiato,IT,1978,M,,2 days 22:32:18,Finished,2020,SP360,SP360_2020_1
3,3,3,4.0,Michael Nançoz,CH,1988,M,,3 days 05:08:16,Finished,2020,SP360,SP360_2020_3
4,250,250,5.0,Javier Galve Sainz De Varanda,ES,1982,M,,3 days 08:19:35,Finished,2020,SP360,SP360_2020_250


In [35]:
# Peek at the last five rows of the demographics table.
SP_dem_df.tail(5)

Unnamed: 0,Bib,Bib_2,Rank,Name,Nationality,Birth Year,Gender,Club,Duration,Status,Year,Race,PK
270,51,51,DNS,Dariia Bodnar,UA,1984,F,,NaT,DNS,2020,SP360,SP360_2020_51
271,174,174,DNS,Keri Devine,NZ,1973,F,,NaT,DNS,2020,SP360,SP360_2020_174
272,214,214,DNS,Yan Gu,CN,1971,F,,NaT,DNS,2020,SP360,SP360_2020_214
273,226,226,DNS,Satu Iho,FI,1984,F,,NaT,DNS,2020,SP360,SP360_2020_226
274,266,266,DNS,Magali Juvenal,FR,1971,F,,NaT,DNS,2020,SP360,SP360_2020_266


In [36]:
# Store durations as strings before writing the spreadsheet.
SP_dem_df['Duration'] = SP_dem_df['Duration'].astype('str')

# Create the output folder on first use so the export does not fail when
# `Data/` is missing (e.g. on a fresh checkout).
os.makedirs('Data', exist_ok=True)
SP_dem_df.to_excel(f'Data/SwissPeaks360_{year}_DEM.xlsx', index=False)

In [37]:
# Show only the runners whose status is 'DNF'.
SP_dem_df.loc[SP_dem_df['Status'].eq('DNF')]

Unnamed: 0,Bib,Bib_2,Rank,Name,Nationality,Birth Year,Gender,Club,Duration,Status,Year,Race,PK
129,11,11,DNF,Krystian Pietrzak,PL,1983,M,,NaT,DNF,2020,SP360,SP360_2020_11
130,15,15,DNF,Silvan Burkhalter,CH,1986,M,,NaT,DNF,2020,SP360,SP360_2020_15
131,16,16,DNF,Vincent Pelisson,FR,1993,M,HOKA ONE ONE Italy,NaT,DNF,2020,SP360,SP360_2020_16
132,22,22,DNF,Sebastien Aubineau,FR,1976,M,,NaT,DNF,2020,SP360,SP360_2020_22
133,23,23,DNF,Nicolas Lehmann,CH,1981,M,,NaT,DNF,2020,SP360,SP360_2020_23
...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,173,173,DNF,Cindy Rosset,CH,1977,F,,NaT,DNF,2020,SP360,SP360_2020_173
266,189,189,DNF,Xiabingqing Wu,CH,1990,F,,NaT,DNF,2020,SP360,SP360_2020_189
267,197,197,DNF,Julie Rapenne,FR,1980,F,,NaT,DNF,2020,SP360,SP360_2020_197
268,206,206,DNF,Virginie Humblet,BE,1973,F,,NaT,DNF,2020,SP360,SP360_2020_206


### Using the bib column to extract aid-station splits

In [None]:
# Preview the first identifier that will be sent to the splits endpoint.
# SP_dem_df has no 'ID' column; the participant identifier is stored in 'Bib'.
list(SP_dem_df['Bib'].unique())[:1]

In [None]:
# Seconds to wait for the splits server before giving up on a request;
# without a timeout a stalled connection would block this long loop forever.
REQUEST_TIMEOUT_SECONDS = 30


def _format_split_duration(duration):
    """Rewrite a colon-separated split time as '<first field> days <rest>'.

    The text before the first ':' is treated as the day count and the
    remainder is kept as-is. Missing or empty values yield NaN.
    """
    if not duration:
        return np.nan
    days, _, clock = duration.partition(':')
    return f"{days} days {clock}"


station_list = []       # one row per (participant, split)
failed_to_fetch = []    # identifiers whose request did not succeed

# Loop through every unique bib in the demographics table.
# SP_dem_df has no 'ID' column; the participant identifier is stored in 'Bib'.
for n, unique_id in enumerate(SP_dem_df['Bib'].unique(), start=1):
    print(n, ' - ', unique_id)

    # URL of the JSON file with this participant's splits
    url = f'https://my4.raceresult.com/156560/RRPublish/data/splits?key=374250657b42ce1a7a8e1a4a3311786d&pid={unique_id}'
    print(url)

    # Send a GET request; network failures are recorded, not fatal, so one
    # bad request does not throw away the rows collected so far.
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        print(f"Failed to fetch data for {unique_id}. Request error: {exc!r}")
        failed_to_fetch.append(unique_id)
    else:
        if response.status_code == 200:
            # The per-station records live under the 'Splits' key.
            for station in response.json()['Splits']:
                station_list.append([
                    unique_id,
                    station['Name'],
                    station['Exists'],
                    _format_split_duration(station.get('Gun')),
                    _format_split_duration(station.get('Chip')),
                    _format_split_duration(station.get('Sector')),
                ])
        else:
            print(f"Failed to fetch data for {unique_id}. Status code: {response.status_code}")
            failed_to_fetch.append(unique_id)

    #### So we dont get blocked!!!!
    # Random pause between requests; kept in its own variable so the
    # `delay_seconds` setting from the configuration cell is not overwritten.
    pause_seconds = random.randint(5, 20)
    print('Delay before calling the server again: ', pause_seconds)
    print('*'*20)
    time.sleep(pause_seconds)



In [None]:
# Convert the station list to a DataFrame. Every row appended to
# `station_list` holds six values (id, station name, exists flag and the
# gun / chip / sector durations), so six column names are required; a
# four-name list makes the constructor raise ValueError.
# NOTE(review): the three duration columns are named after the payload keys
# they were read from ('Gun', 'Chip', 'Sector'); rename them here if
# downstream notebooks expect different labels.
aid_station_df = pd.DataFrame(
    station_list,
    columns=['Bib', 'Aid Station', 'Exists',
             'Gun Duration', 'Chip Duration', 'Sector Duration'],
)
aid_station_df['Year'] = year
aid_station_df['Race'] = 'SP360'

# Same '<race>_<year>_<bib>' key format as the demographics table.
aid_station_df['PK'] = aid_station_df['Race'] +'_' + aid_station_df['Year'] +'_' + aid_station_df['Bib']

# Preview the DataFrame
aid_station_df.head()

In [None]:
# List the distinct aid-station names, in order of first appearance.
pd.unique(aid_station_df['Aid Station'])

In [None]:
# Write the aid-station table to a spreadsheet named after the race and year.
aid_stations_path = f'Data/SwissPeaks360_{year}_aid_stations.xlsx'
aid_station_df.to_excel(aid_stations_path, index=False)

In [None]:
# Stop the clock that was started in the first code cell (`start_time`).
end_time = time.time()

# Total wall-clock time of the run, in seconds. This includes the deliberate
# time.sleep() pauses between requests, not just computation.
execution_time = end_time - start_time
print(f"Execution time: {execution_time:.6f} seconds")