# Scraping IFSC data

In [14]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from urllib.parse import urlparse, parse_qs
import pandas as pd
import time

In [22]:
def link_generator(original_link):
    """Build the ``result-complete`` component URL for a calendar result link.

    The ``event`` and ``result`` query parameters of ``original_link`` are read,
    converted to integers, and placed into the components URL.

    Args:
        original_link: URL whose query string carries ``event`` and ``result``.

    Returns:
        The generated URL on success. On failure no exception is raised; a
        string starting with ``'Error: '`` is returned instead (either the
        missing-parameter message or the text of the exception that occurred).
    """
    try:
        params = parse_qs(urlparse(original_link).query)

        # parse_qs maps each key to a list of values; only the first one is used
        raw_event = params.get('event', [None])[0]
        raw_result = params.get('result', [None])[0]

        # Guard clause: both parameters are mandatory
        if raw_event is None or raw_result is None:
            return 'Error: Missing required parameters (event or result) in the URL.'

        # int() normalises the values; a non-numeric value raises and is
        # reported through the except branch below
        event_id = int(raw_event)
        result_id = int(raw_result)

        return f'https://components.ifsc-climbing.org/result-complete/?event={event_id}&result={result_id}'

    except Exception as e:
        return f'Error: {e}'

In [32]:
def get_competition_data(year):
    """Collect one record per linked category for every competition of ``year``.

    Selects ``year`` in the ``yearSelect`` dropdown of the page currently loaded
    in the module-level ``driver``, then reads every element with the
    ``competition`` class.

    Args:
        year: Visible text of the dropdown option to select (e.g. ``'2024'``).

    Returns:
        A list of dicts with the keys ``title``, ``date``, ``category`` and
        ``url``. ``url`` is the value returned by ``link_generator`` for the
        category's link. Categories whose tag contains no ``<a>`` element are
        omitted.
    """
    menu = Select(driver.find_element(By.ID, "yearSelect"))
    menu.select_by_visible_text(year)
    # This configures the driver's implicit wait for element lookups;
    # it is a session setting, not a pause.
    driver.implicitly_wait(10)

    print("YEAR", year)

    # Store information of each competition
    world_cups_list = []

    # Find competitions by class name
    for c in driver.find_elements(By.CLASS_NAME, 'competition'):
        title = c.find_element(By.CLASS_NAME, 'title').text
        date = c.find_element(By.CLASS_NAME, 'date').text

        # Categories (BOULDER, LEAD, SPEED) are the elements with class 'tag'
        for cat in c.find_elements(By.CLASS_NAME, 'tag'):
            category_name = cat.text

            try:
                a_tag = cat.find_element(By.TAG_NAME, 'a')
            except NoSuchElementException:
                # A tag without a link has nothing to scrape. Skip only this
                # category: breaking out of the loop here would also discard
                # the remaining categories of the same competition.
                continue

            world_cups_list.append({
                'title': title,
                'date': date,
                'category': category_name,
                'url': link_generator(a_tag.get_attribute("href")),
            })

    return world_cups_list

In [24]:
def save_csv(comp_list, year):
    """Write one year's competition records to ``world_cups_<year>.csv``.

    Args:
        comp_list: Records to store; passed unchanged to ``pd.DataFrame``.
        year: Value interpolated into the output file name.
    """
    output_name = f'world_cups_{year}.csv'

    # The row index is not written, so the file holds only the record columns
    pd.DataFrame(comp_list).to_csv(output_name, index=False, encoding='utf-8')

### First Step: Get competition information

In this step, we scrape the calendar page to collect all the available World Cup information: the `event name`, the `date`, the `categories` contested at each event that year, and the `url` of each category's competition.

The competition information for each year is saved to its own CSV file.

In [33]:
# Start a headless Chrome session and open the IFSC calendar page.
# `driver` is the module-level browser handle read by get_competition_data()
# and by the year loop in the next cell.
options = Options()
options.add_argument('--headless')
# options.add_argument('--incognito')
driver = webdriver.Chrome(options=options)


driver.get("https://components.ifsc-climbing.org/calendar/")

In [34]:
# Scrape every listed year and write one CSV per year.
# Everything runs inside try/finally so the browser started in the previous
# cell is closed even when a lookup raises an unexpected exception.
MAX_STALE_RETRIES = 3

try:
    # Find dropdown menu using its ID
    dropdown = driver.find_element(By.ID, "yearSelect")

    # Get all years in the dropdown menu using CSS selector
    values = dropdown.find_elements(By.CSS_SELECTOR, 'option')
    years = []

    for v in values:
        # The IFSC calendar has no World Cup information before 2007, so stop
        # at the 2006 option (assumes the options are listed newest-first).
        if v.text == '2006':
            break

        years.append(v.text)

    # Loop through years
    for year in years:
        for _ in range(MAX_STALE_RETRIES):
            try:
                # Re-locate the dropdown on every attempt: a reference obtained
                # before the page updated may be stale
                dropdown = driver.find_element(By.ID, "yearSelect")

                # Select the year from the dropdown
                menu = Select(dropdown)
                menu.select_by_visible_text(year)

                # Wait for some time to let the page load
                time.sleep(2)

                # Get competition data for the selected year
                competition_list = get_competition_data(year)
                save_csv(competition_list, year)
                break

            except StaleElementReferenceException:
                # Retry the same year instead of silently dropping its data
                print("StaleElementReferenceException: Refreshing dropdown.")
        else:
            # Every attempt hit a stale element; report the gap explicitly
            print(f"Giving up on {year} after {MAX_STALE_RETRIES} attempts; no CSV written.")
finally:
    driver.quit()

YEAR 2024
YEAR 2023
YEAR 2022
YEAR 2021
YEAR 2020
YEAR 2019
YEAR 2018
YEAR 2017
YEAR 2016
YEAR 2015
YEAR 2014
YEAR 2013
YEAR 2012
YEAR 2011
YEAR 2010
YEAR 2009
YEAR 2008
YEAR 2007


### Second Step: Scrape Competition Results for each Year

The second step involves scraping the competition results from each URL obtained in the first step.

In [2]:
# Start a Chrome session in incognito mode (the headless flag is commented
# out here) and open a single result page.
options = Options()
# options.add_argument('--headless')
options.add_argument('--incognito')
driver = webdriver.Chrome(options=options)


driver.get("https://components.ifsc-climbing.org/result-complete/?event=1291&result=3")
# Set the driver's implicit wait to 0.5 (seconds, per the Selenium API) for
# the element lookups that follow.
driver.implicitly_wait(0.5)

In [3]:
# Scrape the first <table> on the loaded result page into a header list and a
# list of row lists, then save it as CSV.
table_header = []
table_data = []

try:
    table = driver.find_element(By.CSS_SELECTOR, 'table')

    # Column titles: the <th> cells of the first row inside <thead>
    thead = table.find_element(By.CSS_SELECTOR, 'thead')
    header_row = thead.find_element(By.CSS_SELECTOR, 'tr')
    table_header = [cell.text for cell in header_row.find_elements(By.CSS_SELECTOR, 'th')]

    # Data rows: the text of every <td> of every <tr> inside <tbody>
    tbody = table.find_element(By.CSS_SELECTOR, 'tbody')
    table_data = [
        [cell.text for cell in row.find_elements(By.CSS_SELECTOR, 'td')]
        for row in tbody.find_elements(By.CSS_SELECTOR, 'tr')
    ]
finally:
    # Close the browser even when one of the lookups above fails, so no
    # Chrome window is left behind
    driver.quit()

# Create a Pandas DataFrame from the table data
df = pd.DataFrame(table_data, columns=table_header)

# Export the DataFrame to a CSV file
csv_file_path = 'table_data_pandas.csv'
df.to_csv(csv_file_path, index=False, encoding='utf-8')



Table Header:
Index(['Rank', 'Name', '', 'Country', 'Qualification', 'Semi-final', 'Final'], dtype='object')
Table Data:
   Rank         Name                      Country Qualification  Semi-final  \
0     1        MEJDI              SCHALCK     FRA      4t4z 6 6    1t4z 2 9   
1     2       HANNES           VAN DUYSEN     BEL     3t5z 9 15   1t4z 8 14   
2     3         PAUL                JENFT     FRA     4t5z 10 9  2t4z 11 13   
3     4       KOKORO                FUJII     JPN     4t5z 8 17    1t2z 3 4   
4     5       SORATO               ANRAKU     JPN     3t5z 4 17   1t4z 1 12   
..  ...          ...                  ...     ...           ...         ...   
86   85        DIEGO  LEQUERICA BUSCAGLIA     PER      0t2z 0 7               
87   88         MARK              SCANLON     IRL      0t1z 0 3               
88   89         IMAN                 MORA     PHI      0t1z 0 8               
89   89   YING-CHIEH                  WEN     TPE      0t0z 0 0               
90   91  J

In [5]:
# Show the last rows of the scraped results table.
# NOTE(review): the saved output of this cell contains 'Unnamed: 0' and
# 'Unnamed: 3' columns, which suggests it was produced from a frame re-read
# from CSV rather than from the `df` built in the scraping cell - confirm.
df.tail()

Unnamed: 0,Rank,Name,Unnamed: 3,Country,Qualification,Semi-final,Final
86,85,DIEGO,LEQUERICA BUSCAGLIA,PER,0t2z 0 7,,
87,88,MARK,SCANLON,IRL,0t1z 0 3,,
88,89,IMAN,MORA,PHI,0t1z 0 8,,
89,89,YING-CHIEH,WEN,TPE,0t0z 0 0,,
90,91,JOHN JOSEPH,VELORIA,PHI,0t0z 0 0,,


In [34]:
# Manual cleanup: close the browser session.
# NOTE(review): the table-scraping cell already calls driver.quit() when it
# runs to completion, so this looks like a leftover for runs where that cell
# failed part-way - confirm whether it is still needed.
driver.quit()