In [1]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

## Section 1: Kaggle

- Open the main website

In [98]:
# Start a Chrome session and maximise its window.
driver = webdriver.Chrome()
driver.maximize_window()

# Kaggle competitions listing (the URL carries sortOption=reward); load it.
website = 'https://www.kaggle.com/competitions?sortOption=reward'
driver.get(website)

- Gather links for each competition

In [99]:
# Locators on the listing page, written as (strategy, expression) pairs.
button_locator = (By.XPATH, '//*[@id="site-content"]/div[2]/div/div[4]/div/div[2]/div/div[1]/button[1]')
links_locator = (By.XPATH, '//*[@id="site-content"]/div[2]/div/div[5]/div/div/div/ul/li/div/a')

# Wait up to 20 s for the button to become clickable, then click it.
button = WebDriverWait(driver, 20).until(EC.element_to_be_clickable(button_locator))
button.click()

# Wait up to 20 s for the competition anchors to be present in the DOM.
competition_links = WebDriverWait(driver, 20).until(
    EC.presence_of_all_elements_located(links_locator)
)

# Keep only the href of the first two anchors.
top_competitions = []
for anchor in competition_links[:2]:
    top_competitions.append(anchor.get_attribute('href'))

- Scraping data from each competition

In [102]:
# Scrape name, overview, description and dataset text for each Kaggle URL in
# `top_competitions`, collecting one dict per competition in `competition_data`.
#
# Every field is best-effort: when its element cannot be located in time, a
# placeholder string is stored so the loop keeps going. The handlers catch
# `Exception` rather than using a bare `except:` so that KeyboardInterrupt and
# SystemExit still stop the cell instead of being silently swallowed.
competition_data = []

for url in top_competitions:
    driver.get(url)
    time.sleep(3)  # Fixed pause before the explicit waits below

    # Competition name: text of the <h1> at this XPath, once visible
    try:
        competition_name = WebDriverWait(driver, 2).until(
            EC.visibility_of_element_located((By.XPATH, '//*[@id="site-content"]/div[2]/div/div/div[2]/div[2]/div[1]/h1'))
        ).text
    except Exception:
        competition_name = "Competition name not found"

    # Overview: text of the paragraph under the element with id "abstract"
    try:
        overview_text = WebDriverWait(driver, 2).until(
            EC.visibility_of_element_located((By.XPATH, '//*[@id="abstract"]/div[1]/div[2]/div/p'))
        ).text
    except Exception:
        overview_text = "Overview text not found"

    # Description: all paragraphs under the element with id "description",
    # joined with single spaces
    try:
        description_paragraphs = WebDriverWait(driver, 2).until(
            EC.presence_of_all_elements_located((By.XPATH, '//*[@id="description"]/div/div[2]/div/div/p'))
        )
        description_text = ' '.join([para.text for para in description_paragraphs])
    except Exception:
        description_text = "Description text not found"

    # Dataset description: lives on the competition's "/data" sub-page.
    # The navigation is inside the try so a failed page load also falls back
    # to the placeholder.
    try:
        driver.get(url + '/data')
        time.sleep(1)
        dataset_paragraphs = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.XPATH, '//*[@id="site-content"]/div[2]/div/div/div[6]/div[1]/div[1]/div/div[2]/div/div[1]/div/div/div/p'))
        )
        dataset_description = ' '.join([para.text for para in dataset_paragraphs])
    except Exception:
        dataset_description = "Dataset description not found"

    # Store the competition name, url, overview, description and dataset text
    competition_data.append({
        'name' : competition_name,
        'url': url,
        'overview_text': overview_text,
        'description_text': description_text,
        'dataset_text' : dataset_description
    })

# Output the data
for data in competition_data:
    print(data)

{'name': 'AI Mathematical Olympiad - Progress Prize 2', 'url': 'https://www.kaggle.com/competitions/ai-mathematical-olympiad-progress-prize-2', 'overview_text': 'The goal of this competition is to create algorithms and models that can solve tricky math problems written in LaTeX format. Your participation will help to advance AI models’ mathematical reasoning skills and drive frontier knowledge.', 'description_text': "Note: This is the second AIMO Progress Prize competition. It builds upon the first AIMO Progress Prize competition, which was won in July 2024 by Project Numina. This second competition has an increased prize pool, a new dataset of problems, increased compute for participants and updated rules for using open-source LLMs. The ability to reason mathematically is a critical milestone for AI. Mathematical reasoning is the foundation for solving many complex problems, from engineering marvels to intricate financial models. However, current AI capabilities are limited in this ar

In [108]:
# End the WebDriver session used for the Kaggle section and close its browser.
driver.quit()


## Section 2: Eval.Ai

- Open the main website

In [104]:
# Start a fresh Chrome session and maximise its window.
driver = webdriver.Chrome()
driver.maximize_window()

# Eval.AI challenge list page; load it.
website = 'https://eval.ai/web/challenges/list'
driver.get(website)

- Gather the competition webpage links

In [105]:
# Implicit wait: element lookups on this driver may retry for up to 10 s
# (tune the value to the page's load time).
driver.implicitly_wait(10)

# Common XPath prefix; a 1-based position is appended to select one
# competition <div>.
base_xpath = '//*[@id="page-wrap"]/div/div/div/ui-view/ui-view/section/div[2]/div'

# XPath positions start at 1, so positions 1 and 2 pick the first two
# competition divs; read the href of the <a> inside each one.
competition_links = [
    driver.find_element(By.XPATH, f'{base_xpath}[{position}]/a').get_attribute('href')
    for position in (1, 2)
]

# Show the collected links, one per line.
print(*competition_links, sep='\n')

https://eval.ai/web/challenges/challenge-page/2429
https://eval.ai/web/challenges/challenge-page/2418


- Scraping from each competition

In [106]:
# Scrape the overview text and name for each Eval.AI URL in
# `competition_links`, collecting one dict per competition in
# `competition_data_eval`.
#
# Both fields are best-effort: a placeholder string is stored when the element
# cannot be located. Handlers catch `Exception` (not a bare `except:`) so
# KeyboardInterrupt / SystemExit still stop the cell.
competition_data_eval = []
for url in competition_links:
    driver.get(url)
    time.sleep(1)  # Fixed pause before the explicit waits below
    try:
        paragraphs_xpath = '//*[@id="page-wrap"]/div/div/div/ui-view/ui-view/ui-view/section/div/div[2]/div/div/p'

        # Wait until the paragraph elements are present in the DOM
        description_paragraphs = WebDriverWait(driver, 2).until(
            EC.presence_of_all_elements_located((By.XPATH, paragraphs_xpath))
        )

        # Join the text of every paragraph with single spaces
        competition_overview = ' '.join([paragraph.text for paragraph in description_paragraphs])
    except Exception:
        competition_overview = "Overview text not found"

    try:
        name_xpath = '//*[@id="page-wrap"]/div/div/div/ui-view/ui-view/section/div/div[1]/div[2]/div/h4'
        competition_name = WebDriverWait(driver, 2).until(
            EC.visibility_of_element_located((By.XPATH, name_xpath))
        ).text
    except Exception:
        # The fallback must be bound to `competition_name`, the variable stored
        # below. Binding it to any other name would leave `competition_name`
        # undefined on the first iteration (NameError) or carrying the previous
        # competition's name on later ones.
        competition_name = "Name of competition not found"
    competition_data_eval.append({'url' : url,
                                'overview' : competition_overview,
                                'name' : competition_name})

# Display the collected records as the cell output
competition_data_eval
    

[{'url': 'https://eval.ai/web/challenges/challenge-page/2429',
  'overview': 'Surgical action triplet detection To detect surgical activities as triplets of {`instruments, verb, target`} where :',
  'name': 'CholecTriplet Challenge Detection Evaluation'},
 {'url': 'https://eval.ai/web/challenges/challenge-page/2418',
  'overview': "OpenAD is the first open-world 3D object detection benchmark for autonomous driving. We meticulously selected 2,000 scenes from 5 public datasets and annotated 6,597 3D corner cases for these scenes. Together with the original annotations of these scenes, there are 19,761 objects belonging to 206 different categories. You can utilize OpenAD to evaluate your model's open-world capabilities, encompassing scene generalization, cross-vehicle-type adaptability, open-vocabulary proficiency, and corner case detection aptitude. We provide a toolkit to organize data, load data, and evaluate your model with simple commands. Access the data and code here.",
  'name': '

In [107]:
# End the WebDriver session used for the Eval.AI section and close its browser.
driver.quit()

## Section 3: DrivenData

In [19]:
# Start a fresh Chrome session and maximise its window.
driver = webdriver.Chrome()
driver.maximize_window()

# DrivenData competition search (the URL carries sort=total_prize_purse); load it.
website = 'https://www.drivendata.org/competitions/search/?sort=total_prize_purse'
driver.get(website)

In [21]:
# Collect the hrefs of the first two competition links on the DrivenData
# listing page into `hrefs`. On any failure `hrefs` is left as an empty list so
# later cells can still iterate over it.
try:
    # Wait up to 20 s for the competition list container (located by its ID)
    competition_list_div = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.ID, "competition-list"))
    )

    # All descendant <a> tags whose class attribute is exactly
    # 'text-decoration-none' and that carry an href
    competition_links = competition_list_div.find_elements(By.XPATH, ".//a[@class='text-decoration-none'][@href]")

    # Keep the href of the first two links only
    hrefs = [link.get_attribute('href') for link in competition_links[:2]]

    # Output the collected links
    for href in hrefs:
        print(href)
except Exception as e:
    # Report the failure instead of hiding it, then fall back to an empty list.
    # The browser is NOT closed here; a later cell calls driver.quit().
    # Catching Exception (not a bare except) lets KeyboardInterrupt through.
    print(f"Error collecting competition links: {str(e)}")
    hrefs = []

https://www.drivendata.org/competitions/group/nist-federated-learning/
https://www.drivendata.org/competitions/group/nih-nia-alzheimers-adrd-competition/


In [22]:
main_competition_links = hrefs

# Maps each main competition URL to the de-duplicated list of hrefs found
# inside its "competition-subgroup" elements.
all_sub_competition_links = {}

for main_link in main_competition_links:
    driver.get(main_link)
    try:
        # Wait up to 20 s for the elements with class "competition-subgroup"
        subgroup_divs = WebDriverWait(driver, 20).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "competition-subgroup"))
        )
        # Walk every <a href> in page order; dict.fromkeys keeps only the first
        # occurrence of each href while preserving that order.
        ordered_hrefs = dict.fromkeys(
            anchor.get_attribute('href')
            for subgroup in subgroup_divs
            for anchor in subgroup.find_elements(By.XPATH, ".//a[@href]")
        )
        all_sub_competition_links[main_link] = list(ordered_hrefs)
    except Exception as e:
        # Report the problem and record an empty list for this competition
        print(f"Error processing {main_link}: {str(e)}")
        all_sub_competition_links[main_link] = []

# Print each main competition followed by its indented sub-links
for main_link, sub_links in all_sub_competition_links.items():
    print(f"Main Competition: {main_link}")
    for link in sub_links:
        print(f"  Sub-Competition: {link}")

Main Competition: https://www.drivendata.org/competitions/group/nist-federated-learning/
  Sub-Competition: https://www.drivendata.org/competitions/98/nist-federated-learning-1/
  Sub-Competition: https://www.drivendata.org/competitions/search/?category=privacy
  Sub-Competition: https://www.drivendata.org/competitions/search/?type=privacy
  Sub-Competition: https://www.drivendata.org/competitions/105/nist-federated-learning-2-financial-crime-federated/
  Sub-Competition: https://www.drivendata.org/competitions/144/nist-federated-learning-2-financial-crime-centralized/
  Sub-Competition: https://www.drivendata.org/competitions/103/nist-federated-learning-2-pandemic-forecasting-federated/
  Sub-Competition: https://www.drivendata.org/competitions/145/nist-federated-learning-2-pandemic-forecasting-centralized/
  Sub-Competition: https://www.drivendata.org/competitions/139/nist-federated-learning-3-red-teams/
Main Competition: https://www.drivendata.org/competitions/group/nih-nia-alzheime

In [18]:
# End the WebDriver session used for the DrivenData section and close its browser.
driver.quit()