In [1]:
from webdriver_manager.chrome import ChromeDriverManager
# Resolve the chromedriver location once, up front. The resulting value is later
# passed to collect_all_papers(), which hands it to webdriver.Chrome().
# NOTE(review): install() presumably downloads/caches a matching driver and
# returns its filesystem path -- confirm against the webdriver_manager docs.
chrome_path = ChromeDriverManager().install()

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import json
import re
import time
import os
import numpy as np

def create_if_not_exists(path: str) -> None:
    """
    Creates the specified folder (and any missing parents) if it does not exist.

    Calling it on an already existing folder is a no-op.
    :param path: the complete path of the folder to be created
    :raises FileExistsError: if ``path`` exists but is not a directory
    """
    # exist_ok=True replaces the former "check, then create" sequence, which
    # could fail if the folder appeared between the check and the creation.
    os.makedirs(path, exist_ok=True)

def extract_from_paper_elements(paper_elements, sleep_time=0.05):
    """
    Extracts title, link, keywords and abstract from a list of paper elements.

    For every element the link inside its ``h4`` heading is read, the element's
    collapse widget is clicked, and the element's visible text is then parsed
    line by line. The first text line is taken as the title; the abstract is
    everything between the line starting with 'Abstract' and the first line
    starting with 'Primary Area', 'Code Of Ethics' or 'Supplementary' (or the
    end of the text when none of those lines is present).
    :param paper_elements: iterable of Selenium elements, one per listed paper
    :param sleep_time: seconds to wait after clicking the collapse widget
        before the element's text is read
    :return: list of dicts with the keys 'title', 'href', 'keywords' and
        'abstract'; 'keywords' is None when no 'Keywords' line is present
    :raises ValueError: if no line starting with 'Abstract' is found for a paper
    """
    # Lines that mark the end of the abstract section.
    end_markers = ('Primary Area', 'Code Of Ethics', 'Supplementary')

    results = []
    for paper in paper_elements:
        # 'h4 > a' is a CSS selector, so it must go through the CSS locator
        # (find_element(By..., ...) is available in Selenium 3 and 4 alike).
        href = paper.find_element(By.CSS_SELECTOR, 'h4 > a').get_attribute('href')
        # Toggle the details section; its full selector from the page root looks like
        # #accept-oral > div > div > ul > li:nth-child(1) > div > div.collapse-widget > a
        paper.find_element(By.CSS_SELECTOR, 'div > div.collapse-widget > a').click()
        time.sleep(sleep_time)

        contents = paper.text.split('\n')
        title = contents[0]
        keywords = None
        # Reset the slice bounds for every paper so that a paper with a missing
        # section can never silently reuse the bounds of the previous paper.
        start = end = None
        # The title line (index 0) is skipped: a title that happens to start
        # with one of the section names must not be parsed as metadata.
        for i, content in enumerate(contents[1:], start=1):
            if content.startswith('Keywords'):
                # Split on the first colon only, so keywords containing ':' survive.
                _, _, raw_keywords = content.partition(':')
                keywords = [x.strip().lower() for x in raw_keywords.split(',')]
            elif content.startswith('Abstract'):
                start = i + 1
            elif content.startswith(end_markers):
                end = i
                break

        if start is None:
            # Typically means the details were not (yet) visible when the text
            # was read; raising lets the caller decide whether to retry.
            raise ValueError(f'no abstract section found for paper {title!r}')

        # end stays None when no terminating line exists -> slice to the end.
        abstract = ' '.join(contents[start:end])

        results.append({
            'title': title,
            'href': href,
            'keywords': keywords,
            'abstract': abstract,
        })
    return results

def collect_all_papers(chrome_path, paper_type, num_pages, max_retries=10, sleep_time=0.05):
    """
    Scrapes the accepted ICLR 2024 papers of one category from OpenReview.

    Opens a Chrome session on the conference page, extracts the papers listed
    in the ``accept-<paper_type>`` tab page by page (clicking the right-arrow
    pagination link in between) and closes the browser again when done.
    :param chrome_path: value handed to ``webdriver.Chrome`` to locate the driver
    :param paper_type: one of 'oral', 'poster' or 'spotlight'
    :param num_pages: number of result pages to walk through
    :param max_retries: how many times the extraction of one page is attempted
    :param sleep_time: base delay in seconds used between browser interactions
    :return: dict mapping each paper's href to its extracted record
    :raises ValueError: if ``paper_type`` is not one of the supported values
    :raises RuntimeError: if a page cannot be extracted within ``max_retries`` attempts
    """
    # Explicit validation instead of assert, which is stripped under "python -O".
    if paper_type not in ('oral', 'poster', 'spotlight'):
        raise ValueError(
            f"paper_type must be one of 'oral', 'poster', 'spotlight', got {paper_type!r}"
        )

    session = webdriver.Chrome(chrome_path)
    try:
        session.implicitly_wait(10)

        session.get(f'https://openreview.net/group?id=ICLR.cc/2024/Conference&referrer=%5BHomepage%5D(%2F)#tab-accept-{paper_type}')
        time.sleep(0.2)

        papers = dict()

        start = time.time()
        for page in range(num_pages):
            last_error = None
            for i in range(max_retries):
                try:
                    # Look the elements up on every attempt: references obtained
                    # before a failed attempt may have gone stale in the meantime.
                    paper_elements = (
                        session.find_element(By.ID, f'accept-{paper_type}')
                        .find_element(By.CLASS_NAME, 'submissions-list')
                        .find_elements(By.CSS_SELECTOR, 'li > div')
                    )
                    # NOTE(review): each attempt clicks every paper's collapse
                    # widget again -- confirm that repeated clicks are harmless.
                    results = extract_from_paper_elements(paper_elements, sleep_time=sleep_time)
                    break
                except Exception as error:
                    # Exception (not a bare except) so Ctrl-C still interrupts.
                    last_error = error
                    print(f'retrying ({i}/{max_retries})...')
                    time.sleep(sleep_time)
            else:
                # Never fall through with missing or stale results from an earlier page.
                raise RuntimeError(
                    f'could not extract page {page+1}/{num_pages} after {max_retries} attempts'
                ) from last_error

            # appending to the final result.
            for res in results:
                papers[res['href']] = res

            print(f'page {page+1}/{num_pages} done. ({time.time()-start:.2f} secs)')
            start = time.time()

            if page == num_pages - 1:
                # Last requested page: nothing to navigate to.
                break

            # Get the button and nav to the next page
            button = session.find_element(By.CSS_SELECTOR, f'#accept-{paper_type} > div > div > nav > ul > li.right-arrow > a') # get the first right-arrow button.
            button.click()
            time.sleep(2*sleep_time)
            session.execute_script("window.scrollTo(0, 0);")
            time.sleep(sleep_time)

        return papers
    finally:
        # Always shut the browser down, also when scraping fails midway.
        session.quit()

In [None]:
# Output folder for the scraped JSON files; created on demand.
data_dir = os.path.abspath('../data/ICLR/2024/')
create_if_not_exists(data_dir)

# --- oral papers (4 result pages) ---
orals = collect_all_papers(chrome_path, 'oral', num_pages=4, sleep_time=0.2)
orals_path = os.path.join(data_dir, 'orals.json')
with open(orals_path, 'w') as f:
    json.dump(orals, f, indent=4)

# --- spotlight papers (15 result pages) ---
spotlights = collect_all_papers(chrome_path, 'spotlight', num_pages=15, sleep_time=0.2)
spotlights_path = os.path.join(data_dir, 'spotlights.json')
with open(spotlights_path, 'w') as f:
    json.dump(spotlights, f, indent=4)

# --- poster papers (72 result pages, slightly longer delay) ---
posters = collect_all_papers(chrome_path, 'poster', num_pages=72, sleep_time=0.25)
posters_path = os.path.join(data_dir, 'posters.json')
with open(posters_path, 'w') as f:
    json.dump(posters, f, indent=4)