In [1]:
# OpenReview venue id used to select the notes fetched from the API.
VENUE_ID = 'NeurIPS.cc/2024/Conference'

# Venue labels of the three presentation tiers.
VENUE_LS = [
    'NeurIPS 2024 oral',
    'NeurIPS 2024 spotlight',
    'NeurIPS 2024 poster',
]

# Short conference label.
CONFERENCE_NAME = 'NeurIPS'

# Destination of the generated abstracts PDF.
OUTPUT_PATH = 'output/NeurIPS 2024 Abstracts.pdf'

## Init API client

In [2]:
import os
from dotenv import load_dotenv
import openreview

# Load settings from the local `.env` file so credentials stay out of the notebook.
load_dotenv('.env')
# Client for the OpenReview API v2 endpoint; the login is read from the
# `username` / `password` environment variables.
# NOTE(review): `username` can collide with an OS-provided variable (e.g.
# USERNAME on Windows) — confirm the value picked up is the OpenReview login.
client = openreview.api.OpenReviewClient(
    baseurl='https://api2.openreview.net',
    username=os.getenv("username"),
    password=os.getenv("password"),
)

## Scraping prep

In [3]:
# Show the ids of all main-conference venues of the three target conferences.
top3 = ['ICML', 'ICLR', 'NeurIPS']
venues = client.get_group(id='venues').members
venue_ids = sorted(
    ven
    for ven in venues
    if 'Conference' in ven and any(con in ven for con in top3)
)
venue_ids

['ICLR.cc/2018/Conference',
 'ICLR.cc/2019/Conference',
 'ICLR.cc/2020/Conference',
 'ICLR.cc/2021/Conference',
 'ICLR.cc/2022/Conference',
 'ICLR.cc/2023/Conference',
 'ICLR.cc/2024/Conference',
 'ICLR.cc/2025/Conference',
 'ICML.cc/2020/Conference',
 'ICML.cc/2023/Conference',
 'ICML.cc/2024/Conference',
 'ICML.cc/2025/Conference',
 'NeurIPS.cc/2020/Conference',
 'NeurIPS.cc/2021/Conference',
 'NeurIPS.cc/2022/Conference',
 'NeurIPS.cc/2023/Conference',
 'NeurIPS.cc/2024/Conference',
 'NeurIPS.cc/2025/Conference']

In [4]:
# Fetch one known paper note by id and print it to inspect how its `content`
# field is laid out. The cells below read the same content keys (`venue`,
# `primary_area`, `_bibtex`, `abstract`) from every submission.
sample_paper = client.get_notes(id='aVh9KRZdRk')[0]
print(sample_paper)

{'cdate': 1715802154333,
 'content': {'_bibtex': {'value': '@inproceedings{\n'
                                  'he2024learning,\n'
                                  'title={Learning to grok: Emergence of '
                                  'in-context learning and skill composition '
                                  'in modular arithmetic tasks},\n'
                                  'author={Tianyu He and Darshil Doshi and '
                                  'Aritra Das and Andrey Gromov},\n'
                                  'booktitle={The Thirty-eighth Annual '
                                  'Conference on Neural Information Processing '
                                  'Systems},\n'
                                  'year={2024},\n'
                                  'url={https://openreview.net/forum?id=aVh9KRZdRk}\n'
                                  '}'},
             'abstract': {'value': 'Large language models can solve tasks that '
                                   'we

## Paper scraping

In [5]:
# Fetch every note whose `venueid` content field equals VENUE_ID.
# NOTE(review): the bare `submissions` expression renders the whole list
# (thousands of notes) as cell output; `len(submissions)` or a short slice
# would keep the notebook lighter.
submissions = client.get_all_notes(content={'venueid': VENUE_ID})
submissions

Getting V2 Notes: 100%|█████████▉| 4030/4035 [00:03<00:00, 1298.16it/s]


[Note(id = 'zzOOqD6R1b',number = 6188,cdate = 1715586496661,pdate = 1727287807547,odate = 1730873891234,mdate = 1730873891250,tcdate = 1715586496661,tmdate = 1730873891250,ddate = None,content = {'title': {'value': 'Stress-Testing Capability Elicitation With Password-Locked Models'}, 'authors': {'value': ['Ryan Greenblatt', 'Fabien Roger', 'Dmitrii Krasheninnikov', 'David Krueger']}, 'authorids': {'value': ['~Ryan_Greenblatt1', '~Fabien_Roger1', '~Dmitrii_Krasheninnikov1', '~David_Krueger1']}, 'keywords': {'value': ['LLMs', 'Elicitation', 'Fine-tuning', 'Sandbagging', 'Red-teaming', 'Safety']}, 'TLDR': {'value': 'We train models to behave poorly except when the prompt contains a password, and study when supervised fine-tuning and RL can recover high performance.'}, 'abstract': {'value': 'To determine the safety of large language models (LLMs), AI developers must be able to assess their dangerous capabilities. But simple prompting strategies often fail to elicit an LLM’s full capabiliti

In [6]:
# Distinct primary areas found among the fetched submissions.
areas = {note.content['primary_area']['value'] for note in submissions}

areas

{'active_learning',
 'algorithmic_game_theory',
 'bandits',
 'causal_inference',
 'deep_learning_architectures',
 'diffusion_based_models',
 'evaluation',
 'fairness',
 'generative_models',
 'graph_neural_networks',
 'human-AI_interaction',
 'infrastructure',
 'interpretability_and_explainability',
 'learning_theory',
 'machine_learning_for_healthcare',
 'machine_learning_for_other_sciences_and_fields',
 'machine_learning_for_physical_sciences',
 'machine_learning_for_social_sciences',
 'machine_vision',
 'natural_language_processing',
 'neuroscience_and_cognitive_science',
 'online_learning',
 'optimization',
 'optimization_for_deep_networks',
 'other',
 'privacy',
 'probabilistic_methods',
 'reinforcement_learning',
 'robotics',
 'safety_in_machine_learning',
 'speech_and_audio'}

In [7]:
from collections import defaultdict

# Group the submissions as allpaper[venue][primary_area] -> list of notes.
# The outer container stays a plain dict; each venue maps to a
# defaultdict(list) so an area's bucket is created as a list on first use
# (the previous `dict` factory did not match the list values stored here).
allpaper = {}

for submission in submissions:
    venue = submission.content['venue']['value']
    area = submission.content['primary_area']['value']

    papers_by_area = allpaper.setdefault(venue, defaultdict(list))
    papers_by_area[area].append(submission)

allpaper.keys()

dict_keys(['NeurIPS 2024 poster', 'NeurIPS 2024 spotlight', 'NeurIPS 2024 oral'])

## Export .bib & .md

In [8]:
# Spot-check the content keys used by the export cells below. Only the value
# of the last expression (the raw BibTeX string) is displayed by the notebook;
# the preceding expressions are evaluated and their results discarded, so they
# only act as a check that the keys exist.
sample_paper.content
sample_paper.content['venue']['value']
sample_paper.content['primary_area']['value']
sample_paper.content['_bibtex']['value']

'@inproceedings{\nhe2024learning,\ntitle={Learning to grok: Emergence of in-context learning and skill composition in modular arithmetic tasks},\nauthor={Tianyu He and Darshil Doshi and Aritra Das and Andrey Gromov},\nbooktitle={The Thirty-eighth Annual Conference on Neural Information Processing Systems},\nyear={2024},\nurl={https://openreview.net/forum?id=aVh9KRZdRk}\n}'

In [9]:
import re

def parse_bitex(bibtex_data, gen_id=False, id_surfix='', lower_case_type=True):
    """Parse a single BibTeX entry into a flat dict of strings.

    The result always contains ``'type'`` and ``'id'``, followed by one item per
    field: the lower-cased field name mapped to its value with the outer braces
    removed. Values are post-processed as follows:

    * every ``": "`` is rewritten to ``"{:} "``;
    * a value that starts with ``{`` or ends with ``}`` is wrapped in double
      quotes.

    Brace-delimited values (with arbitrary nesting) and bare values such as
    ``year = 2024`` are supported. Quote-delimited values get no special
    handling.

    Args:
        bibtex_data: Text containing one ``@type{key, field = {value}, ...}`` entry.
        gen_id: When true and an ``author`` field with a usable first author
            exists, replace the entry key by
            ``<FirstName><LastName><year>`` (non-alphanumerics removed)
            followed by ``id_surfix``. Otherwise the parsed key is kept.
        id_surfix: Suffix appended to a generated id.
        lower_case_type: Lower-case the entry type (``Article`` -> ``article``).

    Returns:
        dict mapping ``'type'``, ``'id'`` and each field name to a string.

    Raises:
        ValueError: If no BibTeX entry can be found in ``bibtex_data``.
    """
    entry_regex = re.compile(r'@([a-zA-Z]+){([^,]+),(.*)}', re.DOTALL)
    match = entry_regex.search(bibtex_data)

    if not match:
        raise ValueError("Invalid BibTeX data")

    fields = {
        'type': match.group(1),
        # The key is often written on its own line; drop surrounding whitespace.
        'id': match.group(2).strip(),
    }
    fields_str = match.group(3)

    mode = 'key'
    store = ''
    braces_seen = 0  # number of '{' met in the current value (0 => bare value)
    depth = 0        # current brace nesting depth inside the value
    keys = []
    values = []
    last_idx = len(fields_str) - 1

    for idx, char in enumerate(fields_str):
        if mode == 'key':
            if char == '=':
                keys.append(store.strip())
                store = ''
                mode = 'value'
            else:
                store += char
            continue

        # mode == 'value'
        store += char

        if char == '{':
            depth += 1
            braces_seen += 1
            if braces_seen == 1:
                # Discard whatever preceded the opening brace (whitespace).
                store = ''
        elif char == '}' and depth > 0:
            depth -= 1
            if depth == 0:
                # Drop the closing brace of the outermost group.
                store = store[:-1]

        if braces_seen > 0:
            finished = depth == 0
        else:
            # Bare value: ends at ',' or '}' (terminator excluded) or at the
            # end of the field text.
            terminated = char in ',}'
            if terminated:
                store = store[:-1]
            finished = terminated or idx == last_idx

        if finished:
            value = store.strip().replace(": ", "{:} ")
            if value.startswith('{') or value.endswith('}'):
                value = f'"{value}"'
            values.append(value)
            store = ''
            braces_seen = 0
            mode = 'key'

    for key, value in zip(keys, values):
        # A key collected after a value still carries the ',' separator and the
        # indentation of its line; strip both so lookups like 'author' work.
        fields[key.lower().replace('\n', '').replace(',', '').strip()] = value

    if gen_id and 'author' in fields:
        # Authors are separated by "and", possibly wrapped across lines.
        first_author = re.split(r'\s+and\s+', fields['author'].strip(), maxsplit=1)[0]
        if ',' in first_author:
            # "Last, First" form; anything after the first comma counts as given name.
            last_name, first_name = (part.strip() for part in first_author.split(',', 1))
        else:
            *given_names, last_name = first_author.split() or ['']
            first_name = ' '.join(given_names)

        # Keep the parsed key when the author field yields no usable name.
        if first_name or last_name:
            fields['id'] = re.sub(r'[^a-zA-Z0-9]', '', f"{first_name}{last_name}{fields.get('year', '')}") + id_surfix

    if lower_case_type:
        fields['type'] = fields['type'].lower()

    return fields

# Parse the sample paper's BibTeX and regenerate its key; the key suffix comes
# from the CONFERENCE_NAME setting instead of a repeated literal so it follows
# the configuration cell.
bib = sample_paper.content['_bibtex']['value']
fields = parse_bitex(bib, gen_id=True, id_surfix=CONFERENCE_NAME)
fields

{'type': 'inproceedings',
 'id': 'TianyuHe2024NeurIPS',
 'title': 'Learning to grok{:} Emergence of in-context learning and skill composition in modular arithmetic tasks',
 'author': 'Tianyu He and Darshil Doshi and Aritra Das and Andrey Gromov',
 'booktitle': 'The Thirty-eighth Annual Conference on Neural Information Processing Systems',
 'year': '2024',
 'url': 'https://openreview.net/forum?id=aVh9KRZdRk'}

In [10]:
def make_bibtex(fields):
    """Render a field dict as a BibTeX entry string.

    ``fields['type']`` and ``fields['id']`` form the ``@type{id,`` header line.
    Every other item becomes an indented ``key = {value},`` line, in dict
    order. Values are concatenated as-is, so they must be strings. The entry is
    closed by ``}`` followed by a newline.
    """
    header = '@' + fields['type'] + '{' + fields['id'] + ',\n'
    field_lines = [
        '  ' + key + ' = ' + '{' + value + '},\n'
        for key, value in fields.items()
        if key not in ('type', 'id')
    ]
    return header + ''.join(field_lines) + "}\n"

# Render the sample entry held in `fields` to eyeball the generated BibTeX.
print(make_bibtex(fields))

@inproceedings{TianyuHe2024NeurIPS,
  title = {Learning to grok{:} Emergence of in-context learning and skill composition in modular arithmetic tasks},
  author = {Tianyu He and Darshil Doshi and Aritra Das and Andrey Gromov},
  booktitle = {The Thirty-eighth Annual Conference on Neural Information Processing Systems},
  year = {2024},
  url = {https://openreview.net/forum?id=aVh9KRZdRk},
}



In [11]:
# Print the number of papers per primary area, grouped by venue.
for venue_name, papers_by_area in allpaper.items():
    print(venue_name)
    for area_name, papers in papers_by_area.items():
        print(area_name, len(papers))

NeurIPS 2024 poster
safety_in_machine_learning 169
machine_vision 530
generative_models 184
learning_theory 219
natural_language_processing 270
infrastructure 25
machine_learning_for_healthcare 69
graph_neural_networks 110
deep_learning_architectures 176
causal_inference 72
neuroscience_and_cognitive_science 89
reinforcement_learning 251
diffusion_based_models 200
interpretability_and_explainability 111
other 132
probabilistic_methods 107
privacy 77
machine_learning_for_physical_sciences 77
optimization_for_deep_networks 113
optimization 173
speech_and_audio 27
evaluation 40
fairness 46
algorithmic_game_theory 43
online_learning 56
active_learning 23
machine_learning_for_other_sciences_and_fields 127
robotics 38
human-AI_interaction 18
bandits 57
machine_learning_for_social_sciences 19
NeurIPS 2024 spotlight
machine_learning_for_healthcare 5
graph_neural_networks 7
natural_language_processing 21
machine_vision 43
reinforcement_learning 17
generative_models 15
machine_learning_for_other

In [12]:
# Build two exports from `allpaper`:
#   * bib   -> lines of a .bib file (entries without abstracts)
#   * mdbib -> lines of a Markdown file with one ```bibtex block per area
#              (entries including abstracts)
bib = []
mdbib = []
bibkey = set()  # entry keys already emitted, used to keep keys unique

for venue_name, venue in allpaper.items():
    bib.append('% ---------------------------')
    bib.append(f'% {venue_name}')
    bib.append('% ---------------------------\n')

    mdbib.append(f'## {venue_name}\n')

    # Distinct loop names: the inner loop no longer overwrites the venue name.
    for area_name, area in venue.items():
        bib.append(f'% {area_name}\n')
        mdbib.append(f'### {area_name}\n')
        mdbib.append('```bibtex')

        for paper in area:
            bibtex = paper.content['_bibtex']['value']
            # Key suffix follows the configuration cell rather than a literal.
            fields = parse_bitex(bibtex, gen_id=True, id_surfix=CONFERENCE_NAME)

            # resolve duplicated bibkey by appending '+' until it is unused
            while fields['id'] in bibkey:
                fields['id'] += '+'

            bibkey.add(fields['id'])

            # add abstract; line breaks become spaces so the words on either
            # side of a newline are not glued together
            abstract = paper.content['abstract']['value'].replace('\n', ' ')
            fields['abstract'] = abstract

            # add tags
            tags = [paper.content['venue']['value'], paper.content['primary_area']['value']]
            fields['tags'] = ', '.join(tags)

            # add to bib list (w/o abstract)
            bib.append(make_bibtex({
                key: value
                for key, value in fields.items() if key != 'abstract'
            }))

            # add to markdown list
            mdbib.append(make_bibtex(fields))

        mdbib.append('```\n')

# Make sure the target directory exists on a fresh checkout.
os.makedirs('output', exist_ok=True)

# Explicit UTF-8: titles and abstracts contain non-ASCII characters, and the
# platform default encoding may not be able to represent them.
with open('output/neurips2024.bib', 'w', encoding='utf-8') as f:
    f.write('\n'.join(bib))

with open('output/neurips2024.md', 'w', encoding='utf-8') as f:
    f.write('\n'.join(mdbib))