# Crawl dataset with all submissions info
OpenReview Venue Crawling

In [137]:
%load_ext autoreload
%autoreload 2

import time
import pandas as pd
import multiprocessing as mp
from multiprocessing import Pool
from tqdm import tqdm
from tqdm.notebook import tqdm
import requests
import openreview
import json
import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Crawl list of all submissions
Here we scrape the _notes_ , (list of all submissions) using OpenReview's API, way faster than Selenium-based scraping.


In [37]:
client = openreview.api.OpenReviewClient(
    baseurl='https://api2.openreview.net',
    username="",
    password=""
)

In [38]:
venue_id = 'ICLR.cc/2024/Conference'
venue_group = client.get_group(venue_id)
submission_name = venue_group.content['submission_name']['value']
submissions = client.get_all_notes(invitation=f'{venue_id}/-/{submission_name}', details='replies')

Getting V2 Notes: 100%|███████████████████████████████████████████▉| 7296/7304 [01:06<00:00, 109.13it/s]


In [119]:
submission.id

'zzqn5G9fjn'

In [129]:
def submission2note(submission):
    review_keys=['summary', 'strengths', 'weaknesses', 'questions']
    total_replies = submission.details["replies"]
    rating_replies = [reply for reply in submission.details["replies"] if "rating" in reply["content"]]
    note = {
        "id": submission.id,
        "title": submission.content["title"]["value"],
        "keywords": submission.content["keywords"]["value"],
        "ratings": [
            int(reply["content"]["rating"]["value"][0])
            for reply in rating_replies
        ],
        "confidences": [
            int(reply["content"]["confidence"]["value"][0])
            for reply in rating_replies
        ],
        "withdraw": 1 if "Withdrawn" in submission.content["venue"]["value"] else 0,
        "review_lengths": [
            sum([len(reply["content"][key]["value"].split()) for key in review_keys])
            for reply in rating_replies
        ],
        "abstract": submission.content["abstract"]["value"],
        "comments": len(total_replies),
        "url": f"https://openreview.net/forum?id={submission.id}"
    }
    return note

In [158]:
notes = [submission2note(submission) for submission in submissions]
notes = pd.DataFrame(notes)
notes['ratings_avg'] = notes['ratings'].apply(lambda x: np.mean(x))
notes['ratings_std'] = notes['ratings'].apply(lambda x: np.std(x))
notes['confidence_avg'] = notes['confidences'].apply(lambda x: np.mean(x))
notes['confidence_std'] = notes['confidences'].apply(lambda x: np.std(x))
notes

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,


Unnamed: 0,id,title,keywords,ratings,confidences,withdraw,review_lengths,abstract,comments,url,ratings_avg,ratings_std,confidence_avg,confidence_std
0,zzv4Bf50RW,Learning SO(3)-Invariant Correspondence via Po...,"[Point cloud understanding, 3D dense correspon...","[6, 5, 3, 5, 5]","[2, 4, 4, 4, 4]",1,"[191, 215, 570, 412, 290]",Establishing accurate dense 3D correspondences...,5,https://openreview.net/forum?id=zzv4Bf50RW,4.800000,0.979796,3.600000,0.800000
1,zzqn5G9fjn,Breaking Physical and Linguistic Borders: Mult...,"[Multilingual Federated Learning, Natural Lang...","[8, 1, 3, 5]","[5, 5, 4, 4]",0,"[262, 646, 293, 234]",Pretrained large language models (LLMs) have e...,28,https://openreview.net/forum?id=zzqn5G9fjn,4.250000,2.586020,4.500000,0.500000
2,zz61V8bIab,Stochastic Adversarial Networks for Multi-Doma...,"[Multi-domain text classification, Adversarial...","[5, 1, 5]","[3, 4, 2]",1,"[534, 484, 213]",Adversarial training has played a pivotal role...,3,https://openreview.net/forum?id=zz61V8bIab,3.666667,1.885618,3.000000,0.816497
3,zyBJodMrn5,On the generalization capacity of neural netwo...,"[compositional generalization, compositionalit...","[6, 3, 8]","[4, 3, 4]",0,"[318, 860, 591]",The advent of the Transformer has led to the d...,14,https://openreview.net/forum?id=zyBJodMrn5,5.666667,2.054805,3.666667,0.471405
4,zxPDdw8koz,CLIP meets Model Zoo Experts: Pseudo-Supervisi...,"[Contrastive Learning, CLIP, Distillation, Den...","[3, 3, 3, 8]","[4, 5, 5, 3]",1,"[226, 261, 264, 228]",Contrastive language image pretraining (CLIP) ...,4,https://openreview.net/forum?id=zxPDdw8koz,4.250000,2.165064,4.250000,0.829156
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7299,02Ug9N8DCI,GateLoop: Fully Data-Controlled Linear Recurre...,"[Data-controlled, Linear Recurrence, Sequence ...","[3, 3, 3, 5]","[4, 4, 4, 5]",0,"[459, 372, 278, 635]",Linear Recurrence has proven to be a powerful ...,4,https://openreview.net/forum?id=02Ug9N8DCI,3.500000,0.866025,4.250000,0.433013
7300,01ep65umEr,TeLLMe what you see: Using LLMs to Explain Neu...,"[Explainable AI, Explaining Neurons in Vision ...","[6, 5, 5, 5]","[5, 3, 4, 3]",0,"[397, 397, 238, 289]",As the role of machine learning models continu...,10,https://openreview.net/forum?id=01ep65umEr,5.250000,0.433013,3.750000,0.829156
7301,01Yi8rzoNs,Visual Chain of Thought: Bridging Logical Gaps...,"[chain of thought, vision and language, large ...","[3, 6, 3, 5]","[4, 4, 4, 4]",1,"[452, 416, 368, 713]",Recent advances in large language models elici...,4,https://openreview.net/forum?id=01Yi8rzoNs,4.250000,1.299038,4.000000,0.000000
7302,014CgNPAGy,On the Role of Momentum in the Implicit Bias o...,"[GD, momentum, implicit bias, linear networks]","[5, 6, 5, 3]","[5, 3, 4, 4]",0,"[1154, 270, 356, 303]",Momentum is a widely adopted and crucial modif...,18,https://openreview.net/forum?id=014CgNPAGy,4.750000,1.089725,4.000000,0.707107


In [159]:
notes.to_csv(f"../data/iclr2024_{time.strftime('%Y%m%d')}.csv", index=False)

## Filter topic

In [161]:
notes_pr = notes[notes.abstract.str.contains("protein")]
notes_pr.to_csv(f"../data/iclr2024_protein_{time.strftime('%Y%m%d')}.csv", index=False)
print(len(notes_pr))
print(len(notes_pr[notes_pr.withdraw==0]))
notes_pr

101
88


Unnamed: 0,id,title,keywords,ratings,confidences,withdraw,review_lengths,abstract,comments,url,ratings_avg,ratings_std,confidence_avg,confidence_std
33,zgQ0PHeGnL,Rigid Protein-Protein Docking via Equivariant ...,"[Equivariant Graph Neural Network, rigid body ...","[5, 8, 8, 3]","[4, 3, 3, 4]",0,"[525, 714, 290, 338]",The study of rigid protein-protein docking pla...,17,https://openreview.net/forum?id=zgQ0PHeGnL,6.000000,2.121320,3.500000,0.500000
49,zUHgYRRAWl,Objective-Agnostic Enhancement of Molecule Pro...,"[VAE, molecule generation]","[1, 3, 1]","[5, 4, 4]",1,"[80, 568, 34]",Variational autoencoder (VAE) is a popular met...,3,https://openreview.net/forum?id=zUHgYRRAWl,1.666667,0.942809,4.333333,0.471405
64,zMPHKOmQNb,Protein Discovery with Discrete Walk-Jump Samp...,"[generative modeling, langevin mcmc, energy-ba...","[8, 8, 8]","[4, 3, 4]",0,"[591, 581, 523]",We resolve difficulties in training and sampli...,7,https://openreview.net/forum?id=zMPHKOmQNb,8.000000,0.000000,3.666667,0.471405
95,z3mPLBLfGY,Generalist Equivariant Transformer Towards 3D ...,[unified representation; molecular interaction...,"[6, 5, 8, 5]","[5, 4, 3, 3]",0,"[175, 389, 157, 335]",Many processes in biology and drug discovery i...,25,https://openreview.net/forum?id=z3mPLBLfGY,6.000000,1.224745,3.750000,0.829156
166,yRrPfKyJQ2,Conversational Drug Editing Using Retrieval an...,"[Large Language Models, prompt, retrieval, dom...","[6, 6, 6]","[2, 3, 4]",0,"[202, 195, 394]",Recent advancements in conversational large la...,11,https://openreview.net/forum?id=yRrPfKyJQ2,6.000000,0.000000,3.000000,0.816497
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6985,2xYO9oxh0y,DiffSDS: A geometric sequence diffusion model ...,[Conditional sequence diffusion],"[3, 5, 3]","[5, 4, 3]",1,"[352, 492, 466]",Can a pure transformer learn protein structure...,3,https://openreview.net/forum?id=2xYO9oxh0y,3.666667,0.942809,4.000000,0.816497
7176,1IaoWBqB6K,DiffDock-Pocket: Diffusion for Pocket-Level Do...,"[diffusion, diffusion models, docking, generat...","[6, 5, 6, 3]","[3, 4, 3, 4]",0,"[522, 454, 380, 360]","When a small molecule binds to a protein, the ...",15,https://openreview.net/forum?id=1IaoWBqB6K,5.000000,1.224745,3.500000,0.500000
7212,0xT87opqKV,ProteinAdapter: Adapting Pre-trained Large Pro...,"[Pretrained Large Models, Parameter-Efficient ...","[3, 3, 5, 5]","[3, 3, 4, 5]",0,"[794, 187, 211, 404]",The study of proteins is crucial in various sc...,4,https://openreview.net/forum?id=0xT87opqKV,4.000000,1.000000,3.750000,0.829156
7291,070DFUdNh7,GraphGPT: Graph Learning with Generative Pre-t...,"[Graph, GPT, Generative, Pre-train, Fine-tune,...","[5, 5, 3, 5]","[4, 3, 3, 3]",0,"[371, 376, 218, 184]","We introduce GraphGPT, a novel model for Graph...",11,https://openreview.net/forum?id=070DFUdNh7,4.500000,0.866025,3.250000,0.433013


In [162]:
notes_mol = notes[notes.abstract.str.contains("molecule") | notes.abstract.str.contains("molecular")]
notes_mol.to_csv(f"../data/iclr2024_molecule_{time.strftime('%Y%m%d')}.csv", index=False)
print(len(notes_mol))
print(len(notes_mol[notes_mol.withdraw==0]))
notes_mol

160
142


Unnamed: 0,id,title,keywords,ratings,confidences,withdraw,review_lengths,abstract,comments,url,ratings_avg,ratings_std,confidence_avg,confidence_std
49,zUHgYRRAWl,Objective-Agnostic Enhancement of Molecule Pro...,"[VAE, molecule generation]","[1, 3, 1]","[5, 4, 4]",1,"[80, 568, 34]",Variational autoencoder (VAE) is a popular met...,3,https://openreview.net/forum?id=zUHgYRRAWl,1.666667,0.942809,4.333333,0.471405
50,zUDbPgskDS,"Crystals with Transformers on Graphs, for pred...","[AI for science, Graph networks, transformers,...","[6, 3, 3, 1]","[3, 2, 3, 5]",0,"[607, 206, 350, 409]",Graph neural networks (GNN) has found extensiv...,11,https://openreview.net/forum?id=zUDbPgskDS,3.250000,1.785357,3.250000,1.089725
95,z3mPLBLfGY,Generalist Equivariant Transformer Towards 3D ...,[unified representation; molecular interaction...,"[6, 5, 8, 5]","[5, 4, 3, 3]",0,"[175, 389, 157, 335]",Many processes in biology and drug discovery i...,25,https://openreview.net/forum?id=z3mPLBLfGY,6.000000,1.224745,3.750000,0.829156
116,yrgQdA5NkI,Equivariant Matrix Function Neural Networks,"[equivariance, graph neural networks, long range]","[6, 8, 5]","[3, 4, 2]",0,"[1095, 782, 260]","Graph Neural Networks (GNNs), especially messa...",13,https://openreview.net/forum?id=yrgQdA5NkI,6.333333,1.247219,3.000000,0.816497
166,yRrPfKyJQ2,Conversational Drug Editing Using Retrieval an...,"[Large Language Models, prompt, retrieval, dom...","[6, 6, 6]","[2, 3, 4]",0,"[202, 195, 394]",Recent advancements in conversational large la...,11,https://openreview.net/forum?id=yRrPfKyJQ2,6.000000,0.000000,3.000000,0.816497
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7176,1IaoWBqB6K,DiffDock-Pocket: Diffusion for Pocket-Level Do...,"[diffusion, diffusion models, docking, generat...","[6, 5, 6, 3]","[3, 4, 3, 4]",0,"[522, 454, 380, 360]","When a small molecule binds to a protein, the ...",15,https://openreview.net/forum?id=1IaoWBqB6K,5.000000,1.224745,3.500000,0.500000
7227,0oIkKERYhH,DOG: Discriminator-only Generation Beats GANs ...,"[generative modeling, graph generation]","[5, 3, 6, 3]","[4, 4, 3, 4]",0,"[587, 476, 289, 249]",We propose discriminator-only generation (DOG)...,14,https://openreview.net/forum?id=0oIkKERYhH,4.250000,1.299038,3.750000,0.433013
7239,0fSNU64FV7,Sorting Out Quantum Monte Carlo,"[quantum chemistry, scientific machine learnin...","[6, 3, 5, 3]","[4, 4, 3, 5]",0,"[273, 694, 1039, 256]",Molecular modeling at the quantum level requir...,16,https://openreview.net/forum?id=0fSNU64FV7,4.250000,1.299038,4.000000,0.707107
7257,0VBsoluxR2,MOFDiff: Coarse-grained Diffusion for Metal-Or...,"[Materials design, diffusion model, metal-orga...","[8, 8, 8, 8]","[4, 3, 3, 4]",0,"[354, 409, 562, 573]",Metal-organic frameworks (MOFs) are of immense...,18,https://openreview.net/forum?id=0VBsoluxR2,8.000000,0.000000,3.500000,0.500000


## (optional) older crawled data

In [44]:
# Read data from old version 
df_old = pd.read_csv(DATA_PATH + 'iclr2023_20221120.csv')
df_old.head()

Unnamed: 0,id,title,keywords,ratings,confidences,withdraw,review_lengths
0,kRvZ2PcsxjJj,Quantum reinforcement learning,"['quantum reinforcement learning', 'multi-agen...","[1, 1, 1, 1]","[5, 5, 5, 5]",1,"[45, 49, 25, 283]"
1,RUzSobdYy0V,Quantifying and Mitigating the Impact of Label...,[],"[5, 6, 8]","[4, 3, 3]",0,"[443, 274, 401]"
2,N3kGYG3ZcTi,Suppression helps: Lateral Inhibition-inspired...,"['Lateral Inhibition', 'Convolutional Neural N...","[3, 5, 3, 1]","[5, 5, 5, 5]",0,"[333, 360, 362, 304]"
3,tmIiMPl4IPa,Factorized Fourier Neural Operators,"['fourier transform', 'fourier operators', 'pd...","[8, 6, 3, 8, 3]","[5, 4, 4, 2, 2]",0,"[203, 142, 323, 520, 635]"
4,mhnHqRqcjYU,DFPC: Data flow driven pruning of coupled chan...,"['Pruning', 'Data Free', 'Model Compression']","[8, 6, 6]","[3, 2, 3]",0,"[302, 90, 257]"


In [45]:
papers_ids = df_old['id'].values
print("Number of papers (including old):", len(papers_ids))

Number of papers (including old): 4874


## Crawl forums of each submission
Here we scrape the forums of each submissions, it can be pretty fast thanks to:
- OpenReview's API (we use requests)
- Multiprocessing to parallelize the scraping of each paper

In [46]:
# Create multiprocessing pool of requests over index of dataframe

extra = "trash=true&details=replyCount%2Cwritable%2Crevisions%2Coriginal%2Coverwriting%2Cinvitation%2Ctags"

def get_paper_data(paper_id, extra='', timeout=5):
    try:
        url = f"https://api.openreview.net/notes?forum={paper_id}&{extra}"
        response = requests.get(url, timeout=timeout)
        data = response.json()
        return data
    except requests.exceptions.Timeout:
        print(f"Error for paper {paper_id}: Request timed out")
        return None
    except:
        print(f"Error for paper {paper_id}: General error")
        return None

def retry_get_paper_data(paper_id, extra='', timeout=5, retries=10):
    for i in range(retries):
        data = get_paper_data(paper_id, extra, timeout)
        if data is not None:
            return data
    print(f"Error for paper {paper_id}: All {retries} attempts failed")
    return None

def get_paper_data_multi(paper_ids, ratio=0.8):
    num_processes = int(ratio*mp.cpu_count())
    with Pool(num_processes) as p:
        data = list(tqdm(p.imap(retry_get_paper_data, paper_ids), total=len(paper_ids)))
    return data

In [47]:
# filter df with only id, title, url and keywords
df_raw_filtered = df_raw[['id', 'content.title', 'content.keywords']]
df_raw_filtered.head()

Unnamed: 0,id,content.title,content.keywords
0,RUzSobdYy0V,Quantifying and Mitigating the Impact of Label...,[]
1,N3kGYG3ZcTi,Suppression helps: Lateral Inhibition-inspired...,"[Lateral Inhibition, Convolutional Neural Netw..."
2,tmIiMPl4IPa,Factorized Fourier Neural Operators,"[fourier transform, fourier operators, pde, na..."
3,mhnHqRqcjYU,DFPC: Data flow driven pruning of coupled chan...,"[Pruning, Data Free, Model Compression]"
4,sZI1Oj9KBKy,TVSPrune - Pruning Non-discriminative filters ...,"[Structured pruning, model compression]"


In [48]:
# ids = list(df_raw_filtered['id'])
ids = df_old['id'].values # use old ids to get data from old papers
data = get_paper_data_multi(ids, ratio=1)

  0%|          | 0/4874 [00:00<?, ?it/s]

In [50]:
# get only notes
notes = [d['notes'] for d in data]

In [58]:
def filter_data(item, 
                review_keys=['summary_of_the_paper', 'strength_and_weaknesses', 'clarity,_quality,_novelty_and_reproducibility', 'summary_of_the_review'],
                decision=True):
    """Filter only ratings, confidence, withdraw status and decisions"""
    # parse each note
    withdraw = 0
    # filter meta note
    meta_note = [d for d in item if 'Paper' not in d['invitation']]
    # check withdrawn
    withdraw = 1 if 'Withdrawn_Submission' in meta_note[0]['invitation'] else 0
    # decision
    if decision:
        try:
            if withdraw == 0:
                decision_note = [d for d in item if 'Decision' in d['invitation']]
                decision = decision_note[0]['content']['decision']
            else:
                decision = ''
        except:
            decision = ''
    # filter reviewer comments
    comment_notes = [d for d in item \
                     if 'Official_Review' in d['invitation'] and 'recommendation' in d['content'].keys()]
    comment_notes = sorted(comment_notes, key=lambda d: d['number'])[::-1]
    ratings = [int(note['content']['recommendation'].split(':')[0]) for note in comment_notes]
    confidences = [int(note['content']['confidence'].split(':')[0]) for note in comment_notes]
    review_lengths = [sum(len(note['content'][key].split()) for key in review_keys) for note in comment_notes] # review lengths

    data = {'ratings': ratings, 'confidences': confidences, 'withdraw': withdraw, 'review_lengths': review_lengths}
    if decision: data['decision'] = decision
    return data

In [59]:
# filter data in a pool of processes
with Pool(8) as p:
    filtered_notes = list(tqdm(p.imap(filter_data, notes), total=len(notes)))

  0%|          | 0/4874 [00:00<?, ?it/s]

In [60]:
# create dataframe
ratings = pd.DataFrame(filtered_notes)
ratings.head()

Unnamed: 0,ratings,confidences,withdraw,review_lengths,decision
0,"[1, 1, 1, 1]","[5, 5, 5, 5]",1,"[45, 49, 25, 283]",
1,"[5, 6, 8]","[4, 3, 3]",0,"[443, 274, 401]",Accept: poster
2,"[3, 6, 3, 1]","[5, 5, 5, 5]",0,"[333, 360, 362, 304]",Reject
3,"[8, 6, 5, 8, 6]","[5, 4, 4, 2, 3]",0,"[203, 142, 323, 520, 752]",Accept: poster
4,"[8, 6, 6]","[3, 2, 3]",0,"[302, 90, 257]",Accept: poster


In [61]:
# Merge with df_raw_filtered
df_final = pd.concat([df_raw_filtered, ratings], axis=1)
df_final.head()

Unnamed: 0,id,content.title,content.keywords,ratings,confidences,withdraw,review_lengths,decision
0,RUzSobdYy0V,Quantifying and Mitigating the Impact of Label...,[],"[1, 1, 1, 1]","[5, 5, 5, 5]",1,"[45, 49, 25, 283]",
1,N3kGYG3ZcTi,Suppression helps: Lateral Inhibition-inspired...,"[Lateral Inhibition, Convolutional Neural Netw...","[5, 6, 8]","[4, 3, 3]",0,"[443, 274, 401]",Accept: poster
2,tmIiMPl4IPa,Factorized Fourier Neural Operators,"[fourier transform, fourier operators, pde, na...","[3, 6, 3, 1]","[5, 5, 5, 5]",0,"[333, 360, 362, 304]",Reject
3,mhnHqRqcjYU,DFPC: Data flow driven pruning of coupled chan...,"[Pruning, Data Free, Model Compression]","[8, 6, 5, 8, 6]","[5, 4, 4, 2, 3]",0,"[203, 142, 323, 520, 752]",Accept: poster
4,sZI1Oj9KBKy,TVSPrune - Pruning Non-discriminative filters ...,"[Structured pruning, model compression]","[8, 6, 6]","[3, 2, 3]",0,"[302, 90, 257]",Accept: poster


## Save filtered dataset 
We will be saving a smaller version of the dataset in csv format with the data we need for our analysis - this can also be saved directly in Github

In [62]:
# Save dataframe as csv
# rename title
df_final.rename(columns={'content.title': 'title'}, inplace=True)
#rename keywords
df_final.rename(columns={'content.keywords': 'keywords'}, inplace=True)
df_final.to_csv(f'{DATA_PATH}{venue_short}_{time.strftime("%Y%m%d")}.csv', index=False)

## Saving full crawled dataset

Note that this dataset is raw and contains everyting; so it will be pretty large (>100 MBs)!

In [63]:
# Save dataframe as hdf5
notes_df = pd.DataFrame([n['notes'] for n in data])
count_df = pd.DataFrame({'notes_count': [n['count'] for n in data]})
df = pd.concat([df_raw, notes_df, count_df], axis=1)
df.to_hdf(f'{DATA_PATH}{venue_short}_data_full_{time.strftime("%Y%m%d")}.h5', key='df', mode='w')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->axis0] [items->None]

  df.to_hdf(f'{DATA_PATH}{venue_short}_data_full_{time.strftime("%Y%m%d")}.h5', key='df', mode='w')
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->Index([                                                                    'id',
                                                                     'original',
                                                                        'mdate',
                                                                        'ddate',
                                                                       'tddate',
                                                                        'forum',
                                                                      'replyto',
                  