In [None]:
# Step 1, figure out how to download the list of papers for ICLR 2018.
# Step 2, get the keywords for those papers
# Step 3, get the authors for those papers
# Step 4, make a dict mapping authors to keywords (and likewise for papers)

# Largely following the notebook shared here: https://www.reddit.com/r/MachineLearning/comments/eyfhtc/d_openreviewnet_scraper_for_reviews/

In [1]:
!pip install openreview-py pandas matplotlib seaborn



In [2]:
from collections import defaultdict, deque, Counter

import openreview
import io
import os
import json
import statistics
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
guest_client = openreview.Client(baseurl='https://api.openreview.net')
submissions = openreview.tools.iterget_notes(
        guest_client, invitation='ICLR.cc/2021/Conference/-/Blind_Submission')
submissions_by_forum = {n.forum: n for n in submissions}
# print(submissions_by_forum)

In [None]:
print(len(submissions_by_forum))
print(list(submissions_by_forum.keys())[:5])
print(submissions_by_forum['73WTGs96kho'].content['authorids'])
print(submissions_by_forum['73WTGs96kho'].content['keywords'])

In [None]:
guest_client = openreview.Client(baseurl='https://api.openreview.net')
submissions = openreview.tools.iterget_notes(
        guest_client, invitation='ICLR.cc/2020/Conference/-/Blind_Submission')
submissions_by_forum = {n.forum: n for n in submissions}
print(len(submissions_by_forum))

In [None]:
guest_client = openreview.Client(baseurl='https://api.openreview.net')
submissions = openreview.tools.iterget_notes(
        guest_client, invitation='ICLR.cc/2019/Conference/-/Blind_Submission')
submissions_by_forum = {n.forum: n for n in submissions}
print(len(submissions_by_forum))

In [None]:
guest_client = openreview.Client(baseurl='https://api.openreview.net')
submissions = openreview.tools.iterget_notes(
        guest_client, invitation='ICLR.cc/2018/Conference/-/Blind_Submission')
submissions_by_forum = {n.forum: n for n in submissions}
print(len(submissions_by_forum))

In [None]:
guest_client = openreview.Client(baseurl='https://api.openreview.net')
submissions = openreview.tools.iterget_notes(
        guest_client, invitation='ICLR.cc/2018/Conference/-/Blind_Submission')
submissions_by_forum = {n.forum: n for n in submissions}
print(len(submissions_by_forum))

In [None]:
print(submissions_by_forum['ryBnUWb0b'])

In [3]:
reviewer_keywords = {}
paper_keywords = {}

for year in range(2018, 2023):
    submissions = openreview.tools.iterget_notes(
        guest_client, invitation='ICLR.cc/%d/Conference/-/Blind_Submission' % year)
    submissions_by_forum = {n.forum: n for n in submissions}
    
    # Collect keywords for each paper
    paper_keywords[year] = {}
    for paper_id in submissions_by_forum:
        paper_keywords[year][paper_id] = [kw.lower() for kw in submissions_by_forum[paper_id].content['keywords']]
        
    # Collect keywords for each reviewer
    reviewer_keywords[year] = {}
    for paper_id in submissions_by_forum:
        kws = Counter([kw.lower() for kw in submissions_by_forum[paper_id].content['keywords']])
        for reviewer_id in submissions_by_forum[paper_id].content['authors']:
            if reviewer_id in reviewer_keywords[year]:
                reviewer_keywords[year][reviewer_id] += kws
            else:
                reviewer_keywords[year][reviewer_id] = kws

In [4]:
# Represent each reviewer as the collection of keywords from papers they wrote in this and previous ICLR conferences.
combined_reviewer_keywords = {}
for year in range(2018, 2023):
    combined_reviewer_keywords[year] = {reviewer_id: Counter() for reviewer_id in reviewer_keywords[year]}
    for prev_year in range(2018, year+1):
        for reviewer_id in combined_reviewer_keywords[year]:
            if reviewer_id in reviewer_keywords[prev_year]:
                combined_reviewer_keywords[year][reviewer_id] += reviewer_keywords[prev_year][reviewer_id]

In [None]:
((combined_reviewer_keywords[2018].keys() & combined_reviewer_keywords[2019].keys()) & combined_reviewer_keywords[2020].keys()) & combined_reviewer_keywords[2021].keys()

In [None]:
len(combined_reviewer_keywords[2021].keys() & combined_reviewer_keywords[2022].keys())

In [None]:
combined_reviewer_keywords[2022]['Aaron Courville']

In [5]:
combined_reviewer_keywords[2019]['Aaron Courville']

Counter({'generative models': 3,
         'hierarchical models': 1,
         'latent variable models': 1,
         'variational inference': 1,
         'bayesian inference': 1,
         'deep networks': 1,
         'language model': 1,
         'unsupervised parsing': 1,
         'generative': 1,
         'hierarchical': 1,
         'unsupervised': 2,
         'semisupervised': 1,
         'latent': 1,
         'ali': 1,
         'gan': 1,
         'representation learning': 2,
         'auto-encoders': 1,
         '3d point clouds': 1,
         'gans': 2,
         'gaussian mixture models': 1,
         'adaptive optimizer': 1,
         'momentum': 1,
         'hyperparameter tuning': 1,
         'probability distillation': 1,
         'autoregressive models': 1,
         'normalizing flows': 1,
         'wavenet': 1,
         'pixelcnn': 1,
         'image-to-image': 1,
         'translation': 1,
         'generation': 1,
         'adversarial': 1,
         'learning': 1,
         'mu

In [None]:
combined_reviewer_keywords[2020]['David Blei']

In [None]:
sorted(combined_reviewer_keywords[2022].keys())[1500:7000]

In [6]:
for year in range(2018, 2023):
    print(year)
    revs_this_year = sorted(combined_reviewer_keywords[year])
    papers_this_year = sorted(paper_keywords[year])
    
    m = len(combined_reviewer_keywords[year])
    n = len(paper_keywords[year])
    scores_mu = np.zeros((m, n))
    scores_sigma = np.zeros((m, n))
    for r_idx, rev in enumerate(revs_this_year):
        if r_idx % 500 == 0:
            print(r_idx/m)
        for p_idx, pap in enumerate(papers_this_year):
            # Compute the score and variance for this reviewer-paper pair
            rev_kws = combined_reviewer_keywords[year][rev]
            pap_kws = paper_keywords[year][pap]
            
            if not len(rev_kws) or not len(pap_kws):
                scores_mu[r_idx, p_idx] = 0.5
                scores_sigma[r_idx, p_idx] = 0.25
            else:
                # Convert the max reviewer keyword count to 1 and the min count to 0.2.
                scaled_rev_kws = {}
                max_reviewer_kw_ct = sorted(rev_kws.values())[-1]
                min_reviewer_kw_ct = sorted(rev_kws.values())[0]
                new_scores = np.linspace(.2, 1, max_reviewer_kw_ct - min_reviewer_kw_ct + 1)
                for kw, old_ct in rev_kws.items():
                    scaled_rev_kws[kw] = new_scores[old_ct-min_reviewer_kw_ct]
                # Score the match (mean)
                scores_achieved = []
                for kw in pap_kws:
                    if kw in scaled_rev_kws:
                        scores_achieved.append(scaled_rev_kws[kw])
                score_mean = 0
                for idx, s in enumerate(sorted(scores_achieved, key=lambda x: -x)):
                    score_mean += s*(0.5**idx)
                max_score = 0
                for idx, s in enumerate([1]*len(pap_kws)):
                    max_score += s*(0.5**idx)
                score_mean /= max_score
                scores_mu[r_idx, p_idx] = score_mean
                # Get the variance of the match
                scores_sigma[r_idx, p_idx] = (1/len(pap_kws)) * (1/len(rev_kws))
    np.save('scores_mu_iclr_%d' % year, scores_mu)
    np.save('scores_sigma_iclr_%d' % year, scores_sigma)

2018
0.0
0.18102824040550325
0.3620564808110065
0.5430847212165097
0.724112961622013
0.9051412020275162
2019
0.0
0.1144950767117014
0.2289901534234028
0.3434852301351042
0.4579803068468056
0.572475383558507
0.6869704602702084
0.8014655369819098
0.9159606136936111
2020
0.0
0.07275902211874273
0.14551804423748546
0.21827706635622818
0.2910360884749709
0.3637951105937136
0.43655413271245636
0.5093131548311991
0.5820721769499418
0.6548311990686845
0.7275902211874272
0.80034924330617
0.8731082654249127
0.9458672875436555
2021
0.0
0.06435006435006435
0.1287001287001287
0.19305019305019305
0.2574002574002574
0.32175032175032175
0.3861003861003861
0.45045045045045046
0.5148005148005148
0.5791505791505791
0.6435006435006435
0.7078507078507078
0.7722007722007722
0.8365508365508365
0.9009009009009009
0.9652509652509652
2022
0.0
0.05972288580984233
0.11944577161968466
0.179168657429527
0.23889154323936931
0.29861442904921165
0.358337314859054
0.4180602006688963
0.47778308647873863
0.53750597228858

In [3]:
x = np.load("scores_mu_iclr_2018.npy")
y = np.load("scores_sigma_iclr_2018.npy")

FileNotFoundError: [Errno 2] No such file or directory: 'scores_mu_iclr_2018.npy'