In [None]:
# Step 1, figure out how to download the list of papers for ICLR 2018.
# Step 2, get the keywords for those papers
# Step 3, get the authors for those papers
# Step 4, make a dict mapping authors to keywords (and likewise for papers)

# Largely following the notebook shared here: https://www.reddit.com/r/MachineLearning/comments/eyfhtc/d_openreviewnet_scraper_for_reviews/

In [1]:
!pip install openreview-py pandas matplotlib seaborn



In [6]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
     ---------------------------------------- 86.0/86.0 kB 2.4 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: sentence_transformers
  Building wheel for sentence_transformers (setup.py): started
  Building wheel for sentence_transformers (setup.py): finished with status 'done'
  Created wheel for sentence_transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125925 sha256=6ce055b5d12723d6c71455a34349a9b06236d38ea93cb55983bec1d7451a29ee
  Stored in directory: c:\users\justin payan\appdata\local\pip\cache\wheels\71\67\06\162a3760c40d74dd40bc855d527008d26341c2b0ecf3e8e11f
Successfully built sentence_transformers
Installing collected packages: sentence_transformers
Successfully installed sentence_transformers-2.2.2




In [8]:
from collections import defaultdict, deque, Counter

import openreview
import io
import os
import json
import statistics
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from copy import deepcopy
import pickle

In [8]:
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('all-MiniLM-L6-v2')

In [2]:
guest_client = openreview.Client(baseurl='https://api.openreview.net')
submissions = openreview.tools.iterget_notes(
        guest_client, invitation='ICLR.cc/2021/Conference/-/Blind_Submission')
submissions_by_forum = {n.forum: n for n in submissions}
# print(submissions_by_forum)

In [3]:
print(len(submissions_by_forum))
print(list(submissions_by_forum.keys())[:5])
print(submissions_by_forum['73WTGs96kho'].content['authorids'])
print(submissions_by_forum['73WTGs96kho'].content['keywords'])

2594
['trPMYEn1FCX', '73WTGs96kho', 'mNtmhaDkAr', 'oev4KdikGjy', '-aThAo4b1zn']
['~Liran_Katzir1', '~Gal_Elidan1', '~Ran_El-Yaniv1']
['Neural Networks', 'Architectures', 'Tabular Data', 'Predictive Modeling']


In [9]:
# sentence_embeddings = model.encode(sentences)
# cos_sim = util.cos_sim(emb1, emb2)
submissions_by_forum['73WTGs96kho'].content['title']

'Net-DNF: Effective Deep Modeling of Tabular Data'

In [None]:
guest_client = openreview.Client(baseurl='https://api.openreview.net')
submissions = openreview.tools.iterget_notes(
        guest_client, invitation='ICLR.cc/2020/Conference/-/Blind_Submission')
submissions_by_forum = {n.forum: n for n in submissions}
print(len(submissions_by_forum))

In [None]:
guest_client = openreview.Client(baseurl='https://api.openreview.net')
submissions = openreview.tools.iterget_notes(
        guest_client, invitation='ICLR.cc/2019/Conference/-/Blind_Submission')
submissions_by_forum = {n.forum: n for n in submissions}
print(len(submissions_by_forum))

In [None]:
guest_client = openreview.Client(baseurl='https://api.openreview.net')
submissions = openreview.tools.iterget_notes(
        guest_client, invitation='ICLR.cc/2018/Conference/-/Blind_Submission')
submissions_by_forum = {n.forum: n for n in submissions}
print(len(submissions_by_forum))

In [4]:
guest_client = openreview.Client(baseurl='https://api.openreview.net')
submissions = openreview.tools.iterget_notes(
        guest_client, invitation='ICLR.cc/2018/Conference/-/Blind_Submission')
submissions_by_forum = {n.forum: n for n in submissions}
print(len(submissions_by_forum))

930


In [5]:
print(submissions_by_forum['ryBnUWb0b'])

{'cdate': 1518730168684,
 'content': {'TL;DR': 'We used an LSTM to detect when a smartphone walks into '
                      "a building. Then we predict the device's floor level "
                      'using data from sensors aboard the smartphone.',
             '_bibtex': '@inproceedings{\n'
                        'falcon2018predicting,\n'
                        'title={Predicting Floor-Level for 911 Calls with '
                        'Neural Networks and Smartphone Sensor Data},\n'
                        'author={William Falcon and Henning Schulzrinne},\n'
                        'booktitle={International Conference on Learning '
                        'Representations},\n'
                        'year={2018},\n'
                        'url={https://openreview.net/forum?id=ryBnUWb0b},\n'
                        '}',
             'abstract': 'In cities with tall buildings, emergency responders '
                         'need an accurate floor level location to find 911 '

In [6]:
reviewer_keywords = {}
paper_keywords = {}
paper_titles = {}
reviewer_num_papers = defaultdict(Counter)

for year in range(2018, 2023):
    submissions = openreview.tools.iterget_notes(
        guest_client, invitation='ICLR.cc/%d/Conference/-/Blind_Submission' % year)
    submissions_by_forum = {n.forum: n for n in submissions}
    
    # Collect keywords for each paper
    paper_keywords[year] = {}
    paper_titles[year] = {}
    for paper_id in submissions_by_forum:
        paper_keywords[year][paper_id] = [kw.lower() for kw in submissions_by_forum[paper_id].content['keywords']]
        paper_titles[year][paper_id] = submissions_by_forum[paper_id].content['title']
        
    # Collect keywords for each reviewer
    reviewer_keywords[year] = {}
    for paper_id in submissions_by_forum:
        kws = Counter([kw.lower() for kw in submissions_by_forum[paper_id].content['keywords']])
        for reviewer_id in submissions_by_forum[paper_id].content['authors']:
            if reviewer_id in reviewer_keywords[year]:
                reviewer_keywords[year][reviewer_id] += kws
            else:
                reviewer_keywords[year][reviewer_id] = kws
                
            reviewer_num_papers[year][reviewer_id] += 1

In [9]:
pickle.dump(reviewer_num_papers, open("reviewer_num_papers.pkl", 'wb'))

In [23]:
# Represent each reviewer as the collection of keywords from papers they wrote in this and previous ICLR conferences.
combined_reviewer_keywords = {}
for year in range(2018, 2023):
    combined_reviewer_keywords[year] = {reviewer_id: Counter() for reviewer_id in reviewer_keywords[year]}
    for prev_year in range(2018, year+1):
        for reviewer_id in combined_reviewer_keywords[year]:
            if reviewer_id in reviewer_keywords[prev_year]:
                combined_reviewer_keywords[year][reviewer_id] += reviewer_keywords[prev_year][reviewer_id]

In [24]:
((combined_reviewer_keywords[2018].keys() & combined_reviewer_keywords[2019].keys()) & combined_reviewer_keywords[2020].keys()) & combined_reviewer_keywords[2021].keys()

{'Aaron Courville',
 'Abbas Abdolmaleki',
 'Adam Trischler',
 'Afshin Rostamizadeh',
 'Alan Yuille',
 'Aleksander Madry',
 'Alex Lamb',
 'Alexander Novikov',
 'Alexandre Lacoste',
 'Ali Jadbabaie',
 'Ameet Talwalkar',
 'Amir Globerson',
 'Amit Deshpande',
 'Amos Storkey',
 'Amr Sharaf',
 'Andreas Krause',
 'Andrej Risteski',
 'Andrew Gordon Wilson',
 'Andrew Ilyas',
 'Anima Anandkumar',
 'Anirudh Goyal',
 'Anshumali Shrivastava',
 'Antonio Torralba',
 'Ari S. Morcos',
 'Armand Joulin',
 'Arthur Guez',
 'Arthur Szlam',
 'Aurelien Lucchi',
 'Aviv Tamar',
 'Balaji Lakshminarayanan',
 'Behnam Neyshabur',
 'Ben Poole',
 'Benjamin Eysenbach',
 'Benjamin Rosman',
 'Bernhard Schölkopf',
 'Bernt Schiele',
 'Bin Dong',
 'Bing Liu',
 'Bo Chang',
 'Bo Chen',
 'Bo Dai',
 'Bo Li',
 'Bo Zhang',
 'Boqing Gong',
 'Brooks Paige',
 'Bruno Ribeiro',
 'Byron Boots',
 'Caiming Xiong',
 'Cao Xiao',
 'Carl Vondrick',
 'Changyou Chen',
 'Charles Blundell',
 'Chelsea Finn',
 'Chenguang Zhu',
 'Cho-Jui Hsieh',
 

In [None]:
len(combined_reviewer_keywords[2021].keys() & combined_reviewer_keywords[2022].keys())

In [25]:
combined_reviewer_keywords[2022]['Aaron Courville']

Counter({'generative models': 7,
         'hierarchical models': 1,
         'latent variable models': 1,
         'variational inference': 3,
         'bayesian inference': 3,
         'deep networks': 1,
         'language model': 1,
         'unsupervised parsing': 1,
         'generative': 1,
         'hierarchical': 2,
         'unsupervised': 2,
         'semisupervised': 1,
         'latent': 1,
         'ali': 1,
         'gan': 3,
         'representation learning': 8,
         'auto-encoders': 1,
         '3d point clouds': 1,
         'gans': 3,
         'gaussian mixture models': 1,
         'deep learning': 6,
         'neural networks': 3,
         'information theory': 2,
         'adversarial': 3,
         'adaptive optimizer': 1,
         'momentum': 1,
         'hyperparameter tuning': 1,
         'probability distillation': 1,
         'autoregressive models': 1,
         'normalizing flows': 3,
         'wavenet': 1,
         'pixelcnn': 1,
         'image-to-image'

In [26]:
combined_reviewer_keywords[2019]['Aaron Courville']

Counter({'generative models': 5,
         'hierarchical models': 1,
         'latent variable models': 1,
         'variational inference': 1,
         'bayesian inference': 1,
         'deep networks': 1,
         'language model': 1,
         'unsupervised parsing': 1,
         'generative': 1,
         'hierarchical': 1,
         'unsupervised': 2,
         'semisupervised': 1,
         'latent': 1,
         'ali': 1,
         'gan': 3,
         'representation learning': 2,
         'auto-encoders': 1,
         '3d point clouds': 1,
         'gans': 2,
         'gaussian mixture models': 1,
         'deep learning': 4,
         'neural networks': 2,
         'information theory': 2,
         'adversarial': 3,
         'adaptive optimizer': 1,
         'momentum': 1,
         'hyperparameter tuning': 1,
         'probability distillation': 1,
         'autoregressive models': 1,
         'normalizing flows': 1,
         'wavenet': 1,
         'pixelcnn': 1,
         'image-to-image'

In [29]:
reviewer_num_papers[2019]['Aaron Courville']

10

In [None]:
sorted(combined_reviewer_keywords[2022].keys())[1500:7000]

In [31]:
for year in range(2018, 2023):
    print(year)
    revs_this_year = sorted(combined_reviewer_keywords[year])
    papers_this_year = sorted(paper_keywords[year])
    
    m = len(combined_reviewer_keywords[year])
    n = len(paper_keywords[year])
    scores_mu = np.zeros((m, n))
    scores_sigma = np.zeros((m, n))
    for r_idx, rev in enumerate(revs_this_year):
        if r_idx % 500 == 0:
            print(r_idx/m)
        for p_idx, pap in enumerate(papers_this_year):
            # Compute the score and variance for this reviewer-paper pair
            rev_kws = combined_reviewer_keywords[year][rev]
            pap_kws = paper_keywords[year][pap]
            
            if not len(rev_kws) or not len(pap_kws):
                scores_mu[r_idx, p_idx] = 0.5
                scores_sigma[r_idx, p_idx] = 0.25
            else:
                # Convert the max reviewer keyword count to 1 and the min count to 0.2.
                scaled_rev_kws = {}
                max_reviewer_kw_ct = sorted(rev_kws.values())[-1]
                min_reviewer_kw_ct = sorted(rev_kws.values())[0]
                new_scores = np.linspace(.2, 1, max_reviewer_kw_ct - min_reviewer_kw_ct + 1)
                for kw, old_ct in rev_kws.items():
                    scaled_rev_kws[kw] = new_scores[old_ct-min_reviewer_kw_ct]
                # Score the match (mean)
                scores_achieved = []
                for kw in pap_kws:
                    if kw in scaled_rev_kws:
                        scores_achieved.append(scaled_rev_kws[kw])
                score_mean = 0
                for idx, s in enumerate(sorted(scores_achieved, key=lambda x: -x)):
                    score_mean += s*(0.5**idx)
                max_score = 0
                for idx, s in enumerate([1]*len(pap_kws)):
                    max_score += s*(0.5**idx)
                score_mean /= max_score
                scores_mu[r_idx, p_idx] = score_mean
                # Get the variance of the match
                scores_sigma[r_idx, p_idx] = (1/len(pap_kws)) * (1/len(rev_kws))
    np.save('data/scores_mu_iclr_%d' % year, scores_mu)
    np.save('data/scores_sigma_iclr_%d' % year, scores_sigma)

2018
0.0
0.17889087656529518
0.35778175313059035
0.5366726296958855
0.7155635062611807
0.8944543828264758
2019
0.0
0.1144950767117014
0.2289901534234028
0.3434852301351042
0.4579803068468056
0.572475383558507
0.6869704602702084
0.8014655369819098
0.9159606136936111
2020
0.0
0.07275902211874273
0.14551804423748546
0.21827706635622818
0.2910360884749709
0.3637951105937136
0.43655413271245636
0.5093131548311991
0.5820721769499418
0.6548311990686845
0.7275902211874272
0.80034924330617
0.8731082654249127
0.9458672875436555
2021
0.0
0.0643915003219575
0.128783000643915
0.1931745009658725
0.25756600128783
0.32195750160978753
0.386349001931745
0.4507405022537025
0.51513200257566
0.5795235028976176
0.6439150032195751
0.7083065035415326
0.77269800386349
0.8370895041854475
0.901481004507405
0.9658725048293625
2022
0.0
0.05994485073732166
0.11988970147464333
0.17983455221196498
0.23977940294928665
0.2997242536866083
0.35966910442392996
0.41961395516125166
0.4795588058985733
0.539503656635895
0.599

In [3]:
x = np.load("scores_mu_iclr_2018.npy")
y = np.load("scores_sigma_iclr_2018.npy")

FileNotFoundError: [Errno 2] No such file or directory: 'scores_mu_iclr_2018.npy'

# Create groups of papers so we can do max group egalitarian

In [16]:
cos_sims = {}
for year in range(2018, 2023):
    print(year)
    papers_this_year = sorted(paper_keywords[year])
    n = len(paper_keywords[year])
    
    titles = [paper_titles[year][p] for p in papers_this_year]
    print(titles)
    embs = model.encode(titles)
    cos_sims[year] = util.cos_sim(embs, embs)
    
pickle.dump(cos_sims, "cos_sims.pkl")

2018
['Learning to Count Objects in Natural Images for Visual Question Answering', 'THINK VISUALLY: QUESTION ANSWERING THROUGH VIRTUAL IMAGERY', 'Thinking like a machine — generating visual rationales through latent space optimization', 'Progressive Reinforcement Learning with Distillation for Multi-Skilled Motion Control', 'QANet: Combining Local Convolution with Global Self-Attention for Reading Comprehension', 'No Spurious Local Minima in a Two Hidden Unit ReLU Network', 'Deep Boosting of Diverse Experts', 'Avoiding Catastrophic States with Intrinsic Fear', 'Emergence of grid-like representations by training recurrent neural networks to perform spatial localization', 'Learning General Purpose Distributed Sentence Representations via Large Scale Multi-task Learning', 'Clustering with Deep Learning: Taxonomy and New Methods', 'Learning to Compute Word Embeddings On the Fly', 'Learning to select examples for program synthesis', 'Training Autoencoders by Alternating Minimization', 'A Si

2019
['Neural Causal Discovery with Learnable Input Noise', 'RETHINKING SELF-DRIVING : MULTI -TASK KNOWLEDGE FOR BETTER GENERALIZATION AND ACCIDENT EXPLANATION ABILITY', 'Hierarchical Attention: What Really Counts in Various NLP Tasks', 'Convolutional Neural Networks on Non-uniform Geometrical Signals Using Euclidean Spectral Transformation', 'Augmented Cyclic Adversarial Learning for Low Resource Domain Adaptation', 'Variance Networks: When Expectation Does Not Meet Your Expectations', 'HyperGAN:  Exploring the Manifold of Neural Networks', 'Explicit Recall for Efficient Exploration', 'Unsupervised  one-to-many image translation', 'Initialized Equilibrium Propagation for Backprop-Free Training', 'Stop memorizing: A data-dependent regularization framework for intrinsic pattern learning', 'Gradient Acceleration in Activation Functions', 'Empirical Bounds on Linear Regions of Deep Rectifier Networks', 'On-Policy Trust Region Policy Optimisation with Replay Buffers', 'On the Ineffectivene

2020
['Structured Object-Aware Physics Prediction for Video Modeling and Planning', 'DeepSphere: a graph-based spherical CNN', 'Pareto Optimality in No-Harm Fairness', 'On Robustness of Neural Ordinary Differential Equations', 'DiffTaichi: Differentiable Programming for Physical Simulation', 'Semi-supervised semantic segmentation needs strong, high-dimensional perturbations', 'The Generalization-Stability Tradeoff in Neural Network Pruning', 'Independence-aware Advantage Estimation', 'Calibration, Entropy Rates, and Memory in Language Models', 'Kernel of CycleGAN as a principal homogeneous space', 'Directional Message Passing for Molecular Graphs', 'Superseding Model Scaling by Penalizing Dead Units and Points with Separation Constraints', 'THE EFFECT OF ADVERSARIAL TRAINING: A THEORETICAL CHARACTERIZATION', 'Attacking Graph Convolutional Networks via Rewiring', 'Efficient and Information-Preserving Future Frame Prediction and Beyond', 'A Deep Recurrent Neural Network via Unfolding Rew

2021
['Enhancing Visual Representations for Efficient Object Recognition during Online Distillation', 'Meta-Learning of Structured Task Distributions in Humans and Machines', 'CROSS-SUPERVISED OBJECT DETECTION', 'Learning Invariant Representations for Reinforcement Learning without Reconstruction', 'Log representation as an interface for log processing applications', 'Regret Bounds and Reinforcement Learning Exploration of EXP-based Algorithms', 'Optimizing Memory Placement using Evolutionary Graph Reinforcement Learning', 'Generative Adversarial Neural Architecture Search with Importance Sampling', 'Can Kernel Transfer Operators Help Flow based Generative Models?', 'Generalized Universal Approximation for Certified Networks', 'Categorical Normalizing Flows via Continuous Transformations', "Improving VAEs' Robustness to Adversarial Attack", 'Universal approximation power of deep residual neural networks via nonlinear control theory', 'Chameleon: Learning Model Initializations Across Ta

2022
['Towards Uncertainties in Deep Learning that Are Accurate and Calibrated', 'Mind Your Bits and Errors: Prioritizing the Bits that Matter in Variational Autoencoders', 'Word Sense Induction with Knowledge Distillation from BERT', 'The magnitude vector of images', 'How to Improve Sample Complexity of SGD over Highly Dependent Data?', 'Model Validation Using Mutated Training Labels: An Exploratory Study', 'Hyperparameter Tuning with Renyi Differential Privacy', 'State-Action Joint Regularized Implicit Policy for Offline Reinforcement Learning', 'NETWORK INSENSITIVITY TO PARAMETER NOISE VIA PARAMETER ATTACK DURING TRAINING', 'Learning Controllable Elements Oriented Representations for Reinforcement Learning ', 'Fast Differentiable Matrix Square Root', 'Dissecting Local Properties of Adversarial Examples', 'SHINE: SHaring the INverse Estimate from the forward pass for bi-level optimization and implicit models', 'Revisiting Out-of-Distribution Detection: A Simple Baseline is Surprising

NameError: name 'pickle' is not defined

In [58]:
np.min(cos_sims[2018].numpy())

-0.17541924

In [188]:
from sklearn.cluster import AgglomerativeClustering
yr = 2022
clustering = AgglomerativeClustering(distance_threshold=.95, n_clusters=None, affinity='precomputed', linkage='complete').fit(1-cos_sims[yr])
print(clustering.labels_)
print(clustering.n_clusters_)

[50 63 26 ...  0 11 44]
80


In [189]:
# Take the papers that are in small clusters and put them in larger clusters. Probably get the maximum linkage distance
# for all the clusters, sort by that, and pick the one with the smallest max linkage distance.
clusters = defaultdict(list)
for idx, cid in enumerate(clustering.labels_):
    clusters[cid].append(idx)

In [190]:
bad_clusters = set()
for cid, paps in clusters.items():
    if len(paps) < 10:
        bad_clusters.add(cid)

In [191]:
bad_clusters

{6, 12, 34, 60, 62, 74, 76, 78, 79}

In [192]:
def reassigned_cluster(pid, clusters, bad_clusters, cos_sims):
    # Get the max distance to each cluster
    cluster_dists = {}
    for cid in clusters:
        paps_other_clust = clusters[cid]
#         print(paps_other_clust)
#         print(cos_sims[pid, :])
#         print(cos_sims[pid, :][paps_other_clust])
        max_dist = np.max(1-cos_sims[pid, :][paps_other_clust])
        cluster_dists[cid] = max_dist
    # Filter out the bad_clusters
    for cid, dist in sorted(cluster_dists.items(), key=lambda x: x[1]):
        if cid not in bad_clusters:
            return cid

In [193]:
clust_labels = deepcopy(clustering.labels_)

for bc in bad_clusters:
    # Pull the papers out and add them to other clusters
    for pid in clusters[bc]:
        cid = reassigned_cluster(pid,  clusters, bad_clusters, cos_sims[yr].numpy())
        clust_labels[pid] = cid


In [194]:
clustering.labels_[-50:-30]

array([16,  5, 28, 44, 39, 56, 51, 19, 33, 50, 29, 11, 31, 42, 39, 11, 11,
        2, 73, 77], dtype=int64)

In [195]:
clust_labels[-50:]

array([16,  5, 28, 44, 39, 56, 51, 19, 33, 50, 29, 11, 31, 42, 39, 11, 11,
        2, 73, 77,  0, 19, 29, 10, 42, 28, 43, 36, 14,  7, 11, 10, 14, 22,
       20, 28, 59,  3,  4, 11, 32, 26, 77, 37, 55, 16, 63,  0, 11, 44],
      dtype=int64)

In [196]:
label_map = {}
sorted_bad = sorted(bad_clusters | {clustering.n_clusters_})
ctr = 0
for idx, i in enumerate(sorted_bad):
    while idx + ctr < i:
        label_map[idx + ctr] = ctr
        ctr += 1

In [197]:
label_map

{0: 0,
 1: 1,
 2: 2,
 3: 3,
 4: 4,
 5: 5,
 7: 6,
 8: 7,
 9: 8,
 10: 9,
 11: 10,
 13: 11,
 14: 12,
 15: 13,
 16: 14,
 17: 15,
 18: 16,
 19: 17,
 20: 18,
 21: 19,
 22: 20,
 23: 21,
 24: 22,
 25: 23,
 26: 24,
 27: 25,
 28: 26,
 29: 27,
 30: 28,
 31: 29,
 32: 30,
 33: 31,
 35: 32,
 36: 33,
 37: 34,
 38: 35,
 39: 36,
 40: 37,
 41: 38,
 42: 39,
 43: 40,
 44: 41,
 45: 42,
 46: 43,
 47: 44,
 48: 45,
 49: 46,
 50: 47,
 51: 48,
 52: 49,
 53: 50,
 54: 51,
 55: 52,
 56: 53,
 57: 54,
 58: 55,
 59: 56,
 61: 57,
 63: 58,
 64: 59,
 65: 60,
 66: 61,
 67: 62,
 68: 63,
 69: 64,
 70: 65,
 71: 66,
 72: 67,
 73: 68,
 75: 69,
 77: 70}

In [198]:
final_clusters = deepcopy(clust_labels)

In [199]:
for idx in range(final_clusters.shape[0]):
    final_clusters[idx] = label_map[final_clusters[idx]]

In [200]:
final_clusters

array([47, 58, 24, ...,  0, 10, 41], dtype=int64)

In [201]:
for i in range(clustering.n_clusters_):
    print("Cluster ", i)
    papers_this_year = sorted(paper_titles[yr])
    titles = [paper_titles[yr][p] for p in papers_this_year]
    for idx, cid in enumerate(final_clusters):

#     for idx, cid in enumerate(clust_labels):
#     for idx, cid in enumerate(clustering.labels_):
        if cid == i:
            print(titles[idx])
    print()

Cluster  0
Self-supervised Learning for Sequential Recommendation with Model Augmentation
Successive POI Recommendation via Brain-inspired Spatiotemporal Aware Representation
IA-MARL: Imputation Assisted Multi-Agent Reinforcement Learning for Missing Training Data
Iterative Memory Network for Long Sequential User Behavior Modeling in Recommender Systems
How to deal with missing data in supervised deep learning?
Bridging Recommendation and Marketing via Recurrent Intensity Modeling
Equal Experience in Recommender Systems
EMFlow: Data Imputation in Latent Space via EM and Deep Flow Models
On the regularization landscape for the linear recommendation models
Gradient Importance Learning for Incomplete Observations
From Intervention to Domain Transportation: A Novel Perspective to Optimize Recommendation
Estimating and Penalizing Induced Preference Shifts in Recommender Systems
Neuron-Enhanced Autoencoder based Collaborative filtering: Theory and Practice
CareGraph: A Graph-based Recommende

In [202]:
np.save("group_ids_%d.npy" % yr, final_clusters)

### Now we are gonna sample the matrices indicating whether the reviewer did a good job reviewing (the outcomes)

The model is that we sample the actual affinity from the normal distribution, but that reviewers with less than a threshold number of papers are overestimated and so we sample only from the bottom half of the normal. Then we truncate between 0 and 1. Then this affinity is a Bernoulli which determines the probability of success.

In [53]:
# Now sample the true affinities and the outcomes 100x
for yr in range(2018, 2022):
    mu = np.load("data/scores_mu_iclr_%d.npy" % yr)
    sigma = np.load("data/scores_sigma_iclr_%d.npy" % yr)
    overest = np.array([x[1] <= 1 for x in sorted(reviewer_num_papers[yr].items())])

    rng = np.random.default_rng(12345)

    n_trials = 100
    for i in range(n_trials):
        true_affinity = np.zeros(mu.shape)
        true_affinity[np.logical_not(overest), :] = rng.normal(mu[np.logical_not(overest), :], sigma[np.logical_not(overest), :])
        true_affinity[overest, :] = mu[overest, :] - np.abs(mu[overest, :] - rng.normal(mu[overest, :], sigma[overest, :]))
        true_affinity = np.clip(true_affinity, 0, 1)
        outcomes = rng.uniform(0, 1, mu.shape) < true_affinity
        np.save("data/outcomes_%d_%d.npy" % (yr, i), outcomes)

In [49]:
np.sum(outcomes)

1604253

In [51]:
outcomes.shape[0]*outcomes.shape[1]

21845079

In [21]:
len(overest)

8341

In [22]:
len(reviewer_keywords[year])

8341