# Crawl dataset with all submissions info
OpenReview Venue Crawling

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from multiprocessing import Pool

## Crawl list of all submissions
Here we fetch the _notes_ (the list of all submissions) using OpenReview's API, which is much faster than Selenium-based scraping.


In [None]:
# Directory prefix for the CSV that is written and later read back.
# Keep the trailing slash: it is concatenated directly with the file name.
DATA_PATH = './data/'
# OpenReview venue identifier; used to look up the venue group and to build
# the submission invitation id.
venue_id = 'ICLR.cc/2024/Conference'
# Short venue tag used as the stem of the CSV file name.
venue_short = 'iclr2024'

In [None]:
import getpass
import os

import openreview

# Credentials come from the environment, falling back to an interactive
# prompt, so that real account details are never stored in the notebook.
openreview_username = os.environ.get('OPENREVIEW_USERNAME') or input('OpenReview username: ')
openreview_password = os.environ.get('OPENREVIEW_PASSWORD') or getpass.getpass('OpenReview password: ')

# API V2
client = openreview.api.OpenReviewClient(
    baseurl='https://api2.openreview.net',
    username=openreview_username,
    password=openreview_password,
)

In [None]:
# Look up the venue group to learn the name this venue uses for submissions.
venue_group = client.get_group(venue_id)
submission_name = venue_group.content['submission_name']['value']

# Fetch all notes posted under the submission invitation (requesting the
# 'directReplies' details) and convert each one with to_json().
submission_invitation = f'{venue_id}/-/{submission_name}'
submissions = [
    note.to_json()
    for note in client.get_all_notes(invitation=submission_invitation, details='directReplies')
]
print(f'Number of submissions: {len(submissions)}')

In [None]:
# Flatten the nested submission records into a table; nested keys become
# dotted column names (e.g. 'content.title.value', renamed further below).
df = pd.json_normalize(submissions)
# Preview the first rows.
df.head()

## Save filtered dataset 
We save a smaller version of the dataset in CSV format with the data we need for our analysis; this file can also be committed directly to GitHub.

In [None]:
import os

# Make sure the output directory exists: to_csv does not create missing
# directories and would fail on a fresh checkout without ./data/.
os.makedirs(DATA_PATH, exist_ok=True)

# Give the flattened title and keywords columns short names. A single
# non-inplace rename replaces the two separate in-place calls.
df = df.rename(columns={
    'content.title.value': 'title',
    'content.keywords.value': 'keywords',
})

# Save dataframe as csv
df.to_csv(f'{DATA_PATH}{venue_short}.csv', index=False)

In [None]:
# Setup for the analysis part of the notebook: inline figures and automatic
# reloading of imported modules.
%matplotlib inline
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import gridspec
from imageio import imread
from wordcloud import WordCloud
import time

# Matplotlib defaults: white grid lines, a light axes edge colour and
# 12pt tick labels.
gridc = (1., 1., 1)
plt.rcParams['grid.color'] = gridc
plt.rcParams["axes.edgecolor"] = (0.898, 0.925, 0.965, 1)
plt.rc('xtick', labelsize=12)
plt.rc('ytick', labelsize=12)
# NOTE(review): sns.set(...) applies its own style/context rcParams and
# presumably overrides the values set just above — confirm the ordering is
# intended.
sns.set(style='darkgrid', context='talk', palette='colorblind')

from IPython.display import display, HTML
# Inject CSS into the notebook page: widen the main container and let the
# site div use the full height.
display(HTML("<style>.container { width:95% !important; }</style>"))
display(HTML("<style>div#site { height: 100% !important; }</style>"))

In [None]:
import ast
from collections import Counter


def _parse_keywords(raw):
    """Turn one CSV cell of the 'keywords' column back into a list.

    The CSV stores each keyword list as the text of a Python list literal.
    Cells that are not strings (e.g. NaN for a submission without keywords)
    yield an empty list instead of raising.
    """
    if not isinstance(raw, str):
        return []
    # literal_eval only accepts Python literals, so text that originated from
    # scraped submissions is never executed as code (unlike eval).
    return ast.literal_eval(raw)


# Get list
df = pd.read_csv(f'{DATA_PATH}{venue_short}.csv')
df['keywords'] = df['keywords'].apply(_parse_keywords)

data = df['keywords']
# Count keywords case-insensitively, ignoring surrounding whitespace.
keyword_counts = Counter(k.lower().strip() for kws in data for k in kws)

# sort values
# The descending pre-sort is kept exactly as before so that keywords with
# equal counts are ordered the same way as in earlier runs; the Series is
# then sorted ascending so the most frequent keywords come last.
keywords = {k: v for k, v in sorted(keyword_counts.items(), key=lambda item: item[1])[::-1]}
keywords = pd.Series(keywords).sort_values(ascending=True)

# Plot the 50 most frequent keywords (the last 50 entries) and save the figure.
keywords.iloc[-50:].plot.barh(figsize=(8, 12), title='ICLR 2024 Submission Top 50 Keywords')
plt.savefig('./sources/top50_keywords.png', bbox_inches='tight', dpi=300)

In [None]:
# Word cloud built from the keyword frequencies, on a black background.
cloud_settings = dict(
    background_color="black",
    max_words=300,
    max_font_size=64,
    width=1280,
    height=640,
    random_state=0,  # fixed value, presumably to keep the layout reproducible
)
wc = WordCloud(**cloud_settings)
wc.generate_from_frequencies(keywords.to_dict())

# Render the cloud without axes and save it to disk.
fig = plt.figure(figsize=(16, 8))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.savefig('./sources/wordcloud.png', bbox_inches='tight', dpi=200)

In [None]:
# Word cloud laid out inside the logo image, which is passed as the mask.
logo = imread('./sources/ICLR-mask.png')

# NOTE(review): width/height are presumably ignored once a mask is given, and
# contour_color presumably has no visible effect without a contour_width —
# confirm against the wordcloud documentation.
masked_cloud_settings = dict(
    background_color="white",
    max_words=300,
    max_font_size=64,
    width=1280,
    height=640,
    random_state=0,
    mask=logo,
    contour_color='black',
)
wc = WordCloud(**masked_cloud_settings)
wc.generate_from_frequencies(keywords.to_dict())

# Draw the logo first, then the semi-transparent cloud on top of it.
fig = plt.figure(figsize=(16, 8))
plt.imshow(logo)
plt.imshow(wc, interpolation="bilinear", alpha=.75)
plt.axis("off")
plt.savefig('./sources/wordcloud_nice.png', bbox_inches='tight', dpi=200)