In [1]:
import numpy as np
import pandas as pd

Based on a [notebook](https://github.com/dkobak/iclr-tsne/blob/main/iclr-tsne.ipynb) by [Dmitry Kobak](https://github.com/dkobak) (I originally found it via [a tweet](https://twitter.com/hippopedoid/status/1575879260216373249)). This uses TF-IDF of ICLR submissions.

In [21]:
import time

import httpx


def download_iclr():
    titles = []
    abstracts = []
    years = []

    for year in [2018, 2019, 2020, 2021, 2022, 2023]:
        for query in [
            "Blind_Submission",
            "Withdrawn_Submission",
            "Desk_Rejected_Submission",
        ]:
            url = f"https://api.openreview.net/notes?invitation=ICLR.cc%2F{year}%2FConference%2F-%2F{query}"
            for offset in [0, 1000, 2000, 3000, 4000]:
                resp = httpx.get(url + f"&offset={offset}").json()
                if "notes" not in resp:
                    print(f"Skipping {year} {query} {offset}")
                    print(resp)
                    continue
                df = pd.DataFrame(resp["notes"])
                if len(df) > 0:
                    titles += [d["title"].strip() for d in df["content"].values]
                    abstracts += [d["abstract"].strip() for d in df["content"].values]
                    years += [year] * len(df)
                    # sleep for one second to avoid rate limiting
                    time.sleep(1)

    return np.array(abstracts), np.array(titles), np.array(years)

In [20]:
json = download_iclr()

Skipping 2022 Blind_Submission 0
dict_keys(['name', 'message', 'status', 'details'])
Skipping 2022 Blind_Submission 1000
dict_keys(['name', 'message', 'status', 'details'])
Skipping 2022 Blind_Submission 2000
dict_keys(['name', 'message', 'status', 'details'])
Skipping 2022 Blind_Submission 3000
dict_keys(['name', 'message', 'status', 'details'])
Skipping 2022 Blind_Submission 4000
dict_keys(['name', 'message', 'status', 'details'])
Skipping 2022 Withdrawn_Submission 0
dict_keys(['name', 'message', 'status', 'details'])
Skipping 2022 Withdrawn_Submission 1000
dict_keys(['name', 'message', 'status', 'details'])
Skipping 2022 Withdrawn_Submission 2000
dict_keys(['name', 'message', 'status', 'details'])
Skipping 2022 Withdrawn_Submission 3000
dict_keys(['name', 'message', 'status', 'details'])
Skipping 2022 Withdrawn_Submission 4000
dict_keys(['name', 'message', 'status', 'details'])
Skipping 2022 Desk_Rejected_Submission 0
dict_keys(['name', 'message', 'status', 'details'])
Skipping 2022

In [16]:
year = "2022"
query = "Blind_Submission"
offset = 0
url = f"https://api.openreview.net/notes?invitation=ICLR.cc%2F{year}%2FConference%2F-%2F{query}"
resp = httpx.get(url + f"&offset={offset}").json()

In [18]:
"notes" in resp

True

In [14]:
abstracts, titles, years = download_iclr()

Skipping 2022 Blind_Submission
Skipping 2022 Blind_Submission
Skipping 2022 Blind_Submission
Skipping 2022 Blind_Submission
Skipping 2022 Blind_Submission
Skipping 2022 Withdrawn_Submission
Skipping 2022 Withdrawn_Submission
Skipping 2022 Withdrawn_Submission
Skipping 2022 Withdrawn_Submission
Skipping 2022 Withdrawn_Submission
Skipping 2022 Desk_Rejected_Submission
Skipping 2022 Desk_Rejected_Submission
Skipping 2022 Desk_Rejected_Submission
Skipping 2022 Desk_Rejected_Submission
Skipping 2022 Desk_Rejected_Submission
Skipping 2023 Blind_Submission
Skipping 2023 Blind_Submission
Skipping 2023 Blind_Submission
Skipping 2023 Blind_Submission
Skipping 2023 Blind_Submission
Skipping 2023 Withdrawn_Submission
Skipping 2023 Withdrawn_Submission
Skipping 2023 Withdrawn_Submission
Skipping 2023 Withdrawn_Submission
Skipping 2023 Withdrawn_Submission
Skipping 2023 Desk_Rejected_Submission
Skipping 2023 Desk_Rejected_Submission
Skipping 2023 Desk_Rejected_Submission
Skipping 2023 Desk_Rejected_

In [15]:
len(titles), len(abstracts)

(8205, 8205)

In [4]:
len(titles), len(abstracts)

(16576, 16576)

time: 5.31 ms (started: 2023-06-18 12:53:02 -07:00)


In [5]:
mask = np.array([len(a) >= 200 for a in abstracts])
docs_to_keep = np.where(mask)

time: 142 ms (started: 2023-06-18 12:53:02 -07:00)


In [6]:
abstracts = abstracts[docs_to_keep]
titles = titles[docs_to_keep]
years = years[docs_to_keep]
len(titles)

16553

time: 90.8 ms (started: 2023-06-18 12:53:02 -07:00)


In [7]:
text = np.empty_like(titles, dtype=object)
for i in range(len(titles)):
    text[i] = titles[i] + " " + abstracts[i]
text[:3]

array(["Predicting Floor-Level for 911 Calls with Neural Networks and Smartphone Sensor Data In cities with tall buildings, emergency responders need an accurate floor level location to find 911 callers quickly. We introduce a system to estimate a victim's floor level via their mobile device's sensor data in a two-step process. First, we train a neural network to determine when a smartphone enters or exits a building via GPS signal changes. Second, we use a barometer equipped smartphone to measure the change in barometric pressure from the entrance of the building to the victim's indoor location. Unlike impractical previous approaches, our system is the first that does not require the use of beacons, prior knowledge of the building infrastructure, or knowledge of user behavior. We demonstrate real-world feasibility through 63 experiments across five different tall buildings throughout New York City where our system predicted the correct floor level with 100% accuracy.",
       "Some Co

time: 155 ms (started: 2023-06-18 12:53:03 -07:00)


In [8]:
import sklearn.feature_extraction.text

iclr_l2s = sklearn.feature_extraction.text.TfidfVectorizer(
    norm="l2", sublinear_tf=True
).fit_transform(text)

time: 2.32 s (started: 2023-06-18 12:53:03 -07:00)


We'll use 100 components here like Dmitry does in his notebook. My own permutation-based tests suggest maybe up to 150 components is also a possible choice, but not a lot more than that.

In [9]:
import sklearn.decomposition

tsvd = sklearn.decomposition.TruncatedSVD(n_components=100, algorithm="arpack").fit(
    iclr_l2s
)

time: 4.86 s (started: 2023-06-18 12:53:05 -07:00)


In [10]:
np.sum(tsvd.explained_variance_ratio_)

0.13176887212075664

time: 4.34 ms (started: 2023-06-18 12:53:10 -07:00)


13% of variance explained with 100 dimensions.

In [11]:
data = tsvd.transform(iclr_l2s)

time: 181 ms (started: 2023-06-18 12:53:10 -07:00)


In [12]:
keywords = [
    "network",
    "graph",
    "reinforcement",
    "language",
    "adversarial",
    "federated",
    "contrastive",
    "domain",
    "diffusion",
    "out-of-dis",
    "continual",
    "distillation",
    "architecture",
    "privacy",
    "protein",
    "fair",
    "attention",
    "video",
    "meta-learning",
    "generative adv",
    "autoencoder",
    "game",
    "semi-sup",
    "pruning",
    "physics",
    "3d",
    "translation",
    "optimization",
    "recurrent",
    "word",
    "bayesian",
]

time: 3.69 ms (started: 2023-06-18 12:53:10 -07:00)


In [13]:
# Most frequent words in the titles (at least 5 letters)

words, counts = np.unique(" ".join(titles).split(), return_counts=True)
ind = np.argsort(counts)[::-1][:50]
for i in ind:
    if len(words[i]) >= 5:
        print(f"{words[i]:20} {counts[i]:4}")

Learning             4545
Neural               2067
Networks             1668
Reinforcement         868
Adversarial           803
Graph                 797
Models                784
Training              635
Network               561
Representation        534
Model                 481
Optimization          465
Efficient             460
Generative            436
Language              407
Representations       399
Robust                370
Image                 348
Generalization        347
using                 343
Gradient              338
Towards               338
Unsupervised          334
learning              328
Federated             321
Detection             312
Generation            309
Classification        302
Robustness            288
Policy                274
Adaptive              270
Contrastive           269
time: 91.3 ms (started: 2023-06-18 12:53:10 -07:00)


Some processing of the text is necessary. To do this properly you may want to consider looking into the likes of [nltk](https://www.nltk.org/) for more robust handling of text (see for example this [article](https://towardsdatascience.com/tf-idf-for-document-ranking-from-scratch-in-python-on-real-world-dataset-796d339a4089)), but I am just going to make everything lower-case and then remove some symbols. This seems good enough for this dataset.

In [14]:
def remove_symbols(text):
    symbols = '!"#$%&()*+-,./:;<=>?@[\]^_`{|}~\n'
    for i in symbols:
        text = np.char.replace(text, i, " ")
    return text.tolist()


processed_titles = [remove_symbols(title.lower()) for title in titles]

time: 2.1 s (started: 2023-06-18 12:53:10 -07:00)


In [15]:
titles[:5], processed_titles[:5]

(array(['Predicting Floor-Level for 911 Calls with Neural Networks and Smartphone Sensor Data',
        'Some Considerations on Learning to Explore via Meta-Reinforcement Learning',
        'MACH: Embarrassingly parallel $K$-class classification in $O(d\\log{K})$ memory and $O(K\\log{K} + d\\log{K})$ time, instead of $O(Kd)$',
        'Deterministic Policy Imitation Gradient Algorithm',
        'Searching for Activation Functions'], dtype='<U176'),
 ['predicting floor level for 911 calls with neural networks and smartphone sensor data',
  'some considerations on learning to explore via meta reinforcement learning',
  'mach  embarrassingly parallel  k  class classification in  o d log k    memory and  o k log k    d log k    time  instead of  o kd  ',
  'deterministic policy imitation gradient algorithm',
  'searching for activation functions'])

time: 4.13 ms (started: 2023-06-18 12:53:12 -07:00)


In [16]:
words, counts = np.unique(" ".join(processed_titles).split(), return_counts=True)
ind = np.argsort(counts)[::-1]
for i in ind:
    if len(words[i]) >= 5:
        if words[i] in keywords:
            print(f"{words[i]:20} {counts[i]:4}")

reinforcement         945
graph                 934
adversarial           902
network               728
optimization          538
language              525
domain                384
attention             334
federated             334
contrastive           305
architecture          210
continual             201
bayesian              175
distillation          174
recurrent             173
video                 172
translation           167
pruning               157
diffusion             152
privacy               107
physics                78
autoencoder            77
protein                66
time: 92.2 ms (started: 2023-06-18 12:53:12 -07:00)


As a rough way to see what titles we are missing due to bad processing, here the same procedure but allowing through any words that begin with the keywords, so we pickup any plurals or when it's part of a compound adjective:

In [17]:
words, counts = np.unique(" ".join(processed_titles).split(), return_counts=True)
ind = np.argsort(counts)[::-1]
for i in ind:
    if len(words[i]) >= 5:
        for k in keywords:
            if words[i].startswith(k):
                print(f"{words[i]:20} {counts[i]:4}")
                break

networks             2059
reinforcement         945
graph                 934
adversarial           902
network               728
optimization          538
language              525
domain                384
attention             334
federated             334
contrastive           305
architecture          210
continual             201
bayesian              175
distillation          174
recurrent             173
video                 172
translation           167
pruning               157
diffusion             152
graphs                144
autoencoders          138
privacy               107
games                  89
physics                78
autoencoder            77
fairness               76
protein                66
architectures          60
adversarially          46
domains                40
videos                 29
languages              16
words                   9
attentional             7
graphical               7
graphics                6
networked               5
translations

In [18]:
ordered_keywords = []
words, counts = np.unique(" ".join(processed_titles).split(), return_counts=True)
ind = np.argsort(counts)[::-1]
for i in ind:
    if len(words[i]) >= 5:
        if words[i] in keywords:
            ordered_keywords.append(words[i])
ordered_keywords

['reinforcement',
 'graph',
 'adversarial',
 'network',
 'optimization',
 'language',
 'domain',
 'attention',
 'federated',
 'contrastive',
 'architecture',
 'continual',
 'bayesian',
 'distillation',
 'recurrent',
 'video',
 'translation',
 'pruning',
 'diffusion',
 'privacy',
 'physics',
 'autoencoder',
 'protein']

time: 86.5 ms (started: 2023-06-18 12:53:13 -07:00)


In [19]:
ordered_keywords = []
words, counts = np.unique(" ".join(processed_titles).split(), return_counts=True)
ind = np.argsort(counts)[::-1]
for i in ind:
    if len(words[i]) >= 5:
        if words[i] in keywords:
            ordered_keywords.append(words[i])
ordered_keywords

['reinforcement',
 'graph',
 'adversarial',
 'network',
 'optimization',
 'language',
 'domain',
 'attention',
 'federated',
 'contrastive',
 'architecture',
 'continual',
 'bayesian',
 'distillation',
 'recurrent',
 'video',
 'translation',
 'pruning',
 'diffusion',
 'privacy',
 'physics',
 'autoencoder',
 'protein']

time: 85 ms (started: 2023-06-18 12:53:13 -07:00)


In [20]:
levels = np.repeat(len(keywords), len(processed_titles))
for level_int, keyword in enumerate(keywords):
    ind = [i for i, t in enumerate(titles) if keyword.lower() in t.lower()]
    levels[ind] = level_int
keywords.append("unknown")

time: 368 ms (started: 2023-06-18 12:53:13 -07:00)


In [21]:
levels

array([ 0,  2, 31, ...,  1, 16,  0])

time: 3.6 ms (started: 2023-06-18 12:53:13 -07:00)


In [22]:
from drnb.util import codes_to_categories

description = codes_to_categories(levels, keywords, "description")
description

0              network
1        reinforcement
2              unknown
3              unknown
4              unknown
             ...      
16548           domain
16549          unknown
16550            graph
16551        attention
16552          network
Name: description, Length: 16553, dtype: category
Categories (32, object): ['3d', 'adversarial', 'architecture', 'attention', ..., 'translation', 'unknown', 'video', 'word']

time: 56.2 ms (started: 2023-06-18 12:53:13 -07:00)


In [23]:
description.value_counts()

unknown           7581
network           1602
graph              823
reinforcement      823
adversarial        651
optimization       506
language           414
domain             337
attention          320
federated          254
contrastive        247
architecture       232
autoencoder        196
3d                 191
continual          179
recurrent          171
bayesian           170
video              169
translation        160
semi-sup           157
distillation       150
pruning            150
diffusion          134
meta-learning      130
fair               126
generative adv     123
game               122
out-of-dis         116
word                99
privacy             95
physics             74
protein             51
Name: description, dtype: int64

time: 5.57 ms (started: 2023-06-18 12:53:13 -07:00)


## Pipeline

In [24]:
target = pd.concat(
    [
        pd.Series(years, name="year", dtype="category"),
        pd.Series(levels, name="class"),
        pd.Series(titles, name="title"),
        description,
    ],
    axis=1,
)
target

Unnamed: 0,year,class,title,description
0,2018,0,Predicting Floor-Level for 911 Calls with Neur...,network
1,2018,2,Some Considerations on Learning to Explore via...,reinforcement
2,2018,31,MACH: Embarrassingly parallel $K$-class classi...,unknown
3,2018,31,Deterministic Policy Imitation Gradient Algorithm,unknown
4,2018,31,Searching for Activation Functions,unknown
...,...,...,...,...
16548,2023,7,Text-Driven Generative Domain Adaptation with ...,domain
16549,2023,31,"Laziness, Barren Plateau, and Noises in Machin...",unknown
16550,2023,1,Discovering the Representation Bottleneck of G...,graph
16551,2023,16,Results for Perfect Classification for Graph A...,attention


time: 22.4 ms (started: 2023-06-18 12:53:13 -07:00)


`glasbey` generates the colors for the descriptions, except for the final `unknown` description, which will be grey. We'll use the `extend_palette` function, initializing with the grey color, to try and avoid creating a palette which contains another similar grey color by accident (to the extent possible).

In [34]:
my_list = [1, 2, 3, 4, 5]  # Example list

my_list.append(my_list.pop(0))

print(my_list)

[2, 3, 4, 5, 1]
time: 1.9 ms (started: 2023-06-18 12:56:04 -07:00)


In [37]:
import glasbey

colors = glasbey.extend_palette(["#aaaaaa"], len(keywords))
colors.append(colors.pop(0))
colors

['#00008e',
 '#690000',
 '#005900',
 '#a600c6',
 '#b6ff00',
 '#ff5108',
 '#007dff',
 '#00ffff',
 '#ff8aff',
 '#555561',
 '#00b24d',
 '#967100',
 '#fbba4d',
 '#410041',
 '#c6005d',
 '#282008',
 '#009296',
 '#610ce7',
 '#cef3c6',
 '#a6aaff',
 '#ffdffb',
 '#ff969e',
 '#a26d8e',
 '#003139',
 '#00c2f7',
 '#a64114',
 '#597955',
 '#793555',
 '#a6ae00',
 '#9661fb',
 '#ef18c6',
 '#aaaaaa']

time: 590 ms (started: 2023-06-18 12:57:21 -07:00)


The `pop`ing and `append`ing is to then move the grey color for `unknown` to the end of the palette so it matches the ordering of the `keywords`.

In [38]:
palette = dict(
    description=dict(
        zip(
            keywords,
            colors,
        )
    )
)
palette

{'description': {'network': '#00008e',
  'graph': '#690000',
  'reinforcement': '#005900',
  'language': '#a600c6',
  'adversarial': '#b6ff00',
  'federated': '#ff5108',
  'contrastive': '#007dff',
  'domain': '#00ffff',
  'diffusion': '#ff8aff',
  'out-of-dis': '#555561',
  'continual': '#00b24d',
  'distillation': '#967100',
  'architecture': '#fbba4d',
  'privacy': '#410041',
  'protein': '#c6005d',
  'fair': '#282008',
  'attention': '#009296',
  'video': '#610ce7',
  'meta-learning': '#cef3c6',
  'generative adv': '#a6aaff',
  'autoencoder': '#ffdffb',
  'game': '#ff969e',
  'semi-sup': '#a26d8e',
  'pruning': '#003139',
  'physics': '#00c2f7',
  '3d': '#a64114',
  'translation': '#597955',
  'optimization': '#793555',
  'recurrent': '#a6ae00',
  'word': '#9661fb',
  'bayesian': '#ef18c6',
  'unknown': '#aaaaaa'}}

time: 7.38 ms (started: 2023-06-18 12:57:32 -07:00)


In [39]:
from drnb.io.pipeline import create_default_pipeline

_ = create_default_pipeline(check_for_duplicates=True, metric=["euclidean"]).run(
    "iclr",
    data=data,
    target=target,
    target_palette=palette,
    url="https://github.com/dkobak/iclr-tsne",
    verbose=True,
)

time: 21.3 s (started: 2023-06-18 12:57:39 -07:00)


## Renormalize

I also recommend renormalizing to L2 after the SVD procedure, so let's save that as a separate dataset.

In [40]:
from drnb.preprocess import normalize_l2

time: 866 µs (started: 2023-06-18 12:58:00 -07:00)


In [41]:
_ = create_default_pipeline(check_for_duplicates=True, metric=["euclidean"]).run(
    "iclr-l2r",
    data=normalize_l2(data),
    target=target,
    target_palette=palette,
    url="https://github.com/dkobak/iclr-tsne",
    verbose=True,
)

time: 21 s (started: 2023-06-18 12:58:00 -07:00)


For use in the `renorm-prep.ipynb` to experiment with different ways of processing TF-IDF data, let's save the text that was used for the TF-IDF analysis here.

In [42]:
from drnb.io import write_pickle

_ = write_pickle(
    text,
    "iclr",
    suffix="text",
    verbose=False,
    compression="gzip",
    overwrite=True,
)

time: 1.34 s (started: 2023-06-18 12:58:21 -07:00)


In [None]:
import drnb.embed.pipeline as pl

pl.standard_eval(
    method="leopold",
    dataset="iclr-l2r",
    extra_plot=dict(
        plot="plotly",
        figsize=(10, 7),
        hover=["title", "description"],
        alpha_scale=0.6,
        cex=30,
    ),
)