In [1]:
import numpy as np
import pandas as pd

Based on a [notebook](https://github.com/dkobak/iclr-tsne/blob/main/iclr-tsne.ipynb) by [Dmitry Kobak](https://github.com/dkobak) (I originally found it via [a tweet](https://twitter.com/hippopedoid/status/1575879260216373249)). This uses TF-IDF of ICLR submissions.


In [2]:
import logging
import time

import httpx
import numpy as np
import pandas as pd

# httpx (and httpcore) can be noisy at INFO level, so raise both loggers to WARNING
for noisy_logger in ("httpx", "httpcore"):
    logging.getLogger(noisy_logger).setLevel(logging.WARNING)


def _retry_after_seconds(header_value, default: float = 2.0) -> float:
    """Convert a ``Retry-After`` header value into a non-negative delay in seconds.

    The header may carry either a number of seconds or an HTTP-date. A missing
    or unparseable value yields ``default``.
    """
    if header_value is None:
        return default
    try:
        seconds = float(header_value)
    except (TypeError, ValueError):
        # Not a plain number: try the HTTP-date form instead.
        from datetime import datetime, timezone
        from email.utils import parsedate_to_datetime

        try:
            retry_at = parsedate_to_datetime(str(header_value))
        except (TypeError, ValueError):
            return default
        if retry_at.tzinfo is None:
            retry_at = retry_at.replace(tzinfo=timezone.utc)
        seconds = (retry_at - datetime.now(timezone.utc)).total_seconds()
    # A date in the past (or a negative number) means "retry now".
    return max(0.0, seconds)


def download_iclr(
    max_rate_limit_retries: int = 5,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Download titles and abstracts of ICLR 2018-2023 submissions from OpenReview.

    For every year, the blind, withdrawn and desk-rejected submission
    invitations are paged through 1000 notes at a time.

    Args:
        max_rate_limit_retries: how many consecutive HTTP 429 responses are
            retried (after sleeping) for one page before the 429 is raised
            like any other error status.

    Returns:
        ``(abstracts, titles, years)`` as three equally long arrays. All three
        are empty if nothing could be downloaded.

    Raises:
        Whatever ``resp.raise_for_status()`` raises for a non-success status
        (``httpx.HTTPStatusError``). Network-level ``httpx.RequestError``s are
        NOT raised: they are printed and the current invitation is abandoned,
        so the result may then be incomplete.
    """
    base_url = "https://api.openreview.net/notes"
    all_records = []

    # Use a Client for connection pooling (Keep-Alive)
    # We increase the timeout in case the server hangs slightly
    with httpx.Client(timeout=30.0) as client:
        for year in [2018, 2019, 2020, 2021, 2022, 2023]:
            for query in [
                "Blind_Submission",
                "Withdrawn_Submission",
                "Desk_Rejected_Submission",
            ]:
                invitation = f"ICLR.cc/{year}/Conference/-/{query}"
                offset = 0
                limit = 1000
                # Consecutive 429s seen for the page currently being fetched
                rate_limit_hits = 0

                while True:
                    params = {
                        "invitation": invitation,
                        "offset": offset,
                        "limit": limit,
                    }

                    try:
                        resp = client.get(base_url, params=params)

                        # Handle Rate Limiting (429), but only a bounded number
                        # of times so a persistent 429 cannot loop forever
                        if (
                            resp.status_code == 429
                            and rate_limit_hits < max_rate_limit_retries
                        ):
                            rate_limit_hits += 1
                            # Use Retry-After header if available, otherwise default to 2s
                            wait_time = _retry_after_seconds(
                                resp.headers.get("retry-after")
                            )
                            print(f"Rate limited. Sleeping for {wait_time:g}s...")
                            time.sleep(wait_time)
                            continue  # Retry

                        # other non-200 codes (404, 500, etc.), and 429 once the
                        # retry budget is exhausted
                        resp.raise_for_status()
                        rate_limit_hits = 0

                        data = resp.json()
                        notes = data.get("notes", [])

                        # no notes returned, we are done with this query
                        if not notes:
                            break

                        # Process batch
                        for note in notes:
                            content = note.get("content", {})
                            # `or ""` also covers a field that is present but null
                            all_records.append(
                                {
                                    "title": (content.get("title") or "").strip(),
                                    "abstract": (content.get("abstract") or "").strip(),
                                    "year": year,
                                }
                            )

                        print(f"Fetched {year} {query} - Offset {offset}")
                        offset += limit

                        # Small sleep just in case
                        time.sleep(0.1)

                    except httpx.RequestError as e:
                        # Best effort: give up on this invitation, keep the rest
                        print(f"Network error occurred: {e}")
                        break

    # Create DataFrame once at the end (much faster)
    if not all_records:
        return np.array([]), np.array([]), np.array([])

    df = pd.DataFrame(all_records)

    return (df["abstract"].values, df["title"].values, df["year"].values)

In [3]:
# Fetch every submission over the network; prints one line per fetched page.
abstracts, titles, years = download_iclr()

Fetched 2018 Blind_Submission - Offset 0
Fetched 2018 Withdrawn_Submission - Offset 0
Fetched 2019 Blind_Submission - Offset 0
Fetched 2019 Blind_Submission - Offset 1000
Fetched 2019 Withdrawn_Submission - Offset 0
Fetched 2020 Blind_Submission - Offset 0
Fetched 2020 Blind_Submission - Offset 1000
Fetched 2020 Blind_Submission - Offset 2000
Fetched 2020 Withdrawn_Submission - Offset 0
Fetched 2020 Desk_Rejected_Submission - Offset 0
Fetched 2021 Blind_Submission - Offset 0
Fetched 2021 Blind_Submission - Offset 1000
Fetched 2021 Blind_Submission - Offset 2000
Fetched 2021 Withdrawn_Submission - Offset 0
Fetched 2021 Desk_Rejected_Submission - Offset 0
Fetched 2022 Blind_Submission - Offset 0
Fetched 2022 Blind_Submission - Offset 1000
Fetched 2022 Blind_Submission - Offset 2000
Fetched 2022 Withdrawn_Submission - Offset 0
Fetched 2022 Desk_Rejected_Submission - Offset 0
Fetched 2023 Blind_Submission - Offset 0
Fetched 2023 Blind_Submission - Offset 1000
Fetched 2023 Blind_Submission 

In [4]:
# Sanity check: there should be exactly one abstract per title.
len(titles), len(abstracts)

(16582, 16582)

In [5]:
# Keep only the papers whose abstract has at least this many characters.
min_abstract_chars = 200
mask = np.array([len(abstract) >= min_abstract_chars for abstract in abstracts])
docs_to_keep = np.nonzero(mask)

In [6]:
# Apply the same selection to all three parallel arrays. This overwrites the
# unfiltered arrays, so re-run from the download cell before running it again.
abstracts, titles, years = (
    column[docs_to_keep] for column in (abstracts, titles, years)
)
len(titles)

16559

In [7]:
# Each document passed to TF-IDF is the title followed by the abstract.
text = np.empty(np.shape(titles), dtype=object)
for doc_idx, (doc_title, doc_abstract) in enumerate(zip(titles, abstracts)):
    text[doc_idx] = doc_title + " " + doc_abstract
text[:3]

array(["Predicting Floor-Level for 911 Calls with Neural Networks and Smartphone Sensor Data In cities with tall buildings, emergency responders need an accurate floor level location to find 911 callers quickly. We introduce a system to estimate a victim's floor level via their mobile device's sensor data in a two-step process. First, we train a neural network to determine when a smartphone enters or exits a building via GPS signal changes. Second, we use a barometer equipped smartphone to measure the change in barometric pressure from the entrance of the building to the victim's indoor location. Unlike impractical previous approaches, our system is the first that does not require the use of beacons, prior knowledge of the building infrastructure, or knowledge of user behavior. We demonstrate real-world feasibility through 63 experiments across five different tall buildings throughout New York City where our system predicted the correct floor level with 100% accuracy.",
       "Some Co

In [8]:
import sklearn.feature_extraction.text

# TF-IDF features with sub-linear term-frequency scaling and L2 normalisation.
tfidf_vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
    sublinear_tf=True, norm="l2"
)
iclr_l2s = tfidf_vectorizer.fit_transform(text)

We'll use 100 components here like Dmitry does in his notebook. My own permutation-based tests suggest maybe up to 150 components is also a possible choice, but not a lot more than that.


In [9]:
import sklearn.decomposition

# Number of SVD components kept (see the discussion above for the choice).
N_COMPONENTS = 100
# scikit-learn draws ARPACK's starting vector from `random_state`; pinning it
# makes the decomposition reproducible across kernel restarts.
SVD_SEED = 42

tsvd = sklearn.decomposition.TruncatedSVD(
    n_components=N_COMPONENTS, algorithm="arpack", random_state=SVD_SEED
).fit(iclr_l2s)

In [10]:
# Total fraction of variance captured by the retained components.
tsvd.explained_variance_ratio_.sum()

np.float64(0.1317686028984716)

13% of variance explained with 100 dimensions.


In [11]:
# Project the TF-IDF matrix onto the fitted SVD components.
data = tsvd.transform(iclr_l2s)

In [12]:
# Topic keywords used to label papers. When labels are assigned further down,
# each entry is matched as a case-insensitive substring of the raw title, and
# if a title contains several keywords the one LATER in this list wins.
# The labelling cell also appends "unknown" to this list.
keywords = [
    "network",
    "graph",
    "reinforcement",
    "language",
    "adversarial",
    "federated",
    "contrastive",
    "domain",
    "diffusion",
    "out-of-dis",
    "continual",
    "distillation",
    "architecture",
    "privacy",
    "protein",
    "fair",
    "attention",
    "video",
    "meta-learning",
    "generative adv",
    "autoencoder",
    "game",
    "semi-sup",
    "pruning",
    "physics",
    "3d",
    "translation",
    "optimization",
    "recurrent",
    "word",
    "bayesian",
]

In [13]:
# Most frequent words in the titles (at least 5 letters)
# The length filter is applied AFTER taking the top 50, so fewer than 50 rows print.

words, counts = np.unique(" ".join(titles).split(), return_counts=True)
ind = np.argsort(counts)[::-1][:50]
for word, count in zip(words[ind], counts[ind]):
    if len(word) < 5:
        continue
    print(f"{word:20} {count:4}")

Learning             4545
Neural               2069
Networks             1670
Reinforcement         868
Adversarial           803
Graph                 797
Models                784
Training              635
Network               561
Representation        534
Model                 481
Optimization          465
Efficient             460
Generative            436
Language              407
Representations       399
Robust                370
Image                 348
Generalization        347
using                 344
Gradient              338
Towards               338
Unsupervised          335
learning              329
Federated             321
Detection             313
Generation            309
Classification        302
Robustness            288
Policy                274
Adaptive              270
Contrastive           269


Some processing of the text is necessary. To do this properly you may want to consider looking into the likes of [nltk](https://www.nltk.org/) for more robust handling of text (see for example this [article](https://towardsdatascience.com/tf-idf-for-document-ranking-from-scratch-in-python-on-real-world-dataset-796d339a4089)), but I am just going to make everything lower-case and then remove some symbols. This seems good enough for this dataset.


In [14]:
def remove_symbols(text):
    """Replace punctuation symbols and newlines in ``text`` with single spaces.

    Args:
        text: a string, or an array of strings.

    Returns:
        A ``str`` for string input; for array input, the nested-list form of
        the cleaned array (``ndarray.tolist()``).
    """
    # The backslash is spelled "\\" because "\]" is not a valid escape sequence
    # and makes Python emit a SyntaxWarning; the resulting characters are the same.
    symbols = '!"#$%&()*+-,./:;<=>?@[\\]^_`{|}~\n'
    if isinstance(text, str):
        # One pass over the string instead of one numpy call per symbol.
        return text.translate(str.maketrans(symbols, " " * len(symbols)))
    for symbol in symbols:
        text = np.char.replace(text, symbol, " ")
    return text.tolist()


# Lower-case every title, then replace its symbols with spaces.
processed_titles = [remove_symbols(title.lower()) for title in titles]

  symbols = '!"#$%&()*+-,./:;<=>?@[\]^_`{|}~\n'


In [15]:
# Compare the first few raw titles with their processed versions.
titles[:5], processed_titles[:5]

(array(['Predicting Floor-Level for 911 Calls with Neural Networks and Smartphone Sensor Data',
        'Some Considerations on Learning to Explore via Meta-Reinforcement Learning',
        'MACH: Embarrassingly parallel $K$-class classification in $O(d\\log{K})$ memory and $O(K\\log{K} + d\\log{K})$ time, instead of $O(Kd)$',
        'Deterministic Policy Imitation Gradient Algorithm',
        'Searching for Activation Functions'], dtype=object),
 ['predicting floor level for 911 calls with neural networks and smartphone sensor data',
  'some considerations on learning to explore via meta reinforcement learning',
  'mach  embarrassingly parallel  k  class classification in  o d log k    memory and  o k log k    d log k    time  instead of  o kd  ',
  'deterministic policy imitation gradient algorithm',
  'searching for activation functions'])

In [16]:
# Count whole-word occurrences of the keywords in the processed titles.
# Keywords shorter than five letters, or containing symbols/spaces, can never
# match here: of those two cases, the first is filtered out, the second was split apart.
words, counts = np.unique(" ".join(processed_titles).split(), return_counts=True)
ind = np.argsort(counts)[::-1]
for word, count in zip(words[ind], counts[ind]):
    if len(word) >= 5 and word in keywords:
        print(f"{word:20} {count:4}")

reinforcement         945
graph                 934
adversarial           902
network               728
optimization          538
language              525
domain                384
federated             334
attention             334
contrastive           305
architecture          210
continual             201
bayesian              175
distillation          174
recurrent             173
video                 172
translation           167
pruning               157
diffusion             152
privacy               107
physics                78
autoencoder            77
protein                66


As a rough way to see what titles we are missing due to bad processing, here is the same procedure, but allowing through any word that begins with one of the keywords, so that we pick up plurals and cases where the keyword is part of a compound adjective:


In [17]:
# Same count, but a word qualifies if it merely STARTS with any keyword.
words, counts = np.unique(" ".join(processed_titles).split(), return_counts=True)
ind = np.argsort(counts)[::-1]
keyword_prefixes = tuple(keywords)
for word, count in zip(words[ind], counts[ind]):
    if len(word) >= 5 and word.startswith(keyword_prefixes):
        print(f"{word:20} {count:4}")

networks             2061
reinforcement         945
graph                 934
adversarial           902
network               728
optimization          538
language              525
domain                384
federated             334
attention             334
contrastive           305
architecture          210
continual             201
bayesian              175
distillation          174
recurrent             173
video                 172
translation           167
pruning               157
diffusion             152
graphs                144
autoencoders          138
privacy               107
games                  89
physics                78
autoencoder            77
fairness               76
protein                66
architectures          61
adversarially          46
domains                40
videos                 29
languages              16
words                   9
attentional             7
graphical               7
graphics                6
networked               5
attentions  

In [18]:
# Keywords (five letters or more) sorted by how often they occur as whole
# words in the processed titles, most frequent first.
words, counts = np.unique(" ".join(processed_titles).split(), return_counts=True)
ind = np.argsort(counts)[::-1]
ordered_keywords = [
    word for word in words[ind] if len(word) >= 5 and word in keywords
]
ordered_keywords

[np.str_('reinforcement'),
 np.str_('graph'),
 np.str_('adversarial'),
 np.str_('network'),
 np.str_('optimization'),
 np.str_('language'),
 np.str_('domain'),
 np.str_('federated'),
 np.str_('attention'),
 np.str_('contrastive'),
 np.str_('architecture'),
 np.str_('continual'),
 np.str_('bayesian'),
 np.str_('distillation'),
 np.str_('recurrent'),
 np.str_('video'),
 np.str_('translation'),
 np.str_('pruning'),
 np.str_('diffusion'),
 np.str_('privacy'),
 np.str_('physics'),
 np.str_('autoencoder'),
 np.str_('protein')]

In [19]:
# NOTE(review): this cell repeats the previous cell verbatim and recomputes the
# same `ordered_keywords`; one of the two can probably be removed.
# Collect the keywords (five letters or more) that occur as whole words in the
# processed titles, most frequent first.
ordered_keywords = []
words, counts = np.unique(" ".join(processed_titles).split(), return_counts=True)
ind = np.argsort(counts)[::-1]
for i in ind:
    if len(words[i]) >= 5:
        if words[i] in keywords:
            ordered_keywords.append(words[i])
ordered_keywords

[np.str_('reinforcement'),
 np.str_('graph'),
 np.str_('adversarial'),
 np.str_('network'),
 np.str_('optimization'),
 np.str_('language'),
 np.str_('domain'),
 np.str_('federated'),
 np.str_('attention'),
 np.str_('contrastive'),
 np.str_('architecture'),
 np.str_('continual'),
 np.str_('bayesian'),
 np.str_('distillation'),
 np.str_('recurrent'),
 np.str_('video'),
 np.str_('translation'),
 np.str_('pruning'),
 np.str_('diffusion'),
 np.str_('privacy'),
 np.str_('physics'),
 np.str_('autoencoder'),
 np.str_('protein')]

In [20]:
# Label each paper with the index of the last keyword (in list order) that
# occurs as a case-insensitive substring of its raw title. Papers matching no
# keyword get the index of the trailing "unknown" category.
UNKNOWN_LABEL = "unknown"

# Make the cell safe to re-run: if "unknown" was already appended by an earlier
# run, do not search for it and do not append it a second time.
if keywords and keywords[-1] == UNKNOWN_LABEL:
    search_keywords = keywords[:-1]
else:
    search_keywords = list(keywords)

# Lower-case the titles once instead of once per keyword.
lowered_titles = [t.lower() for t in titles]

levels = np.repeat(len(search_keywords), len(processed_titles))
for level_int, keyword in enumerate(search_keywords):
    needle = keyword.lower()
    ind = [i for i, t in enumerate(lowered_titles) if needle in t]
    levels[ind] = level_int

if not keywords or keywords[-1] != UNKNOWN_LABEL:
    keywords.append(UNKNOWN_LABEL)

In [21]:
# Integer keyword code per paper; the largest code is the "unknown" category.
levels

array([ 0,  2, 31, ...,  1, 16,  0])

In [22]:
from drnb.util import codes_to_categories

# Map the integer codes in `levels` to their keyword labels. Judging by the
# displayed output, this yields a categorical Series named "description" whose
# categories follow the order of `keywords`.
description = codes_to_categories(levels, keywords, "description")
description

0              network
1        reinforcement
2              unknown
3              unknown
4              unknown
             ...      
16554           domain
16555          unknown
16556            graph
16557        attention
16558          network
Name: description, Length: 16559, dtype: category
Categories (32, object): ['network', 'graph', 'reinforcement', 'language', ..., 'recurrent', 'word', 'bayesian', 'unknown']

In [23]:
# Number of papers per keyword label.
description.value_counts()

description
unknown           7583
network           1604
reinforcement      823
graph              823
adversarial        651
optimization       506
language           414
domain             337
attention          320
federated          254
contrastive        247
architecture       233
autoencoder        196
3d                 191
continual          179
recurrent          171
bayesian           170
video              169
translation        160
semi-sup           157
pruning            150
distillation       150
diffusion          134
meta-learning      130
fair               126
generative adv     123
game               122
out-of-dis         116
word               100
privacy             95
physics             74
protein             51
Name: count, dtype: int64

## Pipeline


In [24]:
# Per-paper metadata saved alongside the data: year, keyword code, title and
# keyword label, one column each.
year_column = pd.Series(years, name="year", dtype="category")
class_column = pd.Series(levels, name="class")
title_column = pd.Series(titles, name="title")
target = pd.concat([year_column, class_column, title_column, description], axis=1)
target

Unnamed: 0,year,class,title,description
0,2018,0,Predicting Floor-Level for 911 Calls with Neur...,network
1,2018,2,Some Considerations on Learning to Explore via...,reinforcement
2,2018,31,MACH: Embarrassingly parallel $K$-class classi...,unknown
3,2018,31,Deterministic Policy Imitation Gradient Algorithm,unknown
4,2018,31,Searching for Activation Functions,unknown
...,...,...,...,...
16554,2023,7,Text-Driven Generative Domain Adaptation with ...,domain
16555,2023,31,"Laziness, Barren Plateau, and Noises in Machin...",unknown
16556,2023,1,Discovering the Representation Bottleneck of G...,graph
16557,2023,16,Results for Perfect Classification for Graph A...,attention


`glasbey` generates the colors for the descriptions, except for the final `unknown` description, which will be grey. We'll use the `extend_palette` function, initializing with the grey color, to try and avoid creating a palette which contains another similar grey color by accident (to the extent possible).


In [25]:
my_list = [1, 2, 3, 4, 5]  # Example list

# Rotate left by one: the first element moves to the end.
my_list = my_list[1:] + my_list[:1]

print(my_list)

[2, 3, 4, 5, 1]


In [26]:
import glasbey

# Seed the palette with grey so the generated colors try to avoid it, then put
# grey last, which is where "unknown" sits in `keywords`.
seeded_palette = glasbey.extend_palette(["#aaaaaa"], len(keywords))
colors = [*seeded_palette[1:], seeded_palette[0]]
colors

['#00008e',
 '#690000',
 '#005900',
 '#a600c6',
 '#b6ff00',
 '#ff5108',
 '#007dff',
 '#00ffff',
 '#ff8aff',
 '#555561',
 '#00b24d',
 '#967100',
 '#fbba4d',
 '#410041',
 '#c6005d',
 '#282008',
 '#009296',
 '#610ce7',
 '#cef3c6',
 '#a6aaff',
 '#ffdffb',
 '#ff969e',
 '#a26d8e',
 '#003139',
 '#00c2f7',
 '#a64114',
 '#597955',
 '#793555',
 '#a6ae00',
 '#9661fb',
 '#ef18c6',
 '#aaaaaa']

The `pop` followed by `append` moves the grey color for `unknown` to the end of the palette, so that it matches the ordering of `keywords`.


In [27]:
# One color per keyword label, keyed by the target column it applies to.
palette = {"description": dict(zip(keywords, colors))}
palette

{'description': {'network': '#00008e',
  'graph': '#690000',
  'reinforcement': '#005900',
  'language': '#a600c6',
  'adversarial': '#b6ff00',
  'federated': '#ff5108',
  'contrastive': '#007dff',
  'domain': '#00ffff',
  'diffusion': '#ff8aff',
  'out-of-dis': '#555561',
  'continual': '#00b24d',
  'distillation': '#967100',
  'architecture': '#fbba4d',
  'privacy': '#410041',
  'protein': '#c6005d',
  'fair': '#282008',
  'attention': '#009296',
  'video': '#610ce7',
  'meta-learning': '#cef3c6',
  'generative adv': '#a6aaff',
  'autoencoder': '#ffdffb',
  'game': '#ff969e',
  'semi-sup': '#a26d8e',
  'pruning': '#003139',
  'physics': '#00c2f7',
  '3d': '#a64114',
  'translation': '#597955',
  'optimization': '#793555',
  'recurrent': '#a6ae00',
  'word': '#9661fb',
  'bayesian': '#ef18c6',
  'unknown': '#aaaaaa'}}

In [28]:
from drnb.io.pipeline import create_default_pipeline

# Process and save the SVD-reduced data under the dataset name "iclr".
iclr_pipeline = create_default_pipeline(
    check_for_duplicates=True, metric=["euclidean"]
)
_ = iclr_pipeline.run(
    "iclr",
    data=data,
    target=target,
    target_palette=palette,
    url="https://github.com/dkobak/iclr-tsne",
    verbose=True,
)

## Renormalize

I also recommend renormalizing to L2 after the SVD procedure, so let's save that as a separate dataset.


In [29]:
from drnb.preprocess import normalize_l2

In [30]:
# Same pipeline as for "iclr", but fed the L2-renormalized SVD output and saved
# under the separate dataset name "iclr-l2r".
data_l2r = normalize_l2(data)
iclr_l2r_pipeline = create_default_pipeline(
    check_for_duplicates=True, metric=["euclidean"]
)
_ = iclr_l2r_pipeline.run(
    "iclr-l2r",
    data=data_l2r,
    target=target,
    target_palette=palette,
    url="https://github.com/dkobak/iclr-tsne",
    verbose=True,
)

So that `renorm-prep.ipynb` can experiment with different ways of processing TF-IDF data, let's save the text that was used for the TF-IDF analysis here.


In [31]:
from drnb.io import write_pickle

# Save the combined title + abstract strings (the TF-IDF input) for the "iclr"
# dataset, gzip-compressed, replacing any existing copy.
# NOTE(review): where the file ends up and how "iclr" and suffix="text" form its
# name is decided inside drnb's write_pickle, which is not visible here.
_ = write_pickle(
    text,
    "iclr",
    suffix="text",
    verbose=False,
    compression="gzip",
    overwrite=True,
)

In [33]:
import drnb.embed.pipeline as pl

# Plot settings: interactive plotly figure showing title and keyword label on hover.
plot_options = {
    "backend": "plotly",
    "figsize": (10, 7),
    "hover": ["title", "description"],
    "alpha_scale": 0.6,
    "cex": 30,
}

# Embed the L2-renormalized dataset with the "leopold" method and evaluate it.
pl.standard_eval(method="leopold", dataset="iclr-l2r", extra_plot=plot_options)

[EvalResult(eval_type='RTE', label='rte-5-euclidean', value=np.float64(0.595579443203092), info={'metric': 'euclidean', 'ntpp': 5}),
 EvalResult(eval_type='RPC', label='rpc-5-euclidean', value=0.33057841658592224, info={'metric': 'euclidean', 'ntpp': 5}),
 EvalResult(eval_type='NNP', label='nnp-15-noself-euclidean', value=np.float64(0.19286188779515673), info={'metric': 'euclidean', 'n_neighbors': 15}),
 EvalResult(eval_type='NNP', label='nnp-50-noself-euclidean', value=np.float64(0.30865148861646236), info={'metric': 'euclidean', 'n_neighbors': 50}),
 EvalResult(eval_type='NNP', label='nnp-150-noself-euclidean', value=np.float64(0.4173549127362764), info={'metric': 'euclidean', 'n_neighbors': 150})]