In [1]:
%load_ext lab_black
%load_ext autotime
import pandas as pd
import numpy as np

time: 273 ms (started: 2022-10-02 00:43:11 -07:00)


Based on a [notebook](https://github.com/dkobak/iclr-tsne/blob/main/iclr-tsne.ipynb) by [Dmitry Kobak](https://github.com/dkobak) (I originally found it via [a tweet](https://twitter.com/hippopedoid/status/1575879260216373249)). This uses TF-IDF of ICLR submissions.

In [2]:
import requests


def download_iclr(years=(2018, 2019, 2020, 2021, 2022, 2023), timeout=60):
    """Download titles and abstracts of ICLR submissions from OpenReview.

    For every year the blind-submission notes are requested page by page,
    advancing the offset by the number of notes received, until the API
    returns an empty page. A year is therefore never cut off at a fixed
    number of submissions.

    Parameters
    ----------
    years : iterable of int
        Conference years to fetch, in the order they appear in the output.
    timeout : float
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    abstracts : list of str
        Abstract of each submission, stripped of surrounding whitespace.
    titles : list of str
        Title of each submission, stripped of surrounding whitespace.
    years : numpy.ndarray
        Conference year of each submission, aligned with the two lists.

    Raises
    ------
    requests.HTTPError
        If the API answers a request with an error status.
    """
    titles = []
    abstracts = []
    note_years = []

    for year in years:
        url = f"https://api.openreview.net/notes?invitation=ICLR.cc%2F{year}%2FConference%2F-%2FBlind_Submission"
        offset = 0
        while True:
            response = requests.get(url + f"&offset={offset}", timeout=timeout)
            # Fail loudly on an HTTP error instead of tripping over a missing
            # "notes" key in the error payload.
            response.raise_for_status()
            notes = response.json()["notes"]
            if not notes:
                break
            contents = [note["content"] for note in notes]
            titles += [content["title"].strip() for content in contents]
            abstracts += [content["abstract"].strip() for content in contents]
            note_years += [year] * len(notes)
            offset += len(notes)

    return abstracts, titles, np.array(note_years)

time: 40.1 ms (started: 2022-10-02 00:43:12 -07:00)


In [3]:
# Run the download (network access required) and unpack its three return values
abstracts, titles, years = download_iclr()

time: 20.4 s (started: 2022-10-02 00:43:12 -07:00)


In [4]:
# Sanity check: the two lists should have the same length (one abstract per title)
len(titles), len(abstracts)

(14732, 14732)

time: 7.1 ms (started: 2022-10-02 00:43:32 -07:00)


In [5]:
import sklearn.feature_extraction.text

# Fit a TF-IDF vectorizer on the abstracts and keep the resulting document-term matrix
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(sublinear_tf=True)
iclr_tfidf = vectorizer.fit_transform(abstracts)

time: 1.75 s (started: 2022-10-02 00:43:32 -07:00)


In [6]:
# Show the summary of the TF-IDF matrix (shape, dtype and number of stored elements)
iclr_tfidf

<14732x32146 sparse matrix of type '<class 'numpy.float64'>'
	with 1618991 stored elements in Compressed Sparse Row format>

time: 2.57 ms (started: 2022-10-02 00:43:34 -07:00)


Reducing to a dense representation with truncated SVD (effectively PCA without centering the data) should take less than 3GB and a few minutes to run:

In [7]:
import sklearn.decomposition

# Fix the seed: TruncatedSVD uses a randomized solver by default, so without it the
# components (and the explained variance reported below) change slightly on every run.
tsvd = sklearn.decomposition.TruncatedSVD(n_components=3000, random_state=42).fit(
    iclr_tfidf
)

time: 2min 41s (started: 2022-10-02 00:43:34 -07:00)


In [8]:
# Total fraction of variance captured by the retained components
tsvd.explained_variance_ratio_.sum()

0.7653356697941727

time: 4.16 ms (started: 2022-10-02 00:46:16 -07:00)


77% of variance explained with 3000 dimensions.

In [9]:
# Project the TF-IDF matrix onto the fitted SVD components; `data` is what gets passed
# to the pipeline at the end of the notebook
data = tsvd.transform(iclr_tfidf)

time: 2.89 s (started: 2022-10-02 00:46:16 -07:00)


In [10]:
# Lower-case keywords that are searched for in the titles further down to give each
# submission a label
keywords = [
    "network",
    "graph",
    "reinforcement",
    "language",
    "adversarial",
    "federated",
    "contrastive",
    "domain",
    "generalization",
    "detection",
    "diffusion",
    "recurrent",
]

time: 1.7 ms (started: 2022-10-02 00:46:18 -07:00)


In [11]:
# Of the 50 most common whitespace-separated tokens in the raw titles, print those
# that are at least 5 characters long

words, counts = np.unique(" ".join(titles).split(), return_counts=True)
ind = np.argsort(counts)[::-1][:50]
for word, count in zip(words[ind], counts[ind]):
    if len(word) < 5:
        continue
    print(f"{word:20} {count:4}")

Learning             4078
Neural               1824
Networks             1459
Reinforcement         807
Models                719
Graph                 711
Adversarial           687
Training              557
Network               474
Representation        473
Model                 437
Efficient             418
Optimization          417
Generative            394
Language              363
Representations       359
Robust                320
Generalization        311
Gradient              310
using                 305
Image                 297
learning              297
Towards               294
Federated             291
Unsupervised          288
Generation            273
Detection             266
Robustness            261
Classification        255
Policy                252
Adaptive              240
Contrastive           237
time: 71.5 ms (started: 2022-10-02 00:46:18 -07:00)


Some processing of the text is necessary. To do this properly you may want to consider looking into the likes of [nltk](https://www.nltk.org/) for more robust handling of text (see for example this [article](https://towardsdatascience.com/tf-idf-for-document-ranking-from-scratch-in-python-on-real-world-dataset-796d339a4089)), but I am just going to make everything lower-case and then remove some symbols. This seems good enough for this dataset.

In [12]:
# Characters that get replaced by a space. The backslash is written as an explicit
# escape; the apostrophe is not in the set, so it is left in place.
_SYMBOLS = '!"#$%&()*+-,./:;<=>?@[\\]^_`{|}~\n'
_SYMBOL_TABLE = str.maketrans(_SYMBOLS, " " * len(_SYMBOLS))


def remove_symbols(text):
    """Replace every punctuation symbol and newline in `text` with a space.

    Each symbol becomes exactly one space, so the length of the text is
    unchanged and runs of symbols become runs of spaces.

    Parameters
    ----------
    text : str or array-like of str
        Text to clean.

    Returns
    -------
    str or list
        A plain string for string input; for array-like input, the
        element-wise result converted with ``tolist()``.
    """
    if isinstance(text, str):
        # One pass over the string instead of one numpy call per symbol.
        return text.translate(_SYMBOL_TABLE)
    # Array-like input keeps the element-wise numpy path.
    for symbol in _SYMBOLS:
        text = np.char.replace(text, symbol, " ")
    return text.tolist()


# Lower-case every title first, then blank out the symbols
processed_titles = [remove_symbols(title.lower()) for title in titles]

time: 1.57 s (started: 2022-10-02 00:46:18 -07:00)


In [13]:
# Compare the first five raw titles with their processed versions
titles[:5], processed_titles[:5]

(['Predicting Floor-Level for 911 Calls with Neural Networks and Smartphone Sensor Data',
  'Some Considerations on Learning to Explore via Meta-Reinforcement Learning',
  'MACH: Embarrassingly parallel $K$-class classification in $O(d\\log{K})$ memory and $O(K\\log{K} + d\\log{K})$ time, instead of $O(Kd)$',
  'Deterministic Policy Imitation Gradient Algorithm',
  'Searching for Activation Functions'],
 ['predicting floor level for 911 calls with neural networks and smartphone sensor data',
  'some considerations on learning to explore via meta reinforcement learning',
  'mach  embarrassingly parallel  k  class classification in  o d log k    memory and  o k log k    d log k    time  instead of  o kd  ',
  'deterministic policy imitation gradient algorithm',
  'searching for activation functions'])

time: 2.56 ms (started: 2022-10-02 00:46:20 -07:00)


In [14]:
# Count each word of the processed titles and print the counts of the words that are
# keywords, most frequent first
words, counts = np.unique(" ".join(processed_titles).split(), return_counts=True)
ind = np.argsort(counts)[::-1]
for word, count in zip(words[ind], counts[ind]):
    if len(word) >= 5 and word in keywords:
        print(f"{word:20} {count:4}")

reinforcement         879
graph                 833
adversarial           769
network               615
language              471
generalization        376
detection             324
domain                314
federated             303
contrastive           268
recurrent             151
diffusion             149
time: 81.6 ms (started: 2022-10-02 00:46:20 -07:00)


As a rough way to see which titles we are missing due to bad processing, here is the same procedure but allowing through any words that begin with one of the keywords, so we pick up any plurals or cases where the keyword is part of a compound adjective:

In [15]:
# Same count as before, but a word is printed when it merely starts with a keyword
words, counts = np.unique(" ".join(processed_titles).split(), return_counts=True)
ind = np.argsort(counts)[::-1]
keyword_prefixes = tuple(keywords)
for word, count in zip(words[ind], counts[ind]):
    # str.startswith accepts a tuple and is true if any of the prefixes matches
    if len(word) >= 5 and word.startswith(keyword_prefixes):
        print(f"{word:20} {count:4}")

networks             1814
reinforcement         879
graph                 833
adversarial           769
network               615
language              471
generalization        376
detection             324
domain                314
federated             303
contrastive           268
recurrent             151
diffusion             149
graphs                135
adversarially          42
domains                34
languages              16
graphics                6
networked               4
graphical               4
graph2seq               3
diffusions              3
graphon                 2
graphic                 2
contrastively           1
graphzoom               1
graphvf                 1
graphvae                1
grapheditor             1
graphaf                 1
graphcg                 1
graphcgan               1
graphcodebert           1
graphcore               1
graphebm                1
graphens                1
graphseq2seq            1
graphgan                1
graphix     

In [16]:
# Keywords sorted by how often they occur as a word in the processed titles,
# most frequent first
words, counts = np.unique(" ".join(processed_titles).split(), return_counts=True)
ind = np.argsort(counts)[::-1]
ordered_keywords = [
    word for word in words[ind] if len(word) >= 5 and word in keywords
]
ordered_keywords

['reinforcement',
 'graph',
 'adversarial',
 'network',
 'language',
 'generalization',
 'detection',
 'domain',
 'federated',
 'contrastive',
 'recurrent',
 'diffusion']

time: 71.6 ms (started: 2022-10-02 00:46:20 -07:00)


In [17]:
# Label every title with the index of a keyword it contains (substring match on the
# lower-cased raw title). Later keywords overwrite earlier ones, so when several match
# the last one in ordered_keywords wins; titles matching none keep the extra level.
# "unknown" is left out of the search and appended at most once, so re-running this
# cell neither treats it as a keyword nor grows the list.
topic_keywords = [keyword for keyword in ordered_keywords if keyword != "unknown"]
lowercase_titles = [title.lower() for title in titles]
levels = np.repeat(len(topic_keywords), len(lowercase_titles))
for level_int, keyword in enumerate(topic_keywords):
    matches = np.array(
        [keyword.lower() in title for title in lowercase_titles], dtype=bool
    )
    levels[matches] = level_int
if "unknown" not in ordered_keywords:
    ordered_keywords.append("unknown")

time: 52.3 ms (started: 2022-10-02 00:46:20 -07:00)


In [18]:
# Show the integer label assigned to each title
levels

array([ 3,  0, 12, ...,  0, 12,  0])

time: 3.9 ms (started: 2022-10-02 00:46:20 -07:00)


In [19]:
from drnb.util import codes_to_categories

# Convert the integer levels into a categorical series named "description".
# NOTE(review): codes_to_categories comes from drnb; judging by the output below it maps
# each level to the entry of ordered_keywords at that position — confirm against drnb.
description = codes_to_categories(levels, ordered_keywords, "description")
description

0               network
1         reinforcement
2               unknown
3               unknown
4               unknown
              ...      
14727           unknown
14728    generalization
14729     reinforcement
14730           unknown
14731     reinforcement
Name: description, Length: 14732, dtype: category
Categories (13, object): ['adversarial', 'contrastive', 'detection', 'diffusion', ..., 'network', 'recurrent', 'reinforcement', 'unknown']

time: 12.9 ms (started: 2022-10-02 00:46:20 -07:00)


## Pipeline

In [27]:
# Assemble the per-submission labels column by column: year, integer class, description
year_column = pd.Series(years, name="year", dtype="category")
class_column = pd.Series(levels, name="class")
target = pd.concat([year_column, class_column, description], axis=1)
target

Unnamed: 0,year,class,description
0,2018,3,network
1,2018,0,reinforcement
2,2018,12,unknown
3,2018,12,unknown
4,2018,12,unknown
...,...,...,...
14727,2023,12,unknown
14728,2023,5,generalization
14729,2023,0,reinforcement
14730,2023,12,unknown


time: 15.4 ms (started: 2022-10-02 01:15:57 -07:00)


In [24]:
# One colour per entry of ordered_keywords, paired up in order
description_colors = [
    "#8dd3c7",
    "#ffffb3",
    "#bebada",
    "#fb8072",
    "#80b1d3",
    "#fdb462",
    "#b3de69",
    "#fccde5",
    "#d9d9d9",
    "#bc80bd",
    "#ccebc5",
    "#ffed6f",
    "#f6f6f6",
]
palette = {"description": dict(zip(ordered_keywords, description_colors))}
palette

{'description': {'reinforcement': '#8dd3c7',
  'graph': '#ffffb3',
  'adversarial': '#bebada',
  'network': '#fb8072',
  'language': '#80b1d3',
  'generalization': '#fdb462',
  'detection': '#b3de69',
  'domain': '#fccde5',
  'federated': '#d9d9d9',
  'contrastive': '#bc80bd',
  'recurrent': '#ccebc5',
  'diffusion': '#ffed6f',
  'unknown': '#f6f6f6'}}

time: 6.73 ms (started: 2022-10-02 01:05:18 -07:00)


In [28]:
from drnb.io.pipeline import create_default_pipeline

# Pass the reduced data, its labels and the label colours to the drnb pipeline under
# the name "iclr".
# NOTE(review): what the pipeline does with these inputs (and with `metric`,
# `check_for_duplicates` and `tags`) is defined in drnb, not here — confirm there.
data_result = create_default_pipeline(
    check_for_duplicates=True, metric=["euclidean", "cosine"]
).run(
    "iclr",
    data=data,
    target=target,
    target_palette=palette,
    tags=["highdim"],
    url="https://github.com/dkobak/iclr-tsne",
    verbose=True,
)

time: 56.2 s (started: 2022-10-02 01:16:45 -07:00)
