In [1]:
import os
from glob import glob
import json

# Each paper is stored as one JSON file in a directory one level above the
# current working directory.
source_dir = os.path.join(os.pardir, "arxiv_papers_infos")
json_pattern = os.path.join(source_dir, "*.json")
papers_paths = glob(json_pattern)

In [2]:
# Parse every paper JSON file listed in papers_paths into memory, one parsed
# object per file, in the same order as papers_paths.
paper_data = []
for path in papers_paths:
    # Decode explicitly as UTF-8: without it open() falls back to the platform
    # locale, which can mangle or reject non-ASCII characters in the files.
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    paper_data.append(data)
# Sanity check: number of files successfully loaded.
print(len(paper_data))

320


In [3]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used to encode the papers further down.
# Alternative checkpoint (currently disabled): 'sentence-transformers/all-MiniLM-L6-v2'
model_name = "paraphrase-MiniLM-L6-v2"
model = SentenceTransformer(model_name)

In [4]:
# Build one text document per paper ("<title> ----- <abstract>") plus a parallel
# list of titles. docs[i] and titles[i] always refer to the same paper.
docs = []
titles = []
for data in paper_data:
    # Records without a "result" payload carry nothing to embed; skip them.
    if "result" not in data:
        continue
    data = data["result"]
    # A missing key and an explicit JSON null are both treated as empty text,
    # so a malformed record cannot break the string concatenation below.
    # (Authors are intentionally not part of the document text.)
    title = data.get("title") or ""
    abstract = data.get("abstract") or ""
    sent = title + " ----- " + abstract
    docs.append(sent)
    titles.append(title)
# Sanity check: number of documents that will be embedded.
print(len(docs))

320


In [5]:
# Encode at most the first 1000 documents; when fewer are available the slice
# simply yields all of them.
docs_to_embed = docs[:1000]
embeddings = model.encode(docs_to_embed)

In [17]:
from sklearn.manifold import TSNE

# Project the sentence embeddings down to 2-D with t-SNE.
# init='random' makes the layout depend on the random number generator, so the
# seed is pinned to keep the projection reproducible from run to run.
X_tsne = TSNE(
    n_components=2,
    learning_rate='auto',
    init='random',
    perplexity=10,
    random_state=42,
).fit_transform(embeddings)

In [36]:
import pandas as pd
from matplotlib import pyplot as plt
import plotly.express as px
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score


# Grid search: for every t-SNE perplexity, project the embeddings to 2-D, then
# cluster the projection with DBSCAN for every (eps, min_samples) pair and
# record the number of clusters, number of noise points and silhouette score.
#
# NOTE: X_tsne, plt_data, labels and perp keep the values of the LAST
# combination tried; later cells read them.
RANDOM_STATE = 42  # pinned so the randomly initialised t-SNE is reproducible

RUN_COLUMNS = [
    "TSNE perplexity",
    "DBSCAN eps",
    "DBSCAN min_samples",
    "Num. clusters",
    "Num. noise",
    "Silhouette",
]

# One dict per run; converted to a DataFrame once at the end.
run_records = []

for perp in [10]:
    X_tsne = TSNE(
        n_components=2,
        learning_rate='auto',
        init='random',
        perplexity=perp,
        random_state=RANDOM_STATE,
    ).fit_transform(embeddings)
    plt_data = pd.DataFrame(
        {
            "x_tsne": X_tsne[:, 0],
            "y_tsne": X_tsne[:, 1],
            # Trim titles to the number of projected points so rows line up.
            "content": titles[:len(X_tsne)],
        }
    )
    # eps candidates are enumerated as integers in quarter units
    # (11 / 4 = 2.75) and converted back to the real radius inside the loop.
    for _eps in [int(4 * 2.75)]:
        eps = _eps / 4.0
        for min_samples in [2]:
            db = DBSCAN(eps=eps, min_samples=min_samples).fit(X_tsne)
            labels = db.labels_
            n_labels = len(set(labels))
            # silhouette_score is only defined for 2 <= n_labels <= n_samples - 1
            # and raises ValueError otherwise; degenerate runs get the worst
            # possible score so they sort last.
            if 2 <= n_labels <= len(X_tsne) - 1:
                sil_score = silhouette_score(X_tsne, labels)
            else:
                sil_score = -1.0
            # DBSCAN labels noise points -1; that label is not a real cluster.
            n_clusters_ = n_labels - (1 if -1 in labels else 0)
            n_noise_ = list(labels).count(-1)
            run_records.append(
                {
                    "TSNE perplexity": perp,
                    "DBSCAN eps": eps,
                    "DBSCAN min_samples": min_samples,
                    "Num. clusters": n_clusters_,
                    "Num. noise": n_noise_,
                    "Silhouette": sil_score,
                }
            )

# Explicit columns keep the table schema stable even if no run was recorded.
runs_df = pd.DataFrame(run_records, columns=RUN_COLUMNS)

# n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
# n_noise_ = list(labels).count(-1)

# print(f"Estimated number of clusters: {n_clusters_}")
# print(f"Estimated number of noise points: {n_noise_}")

In [39]:
# Show the grid-search runs ranked by silhouette score, best first
# (at most 180 rows).
pd.set_option("display.max_rows", 180)
ranked_runs = runs_df.sort_values(by="Silhouette", ascending=False)
display(ranked_runs.head(180))

# Attach each paper's cluster label and draw the interactive 2-D scatter;
# hovering over a point shows the paper title and its label.
plt_data["label"] = labels
scatter_options = {
    "x": "x_tsne",
    "y": "y_tsne",
    "hover_name": "content",
    "hover_data": ["label"],
    "color": labels,
}
fig = px.scatter(plt_data, **scatter_options)
fig.show()

Unnamed: 0,TSNE perplexity,DBSCAN eps,DBSCAN min_samples,Num. clusters,Num. noise,Silhouette
0,10,2.75,2,54,24,0.239184


In [40]:
from collections import defaultdict as dd

# Group paper titles by cluster label and count the members of each cluster.
# labels[i] is the cluster assigned to the paper whose title is titles[i].
cluster_counts = dd(int)
cluster_titles = dd(list)

for position in range(len(labels)):
    cluster_id = labels[position]
    cluster_counts[cluster_id] += 1
    cluster_titles[cluster_id].append(titles[position])

In [41]:
def _negated_cluster_size(item):
    """Sort key for (label, titles) pairs: larger clusters sort first."""
    _cluster_label, member_titles = item
    return -len(member_titles)


# List of (label, titles) pairs ordered from the largest cluster to the smallest.
sorted_clusters = sorted(cluster_titles.items(), key=_negated_cluster_size)

In [43]:
# Display every (label, titles) pair, largest cluster first.
# Append [1:] to leave out the first (largest) entry.
sorted_clusters

[(12,
  ['dynamicdet: a unified dynamic architecture for object detection',
   'dynamic neural network for multi-task learning searching across diverse network topologies',
   'fixing overconfidence in dynamic neural networks',
   'adaensemble: learning adaptively sparse structured ensemble network for click-through rate prediction',
   'monadic deep learning',
   'physics-informed koopman network',
   'a survey on dynamic neural networks for natural language processing',
   "physics-guided problem decomposition for scaling deep learning of high-dimensional eigen-solvers: the case of schrödinger's equation",
   'forecasting the outcome of spintronic experiments with neural ordinary differential equations',
   'kdexplainer: a task-oriented attention model for explaining knowledge distillation',
   'ecnns: ensemble learning methods for improving planar grasp quality estimation',
   'contextual hypernetworks for novel feature adaptation',
   'deep learning with a classifier system: initia

In [None]:
# Histogram of cluster sizes. The first (largest) entry of sorted_clusters is
# left out of the plot.
cluster_lenghts = [len(members) for _label, members in sorted_clusters[1:]]
# Total number of papers covered by the plotted clusters.
print(sum(cluster_lenghts))
plt.hist(cluster_lenghts, bins=220)
plt.grid()
# Report the perplexity actually used for the current clustering (the `perp`
# left over from the grid-search loop) instead of a hard-coded value that can
# drift out of sync with the experiment.
plt.title(f"Distribuição do No de artigos. Perp = {perp}")
plt.show()