In [None]:
import os
from glob import glob
import json

# Directory (relative to the current working directory) that is scanned for *.json files.
source_dir = os.path.join("..", "arxiv_papers_infos")
# glob() yields paths in arbitrary, filesystem-dependent order. Sorting makes the
# order stable so positional slices taken later in the notebook select the same
# files on every run and on every machine.
papers_paths = sorted(glob(os.path.join(source_dir, "*.json")))

In [None]:
# Parse every file listed in papers_paths into a Python object, keeping the same order.
paper_data = []
for path in papers_paths:
    # JSON text is UTF-8; without an explicit encoding, open() uses the platform
    # locale (e.g. cp1252 on Windows) and may fail or mis-decode non-ASCII text.
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    paper_data.append(data)
# Report how many files were loaded.
print(len(paper_data))

In [None]:
from sentence_transformers import SentenceTransformer

# Identifier of the checkpoint handed to SentenceTransformer.
# Alternative identifier left disabled in the original cell: "paraphrase-MiniLM-L6-v2".
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)

In [None]:
# Build two parallel lists from the loaded records:
#   docs   - "<authors> <title> <abstract>" text for each record
#   titles - the record's title alone
# Records without a "result" key are skipped entirely.
docs, titles = [], []
for entry in paper_data:
    if "result" in entry:
        info = entry["result"]
        # A missing field contributes an empty string, so the two separating
        # spaces are always present in the combined text.
        author_text = " ".join(info["authors"]) if "authors" in info else ""
        paper_title = info["title"] if "title" in info else ""
        paper_abstract = info["abstract"] if "abstract" in info else ""
        docs.append(" ".join((author_text, paper_title, paper_abstract)))
        titles.append(paper_title)
# Report how many records contributed a document.
print(len(docs))

In [None]:
# Encode only the first 1000 documents (all of them if docs is shorter).
docs_subset = docs[:1000]
embeddings = model.encode(docs_subset)

In [None]:
from sklearn.manifold import TSNE

# Reduce the embeddings to two t-SNE components; the result is used as x/y plot coordinates.
X_tsne = TSNE(
    n_components=2,
    # learning_rate='auto',
    init='random',
    perplexity=30,
    # t-SNE with random initialisation is stochastic. Fixing the seed makes the
    # layout, and anything computed from these coordinates, reproducible.
    random_state=42,
).fit_transform(embeddings)

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import plotly.express as px
from sklearn.cluster import DBSCAN

# Assemble the plotting table: one row per embedded document.
# titles was built in parallel with docs, so slicing it to len(X_tsne) keeps
# each title aligned with its 2-D point.
plt_data = pd.DataFrame(
    {
        "x_tsne": X_tsne[:, 0],
        "y_tsne": X_tsne[:, 1],
        "content": titles[:len(X_tsne)],
    }
)

# Cluster the points in the 2-D t-SNE space.
db = DBSCAN(eps=1.5, min_samples=5).fit(X_tsne)
labels = db.labels_

# Label -1 is counted as noise rather than as a cluster.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
plt_data["label"] = labels

print(f"Estimated number of clusters: {n_clusters_}")
print(f"Estimated number of noise points: {n_noise_}")

fig = px.scatter(
    plt_data,
    x="x_tsne",
    y="y_tsne",
    hover_name="content",
    hover_data=["label"],
    # Cluster ids are categories, not magnitudes. Numeric values would be drawn
    # on a continuous colour scale, so pass them as strings to get one discrete
    # colour per cluster. plt_data["label"] itself stays integer.
    color=[str(label) for label in labels],
)
fig.show()