In [None]:
from pathlib import Path

import pandas as pd
import plotly.express as px
import torch
from tqdm.auto import tqdm
from transformers import AutoModel, AutoTokenizer
from umap import UMAP

from src.document import Document

In [None]:
# Directory holding the processed documents, one JSON file per document.
data_dir = Path("../data/processed")

# Number of documents to load; kept small so the embedding step stays quick.
n_docs = 5

# `Path.glob` yields entries in whatever order the filesystem returns them,
# so sort before slicing: the same `n_docs` files are then selected on every
# run and on every machine.
files = sorted(data_dir.glob("*.json"))[:n_docs]
documents = [Document.load(file) for file in files]

In [None]:
# Tokenizer and encoder are both fetched from the same pretrained checkpoint,
# so the identifier is written once and shared between the two loaders.
checkpoint = "sentence-transformers/all-mpnet-base-v2"
tokenizer, model = (
    AutoTokenizer.from_pretrained(checkpoint),
    AutoModel.from_pretrained(checkpoint),
)

In [None]:
# Embed every sentence in every document.
# `all_sentences` is the flat list of sentences across all loaded documents;
# `embeddings` collects one vector per sentence, in the same order.
embeddings = []
all_sentences = [sentence for document in documents for sentence in document.sentences]
for sentence in tqdm(all_sentences):
    # Tokenize the sentence. `truncation=True` clips inputs to the tokenizer's
    # configured maximum length, so an unusually long sentence cannot exceed
    # the model's position limit and crash the forward pass.
    tokens = tokenizer.encode(sentence, add_special_tokens=True, truncation=True)

    # Wrap the token ids in an outer list so the tensor has a leading
    # dimension of size 1 (a batch containing just this sentence).
    input_ids = torch.tensor([tokens])

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        # Take the model's first output, drop the size-1 leading dimension and
        # average over the first remaining axis (presumably the token axis --
        # i.e. mean pooling; confirm against the model's output layout).
        sentence_embedding = model(input_ids)[0].squeeze(0).mean(dim=0)

    # Append the sentence embedding to the list
    embeddings.append(sentence_embedding)

In [None]:
# Sanity check: display how many entries `embeddings` currently holds.
len(embeddings)

In [None]:
# Convert the embeddings to a single tensor with one row per sentence.
# `list(...)` makes this cell safe to re-run: iterating an already-stacked
# tensor yields its rows, so stacking them again reproduces the same tensor
# instead of failing on the second execution.
embeddings = torch.stack(list(embeddings))

# Reduce the dimensionality of the embeddings to 2 components for plotting.
# UMAP is stochastic; fixing `random_state` makes the layout reproducible
# across runs (umap-learn trades parallelism for determinism when it is set).
umap = UMAP(n_components=2, random_state=42)

# Hand UMAP an explicit NumPy array rather than relying on implicit
# tensor-to-array conversion.
umap_embeddings = umap.fit_transform(embeddings.cpu().numpy())

# Split the two projected components into plotting coordinates.
x, y = umap_embeddings[:, 0], umap_embeddings[:, 1]

In [None]:
# Interactive scatter plot of the 2-D projection; hovering over a point shows
# the sentence it represents. `pd` and `px` come from the notebook's import
# cell at the top.

# The slice keeps the "sentence" column the same length as the coordinate
# columns if fewer points than sentences are available.
df = pd.DataFrame({"x": x, "y": y, "sentence": all_sentences[: len(x)]})

fig = px.scatter(
    df,
    x="x",
    y="y",
    # Hide the raw coordinates in the tooltip; only the sentence text is useful.
    hover_data={"x": False, "y": False, "sentence": True},
    title="UMAP projection of sentence embeddings",
)
fig.show()