In [None]:
import random
from pathlib import Path

import torch
from tqdm.auto import tqdm
from transformers import AutoModel, AutoTokenizer

from src.document import Document

In [None]:
# Directory holding the raw documents; each *.json file is loaded as one Document.
data_dir = Path("../data/raw/text")

# glob() yields paths in arbitrary, filesystem-dependent order. Sort them so the
# document order (and anything derived from it) is the same on every run.
files = sorted(data_dir.glob("*.json"))

documents = [Document.load_raw(file) for file in tqdm(files)]

Load the tokenizer and model


In [None]:
# The tokenizer and the encoder are loaded from the same checkpoint, so name it once.
checkpoint = "sentence-transformers/all-mpnet-base-v2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

# Put the model in evaluation mode (inference only, no training in this notebook).
# As the last expression of the cell, this also displays the model.
model.eval()

In [None]:
MIN_TOKENS = 5  # drop very short fragments
MAX_MODEL_TOKENS = 512  # upper bound on the encoded input length, special tokens included
SAMPLE_SIZE = 10_000  # number of sentences to embed
SAMPLE_SEED = 42  # fixed seed so the same sentences are drawn on every run

# Flatten all documents into one list of candidate sentences.
sentences = [sentence for document in documents for sentence in document.sentences]

# tokenize() does not count the special tokens that encode(..., add_special_tokens=True)
# adds later, so reserve room for them when applying the upper bound.
max_content_tokens = MAX_MODEL_TOKENS - tokenizer.num_special_tokens_to_add()

# Keep sentences with at least MIN_TOKENS tokens that still fit into the model input.
sentences = [
    sentence
    for sentence in tqdm(sentences)
    if MIN_TOKENS <= len(tokenizer.tokenize(sentence)) <= max_content_tokens
]

# Draw a reproducible random sample. Cap the sample size at the number of available
# sentences, because random.sample raises ValueError when asked for more than exist.
sentences = random.Random(SAMPLE_SEED).sample(
    sentences, min(SAMPLE_SIZE, len(sentences))
)

In [None]:
# Compute one embedding vector per sentence by mean-pooling the model's first output
# over the token dimension.
embeddings = []

# Inference only: disable autograd so no computation graph is kept alive for each stored
# embedding, and so the resulting tensors can be converted to NumPy afterwards.
with torch.no_grad():
    for sentence in tqdm(sentences):
        # truncation=True guards against inputs longer than the tokenizer's maximum length.
        token_ids = tokenizer.encode(sentence, add_special_tokens=True, truncation=True)
        input_ids = torch.tensor([token_ids])  # add a batch dimension of size 1

        # model(...)[0] holds the per-token outputs for the single batch item;
        # drop the batch dimension, then average over the tokens.
        token_states = model(input_ids)[0].squeeze(0)
        sentence_embedding = token_states.mean(dim=0)

        embeddings.append(sentence_embedding)

Reduce the embedding dimensionality to two dimensions using UMAP.


In [None]:
from umap import UMAP

In [None]:
# Stack the per-sentence vectors into one 2-D array. Use a new name instead of
# overwriting `embeddings`, so this cell can be re-run without failing on an
# already-stacked tensor.
# detach() makes the NumPy conversion work even if the vectors carry autograd state.
embedding_matrix = torch.stack(embeddings).detach().cpu().numpy()

# Project the embeddings down to two dimensions for plotting.
# A fixed random_state makes the layout reproducible across runs.
reducer = UMAP(n_components=2, random_state=42)
embeddings_2d = reducer.fit_transform(embedding_matrix)

Use Plotly to visualize the 2D embeddings interactively; hover over a point to see its sentence.


In [None]:
import pandas as pd
import plotly.express as px

In [None]:
# Put the 2-D coordinates and their source sentences into one table for plotting.
x_coords = embeddings_2d[:, 0]
y_coords = embeddings_2d[:, 1]
df = pd.DataFrame({"x": x_coords, "y": y_coords, "sentence": sentences})

# Show only the sentence text in the hover tooltip; hide the raw coordinates.
hover_fields = {"x": False, "y": False, "sentence": True}

fig = px.scatter(df, x="x", y="y", hover_data=hover_fields)
fig.show()