# Sentence transformer embedding & embedding utils showcase

In [19]:
import sentence_topology as st
import numpy as np
from tqdm import tqdm
import os

Loading the corpus

In [2]:
# Materialize the loaded corpus into a list right away: later cells slice it
# and take its length.
corpus = list(st.utils.load_corpus("../data/COSTRA1.1.tsv"))

Each sentence is a named tuple

In [3]:
# Preview the first ten corpus entries, leaving an empty line after each one.
for sentence in corpus[:10]:
    print(sentence)
    print()

Generating embeddings with a given sentence transformer

See available models at [sentence_transformers' list of pretrained models](https://www.sbert.net/docs/pretrained_models.html).
We are interested in the multilingual ones.

In [5]:
# Name of the pretrained sentence-transformers model to embed the corpus with
# (one of the multilingual models from the list linked above).
model = "paraphrase-multilingual-MiniLM-L12-v2"
# Compute one embedding per corpus entry with the chosen model.
# NOTE(review): verbose=True presumably enables progress reporting — confirm
# against st.sentence_transformers.get_embeddings.
embeddings = st.sentence_transformers.get_embeddings(corpus, model, verbose=True)

In [6]:
# Report how many embeddings were produced, then show the first ten of them.
embeddings_preview = embeddings[:10]
print(len(embeddings))
for embed in embeddings_preview:
    print(embed)

6968
CostraEmbedding(id=0, seed_id=1, trans='ban', embedding=array([ 2.13315725e-01, -1.76641881e-01, -4.27546382e-01,  2.22908124e-01,
        1.91439152e-01,  5.26094884e-02, -2.55386233e-01, -2.51422971e-01,
        2.60699630e-01,  8.56404826e-02, -1.47073925e-01,  3.54264647e-01,
        1.73849970e-01,  5.25524132e-02,  1.05171554e-01, -1.81667611e-01,
        1.02410041e-01,  7.85607286e-03, -1.41937509e-01, -3.90075259e-02,
        2.32116863e-01, -3.91427204e-02,  5.12557745e-01, -1.00622535e-01,
       -6.39140010e-02,  3.47778611e-02, -2.41077527e-01, -2.89054722e-01,
       -1.14380941e-01,  2.40409113e-02, -1.22253962e-01, -1.67624637e-01,
        2.62787819e-01, -2.21327338e-02,  3.15345407e-01, -8.30564648e-02,
       -1.85983047e-01,  2.98437566e-01,  1.31037891e-01, -1.18623823e-01,
        2.53063103e-04,  2.83281714e-01, -2.07946718e-01,  1.58157527e-01,
       -1.97815314e-01,  4.52092141e-02, -2.33051240e-01, -5.44373617e-02,
       -1.23606645e-01, -3.30503315e-01

Saving the embeddings

In [15]:
# Persist the generated embeddings to ./embeddings.tsv (path is relative to the
# notebook's working directory); the file is read back and deleted further down.
st.utils.save_embeddings(embeddings, "./embeddings.tsv")

Loading the embeddings we just saved

In [16]:
# Read back the file written by the save step so it can be compared with the
# in-memory `embeddings`.
# NOTE(review): the return type is not visible here (list vs. lazy iterator) — confirm.
loaded_embeddings = st.utils.load_embedding("./embeddings.tsv")

Testing that the loaded embeddings match the generated ones

In [17]:
# Round-trip check: every embedding written to disk must come back unchanged.
# Materialize the loaded records first so their count can be verified even if
# the loader returned a lazy iterator.
loaded_records = list(loaded_embeddings)

# zip() stops at the shorter input, so without this check a file with missing
# or extra rows would pass the comparison below unnoticed.
assert len(loaded_records) == len(embeddings), (
    f"expected {len(embeddings)} embeddings, loaded {len(loaded_records)}"
)

for generated, loaded in tqdm(
    zip(embeddings, loaded_records),
    desc="Checking equality",
    total=len(embeddings),  # progress total taken from the sequence being iterated
):
    assert generated.id == loaded.id
    assert generated.seed_id == loaded.seed_id
    assert generated.trans == loaded.trans
    # array_equal compares shape as well as values, so a shape mismatch fails
    # instead of being broadcast by `==`.
    assert np.array_equal(generated.embedding, loaded.embedding)

Checking equality: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6968/6968 [00:00<00:00, 7514.59it/s]


Removing the saved embeddings file

In [20]:
# Clean up: delete the temporary embeddings file created earlier in the notebook.
os.unlink("./embeddings.tsv")