# Embeddings

Let's load a dataset from the Hugging Face Hub.

In [6]:
import datasets

# Load the "test" split of the corpus, rename column "EN" to "text" (the name
# every later cell reads) and drop column "ES", as a single method chain.
ds = (
    datasets.load_dataset("loresiensis/corpus-en-es", split="test")
    .rename_column("EN", "text")
    .remove_columns(["ES"])
)

Clean the dataset with common NLP preprocessing steps

In [8]:
from utils import clean_text

# Run the project's clean_text helper over the dataset. batched=True makes
# `map` hand clean_text a batch of examples per call instead of a single row.
# NOTE(review): clean_text lives in utils and is not visible here; the later
# df.explode("text") suggests it returns a list per example in "text" — confirm.
ds = ds.map(clean_text, batched=True)

Map:   0%|          | 0/1049 [00:00<?, ? examples/s]

Map: 100%|██████████| 1049/1049 [00:00<00:00, 4017.54 examples/s]


Load the Sentence Transformer embedding model

In [None]:
from sentence_transformers import SentenceTransformer
# Instantiate the sentence-embedding model identified by
# 'sentence-transformers/all-mpnet-base-v2'; `model` is used further down to
# encode the cleaned text.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

Transform to pandas DataFrame

In [11]:
# Build a flat, de-duplicated frame with one "text" value per row.
# Every step returns a new frame, so nothing is mutated in place.
df = (
    ds.to_pandas()
    .explode("text", ignore_index=True)  # one row per element of list-like "text" values
    .dropna()                            # discard rows containing missing values
    .drop_duplicates()                   # discard exact duplicate rows
    .reset_index(drop=True)              # renumber rows 0..n-1 after the drops
)

Compute the sentence embeddings

In [None]:
# Encode every cleaned sentence into a dense vector, 64 sentences per batch.
# encode() is documented to take a string or a list of strings. A pandas
# Series is indexed by label, so passing the Series directly only works while
# the index is exactly 0..n-1. Converting to a plain list makes the call
# independent of the frame's index.
embeddings = model.encode(df["text"].tolist(), batch_size=64, show_progress_bar=True)

Save embeddings and labels

In [None]:
from pathlib import Path

import pandas as pd

# Export the embeddings and their source texts as two tab-separated files.
OUTPUT_DIR = Path("data/processed")
MAX_EXPORT_ROWS = 5000  # only the first MAX_EXPORT_ROWS rows of each are written

# to_csv does not create missing directories, so make sure the target exists;
# otherwise this cell fails on a fresh checkout.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# NOTE(review): both files are written with a header row (pandas default). If
# the consumer expects headerless TSV, pass header=False — confirm.
embeddings_df = pd.DataFrame(embeddings)
embeddings_df.head(MAX_EXPORT_ROWS).to_csv(OUTPUT_DIR / "embeddings.tsv", sep="\t", index=False)
df["text"].head(MAX_EXPORT_ROWS).to_csv(OUTPUT_DIR / "labels.tsv", sep="\t", index=False)

In [None]:
# NOTE(review): looks like a leftover scratch cell — `a` is not referenced
# anywhere else in the visible notebook; consider deleting this cell.
a = 5