In [12]:
import torch
from transformers import BertTokenizer, BertModel
import logging
import matplotlib.pyplot as plt
import json
import tqdm


# Single source of truth for the checkpoint so the tokenizer and the model
# can never be loaded from two different identifiers by accident.
MODEL_NAME = "dkleczek/bert-base-polish-cased-v1"

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
# output_hidden_states=True makes the forward pass also return the hidden
# states of every layer, which the embedding code indexes into.
model = BertModel.from_pretrained(MODEL_NAME, output_hidden_states=True)


In [10]:
def get_embedding(text, max_length=None):
    """Return one fixed-size vector representing ``text``.

    The text is encoded with the module-level ``tokenizer`` (which adds the
    ``[CLS]`` / ``[SEP]`` markers itself), passed through the module-level
    ``model``, and the second-to-last hidden layer is averaged over every
    token position, special tokens included.

    Args:
        text: The string to embed.
        max_length: Maximum number of token positions, special tokens
            included. Longer inputs are truncated instead of crashing the
            forward pass. Defaults to the model's
            ``config.max_position_embeddings``.

    Returns:
        A 1-D ``torch.Tensor``: the mean over token positions of the
        second-to-last hidden layer for the single input sequence.
    """
    if max_length is None:
        max_length = model.config.max_position_embeddings

    # Let the tokenizer build input_ids / attention_mask / token_type_ids.
    # Passing them by keyword matters: the second positional parameter of
    # the model's forward is the attention mask, not the segment ids.
    encoded = tokenizer(
        text,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

    with torch.no_grad():
        outputs = model(**encoded)

    # outputs[2] holds the per-layer hidden states; take layer -2 and the
    # first (only) sequence in the batch.
    token_vecs = outputs[2][-2][0]
    sentence_embedding = torch.mean(token_vecs, dim=0)
    return sentence_embedding

In [16]:
# Embed the 'text' field of every record in the JSONL file, in file order.
# Each embedding is stored as a plain list of floats so it can be serialized
# to JSON later.
embeddings = []
with open('polish_annotations.jsonl', 'r', encoding='utf-8') as file:
    for line in file:
        # A blank line (e.g. an extra trailing newline) is not a record;
        # json.loads would raise on it and abort the whole run.
        if not line.strip():
            continue
        data = json.loads(line)
        text = data['text']
        embedding = get_embedding(text)
        embeddings.append(embedding.tolist())

In [17]:
# Persist all collected embeddings as one JSON array (a list of float lists).
with open('text_embeddings.json', 'w', encoding='utf-8') as sink:
    json.dump(embeddings, fp=sink)

# Confirmation message so the cell output shows where the data went.
print("Embeddings saved to text_embeddings.json")

Embeddings saved to text_embeddings.json
