In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
import numpy as np

def tokenize(word):
    """Return the decoded text of every token that ``word`` is split into.

    Relies on the module-level ``tokenizer``. The result is a list with one
    decoded string per input id, in order.
    """
    encoded = tokenizer(word, return_tensors='pt')
    token_ids = encoded['input_ids'][0]
    pieces = []
    for token_id in token_ids:
        pieces.append(tokenizer.decode(token_id))
    return pieces

def cos(a, b):
    """Cosine similarity of two vectors.

    Computed as the dot product divided by the product of the Euclidean
    norms. No special handling for zero-length vectors.
    """
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_product / norm_product

# Checkpoint whose input-embedding matrix supplies the static word vectors.
model_name = 'flax-community/papuGaPT2'
device = 'cpu'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

# categories.txt holds one "category: word word ..." entry per line.
clusters = {}
with open('categories.txt', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        # Split on the first colon only so a stray ':' later in the line
        # does not break the two-way unpacking.
        category, words = line.split(":", 1)
        clusters[category] = words.split()

# Token-embedding matrix: row i is the vector for token id i.
embeddings = model.transformer.wte.weight.detach().cpu().numpy()
N = embeddings.shape[0]

word_embeddings = {}

for category, words in clusters.items():
    for w in words:
        # Encode the whole word (with the leading space the original code
        # used) so that every sub-word token contributes to the average.
        # Previously only the first token was decoded and re-encoded, so
        # multi-token words were represented by their first piece alone.
        token_ids = tokenizer.encode(' ' + w, add_special_tokens=False)
        if not token_ids:
            # Nothing to average; avoid a division by zero.
            continue

        # Mean of the token vectors, accumulated in float64 as before.
        word_embeddings[w] = embeddings[token_ids].mean(axis=0, dtype=np.float64)

# Write "word v1 v2 ..." lines. UTF-8 is explicit because the input file is
# read as UTF-8 and the platform default encoding may differ.
with open('papuga.txt', 'w', encoding='utf-8') as f:
    for word, embedding in word_embeddings.items():
        embedding_str = ' '.join(map(str, embedding))
        f.write(f"{word} {embedding_str}\n")


In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import random
import numpy as np

# Checkpoint and device used by the helpers defined below in this cell.
model_name = "allegro/herbert-base-cased"
device = 'cpu'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model = model.to(device)

def tokenize(word):
    """Return the token ids of ``word`` as a NumPy array.

    Uses the module-level ``tokenizer`` and takes the first row of the
    ``input_ids`` it returns.
    """
    batch = tokenizer(word, return_tensors='pt')
    first_row = batch['input_ids'][0]
    return first_row.detach().numpy()

def embedding_raw(words):
    """Embed a sequence of words and return the vector at the first position.

    The words are joined with single spaces, tokenized with the module-level
    ``tokenizer`` and passed through the module-level ``model``.

    Args:
        words: iterable of strings.

    Returns:
        1-D NumPy array: ``last_hidden_state[0, 0, :]``, i.e. the hidden state
        at sequence position 0 of the single input (presumably the
        tokenizer's leading special token - confirm).
    """
    txt = ' '.join(words)
    input_ids = tokenizer(txt, return_tensors='pt')['input_ids'].to(device)
    # Inference only: skip autograd bookkeeping so no graph is built and
    # kept alive for a result that is detached immediately afterwards.
    with torch.no_grad():
        output = model(input_ids=input_ids)
    return output.last_hidden_state.detach().cpu().numpy()[0, 0, :]

def embedding(word, ctx):
    """Embed ``word`` placed in front of the context words ``ctx``.

    ``ctx`` must be a list, because it is concatenated onto ``[word]``. The
    combined sequence is handed to ``embedding_raw``.
    """
    sequence = [word] + ctx
    return embedding_raw(sequence)

def random_typo(word, alphabet='abcdefghijklmnopqrstuvwxyz'):
    """Replace one randomly chosen character of ``word`` with a different one.

    Args:
        word: string to corrupt.
        alphabet: characters the replacement is drawn from. Defaults to the
            lowercase ASCII letters.

    Returns:
        A string of the same length that differs from ``word`` in exactly one
        position. ``word`` is returned unchanged if it is empty or if
        ``alphabet`` offers no character different from the chosen one.
    """
    chars = list(word)
    if not chars:
        # random.randint(0, -1) would raise ValueError on an empty word.
        return ''
    idx = random.randint(0, len(chars) - 1)
    # Exclude the current character so the result is always a real typo.
    candidates = [c for c in alphabet if c != chars[idx]]
    if not candidates:
        return ''.join(chars)
    chars[idx] = random.choice(candidates)
    return ''.join(chars)

def transposition_typo(word):
    """Swap two randomly chosen, differing characters of ``word``.

    The two positions are drawn uniformly from all pairs of distinct
    positions and need not be adjacent. Pairs holding the same character are
    rejected, because swapping them would return the word unchanged.

    Returns:
        A string with the same characters as ``word`` and exactly two
        positions exchanged. ``word`` is returned unchanged when it contains
        fewer than two distinct characters (no visible swap is possible).
    """
    chars = list(word)
    if len(set(chars)) < 2:
        return ''.join(chars)

    # Rejection sampling terminates because at least one pair differs.
    while True:
        idx1, idx2 = random.sample(range(len(chars)), 2)
        if chars[idx1] != chars[idx2]:
            break

    chars[idx1], chars[idx2] = chars[idx2], chars[idx1]
    return ''.join(chars)

def _load_clusters(path='categories.txt'):
    """Read ``category: word word ...`` lines into a dict of word lists."""
    clusters = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            # Split on the first colon only so the unpacking cannot fail on
            # a second ':' further along the line.
            category, words = line.split(":", 1)
            clusters[category] = words.split()
    return clusters


def _write_embeddings(path, clusters, perturb=None):
    """Write one ``word v1 v2 ...`` line per word to ``path``.

    The embedded text is ``perturb(word)`` when ``perturb`` is given, but the
    line is always labelled with the original word.
    """
    # UTF-8 is explicit: the words are read as UTF-8 and the platform default
    # encoding may not be able to represent them.
    with open(path, 'w', encoding='utf-8') as f:
        for words in clusters.values():
            for word in words:
                query = word if perturb is None else perturb(word)
                # The whole category (original spellings) is the context.
                emb = embedding(query, words)
                emb_str = ' '.join(map(str, emb))
                f.write(f"{word} {emb_str}\n")


def process_clusters(seed=None):
    """Embed every word from categories.txt and write three result files.

    * ``bert.txt`` - embeddings of the words as written,
    * ``bert_random.txt`` - embeddings after ``random_typo``,
    * ``bert_transposition.txt`` - embeddings after ``transposition_typo``.

    All three files use the same ``word v1 v2 ...`` line format with no
    trailing space.

    Args:
        seed: optional seed for the ``random`` module so the generated typos
            are reproducible. ``None`` (default) leaves the generator as is.
    """
    if seed is not None:
        random.seed(seed)

    clusters = _load_clusters()
    _write_embeddings('bert.txt', clusters)
    _write_embeddings('bert_random.txt', clusters, random_typo)
    _write_embeddings('bert_transposition.txt', clusters, transposition_typo)

# Read categories.txt and write bert.txt, bert_random.txt and bert_transposition.txt.
process_clusters()


Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.sso.sso_relationship.bias', 'cls.sso.sso_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

# Checkpoint and device used by the helpers defined below in this cell.
model_name = "allegro/herbert-base-cased"
device = 'cpu'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model = model.to(device)

def tokenize(word):
    """Return the token ids of ``word`` as a NumPy array.

    Uses the module-level ``tokenizer`` and takes the first row of the
    ``input_ids`` it returns.
    """
    batch = tokenizer(word, return_tensors='pt')
    first_row = batch['input_ids'][0]
    return first_row.detach().numpy()

def embedding_raw(words):
    """Embed a sequence of words and return the vector at the first position.

    The words are joined with single spaces, tokenized with the module-level
    ``tokenizer`` and passed through the module-level ``model``.

    Args:
        words: iterable of strings.

    Returns:
        1-D NumPy array: ``last_hidden_state[0, 0, :]``, i.e. the hidden state
        at sequence position 0 of the single input (presumably the
        tokenizer's leading special token - confirm).
    """
    txt = ' '.join(words)
    input_ids = tokenizer(txt, return_tensors='pt')['input_ids'].to(device)
    # Inference only: skip autograd bookkeeping so no graph is built and
    # kept alive for a result that is detached immediately afterwards.
    with torch.no_grad():
        output = model(input_ids=input_ids)
    return output.last_hidden_state.detach().cpu().numpy()[0, 0, :]

def embedding(word, ctx):
    """Embed ``word`` placed in front of the context words ``ctx``.

    ``ctx`` must be a list, because it is concatenated onto ``[word]``. The
    combined sequence is handed to ``embedding_raw``.
    """
    sequence = [word] + ctx
    return embedding_raw(sequence)

def process_clusters(result_file_name):
    """Embed every word from categories.txt and write the vectors to a file.

    categories.txt is expected to hold one ``category: word word ...`` entry
    per line. Each word is embedded with its whole category as context and
    written as a ``word v1 v2 ...`` line. The category name is printed as
    each category is processed.

    Args:
        result_file_name: path of the output file (overwritten).
    """
    clusters = {}
    with open('categories.txt', 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            # Split on the first colon only so the unpacking cannot fail on
            # a second ':' further along the line.
            category, words = line.split(":", 1)
            clusters[category] = words.split()

    # UTF-8 is explicit: the words are read as UTF-8 and the platform default
    # encoding may not be able to represent them.
    with open(result_file_name, 'w', encoding='utf-8') as file:
        for category, words in clusters.items():
            print(category)
            for word in words:
                emb = embedding(word, words)  # Use entire category as context
                # Join with single spaces so lines carry no trailing space.
                emb_str = ' '.join(map(str, emb))
                file.write(f"{word} {emb_str}\n")

# Embed every word listed in categories.txt and store the vectors in bert.txt.
process_clusters('bert.txt')


Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.sso.sso_relationship.bias', 'cls.sso.sso_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


piśmiennicze
małe_ssaki
okręty
lekarze
zupy
uczucia
działy_matematyki
budynki_sakralne
stopień_wojskowy
grzyby_jadalne
prądy_filozoficzne
religie
dzieła_muzyczne
cyfry
owady
broń_biała
broń_palna
komputery
kolory
duchowny
ryby
napoje_mleczne
czynności_sportowe
ubranie
mebel
przestępca
mięso_wędliny
drzewo
źródło_światła
organ
oddziały
napój_alkoholowy
kot_drapieżny
metal
samolot
owoc
pościel
agd


In [None]:
pip install sacremoses

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/897.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [None]:
from scipy.stats import norm

# Upper-tail probability P(Z > -0.65) for a standard normal Z. By symmetry
# this equals the standard normal CDF at 0.65.
# NOTE(review): the name F_09 suggests this is F(0.9) of a non-standard normal
# whose standardized value is 0.65 - confirm against the source problem.
F_09 = 1-norm.cdf(-0.65)
round(F_09, 3)


0.742