In [1]:
import os
import warnings

# Silence known FutureWarnings; the filters stay ahead of the third-party
# imports so they are already registered when those modules load.
warnings.filterwarnings("ignore", category=FutureWarning, module="huggingface_hub.file_download")
warnings.filterwarnings("ignore", category=FutureWarning, module="transformers.utils.generic")
warnings.filterwarnings("ignore", category=FutureWarning, module="transformers.modeling_utils")

import numpy as np
from scipy.spatial.distance import cosine
import torch
import transformers
from transformers import LlamaTokenizer, LlamaModel
from gensim.models import KeyedVectors, Word2Vec

from nlp import (
    get_llama_embedding,
    get_word2vec_embedding,
    get_avg_embedding,
    is_normalized,
    normalize_rows,
)
from config import load_config

In [2]:
# load_config() returns a (config, secrets) pair; later cells read
# config.transformer.model_name and secrets.huggingface_token from it.
config, secrets = load_config()

In [3]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
# NOTE(review): no other cell visible here reads `device`; the model is loaded
# with device_map='auto' instead — confirm whether this is still needed.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# Load the tokenizer for the checkpoint named in the config, passing the
# Hugging Face token from the secrets for authentication.
tokenizer = transformers.LlamaTokenizer.from_pretrained(config.transformer.model_name, token=secrets.huggingface_token, legacy=False)
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): presumably the checkpoint defines no pad token of its own — confirm.
tokenizer.pad_token = tokenizer.eos_token

In [5]:
# Show which checkpoint the config points at.
config.transformer.model_name

'openlm-research/open_llama_3b_v2'

In [6]:
# Load the checkpoint as a LlamaModel with float16 weights. device_map='auto'
# is passed, so device placement is delegated to the library rather than done
# by hand in this notebook.
model = transformers.LlamaModel.from_pretrained(
    config.transformer.model_name,
    torch_dtype=torch.float16,
    device_map='auto',
    token=secrets.huggingface_token
)
# Disabled: the pad token is configured on the tokenizer (tokenizer.pad_token)
# rather than on the model config.
#model.config.pad_token_id = model.config.eos_token_id

In [8]:
# Display the module tree of the loaded model.
model

LlamaModel(
  (embed_tokens): Embedding(32000, 3200, padding_idx=0)
  (layers): ModuleList(
    (0-25): 26 x LlamaDecoderLayer(
      (self_attn): LlamaSdpaAttention(
        (q_proj): Linear(in_features=3200, out_features=3200, bias=False)
        (k_proj): Linear(in_features=3200, out_features=3200, bias=False)
        (v_proj): Linear(in_features=3200, out_features=3200, bias=False)
        (o_proj): Linear(in_features=3200, out_features=3200, bias=False)
        (rotary_emb): LlamaRotaryEmbedding()
      )
      (mlp): LlamaMLP(
        (gate_proj): Linear(in_features=3200, out_features=8640, bias=False)
        (up_proj): Linear(in_features=3200, out_features=8640, bias=False)
        (down_proj): Linear(in_features=8640, out_features=3200, bias=False)
        (act_fn): SiLU()
      )
      (input_layernorm): LlamaRMSNorm()
      (post_attention_layernorm): LlamaRMSNorm()
    )
  )
  (norm): LlamaRMSNorm()
)

In [7]:
# Load the GoogleNews word2vec vectors (binary word2vec format) from the
# current user's ~/.cache directory. "~" is expanded at run time so the cell
# is not tied to one particular home directory.
word2vec_model = KeyedVectors.load_word2vec_format(
    os.path.expanduser("~/.cache/GoogleNews-vectors-negative300.bin.gz"),
    binary=True,
)

In [8]:
def build_vocabulary(words, topn=1000, keyed_vectors=None):
    """Collect the seed words plus their nearest word2vec neighbours.

    Args:
        words: Iterable of seed words. Every seed is kept in the result.
        topn: Number of neighbours requested for each seed word.
        keyed_vectors: Object exposing ``most_similar(word, topn=...)`` that
            returns ``(token, score)`` pairs. Defaults to the notebook-level
            ``word2vec_model``.

    Returns:
        list: Unique words in first-seen order: each seed, followed by its
        neighbours in the order ``most_similar`` returned them.

    Raises:
        Whatever ``most_similar`` raises for a word it cannot look up.
    """
    if keyed_vectors is None:
        keyed_vectors = word2vec_model

    # A dict preserves insertion order, so de-duplicating through it yields the
    # same list on every run. list(set(...)) of strings does not: string
    # hashing is randomised per process, so the order changes between kernel
    # restarts.
    ordered = {}
    for word in words:
        ordered[word] = None
        for token, _ in keyed_vectors.most_similar(word, topn=topn):
            ordered[token] = None
    return list(ordered)

In [9]:
def word_similarity(composition, vocabulary, add_special_tokens, normalized, aggregation):
    """Rank a vocabulary by cosine similarity to a word-vector analogy.

    Every word is embedded with ``get_llama_embedding`` using the
    notebook-level ``model`` and ``tokenizer``. The analogy vector is
    ``composition[0] - composition[1] + composition[2]`` (for example
    king - man + woman), and each vocabulary word is scored against it.

    Args:
        composition: Sequence whose first three words ``(a, b, c)`` are
            combined as ``a - b + c``. A composition word that is not in
            ``vocabulary`` is embedded on demand and is not added to the
            ranking.
        vocabulary: Iterable of candidate words to rank.
        add_special_tokens: Forwarded unchanged to ``get_llama_embedding``.
        normalized: Forwarded unchanged to ``get_llama_embedding``.
        aggregation: Forwarded unchanged to ``get_llama_embedding``.

    Returns:
        tuple: ``(sorted_text, sorted_values)``, two tuples holding the words
        and their cosine similarities, ordered from most to least similar.
        Both are empty when ``vocabulary`` is empty.
    """
    def embed(word):
        vector, _ = get_llama_embedding(
            word, model, tokenizer,
            add_special_tokens=add_special_tokens,
            normalized=normalized,
            aggregation=aggregation
        )
        # The model is loaded in float16; do the vector arithmetic and the
        # cosine in float64 so a - b + c and the dot products keep precision.
        return np.asarray(vector, dtype=np.float64)

    # Compute the embeddings for the requested tokenizer/model parameters.
    llama_embeddings = {word: embed(word) for word in vocabulary}

    def lookup(word):
        # Reuse the vocabulary embedding when there is one; otherwise embed the
        # composition word on the fly instead of failing with a KeyError.
        if word in llama_embeddings:
            return llama_embeddings[word]
        return embed(word)

    # Compositionality check, e.g. king - man + woman ~ queen, on the llama model.
    llama_composition = lookup(composition[0]) - lookup(composition[1]) + lookup(composition[2])

    # Similarity between the composed vector and every vocabulary word.
    scored = [
        (word, 1 - cosine(llama_composition, embedding))
        for word, embedding in llama_embeddings.items()
    ]
    if not scored:
        return (), ()

    # Order the words from the most similar to the least similar; the sort is
    # stable, so ties keep their vocabulary order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    sorted_text, sorted_values = zip(*scored)

    return sorted_text, sorted_values

In [56]:
# Target word whose rank in the analogy result is tracked.
myword = "queen"
# Seed words for the vocabulary. myword is appended as well, so "queen" is
# listed twice here; build_vocabulary de-duplicates its result, but a loop over
# mywords itself visits the word twice.
mywords = ["king", "man", "woman", "queen", "men", "women", "ceo", myword]
# Vocabulary: the seed words plus the 100 word2vec neighbours requested for each.
myvocab = build_vocabulary(mywords, 100)
# Analogy operands; word_similarity combines them as king - man + woman.
mycompo = ["king", "man", "woman"]

In [57]:
# Sweep every combination of embedding settings and record where myword lands
# in the analogy ranking. Each row of `results` is
# [add_special_tokens, normalized, aggregation, rank, first embedding component].
# `text` and `vals` keep the values of the last combination after the loop.
results = []
for st in [True, False]:  # add_special_tokens
    for nor in [True, False]:  # normalized
        for agg in ["mean", "sum"]:  # aggregation
            text, vals = word_similarity(mycompo, myvocab, st, nor, agg)
            # Keyword arguments, as in the call inside word_similarity, so the
            # probe embedding cannot silently use different settings from the
            # ranking if the helper's parameter order ever changes.
            emb, tkns = get_llama_embedding(
                myword, model, tokenizer,
                add_special_tokens=st,
                normalized=nor,
                aggregation=agg,
            )
            # 1-based position of myword in the ranking.
            pos = text.index(myword)+1
            # emb[0] is the first component of myword's embedding under these settings.
            res = [st, nor, agg, pos, emb[0]]
            print(res)
            results.append(res)

[True, True, 'mean', 5, 0.00512]
[True, True, 'sum', 5, 0.01024]
[True, False, 'mean', 5, 0.001129]
[True, False, 'sum', 5, 0.002258]
[False, True, 'mean', 6, -0.0092]
[False, True, 'sum', 6, -0.0092]
[False, False, 'mean', 6, -0.00806]
[False, False, 'sum', 6, -0.00806]


In [58]:
# Check how the embedding norm behaves for words that split into several tokens.
# In the recorded output the norm drops below 1 for multi-token words under
# mean aggregation, even though normalized=True.
# NOTE(review): presumably each token vector is normalized before averaging —
# confirm in nlp.get_llama_embedding.
for word in mywords + ["prettydupperfuckinlicious"]:
    emb, token = get_llama_embedding(word, model, tokenizer, add_special_tokens=False, normalized=True, aggregation="mean")
    print(word, token, np.linalg.norm(emb, axis=-1))

king 7367 0.9995
man 599 1.0
woman 3318 1.0
queen 16502 0.9995
men 1853 0.9995
women 2022 0.9995
ceo [ 6288 29504] 0.712
queen 16502 0.9995
prettydupperfuckinlicious [ 2615   590  2607 29517  1430   261   580   775] 0.4001


In [None]:
# Top-10 (word, similarity) pairs. text and vals hold whatever the settings
# sweep assigned last, i.e. add_special_tokens=False, normalized=False,
# aggregation="sum".
list(zip(text[:10], vals[:10]))