# **Сравнение эмбеддингов GloVe и BERT**

## **Импорт библиотек**

Для работы с эмбеддингами мы будем использовать библиотеку `flair`, разработанную Берлинским университетом имени Гумбольдта. `flair` имеет простой интерфейс, который позволяет использовать и комбинировать различные эмбеддинги слов и документов.

In [1]:
# !pip install flair -q

In [2]:
import numpy as np
from flair.embeddings import WordEmbeddings
from flair.embeddings import TransformerWordEmbeddings
from flair.data import Sentence
from scipy.spatial import distance

## **Пример эмбеддинга "GloVe"**

Классические модели эмбеддингов, такие как *Word2Vec* слов предполагают, что каждое отдельное слово получает ровно один предварительно вычисленный вектор. Большинство эмбеддингов подпадают под этот класс, включая популярные эмбеддинги **GloVe** или **Komninos**.

Проилюстрируем работу *Word2Vec* с помощью эмбеддингов **GloVe**.

**GloVe** — это алгоритм обучения без учителя для получения векторных представлений слов. В то время, как *Word2Vec* фиксирует, появляются ли слова в похожих контекстах, **GloVe** фокусируется на совпадении слов во всем корпусе. Его эмбеддинги связаны с вероятностью того, что два слова появятся вместе.

In [3]:
# Скачаем заранее изученные эмбеддинги
glove_embedding = WordEmbeddings('glove')

2024-08-08 17:42:32,584 https://flair.informatik.hu-berlin.de/resources/embeddings/token/glove.gensim.vectors.npy not found in cache, downloading to /tmp/tmp8_lmo8z2


100%|██████████| 153M/153M [00:06<00:00, 24.3MB/s]

2024-08-08 17:42:39,498 copying /tmp/tmp8_lmo8z2 to cache at /root/.flair/embeddings/glove.gensim.vectors.npy





2024-08-08 17:42:40,076 removing temp file /tmp/tmp8_lmo8z2
2024-08-08 17:42:40,470 https://flair.informatik.hu-berlin.de/resources/embeddings/token/glove.gensim not found in cache, downloading to /tmp/tmpp5pf7fd3


100%|██████████| 20.5M/20.5M [00:01<00:00, 13.1MB/s]

2024-08-08 17:42:42,445 copying /tmp/tmpp5pf7fd3 to cache at /root/.flair/embeddings/glove.gensim
2024-08-08 17:42:42,468 removing temp file /tmp/tmpp5pf7fd3





### **1е предложение**

In [43]:
sentence_1 = Sentence("The light in the room was too bright, and I couldn't read")

 Элементы результирующего вектора называются ***токенами***.



In [44]:
glove_embedding.embed(sentence_1)

[Sentence[14]: "The light in the room was too bright, and I couldn't read"]

In [45]:
for token in sentence_1:
    print(token)
    print(token.embedding.shape, '\n')

Token[0]: "The"
torch.Size([100]) 

Token[1]: "light"
torch.Size([100]) 

Token[2]: "in"
torch.Size([100]) 

Token[3]: "the"
torch.Size([100]) 

Token[4]: "room"
torch.Size([100]) 

Token[5]: "was"
torch.Size([100]) 

Token[6]: "too"
torch.Size([100]) 

Token[7]: "bright"
torch.Size([100]) 

Token[8]: ","
torch.Size([100]) 

Token[9]: "and"
torch.Size([100]) 

Token[10]: "I"
torch.Size([100]) 

Token[11]: "could"
torch.Size([100]) 

Token[12]: "n't"
torch.Size([100]) 

Token[13]: "read"
torch.Size([100]) 



Выберем первое слово нашего предложения и посмотрим на его векторное представление

In [46]:
print(sentence_1[1], sentence_1[1].embedding, sep="\n")

Token[1]: "light"
tensor([-0.0057,  0.4809, -0.1462, -0.1068,  0.1280, -0.2933,  0.1540,  0.4191,
        -0.5845, -0.0882,  0.2701, -0.6978,  0.2960,  0.1466,  0.7739, -0.7541,
        -0.1105, -0.1999,  0.8344, -0.5336,  0.3413, -0.9155, -0.2100, -0.4984,
         0.3633,  0.7514,  0.2243,  0.0499,  0.0130,  0.0514, -0.1493, -0.2122,
        -0.6939,  0.0937, -0.1015,  0.2140,  0.0152, -0.1494,  0.6456,  0.3025,
        -0.2018, -0.9830, -0.3830, -0.5775,  0.0785,  0.0308,  0.0819,  0.2499,
         0.4438, -0.9815,  0.5671,  0.1875,  0.2845,  1.2965,  0.1928, -2.2519,
         0.2619,  0.6577,  1.7942, -0.4435,  0.1692,  1.1139, -0.6808,  0.7017,
         0.4602, -0.1253,  0.3981, -0.5240, -0.2695, -0.2514,  0.4031, -0.7773,
         0.7515,  0.1175,  0.7159,  0.0934,  0.4862, -0.3857, -0.5227,  0.0262,
         0.3681, -0.2359, -0.4491,  0.7412, -0.8613, -0.2482, -0.0351, -0.2383,
         0.9937,  0.0170,  0.0811, -0.4658, -0.6330,  0.2032, -0.7355,  0.3726,
        -0.8211, -0.41

In [47]:
sentence_1[1].embedding.shape

torch.Size([100])

### **2е предложение**

In [48]:
sentence_2 = Sentence('She always radiates light and joy to everyone around her')

In [49]:
glove_embedding.embed(sentence_2)

[Sentence[10]: "She always radiates light and joy to everyone around her"]

In [50]:
for token in sentence_2:
    print(token)
    print(token.embedding.shape, '\n')

Token[0]: "She"
torch.Size([100]) 

Token[1]: "always"
torch.Size([100]) 

Token[2]: "radiates"
torch.Size([100]) 

Token[3]: "light"
torch.Size([100]) 

Token[4]: "and"
torch.Size([100]) 

Token[5]: "joy"
torch.Size([100]) 

Token[6]: "to"
torch.Size([100]) 

Token[7]: "everyone"
torch.Size([100]) 

Token[8]: "around"
torch.Size([100]) 

Token[9]: "her"
torch.Size([100]) 



In [51]:
print(sentence_2[3], sentence_2[3].embedding, sep="\n")

Token[3]: "light"
tensor([-0.0057,  0.4809, -0.1462, -0.1068,  0.1280, -0.2933,  0.1540,  0.4191,
        -0.5845, -0.0882,  0.2701, -0.6978,  0.2960,  0.1466,  0.7739, -0.7541,
        -0.1105, -0.1999,  0.8344, -0.5336,  0.3413, -0.9155, -0.2100, -0.4984,
         0.3633,  0.7514,  0.2243,  0.0499,  0.0130,  0.0514, -0.1493, -0.2122,
        -0.6939,  0.0937, -0.1015,  0.2140,  0.0152, -0.1494,  0.6456,  0.3025,
        -0.2018, -0.9830, -0.3830, -0.5775,  0.0785,  0.0308,  0.0819,  0.2499,
         0.4438, -0.9815,  0.5671,  0.1875,  0.2845,  1.2965,  0.1928, -2.2519,
         0.2619,  0.6577,  1.7942, -0.4435,  0.1692,  1.1139, -0.6808,  0.7017,
         0.4602, -0.1253,  0.3981, -0.5240, -0.2695, -0.2514,  0.4031, -0.7773,
         0.7515,  0.1175,  0.7159,  0.0934,  0.4862, -0.3857, -0.5227,  0.0262,
         0.3681, -0.2359, -0.4491,  0.7412, -0.8613, -0.2482, -0.0351, -0.2383,
         0.9937,  0.0170,  0.0811, -0.4658, -0.6330,  0.2032, -0.7355,  0.3726,
        -0.8211, -0.41

In [52]:
sentence_2[3].embedding.shape

torch.Size([100])

### **Расстояние между словом light в двух предложениях**

In [54]:
glove_dst = distance.euclidean(
    np.array(sentence_1[1].embedding),
    np.array(sentence_2[3].embedding)
    )

print("Distance between 'light' embeddings for Glove = {}".format(glove_dst))

Distance between 'light' embeddings for Glove = 0.0


## **Эмбеддинг BERT**

In [28]:
bert_embedding = TransformerWordEmbeddings('bert-base-multilingual-cased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

### **1е предложение**

In [55]:
bert_embedding.embed(sentence_1)
for token in sentence_1:
    print(token)
    print(token.embedding.shape, '\n')

Token[0]: "The"
torch.Size([868]) 

Token[1]: "light"
torch.Size([868]) 

Token[2]: "in"
torch.Size([868]) 

Token[3]: "the"
torch.Size([868]) 

Token[4]: "room"
torch.Size([868]) 

Token[5]: "was"
torch.Size([868]) 

Token[6]: "too"
torch.Size([868]) 

Token[7]: "bright"
torch.Size([868]) 

Token[8]: ","
torch.Size([868]) 

Token[9]: "and"
torch.Size([868]) 

Token[10]: "I"
torch.Size([868]) 

Token[11]: "could"
torch.Size([868]) 

Token[12]: "n't"
torch.Size([868]) 

Token[13]: "read"
torch.Size([868]) 



In [57]:
print(sentence_1[1], sentence_1[1].embedding[:100], sep='\n')

Token[1]: "light"
tensor([-0.0057,  0.4809, -0.1462, -0.1068,  0.1280, -0.2933,  0.1540,  0.4191,
        -0.5845, -0.0882,  0.2701, -0.6978,  0.2960,  0.1466,  0.7739, -0.7541,
        -0.1105, -0.1999,  0.8344, -0.5336,  0.3413, -0.9155, -0.2100, -0.4984,
         0.3633,  0.7514,  0.2243,  0.0499,  0.0130,  0.0514, -0.1493, -0.2122,
        -0.6939,  0.0937, -0.1015,  0.2140,  0.0152, -0.1494,  0.6456,  0.3025,
        -0.2018, -0.9830, -0.3830, -0.5775,  0.0785,  0.0308,  0.0819,  0.2499,
         0.4438, -0.9815,  0.5671,  0.1875,  0.2845,  1.2965,  0.1928, -2.2519,
         0.2619,  0.6577,  1.7942, -0.4435,  0.1692,  1.1139, -0.6808,  0.7017,
         0.4602, -0.1253,  0.3981, -0.5240, -0.2695, -0.2514,  0.4031, -0.7773,
         0.7515,  0.1175,  0.7159,  0.0934,  0.4862, -0.3857, -0.5227,  0.0262,
         0.3681, -0.2359, -0.4491,  0.7412, -0.8613, -0.2482, -0.0351, -0.2383,
         0.9937,  0.0170,  0.0811, -0.4658, -0.6330,  0.2032, -0.7355,  0.3726,
        -0.8211, -0.41

In [58]:
sentence_1[1].embedding.shape

torch.Size([868])

### **2е предложение**

In [59]:
bert_embedding.embed(sentence_2)

for token in sentence_2:
    print(token)
    print(token.embedding.shape, '\n')

Token[0]: "She"
torch.Size([868]) 

Token[1]: "always"
torch.Size([868]) 

Token[2]: "radiates"
torch.Size([868]) 

Token[3]: "light"
torch.Size([868]) 

Token[4]: "and"
torch.Size([868]) 

Token[5]: "joy"
torch.Size([868]) 

Token[6]: "to"
torch.Size([868]) 

Token[7]: "everyone"
torch.Size([868]) 

Token[8]: "around"
torch.Size([868]) 

Token[9]: "her"
torch.Size([868]) 



In [60]:
print(sentence_2[3], sentence_2[3].embedding[:100], sep='\n')

Token[3]: "light"
tensor([-0.0057,  0.4809, -0.1462, -0.1068,  0.1280, -0.2933,  0.1540,  0.4191,
        -0.5845, -0.0882,  0.2701, -0.6978,  0.2960,  0.1466,  0.7739, -0.7541,
        -0.1105, -0.1999,  0.8344, -0.5336,  0.3413, -0.9155, -0.2100, -0.4984,
         0.3633,  0.7514,  0.2243,  0.0499,  0.0130,  0.0514, -0.1493, -0.2122,
        -0.6939,  0.0937, -0.1015,  0.2140,  0.0152, -0.1494,  0.6456,  0.3025,
        -0.2018, -0.9830, -0.3830, -0.5775,  0.0785,  0.0308,  0.0819,  0.2499,
         0.4438, -0.9815,  0.5671,  0.1875,  0.2845,  1.2965,  0.1928, -2.2519,
         0.2619,  0.6577,  1.7942, -0.4435,  0.1692,  1.1139, -0.6808,  0.7017,
         0.4602, -0.1253,  0.3981, -0.5240, -0.2695, -0.2514,  0.4031, -0.7773,
         0.7515,  0.1175,  0.7159,  0.0934,  0.4862, -0.3857, -0.5227,  0.0262,
         0.3681, -0.2359, -0.4491,  0.7412, -0.8613, -0.2482, -0.0351, -0.2383,
         0.9937,  0.0170,  0.0811, -0.4658, -0.6330,  0.2032, -0.7355,  0.3726,
        -0.8211, -0.41

In [61]:
sentence_2[3].embedding.shape

torch.Size([868])

### **Расстояние между словом light в двух предложениях**

In [62]:
bert_dst = distance.euclidean(
    np.array(sentence_1[1].embedding),
    np.array(sentence_2[3].embedding)
    )

print("Distance between 'light' embeddings for Glove = {}".format(bert_dst))

Distance between 'light' embeddings for Glove = 12.834538459777832


Как видим, в отличие от классических моделей, BERT учитывает контекст.