In [8]:
import torch
from torch import nn
from torch.nn import functional as F
from transformers import BertTokenizer, BertModel

In [2]:
# Load the encoder and its matching tokenizer from one checkpoint id, so the
# two objects are always built from the same vocabulary.
MODEL_NAME = "bert-base-multilingual-uncased"
model = BertModel.from_pretrained(MODEL_NAME)
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/672M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [77]:
# Encode an English sentence and a Russian sentence into PyTorch id tensors
# (one batch row each), then show both shapes as the cell output.
en_sentence = 'Today i will go to university'
ru_sentence = 'Сегодня я пойду в университет'
en_tokens, ru_tokens = (
    tokenizer.encode(sentence, return_tensors='pt')
    for sentence in (en_sentence, ru_sentence)
)
en_tokens.shape, ru_tokens.shape

(torch.Size([1, 8]), torch.Size([1, 8]))

In [78]:
# Round-trip the ids back to text to see how each sentence was tokenized.
for token_ids in (en_tokens, ru_tokens):
    print(tokenizer.batch_decode(token_ids))

['[CLS] today i will go to university [SEP]']
['[CLS] сегодня я поиду в университет [SEP]']


In [79]:
# Run both sentences through the full model without autograd bookkeeping.
with torch.inference_mode():
    en_output = model(en_tokens)
    ru_output = model(ru_tokens)

# Sentence-level comparison on the model's `pooler_output`.
pooled_sim = F.cosine_similarity(ru_output.pooler_output, en_output.pooler_output)
print(f'Overall cos sim: {pooled_sim.item()}')

# Tokens are paired purely by position, so only positions that exist in BOTH
# sequences can be compared. Bounding by the shorter sequence avoids an
# IndexError when the two sentences tokenize to different lengths.
en_len, ru_len = en_tokens.shape[1], ru_tokens.shape[1]
n_paired = min(en_len, ru_len)
if en_len != ru_len:
    print(f'Warning: token counts differ (en={en_len}, ru={ru_len}); comparing the first {n_paired} positions only')

# Public tokenizer API for id -> token string lookup.
en_pieces = tokenizer.convert_ids_to_tokens(en_tokens[0].tolist())
ru_pieces = tokenizer.convert_ids_to_tokens(ru_tokens[0].tolist())

# One cosine similarity per paired position, reduced over the hidden dimension.
token_sims = F.cosine_similarity(
    ru_output.last_hidden_state[0, :n_paired],
    en_output.last_hidden_state[0, :n_paired],
    dim=-1,
).tolist()

# zip() stops at the shortest input, i.e. after `n_paired` positions.
for i, (en_piece, ru_piece, sim) in enumerate(zip(en_pieces, ru_pieces, token_sims)):
    print(f'Token {i} | {en_piece} -> {ru_piece}; last_hidden_state cos sim: {round(sim, 3)}')

Overall cos sim: 0.9929603338241577
Token 0 | [CLS] -> [CLS]; last_hidden_state cos sim: 0.993
Token 1 | today -> сегодня; last_hidden_state cos sim: 0.671
Token 2 | i -> я; last_hidden_state cos sim: 0.78
Token 3 | will -> по; last_hidden_state cos sim: 0.517
Token 4 | go -> ##иду; last_hidden_state cos sim: 0.513
Token 5 | to -> в; last_hidden_state cos sim: 0.666
Token 6 | university -> университет; last_hidden_state cos sim: 0.78
Token 7 | [SEP] -> [SEP]; last_hidden_state cos sim: 0.69


In [80]:
# Embed both token sequences with the model's embedding module only
# (`model.embeddings`) instead of running the whole forward pass.
with torch.inference_mode():
    ru_embeddings = model.embeddings(ru_tokens)
    en_embeddings = model.embeddings(en_tokens)

# Tokens are paired purely by position, so only positions that exist in BOTH
# sequences can be compared. Bounding by the shorter sequence avoids an
# IndexError when the two sentences tokenize to different lengths.
en_len, ru_len = en_tokens.shape[1], ru_tokens.shape[1]
n_paired = min(en_len, ru_len)
if en_len != ru_len:
    print(f'Warning: token counts differ (en={en_len}, ru={ru_len}); comparing the first {n_paired} positions only')

# Public tokenizer API for id -> token string lookup.
en_pieces = tokenizer.convert_ids_to_tokens(en_tokens[0].tolist())
ru_pieces = tokenizer.convert_ids_to_tokens(ru_tokens[0].tolist())

# One cosine similarity per paired position, reduced over the embedding dimension.
token_sims = F.cosine_similarity(
    ru_embeddings[0, :n_paired],
    en_embeddings[0, :n_paired],
    dim=-1,
).tolist()

# zip() stops at the shortest input, i.e. after `n_paired` positions.
for i, (en_piece, ru_piece, sim) in enumerate(zip(en_pieces, ru_pieces, token_sims)):
    print(f'Token {i} | {en_piece} -> {ru_piece}; embedding cos sim: {round(sim, 3)}')

Token 0 | [CLS] -> [CLS]; embedding cos sim: 1.0
Token 1 | today -> сегодня; embedding cos sim: 0.518
Token 2 | i -> я; embedding cos sim: 0.237
Token 3 | will -> по; embedding cos sim: 0.092
Token 4 | go -> ##иду; embedding cos sim: 0.048
Token 5 | to -> в; embedding cos sim: 0.301
Token 6 | university -> университет; embedding cos sim: 0.514
Token 7 | [SEP] -> [SEP]; embedding cos sim: 1.0
