# Static vs Contextual Embeddings

Static embeddings represent each word as a fixed vector, regardless of the sentence it appears in.

Contextual embeddings leverage the context to represent each word.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

In [2]:
!python -m spacy download en_core_web_lg

2024-06-24 10:37:06.453437: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-24 10:37:07.452127: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-24 10:37:14.883819: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-12.2/lib64/stubs:/usr/local/cuda-12.2/lib64:/usr/local/cuda-12.2/lib64/

In [None]:
import spacy

In [None]:
# Load the `en_core_web_lg` spaCy pipeline; `nlp` is called on the example sentences below.
nlp = spacy.load("en_core_web_lg")

In [None]:
# Two sentences that both contain "bank", once next to "withdraw cash" and once next to "river".
s1, s2 = (
    "I went to the bank yesterday to withdraw cash.",
    "We went fishing by the river bank.",
)

In [None]:
# Run both sentences through the spaCy pipeline, first s1 and then s2.
doc1, doc2 = nlp(s1), nlp(s2)

In [None]:
# List the attributes available on a token, looking only at the first token of doc1.
for token in doc1[:1]:
    print(dir(token))

In [None]:
# Show every token of doc1 with its part-of-speech tag and keep the vector of "bank" as v1.
for token in doc1:
    word, pos_tag = token.text, token.pos_
    print(word, pos_tag)
    if word == 'bank':
        v1 = token.vector

In [None]:
# Show the container type and the dimensions of the vector captured for "bank" in doc1.
type(v1), v1.shape

In [None]:
# Preview only the leading components rather than dumping the whole vector into the output.
v1[:10]

In [None]:
# Show every token of doc2 with its part-of-speech tag and keep the vector of "bank" as v2.
for token in doc2:
    word, pos_tag = token.text, token.pos_
    print(word, pos_tag)
    if word == 'bank':
        v2 = token.vector

In [None]:
# Show the container type and the dimensions of the vector captured for "bank" in doc2.
type(v2), v2.shape

In [None]:
# Compare the two "bank" vectors component by component; the result is true only if all of them match.
np.equal(v1, v2).all()

The static word vector (from spaCy's `en_core_web_lg`) for the word "bank" is the same in both sentences, even though the word is used in a different sense in each.

## Contextual Embeddings using BERT

In [None]:
!pip install transformers

In [4]:
import torch
from transformers import AutoTokenizer, AutoModel

2024-06-24 10:41:53.959254: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-24 10:41:54.100390: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-24 10:41:54.798918: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-12.2/lib64/stubs:/usr/local/cuda-12.2/lib64:/usr/local/cuda-12.2/lib64/

In [5]:
# Load the tokenizer and the model from the same pretrained checkpoint, tokenizer first.
checkpoint = "bert-base-uncased"
tokenizer, model = (
    AutoTokenizer.from_pretrained(checkpoint),
    AutoModel.from_pretrained(checkpoint),
)

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
def embed_text_using_bert(text):
    """Return the model's last hidden states for a single piece of text.

    The text is encoded with the module-level ``tokenizer``, wrapped into a
    batch of one, and passed through the module-level ``model``.

    Args:
        text: The string to embed.

    Returns:
        The first element of the model output (the last hidden state) for the
        single-item batch. It is computed without gradient tracking.
    """
    input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0)  # Batch size 1
    # Inference only: skip autograd bookkeeping so no graph is built or kept alive.
    with torch.no_grad():
        outputs = model(input_ids)
    last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
    return last_hidden_states

How to get vectors using BERT?

In [None]:
to_embed = "We would like to embed this text!"

# Encode the sentence into vocabulary ids and print them under a heading line.
vocab_ids = tokenizer.encode(to_embed)
print("Vocab ids:", vocab_ids, sep="\n")

In [None]:
# Map the ids back to their tokens. The ids are recomputed from `to_embed`
# instead of being pasted in, so this cell stays correct if the text changes,
# and the heading now names what is actually printed (tokens, not ids).
print("Tokens:")
print(tokenizer.convert_ids_to_tokens(tokenizer.encode(to_embed)))

In [None]:
# Embed the example sentence and report the dimensions of the returned tensor.
vec = embed_text_using_bert(to_embed)
print(vec.size())

## Visualizing contextual vectors in 2d embedding space

In [None]:
# Sentences containing "bank" alongside cash, a river, and a loan.
s1, s2, s3 = (
    "I went to the bank yesterday to withdraw cash.",
    "We went fishing by the river bank.",
    "He has borrowed a loan from the bank.",
)

# Sentences containing both "bank" and "note".
s4, s5 = (
    "The bank issued a new 10 rupee note yesterday.",
    "Please note down the steps directed by the bank.",
)

In [None]:
# Tokenize each sentence and print the vocabulary ids of its tokens.
for sentence in (s1, s2, s3, s4, s5):
    pieces = tokenizer.tokenize(sentence)
    print(tokenizer.convert_tokens_to_ids(pieces))

In [None]:
# Show which vocabulary token the id 2924 stands for (presumably "bank" — confirm from the output).
tokenizer.convert_ids_to_tokens([2924])

In [None]:
# NOTE(review): embed_text_using_bert wraps the ids in a batch of one, so this
# slice over the leading axis appears to keep the whole tensor; if the first
# four token vectors were intended, that would be `[0, :4]` — confirm.
s1_hidden = embed_text_using_bert(s1)
s1vec = s1_hidden[:4]
s1vec.shape

In [None]:
def get_bank_contextual_vector(text):
    """Return the contextual vector of the token "bank" as it appears in ``text``.

    Args:
        text: A sentence whose encoded tokens include "bank".

    Returns:
        A NumPy array taken from the output of ``embed_text_using_bert`` at
        the position of the first "bank" token in the encoded ``text``.

    Raises:
        ValueError: If "bank" does not occur among the encoded tokens.
    """
    # Resolve the id from the tokenizer's vocabulary instead of hard-coding it.
    bank_id = tokenizer.convert_tokens_to_ids("bank")
    token_ids = tokenizer.encode(text)
    if bank_id not in token_ids:
        raise ValueError(f"'bank' was not found among the tokens of {text!r}")
    # embed_text_using_bert also encodes with tokenizer.encode, so this
    # position lines up with the rows of its output.
    bank_idx = token_ids.index(bank_id)
    v1 = embed_text_using_bert(text).detach().numpy()[0, bank_idx]
    print(v1.shape)
    return v1

In [None]:
# Extract the contextual vector of "bank" from each of the five sentences, in order.
s1vec, s2vec, s3vec, s4vec, s5vec = [
    get_bank_contextual_vector(sentence) for sentence in (s1, s2, s3, s4, s5)
]

In [None]:
# Compare the "bank" vectors of s3 and s2 component by component; true only if all of them match.
np.equal(s3vec, s2vec).all()

In [None]:
from sklearn.decomposition import PCA
import numpy as np

In [None]:
# Stack the five "bank" vectors as rows, then fit a two-component PCA and project them.
bank_vectors = np.array([s1vec, s2vec, s3vec, s4vec, s5vec])
pca = PCA(n_components=2)
X_2d = pca.fit_transform(bank_vectors)

In [None]:
# Show the dimensions of the PCA-projected array.
X_2d.shape

In [None]:
import matplotlib.pyplot as plt

In [None]:
fig, ax = plt.subplots(dpi=150)
ax.scatter(X_2d[:, 0], X_2d[:, 1])
ax.margins(0.17)
# Title and axis labels so the figure can be read on its own.
ax.set_title('Contextual vectors of "bank" (PCA, 2 components)')
ax.set_xlabel("PC 1")
ax.set_ylabel("PC 2")

# One label per sentence, in the same order as the rows of X_2d.
labels = ['bank cash', 'river bank', 'bank loan', 'bank note_curr', 'note_steps bank']
# Use a named index: `_` is conventionally a throwaway name and is also the
# name IPython uses for the last cell output.
for i, txt in enumerate(labels):
    # Shift each label slightly to the right of its point.
    ax.annotate(txt, (X_2d[i, 0] + 0.2, X_2d[i, 1]), fontsize='x-small')