<a href="https://colab.research.google.com/github/jvns/colab-fun/blob/main/embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# embedding fun

- This notebook runs on Google Colab.
- Codes from [A Comprehensive Guide to Build Your Own Language Model in Python](https://medium.com/analytics-vidhya/a-comprehensive-guide-to-build-your-own-language-model-in-python-5141b3917d6d)
- Use the OpenAI GPT-2 language model (based on Transformers) to:
  - Generate text sequences based on seed texts
  - Convert text sequences into numerical representations

In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m46.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m106.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 KB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.27.4


In [3]:
# Import required libraries
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load pre-trained model tokenizer (vocabulary)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Encode a text inputs
text = "What is the fastest car in the"
indexed_tokens = tokenizer.encode(text)

# Convert indexed tokens in a PyTorch tensor
tokens_tensor = torch.tensor([indexed_tokens])

# Load pre-trained model (weights)
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Set the model in evaluation mode to deactivate the DropOut modules
model.eval()

# If you have a GPU, put everything on cuda
tokens_tensor = tokens_tensor.to('cuda')
model.to('cuda')

# Predict all tokens
with torch.no_grad():
    outputs = model(tokens_tensor)
    predictions = outputs[0]

# Get the predicted next sub-word
predicted_index = torch.argmax(predictions[0, -1, :]).item()
predicted_text = tokenizer.decode(indexed_tokens + [predicted_index])

# Print the predicted word
print(predicted_text)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

What is the fastest car in the world


In [9]:
def angle(word1, word2):
  tokens1 = tokenizer.encode(word1)
  w1 = model.transformer.wte.weight[tokens1[0]]
  tokens2 = tokenizer.encode(word2)
  w2 = model.transformer.wte.weight[tokens2[0]]
  return torch.arccos(torch.dot(w1, w2) / (torch.norm(w1) * torch.norm(w2))).item()/6.28



In [21]:
pairs = [('begin', 'commence'),         ('buy', 'purchase'),         ('cheerful', 'happy'),         ('confident', 'assured'),         ('famous', 'renowned'),         ('glance', 'glimpse'),         ('helpful', 'beneficial'),         ('hurry', 'rush'),         ('ill', 'sick'),         ('important', 'significant'),         ('inquire', 'ask'),         ('known', 'familiar'),         ('laugh', 'chuckle'),         ('leap', 'jump'),         ('nearly', 'almost'),         ('peaceful', 'serene'),         ('plentiful', 'abundant'),         ('powerful', 'strong'),         ('quiet', 'silent'),         ('sincere', 'genuine')]

In [29]:
for (w1, w2) in pairs:
    print(f"{w1:15s} {w2:12s}", angle(w1, w2))

begin           commence     0.20311440631842156
buy             purchase     0.21039069078530476
cheerful        happy        0.20373152319792728
confident       assured      0.19554305987752926
famous          renowned     0.1972141159567863
glance          glimpse      0.18530544961334033
helpful         beneficial   0.1929789212099306
hurry           rush         0.22075711541874393
ill             sick         0.21339951047472133
important       significant  0.1729835370543656
inquire         ask          0.19534359312361213
known           familiar     0.2096469235268368
laugh           chuckle      0.21974438694631976
leap            jump         0.21679955682936747
nearly          almost       0.20785035601087437
peaceful        serene       0.21219318080100283
plentiful       abundant     0.20927851367148623
powerful        strong       0.1691336085082619
quiet           silent       0.18110928262115283
sincere         genuine      0.21086802148515252


In [35]:
pairs = [('agistor', 'receive'), ('agelong', 'bonding'), ('unblued', 'balky'), ('baa', 'colon'), ('enapt', 'sliding'), ('thoria', 'okrug'), ('prepink', 'drugman'), ('inwards', 'unpile'), ('fruiter', 'kevel'), ('eulogic', 'printed')]
for (w1, w2) in pairs:
    print(f"{w1:10s} {w2:10s}", angle(w1, w2))

agistor    receive    0.20811736204062298
agelong    bonding    0.2009169690927882
unblued    balky      0.19615881002632674
baa        colon      0.2145562771778957
enapt      sliding    0.21339823865586785
thoria     okrug      0.21218668883013878
prepink    drugman    0.20012779600301364
inwards    unpile     0.1782680962495743
fruiter    kevel      0.20818023165320135
eulogic    printed    0.224857117719711
