<a href="https://colab.research.google.com/github/jvns/colab-fun/blob/main/similar-words.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# embedding fun

- This notebook runs on Google Colab.
- Codes from [A Comprehensive Guide to Build Your Own Language Model in Python](https://medium.com/analytics-vidhya/a-comprehensive-guide-to-build-your-own-language-model-in-python-5141b3917d6d)
- Use the OpenAI GPT-2 language model (based on Transformers) to:
  - Generate text sequences based on seed texts
  - Convert text sequences into numerical representations

In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m46.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m106.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 KB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.27.4


In [3]:
# Import required libraries
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.nn.functional import cosine_similarity


# Load pre-trained model tokenizer (vocabulary)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Encode a text inputs
text = "What is the fastest car in the"
indexed_tokens = tokenizer.encode(text)

# Convert indexed tokens in a PyTorch tensor
tokens_tensor = torch.tensor([indexed_tokens])

# Load pre-trained model (weights)
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Set the model in evaluation mode to deactivate the DropOut modules
model.eval()

# If you have a GPU, put everything on cuda
tokens_tensor = tokens_tensor.to('cuda')
model.to('cuda')

# Predict all tokens
with torch.no_grad():
    outputs = model(tokens_tensor)
    predictions = outputs[0]

# Get the predicted next sub-word
predicted_index = torch.argmax(predictions[0, -1, :]).item()
predicted_text = tokenizer.decode(indexed_tokens + [predicted_index])

# Print the predicted word
print(predicted_text)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

What is the fastest car in the world


In [104]:
def most_similar_words(word):
  token = tokenizer.encode(word)[0]
  print(token)
  weight = model.transformer.wte.weight[token]
  sims = cosine_similarity(weight, model.transformer.wte.weight, dim=1)
  top_idx = torch.topk(sims, k=20)[1]
  return [tokenizer.decode(x) for x in top_idx]

In [105]:
most_similar_words("forest")

29623


['forest',
 'Forest',
 ' forest',
 ' forests',
 ' Forest',
 ' deforestation',
 'forestation',
 ' forestry',
 ' Forestry',
 ' biodiversity',
 ' plantations',
 ' jungle',
 ' woodland',
 ' wetlands',
 ' externalToEVA',
 '\x19',
 '\x1e',
 '�',
 'reportprint',
 '\x11']

In [106]:
most_similar_words("banana")

3820


['ban',
 'Ban',
 ' ban',
 ' Ban',
 ' banning',
 ' bans',
 ' Bans',
 ' banned',
 ' prohibitions',
 ' prohibition',
 ' prohibit',
 ' moratorium',
 ' banished',
 ' prohibiting',
 'kan',
 'ba',
 'aban',
 ' prohibits',
 ' Banana',
 ' restrictions']

In [107]:
most_similar_words("fantastic")

69


['f',
 'F',
 ' f',
 ' F',
 'b',
 'fs',
 'c',
 'd',
 'ff',
 'fd',
 'v',
 'fe',
 'l',
 'p',
 'fl',
 'fb',
 'm',
 't',
 'fi',
 'g']

In [109]:
most_similar_words("cake")

30560


['cake',
 'cakes',
 ' Cake',
 ' cake',
 ' cakes',
 ' pastry',
 '\x01',
 'StreamerBot',
 '\x1b',
 '�',
 '\x0b',
 ' RandomRedditor',
 ' サーティ',
 '\x07',
 '\x17',
 '\x15',
 '\x1e',
 '�',
 '\x05',
 '�']