<a href="https://colab.research.google.com/github/jvns/colab-fun/blob/main/similar-words.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# embedding fun

- This notebook runs on Google Colab.
- Code from [A Comprehensive Guide to Build Your Own Language Model in Python](https://medium.com/analytics-vidhya/a-comprehensive-guide-to-build-your-own-language-model-in-python-5141b3917d6d)
- Use the OpenAI GPT-2 language model (based on Transformers) to:
  - Generate text sequences based on seed texts
  - Convert text sequences into numerical representations

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# Import required libraries
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.nn.functional import cosine_similarity


# Pick the device once: use the GPU when one is available, otherwise fall
# back to the CPU so this cell also runs on a CPU-only runtime.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load pre-trained model tokenizer (vocabulary)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Encode the seed text into a list of token ids
text = "What is the fastest car in the"
indexed_tokens = tokenizer.encode(text)

# Convert the token ids to a PyTorch tensor; the extra list level adds a
# leading batch dimension of size 1
tokens_tensor = torch.tensor([indexed_tokens]).to(device)

# Load pre-trained model (weights)
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Set the model in evaluation mode to deactivate the DropOut modules
model.eval()

# Keep the model on the same device as its input tensor
model.to(device)

# Predict all tokens; no gradients are needed for inference
with torch.no_grad():
    outputs = model(tokens_tensor)
    predictions = outputs[0]

# Get the predicted next sub-word: the highest-scoring vocabulary entry at
# the last position of the (only) sequence in the batch
predicted_index = torch.argmax(predictions[0, -1, :]).item()
predicted_text = tokenizer.decode(indexed_tokens + [predicted_index])

# Print the seed text followed by the predicted word
print(predicted_text)

What is the fastest car in the world


In [22]:
def most_similar_words(word, k=30):
  """Return the `k` vocabulary tokens whose embeddings are closest to `word`.

  Closeness is the cosine similarity between rows of the model's token
  embedding matrix (`model.transformer.wte.weight`). Uses the module-level
  `tokenizer` and `model`.

  Args:
    word: Text that the tokenizer must encode as exactly one token. Note that
      a leading space changes the token (e.g. "forest" vs " forest").
    k: Number of neighbours to return (default 30, the previous fixed value).

  Returns:
    A list of `k` decoded token strings, most similar first. The query token
    itself is normally the first entry, since its similarity to itself is 1.

  Raises:
    AssertionError: If `word` is not encoded as exactly one token.
  """
  tokens = tokenizer.encode(word)
  assert len(tokens) == 1, (
      f"{word!r} is encoded as {len(tokens)} tokens; expected exactly one")
  token = tokens[0]

  # The embedding weights are trainable parameters; skip autograd bookkeeping
  # for this read-only similarity search over the whole vocabulary.
  with torch.no_grad():
    embeddings = model.transformer.wte.weight
    # Make the query an explicit (1, dim) row so it broadcasts against the
    # (vocab, dim) matrix along dim=1 without relying on implicit 1-D rules.
    query = embeddings[token].unsqueeze(0)
    sims = cosine_similarity(query, embeddings, dim=1)
    top_idx = torch.topk(sims, k=k).indices

  # Convert to plain ints before decoding rather than passing 0-d tensors.
  return [tokenizer.decode([idx]) for idx in top_idx.tolist()]

In [23]:
# Embedding neighbours of the token "forest" (no leading space; the recorded
# output lists ' forest' as a separate token).
most_similar_words("forest")

['forest',
 'Forest',
 ' forest',
 ' forests',
 ' Forest',
 ' deforestation',
 'forestation',
 ' forestry',
 ' Forestry',
 ' biodiversity',
 ' plantations',
 ' jungle',
 ' woodland',
 ' wetlands',
 ' externalToEVA',
 '\x19',
 '\x1e',
 '�',
 'reportprint',
 '\x11',
 '\x14',
 '\x13',
 '\x15',
 ' サーティ',
 '�',
 '\x03',
 '\x7f',
 ' TheNitrome',
 '�',
 '�']

In [24]:
# Embedding neighbours of "ban"; the recorded output mixes related words
# (' prohibition') with look-alike sub-word pieces ('kan', 'ba').
most_similar_words("ban")

['ban',
 'Ban',
 ' ban',
 ' Ban',
 ' banning',
 ' bans',
 ' Bans',
 ' banned',
 ' prohibitions',
 ' prohibition',
 ' prohibit',
 ' moratorium',
 ' banished',
 ' prohibiting',
 'kan',
 'ba',
 'aban',
 ' prohibits',
 ' Banana',
 ' restrictions',
 ' barred',
 'bon',
 ' barring',
 '246',
 '\x1a',
 'oreAndOnline',
 '\x15',
 '\x0c',
 '\x11',
 '\r']

In [25]:
# Embedding neighbours of "pan"; in the recorded output most hits are
# spelling-based ('panel', 'pa', 'span') rather than meaning-based.
most_similar_words("pan")

['pan',
 ' Pan',
 ' pan',
 'Pan',
 'panel',
 'panic',
 'pa',
 'Panel',
 ' skillet',
 ' PAN',
 ' pans',
 ' pane',
 'pect',
 ' Panel',
 'p',
 'ban',
 ' Span',
 'pe',
 'glass',
 'par',
 'pit',
 ' Panama',
 ' panel',
 'scale',
 'pas',
 'py',
 'span',
 ' perspect',
 'pots',
 ' span']

In [26]:
# Embedding neighbours of "cake"; in the recorded output only the first few
# entries are word-like, the rest are control-character and odd tokens.
most_similar_words("cake")

['cake',
 'cakes',
 ' Cake',
 ' cake',
 ' cakes',
 ' pastry',
 '\x01',
 'StreamerBot',
 '\x1b',
 '�',
 '\x0b',
 ' RandomRedditor',
 ' サーティ',
 '\x07',
 '\x17',
 '\x15',
 '\x1e',
 '�',
 '\x05',
 '�',
 'rawdownload',
 '�',
 '\x0c',
 '�',
 '\x0e',
 '\x03',
 '\x10',
 '\x13',
 '\x12',
 'embedreportprint']

In [27]:
# Embedding neighbours of the function word "the"; the recorded output is
# dominated by other function words ('that', 'this', 'which').
most_similar_words("the")

['the',
 'The',
 'that',
 'there',
 ' the',
 'this',
 ' THE',
 'these',
 'which',
 'what',
 'they',
 'their',
 'those',
 ' The',
 'your',
 'to',
 'from',
 'THE',
 'when',
 'you',
 'and',
 'a',
 ' an',
 'with',
 'for',
 'another',
 'It',
 'In',
 'This',
 'There']

In [28]:
# Embedding neighbours of the verb "think" (inflections and related verbs
# appear first in the recorded output).
most_similar_words("think")

['think',
 'Think',
 ' Think',
 'thinking',
 ' think',
 'thought',
 ' thinkers',
 ' thinker',
 ' Thinking',
 ' thinks',
 ' thinking',
 'remember',
 'Imagine',
 ' imagine',
 'sorry',
 'consider',
 ' Thought',
 ' rethink',
 'doing',
 'Look',
 'Consider',
 'yeah',
 ' externalToEVA',
 '\x07',
 'perhaps',
 '�',
 '\x04',
 '\x17',
 '\x19',
 '�']

In [30]:
# Embedding neighbours of the noun "boat" (the recorded output is mostly
# other watercraft and sailing terms).
most_similar_words("boat")

['boat',
 'boats',
 ' boat',
 ' boats',
 ' Boat',
 ' yacht',
 ' vessel',
 ' submarine',
 ' sail',
 ' canoe',
 ' sailed',
 ' raft',
 ' vessels',
 ' submarines',
 ' afloat',
 ' ship',
 ' sailing',
 ' torpedo',
 'wreck',
 ' ships',
 ' submar',
 ' ashore',
 ' kay',
 ' warships',
 'fish',
 '\x12',
 '\x1b',
 '\x1e',
 'InstoreAndOnline',
 '�']