In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 5.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 11.4 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 53.6 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 44.3 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstal

In [2]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import math
import random

In [3]:
#@title Model Name
# Checkpoint identifier handed to from_pretrained() for both the tokenizer and the model.
model_name = "gpt2-xl" #@param ["gpt2", "distilgpt2", "gpt2-large", "gpt2-xl", "sshleifer/tiny-gpt2"] {allow-input: true}

In [4]:
# Load the tokenizer for the selected checkpoint; return_token_and_probability reads
# this module-level name to encode prompts and decode token ids.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# Load pre-trained model (weights); also read as a module-level name by
# return_token_and_probability.
model = GPT2LMHeadModel.from_pretrained(model_name)

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/689 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.99G [00:00<?, ?B/s]

## Define the function

In [5]:
def return_token_and_probability(sentence: str, logprobs_truncate: int = 100) -> list:
    """Rank the model's candidate next tokens for ``sentence``.

    The sentence is prefixed with the ``<|endoftext|>`` marker, encoded with the
    module-level ``tokenizer`` and run through the module-level ``model``. The
    logits at the final position are sorted in descending order, the first
    ``logprobs_truncate`` of them are kept, and a softmax over *only the kept
    logits* turns them into percentages. The returned percentages therefore
    always sum to 100, regardless of how many tokens were truncated away.

    Args:
        sentence: Prompt text whose continuation is being predicted.
        logprobs_truncate: Upper bound on how many of the highest-scoring
            tokens to return. Applied as a slice bound, so a value larger than
            the number of available tokens keeps all of them.

    Returns:
        A list of ``(decoded_token, percentage)`` tuples ordered from the most
        to the least likely token. Empty when the truncation keeps no tokens.
    """
    # Use the GPU when one is present; otherwise fall back to the CPU rather
    # than failing on a hard-coded 'cuda' device.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Encode the prompt and build a batch of size one on the target device.
    text = "<|endoftext|>" + sentence
    indexed_tokens = tokenizer.encode(text)
    tokens_tensor = torch.tensor([indexed_tokens], device=device)

    # Evaluation mode deactivates the DropOut modules.
    model.eval()
    model.to(device)

    # Forward pass without gradient tracking; the first output holds the logits.
    with torch.no_grad():
        logits = model(tokens_tensor)[0]

    # Rank every candidate for the position that follows the last prompt token.
    ranked = torch.sort(logits[0, -1, :], descending=True)
    top_ids = ranked.indices[:logprobs_truncate].tolist()
    top_logits = ranked.values[:logprobs_truncate]
    if not top_ids:
        return []

    # Softmax over the kept logits equals exp(logit) / sum(exp(logit)) but is
    # numerically stable. float64 mirrors the precision of plain Python floats,
    # and a single .tolist() avoids one device-to-host sync per token.
    percentages = (torch.softmax(top_logits.double(), dim=-1) * 100).tolist()

    return [
        (tokenizer.decode([token_id]), percentage)
        for token_id, percentage in zip(top_ids, percentages)
    ]

## Get the next-token probabilities

In [15]:
prompt = "The" #@param {type:"string"}
# 1000000 is presumably larger than the vocabulary, so no token is truncated away - TODO confirm.
logprobs = return_token_and_probability(prompt, 1000000)

print("The 100 most probable tokens are:", logprobs[0:100], "\n")
# Slice through the end of the list: stopping at -1 would drop the single least
# probable token and show only 99 entries.
print("The 100 least probable token are:", logprobs[-100:])

The 100 most probable tokens are: [(' first', 1.2793870524787925), (' following', 1.1152788334737267), (' U', 1.086563049138522), (' New', 0.7466756017739529), (' new', 0.6396572275233682), (' United', 0.5933558829351285), (' latest', 0.5056504058643506), (' US', 0.505380913479379), (' most', 0.494997094507575), (' world', 0.4611155173910941), (' "', 0.45551328001378055), (' National', 0.4161691907685238), (' last', 0.3722220308931209), (' video', 0.3394766516852445), (' number', 0.33786771508125335), (' White', 0.3153295985679882), (' government', 0.3063399404305146), (' best', 0.2997026850009314), (' University', 0.29965524284327455), (' man', 0.2982599259924276), (' American', 0.2883919997750622), (' next', 0.28150098907686993), (' former', 0.27084067910111553), (' European', 0.2678273931907919), (' story', 0.2621340354346303), (' second', 0.25416413922807646), (' federal', 0.24911128126432056), (' Canadian', 0.24855752565591652), (' official', 0.24101925553463), (' city', 0.2380325