# Hello Hugging Face Tokenizer

In [5]:
# Prerequisites
from transformers import AutoModelForCausalLM, AutoTokenizer
import pandas as pd
import torch

Load a pre-trained model and tokenizer

In [9]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Make up some text
my_text = "Generative AI will in the future provide promising applications for the field of"
tokens = tokenizer(my_text, return_tensors="pt")

# Display tokens
print(tokens["input_ids"])

tensor([[ 8645,   876,  9552,   481,   287,   262,  2003,  2148, 11781,  5479,
           329,   262,  2214,   286]])


Look into the Tokens

In [10]:

def display_tokens(tokens, tokenizer):
    return pd.DataFrame([(id, tokenizer.decode(id)) for id in tokens["input_ids"][0]], columns=["id","token"])

display_tokens(tokens, tokenizer)

Unnamed: 0,id,token
0,tensor(8645),Gener
1,tensor(876),ative
2,tensor(9552),AI
3,tensor(481),will
4,tensor(287),in
5,tensor(262),the
6,tensor(2003),future
7,tensor(2148),provide
8,tensor(11781),promising
9,tensor(5479),applications


Get the next token probabilities

In [12]:
# Calculate the probabilities for likely next token and display top 10
with torch.no_grad():
    logits = model(**tokens).logits[:, -1, :]
    probabilities = torch.nn.functional.softmax(logits[0], dim=-1)


def display_likely_next_tokens(probabilities, tokenizer, top_n=5):
    return pd.DataFrame(
        [(id, tokenizer.decode(id), p.item()) for id, p in enumerate(probabilities) if p.item() ],
        columns=["id", "token", "probability"],
    ).sort_values("probability", ascending=False)[:top_n]


display_likely_next_tokens(probabilities, tokenizer, top_n=10)


Unnamed: 0,id,token,probability
11666,11666,artificial,0.237438
9552,9552,AI,0.081239
35941,35941,Artificial,0.066997
36359,36359,robotics,0.038756
4572,4572,machine,0.026404
3644,3644,computer,0.019377
31350,31350,computational,0.015083
10870,10870,cognitive,0.013064
12661,12661,intelligent,0.012315
17019,17019,neural,0.012304


Get the next token

In [13]:
# Get the next token
next_token_id = torch.argmax(probabilities).item()

print(f"Most likely next token_id: {next_token_id}")
print(f"Most likely next token: {tokenizer.decode(next_token_id)}")

Most likely next token_id: 11666
Most likely next token:  artificial


In [14]:
# Append to the original sentence
my_text = my_text + tokenizer.decode(11666)
print(my_text)

Generative AI will in the future provide promising applications for the field of artificial


Generate more text with generate() method

In [16]:
from IPython.display import Markdown, display

# Tokenize some random text
my_text_2 = "In the beginning, machine learning was"
tokens = tokenizer(my_text_2, return_tensors="pt")

# generate more text 
output = model.generate(**tokens, max_length=64, pad_token_id=tokenizer.eos_token_id)

# Display markdown
display(Markdown(tokenizer.decode(output[0])))

In the beginning, machine learning was a very simple concept. It was a way to learn about the world. It was a way to learn about the world. It was a way to learn about the world. It was a way to learn about the world. It was a way to learn about the world. It was