All notebooks in this folder are based on the book "Hands-On Large Language Models" by Jay Alammar and Maarten Grootendorst, and on its accompanying public repository.

See https://www.llm-book.com/
    
The code has been tweaked to run on a Mac M4 with 10 GPU cores.

# Tokenization

In [None]:
import warnings
warnings.filterwarnings('ignore')

from transformers import AutoTokenizer

# Background colours used by show_tokens, one per token in rotation.
# Each entry is an "R;G;B" string that is interpolated directly into the
# ANSI escape sequence built in show_tokens.
colors_list = [
    '102;194;165', '252;141;98', '141;160;203',
    '231;138;195', '166;216;84', '255;217;47'
]

def show_tokens(sentence, tokenizer_name):
    """Print `sentence` token by token, as split by the named tokenizer.

    The tokenizer is loaded with `AutoTokenizer.from_pretrained`, the
    sentence is encoded to input ids, and every id is decoded back to text
    and printed on its own coloured background. Colours are taken from
    `colors_list` in rotation, so neighbouring tokens are easy to tell apart.

    Args:
        sentence: The text to tokenize.
        tokenizer_name: Name passed to `AutoTokenizer.from_pretrained`.

    Returns:
        None. The result is written to stdout only.
    """
    tok = AutoTokenizer.from_pretrained(tokenizer_name)
    encoded_ids = tok(sentence).input_ids
    palette_size = len(colors_list)

    # Header: the tokenizer name, followed by the tokens on the same line.
    print(f"{tokenizer_name}:", end=' ')

    for position, token_id in enumerate(encoded_ids):
        rgb = colors_list[position % palette_size]
        piece = tok.decode(token_id)
        # Escape sequence: black text (30) on an RGB background (48;2;R;G;B),
        # then a reset (0) after the token so the colour does not leak.
        print(f'\x1b[0;30;48;2;{rgb}m' + piece + '\x1b[0m', end=' ')

    # Finish the token line and leave one blank line after it.
    print("\n")

In [None]:
# Sample input meant to stress different tokenizer behaviours: mixed casing,
# an emoji and a Chinese character, Python-like keywords and operators,
# quoted runs of whitespace, and a small arithmetic expression.
# NOTE(review): the quoted runs labelled "tabs" show up as spaces in this
# export - confirm whether real tab characters were intended.
text = """
English and CAPITALIZATION
🎵 鸟
show_tokens False None elif == >= else: two tabs:"    " Three tabs: "       "
12.0*50=600
"""

In [None]:
# Observations for the uncased BERT tokenizer:
# - newlines are not kept
# - all text is lower-cased
# - unknown characters are replaced by the UNK token
show_tokens(text, "bert-base-uncased")

In [None]:
# Observations for the cased BERT tokenizer:
# - newlines are not kept
# - the original casing is kept intact
# - unknown characters are replaced by the UNK token
show_tokens(text, "bert-base-cased")

In [None]:
# Observations for the GPT-2 tokenizer:
# - uses Byte Pair Encoding (BPE)
# - newlines and special characters are encoded as tokens too
show_tokens(text, "gpt2")

In [None]:
# Observations for the GPT-4 tokenizer:
# - uses BPE
# - has dedicated tokens that help with code, e.g. for 4 spaces and for "elif"
show_tokens(text, "Xenova/gpt-4")

In [None]:
# Observations for the StarCoder2 tokenizer:
# - similar to GPT-4's, but digits are kept as separate tokens,
#   which is meant to improve math skills
show_tokens(text, "bigcode/starcoder2-15b")

# Embeddings

In [None]:
from transformers import AutoModel, AutoTokenizer

# Tokenizer and model must come from the SAME checkpoint: token ids are only
# meaningful relative to the vocabulary the model was trained with. Mixing the
# "microsoft/deberta-base" tokenizer with this model would feed it ids from a
# different vocabulary.
# NOTE(review): the DeBERTa-v3 tokenizer is SentencePiece-based; loading it may
# require the `sentencepiece` package to be installed - confirm in your env.
EMBEDDING_CHECKPOINT = "microsoft/deberta-v3-xsmall"

# Load the tokenizer that belongs to the checkpoint
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_CHECKPOINT)

# Load the language model from the same checkpoint
model = AutoModel.from_pretrained(EMBEDDING_CHECKPOINT)

def tokenize_sentence(sentence):
    """Tokenize `sentence`, run it through the model and print the results.

    Uses the module-level `tokenizer` and `model`. Prints, in order:
    the decoded text of every input id (one per line), the shape of the
    first element of the model output, and that output itself. The notebook
    treats this first element as the contextual token embeddings.

    Args:
        sentence: The text to tokenize and embed.

    Returns:
        None. Everything is written to stdout.
    """
    # Encode to PyTorch tensors so the result can be passed to the model.
    encoded = tokenizer(sentence, return_tensors='pt')

    # Show the text behind each id of the first (only) sequence.
    first_sequence_ids = encoded['input_ids'][0]
    for token_id in first_sequence_ids:
        print(tokenizer.decode(token_id))

    # Forward pass; keep only the first element of the model output.
    hidden = model(**encoded)[0]

    for item in (hidden.shape, hidden):
        print(item)

In [None]:
# First run: prints the tokens, the output shape and the output tensor
tokenize_sentence('Hello world')

In [None]:
tokenize_sentence('Hello world')  # same input as the previous cell, so the embeddings should be the same

In [None]:
tokenize_sentence('Hello Anna, how are you doing?')  # the embedding for "Hello" is now different because its context changed