# Tokenizers (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
!pip install datasets evaluate transformers[sentencepiece]

In [1]:
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

from huggingface_hub import HfApi, login


llama2_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
llama2_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [None]:
tokenized_text = "Jim Henson was a puppeteer".split()
print(tokenized_text)

['Jim', 'Henson', 'was', 'a', 'puppeteer']

In [None]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [None]:
tokenizer("Using a Transformer network is simple")

{'input_ids': [101, 7993, 170, 11303, 1200, 2443, 1110, 3014, 102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
tokenizer.save_pretrained("directory_on_my_computer")

In [2]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

sequence = "Using a Transformer network is simple"
tokens = tokenizer.tokenize(sequence)

print(tokens)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

['Using', 'a', 'Trans', '##former', 'network', 'is', 'simple']


In [4]:
sequence = "我爱你 abc"
tokens = llama2_tokenizer.tokenize(sequence)

print(tokens)

['▁', '我', '<0xE7>', '<0x88>', '<0xB1>', '你', '▁abc']


In [6]:
ids = llama2_tokenizer.convert_tokens_to_ids(tokens)



print(ids)

[29871, 30672, 234, 139, 180, 30919, 25638]


In [None]:
decoded_string = tokenizer.decode([7993, 170, 11303, 1200, 2443, 1110, 3014])
print(decoded_string)

'Using a Transformer network is simple'

In [16]:
import torch
input_ids = torch.tensor([ids])

print(input_ids)

logits = llama2_model(input_ids)

print(tokenizer.pad_token_id)



tensor([[29871, 30672,   234,   139,   180, 30919, 25638]])
0


In [32]:
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

llama2_tokenizer.pad_token = "1"

pt_inputs = llama2_tokenizer(sequences,padding=True,  return_tensors="pt")

# Returns TensorFlow tensors
tf_inputs = llama2_tokenizer(sequences,padding=True,  return_tensors="tf")

# Returns NumPy arrays
model_inputs = tokenizer(sequences, padding=True, return_tensors="np")

print(pt_inputs)
print(tf_inputs)



{'input_ids': tensor([[    1,   306, 29915,   345,  1063, 10534,   363,   263,   379,   688,
          3460, 23360,  3236,   590,  3353,  2834, 29889],
        [    1,  1105,   505,   306, 29991, 29896, 29896, 29896, 29896, 29896,
         29896, 29896, 29896, 29896, 29896, 29896, 29896]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}
{'input_ids': <tf.Tensor: shape=(2, 17), dtype=int32, numpy=
array([[    1,   306, 29915,   345,  1063, 10534,   363,   263,   379,
          688,  3460, 23360,  3236,   590,  3353,  2834, 29889],
       [    1,  1105,   505,   306, 29991, 29896, 29896, 29896, 29896,
        29896, 29896, 29896, 29896, 29896, 29896, 29896, 29896]],
      dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(2, 17), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>}
