# Install Libraries

In [None]:
!pip install huggingface

Collecting huggingface
  Downloading huggingface-0.0.1-py3-none-any.whl (2.5 kB)
Installing collected packages: huggingface
Successfully installed huggingface-0.0.1


In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m38.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m95.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
Col

# Tokenization

**There are 3 tokenizers:**
1. Word tokenizer

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
sentence = "dog park is filled with dogs"
tokens = tokenizer(sentence)

print(tokens)

{'input_ids': [101, 3676, 2493, 1110, 2709, 1114, 6363, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}


In [None]:
token_ids = tokens.input_ids

In [None]:
token_ids

[101, 3676, 2493, 1110, 2709, 1114, 6363, 102]

In [None]:
decoded_string = tokenizer.decode(token_ids)

In [None]:
decoded_string

'[CLS] dog park is filled with dogs [SEP]'

Every word is given a token,The words "dog" and "dogs" are given 2 seperate tokens regardless of them sharing a common root word.

We could go in a little more depth to tokenize characters.

2. Character Tokenizer

In [None]:
i=0
chars=[]
while i<len(sentence):
    chars.append(sentence[i])
    i+=1
print(chars)
char_tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokens = char_tokenizer(chars)
print(tokens)

['d', 'o', 'g', ' ', 'p', 'a', 'r', 'k', ' ', 'i', 's', ' ', 'f', 'i', 'l', 'l', 'e', 'd', ' ', 'w', 'i', 't', 'h', ' ', 'd', 'o', 'g', 's']
{'input_ids': [[67], [78], [70], [220], [79], [64], [81], [74], [220], [72], [82], [220], [69], [72], [75], [75], [68], [67], [220], [86], [72], [83], [71], [220], [67], [78], [70], [82]], 'attention_mask': [[1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1]]}


In [None]:
chartoken_ids = tokens.input_ids
print(chartoken_ids,end="")

[[67], [78], [70], [220], [79], [64], [81], [74], [220], [72], [82], [220], [69], [72], [75], [75], [68], [67], [220], [86], [72], [83], [71], [220], [67], [78], [70], [82]]

In [None]:
char_ids=[]
for values in chartoken_ids:
  if values:
    char_ids.append(int(values[0]))


decode_string = char_tokenizer.decode(char_ids)

In [None]:
decode_string

'dog park is filled with dogs'

* As you can see above d is give [67] and is
being re-utilised. So we can save memory.

* However, the charcaters themselves have no individual meaning so the context might be sacrificed when the model is learning.

* Each word is given a single token, but in here we are increasing the number of tokens since we are assigning tokens to individual characters

3. Subword Tokenizer

In [None]:
from transformers import AutoTokenizer

sw_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
sentence = "Dog park is filled with dogs and dog's owners"

tokens = sw_tokenizer.tokenize(sentence)

In [None]:
print(tokens)

['Dog', 'park', 'is', 'filled', 'with', 'dogs', 'and', 'dog', "'", 's', 'owners']


In [None]:
sw_ids = sw_tokenizer.convert_tokens_to_ids(tokens)

In [None]:
sw_ids

[8166, 2493, 1110, 2709, 1114, 6363, 1105, 3676, 112, 188, 5032]

In [None]:
decoded_string = sw_tokenizer.decode(sw_ids)

In [None]:
decoded_string

"Dog park is filled with dogs and dog's owners"

# Batch Tokenization

In [20]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
sentences = ["this is a batch","Multiple sentences are written here"]

print(tokens)

{'input_ids': [[5661, 318, 257, 15458], [31217, 13439, 389, 3194, 994]], 'attention_mask': [[1, 1, 1, 1], [1, 1, 1, 1, 1]]}


In [21]:
model_inputs = tokens.input_ids

In [22]:
model_inputs

[[5661, 318, 257, 15458], [31217, 13439, 389, 3194, 994]]

With padding

In [23]:
tokenizer.pad_token = tokenizer.eos_token

# Will pad the sentences up to the maximum sequence length
model_inputs = tokenizer(sentences, padding="longest")
print("max sentence length: ",model_inputs)
# Will pad the sentences up to the model max length
# (512 for BERT or DistilBERT)
model_inputs = tokenizer(sentences, padding="max_length")
print("max model length: ",model_inputs)
# Will pad the sentences up to the specified max length
model_inputs = tokenizer(sentences, padding="max_length", max_length=8)
print("max setence length specified by user: ",model_inputs)

max sentence length:  {'input_ids': [[5661, 318, 257, 15458, 50256], [31217, 13439, 389, 3194, 994]], 'attention_mask': [[1, 1, 1, 1, 0], [1, 1, 1, 1, 1]]}
max model length:  {'input_ids': [[5661, 318, 257, 15458, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50

truncation

In [24]:
# Will truncate the sentences that are longer than the model max length
# (512 for BERT or DistilBERT)
model_inputs = tokenizer(sentences, truncation=True)
print("truncate if longer than length of the model: ",model_inputs)
# Will truncate the sentences that are longer than the specified max length
model_inputs = tokenizer(sentences, max_length=3, truncation=True)
print("truncate longer than specified length: ",model_inputs)

truncate if longer than length of the model:  {'input_ids': [[5661, 318, 257, 15458], [31217, 13439, 389, 3194, 994]], 'attention_mask': [[1, 1, 1, 1], [1, 1, 1, 1, 1]]}
truncate longer than specified length:  {'input_ids': [[5661, 318, 257], [31217, 13439, 389]], 'attention_mask': [[1, 1, 1], [1, 1, 1]]}


Framework specific token

In [26]:
# Returns PyTorch tensors
model_inputs = tokenizer(sentences, padding=True, return_tensors="pt")
print("pt: ",model_inputs)
# Returns TensorFlow tensors
model_inputs = tokenizer(sentences, padding=True, return_tensors="tf")
print("tf: ",model_inputs)
# Returns NumPy arrays
model_inputs = tokenizer(sentences, padding=True, return_tensors="np")
print("np: ",model_inputs)

pt:  {'input_ids': tensor([[ 5661,   318,   257, 15458, 50256],
        [31217, 13439,   389,  3194,   994]]), 'attention_mask': tensor([[1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1]])}
tf:  {'input_ids': <tf.Tensor: shape=(2, 5), dtype=int32, numpy=
array([[ 5661,   318,   257, 15458, 50256],
       [31217, 13439,   389,  3194,   994]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(2, 5), dtype=int32, numpy=
array([[1, 1, 1, 1, 0],
       [1, 1, 1, 1, 1]], dtype=int32)>}
np:  {'input_ids': array([[ 5661,   318,   257, 15458, 50256],
       [31217, 13439,   389,  3194,   994]]), 'attention_mask': array([[1, 1, 1, 1, 0],
       [1, 1, 1, 1, 1]])}


Model training

In [30]:
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="tf")
output = model(tokens)

All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


In [31]:
print(output)

TFSequenceClassifierOutput(loss=None, logits=<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[-1.5606961,  1.6122813],
       [-3.618318 ,  3.9137495]], dtype=float32)>, hidden_states=None, attentions=None)


In [32]:
import tensorflow as tf

results = tf.math.softmax(output.logits,axis=1)
print(results)

tf.Tensor(
[[4.0195391e-02 9.5980465e-01]
 [5.3534308e-04 9.9946469e-01]], shape=(2, 2), dtype=float32)
