### Hugging Face Transformers


In [1]:
from transformers import pipeline

In [2]:
# Pin the checkpoint and revision explicitly instead of relying on the task
# default, so the result stays reproducible if the library default changes.
# Both values are the ones the unpinned call reported falling back to.
sentiment_classifier = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    revision="714eb0f",
)
result = sentiment_classifier("I love using Hugging Face Transformers!")
print(result)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use mps:0


[{'label': 'POSITIVE', 'score': 0.9971315860748291}]


In [3]:
# Build a pipeline for the "ner" task using the dslim/bert-base-NER checkpoint.
ner = pipeline("ner", model="dslim/bert-base-NER")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use mps:0


In [4]:
# Tag the entities in a sample sentence; as the last expression of the cell,
# the returned list is displayed as the cell output.
# NOTE(review): entries come back per word piece (e.g. 'Hu', '##gging');
# consider aggregation_strategy="simple" on the pipeline if whole-entity
# spans are wanted -- confirm against the transformers docs.
ner("Hugging Face is based in New York City.")

[{'entity': 'B-ORG',
  'score': 0.7377328,
  'index': 1,
  'word': 'Hu',
  'start': 0,
  'end': 2},
 {'entity': 'I-ORG',
  'score': 0.6290814,
  'index': 2,
  'word': '##gging',
  'start': 2,
  'end': 7},
 {'entity': 'I-ORG',
  'score': 0.93110704,
  'index': 3,
  'word': 'Face',
  'start': 8,
  'end': 12},
 {'entity': 'B-LOC',
  'score': 0.9994505,
  'index': 7,
  'word': 'New',
  'start': 25,
  'end': 28},
 {'entity': 'I-LOC',
  'score': 0.99945694,
  'index': 8,
  'word': 'York',
  'start': 29,
  'end': 33},
 {'entity': 'I-LOC',
  'score': 0.9995559,
  'index': 9,
  'word': 'City',
  'start': 34,
  'end': 38}]

In [5]:
# Zero-shot classification pipeline using the facebook/bart-large-mnli checkpoint.
zeroshot_classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use mps:0


In [6]:
# Text to classify, and the labels the zero-shot classifier chooses between.
sequence_to_classify = (
    "Hugging Face is creating a tool that democratizes AI."
)
candidate_labels = [
    "technology",
    "education",
    "politics",
]

In [7]:
# Score the sequence against every candidate label, then print the raw result.
result = zeroshot_classifier(
    sequence_to_classify,
    candidate_labels=candidate_labels,
)
print(result)

{'sequence': 'Hugging Face is creating a tool that democratizes AI.', 'labels': ['technology', 'education', 'politics'], 'scores': [0.9906295537948608, 0.005319306626915932, 0.0040511772967875]}


### Pre-trained Tokenizers


In [8]:
from transformers import AutoTokenizer

In [9]:
# Checkpoint name passed to AutoTokenizer.from_pretrained in the next cell.
# NOTE(review): `model` is a plain string here but is rebound to a loaded
# model object later in the notebook; a name such as `model_name` would
# avoid reusing one name for two kinds of value.
model = "bert-base-uncased"

In [10]:
# Load the tokenizer for the checkpoint named in `model`.
tokenizer = AutoTokenizer.from_pretrained(model)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [11]:
# Sample sentence reused by the tokenizer and model cells that follow.
sentence = "Transformers are amazing for natural language processing tasks."

In [12]:
# Calling the tokenizer directly returns the full encoding; the printed
# mapping has 'input_ids', 'token_type_ids' and 'attention_mask' entries.
# Named `input_ids` (not `inputs_ids`) to match the spelling used for the
# other encodings in this notebook.
input_ids = tokenizer(sentence)
print(input_ids)

{'input_ids': [101, 19081, 2024, 6429, 2005, 3019, 2653, 6364, 8518, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [13]:
# Split the sentence into token strings only (no IDs are produced here).
tokens = tokenizer.tokenize(sentence)

In [14]:
# Show the token strings produced by tokenizer.tokenize.
print(tokens)

['transformers', 'are', 'amazing', 'for', 'natural', 'language', 'processing', 'tasks', '.']


In [15]:
# Map each token string to its vocabulary ID.
token_ids = tokenizer.convert_tokens_to_ids(tokens)

In [17]:
# Show the IDs; unlike the full encoding printed earlier, this list does not
# start with 101 or end with 102.
print(token_ids)

[19081, 2024, 6429, 2005, 3019, 2653, 6364, 8518, 1012]


In [18]:
# Turn the IDs back into text. Despite its name, `decoded_ids` holds the
# decoded string, not a list of IDs.
decoded_ids = tokenizer.decode(token_ids)
print(decoded_ids)

transformers are amazing for natural language processing tasks.


In [19]:
# 101 is the first ID of the full encoding printed earlier; decode it to see
# which token it stands for.
tokenizer.decode(101)

'[CLS]'

In [20]:
# 102 is the last ID of the full encoding printed earlier; decode it to see
# which token it stands for.
tokenizer.decode(102)

'[SEP]'

In [26]:
# Second checkpoint name, used to load a different tokenizer for comparison.
model2 = "xlnet-base-cased"

In [37]:
# Load the tokenizer for the checkpoint named in `model2`.
tokenizer2 = AutoTokenizer.from_pretrained(model2)

In [42]:
# Full encoding of the same sentence, this time from the second tokenizer.
input_ids = tokenizer2(sentence)

In [43]:
# Show the encoding produced by tokenizer2.
print(input_ids)

{'input_ids': [17, 21442, 270, 41, 3704, 28, 1136, 1243, 4218, 6243, 9, 4, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [44]:
# Rebinds `tokens` (previously the first tokenizer's output) with the
# token strings produced by tokenizer2 for the same sentence.
tokens = tokenizer2.tokenize(sentence)
print(tokens)

['▁', 'Transform', 'ers', '▁are', '▁amazing', '▁for', '▁natural', '▁language', '▁processing', '▁tasks', '.']


In [45]:
# Rebinds `token_ids` with tokenizer2's vocabulary IDs for the tokens above.
token_ids = tokenizer2.convert_tokens_to_ids(tokens)
print(token_ids)

[17, 21442, 270, 41, 3704, 28, 1136, 1243, 4218, 6243, 9]


In [46]:
# 4 is the second-to-last ID in tokenizer2's full encoding printed earlier;
# decode it to see which token it stands for.
tokenizer2.decode(4)

'<sep>'

In [47]:
# 3 is the last ID in tokenizer2's full encoding printed earlier; decode it
# to see which token it stands for.
tokenizer2.decode(3)

'<cls>'

### Hugging Face and PyTorch/TensorFlow


In [49]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [51]:
# Recap the sample sentence and whatever encoding `input_ids` currently holds,
# one per line.
print(sentence, input_ids, sep="\n")

Transformers are amazing for natural language processing tasks.
{'input_ids': [17, 21442, 270, 41, 3704, 28, 1136, 1243, 4218, 6243, 9, 4, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [None]:
# Rebinds `tokenizer` (previously loaded from the checkpoint named in `model`)
# to the tokenizer of the checkpoint that the classification model below uses.
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english"
)

In [53]:
# return_tensors="pt" makes the encoding hold tensors (printed as tensor(...))
# rather than plain Python lists, so it can be fed to the model directly.
input_ids_pt = tokenizer(sentence, return_tensors="pt")
print(input_ids_pt)

{'input_ids': tensor([[  101, 19081,  2024,  6429,  2005,  3019,  2653,  6364,  8518,  1012,
           102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [54]:
# Load the sequence-classification model from the same checkpoint name as the
# tokenizer. This rebinds `model`, which earlier held a checkpoint-name string.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english"
)

In [55]:
# Inference only: run the forward pass without gradient tracking.
with torch.no_grad():
    logits = model(**input_ids_pt).logits

# Reduce along the last (class) axis rather than over the flattened tensor,
# so the index is always a class id that id2label can map. .item() still
# requires exactly one input sequence, which is what this cell feeds in.
predicted_class_id = logits.argmax(dim=-1).item()
# Last expression: the human-readable label is shown as the cell output.
model.config.id2label[predicted_class_id]

'POSITIVE'

### Saving and loading models


In [56]:
# Relative directory that the tokenizer and model are saved to and reloaded from.
model_directory = "my_saved_models"

In [57]:
# Write the tokenizer files into the directory; the returned tuple of file
# paths is displayed as the cell output.
tokenizer.save_pretrained(model_directory)

('my_saved_models/tokenizer_config.json',
 'my_saved_models/special_tokens_map.json',
 'my_saved_models/vocab.txt',
 'my_saved_models/added_tokens.json',
 'my_saved_models/tokenizer.json')

In [60]:
# Save the model into the same directory as the tokenizer files.
model.save_pretrained(model_directory)

In [61]:
# Reload the tokenizer from the local directory instead of a checkpoint name.
my_tokenizer = AutoTokenizer.from_pretrained(model_directory)

In [62]:
# Reload the classification model from the local directory it was saved to.
my_model = AutoModelForSequenceClassification.from_pretrained(model_directory)