### HuggingFace Transformers

In [4]:
%pip install -U transformers torch accelerate

Note: you may need to restart the kernel to use updated packages.


In [7]:
import transformers
print(transformers.__version__)

4.57.3


In [9]:
from transformers import pipeline

In [11]:
sentiment_classifier = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


In [13]:
sentiment_classifier("I am excited to learn about large language models")

[{'label': 'POSITIVE', 'score': 0.999734103679657}]

In [15]:
ner = pipeline("ner", model = "dslim/bert-base-NER")

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


In [17]:
ner("His name is John and he works in Montreal for Morgan Stanley")

[{'entity': 'B-PER',
  'score': 0.99863416,
  'index': 4,
  'word': 'John',
  'start': 12,
  'end': 16},
 {'entity': 'B-LOC',
  'score': 0.9995042,
  'index': 9,
  'word': 'Montreal',
  'start': 33,
  'end': 41},
 {'entity': 'B-ORG',
  'score': 0.9975923,
  'index': 11,
  'word': 'Morgan',
  'start': 46,
  'end': 52},
 {'entity': 'I-ORG',
  'score': 0.9981583,
  'index': 12,
  'word': 'Stanley',
  'start': 53,
  'end': 60}]

In [19]:
zeroshot_classifier = pipeline("zero-shot-classification", model = "facebook/bart-large-mnli")

Device set to use cpu


In [21]:
seqeunce_to_classify = "I will graduate from Concordia University this summer."
candidate_labels = ['travel', 'academic', 'cooking']

In [23]:
zeroshot_classifier(seqeunce_to_classify, candidate_labels)

{'sequence': 'I will graduate from Concordia University this summer.',
 'labels': ['academic', 'travel', 'cooking'],
 'scores': [0.970462441444397, 0.025085192173719406, 0.00445235799998045]}

### Pre Trained Tokenizers

In [26]:
from transformers import AutoTokenizer

In [28]:
model = "bert-base-uncased"

In [30]:
tokenizer = AutoTokenizer.from_pretrained(model)

In [32]:
sentence = "I am excited to graduate from Concordia University this summer."

In [34]:
input_ids = tokenizer(sentence)
print (input_ids)

{'input_ids': [101, 1045, 2572, 7568, 2000, 4619, 2013, 24982, 2118, 2023, 2621, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [36]:
tokens = tokenizer.tokenize(sentence)
print (tokens)

['i', 'am', 'excited', 'to', 'graduate', 'from', 'concordia', 'university', 'this', 'summer', '.']


In [38]:
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print (token_ids)

[1045, 2572, 7568, 2000, 4619, 2013, 24982, 2118, 2023, 2621, 1012]


In [40]:
decoded_ids = tokenizer.decode(token_ids)
print (decoded_ids)

i am excited to graduate from concordia university this summer.


In [42]:
tokenizer.decode([101,102])

'[CLS] [SEP]'

In [44]:
model2 = "xlnet-base-cased"

In [46]:
tokenizer2 = AutoTokenizer.from_pretrained(model2)

In [47]:
sentence2 = "I am excited to graduate from Concordia University this summer."

In [50]:
input_ids = tokenizer2(sentence2)
print (input_ids)

{'input_ids': [35, 569, 5564, 22, 3868, 40, 16479, 780, 315, 52, 1148, 9, 4, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [52]:
tokens2 = tokenizer2.tokenize(sentence)
print (tokens2)

['▁I', '▁am', '▁excited', '▁to', '▁graduate', '▁from', '▁Concord', 'ia', '▁University', '▁this', '▁summer', '.']


In [54]:
token_ids2 = tokenizer2.convert_tokens_to_ids(tokens2)
print (token_ids2)

[35, 569, 5564, 22, 3868, 40, 16479, 780, 315, 52, 1148, 9]


In [56]:
decoded_ids2 = tokenizer2.decode(token_ids2)
print (decoded_ids2)

I am excited to graduate from Concordia University this summer.


In [58]:
tokenizer2.decode([4,3])

'<sep><cls>'

### HuggingFace and PyTorch/TensorFlow

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [61]:
print (sentence)
print (input_ids)

I am excited to graduate from Concordia University this summer.
{'input_ids': [35, 569, 5564, 22, 3868, 40, 16479, 780, 315, 52, 1148, 9, 4, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [63]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [69]:
input_ids_pt = tokenizer(sentence, return_tensors = "pt")
print (input_ids_pt)

{'input_ids': tensor([[  101,  1045,  2572,  7568,  2000,  4619,  2013, 24982,  2118,  2023,
          2621,  1012,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [71]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [73]:
with torch.no_grad():
    logits = model(**input_ids_pt).logits

predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]

'POSITIVE'

### Saving and Loading Models

In [76]:
model_directory = "saved_models"

In [80]:
tokenizer.save_pretrained(model_directory)

('saved_models\\tokenizer_config.json',
 'saved_models\\special_tokens_map.json',
 'saved_models\\vocab.txt',
 'saved_models\\added_tokens.json',
 'saved_models\\tokenizer.json')

In [82]:
model.save_pretrained(model_directory)

In [84]:
my_tokenizer = AutoTokenizer.from_pretrained(model_directory)

In [86]:
my_model = AutoModelForSequenceClassification.from_pretrained(model_directory)