### Using Hugging-face using transformers

In [13]:
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification

In [2]:
sentiment_classifier = pipeline('sentiment-analysis')
results = sentiment_classifier(["mary had a little lamb which followed her to school everyday. It was a nuisance"])
print(results)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Device set to use mps:0


[{'label': 'NEGATIVE', 'score': 0.992002010345459}]


In [None]:
# Check each model and its implementation to get its available prameters and how to use it
tokenizer = AutoTokenizer.from_pretrained('dslim/bert-large-NER')
model = AutoModelForTokenClassification.from_pretrained('dslim/bert-large-NER')
custom_analyzer = pipeline('ner', model=model, tokenizer=tokenizer)
results = custom_analyzer("mary had a little lamb which followed her to school everyday. It was a nuisance")
print(results)

In [21]:
zeroshot_classifier = pipeline('zero-shot-classification', model='MoritzLaurer/deberta-v3-base-zeroshot-v1')
labels = ['politics', 'animals', 'education']
results = zeroshot_classifier("Mary had a little lamb which followed her to school everyday. It was a nuisance", labels, multi_label=False)
print(results)


Device set to use mps:0


{'sequence': 'Mary had a little lamb which followed her to school everyday. It was a nuisance', 'labels': ['animals', 'education', 'politics'], 'scores': [0.9984616041183472, 0.0014378840569406748, 0.00010049124830402434]}


## Pre-Trained Tokenizers

In [1]:
from transformers import AutoTokenizer
model = "bert-base-uncased"  # a type of tokenizer model. We wil be using another model later

In [2]:
tokenizer = AutoTokenizer.from_pretrained(model)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [31]:
sent = "I am very sad to hear about the children suffering from mumps."

In [32]:
input_ids = tokenizer(sent) # Tokenizes (turns words into numbered tokens
print(input_ids)

{'input_ids': [101, 1045, 2572, 2200, 6517, 2000, 2963, 2055, 1996, 2336, 6114, 2013, 12954, 4523, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [33]:
# Below we are going one step-by-step
# 1. converting sentence to a list of tokens
tokens = tokenizer.tokenize(sent)
print(tokens)

['i', 'am', 'very', 'sad', 'to', 'hear', 'about', 'the', 'children', 'suffering', 'from', 'mum', '##ps']


In [34]:
#2. Give ids to tokens . note the length is same
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(token_ids)

[1045, 2572, 2200, 6517, 2000, 2963, 2055, 1996, 2336, 6114, 2013, 12954, 4523]


In [13]:
# Decoding for verification 
print(tokenizer.decode(2336))

children


In [21]:
# Decoding a special token. These are special tokens that the tokenizer understands
print(tokenizer.decode(101))

[CLS]


In [22]:
model2 = "xlnet-base-cased"
tokenizer2 = AutoTokenizer.from_pretrained(model2)

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

In [24]:
input_ids = tokenizer2(sent)
print(input_ids)  

{'input_ids': [35, 569, 172, 5694, 22, 1388, 75, 18, 341, 3545, 40, 14695, 3716, 9, 4, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [26]:
# breaking down
tokenizer2 = tokenizer2.tokenize(sent)
print(tokens)

['▁I', '▁am', '▁very', '▁sad', '▁to', '▁hear', '▁about', '▁the', '▁children', '▁suffering', '▁from', '▁mum', 'ps', '.']


In [28]:
token_ids = tokenizer2.convert_tokens_to_ids(tokens)
print(token_ids) # notice the ids are different

[35, 569, 172, 5694, 22, 1388, 75, 18, 341, 3545, 40, 14695, 3716, 9]


In [30]:
# decoding
print(tokenizer2.decode(14695))

mum


In [37]:
# Special tokens
print(tokenizer2.decode(4))  # the ids for special tokens are different here

<sep>
