# Basic usage of high-level APIs

In [9]:
import os

import datasets
from transformers import pipeline

### Basic sentiment analysis - using pretrained models 

- #### The pipeline will download a default pretrained model (if none is specified, it is DistilBERT base uncased), fine-tuned on SST-2

In [5]:
# Create a sentiment-analysis pipeline without naming a model; the library
# then picks its default checkpoint for the task.
classifier = pipeline(task="sentiment-analysis")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


In [8]:
# Classify a small batch of sentences and report each predicted label
# together with its score rounded to four decimals.
sample_sentences = [
    "We are very happy to show you the 🤗 Transformers library.",
    "We hope you don't hate it.",
]
results = classifier(sample_sentences)
for result in results:
    label = result["label"]
    score = round(result["score"], 4)
    print(f"label: {label}, with score: {score}")

label: POSITIVE, with score: 0.9998
label: NEGATIVE, with score: 0.5309


### Use a specific model and its tokenizer in a pipeline

In [11]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [17]:
# Checkpoint identifier; the same name is used for the tokenizer and the
# classification model so the two stay matched.
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [13]:
# Rebuild the pipeline around the explicitly loaded model/tokenizer pair;
# this rebinds the `classifier` name.
classifier = pipeline(task="sentiment-analysis", model=model, tokenizer=tokenizer)

# The last expression is displayed as the cell output.
french_sentence = "Nous sommes très heureux de vous présenter la bibliothèque 🤗 Transformers."
classifier(french_sentence)

[{'label': '5 stars', 'score': 0.7272651791572571}]

- now we can tokenize a sentence 

In [14]:
# Tokenize a single sentence; the printed encoding holds the input_ids,
# token_type_ids and attention_mask lists.
single_sentence = "We are very happy to show you the 🤗 Transformers library."
encoding = tokenizer(single_sentence)
print(encoding)

{'input_ids': [101, 11312, 10320, 12495, 19308, 10114, 11391, 10855, 10103, 100, 58263, 13299, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


- we can also tokenize a batch with padding 

In [16]:
batch_sentences = [
    "We are very happy to show you the 🤗 Transformers library.",
    "We hope you don't hate it.",
]

# padding=True pads the shorter sentence to the longest one in the batch,
# truncation/max_length cap the sequence length, and return_tensors="pt"
# yields PyTorch tensors.
tokenizer_options = dict(
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)
pt_batch = tokenizer(batch_sentences, **tokenizer_options)
print(pt_batch)

{'input_ids': tensor([[  101, 11312, 10320, 12495, 19308, 10114, 11391, 10855, 10103,   100,
         58263, 13299,   119,   102],
        [  101, 11312, 18763, 10855, 11530,   112,   162, 39487, 10197,   119,
           102,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])}


- #### We can now use the model and the tokenized data for prediction. The model outputs the final activations in the `logits` attribute; apply the softmax function to the logits to retrieve the probabilities

In [20]:
from torch import nn

In [21]:
# Forward pass: the tokenized batch is unpacked into keyword arguments.
pt_outputs = model(**pt_batch)

# Softmax over the last dimension turns each row of logits into
# probabilities.
logits = pt_outputs.logits
pt_predictions = nn.functional.softmax(logits, dim=-1)
print(pt_predictions)

tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725],
        [0.2084, 0.1826, 0.1969, 0.1755, 0.2365]], grad_fn=<SoftmaxBackward0>)


- #### Once you have finished fine-tuning your model, you can save the model weights locally

In [23]:
# Base directory for locally stored models. Set the PRETRAINED_MODELS_DIR
# environment variable to relocate it; the default keeps the original
# location so existing setups behave exactly as before.
pretrained_models_dir = os.environ.get(
    "PRETRAINED_MODELS_DIR", "/Volumes/T7/Data/Pretrained_models"
)
pt_save_directory = os.path.join(pretrained_models_dir, "example")

# Save the tokenizer next to the model weights so the directory can be
# handed to from_pretrained later.
tokenizer.save_pretrained(pt_save_directory)
model.save_pretrained(pt_save_directory)

- #### You can now reload from the saved path

In [24]:
# Reload both artifacts from the directory they were saved to.
pt_model = AutoModelForSequenceClassification.from_pretrained(pt_save_directory)
tokenizer = AutoTokenizer.from_pretrained(pt_save_directory)

#### Models are downloaded to a local cache folder; you can configure its location by following: https://huggingface.co/docs/transformers/v4.17.0/en/installation

#### You can also manually download pretrained model weights locally: https://huggingface.co/models

In [29]:
# Base directory for manually downloaded models. Set the
# PRETRAINED_MODELS_DIR environment variable to relocate it; the default
# keeps the original location so existing setups behave as before.
pretrained_models_dir = os.environ.get(
    "PRETRAINED_MODELS_DIR", "/Volumes/T7/Data/Pretrained_models"
)
bertweet_path = os.path.join(pretrained_models_dir, "bertweet-base-sentiment-analysis")

# Fail early with an actionable message when the weights have not been
# downloaded (or the drive is not mounted).
# NOTE(review): without this check, from_pretrained presumably tries to
# resolve the missing path as a Hub model id, which gives a less helpful error.
if not os.path.isdir(bertweet_path):
    raise FileNotFoundError(
        f"No model directory found at {bertweet_path!r}; download the weights "
        "there or point PRETRAINED_MODELS_DIR at the folder that contains them."
    )

tokenizer = AutoTokenizer.from_pretrained(bertweet_path)
pt_model = AutoModelForSequenceClassification.from_pretrained(bertweet_path)

In [30]:
# Assumes the checkpoint loaded from `bertweet_path` is a BERTweet-base
# model, as its directory name suggests (TODO confirm against its config).
# BERTweet is trained with a maximum sequence length of 128 tokens, so
# truncating at BERT's usual 512 would let over-long inputs reach the model
# and overflow its position embeddings.
BERTWEET_MAX_LENGTH = 128

batch_sentences = [
    "We are very happy to show you the 🤗 Transformers library.",
    "We hope you don't hate it.",
]

# padding=True pads to the longest sentence in the batch, so the result for
# these short inputs is unaffected by the lower truncation limit.
pt_batch = tokenizer(
    batch_sentences,
    padding=True,
    truncation=True,
    max_length=BERTWEET_MAX_LENGTH,
    return_tensors="pt",
)
print(pt_batch)

{'input_ids': tensor([[    0,   134,    41,   249,   225,     9,   258,    14,     6,     3,
         27615, 42593, 10028,     4,     2],
        [    0,   134,   240,    14, 19933,   253,   987,     4,     2,     1,
             1,     1,     1,     1,     1]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]])}
