# Quick tour

- https://huggingface.co/docs/transformers/en/quicktour

## Pipeline

In [1]:
from transformers import pipeline

classifier = pipeline("sentiment-analysis")

  from .autonotebook import tqdm as notebook_tqdm
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [2]:
classifier("We are very happy to show you the 🤗 Transformers library.")

[{'label': 'POSITIVE', 'score': 0.9997795224189758}]

In [3]:
results = classifier(["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."])
for result in results:
    print(f"label: {result['label']}, with score: {round(result['score'], 4)}")

label: POSITIVE, with score: 0.9998
label: NEGATIVE, with score: 0.5309


In [4]:
import torch
from transformers import pipeline

speech_recognizer = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")

Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You sho

In [6]:
from datasets import load_dataset, Audio

dataset = load_dataset("PolyAI/minds14", name="en-US", split="train")

Generating train split: 563 examples [00:00, 1585.84 examples/s]


In [8]:
dataset = dataset.cast_column("audio", Audio(sampling_rate=speech_recognizer.feature_extractor.sampling_rate))

In [10]:
result = speech_recognizer(dataset[:4]["audio"])
print([d["text"] for d in result])

['I WOULD LIKE TO SET UP A JOINT ACCOUNT WITH MY PARTNER HOW DO I PROCEED WITH DOING THAT', "FONDERING HOW I'D SET UP A JOIN TO HELL T WITH MY WIFE AND WHERE THE AP MIGHT BE", "I I'D LIKE TOY SET UP A JOINT ACCOUNT WITH MY PARTNER I'M NOT SEEING THE OPTION TO DO IT ON THE APSO I CALLED IN TO GET SOME HELP CAN I JUST DO IT OVER THE PHONE WITH YOU AND GIVE YOU THE INFORMATION OR SHOULD I DO IT IN THE AP AN I'M MISSING SOMETHING UQUETTE HAD PREFERRED TO JUST DO IT OVER THE PHONE OF POSSIBLE THINGS", 'HOW DO I FURN A JOINA COUT']


### Use another model and tokenizer in the pipeline

In [11]:
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"

In [12]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [13]:
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
classifier("Nous sommes très heureux de vous présenter la bibliothèque 🤗 Transformers.")

[{'label': '5 stars', 'score': 0.7272652387619019}]

## AutoClass

### AutoTokenizer

In [14]:
from transformers import AutoTokenizer

model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [15]:
encoding = tokenizer("We are very happy to show you the 🤗 Transformers library.")
print(encoding)

{'input_ids': [101, 11312, 10320, 12495, 19308, 10114, 11391, 10855, 10103, 100, 58263, 13299, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [16]:
pt_batch = tokenizer(
    ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."],
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)

### AutoModel

In [17]:
from transformers import AutoModelForSequenceClassification

model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
pt_model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [18]:
pt_outputs = pt_model(**pt_batch)

In [19]:
from torch import nn

pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=1)
print(pt_predictions)

tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725],
        [0.2084, 0.1826, 0.1969, 0.1755, 0.2365]], grad_fn=<SoftmaxBackward0>)


### Save a model

In [20]:
pt_save_directory = "./pt_save_pretrained"
tokenizer.save_pretrained(pt_save_directory)
pt_model.save_pretrained(pt_save_directory)

In [21]:
pt_model = AutoModelForSequenceClassification.from_pretrained("./pt_save_pretrained")

## Custom model builds

In [23]:
from transformers import AutoConfig

my_config = AutoConfig.from_pretrained("distilbert/distilbert-base-uncased", n_heads=12)

In [24]:
from transformers import AutoModel

my_model = AutoModel.from_config(my_config)

## Trainer - a PyTorch optimized training loop

In [26]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="output/",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
)

In [28]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [29]:
from datasets import load_dataset

dataset = load_dataset("rotten_tomatoes")

Downloading readme: 100%|██████████| 7.46k/7.46k [00:00<00:00, 8.61MB/s]
Downloading data: 100%|██████████| 699k/699k [00:02<00:00, 309kB/s]
Downloading data: 100%|██████████| 90.0k/90.0k [00:01<00:00, 67.9kB/s]
Downloading data: 100%|██████████| 92.2k/92.2k [00:01<00:00, 61.9kB/s]
Generating train split: 100%|██████████| 8530/8530 [00:00<00:00, 131145.51 examples/s]
Generating validation split: 100%|██████████| 1066/1066 [00:00<00:00, 531391.50 examples/s]
Generating test split: 100%|██████████| 1066/1066 [00:00<00:00, 492143.98 examples/s]


In [30]:
def tokenize_dataset(dataset):
    return tokenizer(dataset["text"])

In [31]:
dataset = dataset.map(tokenize_dataset, batched=True)

Map: 100%|██████████| 8530/8530 [00:00<00:00, 8880.02 examples/s]
Map: 100%|██████████| 1066/1066 [00:00<00:00, 9171.94 examples/s]
Map: 100%|██████████| 1066/1066 [00:00<00:00, 8712.78 examples/s]


In [32]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [33]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [34]:
trainer.train()

 23%|██▎       | 500/2134 [02:22<07:01,  3.88it/s]

{'loss': 0.435, 'grad_norm': 7.56728458404541, 'learning_rate': 1.5313964386129335e-05, 'epoch': 0.47}


 47%|████▋     | 1000/2134 [04:44<05:18,  3.56it/s]

{'loss': 0.3879, 'grad_norm': 1.24951171875, 'learning_rate': 1.0627928772258671e-05, 'epoch': 0.94}


 70%|███████   | 1500/2134 [07:05<02:55,  3.62it/s]

{'loss': 0.2707, 'grad_norm': 5.844650745391846, 'learning_rate': 5.941893158388004e-06, 'epoch': 1.41}


 94%|█████████▎| 2000/2134 [09:27<00:37,  3.58it/s]

{'loss': 0.2689, 'grad_norm': 54.52899169921875, 'learning_rate': 1.2558575445173386e-06, 'epoch': 1.87}


100%|██████████| 2134/2134 [10:07<00:00,  3.51it/s]

{'train_runtime': 607.4076, 'train_samples_per_second': 28.087, 'train_steps_per_second': 3.513, 'train_loss': 0.33379696682444665, 'epoch': 2.0}





TrainOutput(global_step=2134, training_loss=0.33379696682444665, metrics={'train_runtime': 607.4076, 'train_samples_per_second': 28.087, 'train_steps_per_second': 3.513, 'total_flos': 195974132394480.0, 'train_loss': 0.33379696682444665, 'epoch': 2.0})