# transformers

In [1]:
!pip freeze > requirements.txt

In [2]:
import transformers
import torch

In [3]:
transformers.__version__

'4.48.3'

## pipeline

In [4]:
from transformers import pipeline

# help(pipeline)

In [5]:
classifier = pipeline("sentiment-analysis")
classifier

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


<transformers.pipelines.text_classification.TextClassificationPipeline at 0x7c8af557f210>

In [6]:
# help(classifier)

In [7]:
classifier(
    ["We are very happy to show you the Transformers library.", "I hate this so much!"]
)

[{'label': 'POSITIVE', 'score': 0.9997994303703308},
 {'label': 'NEGATIVE', 'score': 0.9994558691978455}]

In [8]:
!ls -al ~/.cache/huggingface/hub/models--distilbert--distilbert-base-uncased-finetuned-sst-2-english

total 24
drwxr-xr-x 6 root root 4096 Feb 24 05:52 .
drwxr-xr-x 4 root root 4096 Feb 24 05:52 ..
drwxr-xr-x 2 root root 4096 Feb 24 05:52 blobs
drwxr-xr-x 3 root root 4096 Feb 24 05:52 .no_exist
drwxr-xr-x 2 root root 4096 Feb 24 05:52 refs
drwxr-xr-x 3 root root 4096 Feb 24 05:52 snapshots


## AutoTokenizer

In [9]:
from transformers import AutoTokenizer

# help(AutoTokenizer)

In [10]:
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
)
tokenizer

DistilBertTokenizerFast(name_or_path='distilbert/distilbert-base-uncased-finetuned-sst-2-english', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [11]:
# help(tokenizer)

In [12]:
raw_inputs = [
    "We are very happy to show you the Transformers library.",
    "I hate this so much!",
]
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
inputs

{'input_ids': tensor([[  101,  2057,  2024,  2200,  3407,  2000,  2265,  2017,  1996, 19081,
          3075,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]])}

### tokenizer 参数

* padding 参数的含义是使多个输入文本长度一致, 在短文本后面追加 0
* truncation 参数的含义是截断, 截断按照指定的长度或者一定的规则进行截断
* return_tensors

### tokenizer

* input_ids
* attention_mask

### decode

In [13]:
tokenizer.decode(
    [101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 19081, 3075, 1012, 102]
)

'[CLS] we are very happy to show you the transformers library. [SEP]'

In [14]:
tokenizer.decode([101, 1045, 5223, 2023, 2061, 2172, 999, 102, 0, 0, 0, 0, 0])

'[CLS] i hate this so much! [SEP] [PAD] [PAD] [PAD] [PAD] [PAD]'

## model

In [15]:
from transformers import AutoModel

model = AutoModel.from_pretrained(
    "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
)
model

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [16]:
model.config

DistilBertConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "distilbert/distilbert-base-uncased-finetuned-sst-2-english",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "finetuning_task": "sst-2",
  "hidden_dim": 3072,
  "id2label": {
    "0": "NEGATIVE",
    "1": "POSITIVE"
  },
  "initializer_range": 0.02,
  "label2id": {
    "NEGATIVE": 0,
    "POSITIVE": 1
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.48.3",
  "vocab_size": 30522
}

In [17]:
raw_inputs = [
    "We are very happy to show you the Transformers library.",
    "I hate this so much!",
]
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")

with torch.no_grad():
    output = model(**inputs)

print("output:", output)

output: BaseModelOutput(last_hidden_state=tensor([[[ 0.6602,  0.3654,  0.2995,  ...,  0.6371,  0.8724, -0.4947],
         [ 1.0887,  0.4034,  0.3167,  ...,  0.3810,  1.3005, -0.3504],
         [ 0.9669,  0.4830,  0.3624,  ...,  0.3719,  0.9534, -0.3543],
         ...,
         [ 0.7038,  0.5192,  0.8491,  ...,  0.7808,  0.3867,  0.0103],
         [ 1.1114,  0.2558,  0.5603,  ...,  0.7455,  0.5657, -0.7366],
         [ 1.2035,  0.3187,  0.5993,  ...,  0.7357,  0.4185, -0.6384]],

        [[-0.2937,  0.7283, -0.1497,  ..., -0.1187, -1.0227, -0.0422],
         [-0.2206,  0.9384, -0.0951,  ..., -0.3643, -0.6605,  0.2407],
         [-0.1536,  0.8988, -0.0728,  ..., -0.2189, -0.8528,  0.0710],
         ...,
         [-0.1761,  0.9389,  0.0433,  ..., -0.0927, -0.8482, -0.1296],
         [-0.2201,  0.8474, -0.0195,  ..., -0.0863, -0.8485, -0.1036],
         [-0.2319,  0.8268, -0.0312,  ..., -0.0764, -0.8509, -0.1043]]]), hidden_states=None, attentions=None)


In [18]:
output.last_hidden_state.shape

torch.Size([2, 13, 768])

### AutoModelForSequenceClassification

In [19]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
)
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [20]:
raw_inputs = [
    "We are very happy to show you the Transformers library.",
    "I hate this so much!",
]
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")

with torch.no_grad():
    output = model(**inputs)

print("output:", output)
print("output.logits:", output.logits)

output: SequenceClassifierOutput(loss=None, logits=tensor([[-4.1329,  4.3811],
        [ 4.1692, -3.3464]]), hidden_states=None, attentions=None)
output.logits: tensor([[-4.1329,  4.3811],
        [ 4.1692, -3.3464]])


### padding 的作用


In [21]:
sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]

batched_id = [[200, 200, 200], [200, 200, tokenizer.pad_token_type_id]]

print(model(torch.tensor(sequence1_ids)).logits)
print(model(torch.tensor(sequence2_ids)).logits)
print(model(torch.tensor(batched_id)).logits)

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)
tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
tensor([[ 1.5694, -1.3895],
        [ 1.3374, -1.2163]], grad_fn=<AddmmBackward0>)


### attention_mask 的作用

In [22]:
sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]

batched_id = [[200, 200, 200], [200, 200, tokenizer.pad_token_type_id]]
attention_mask = [[1, 1, 1], [1, 1, 0]]
print(model(torch.tensor(sequence1_ids)).logits)
print(model(torch.tensor(sequence2_ids)).logits)
print(
    model(torch.tensor(batched_id), attention_mask=torch.tensor(attention_mask)).logits
)

tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)
tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)


### 不同的 padding 方法

```python
 |          padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
 |              Activates and controls padding. Accepts the following values:
 |      
 |              - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
 |                sequence if provided).
 |              - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
 |                acceptable input length for the model if that argument is not provided.
 |              - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
 |                lengths).
```

In [23]:
from transformers.utils import PaddingStrategy

PaddingStrategy

In [24]:
raw_inputs = [
    "We are very happy to show you the Transformers library.",
    "I hate this so much!",
]
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
inputs

{'input_ids': tensor([[  101,  2057,  2024,  2200,  3407,  2000,  2265,  2017,  1996, 19081,
          3075,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]])}

In [25]:
raw_inputs = [
    "We are very happy to show you the Transformers library.",
    "I hate this so much!",
]
inputs = tokenizer(raw_inputs, padding="longest", truncation=True, return_tensors="pt")
inputs

{'input_ids': tensor([[  101,  2057,  2024,  2200,  3407,  2000,  2265,  2017,  1996, 19081,
          3075,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]])}

In [26]:
raw_inputs = [
    "We are very happy to show you the Transformers library.",
    "I hate this so much!",
]
inputs = tokenizer(
    raw_inputs, padding="max_length", truncation=True, return_tensors="pt"
)
inputs

{'input_ids': tensor([[ 101, 2057, 2024,  ...,    0,    0,    0],
        [ 101, 1045, 5223,  ...,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}