## 1. tokenizer, 构造输入
- tokenizer, model: 相匹配, tokenizer outputs => model input
- tokenizer
    - tokenizer.encode = tokenizer.tokenize + tokenizer.convert_tokens_to_ids + `[CLS]` and `[SEP]`
    - tokenizer.decode
    - tokenizer 的工作原理其实就是查 tokenizer.vocab（词表）
    - attention

In [1]:
# Checkpoint identifier passed to the `from_pretrained` loaders.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

# Three short example sentences used as tokenizer / model input.
test_sentences = [
    "today is not that bad",
    "today is so bad",
    "I am so happy",
]

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [3]:
# Load the tokenizer for the checkpoint named by model_name, so that its
# outputs match what the model loaded from the same name expects.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [4]:
# Encoding a whole list of strings in one call needs the extra arguments
# below: truncation/max_length cap the sequence length, padding=True pads
# the rows to a common length, and return_tensors='pt' yields tensors.
batch_input = tokenizer(
    test_sentences,
    truncation=True,
    max_length=256,
    padding=True,
    return_tensors='pt',
)

In [5]:
# Display the batched encoding: 'input_ids' and 'attention_mask' tensors,
# one row per sentence; padded positions have id 0 and mask 0.
batch_input

{'input_ids': tensor([[ 101, 2651, 2003, 2025, 2008, 2919,  102],
        [ 101, 2651, 2003, 2061, 2919,  102,    0],
        [ 101, 1045, 2572, 2061, 3407,  102,    0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 0]])}

In [6]:
# Calling the tokenizer directly turns text into the model's input dict
# (plain Python lists of 'input_ids' and 'attention_mask').
tokenizer(test_sentences[0])  # a single sentence needs no extra arguments

{'input_ids': [101, 2651, 2003, 2025, 2008, 2919, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [7]:
# encode(): sentence -> list of vocabulary ids.
# The ids for [CLS] and [SEP] are added automatically at the start and end.
ids = tokenizer.encode(test_sentences[0]) 
ids 

[101, 2651, 2003, 2025, 2008, 2919, 102]

In [8]:
# tokenize(): split the sentence into token strings; no special tokens
# ([CLS]/[SEP]) are added at this step.
tokens = tokenizer.tokenize(test_sentences[0]) 
tokens

['today', 'is', 'not', 'that', 'bad']

In [9]:
# Map each token string to its vocabulary id. Unlike encode(), the result
# does not include the [CLS] and [SEP] ids.
tokenizer.convert_tokens_to_ids(tokens) # without [CLS] and [SEP]

[2651, 2003, 2025, 2008, 2919]

In [10]:
# decode(): ids -> text. The special tokens are kept in the decoded string.
tokenizer.decode([101, 2651, 2003, 2025, 2008, 2919, 102])

'[CLS] today is not that bad [SEP]'

In [11]:
# Look up the vocabulary ids of the tokenizer's special tokens. The result
# follows the order of special_tokens_map. list(...) copies the dict values
# directly instead of going through an identity comprehension.
tokenizer.convert_tokens_to_ids(list(tokenizer.special_tokens_map.values()))

[100, 102, 0, 101, 103]

In [12]:
# Two padding strategies on the same batch:
#   padding='max_length' with max_length=32 pads every row to max_length;
#   padding=True pads to the longest sequence in the batch.
# Only the value of the last expression in a cell is displayed, so the
# output shown below (rows of width 7) comes from the second call; the
# first call's result is discarded.
tokenizer(test_sentences, max_length=32, truncation=True, padding='max_length', return_tensors='pt')
tokenizer(test_sentences, truncation=True, padding=True, return_tensors='pt')

{'input_ids': tensor([[ 101, 2651, 2003, 2025, 2008, 2919,  102],
        [ 101, 2651, 2003, 2061, 2919,  102,    0],
        [ 101, 1045, 2572, 2061, 3407,  102,    0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 0]])}

## 2. 模型构造

In [13]:
# Load the sequence-classification model from the same checkpoint name
# that was used for the tokenizer.
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [14]:
# Forward pass: unpack the tokenizer output (input_ids, attention_mask) as
# keyword arguments. The result is a SequenceClassifierOutput whose logits
# have one row per input sentence and two columns.
# NOTE(review): the logits carry a grad_fn, i.e. autograd is tracking this
# call; for inference only, consider running it under torch.no_grad().
model(**batch_input)

SequenceClassifierOutput(loss=None, logits=tensor([[-3.4620,  3.6118],
        [ 4.7508, -3.7899],
        [-4.3472,  4.6912]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

## 3. 一个文本语料

- newsgroups_train.DESCR
- newsgroups_train.data
- newsgroups_train.target
- newsgroups_train.target_names

In [15]:
from sklearn.datasets import fetch_20newsgroups
# Fetch the training split of the 20 Newsgroups corpus; the returned object
# exposes .DESCR, .data, .target and .target_names.
newsgroups_train = fetch_20newsgroups(subset='train')

In [16]:
# Number of labels in the training split.
len(newsgroups_train.target)

11314

In [17]:
# Number of documents in the training split; equals the number of labels.
len(newsgroups_train.data)

11314

In [18]:
# Statistics: count how many documents carry each target label id.
from collections import Counter
Counter(newsgroups_train.target)

Counter({10: 600,
         15: 599,
         8: 598,
         9: 597,
         11: 595,
         7: 594,
         13: 594,
         14: 593,
         5: 593,
         2: 591,
         12: 591,
         3: 590,
         6: 585,
         1: 584,
         4: 578,
         17: 564,
         16: 546,
         0: 480,
         18: 465,
         19: 377})