In [40]:
import transformers
from datasets import load_dataset
import tokenizers

In [9]:

# Fetch IMDb and hold out 20% of the official training split for validation
# (fixed seed so the split is reproducible).
imdb_dataset = load_dataset("stanfordnlp/imdb")
split = imdb_dataset["train"].train_test_split(train_size=0.8, seed=42)
imdb_train_set = split["train"]
imdb_valid_set = split["test"]
imdb_test_set = imdb_dataset["test"]

# Raw review strings of the training portion, used as tokenizer input below.
train_reviews = [example["text"] for example in imdb_train_set]


### GPT tokenizer

In [30]:
# Pretrained GPT-2 tokenizer applied to the first three training reviews,
# truncating each one to at most 500 tokens.
gpt2_tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
gpt_encoding = gpt2_tokenizer(
    train_reviews[:3],
    truncation=True,
    max_length=500,
)
gpt_encoding  # displays the encoding; fields: input_ids, attention_mask


{'input_ids': [[29391, 35030, 1690, 423, 257, 1688, 8046, 13, 1119, 1690, 1282, 503, 2045, 588, 257, 2646, 4676, 373, 2391, 4624, 319, 262, 3800, 357, 16678, 355, 366, 24732, 10584, 11074, 35727, 18348, 316, 338, 4571, 7622, 262, 2646, 6776, 11, 543, 318, 2592, 2408, 1201, 262, 4286, 4438, 683, 645, 1103, 4427, 13, 7831, 11, 340, 338, 3621, 284, 804, 379, 329, 644, 340, 318, 13, 383, 16585, 1022, 3899, 327, 5718, 290, 12803, 29030, 303, 318, 2407, 10457, 13, 383, 17262, 286, 511, 2776, 389, 6452, 13, 327, 5718, 318, 9623, 355, 1464, 11, 290, 29030, 303, 3011, 530, 286, 465, 1178, 8395, 284, 1107, 719, 29847, 1671, 1220, 6927, 1671, 11037, 40, 22127, 326, 314, 1053, 1239, 1775, 314, 430, 32325, 338, 711, 11, 475, 314, 3285, 326, 9180, 4332, 261, 9659, 338, 16711, 318, 17074, 13, 383, 4226, 318, 8131, 47370, 11, 290, 7622, 345, 25260, 13, 366, 20148, 46670, 1, 318, 281, 36005, 17774, 2646, 11, 290, 318, 7151, 329, 3016, 477, 3296, 286, 3800, 290, 3159, 29847, 1671, 1220, 6927, 1671, 1103

In [28]:
# Take the first ten token IDs of the first encoded review ...
first_review_ids = gpt_encoding["input_ids"][0]
gpt_token_ids = first_review_ids[:10]
print(gpt_token_ids)

# ... and decode them back to text to check the round trip.
gpt2_tokenizer.decode(gpt_token_ids)

[29391, 35030, 1690, 423, 257, 1688, 8046, 13, 1119, 1690]


'Stage adaptations often have a major fault. They often'

### BERT tokenizer

In [31]:
# Pretrained uncased BERT tokenizer applied to the first three training reviews.
bert_tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-uncased")

# Pad to the longest review in the batch, truncate to at most 500 tokens, and
# return PyTorch tensors. The keyword must be spelled `max_length`; a misspelled
# keyword does not set the truncation limit.
bert_encoding = bert_tokenizer(
    train_reviews[:3],
    padding=True,
    truncation=True,
    max_length=500,
    return_tensors="pt",
)


In [39]:
# First ten token IDs and attention-mask values of the first review.
print(bert_encoding["input_ids"][0][:10])
print(bert_encoding["attention_mask"][0][:10])

# Notice that each token ID sequence starts with token 101 ([CLS]) and ends
# with token 102 ([SEP]) (ignoring padding tokens).

# Encode the same reviews again without the special [CLS] and [SEP] tokens.
# `max_length` (correctly spelled) is what sets the 500-token truncation limit.
bert_encoding_1 = bert_tokenizer(
    train_reviews[:3],
    padding=True,
    truncation=True,
    max_length=500,
    return_tensors="pt",
    add_special_tokens=False,
)

print(bert_encoding_1["input_ids"][0][:10])

tensor([  101,  2754, 17241,  2411,  2031,  1037,  2350,  6346,  1012,  2027])
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
tensor([ 2754, 17241,  2411,  2031,  1037,  2350,  6346,  1012,  2027,  2411])


### Wrap a custom tokenizer so that it exposes the same API

In [45]:
# Train a small byte-level BPE tokenizer (vocabulary of 1000) on the IMDb
# training reviews.
bpe_model = tokenizers.models.BPE(unk_token="<unk>")
bpe_tokenizer = tokenizers.Tokenizer(bpe_model)

# The tokenizer is trained on lower-cased text, so it must lower-case its input
# as well; otherwise cased text passed in later would not match the learned
# vocabulary. On already lower-cased text this normalizer is a no-op.
bpe_tokenizer.normalizer = tokenizers.normalizers.Lowercase()

# Byte-level pre-tokenization: spaces are replaced by the "Ġ" marker.
# (A plain whitespace split, tokenizers.pre_tokenizers.Whitespace(), is the
# alternative that was tried before.)
bpe_tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.ByteLevel()

special_tokens = ['<pad>', "<unk>"]
bpe_trainer = tokenizers.trainers.BpeTrainer(vocab_size=1000, special_tokens=special_tokens)

# NOTE: this rebinds `train_reviews` to the lower-cased reviews; cells that run
# after this one (including re-runs of earlier cells) see the lower-cased text.
train_reviews = [review["text"].lower() for review in imdb_train_set]
bpe_tokenizer.train_from_iterator(iterator=train_reviews, trainer=bpe_trainer)

# Look the padding ID up in the trained vocabulary instead of assuming it is 0.
bpe_tokenizer.enable_padding(pad_id=bpe_tokenizer.token_to_id("<pad>"), pad_token="<pad>")
bpe_tokenizer.enable_truncation(max_length=500)







In [51]:
# Wrap the trained BPE tokenizer in the Transformers fast-tokenizer interface.
# The special tokens are declared explicitly so the wrapper knows them.
hf_tokenizer = transformers.PreTrainedTokenizerFast(
    tokenizer_object=bpe_tokenizer,
    pad_token="<pad>",
    unk_token="<unk>",
)

# Request truncation explicitly, as in the GPT-2 and BERT cells: the wrapper
# sets the backend's truncation on each call, so without `truncation=True` the
# 500-token limit configured on `bpe_tokenizer` is not applied.
hf_encodings = hf_tokenizer(
    train_reviews[:3],
    padding=True,
    truncation=True,
    max_length=500,
)

# First ten token IDs of the first review.
hf_encodings['input_ids'][0][:10]

[196, 499, 460, 40, 351, 862, 159, 59, 146, 264]