# 3. Autoencoding Language Models
自动编码语言模型

##### BERT: As one of Autoencoding Language Models

## Preparation for Google Collab

In [1]:
import os
from google.colab import drive

#  挂载 google 云盘
drive.mount("/content/drive")

print(os.getcwd())  # /content

# print(os.listdir("/content/drive/MyDrive/"))

# print(os.listdir("/content/drive/MyDrive/Colab Notebooks"))

# if os.getcwd() != "/content/drive/MyDrive":
#     os.chdir("/content/drive/MyDrive")

# print(os.getcwd())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content


In [2]:
# 提前将 requirements.txt 放在 google 云盘上
!pip install -r /content/drive/MyDrive/requirements.txt



In [3]:
subdir = "ch03a"
work_path = "/content/drive/MyDrive/" + subdir
if not os.path.exists(work_path):
    os.mkdir(work_path)
os.chdir(work_path)
print(os.getcwd())

/content/drive/MyDrive/ch03a


In [4]:
!apt-get install tree && tree -a "./"

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tree is already the newest version (2.0.2-1).
0 upgraded, 0 newly installed, 0 to remove and 23 not upgraded.
[01;34m./[0m
└── [00mIMDB.csv[0m

0 directories, 1 file


## 1 准备数据

1. 将原始数据文件 (IMDB.csv) 重新组织为 corpus.txt 文件
2. 利用 corpus.txt 数据进行分词, 生成 vocab.txt
3. 利用 vocab.txt 训练分词器
4. 利用 corpus.txt 生成数据集
5. 生成 PyTorch DataLoaders
6. 训练参数
7. 模型配置
8. 模型结构
9. 训练
10. 保存模型

In [5]:
!cat IMDB.csv

review,sentiment
"One of the other reviewers has mentioned that after watching just hooked.",positive
"A wonderful little production.",positive
"I thought this was a wonderful way to spend time on a too hot summer weekend",positive
"his parents are fighting all the time.",negative

In [6]:
import pandas as pd

imdb_df = pd.read_csv("IMDB.csv")
imdb_df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production.,positive
2,I thought this was a wonderful way to spend ti...,positive
3,his parents are fighting all the time.,negative


In [7]:
reviews = imdb_df.review.to_string(index=None)
reviews

' One of the other reviewers has mentioned that ...\n                    A wonderful little production.\n I thought this was a wonderful way to spend ti...\n            his parents are fighting all the time.'

In [8]:
with open("corpus.txt", "w") as f:
    f.writelines(reviews)

In [9]:
!tree -a "./"

[01;34m./[0m
├── [00mcorpus.txt[0m
└── [00mIMDB.csv[0m

0 directories, 2 files


In [10]:
!cat "./corpus.txt"

 One of the other reviewers has mentioned that ...
                    A wonderful little production.
 I thought this was a wonderful way to spend ti...
            his parents are fighting all the time.

## 2 tokenizers 分词

1. 将原始数据文件 (IMDB.csv) 重新组织为 corpus.txt 文件
2. 利用 corpus.txt 数据进行分词, 生成 vocab.txt
3. 利用 vocab.txt 训练分词器
4. 利用 corpus.txt 生成数据集
5. 生成 PyTorch DataLoaders
6. 训练参数
7. 模型配置
8. 模型结构
9. 训练
10. 保存模型

In [11]:
from tokenizers import BertWordPieceTokenizer

bert_wordpiece_tokenizer = BertWordPieceTokenizer()
bert_wordpiece_tokenizer

Tokenizer(vocabulary_size=0, model=BertWordPiece, unk_token=[UNK], sep_token=[SEP], cls_token=[CLS], pad_token=[PAD], mask_token=[MASK], clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True, wordpieces_prefix=##)

In [12]:
bert_wordpiece_tokenizer.train("corpus.txt")

In [13]:
bert_wordpiece_tokenizer

Tokenizer(vocabulary_size=63, model=BertWordPiece, unk_token=[UNK], sep_token=[SEP], cls_token=[CLS], pad_token=[PAD], mask_token=[MASK], clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True, wordpieces_prefix=##)

In [14]:
bert_wordpiece_tokenizer.get_vocab()

{'[CLS]': 2,
 '##y': 44,
 '##w': 33,
 'd': 8,
 '##r': 34,
 '[PAD]': 0,
 '##e': 26,
 't': 21,
 '[MASK]': 4,
 '##a': 41,
 '##v': 32,
 '##en': 49,
 'c': 7,
 '.': 5,
 'won': 53,
 '##on': 48,
 's': 20,
 '##c': 45,
 '##d': 31,
 '##er': 47,
 'h': 12,
 '##der': 55,
 '##i': 29,
 'g': 11,
 'n': 16,
 'v': 23,
 'wa': 52,
 'ti': 51,
 'w': 24,
 '##o': 30,
 '##gh': 57,
 'wonder': 60,
 'e': 9,
 '##m': 43,
 'o': 17,
 '[UNK]': 1,
 '##g': 40,
 '##p': 42,
 'y': 25,
 '##s': 35,
 'm': 15,
 '##ti': 50,
 'i': 13,
 '##f': 36,
 '##tion': 59,
 '##l': 38,
 '##n': 27,
 'u': 22,
 'th': 46,
 'p': 18,
 'a': 6,
 '##h': 39,
 '##fu': 56,
 '##ful': 61,
 '##u': 37,
 'wonderful': 62,
 'r': 19,
 '[SEP]': 3,
 '##t': 28,
 'l': 14,
 'the': 58,
 '##is': 54,
 'f': 10}

In [15]:
!mkdir tokenizer

In [16]:
!ls -al tokenizer

total 0


In [17]:
bert_wordpiece_tokenizer.save_model("tokenizer")

['tokenizer/vocab.txt']

In [18]:
!tree -a "./"

[01;34m./[0m
├── [00mcorpus.txt[0m
├── [00mIMDB.csv[0m
└── [01;34mtokenizer[0m
    └── [00mvocab.txt[0m

1 directory, 3 files


In [19]:
!cat "./tokenizer/vocab.txt"

[PAD]
[UNK]
[CLS]
[SEP]
[MASK]
.
a
c
d
e
f
g
h
i
l
m
n
o
p
r
s
t
u
v
w
y
##e
##n
##t
##i
##o
##d
##v
##w
##r
##s
##f
##u
##l
##h
##g
##a
##p
##m
##y
##c
th
##er
##on
##en
##ti
ti
wa
won
##is
##der
##fu
##gh
the
##tion
wonder
##ful
wonderful


##### 验证分词

In [20]:
tokenizer1 = BertWordPieceTokenizer.from_file("tokenizer/vocab.txt")
tokenizer1

Tokenizer(vocabulary_size=63, model=BertWordPiece, unk_token=[UNK], sep_token=[SEP], cls_token=[CLS], pad_token=[PAD], mask_token=[MASK], clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True, wordpieces_prefix=##)

In [21]:
tokenized_sentence = tokenizer1.encode("Oh it works just fine")
tokenized_sentence

Encoding(num_tokens=12, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [22]:
tokenized_sentence.tokens

['[CLS]',
 'o',
 '##h',
 'i',
 '##t',
 '[UNK]',
 '[UNK]',
 'f',
 '##i',
 '##n',
 '##e',
 '[SEP]']

In [23]:
tokenized_sentence = tokenizer1.encode("ohoh i thougt it might be workingg well")
tokenized_sentence

Encoding(num_tokens=24, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [24]:
tokenized_sentence.tokens

['[CLS]',
 'o',
 '##h',
 '##o',
 '##h',
 'i',
 'th',
 '##o',
 '##u',
 '##g',
 '##t',
 'i',
 '##t',
 'm',
 '##i',
 '##gh',
 '##t',
 '[UNK]',
 '[UNK]',
 'w',
 '##e',
 '##l',
 '##l',
 '[SEP]']

In [25]:
!tree -a "./"

[01;34m./[0m
├── [00mcorpus.txt[0m
├── [00mIMDB.csv[0m
└── [01;34mtokenizer[0m
    └── [00mvocab.txt[0m

1 directory, 3 files


## 3 训练分词器

1. 将原始数据文件 (IMDB.csv) 重新组织为 corpus.txt 文件
2. 利用 corpus.txt 数据进行分词, 生成 vocab.txt
3. 利用 vocab.txt 训练分词器
4. 利用 corpus.txt 生成数据集
5. 生成 PyTorch DataLoaders
6. 训练参数
7. 模型配置
8. 模型结构
9. 训练
10. 保存模型

In [26]:
!tree -a "./"

[01;34m./[0m
├── [00mcorpus.txt[0m
├── [00mIMDB.csv[0m
└── [01;34mtokenizer[0m
    └── [00mvocab.txt[0m

1 directory, 3 files


In [27]:
from transformers import BertTokenizerFast

tokenizer2 = BertTokenizerFast.from_pretrained("tokenizer")
tokenizer2

BertTokenizerFast(name_or_path='tokenizer', vocab_size=63, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [28]:
tokenizer2("ohoh i thougt it might be workingg well")

{'input_ids': [2, 17, 39, 30, 39, 13, 46, 30, 37, 40, 28, 13, 28, 15, 29, 57, 28, 1, 1, 24, 26, 38, 38, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [29]:
tokenizer2.save_pretrained("tokenizer2")

('tokenizer2/tokenizer_config.json',
 'tokenizer2/special_tokens_map.json',
 'tokenizer2/vocab.txt',
 'tokenizer2/added_tokens.json',
 'tokenizer2/tokenizer.json')

In [30]:
!tree -a ./

[01;34m./[0m
├── [00mcorpus.txt[0m
├── [00mIMDB.csv[0m
├── [01;34mtokenizer[0m
│   └── [00mvocab.txt[0m
└── [01;34mtokenizer2[0m
    ├── [00mspecial_tokens_map.json[0m
    ├── [00mtokenizer_config.json[0m
    ├── [00mtokenizer.json[0m
    └── [00mvocab.txt[0m

2 directories, 7 files


## 4 数据集

1. 将原始数据文件 (IMDB.csv) 重新组织为 corpus.txt 文件
2. 利用 corpus.txt 数据进行分词, 生成 vocab.txt
3. 利用 vocab.txt 训练分词器
4. 利用 corpus.txt 生成数据集
5. 生成 PyTorch DataLoaders
6. 训练参数
7. 模型配置
8. 模型结构
9. 训练
10. 保存模型

In [31]:
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer2, file_path="corpus.txt", block_size=128
)
# dir(dataset)
# dataset.examples
dataset



<transformers.data.datasets.language_modeling.LineByLineTextDataset at 0x7a3188856950>

In [32]:
dataset[0]

{'input_ids': tensor([ 2, 17, 27, 26, 17, 36, 58, 17, 28, 39, 47, 19, 26, 32, 29, 26, 33, 47,
         35, 12, 41, 35, 15, 49, 59, 26, 31, 46, 41, 28,  5,  5,  5,  3])}

## 5 PyTorch DataLoaders

1. 将原始数据文件 (IMDB.csv) 重新组织为 corpus.txt 文件
2. 利用 corpus.txt 数据进行分词, 生成 vocab.txt
3. 利用 vocab.txt 训练分词器
4. 利用 corpus.txt 生成数据集
5. 生成 PyTorch DataLoaders
6. 训练参数
7. 模型配置
8. 模型结构
9. 训练
10. 保存模型

In [33]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer2, mlm=True, mlm_probability=0.15
)
data_collator

DataCollatorForLanguageModeling(tokenizer=BertTokenizerFast(name_or_path='tokenizer', vocab_size=63, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
), mlm=True, mlm_probability=0.15, pad_to_multiple_of=None, tf_experimental_co

## 6 TrainingArguments

1. 将原始数据文件 (IMDB.csv) 重新组织为 corpus.txt 文件
2. 利用 corpus.txt 数据进行分词, 生成 vocab.txt
3. 利用 vocab.txt 训练分词器
4. 利用 corpus.txt 生成数据集
5. 生成 PyTorch DataLoaders
6. 训练参数
7. 模型配置
8. 模型结构
9. 训练
10. 保存模型

In [34]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="checkout_point",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=128,
    report_to=[]
)
training_args

TrainingArguments(
_n_gpu=0,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.NO,
eval_us

## 7 BertConfig

1. 将原始数据文件 (IMDB.csv) 重新组织为 corpus.txt 文件
2. 利用 corpus.txt 数据进行分词, 生成 vocab.txt
3. 利用 vocab.txt 训练分词器
4. 利用 corpus.txt 生成数据集
5. 生成 PyTorch DataLoaders
6. 训练参数
7. 模型配置
8. 模型结构
9. 训练
10. 保存模型

In [35]:
from transformers import BertConfig, BertForMaskedLM
config = BertConfig()
config

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.48.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

## 8 模型结构

1. 将原始数据文件 (IMDB.csv) 重新组织为 corpus.txt 文件
2. 利用 corpus.txt 数据进行分词, 生成 vocab.txt
3. 利用 vocab.txt 训练分词器
4. 利用 corpus.txt 生成数据集
5. 生成 PyTorch DataLoaders
6. 训练参数
7. 模型配置
8. 模型结构
9. 训练
10. 保存模型

In [36]:
# 模型没有使用任何已经训练好的权重, 参数都是随机初始化的
bert = BertForMaskedLM(config)
bert

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

## 9 Trainer

1. 将原始数据文件 (IMDB.csv) 重新组织为 corpus.txt 文件
2. 利用 corpus.txt 数据进行分词, 生成 vocab.txt
3. 利用 vocab.txt 训练分词器
4. 利用 corpus.txt 生成数据集
5. 生成 PyTorch DataLoaders
6. 训练参数
7. 模型配置
8. 模型结构
9. 训练
10. 保存模型

In [37]:
from transformers import Trainer

trainer = Trainer(
    model=bert, args=training_args, data_collator=data_collator, train_dataset=dataset
)
trainer

<transformers.trainer.Trainer at 0x7a31601ce010>

In [38]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=1, training_loss=10.156402587890625, metrics={'train_runtime': 31.4171, 'train_samples_per_second': 0.127, 'train_steps_per_second': 0.032, 'total_flos': 69913778400.0, 'train_loss': 10.156402587890625, 'epoch': 1.0})

## 10 保存模型

1. 将原始数据文件 (IMDB.csv) 重新组织为 corpus.txt 文件
2. 利用 corpus.txt 数据进行分词, 生成 vocab.txt
3. 利用 vocab.txt 训练分词器
4. 利用 corpus.txt 生成数据集
5. 生成 PyTorch DataLoaders
6. 训练参数
7. 模型配置
8. 模型结构
9. 训练
10. 保存模型

In [39]:
trainer.save_model("BertOut")

In [40]:
!tree -a "./"

[01;34m./[0m
├── [01;34mBertOut[0m
│   ├── [00mconfig.json[0m
│   ├── [00mgeneration_config.json[0m
│   ├── [00mmodel.safetensors[0m
│   └── [00mtraining_args.bin[0m
├── [01;34mcheckout_point[0m
│   └── [01;34mcheckpoint-1[0m
│       ├── [00mconfig.json[0m
│       ├── [00mgeneration_config.json[0m
│       ├── [00mmodel.safetensors[0m
│       ├── [00moptimizer.pt[0m
│       ├── [00mrng_state.pth[0m
│       ├── [00mscheduler.pt[0m
│       ├── [00mtrainer_state.json[0m
│       └── [00mtraining_args.bin[0m
├── [00mcorpus.txt[0m
├── [00mIMDB.csv[0m
├── [01;34mtokenizer[0m
│   └── [00mvocab.txt[0m
└── [01;34mtokenizer2[0m
    ├── [00mspecial_tokens_map.json[0m
    ├── [00mtokenizer_config.json[0m
    ├── [00mtokenizer.json[0m
    └── [00mvocab.txt[0m

5 directories, 19 files


## 使用不同参数的 BertConfig 实例化模型并训练

In [41]:
from transformers import BertConfig

config = BertConfig()
config

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.48.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [42]:
tiny_bert_config = BertConfig(
    max_position_embeddings=512,
    hidden_size=128,
    num_attention_heads=2,
    num_hidden_layers=2,
    intermediate_size=512,
)
tiny_bert_config

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 2,
  "num_hidden_layers": 2,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.48.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [43]:
tiny_bert = BertForMaskedLM(tiny_bert_config)
trainer = Trainer(
    model=tiny_bert,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)
trainer.train()

Step,Training Loss


TrainOutput(global_step=1, training_loss=10.408717155456543, metrics={'train_runtime': 0.9756, 'train_samples_per_second': 4.1, 'train_steps_per_second': 1.025, 'total_flos': 362377440.0, 'train_loss': 10.408717155456543, 'epoch': 1.0})

## tensorflow

In [44]:
from transformers import TFBertModel, BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
tokenizer

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [45]:
bert = TFBertModel.from_pretrained("bert-base-uncased")
bert

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

<transformers.models.bert.modeling_tf_bert.TFBertModel at 0x7a3160d74590>

In [46]:
bert.layers

[<transformers.models.bert.modeling_tf_bert.TFBertMainLayer at 0x7a315f6c7fd0>]

In [47]:
tokenized_text = tokenizer.batch_encode_plus(
    ["hello how is it going with you", "lets test it"],
    return_tensors="tf",
    max_length=256,
    truncation=True,
    pad_to_max_length=True,
)
tokenized_text



{'input_ids': <tf.Tensor: shape=(2, 256), dtype=int32, numpy=
array([[  101,  7592,  2129,  2003,  2009,  2183,  2007,  2017,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0, 

In [48]:
bert(tokenized_text)

TFBaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=<tf.Tensor: shape=(2, 256, 768), dtype=float32, numpy=
array([[[ 1.00471482e-01,  6.77027702e-02, -8.33593458e-02, ...,
         -4.93304521e-01,  1.16539404e-01,  2.26647347e-01],
        [ 3.23624790e-01,  3.70718509e-01,  6.14685833e-01, ...,
         -6.27267718e-01,  3.79083008e-01,  7.05303252e-02],
        [ 1.99533507e-01, -8.75509262e-01, -6.47860318e-02, ...,
         -1.28087141e-02,  3.07651460e-01, -2.07329299e-02],
        ...,
        [-6.53300136e-02,  1.19046159e-01,  5.76847076e-01, ...,
         -2.95460641e-01,  2.49744691e-02,  1.13964200e-01],
        [-2.64715314e-01, -7.86391348e-02,  5.47280669e-01, ...,
         -1.37515366e-01, -5.94692305e-02, -5.17934039e-02],
        [-2.44958907e-01, -1.14799649e-01,  5.92174232e-01, ...,
         -1.56881928e-01, -3.39757986e-02, -8.46138969e-02]],

       [[ 2.94559058e-02,  2.30868489e-01,  2.92651713e-01, ...,
         -1.30421668e-01,  1.89659417e-01,  

In [49]:
from tensorflow import keras
import tensorflow as tf

max_length = 256
tokens = keras.layers.Input(shape=(max_length,), dtype=tf.dtypes.int32)
masks = keras.layers.Input(shape=(max_length,), dtype=tf.dtypes.int32)
embedding_layer = bert.layers[0]([tokens, masks])[0][:, 0, :]
dense = tf.keras.layers.Dense(units=2, activation="softmax")(embedding_layer)

model = keras.Model([tokens, masks], dense)
model

<tf_keras.src.engine.functional.Functional at 0x7a314acc2350>

In [50]:
tokenized = tokenizer.batch_encode_plus(
    ["hello how is it going with you", "hello how is it going with you"],
    return_tensors="tf",
    max_length=max_length,
    truncation=True,
    pad_to_max_length=True,
)
tokenized

{'input_ids': <tf.Tensor: shape=(2, 256), dtype=int32, numpy=
array([[ 101, 7592, 2129, 2003, 2009, 2183, 2007, 2017,  102,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0, 

In [51]:
model([tokenized["input_ids"], tokenized["attention_mask"]])

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[0.61480767, 0.38519228],
       [0.61480767, 0.38519228]], dtype=float32)>

In [52]:
model.compile(optimizer="Adam", loss="categorical_crossentropy", metrics=["accuracy"])
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 256)]                0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 256)]                0         []                            
                                                                                                  
 bert (TFBertMainLayer)      TFBaseModelOutputWithPooli   1094822   ['input_1[0][0]',             
                             ngAndCrossAttentions(last_   40         'input_2[0][0]']             
                             hidden_state=(None, 256, 7                                           
                             68),                                                             

In [53]:
model.layers[2].trainable = False

In [54]:
import pandas as pd

imdb_df = pd.read_csv("IMDB.csv")
reviews = list(imdb_df.review)
tokenized_reviews = tokenizer.batch_encode_plus(
    reviews,
    return_tensors="tf",
    max_length=max_length,
    truncation=True,
    pad_to_max_length=True,
)

import numpy as np

train_split = int(0.8 * len(tokenized_reviews["attention_mask"]))
train_tokens = tokenized_reviews["input_ids"][:train_split]
test_tokens = tokenized_reviews["input_ids"][train_split:]
train_masks = tokenized_reviews["attention_mask"][:train_split]
test_masks = tokenized_reviews["attention_mask"][train_split:]
sentiments = list(imdb_df.sentiment)
labels = np.array(
    [[0, 1] if sentiment == "positive" else [1, 0] for sentiment in sentiments]
)
train_labels = labels[:train_split]
test_labels = labels[train_split:]



In [55]:
# 模型没有任何参数, 从头开始训练
model.fit([train_tokens, train_masks], train_labels, epochs=5)

Epoch 1/5




Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tf_keras.src.callbacks.History at 0x7a315dd0de90>

In [56]:
!tree -a "./"

[01;34m./[0m
├── [01;34mBertOut[0m
│   ├── [00mconfig.json[0m
│   ├── [00mgeneration_config.json[0m
│   ├── [00mmodel.safetensors[0m
│   └── [00mtraining_args.bin[0m
├── [01;34mcheckout_point[0m
│   └── [01;34mcheckpoint-1[0m
│       ├── [00mconfig.json[0m
│       ├── [00mgeneration_config.json[0m
│       ├── [00mmodel.safetensors[0m
│       ├── [00moptimizer.pt[0m
│       ├── [00mrng_state.pth[0m
│       ├── [00mscheduler.pt[0m
│       ├── [00mtrainer_state.json[0m
│       └── [00mtraining_args.bin[0m
├── [00mcorpus.txt[0m
├── [00mIMDB.csv[0m
├── [01;34mtokenizer[0m
│   └── [00mvocab.txt[0m
└── [01;34mtokenizer2[0m
    ├── [00mspecial_tokens_map.json[0m
    ├── [00mtokenizer_config.json[0m
    ├── [00mtokenizer.json[0m
    └── [00mvocab.txt[0m

5 directories, 19 files
