In [1]:
from transformers import AutoConfig, AutoModelForMaskedLM, AutoTokenizer, \
    LineByLineTextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments,RobertaConfig
from sklearn.model_selection import train_test_split
import os

In [2]:
transformers_model_name = "hfl/chinese-roberta-wwm-ext"
corpus_file_path = "./data/corpus/all_corpus.txt"
force_download=False
cache_dir='../data/download_transformer_models'
max_sentence_len = 510
random_state=32
valid_size=0.25

In [5]:
config = AutoConfig.from_pretrained(transformers_model_name, type_vocab_size=2,force_download=force_download,cache_dir=cache_dir)

In [6]:
config

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "directionality": "bidi",
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": 21128
}

In [7]:
tokenizer = AutoTokenizer.from_pretrained(transformers_model_name,force_download=force_download,cache_dir=cache_dir )

In [8]:
tokenizer.encode('这是transformers的微调方法')

[101,
 6821,
 3221,
 162,
 10477,
 8118,
 12725,
 8755,
 4638,
 2544,
 6444,
 3175,
 3791,
 102]

In [9]:
model = AutoModelForMaskedLM.from_pretrained(transformers_model_name, config=config,force_download=force_download,cache_dir=cache_dir)

Some weights of the model checkpoint at hfl/chinese-roberta-wwm-ext were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
dataset = LineByLineTextDataset(tokenizer=tokenizer, file_path=corpus_file_path, block_size=max_sentence_len )

In [11]:
train_set, valid_set = train_test_split(dataset, test_size=valid_size, random_state=random_state)

In [12]:
train_set[:1]

[tensor([ 101, 2573, 4403, 5826, 4567,  137,  126,  119,  687, 4562, 3416, 4649,
         5502, 2573, 4403, 5826, 4567, 1909, 2108, 1962, 1355, 8024, 1914, 6224,
          754, 5503, 5523, 1036, 4997,  511, 2573, 4403, 5826, 4567,  137,  712,
         6225,  677, 3187, 3209, 3227, 2697, 6230, 8024, 3300, 3198, 4924, 4573,
         8024, 2382,  680, 5273, 4589, 2400, 2100,  511,  102])]

In [13]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

In [14]:
training_args = TrainingArguments(
        output_dir="../data/finetune_transformer_models/",
        logging_dir='../saved/finetune_logging',
        logging_steps=500,
        overwrite_output_dir=True,
        weight_decay=0.01,
        adam_epsilon=1e-6,
        learning_rate=2e-5,
        num_train_epochs=5,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=32,
        max_grad_norm=5.0,
        save_steps=1000,
        save_total_limit=1,
        gradient_accumulation_steps=32,
        evaluate_during_training=True,
        do_train=True,
        do_eval=True,
        eval_steps=1000,

    )



In [15]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_set,
    eval_dataset=valid_set,

)
trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=5595.0, style=ProgressStyle(description_w…

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)






HBox(children=(FloatProgress(value=0.0, description='Iteration', max=5595.0, style=ProgressStyle(description_w…

{'loss': 1.3118284912109375, 'learning_rate': 8.505747126436782e-06, 'epoch': 2.8693476318141196, 'total_flos': 21444965876755968, 'step': 500}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=5595.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=5595.0, style=ProgressStyle(description_w…





TrainOutput(global_step=870, training_loss=1.2564043417744253)