In [1]:
import warnings
warnings.simplefilter('ignore')

import os
import re
import glob

import numpy as np
import pandas as pd

from transformers import (AutoModel, AutoModelForMaskedLM,
                          AutoTokenizer, LineByLineTextDataset,
                          DataCollatorForLanguageModeling,
                          Trainer, TrainingArguments)

In [2]:
text_files = glob.glob('juben/*/*.txt')
text_list = list()
for f in text_files:
    with open(f, 'r') as handler:
        texts = handler.read().split('\n')
        for text in texts:
            text = text.strip()
            if len(text) > 0 and (not re.match(r'[0-9]', text)):
                text_list.append(text)

In [3]:
text_str = '\n'.join(text_list)
with open('juben/text.txt', 'w') as handler:
    handler.write(text_str)

In [4]:
model_name = 'hfl/chinese-roberta-wwm-ext'

model = AutoModelForMaskedLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.save_pretrained('./roberta-wwm-ext-pretrain')

Some weights of the model checkpoint at hfl/chinese-roberta-wwm-ext were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


('./roberta-wwm-ext-pretrain/tokenizer_config.json',
 './roberta-wwm-ext-pretrain/special_tokens_map.json',
 './roberta-wwm-ext-pretrain/vocab.txt',
 './roberta-wwm-ext-pretrain/added_tokens.json',
 './roberta-wwm-ext-pretrain/tokenizer.json')

In [6]:
train_dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="juben/text.txt",  # mention train text file here
    block_size=256)

valid_dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="juben/text.txt",  # mention valid text file here
    block_size=256)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

In [7]:
training_args = TrainingArguments(
    output_dir="./roberta-wwm-ext-pretrain",  # select model path for checkpoint
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # gradient_accumulation_steps=2,
    evaluation_strategy='steps',
    save_total_limit=2,
    eval_steps=5000,
    save_steps=5000,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    load_best_model_at_end=True,
    prediction_loss_only=True,
    report_to="none")

In [8]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset)

trainer.train()

***** Running training *****
  Num examples = 52793
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 9900


Step,Training Loss,Validation Loss
5000,1.5535,1.42022


***** Running Evaluation *****
  Num examples = 52793
  Batch size = 16
Saving model checkpoint to ./roberta-wwm-ext-pretrain/checkpoint-5000
Configuration saved in ./roberta-wwm-ext-pretrain/checkpoint-5000/config.json
Model weights saved in ./roberta-wwm-ext-pretrain/checkpoint-5000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./roberta-wwm-ext-pretrain/checkpoint-5000 (score: 1.4202196598052979).


TrainOutput(global_step=9900, training_loss=1.595300934146149, metrics={'train_runtime': 1576.0117, 'train_samples_per_second': 100.494, 'train_steps_per_second': 6.282, 'total_flos': 6458052129123264.0, 'train_loss': 1.595300934146149, 'epoch': 3.0})

In [9]:
trainer.save_model('./roberta-wwm-ext-pretrain')

Saving model checkpoint to ./roberta-wwm-ext-pretrain
Configuration saved in ./roberta-wwm-ext-pretrain/config.json
Model weights saved in ./roberta-wwm-ext-pretrain/pytorch_model.bin
