In [11]:
import sys
sys.path.append("..")
import thirdparty
from thirdparty import *
from transformers import BertTokenizer, BartForConditionalGeneration, Text2TextGenerationPipeline
from transformers import HfArgumentParser, TrainingArguments, Trainer, set_seed
from datasets import load_dataset, Dataset
os.environ['CUDA_VISIBLE_DEVICES']="0"
os.environ["WANDB_DISABLED"] = "true"

# 模型导入

In [2]:
model_dir = "/root/autodl-tmp/model/"
model_name =  "bart4csc-base-chinese"
tokenizer = BertTokenizer.from_pretrained(os.path.join(model_dir,model_name))
model = BartForConditionalGeneration.from_pretrained(os.path.join(model_dir,model_name))

  return self.fget.__get__(instance, owner)()


# 模型测试

In [3]:
# 进行文本纠错
def correct_text(text, tokenizer, model):
    input_ids = tokenizer.encode(text, return_tensors="pt")
    output_ids = model.generate(input_ids, max_length=128, num_beams=4, early_stopping=True)
    corrected_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return corrected_text
# 示例
test_sentence = "妈麻"
corrected_sentence = correct_text(test_sentence, tokenizer, model)
print(corrected_sentence)

妈 吗


# 导入数据

In [4]:
def read_data(path,num,test_num):
    index=0
    lines=[]
    with open(path) as f:
        for line in f.readlines():
            lines.append(eval(line.strip()))
            index+=1
            if(index == num): break
        # lines=[eval(line.strip()) for line in f.readlines()]
    lines=[s[0].replace("\t","")+"\t"+s[1].replace("\t","") for s in lines]
    return {'text':lines[test_num:num]},{'text':lines[0:test_num]}
data1,data2=read_data("/root/autodl-tmp/data/text-correct/train_data",505000,5000)

In [5]:
def tokenize_dataset(tokenizer, dataset, max_len):
    def convert_to_features(example_batch):
        src_texts = []
        trg_texts = []
        for example in example_batch['text']:
            terms = example.split('\t', 1)
            src_texts.append(terms[0])
            trg_texts.append(terms[1])
        input_encodings = tokenizer.batch_encode_plus(
            src_texts,
            truncation=True,
            padding='max_length',
            max_length=max_len,
        )
        target_encodings = tokenizer.batch_encode_plus(
            trg_texts,
            truncation=True,
            padding='max_length',
            max_length=max_len,
        )

        encodings = {
            'input_ids': input_encodings['input_ids'],
            'labels': target_encodings['input_ids']
        }
        return encodings
    dataset = dataset.map(convert_to_features, batched=True)
    dataset = dataset.remove_columns(['text'])
    return dataset

In [6]:
train_dataset = Dataset.from_dict(data1, split='train')
train_data = tokenize_dataset(tokenizer, train_dataset,128)
test_dataset = Dataset.from_dict(data2, split='test')
test_data = tokenize_dataset(tokenizer, test_dataset,128)

Map:   0%|          | 0/500000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [7]:
print(len(train_dataset),len(test_dataset))
index=16550
print(train_dataset[index])
print(train_data[index])

500000 5000
{'text': '在每一个州内各党选举疑个人\t在每一个州内各党选举一个人'}
{'input_ids': [101, 1762, 3680, 671, 702, 2336, 1079, 1392, 1054, 6848, 715, 4542, 702, 782, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [101, 1762, 3680, 671, 702, 2336, 1079, 1392, 1054, 6848, 715, 671, 702, 782, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


# 训练

In [9]:
training_args = TrainingArguments(
    output_dir='./results',         # output directory 结果输出地址
    num_train_epochs=1,          # total # of training epochs 训练总批次
    per_device_train_batch_size=64,  # batch size per device during training 训练批大小
    per_device_eval_batch_size=32,   # batch size for evaluation 评估批大小
    logging_dir='./logs/rn_log',    # directory for storing logs 日志存储位置
    learning_rate=1e-4,             # 学习率
    save_steps=5000,# 不保存检查点
    logging_steps=1000
)
trainer = Trainer(model=model,args=training_args,train_dataset=train_data,eval_dataset=test_data)
trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
1000,0.016
2000,0.0372
3000,0.036
4000,0.0337
5000,0.0326
6000,0.0313
7000,0.0303


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 102}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 102}


TrainOutput(global_step=7813, training_loss=0.030877372725098717, metrics={'train_runtime': 3192.2428, 'train_samples_per_second': 156.63, 'train_steps_per_second': 2.447, 'total_flos': 3.810852864e+16, 'train_loss': 0.030877372725098717, 'epoch': 1.0})

In [12]:
model.save_pretrained("./finetune-model/bart/")
torch.save(model,"./finetune-model/bart/pytorch_model.bin")

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 102}


# 模型比较

In [17]:
device=get_device()
model_dir = "/root/autodl-tmp/model/"
model_name =  "bart4csc-base-chinese"
tokenizer = BertTokenizer.from_pretrained(os.path.join(model_dir,model_name))
model = BartForConditionalGeneration.from_pretrained(os.path.join(model_dir,model_name))
model_self = torch.load("./finetune-model/bart/pytorch_model.bin").to("cpu")

In [21]:
def correct_text(text, tokenizer, model):
    input_ids = tokenizer.encode(text, return_tensors="pt")
    output_ids = model.generate(input_ids, max_length=128, num_beams=4, early_stopping=True)
    corrected_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return corrected_text
# 示例
test_sentence = "我煤田都在认真xue"
corrected_sentence = correct_text(test_sentence, tokenizer, model)
corrected_sentence_2 = correct_text(test_sentence, tokenizer, model_self)
print("原模型: ",corrected_sentence)
print("微调后模型: ",corrected_sentence_2)

原模型:  你 是 不 是 有 点 过 粪 了
微调后模型:  你 是 不 是 有 点 过 了
