In [1]:
import os

# Directory (relative to the notebook's start directory) that holds the
# datasets used by the rest of this notebook.
target_directory = 'NLP-Tutorial-How-to-be-Shakesapeare/Its-a-Long-Story'

# Report where we are before touching anything.
current_directory = os.getcwd()
print(f"Current directory before change: {current_directory}")

# `target_directory` is relative, so a second run of this cell would try to
# descend into it again and fail with FileNotFoundError. Only change directory
# when we are not already inside the target, which makes the cell re-runnable.
already_in_target = os.path.normpath(current_directory).endswith(
    os.path.normpath(target_directory)
)
if not already_in_target:
    os.chdir(target_directory)

# Report the resulting working directory.
new_directory = os.getcwd()
print(f"Current directory after change: {new_directory}")


Current directory before change: /teamspace/studios/this_studio
Current directory after change: /teamspace/studios/this_studio/NLP-Tutorial-How-to-be-Shakesapeare/Its-a-Long-Story


In [2]:
import os
import torch
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments
from datasets import load_from_disk, DatasetDict

# Location of the cached, already-tokenized dataset.
preprocessed_data_path = 'tokenized_datasets'

# Number of examples kept per split so the pipeline runs quickly.
# Set an entry to None to keep the whole split (full training).
subset_sizes = {'train': 5000, 'validation': 200, 'test': 200}

if os.path.exists(preprocessed_data_path):
    # Cache hit: reuse the tokenized dataset written by a previous run.
    print(f"Loading preprocessed data from {preprocessed_data_path}...")
    tokenized_datasets = load_from_disk(preprocessed_data_path)

else:
    print("Preprocessing data...")

    # Raw dataset with 'article', 'highlights' and 'id' columns.
    dataset = load_from_disk('filtered_dataset')

    # BART tokenizer and model for the summarization checkpoint.
    model_name = 'facebook/bart-large-cnn'
    tokenizer = BartTokenizer.from_pretrained(model_name)
    model = BartForConditionalGeneration.from_pretrained(model_name)

    def preprocess_function(examples):
        """Tokenize a batch of articles and their reference summaries.

        Args:
            examples: Batch mapping with 'article' and 'highlights' lists.

        Returns:
            The tokenized articles (truncated to 1024 tokens) with a 'labels'
            entry holding the token ids of the highlights (truncated to 128).
        """
        model_inputs = tokenizer(examples['article'], max_length=1024, truncation=True)

        # `text_target` replaces the deprecated `as_target_tokenizer()` context
        # manager for encoding the target side.
        labels = tokenizer(text_target=examples['highlights'], max_length=128, truncation=True)

        model_inputs['labels'] = labels['input_ids']
        return model_inputs

    # Tokenize with several worker processes and drop the raw text columns.
    tokenized_datasets = dataset.map(
        preprocess_function,
        batched=True,
        num_proc=4,  # number of worker processes; tune to the available CPU cores
        remove_columns=['article', 'highlights', 'id']
    )

    # Cache the full tokenized dataset so later runs can skip preprocessing.
    tokenized_datasets.save_to_disk(preprocessed_data_path)
    print(f"Preprocessed data saved to {preprocessed_data_path}")

# Sub-sample every split on BOTH code paths (previously only the cached path
# was reduced, so a first run silently used the full dataset). The size is
# clamped to the split length so small datasets do not raise an IndexError.
for split_name, split_size in subset_sizes.items():
    if split_size is None:
        continue
    split = tokenized_datasets[split_name]
    tokenized_datasets[split_name] = split.select(range(min(split_size, len(split))))


Loading preprocessed data from tokenized_datasets...


In [3]:
import nltk
from nltk.corpus import wordnet
import nlpaug.augmenter.word as naw
from transformers import DataCollatorForSeq2Seq, Trainer, TrainingArguments, BartForConditionalGeneration, BartTokenizer
import sacremoses

# Keep NLTK resources in a local folder and register it on NLTK's search path.
nltk_data_path = 'nltk_data'
nltk.data.path.append(nltk_data_path)

# Fetch the NLTK resources (WordNet backs the synonym augmenter below;
# the tagger is presumably needed by nlpaug for POS tagging — TODO confirm).
for _nltk_package in ('wordnet', 'averaged_perceptron_tagger'):
    nltk.download(_nltk_package, download_dir=nltk_data_path)

# Build one augmenter per strategy.
synonym_aug = naw.SynonymAug(aug_src='wordnet')  # synonym replacement
random_insert_aug = naw.ContextualWordEmbsAug(
    model_path='bert-base-uncased',
    action="insert",
)  # contextual word insertion
random_swap_aug = naw.RandomWordAug(action="swap")  # swap words
random_delete_aug = naw.RandomWordAug(action="delete")  # delete words
back_translation_aug = naw.BackTranslationAug(
    from_model_name='facebook/wmt19-en-de',
    to_model_name='facebook/wmt19-de-en',
)  # English -> German -> English round trip

# 定义增强函数
def augment_text(text, aug_methods=None, p=0.5):
    """Randomly apply one text augmentation to ``text``.

    Args:
        text: The input string.
        aug_methods: Optional sequence of objects exposing ``augment(text)``.
            When omitted, the five module-level augmenters (synonym, contextual
            insert, swap, delete, back-translation) are used.
        p: Probability of augmenting the text. Defaults to 0.5.

    Returns:
        The augmented string, or ``text`` unchanged when no augmentation was
        drawn or the chosen augmenter returned nothing.
    """
    # Imported here because the notebook never imports `random` at the top;
    # without it the first call raised NameError.
    import random

    if aug_methods is None:
        aug_methods = [
            synonym_aug,
            random_insert_aug,
            random_swap_aug,
            random_delete_aug,
            back_translation_aug,
        ]

    # Skip augmentation with probability (1 - p), or when nothing is available.
    if not aug_methods or random.random() >= p:
        return text

    augmented = random.choice(aug_methods).augment(text)

    # Augmenters may hand back a list of results even for a single input
    # string; unwrap it so callers always receive a str.
    if isinstance(augmented, (list, tuple)):
        augmented = augmented[0] if augmented else text
    return augmented

class DataCollatorForSeq2SeqWithAugmentation(DataCollatorForSeq2Seq):
    """Seq2seq collator that augments and re-tokenizes raw text on the fly.

    Features carrying a raw 'article' string are augmented with
    ``augment_text`` and re-tokenized; features carrying a raw 'highlights'
    string get their 'labels' re-tokenized. Features without those keys are
    passed to the parent collator untouched.

    NOTE(review): the tokenized dataset built in this notebook removes the
    'article' and 'highlights' columns, so these branches only fire when the
    raw text columns are kept in the dataset — confirm the intended data flow.
    """

    def __call__(self, features, return_tensors=None):
        """Augment/re-tokenize raw-text features, then pad and batch them.

        Args:
            features: List of per-example dicts.
            return_tensors: Optional tensor type forwarded to the parent
                collator when given.

        Returns:
            The padded batch produced by ``DataCollatorForSeq2Seq``.
        """
        prepared = []
        for feature in features:
            # Work on a copy so the dataset rows handed in are not mutated.
            feature = dict(feature)

            # Raw strings must not reach the parent collator: they cannot be
            # padded or converted to tensors.
            article = feature.pop('article', None)
            highlights = feature.pop('highlights', None)

            if article is not None:
                # Use the collator's own tokenizer rather than a notebook global.
                encoded = self.tokenizer(augment_text(article), max_length=1024, truncation=True)
                feature['input_ids'] = encoded['input_ids']
                # Keep the mask in sync with the new input_ids length.
                feature['attention_mask'] = encoded['attention_mask']

            if highlights is not None:
                feature['labels'] = self.tokenizer(highlights, max_length=128, truncation=True)['input_ids']

            prepared.append(feature)

        if return_tensors is None:
            return super().__call__(prepared)
        return super().__call__(prepared, return_tensors=return_tensors)


[nltk_data] Downloading package wordnet to nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
Some weights of FSMTForConditionalGeneration were not initialized from the model checkpoint at facebook/wmt19-en-de and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of FSMTForConditionalGeneration were not initialized from the model checkpoint at facebook/wmt19-de-en and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:

# Checkpoint shared by the tokenizer and the model.
model_name = 'facebook/bart-large-cnn'

# Load the BART tokenizer and the conditional-generation model.
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Build the augmentation-aware data collator used by the Trainer.
data_collator = DataCollatorForSeq2SeqWithAugmentation(tokenizer=tokenizer, model=model)




In [14]:
from datasets import load_metric
from sklearn.metrics import accuracy_score

# Load the ROUGE metric used by `compute_metrics` below.
# NOTE(review): `load_metric` is reportedly deprecated in recent `datasets`
# releases in favour of the separate `evaluate` package — confirm against the
# installed version before upgrading.
rouge = load_metric("rouge", trust_remote_code=True)

# 定义计算指标的函数
def compute_metrics(p):
    """Compute ROUGE F-measures (as percentages) for an evaluation pass.

    Args:
        p: Prediction object exposing ``predictions`` (logits, or a tuple whose
            first element is the logits) and ``label_ids``.

    Returns:
        dict mapping each ROUGE key to its mid F-measure multiplied by 100.

    NOTE(review): predictions are the arg-max of the logits, not the output of
    ``generate``; scores may therefore differ from beam-search summaries.
    """
    # Local import: numpy is not imported at the top of the notebook.
    import numpy as np

    # Take the logits and reduce them to the most likely token id per position.
    logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    pred_ids = np.asarray(logits).argmax(-1)

    # The seq2seq collator pads labels with -100, which is not a valid token id
    # and cannot be decoded; swap it for the tokenizer's pad token first.
    label_ids = np.asarray(p.label_ids)
    label_ids = np.where(label_ids == -100, tokenizer.pad_token_id, label_ids)

    # Decode to plain text. The previous version overwrote the decoded
    # predictions with joined raw sub-word tokens (special tokens included),
    # which corrupted the ROUGE inputs.
    decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # Score and keep only the mid F-measure of every ROUGE variant.
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    return {k: v.mid.fmeasure * 100 for k, v in result.items()}


# Hyper-parameters and bookkeeping for the fine-tuning run.
training_args = TrainingArguments(
    # Output locations and logging cadence
    output_dir='./bart_results',
    logging_dir='./bart_logs',
    logging_steps=500,
    # Optimisation
    learning_rate=3e-5,
    weight_decay=0.01,                # weight decay
    warmup_steps=500,                 # learning-rate warm-up steps
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    fp16=True,
    # Evaluate and checkpoint at the end of every epoch
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    load_best_model_at_end=True,
)

# Wire the model, data, and configuration together.
# `compute_metrics` is intentionally left disabled here; only the loss is
# reported during evaluation.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,  # custom collator with optional augmentation
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
)




In [15]:
# Run fine-tuning with the Trainer configured in the previous cell.
trainer.train()

# Evaluate on the validation split once training has finished.
# NOTE(review): with load_best_model_at_end=True the weights evaluated here are
# presumably the best checkpoint rather than the last epoch — the eval_loss
# printed below matches the epoch-1 validation loss in the training log.
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")


Epoch,Training Loss,Validation Loss
1,1.3146,1.758318
2,0.9049,1.939801
3,0.5468,2.237341
4,0.3139,2.608653
5,0.1852,2.87826


Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_

Evaluation results: {'eval_loss': 1.7583175897598267, 'eval_runtime': 3.5504, 'eval_samples_per_second': 56.332, 'eval_steps_per_second': 14.083, 'epoch': 5.0}


---

为了尽快跑通这个 pipeline，我缩减了训练所用的数据量（训练集 5000 条，验证集和测试集各 200 条）。

你可以修改上面读取数据集那部分的代码（即对各个数据划分进行 `select(...)` 采样的部分），改用全量数据进行训练，以获得更好的效果。