In [1]:
import os

# 获取当前工作目录
current_directory = os.getcwd()
print(f"Current directory before change: {current_directory}")

# 要更改的目标目录
target_directory = 'NLP-Tutorial-How-to-be-Shakesapeare/Leave-Me-Alone'

# 如果当前目录不是目标目录，则更改当前工作目录
if not current_directory.endswith(target_directory):
    os.chdir(target_directory)
    print(f"Directory changed to: {target_directory}")
else:
    print("Already in the target directory.")

# 获取更改后的当前工作目录地址
new_directory = os.getcwd()
print(f"Current directory after change: {new_directory}")


Current directory before change: /teamspace/studios/this_studio
Directory changed to: NLP-Tutorial-How-to-be-Shakesapeare/Leave-Me-Alone
Current directory after change: /teamspace/studios/this_studio/NLP-Tutorial-How-to-be-Shakesapeare/Leave-Me-Alone


In [2]:
# !pip install transformers
from transformers import BertTokenizer, BertForPreTraining

# 加载预训练的BERT分词器和模型
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForPreTraining.from_pretrained('bert-base-uncased')




将文本数据转换为 BERT 所需的格式，包括掩蔽语言模型 (Masked Language Model, MLM) 和下一句预测 (Next Sentence Prediction, NSP)，是因为这些任务是 BERT 预训练的核心组成部分。

### 掩蔽语言模型 (Masked Language Model, MLM)
MLM 是 BERT 预训练过程中的关键任务。具体步骤如下：
1. **掩蔽部分词汇**：在输入序列中随机掩蔽一些词汇，使用特殊的 `[MASK]` 标记替换它们。
2. **预测被掩蔽的词汇**：模型尝试根据上下文预测这些被掩蔽的词汇。

这种任务的好处是模型可以利用双向上下文信息来理解词汇之间的关系，而不仅仅是前向或后向单向信息。这种双向上下文的理解对于许多 NLP 任务是非常重要的。

### 下一句预测 (Next Sentence Prediction, NSP)
NSP 是 BERT 预训练过程中的另一项任务。具体步骤如下：
1. **句子对**：对于每个训练样本，模型接收两个句子。
2. **预测句子关系**：模型需要预测第二个句子是否是第一个句子的自然续写。

这种任务的好处是模型可以学习到句子级别的关系和连贯性，从而在处理需要理解句子关系的任务（如问答和自然语言推理）时表现得更好。


In [3]:
import os
from datasets import Dataset, load_from_disk
import torch
import pandas as pd
import re
from transformers import BertTokenizer


# 定义清理函数
def clean_text(text):
    text = re.sub(r'<br\s*/?>', ' ', text)  # 移除HTML换行标签
    text = re.sub(r'<.*?>', '', text)       # 移除其他HTML标签
    text = re.sub(r'[^\u4e00-\u9fa5A-Za-z0-9\s]+', '', text)  # 移除特殊字符，保留中文、英文、数字和空格
    return text

# 读取处理后的CSV文件
processed_df = pd.read_csv('processed_unsupervised_data.csv')
processed_df['text'] = processed_df['text'].apply(clean_text)

# 转换为Dataset对象
unsupervised_dataset = Dataset.from_pandas(processed_df)

# 定义数据预处理函数
def preprocess_data(examples):
    encoding = tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)
    input_ids = torch.tensor(encoding['input_ids'])
    
    # 创建掩蔽语言模型任务的标签
    labels = input_ids.clone()
    
    # 80%的时间替换为[Mask]
    rand = torch.rand(input_ids.shape)
    mask_arr = (rand < 0.15) * (input_ids != tokenizer.cls_token_id) * (input_ids != tokenizer.sep_token_id) * (input_ids != tokenizer.pad_token_id)
    selection = []
    for i in range(input_ids.shape[0]):
        selection.append(
            torch.flatten(mask_arr[i].nonzero()).tolist()
        )
    for i in range(input_ids.shape[0]):
        input_ids[i, selection[i]] = tokenizer.mask_token_id

    # 创建下一句预测的标签
    next_sentence_label = torch.zeros(input_ids.shape[0], dtype=torch.long)
    
    encoding['labels'] = labels.tolist()  # 将 tensor 转回列表
    encoding['input_ids'] = input_ids.tolist()  # 将 tensor 转回列表
    encoding['next_sentence_label'] = next_sentence_label.tolist()
    return encoding

# 检查是否存在预处理后的数据集
preprocessed_data_path = 'preprocessed_unsupervised_data'

if os.path.exists(preprocessed_data_path):
    print(f"Loading preprocessed data from {preprocessed_data_path}...")
    unsupervised_dataset = load_from_disk(preprocessed_data_path)
else:
    print("Preprocessing data...")
    # 预处理数据集
    unsupervised_dataset = unsupervised_dataset.map(preprocess_data, batched=True)
    unsupervised_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels', 'next_sentence_label'])
    
    # 保存预处理后的数据集
    unsupervised_dataset.save_to_disk(preprocessed_data_path)
    print(f"Preprocessed data saved to {preprocessed_data_path}")


Loading preprocessed data from preprocessed_unsupervised_data...


In [4]:
from datasets import load_from_disk

# 从磁盘加载预处理后的数据集
preprocessed_data_path = 'preprocessed_unsupervised_data'
unsupervised_dataset = load_from_disk(preprocessed_data_path)

# 设置数据格式
unsupervised_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels', 'next_sentence_label'])


In [None]:
from transformers import BertForPreTraining, Trainer, TrainingArguments
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import Dataset

# 定义计算指标的函数
def compute_metrics(p):
    preds = np.argmax(p.predictions[0], axis=1)
    labels = p.label_ids
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# 自定义 Trainer 以重写 compute_loss 方法
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        next_sentence_label = inputs.pop("next_sentence_label")
        outputs = model(**inputs)
        prediction_logits = outputs.prediction_logits
        seq_relationship_logits = outputs.seq_relationship_logits
        
        # 计算掩蔽语言模型的损失
        mlm_loss = torch.nn.functional.cross_entropy(prediction_logits.view(-1, model.config.vocab_size), labels.view(-1))
        
        # 计算下一句预测的损失
        nsp_loss = torch.nn.functional.cross_entropy(seq_relationship_logits.view(-1, 2), next_sentence_label.view(-1))
        
        loss = mlm_loss + nsp_loss
        return (loss, outputs) if return_outputs else loss

# 加载预训练的 BERT 模型
model = BertForPreTraining.from_pretrained('bert-base-uncased')

# 定义训练参数
training_args = TrainingArguments(
    output_dir='./pretrain_results',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    logging_dir='./pretrain_logs',
    logging_steps=500,                 # 记录日志的步数
    evaluation_strategy="no",          # 暂时禁用评估
    save_strategy="epoch",             # 在每个 epoch 结束时保存模型
    fp16=True                          # 启用自动混合精度
)

# 创建一个伪的评估数据集
dummy_data = {'input_ids': [[0]], 'attention_mask': [[0]], 'labels': [[0]], 'next_sentence_label': [0]}
dummy_eval_dataset = Dataset.from_dict(dummy_data)

# 初始化 CustomTrainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=unsupervised_dataset,
    eval_dataset=dummy_eval_dataset,
    compute_metrics=compute_metrics
)

# 开始训练
trainer.train()


In [3]:
import torch
from transformers import BertForPreTraining, BertTokenizer
from datasets import load_from_disk
import pandas as pd

# 从磁盘加载预处理后的数据集
preprocessed_data_path = 'preprocessed_unsupervised_data'
unsupervised_dataset = load_from_disk(preprocessed_data_path)

# 设置数据格式
unsupervised_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels', 'next_sentence_label'])

# 加载训练好的模型和分词器
model_path = 'pretrain_results/checkpoint-8682'
model = BertForPreTraining.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# 选择一些样本进行预测
sample_indices = [0, 1, 2, 3, 4]  # 可以根据需要调整索引
samples = unsupervised_dataset.select(sample_indices)

# 将数据加载到模型中进行预测
model.eval()
with torch.no_grad():
    for i, sample in enumerate(samples):
        input_ids = sample['input_ids'].unsqueeze(0)  # 增加batch维度
        attention_mask = sample['attention_mask'].unsqueeze(0)
        labels = sample['labels'].unsqueeze(0)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        prediction_logits = outputs.prediction_logits
        seq_relationship_logits = outputs.seq_relationship_logits

        # 解码预测结果
        predicted_tokens = torch.argmax(prediction_logits, dim=-1)
        predicted_text = tokenizer.decode(predicted_tokens[0], skip_special_tokens=True)
        
        print(f"Sample {i + 1}:")
        print(f"Original text: {tokenizer.decode(input_ids[0], skip_special_tokens=True)}")
        print(f"Predicted text: {predicted_text}")
        print(f"Sequence relationship prediction: {torch.argmax(seq_relationship_logits, dim=-1).item()}")
        print()




Sample 1:
Original text: this is just a little the play the script are excellent i can compare this movie anything else maybe except the movie leonly played by jean reno and natalieman but what can say about this one this is best movie anne parillaud played in please frankie shes speaking english there to see i mean the story young punk girl nikita into the depraved world the secret government forces has exceptionally over used by americans never the of no return and the la femme nikita tv they cannot believe me videos buy this one do not it buy it btwware of the subtitles of the company translate us release a disgrace if you can understand french get a dubbed youll later
Predicted text: this is just a nice little movie the play the script are excellent i cant compare this movie to anything else maybe except the movie leon brilliantly played by jean reno and natalie portman but what can i say about this one this is the best movie anne parillaud has ever played in and please frankie ava

### 解释

在这段代码中，我们使用了预训练后的BERT模型对一些样本进行了预测，并打印了原始文本、预测文本以及序列关系预测的结果。以下是输出的详细解释：

#### 输出示例

**Sample 1**:
- **Original text (原始文本)**:
  ```text
  this is just a little the play the script are excellent i can compare this movie anything else maybe except the movie leonly played by jean reno and natalieman but what can say about this one this is best movie anne parillaud played in please frankie shes speaking english there to see i mean the story young punk girl nikita into the depraved world the secret government forces has exceptionally over used by americans never the of no return and the la femme nikita tv they cannot believe me videos buy this one do not it buy it btwware of the subtitles of the company translate us release a disgrace if you can understand french get a dubbed youll later
  ```
  这是从数据集中提取的原始文本，包含了一些拼写错误和语法错误，显得有些混乱。

- **Predicted text (预测文本)**:
  ```text
  this is just a nice little movie the play the script are excellent i cant compare this movie to anything else maybe except the movie leon brilliantly played by jean reno and natalie portman but what can i say about this one this is the best movie anne parillaud has ever played in and please frankie avalon think shes speaking english there to see what i mean the story of young punk girl nikita thrown into the depraved world of the secret government forces has been exceptionally over used by americans never mind the point of no return and unlike the la femme nikita tv series they cannot get it it believe me buy the videos buy this one do not rent it buy it btw beware of the subtitles of the production company to translate the us release what a disgrace if you cant understand french get a dubbed version and youll laugh later
  ```
  这是模型根据原始文本预测出的文本。可以看到，模型在一定程度上改正了一些拼写错误和语法错误，使得文本更通顺。例如，将 "leonly" 更正为 "leon"，将 "natalieman" 更正为 "natalie portman"。

- **Sequence relationship prediction (序列关系预测)**: `0`
  这是模型对下一句预测任务的结果。BERT模型通常会进行两个任务：
  - 掩蔽语言模型 (Masked Language Model, MLM)：预测被掩蔽的词。
  - 下一句预测 (Next Sentence Prediction, NSP)：判断两段文本是否相邻。
  
  序列关系预测的结果是 `0`，这通常表示模型预测这两段文本不是相邻的句子。


---

这些文件是使用 `transformers` 库训练模型后生成的检查点文件。这些文件保存了模型的各种状态和参数，以便于在训练过程中断点续训或进行评估。

1. **`config.json`**:
   - 这个文件包含了模型的配置参数。它定义了模型的架构和超参数，例如层数、隐藏层大小、注意力头数等。
   - 在加载模型时，这个文件用于重新构建模型架构。

2. **`model.safetensors`**:
   - 这个文件包含了训练好的模型权重。它是模型实际的参数，用于进行推理和进一步的训练。
   - 这个文件的格式是 `.safetensors`，它是一个高效的二进制格式，用于保存大规模张量数据。

3. **`optimizer.pt`**:
   - 这个文件保存了优化器的状态。优化器的状态包括动量和学习率调度器等信息。
   - 在恢复训练时，这个文件用于恢复优化器的状态，使得训练可以从中断的地方继续。

4. **`rng_state.pth`**:
   - 这个文件保存了随机数生成器的状态，包括 PyTorch 和 NumPy 的随机数生成器状态。
   - 这样可以确保训练过程中断点续训时，随机数序列的一致性，从而保证结果的可重复性。

5. **`scheduler.pt`**:
   - 这个文件保存了学习率调度器的状态。学习率调度器用于动态调整训练过程中的学习率。
   - 在恢复训练时，这个文件用于恢复学习率调度器的状态。

6. **`trainer_state.json`**:
   - 这个文件保存了训练器（Trainer）的状态，包括当前的步数、损失值等信息。
   - 在恢复训练时，这个文件用于恢复训练器的状态，使得训练可以从中断的地方继续。

7. **`training_args.bin`**:
   - 这个文件保存了训练参数。这些参数定义了训练过程中的各种配置，例如批处理大小、学习率、训练轮数等。
   - 在恢复训练时，这个文件用于恢复训练参数，使得训练可以从中断的地方继续。


In [5]:
import os

def get_directory_size(directory):
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(directory):
        for filename in filenames:
            file_path = os.path.join(dirpath, filename)
            if os.path.isfile(file_path):
                total_size += os.path.getsize(file_path)
    return total_size

def format_size(size):
    # 将字节大小格式化为更易读的格式
    for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
        if size < 1024:
            return f"{size:.2f} {unit}"
        size /= 1024

# 指定目录
directory = 'pretrain_results/checkpoint-8682'

# 计算目录总大小
total_size = get_directory_size(directory)

# 格式化大小并输出
formatted_size = format_size(total_size)
print(f"The total size of the files in the directory '{directory}' is {formatted_size}.")


The total size of the files in the directory 'pretrain_results/checkpoint-8682' is 1.23 GB.
