# En-De（英语-德语）语言数据预处理操作
## 基础流程
原始语料 → 清洗过滤 → 分词训练 → 数据加载 → 模型训练 → 评估优化 → 部署上线


## 参考他人的数据分析流程
### 分析问题
1. 自然语言的预处理流程我不熟悉，使用的第三方库我不熟悉
2. torchtext的版本最高是0.18.0，匹配torch的2.3.0，但是torch的2.3.0与我的cuda版本12.2不匹配，不知道安装过程难度是否过大

### 解决方法
我之所以遇到这种问题，是因为 torchtext 已经停止维护；目前应该改用 Hugging Face 的 transformers 库（配合 tokenizers 库）进行数据预处理

数据预处理流程：
- 1. 数据准备：留下本地数据的地址
- 2. 加载 pretrained tokenizer，也可以本地训练然后加载 tokenizer
    - 如何使用本地语料库训练tokenizer:
        - 1. 加载语料库
        - 2. 定义tokenizer参数并保存tokenizer
        - 3. 基于语料库训练tokenizer
        - 4. 加载并测试自定义（custom）tokenizer
        - 5. 
- 3. 定义数据读取与预处理函数
- 4. 数据预处理与加载：创建dataset
- 5. 创建Dataloader批量加载数据

## Exploration epoch 1:
使用预训练tokenizer进行数据预处理

In [7]:
# 1. data path
# Absolute paths to the two sides of the parallel corpus: the ".en" file is
# used as the source language and the ".de" file as the target language.
source_file = "/harddisk1/SZC-Project/NLP-learning/Transformer/Transformer-pytorch-from-scratch/europarl-v7.de-en.en"
target_file = "/harddisk1/SZC-Project/NLP-learning/Transformer/Transformer-pytorch-from-scratch/europarl-v7.de-en.de"

# 2. loading the language dataset
def load_corpus(file_path):
    """Return the non-blank lines of a UTF-8 text file, whitespace-stripped.

    Lines that are empty after stripping are dropped, so positions in the
    returned list do not necessarily match line numbers in the file.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of stripped, non-empty strings in file order.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        # filter(None, ...) discards the strings that became empty after strip()
        return list(filter(None, map(str.strip, handle)))

# Read both language files fully into memory; load_corpus drops blank lines
# from each file on its own.
en_corpus = load_corpus(source_file)
de_corpus = load_corpus(target_file)


In [13]:
len(en_corpus)

1911843

In [15]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

def train_tokenizer(corpus, vocab_size, output_path, language_name):
    """Train a BPE tokenizer on ``corpus`` and save it as a JSON file.

    The tokenizer is written to ``{output_path}/{language_name}_tokenizer.json``
    and a confirmation line with that path is printed. Nothing is returned.

    Args:
        corpus: Iterable of training sentences, passed to ``train_from_iterator``.
        vocab_size: Target vocabulary size handed to the BPE trainer.
        output_path: Directory that receives the JSON file.
        language_name: Prefix of the output file name (e.g. ``'en'``).
    """
    # Tokens reserved in the vocabulary before any merges are learned
    reserved_tokens = ["[PAD]", "[UNK]", "[BOS]", "[EOS]"]

    # BPE model; "[UNK]" is the token used for unknown input
    bpe_tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

    # Pre-tokenizer: splits raw text into pieces before BPE is applied
    bpe_tokenizer.pre_tokenizer = Whitespace()

    # Learn the merges from the corpus
    bpe_trainer = BpeTrainer(vocab_size=vocab_size, special_tokens=reserved_tokens)
    bpe_tokenizer.train_from_iterator(corpus, trainer=bpe_trainer)

    # Persist the trained tokenizer and report where it went
    bpe_tokenizer.save(f"{output_path}/{language_name}_tokenizer.json")
    print(f"分词器已保存到 {output_path}/{language_name}_tokenizer.json")

# Train one tokenizer per language with a 30000-entry vocabulary; each call
# writes "<language_name>_tokenizer.json" into the given output directory.
train_tokenizer(en_corpus, vocab_size=30000, output_path="/harddisk1/SZC-Project/NLP-learning/Transformer/Transformer-pytorch-from-scratch", language_name='en')
train_tokenizer(de_corpus, vocab_size=30000, output_path="/harddisk1/SZC-Project/NLP-learning/Transformer/Transformer-pytorch-from-scratch", language_name='de')





分词器已保存到 /harddisk1/SZC-Project/NLP-learning/Transformer/Transformer-pytorch-from-scratch/en_tokenizer.json



分词器已保存到 /harddisk1/SZC-Project/NLP-learning/Transformer/Transformer-pytorch-from-scratch/de_tokenizer.json


In [18]:
# Locations of the tokenizer JSON files written by train_tokenizer
en_tokenizer_path = "/harddisk1/SZC-Project/NLP-learning/Transformer/Transformer-pytorch-from-scratch/en_tokenizer.json"
de_tokenizer_path = "/harddisk1/SZC-Project/NLP-learning/Transformer/Transformer-pytorch-from-scratch/de_tokenizer.json"

# Reload the trained tokenizers from disk
en_tokenizer = Tokenizer.from_file(en_tokenizer_path)
de_tokenizer = Tokenizer.from_file(de_tokenizer_path)

# Smoke-test both tokenizers on one sentence each
test_sentence_en = "this is my community. we get to open people's mind to accept new idea"
test_sentence_de = "Dies ist ein Test Satz."

print("English Tokenization:")
print(en_tokenizer.encode(test_sentence_en).tokens)

print("German Tokenization:")
print(de_tokenizer.encode(test_sentence_de).tokens)

English Tokenization:
['this', 'is', 'my', 'community', '.', 'we', 'get', 'to', 'open', 'people', "'", 's', 'mind', 'to', 'accept', 'new', 'idea']
German Tokenization:
['Dies', 'ist', 'ein', 'Test', 'Satz', '.']


In [19]:
# Wrap the custom tokenizers in transformers' PreTrainedTokenizerFast so they
# can be called with padding / truncation / return_tensors arguments.
# (This wraps a tokenizer; it does not create a Transformer model.)

from transformers import PreTrainedTokenizerFast

# create English tokenizer
en_transformer_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object = en_tokenizer,
    bos_token = "[BOS]",
    eos_token = "[EOS]",
    pad_token = "[PAD]",
    unk_token = "[UNK]"
)

# create German tokenizer
de_transformer_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object = de_tokenizer,
    bos_token = "[BOS]",
    eos_token = "[EOS]",
    pad_token = "[PAD]",
    unk_token = "[UNK]"   
)

In [21]:
# Encode the English test sentence as PyTorch tensors (truncated to at most 128 tokens)
encoded_en = en_transformer_tokenizer(test_sentence_en, return_tensors="pt", padding=True, truncation=True, max_length=128)

# Encode the German test sentence the same way
encoded_de = de_transformer_tokenizer(test_sentence_de, return_tensors="pt", padding=True, truncation=True, max_length=128)

# Show the resulting input_ids / token_type_ids / attention_mask tensors
print(encoded_en)
print(encoded_de)

{'input_ids': tensor([[ 370,  317,  616, 2309,   17,  350,  673,  322, 1162,  640,   10,   85,
         1409,  322, 1154,  605, 1954]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
{'input_ids': tensor([[1005,  371,  336, 6719, 7111,   17]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}


In [27]:
encoded_en['input_ids'].squeeze(0).size()

torch.Size([17])

## Exploration epoch 2: 
将上述的自定义的tokenizer用于创建数据集

In [5]:
from transformers import PreTrainedTokenizerFast
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
import torch 
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
import numpy as np


# Corpus files: ".en" is the source side, ".de" the target side
source_file = "/harddisk1/SZC-Project/NLP-learning/Transformer/Transformer-pytorch-from-scratch/europarl-v7.de-en.en"
target_file = "/harddisk1/SZC-Project/NLP-learning/Transformer/Transformer-pytorch-from-scratch/europarl-v7.de-en.de"

# Tokenizer JSON files produced by train_tokenizer
en_tokenizer_path = "/harddisk1/SZC-Project/NLP-learning/Transformer/Transformer-pytorch-from-scratch/en_tokenizer.json"
de_tokenizer_path = "/harddisk1/SZC-Project/NLP-learning/Transformer/Transformer-pytorch-from-scratch/de_tokenizer.json"

# Reload the trained tokenizers from disk
en_tokenizer = Tokenizer.from_file(en_tokenizer_path)
de_tokenizer = Tokenizer.from_file(de_tokenizer_path)

# create English tokenizer
en_transformer_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object = en_tokenizer,
    bos_token = "[BOS]",
    eos_token = "[EOS]",
    pad_token = "[PAD]",
    unk_token = "[UNK]"
)

# create German tokenizer
de_transformer_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object = de_tokenizer,
    bos_token = "[BOS]",
    eos_token = "[EOS]",
    pad_token = "[PAD]",
    unk_token = "[UNK]"  
    )

def load_corpus(file_path):
    """Read a UTF-8 text file and collect its stripped, non-empty lines.

    Blank (whitespace-only) lines are skipped, so the index of an entry in
    the result may differ from its line number in the file.

    Args:
        file_path: Path of the text file to read.

    Returns:
        List of stripped strings, in the order they appear in the file.
    """
    sentences = []
    with open(file_path, 'r', encoding='utf-8') as corpus_file:
        for raw_line in corpus_file:
            text = raw_line.strip()
            if not text:
                continue
            sentences.append(text)
    return sentences

def train_tokenizer(corpus, vocab_size, output_path, language_name):
    """Fit a BPE tokenizer to ``corpus`` and store it on disk as JSON.

    Output file: ``{output_path}/{language_name}_tokenizer.json``. The path is
    also printed. The function returns ``None``.

    Args:
        corpus: Iterable of sentences used for training.
        vocab_size: Vocabulary size requested from the BPE trainer.
        output_path: Directory in which the JSON file is saved.
        language_name: File-name prefix identifying the language.
    """
    # Trainer configuration: reserved special tokens plus the vocabulary budget
    trainer_options = {
        "special_tokens": ["[PAD]", "[UNK]", "[BOS]", "[EOS]"],
        "vocab_size": vocab_size,
    }

    # BPE model with "[UNK]" as its unknown token, preceded by a
    # Whitespace pre-tokenizer
    model = Tokenizer(BPE(unk_token="[UNK]"))
    model.pre_tokenizer = Whitespace()

    # Train on the in-memory corpus
    model.train_from_iterator(corpus, trainer=BpeTrainer(**trainer_options))

    # Save and report the destination
    model.save(f"{output_path}/{language_name}_tokenizer.json")
    print(f"分词器已保存到 {output_path}/{language_name}_tokenizer.json")


def collate_fn(batch):
    """Stack per-sample tensors into batch tensors.

    Each element of ``batch`` must be a mapping with ``'input_ids'``,
    ``'attention_mask'`` and ``'labels'`` tensors. ``torch.stack`` is used, so
    for a given key every sample must already have the same shape; no padding
    is done here.

    Returns:
        Dict with the same three keys, each holding the stacked tensor with a
        new leading batch dimension.
    """
    field_names = ('input_ids', 'attention_mask', 'labels')
    return {
        field: torch.stack([sample[field] for sample in batch])
        for field in field_names
    }

class TranslationDataset(Dataset):
    """Parallel-sentence dataset that tokenizes one source/target pair per item.

    Sentence ``i`` of ``src_lines`` is paired with sentence ``i`` of
    ``tgt_lines``; both are padded/truncated to ``max_length`` tokens.

    Args:
        src_lines: Source-language sentences.
        tgt_lines: Target-language sentences, aligned index-by-index with
            ``src_lines``.
        src_transformer_tokenizer: Callable tokenizer for the source side.
        tgt_transformer_tokenizer: Callable tokenizer for the target side.
        max_length: Fixed token length of every encoded sequence.

    Raises:
        ValueError: If the two sentence lists differ in length, because the
            pairs would then be misaligned.
    """

    def __init__(self, src_lines, tgt_lines, src_transformer_tokenizer, tgt_transformer_tokenizer, max_length=128):
        # Fail early: a length mismatch means the corpus is not parallel and
        # would otherwise surface as wrong pairs or an IndexError mid-epoch.
        if len(src_lines) != len(tgt_lines):
            raise ValueError(
                f"src_lines and tgt_lines must be aligned, got "
                f"{len(src_lines)} source and {len(tgt_lines)} target sentences"
            )
        self.src_lines = src_lines
        self.tgt_lines = tgt_lines
        self.src_transformer_tokenizer = src_transformer_tokenizer
        self.tgt_transformer_tokenizer = tgt_transformer_tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.src_lines)

    def _encode(self, tokenizer, text):
        """Tokenize ``text`` to exactly ``max_length`` tokens as PyTorch tensors."""
        return tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

    def __getitem__(self, idx):
        """Return the encoded pair at ``idx``.

        Returns:
            Dict with ``'input_ids'`` and ``'attention_mask'`` from the source
            sentence and ``'labels'`` holding the target token ids. The
            leading dimension added by ``return_tensors="pt"`` is squeezed away.
        """
        src_encoding = self._encode(self.src_transformer_tokenizer, self.src_lines[idx])
        tgt_encoding = self._encode(self.tgt_transformer_tokenizer, self.tgt_lines[idx])

        # NOTE(review): the tokenizer outputs printed earlier in this notebook
        # contain one id per token and no [BOS]/[EOS] ids; confirm whether the
        # decoder needs them added to 'labels'.
        return {
            "input_ids": src_encoding['input_ids'].squeeze(0),
            "attention_mask": src_encoding['attention_mask'].squeeze(0),
            'labels': tgt_encoding['input_ids'].squeeze(0)
        }

# Read both files in lockstep so that sentence i of the source is always paired
# with sentence i of the target. load_corpus() drops blank lines from each file
# independently, so a line that is blank on only one side would shift every
# later pair; here a pair is kept only when BOTH sides are non-empty.
# zip() stops at the shorter file if the two line counts differ.
src_lines, tgt_lines = [], []
with open(source_file, 'r', encoding='utf-8') as src_file, \
        open(target_file, 'r', encoding='utf-8') as tgt_file:
    for src_raw, tgt_raw in zip(src_file, tgt_file):
        src_text, tgt_text = src_raw.strip(), tgt_raw.strip()
        if src_text and tgt_text:
            src_lines.append(src_text)
            tgt_lines.append(tgt_text)

train_dataset = TranslationDataset(src_lines, tgt_lines, en_transformer_tokenizer, de_transformer_tokenizer, max_length=128)

# Draw a random 10% subset (without replacement) for quick experiments
sample_ratio = 0.1
indices = np.random.choice(
    len(train_dataset), 
    int(len(train_dataset)*sample_ratio),
    replace=False
)
sampler = SubsetRandomSampler(indices)

# Loader over the sampled subset only
sampled_loader = DataLoader(
    train_dataset,
    batch_size=64,
    sampler=sampler,
    num_workers=4
)

# Loader over the full dataset, reshuffled every epoch
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)

In [10]:
# Print the tensors and shapes of the first batch only, as a sanity check.
for batch in train_dataloader:
    print("word:")
    print(batch['input_ids'])
    print(batch['input_ids'].size())
    print("")
    print("attention mask:")
    print(batch['attention_mask'])
    print(batch['attention_mask'].size())
    print("labels:")
    print(batch['labels'])
    print(batch['labels'].size())
    # NOTE(review): the sizes printed below are (64, 128), so [1:] slices the
    # first (batch) axis and drops a sample, not the first token of each sequence.
    print(batch['labels'][1:].size())
    break

word:
tensor([[  584,   928,  1162,  ...,     0,     0,     0],
        [  403,  8061,   323,  ...,     0,     0,     0],
        [10529,    15,   310,  ...,     0,     0,     0],
        ...,
        [  480,   360,  1426,  ...,     0,     0,     0],
        [  403,   543,   864,  ...,     0,     0,     0],
        [  437,   475,    15,  ...,     0,     0,     0]])
torch.Size([64, 128])

attention mask:
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
torch.Size([64, 128])
labels:
tensor([[ 1101,  1524,   371,  ...,     0,     0,     0],
        [12620,  4379,  2744,  ...,     0,     0,     0],
        [ 5056,    17,    16,  ...,     0,     0,     0],
        ...,
        [ 3046,  1060,   379,  ...,     0,     0,     0],
        [ 3046,   416,   366,  ...,     0,     0,     0],
        [ 1748,   845,  2959,  ...,     0,

## Exploration epoch 3: 
创建 Transformer类，先尝试使用pytorch的方式创建

In [10]:
import torch.nn as nn
import torch.optim as optim

class TransformerModel(nn.Module):
    """Encoder-decoder translation model built around ``nn.Transformer``.

    Source and target token ids are embedded, a learned positional table is
    added, the sums go through ``nn.Transformer`` and a linear layer projects
    the decoder output onto the target vocabulary.

    All sequence tensors are sequence-first, i.e. ``(seq_len, batch)`` for
    token ids: ``nn.Transformer`` is created without ``batch_first`` and the
    positional table is sliced with ``size(0)``.

    Args:
        src_vocab_size: Number of rows in the source embedding table.
        tgt_vocab_size: Number of rows in the target embedding table and
            width of the output logits.
        d_model: Embedding / model width.
        n_head: Number of attention heads.
        num_encoder_layers: Number of encoder layers.
        num_decoder_layers: Number of decoder layers.
        dim_feedforward: Hidden width of the feed-forward sublayers.
        dropout: Dropout probability inside ``nn.Transformer``.
        max_len: Longest sequence the positional table can cover.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=512, n_head=8, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=2048, dropout=0.1, max_len=128):
        super().__init__()

        # Remembered so callers can clip batches to the positional capacity
        self.max_len = max_len

        # Token embeddings for both languages
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
        # Learned positional table, zero-initialised; the middle axis of size 1
        # broadcasts over the batch
        self.positional_encoding = nn.Parameter(torch.zeros(max_len, 1, d_model), requires_grad=True)

        # Transformer
        self.transformer = nn.Transformer(
            d_model = d_model,
            nhead = n_head,
            num_encoder_layers = num_encoder_layers,
            num_decoder_layers = num_decoder_layers,
            dim_feedforward = dim_feedforward,
            dropout = dropout
        )

        # Output projection to target-vocabulary logits
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)

    def generate_square_subsequent_mask(self, sz):
        """Return the ``sz`` x ``sz`` causal mask that hides future time steps."""
        return self.transformer.generate_square_subsequent_mask(sz)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None,
                src_key_padding_mask=None, tgt_key_padding_mask=None,
                memory_key_padding_mask=None):
        """Compute target-vocabulary logits for every decoder position.

        Args:
            src: Source token ids, ``(src_len, batch)``.
            tgt: Decoder-input token ids, ``(tgt_len, batch)``.
            src_mask: Optional attention mask for the encoder.
            tgt_mask: Optional attention mask for the decoder (usually causal).
            src_key_padding_mask: Optional ``(batch, src_len)`` mask, True
                where the source is padding.
            tgt_key_padding_mask: Optional ``(batch, tgt_len)`` mask, True
                where the decoder input is padding.
            memory_key_padding_mask: Optional ``(batch, src_len)`` mask for
                the decoder's attention over the encoder output.

        Returns:
            Logits of shape ``(tgt_len, batch, tgt_vocab_size)``.

        Raises:
            ValueError: If ``src`` or ``tgt`` is longer than ``max_len``.
        """
        if src.size(0) > self.max_len or tgt.size(0) > self.max_len:
            raise ValueError(
                f"sequence length exceeds max_len={self.max_len}: "
                f"src_len={src.size(0)}, tgt_len={tgt.size(0)}"
            )

        src_emb = self.src_embedding(src) + self.positional_encoding[:src.size(0)]
        tgt_emb = self.tgt_embedding(tgt) + self.positional_encoding[:tgt.size(0)]

        # Transformer forward pass
        output = self.transformer(
            src_emb,
            tgt_emb,
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            src_key_padding_mask=src_key_padding_mask,
            tgt_key_padding_mask=tgt_key_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
        )

        # Linear output layer
        return self.fc_out(output)


def _to_seq_first_batch(model, batch, device):
    """Turn a ``(batch, seq)`` dataloader batch into sequence-first model inputs.

    Returns ``(src, tgt_in, tgt_out, src_padding, tgt_mask)`` where ``tgt_in``
    is the target without its last token, ``tgt_out`` the target shifted left
    by one, ``src_padding`` is True where the source is padding and
    ``tgt_mask`` is the causal mask for ``tgt_in``.
    """
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    # Clip to what the model's positional table can cover. The target keeps
    # one extra token because the decoder input drops the last position.
    limit = getattr(model, 'max_len', None)
    if limit is not None:
        input_ids = input_ids[:, :limit]
        attention_mask = attention_mask[:, :limit]
        labels = labels[:, :limit + 1]

    # The dataloader yields (batch, seq); the model expects (seq, batch)
    src = input_ids.transpose(0, 1)
    tgt = labels.transpose(0, 1)

    # Teacher forcing: predict token t+1 from tokens up to t
    tgt_in, tgt_out = tgt[:-1], tgt[1:]

    # attention_mask is 1 for real tokens; key-padding masks want True for padding
    src_padding = attention_mask == 0

    # Targets are right-padded, so the causal mask already keeps real positions
    # from attending to the padding that follows them
    tgt_mask = model.generate_square_subsequent_mask(tgt_in.size(0)).to(device)
    return src, tgt_in, tgt_out, src_padding, tgt_mask


# Training function
def train(model, dataloader, optimizer, criterion, device, epoch):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: Sequence-first seq2seq model accepting padding-mask keywords.
        dataloader: Iterable of dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors of shape
            ``(batch, seq)``.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss taking ``(N, vocab)`` logits and ``(N,)`` targets.
        device: Device the batch tensors are moved to.
        epoch: Epoch number, used only in the printed summary.

    Returns:
        Average loss over the batches seen (0.0 if the dataloader is empty).
    """
    model.train()
    total_loss = 0.0
    num_batches = 0

    for i, batch in enumerate(dataloader):
        if i % 10 == 0:
            print(f"batch:{i}")

        src, tgt_in, tgt_out, src_padding, tgt_mask = _to_seq_first_batch(model, batch, device)

        # Forward pass. No src_mask: the encoder may look at the whole source
        # sentence; only padding is hidden.
        optimizer.zero_grad()
        output = model(
            src,
            tgt_in,
            tgt_mask=tgt_mask,
            src_key_padding_mask=src_padding,
            memory_key_padding_mask=src_padding,
        )

        # Loss over all (position, sample) pairs
        loss = criterion(output.reshape(-1, output.size(-1)), tgt_out.reshape(-1))
        loss.backward()

        # Parameter update
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    # max(..., 1) avoids a ZeroDivisionError on an empty dataloader
    avg_loss = total_loss / max(num_batches, 1)
    print(f"Epoch {epoch}, Loss: {avg_loss:.4f}")
    return avg_loss


# Evaluate function
def evaluate(model, dataloader, criterion, device):
    """Compute the mean batch loss without updating the model.

    Args:
        model: Sequence-first seq2seq model accepting padding-mask keywords.
        dataloader: Iterable of dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors of shape
            ``(batch, seq)``.
        criterion: Loss taking ``(N, vocab)`` logits and ``(N,)`` targets.
        device: Device the batch tensors are moved to.

    Returns:
        Average loss over the batches seen (0.0 if the dataloader is empty).
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        for batch in dataloader:
            src, tgt_in, tgt_out, src_padding, tgt_mask = _to_seq_first_batch(model, batch, device)

            output = model(
                src,
                tgt_in,
                tgt_mask=tgt_mask,
                src_key_padding_mask=src_padding,
                memory_key_padding_mask=src_padding,
            )
            loss = criterion(output.reshape(-1, output.size(-1)), tgt_out.reshape(-1))
            total_loss += loss.item()
            num_batches += 1

    avg_loss = total_loss / max(num_batches, 1)
    print(f"Evaluation Loss: {avg_loss:.4f}")
    return avg_loss



In [11]:
# Use the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = TransformerModel(
    src_vocab_size=en_transformer_tokenizer.vocab_size,
    tgt_vocab_size=de_transformer_tokenizer.vocab_size,
    d_model=512,
    n_head=8,
    num_encoder_layers=6,
    num_decoder_layers=6,
    dim_feedforward=2048,
    dropout=0.1,
    # The positional table must cover a full padded sequence from the dataset
    # (128 tokens); the previous value of 64 only matched the batch size.
    max_len=train_dataset.max_length
).to(device)

# Optimizer and loss; padded target positions are excluded from the loss
optimizer = optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss(ignore_index=de_transformer_tokenizer.pad_token_id)

# Train on the 10% sampled subset
num_epochs = 2
for epoch in range(num_epochs):
    train(model, sampled_loader, optimizer, criterion, device, epoch)
    #evaluate(model, train_dataloader, criterion, device)



batch:0
batch:10
batch:20
batch:30
batch:40
batch:50
batch:60
batch:70
batch:80
batch:90
batch:100
batch:110
batch:120
batch:130
batch:140
batch:150
batch:160
batch:170
batch:180
batch:190
batch:200
batch:210
batch:220
batch:230
batch:240
batch:250
batch:260
batch:270
batch:280
batch:290
batch:300
batch:310
batch:320
batch:330
batch:340
batch:350
batch:360
batch:370
batch:380
batch:390
batch:400
batch:410
batch:420
batch:430
batch:440
batch:450
batch:460
batch:470
batch:480
batch:490
batch:500
batch:510
batch:520
batch:530
batch:540
batch:550
batch:560
batch:570
batch:580
batch:590
batch:600
batch:610
batch:620
batch:630
batch:640
batch:650
batch:660
batch:670
batch:680
batch:690
batch:700
batch:710
batch:720
batch:730
batch:740
batch:750
batch:760
batch:770
batch:780
batch:790
batch:800
batch:810
batch:820
batch:830
batch:840
batch:850
batch:860
batch:870
batch:880
batch:890
batch:900
batch:910
batch:920
batch:930
batch:940
batch:950
batch:960
batch:970
batch:980
batch:990
batch:1000


In [12]:
len(sampled_loader)

2988