In [None]:
import pandas as pd

# Load the English/French phrase pairs from the CSV file.
df = pd.read_csv("files/en2fr.csv")
# Count how many phrase pairs the dataset contains.
num_examples = len(df)
print(f"there are {num_examples} examples in the training data")
# Show one example pair: the English phrase first, then its French translation.
example_row = df.iloc[30856]
for column in ("en", "fr"):
    print(example_row[column])

there are 47173 examples in the training data
How are you?
Comment êtes-vous?


In [35]:
# 导入预训练分词器
from transformers import XLMTokenizer

# Load the pretrained tokenizer from the "xlm-clm-enfr-1024" checkpoint.
tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024")
# Tokenize an English sentence.
tokenized_en = tokenizer.tokenize("I don't speak French.")
print(tokenized_en)
# Tokenize the corresponding French sentence.
tokenized_fr = tokenizer.tokenize("Je ne parle pas français.")
print(tokenized_fr)
# Tokenize one more English/French pair for comparison.
for sample in ("How are you?", "Comment êtes-vous?"):
    print(tokenizer.tokenize(sample))

['i</w>', 'don</w>', "'t</w>", 'speak</w>', 'fr', 'ench</w>', '.</w>']
['je</w>', 'ne</w>', 'parle</w>', 'pas</w>', 'franc', 'ais</w>', '.</w>']
['how</w>', 'are</w>', 'you</w>', '?</w>']
['comment</w>', 'et', 'es-vous</w>', '?</w>']


In [None]:
from collections import Counter
# Collect every English sentence from the training data.
en = df["en"].tolist()
# Tokenize each sentence and wrap it in "BOS"/"EOS" marker tokens.
en_tokens = [["BOS", *tokenizer.tokenize(x), "EOS"] for x in en]
# Indices 0 and 1 are reserved for the "PAD" and "UNK" entries.
PAD = 0
UNK = 1
# Count how often each token occurs across all sentences.
word_count = Counter()
for sentence in en_tokens:
    word_count.update(sentence)
# Keep at most the 50000 most frequent tokens.
frequency = word_count.most_common(50000)
total_en_words = len(frequency) + 2
# Map each token to an index; regular tokens start at 2, after PAD and UNK.
en_word_dict = {
    token: rank for rank, (token, _) in enumerate(frequency, start=2)
}
en_word_dict["PAD"] = PAD
en_word_dict["UNK"] = UNK
# Inverse mapping from index back to token.
en_idx_dict = {index: token for token, index in en_word_dict.items()}

In [8]:
# Convert the tokenized English sentence to indices; tokens missing from
# the vocabulary fall back to the UNK index.
enidx = [en_word_dict.get(token, UNK) for token in tokenized_en]
print(enidx)

[15, 100, 38, 377, 476, 574, 5]


In [None]:
# Map the indices back to tokens.
entokens = [en_idx_dict.get(index, "UNK") for index in enidx]
print(entokens)
# Glue the tokens into one string and turn each "</w>" marker into a space.
en_phrase = "".join(entokens).replace("</w>", " ")
# Drop the space that ends up in front of each punctuation mark.
for x in '''?:;.,'("-!&)%''':
    en_phrase = en_phrase.replace(" " + x, x)
print(en_phrase)

['i</w>', 'don</w>', "'t</w>", 'speak</w>', 'fr', 'ench</w>', '.</w>']
i don't speak french. 


In [None]:
# Round trip for an English example: tokens -> indices -> tokens -> text.
tokens = ['how</w>', 'are</w>', 'you</w>', '?</w>']
indexes = [en_word_dict.get(token, UNK) for token in tokens]
print(indexes)
tokens = [en_idx_dict.get(index, "UNK") for index in indexes]
print(tokens)
# Join the tokens and replace each "</w>" marker with a space.
phrase = "".join(tokens).replace("</w>", " ")
# Remove the space left in front of punctuation marks.
for x in '''?:;.,'("-!&)%''':
    phrase = phrase.replace(" " + x, x)
print(phrase)

[157, 17, 22, 26]
['how</w>', 'are</w>', 'you</w>', '?</w>']
how are you? 


In [None]:
# Tokenize every French sentence and wrap it in "BOS"/"EOS" marker tokens.
fr = df["fr"].tolist()
fr_tokens = [["BOS", *tokenizer.tokenize(x), "EOS"] for x in fr]
# Count how often each French token occurs.
word_count = Counter()
for sentence in fr_tokens:
    word_count.update(sentence)
# Keep at most the 50000 most frequent tokens.
frequency = word_count.most_common(50000)
total_fr_words = len(frequency) + 2
# Map each French token to an index; regular tokens start at 2,
# after the PAD and UNK entries.
fr_word_dict = {
    token: rank for rank, (token, _) in enumerate(frequency, start=2)
}
fr_word_dict["PAD"] = PAD
fr_word_dict["UNK"] = UNK
# Inverse mapping from index back to French token.
fr_idx_dict = {index: token for token, index in fr_word_dict.items()}

In [12]:
# Convert the tokenized French sentence to indices; tokens missing from
# the vocabulary fall back to the UNK index.
fridx = [fr_word_dict.get(token, UNK) for token in tokenized_fr]
print(fridx)

[28, 40, 231, 32, 726, 370, 4]


In [13]:
# Map the French indices back to tokens.
frtokens = [fr_idx_dict.get(index, "UNK") for index in fridx]
print(frtokens)
# Glue the tokens into one string and turn each "</w>" marker into a space.
fr_phrase = "".join(frtokens).replace("</w>", " ")
# Drop the space that ends up in front of each punctuation mark.
for x in '''?:;.,'("-!&)%''':
    fr_phrase = fr_phrase.replace(" " + x, x)
print(fr_phrase)

['je</w>', 'ne</w>', 'parle</w>', 'pas</w>', 'franc', 'ais</w>', '.</w>']
je ne parle pas francais. 


In [None]:
# Round trip for a French example: tokens -> indices -> tokens -> text.
tokens = ['comment</w>', 'et', 'es-vous</w>', '?</w>']
indexes = [fr_word_dict.get(token, UNK) for token in tokens]
print(indexes)
tokens = [fr_idx_dict.get(index, "UNK") for index in indexes]
print(tokens)
# Join the tokens and replace each "</w>" marker with a space.
phrase = "".join(tokens).replace("</w>", " ")
# Remove the space left in front of punctuation marks.
for x in '''?:;.,'("-!&)%''':
    phrase = phrase.replace(" " + x, x)
print(phrase)

[452, 61, 742, 30]
['comment</w>', 'et', 'es-vous</w>', '?</w>']
comment etes-vous? 


In [15]:
import pickle

# Save the four vocabulary lookup tables together as one pickled tuple.
vocab_tables = (en_word_dict, en_idx_dict, fr_word_dict, fr_idx_dict)
with open("files/dict.p", "wb") as fb:
    pickle.dump(vocab_tables, fb)

In [16]:
# Encode every tokenized sentence as a list of indices; tokens missing from
# a vocabulary map to 1, the value assigned to UNK.
out_en_ids = [
    [en_word_dict.get(token, 1) for token in sentence_tokens]
    for sentence_tokens in en_tokens
]
out_fr_ids = [
    [fr_word_dict.get(token, 1) for token in sentence_tokens]
    for sentence_tokens in fr_tokens
]
# Order the sentence positions by English sequence length (stable sort).
en_lengths = [len(ids) for ids in out_en_ids]
sorted_ids = sorted(range(len(en_lengths)), key=en_lengths.__getitem__)
# Apply the same ordering to both languages so the pairs stay aligned.
out_en_ids = [out_en_ids[position] for position in sorted_ids]
out_fr_ids = [out_fr_ids[position] for position in sorted_ids]

In [17]:
import numpy as np

batch_size = 128
# Starting offset of every batch, shuffled into a random order.
idx_list = np.arange(0, len(en_tokens), batch_size)
np.random.shuffle(idx_list)

# Each batch covers the consecutive positions [start, start + batch_size),
# clipped at the end of the dataset.
batch_indexs = [
    np.arange(start, min(len(en_tokens), start + batch_size))
    for start in idx_list
]

In [None]:
def seq_padding(X, padding=0):
    """Pad every sequence in a batch to the length of the longest one.

    Args:
        X: Non-empty collection of sequences (e.g. lists of token indices).
        padding: Value appended to sequences shorter than the longest one.

    Returns:
        A NumPy array built from the padded sequences, one row per sequence.
    """
    # Length of the longest sequence in the batch.
    longest = max(len(seq) for seq in X)
    rows = []
    for seq in X:
        shortfall = longest - len(seq)
        if shortfall > 0:
            # Append the padding value until the sequence reaches `longest`.
            rows.append(np.concatenate([seq, [padding] * shortfall]))
        else:
            # Already at full length: keep the sequence unchanged.
            rows.append(seq)
    return np.array(rows)

In [19]:
from util import Batch

batches = []
for b in batch_indexs:
    # Gather this batch's English and French index sequences and pad each
    # side to the length of its own longest sequence.
    batch_en = seq_padding([out_en_ids[x] for x in b])
    batch_fr = seq_padding([out_fr_ids[x] for x in b])
    batches.append(Batch(batch_en, batch_fr))

In [20]:
# Vocabulary sizes: the number of entries in each token-to-index dictionary.
src_vocab, tgt_vocab = len(en_word_dict), len(fr_word_dict)
print(f"there are {src_vocab} distinct English tokens")
print(f"there are {tgt_vocab} distinct French tokens")

there are 11055 distinct English tokens
there are 11239 distinct French tokens


In [None]:
from util import PositionalEncoding
import torch
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
# Instantiate PositionalEncoding with the model dimension set to 256
# (the second argument, 0.1, is presumably the dropout rate; confirm in util).
pe = PositionalEncoding(256, 0.1)
# Stand-in word embedding of shape (1, 8, 256), filled with zeros.
x = torch.zeros(1, 8, 256, device=DEVICE)
# Compute the input embedding by adding the positional encoding to the
# word embedding.
y = pe.forward(x)
print(f"the shape of positional encoding is {y.shape}")
# The word embedding is all zeros, so the printed values come from the
# positional-encoding module alone.
print(y)

the shape of positional encoding is torch.Size([1, 8, 256])
tensor([[[ 0.0000e+00,  1.1111e+00,  0.0000e+00,  ...,  1.1111e+00,
           0.0000e+00,  1.1111e+00],
         [ 9.3497e-01,  6.0034e-01,  8.9107e-01,  ...,  1.1111e+00,
           1.1940e-04,  1.1111e+00],
         [ 1.0103e+00, -4.6239e-01,  1.0646e+00,  ...,  1.1111e+00,
           2.3880e-04,  1.1111e+00],
         ...,
         [-1.0655e+00,  3.1518e-01, -1.1091e+00,  ...,  1.1111e+00,
           5.9700e-04,  0.0000e+00],
         [-3.1046e-01,  1.0669e+00, -7.1559e-01,  ...,  1.1111e+00,
           7.1640e-04,  1.1111e+00],
         [ 0.0000e+00,  8.3767e-01,  2.5419e-01,  ...,  1.1111e+00,
           8.3581e-04,  1.1111e+00]]], device='cuda:0')


In [22]:
from util import create_model

# Build the model for the English (source) and French (target) vocabularies.
model_config = dict(N=6, d_model=256, d_ff=1024, h=8, dropout=0.1)
model = create_model(src_vocab, tgt_vocab, **model_config)

In [23]:
from util import NoamOpt

# Adam is created with lr=0 and handed to NoamOpt, which presumably drives
# the learning rate itself (confirm in util).
base_optimizer = torch.optim.Adam(
    model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)
optimizer = NoamOpt(256, 1, 2000, base_optimizer)

In [24]:
from util import LabelSmoothing, SimpleLossCompute

# Label-smoothing criterion sized to the French vocabulary; index 0 is the
# padding entry.
criterion = LabelSmoothing(tgt_vocab, padding_idx=0, smoothing=0.1)
# Bundle the model's generator, the criterion and the optimizer into a
# single loss helper.
loss_func = SimpleLossCompute(model.generator, criterion, optimizer)

In [None]:
for epoch in range(100):
    model.train()
    # Running totals for this epoch: summed loss and token count.
    tloss, tokens = 0, 0
    for batch in batches:
        # Run the Transformer on the batch to get predictions.
        out = model(batch.src, batch.trg, batch.src_mask, batch.trg_mask)
        # Compute the loss; loss_func holds the optimizer, so the parameter
        # update presumably happens inside it (confirm in util).
        loss = loss_func(out, batch.trg_y, batch.ntokens)
        tloss += loss
        # Add the number of tokens in this batch.
        tokens += batch.ntokens
    print(f"Epoch {epoch}, average loss: {tloss/tokens}")
# Save the trained model weights.
torch.save(model.state_dict(), "files/en2fr.pth")

Epoch 0, average loss: 5.837538242340088
Epoch 1, average loss: 3.6549735069274902
Epoch 2, average loss: 2.867757797241211
Epoch 3, average loss: 2.248039722442627
Epoch 4, average loss: 1.8180747032165527
Epoch 5, average loss: 1.5864763259887695
Epoch 6, average loss: 1.4054176807403564
Epoch 7, average loss: 1.2680836915969849
Epoch 8, average loss: 1.162902593612671
Epoch 9, average loss: 1.0832992792129517
Epoch 10, average loss: 1.016732931137085
Epoch 11, average loss: 0.9556108713150024
Epoch 12, average loss: 0.9017373919487
Epoch 13, average loss: 0.8607497215270996
Epoch 14, average loss: 0.8227623701095581
Epoch 15, average loss: 0.7802281975746155
Epoch 16, average loss: 0.7497196793556213
Epoch 17, average loss: 0.7211118340492249
Epoch 18, average loss: 0.6901364922523499
Epoch 19, average loss: 0.668834924697876
Epoch 20, average loss: 0.6482908725738525
Epoch 21, average loss: 0.626395583152771
Epoch 22, average loss: 0.6073558926582336
Epoch 23, average loss: 0.58771

In [None]:
def translate(eng, max_len=100):
    """Translate an English sentence into French with greedy decoding.

    Relies on the notebook-level `tokenizer`, `en_word_dict`, `fr_word_dict`,
    `fr_idx_dict`, `UNK`, `DEVICE`, `model` and `subsequent_mask`. Put the
    model in evaluation mode (`model.eval()`) before calling, so that
    dropout does not affect the output.

    Args:
        eng: English sentence as a string.
        max_len: Maximum number of decoding steps. Defaults to 100, the
            limit that was previously hard-coded.

    Returns:
        The decoded French text, which is also printed. Word-end markers
        are turned into spaces, so a trailing space can remain.
    """
    # Tokenize and wrap the sentence in the same "BOS"/"EOS" markers that
    # were used when the vocabulary was built.
    tokenized_en = ["BOS"] + tokenizer.tokenize(eng) + ["EOS"]
    enidx = [en_word_dict.get(i, UNK) for i in tokenized_en]
    src = torch.tensor(enidx).long().to(DEVICE).unsqueeze(0)
    # Mask positions holding index 0, the padding index.
    src_mask = (src != 0).unsqueeze(-2)
    start_symbol = fr_word_dict["BOS"]
    translation = []
    # Inference only: disable autograd so no computation graph is built
    # for the encoder pass or for any decoding step.
    with torch.no_grad():
        # Encode the English phrase into its vector representation.
        memory = model.encode(src, src_mask)
        # The decoder input starts with the "BOS" index, created directly
        # with the dtype and device of the source tensor.
        ys = torch.full((1, 1), start_symbol,
                        dtype=src.dtype, device=src.device)
        for _ in range(max_len):
            # Predict the next token from everything generated so far.
            out = model.decode(memory, src_mask, ys,
                               subsequent_mask(ys.size(1)).type_as(src.data))
            prob = model.generator(out[:, -1])
            _, next_word = torch.max(prob, dim=1)
            next_id = next_word[0].item()
            sym = fr_idx_dict[next_id]
            # Stop translating once the next token is "EOS".
            if sym == 'EOS':
                break
            translation.append(sym)
            next_column = torch.full((1, 1), next_id,
                                     dtype=src.dtype, device=src.device)
            ys = torch.cat([ys, next_column], dim=1)
    # Join the predicted tokens into a French sentence.
    trans = "".join(translation)
    trans = trans.replace("</w>", " ")
    # Remove the space left in front of punctuation marks.
    for x in '''?:;.,'("-!&)%''':
        trans = trans.replace(f" {x}", f"{x}")
    print(trans)
    return trans

In [29]:
from util import subsequent_mask

# Reload the vocabulary lookup tables from the pickle file.
# NOTE(review): pickle.load can execute arbitrary code; only load files
# from a trusted source.
with open("files/dict.p", "rb") as fb:
    (en_word_dict, en_idx_dict,
     fr_word_dict, fr_idx_dict) = pickle.load(fb)
# Load the saved weights onto the active device.
# NOTE(review): weights_only=False allows full unpickling of the checkpoint;
# consider weights_only=True if the file only holds a state dict.
trained_weights = torch.load(
    "files/en2fr.pth", map_location=DEVICE, weights_only=False)
model.load_state_dict(trained_weights)
# Switch the model to evaluation mode before translating.
model.eval()
eng = "Today is a beautiful day!"
translated_fr = translate(eng)

aujourd'hui est une belle journee! 


In [30]:
# Translate a longer, descriptive English sentence; translate() prints the result.
eng = "A little boy in jeans climbs a small tree while another child looks on."
translated_fr = translate(eng)

un petit garcon en jeans grimpe un petit arbre tandis qu'un autre enfant regarde. 


In [31]:
# Translate a sentence that uses the contraction "don't".
eng = "I don't speak French."
translated_fr = translate(eng)

je ne parle pas francais. 


In [32]:
# Translate the same sentence written without the contraction ("do not").
eng = "I do not speak French."
translated_fr = translate(eng)

je ne parle pas francais. 


In [None]:
# Translate two more sentences; each call to translate() prints its result.
for eng in ("I love skiing in the winter!", "How are you?"):
    translated_fr = translate(eng)

j'aime faire du ski dans l'hiver! 
comment etes-vous? 
