In [1]:
import torch
import torch.nn as nn

torch.random.manual_seed(0)

<torch._C.Generator at 0x24fa34473f0>

## EncoderModule

In [2]:
class EncoderModule(nn.Module):
    # 将vocab_size这么多维度降维到embedding_dim
    def __init__(self, vocab_size, embedding_dim, hidden_size=None, num_layers=1, bidirectional=False):
        # 调用父类nn.Module
        super(EncoderModule, self).__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        hidden_size = hidden_size or embedding_dim
        self.features = nn.Sequential(
            nn.Linear(in_features=embedding_dim, out_features=hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU()
        )
        self.output_dim = hidden_size

    def forward(self, x):
        """
        编码器前向过程
        :param x: [N,T] token id tensor对象
        :return: [N,L] 向量矩阵，针对每个文本用一个L维的向量进行表示
        """
        x = self.embedding_layer(x)  # [N,T] -> [N,T,E]
        hn = self.features(x)  # hn [N,T,E] -> [N,T,hidden_size]
        hz = torch.mean(hn, dim=1)  # [N,T,hidden_size] -> [N,hidden_size]
        return hz


In [3]:
net = EncoderModule(vocab_size=100, embedding_dim=3, num_layers=2, bidirectional=True)
x = torch.randint(50, size=(2, 4))
c = net(x)
print(c.shape)

torch.Size([2, 3])


### onnx

In [12]:
import torch.onnx
# import netron

# 设置模型为评估模式
net.eval()

# 创建示例输入
dummy_input = torch.randint(50, size=(2, 4))

# 导出模型为 ONNX 格式
output_file = "encoder_model.onnx"
torch.onnx.export(net, dummy_input, output_file,
                  export_params=True,        # 存储训练过的参数
                  opset_version=10,         # ONNX 版本
                  do_constant_folding=True, # 是否执行常量折叠优化
                  input_names=['input'],    # 输入名称
                  output_names=['output'],  # 输出名称
                  dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}} # 批次大小动态
                  )

# print(f"ONNX model exported to {output_file}")

# # 使用 netron 查看 ONNX 模型
# netron.start(output_file)

OnnxExporterError: Module onnx is not installed!

## DecoderModule

In [16]:
class DecoderModule(nn.Module):
    def __init__(self,
                 vocab_size, embedding_dim, encoder_state_dim,
                 hidden_size=None, num_layers=1, eos_token_id=0, max_seq_length=20
                 ):
        super(DecoderModule, self).__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.hidden_size = hidden_size or embedding_dim
        assert num_layers == 1, "当前解码器仅支持单层结构!"
        self.num_layers = num_layers
        self.rnn_state_proj = nn.Sequential(
            nn.Linear(in_features=encoder_state_dim, out_features=self.hidden_size * self.num_layers),
            nn.Tanh()
        )
        self.rnn = nn.RNN(
            input_size=embedding_dim, hidden_size=self.hidden_size,
            num_layers=self.num_layers, batch_first=True, bidirectional=False
        )
        # 当前模拟代码中，类别数目和词汇表数目一致
        self.proj = nn.Linear(
            in_features=self.hidden_size,
            out_features=vocab_size
        )
        # 解码器属性
        self.max_seq_length = max_seq_length
        self.eos_token_id = eos_token_id
        self.rnn_cell = nn.RNNCell(input_size=embedding_dim, hidden_size=self.hidden_size)
        self.rnn_cell.weight_ih = self.rnn.weight_ih_l0
        self.rnn_cell.weight_hh = self.rnn.weight_hh_l0
        self.rnn_cell.bias_ih = self.rnn.bias_ih_l0
        self.rnn_cell.bias_hh = self.rnn.bias_hh_l0

    def forward(self, x, encoder_state):
        """
        解码器的前向过程
        :param x: [N,T] 训练的时候，是token id列表，T为实际长度；预测的时候T为1
        :param encoder_state: [N,encoder_state_dim] 解码器的初始状态信息 ---> 一般来源于编码器的输出
        :return: [N,T,vocab_size] N个文本，对应T个时刻，每个时刻预测的类别置信度值
            NOTE: 训练的时候返回值中的T和x中的T一致，推理预测的时候不一致
        """
        # 将编码器传递过来的状态信息进行转换，作为解码器的初始状态信息
        init_state = self.rnn_state_proj(encoder_state)  # [N,encoder_state_dim] -> [N,hidden_size*num_layers]
        init_state = torch.reshape(init_state, shape=(-1, self.hidden_size, self.num_layers))
        init_state = torch.permute(init_state, dims=[2, 0, 1])   # [num_layers,N,hidden_size]符合rnn hn的输入要求

        # embedding操作
        x = self.embedding_layer(x)  # [N,T] -> [N,T,E]

        if self.training:
            # print ("model is in train mode")
            output, _ = self.rnn(x, init_state)  # output -> [n,T,hidden_size]
            scores = self.proj(output)  # [n,T,vocab_size]
            return scores
        else:
            # print ("model is in eval mode")
            # 需要进行遍历操作，每个时刻每个时刻进行预测，直到预测结果为eos_token_id或者预测的序列长度超过阈值的时候，结束预测
            outputs = []
            hx = init_state[0]  # 第一层的rnn的状态信息
            xi = x[:, 0, :]     # 第一个时刻的输入
            n, _ = xi.shape     # n为batch_size
            eos_token_ids, is_eos = None, None
            while len(outputs) < self.max_seq_length:
                # 当前rnn的输入: x和状态信息 --> 获取当前rnn的输出
                hx = self.rnn_cell(xi, hx)  # [N,hidden_size]
                oi = hx  # RNN的状态信息就是输出信息

                # 进一步的特征提取转换，获取当前时刻的预测token id
                scores_i = self.proj(oi)  # 得到当前时刻的预测置信度 [N,hidden_size] ->  [N,vocab_size]
                token_ids_i = torch.argmax(scores_i, dim=1, keepdim=True)  # 当前预测id [N, 1]
                outputs.append(token_ids_i)

                # 判断当前时刻的预测值是不是都是结束符号，如果是，直接退出循环
                if eos_token_ids is None:
                    eos_token_ids = token_ids_i   # [N,1]
                    is_eos = (eos_token_ids == self.eos_token_id).to(token_ids_i.dtype)   # 是eos的就是1，不是的就是0

                eos_token_ids = eos_token_ids * is_eos + token_ids_i * (1 - is_eos)  # 合并数据
                is_eos = (eos_token_ids == self.eos_token_id).to(token_ids_i.dtype)  # 是eos的就是1，不是的就是0
                eos_number = torch.sum(is_eos).item()
                if eos_number >= n:
                    break

                # 更新下一个时刻的输入 --> 将当前时刻的预测token id作为下一个时刻的输入
                xi = self.embedding_layer(token_ids_i)[:, 0, :]
            outputs = torch.concat(outputs, dim=1)  # [N,T2]
            return outputs


In [31]:
net2 = DecoderModule(vocab_size=100, embedding_dim=3, encoder_state_dim=net.output_dim)
y = torch.randint(50, size=(2, 6))
# net2.eval()
r2 = net2(y, c)
print(r2.shape)

model is in train mode
torch.Size([2, 6, 100])


In [32]:
print (net2)

DecoderModule(
  (embedding_layer): Embedding(100, 3)
  (rnn_state_proj): Sequential(
    (0): Linear(in_features=3, out_features=3, bias=True)
    (1): Tanh()
  )
  (rnn): RNN(3, 3, batch_first=True)
  (proj): Linear(in_features=3, out_features=100, bias=True)
  (rnn_cell): RNNCell(3, 3)
)


### netron

In [18]:
import torch
import torch.onnx
import netron

# 设置模型为评估模式
net2.eval()

# 创建示例输入
# x: [batch_size, sequence_length]
dummy_input_x = torch.randint(0, 50, size=(2, 4))  # 假设词汇表大小为50
# encoder_state: [batch_size, encoder_state_dim]
dummy_input_encoder_state = torch.randn(2, net2.hidden_size)  # 假设encoder_state_dim等于hidden_size

# 导出模型为 ONNX 格式
output_file = "decodermodel.onnx"
torch.onnx.export(net2, 
                  (dummy_input_x, dummy_input_encoder_state),  # 传入两个dummy inputs
                  output_file,
                  export_params=True,        # 存储训练过的参数
                  opset_version=10,         # ONNX 版本
                  do_constant_folding=True, # 是否执行常量折叠优化
                  input_names=['x', 'encoder_state'],    # 输入名称
                  output_names=['output'],  # 输出名称
                  dynamic_axes={'x': {0: 'batch_size'}, 'encoder_state': {0: 'batch_size'}, 'output': {0: 'batch_size'}}) # 批次大小动态

# 使用 netron 查看 ONNX 模型
netron.start(output_file)

model is in eval mode
Serving 'decodermodel.onnx' at http://localhost:19610


  eos_number = torch.sum(is_eos).item()
  if eos_number >= n:


('localhost', 19610)

In [None]:
import torch.onnx
import netron

# 设置模型为评估模式
net2.eval()

# 创建示例输入
dummy_input = torch.randint(50, size=(2, 4))

# 导出模型为 ONNX 格式
output_file = "decodermodel.onnx"
torch.onnx.export(net2, dummy_input, output_file,
                  export_params=True,        # 存储训练过的参数
                  opset_version=10,         # ONNX 版本
                  do_constant_folding=True, # 是否执行常量折叠优化
                  input_names=['input'],    # 输入名称
                  output_names=['output'],  # 输出名称
                  dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}} # 批次大小动态
                  )

# print(f"ONNX model exported to {output_file}")

# 使用 netron 查看 ONNX 模型
netron.start(output_file)

## Seq2SeqModel

In [14]:
class Seq2SeqModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim,
                 encoder_num_layers=1, encoder_bidirectional=True, encoder_hidden_size=None,
                 decoder_num_layers=1, decoder_vocab_size=None, decoder_embedding_dim=None, decoder_hidden_size=None,
                 eos_token_id=0,

                 ):
        super(Seq2SeqModel, self).__init__()

        self.encoder = EncoderModule(
            vocab_size, embedding_dim, hidden_size=encoder_hidden_size,
            num_layers=encoder_num_layers, bidirectional=encoder_bidirectional
        )
        self.decoder = DecoderModule(
            decoder_vocab_size or vocab_size, decoder_embedding_dim or embedding_dim,
            hidden_size=decoder_hidden_size,
            encoder_state_dim=self.encoder.output_dim, num_layers=decoder_num_layers,
            eos_token_id=eos_token_id,
            # eos_token_id=6,  # 临时更改，为了预测退出逻辑
            max_seq_length=200
        )
        self.eos_token_id = eos_token_id
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, encoder_input_ids, label_ids=None):
        """
        前向过程： 前向预测 + loss
        NOTE: loss仅在训练的时候计算
        :param encoder_input_ids: [N,T1] token id tensor列表
        :param label_ids: [N,T2] 训练时候给定的标签id列表，推理预测的时候为None
        :return: [N,T2,vocab_size], loss
        """
        # 1. 基于编码器提取特征
        c = self.encoder(encoder_input_ids)
        # 2. 解码器操作
        if self.training:
            # 获取解码器的信息：解码器的输入label_ids的偏移 + 编码器的状态信息
            eos_ids = torch.zeros(size=(label_ids.shape[0], 1), dtype=label_ids.dtype)
            torch.fill_(eos_ids, self.eos_token_id)
            shift_decoder_input_ids = torch.concat([eos_ids, label_ids], dim=1)  # [N,T2+1]
            shift_decoder_output_ids = torch.concat([label_ids, eos_ids], dim=1)  # [N,T2+1]
            scores = self.decoder(shift_decoder_input_ids, c)  # [N,T2+1,vocab_size]
            # 损失的计算
            loss = self.loss_fn(torch.permute(scores, dims=[0, 2, 1]), shift_decoder_output_ids)
            return scores, loss
        else:
            # 构建解码器第一个时刻的输入
            eos_ids = torch.zeros(size=(encoder_input_ids.shape[0], 1), dtype=torch.long)
            torch.fill_(eos_ids, self.eos_token_id)
            token_ids = self.decoder(eos_ids, c)  # [N,T2+1,vocab_size]
            return token_ids


## Seq2SeqModel_1

In [17]:
# 假定场景: 词典大小为26个字母 + 一个特殊值<EOS> + 一个特殊值<PAD>
# 有一条样本，编码器的输入: a b c，解码器的最终输出: w x y z
# 对数据做转换:
# ** 编码器输入: a b c
# ** 解码器输入: <EOS> w x y z
# ** 解码器输出: w x y z <EOS>
# 解码器理解成序列生成，生成序列的时候是不是要一个字符/token一个字符/token来生成，在生成当前token/字符的时候，和是之前的token有强烈的关联关系的
# 词典映射关系: {<EOS>:0, a:1, b:2, c:3, ......, w:23, x:24, y:25, z:26, <PAD>:27}
x_id = torch.tensor([
    [1, 2, 3],
    [1, 2, 5]
])
label_ids = torch.tensor([
    [23, 24, 25, 26],
    [23, 24, 25, 26]
])
net = Seq2SeqModel(
    vocab_size=28,
    embedding_dim=4,
    encoder_num_layers=1,
    encoder_hidden_size=16,
    decoder_hidden_size=16,
    eos_token_id=0
)
_scores, _loss = net(x_id, label_ids)
print(_scores.shape)
print(_loss)

net.eval()
_predict_token_ids = net(x_id)
print(_predict_token_ids)

torch.Size([2, 5, 28])
tensor(3.4099, grad_fn=<NllLoss2DBackward0>)
tensor([[11,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1],
        [11,  1,  1,  1

## Seq2SeqModel_2

In [18]:
# 假定翻译场景（中译英）：编码器和解码器的词典大小不一样的
# 有一条样本，编码器的输入: 小明 吃 苹果，解码器的最终输出：xiao  ming  eats  apples
# 对数据做转换:
# ** 编码器输入: 小明 吃 苹果
# ** 解码器输入: <begin> xiao  ming  eats  apples
# ** 解码器输出: xiao  ming  eats  apples <end>
# 解码器理解成序列生成，生成序列的时候是不是要一个字符/token一个字符/token来生成，在生成当前token/字符的时候，和是之前的token有强烈的关联关系的
net = Seq2SeqModel(
    vocab_size=10000,  # 总共有10000个中文词语
    embedding_dim=128,
    encoder_hidden_size=64, encoder_num_layers=2, encoder_bidirectional=True,
    decoder_num_layers=1,
    decoder_vocab_size=3000,  # 总共有3000个英文词语
    decoder_embedding_dim=64,
    eos_token_id=0
)
print(net)
x_id = torch.tensor([
    [1, 2, 3],
    [1, 2, 3]
])
label_ids = torch.tensor([
    [23, 24, 25, 26],
    [23, 24, 25, 26]
])
_scores, _loss = net(x_id, label_ids)
print(_scores.shape)
print(_loss)

Seq2SeqModel(
  (encoder): EncoderModule(
    (embedding_layer): Embedding(10000, 128)
    (features): Sequential(
      (0): Linear(in_features=128, out_features=64, bias=True)
      (1): ReLU()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): ReLU()
    )
  )
  (decoder): DecoderModule(
    (embedding_layer): Embedding(3000, 64)
    (rnn_state_proj): Sequential(
      (0): Linear(in_features=64, out_features=64, bias=True)
      (1): Tanh()
    )
    (rnn): RNN(64, 64, batch_first=True)
    (proj): Linear(in_features=64, out_features=3000, bias=True)
    (rnn_cell): RNNCell(64, 64)
  )
  (loss_fn): CrossEntropyLoss()
)
torch.Size([2, 5, 3000])
tensor(8.0339, grad_fn=<NllLoss2DBackward0>)


## lowercase2uppercase

### generate_data & dataloader

In [19]:
import string
import random
from torch.utils.data import Dataset, DataLoader

# 生成一些示例数据
def generate_data(num_samples=100):
    data = []
    for _ in range(num_samples):
        word_length = random.randint(2, 10)
        lowercase_word = ''.join(random.choices(string.ascii_lowercase, k=word_length))
        uppercase_word = lowercase_word.upper()
        data.append((lowercase_word, uppercase_word))
    return data

# 生成数据
data = generate_data()

# 构建词汇表
vocab = ['<PAD>', '<EOS>'] + list(string.ascii_lowercase) + list(string.ascii_uppercase)
vocab_size = len(vocab)
char_to_id = {char: idx for idx, char in enumerate(vocab)}
id_to_char = {idx: char for idx, char in enumerate(vocab)}

# 将单词转换为ID序列
def word_to_ids(word, char_to_id, max_length=10):
    ids = [char_to_id[char] for char in word]
    if len(ids) < max_length:
        ids += [char_to_id['<PAD>']] * (max_length - len(ids))
    return ids

# 数据集类
class CharTransformDataset(Dataset):
    def __init__(self, data, char_to_id, max_length=10):
        self.data = data
        self.char_to_id = char_to_id
        self.max_length = max_length
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        lowercase_word, uppercase_word = self.data[idx]
        x = word_to_ids(lowercase_word, self.char_to_id, self.max_length)
        y = word_to_ids(uppercase_word, self.char_to_id, self.max_length)
        return torch.tensor(x), torch.tensor(y)

# 创建数据集实例
dataset = CharTransformDataset(data, char_to_id, max_length=10)

# 创建DataLoader
batch_size = 32
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [5]:
dataset_iter = iter(dataloader)
sample1 = next(dataset_iter)
sample1

[tensor([[ 9, 16, 25,  0,  0,  0,  0,  0,  0,  0],
         [24, 16,  5,  9, 16, 11, 15,  0,  0,  0],
         [14, 15, 17, 11,  6,  0,  0,  0,  0,  0],
         [ 3, 16, 23, 26, 27, 18,  0,  0,  0,  0],
         [21, 18,  6,  4, 20, 11,  4, 16,  0,  0],
         [16,  6, 16, 22,  2, 21, 25,  8,  0,  0],
         [ 8,  8, 21, 19, 23, 20, 23,  5,  5, 19],
         [ 6, 19,  3,  0,  0,  0,  0,  0,  0,  0],
         [ 3,  4,  9, 10, 12, 18, 16,  2,  0,  0],
         [ 5, 20,  4,  0,  0,  0,  0,  0,  0,  0],
         [10, 26, 19,  9,  0,  0,  0,  0,  0,  0],
         [22, 21,  9, 11, 25,  0,  0,  0,  0,  0],
         [10,  4,  6, 13,  0,  0,  0,  0,  0,  0],
         [21, 23, 15, 17, 17, 24, 21, 15, 24,  0],
         [ 4,  9, 15,  0,  0,  0,  0,  0,  0,  0],
         [18, 12, 18, 27,  0,  0,  0,  0,  0,  0],
         [ 5, 24, 10, 13, 20, 24, 10,  0,  0,  0],
         [ 2, 11,  2, 21, 13,  5,  0,  0,  0,  0],
         [ 6, 16,  9,  7, 13,  0,  0,  0,  0,  0],
         [ 9, 13, 11,  6, 13,  

In [6]:
sample1[0].shape, sample1[1].shape

(torch.Size([32, 10]), torch.Size([32, 10]))

In [7]:
i= 1
sample1[0][i], sample1[1][i]

(tensor([24, 16,  5,  9, 16, 11, 15,  0,  0,  0]),
 tensor([50, 42, 31, 35, 42, 37, 41,  0,  0,  0]))

In [111]:
from torch import optim

# 模型参数
embedding_dim = 32
encoder_hidden_size = 64
decoder_hidden_size = 64
eos_token_id = char_to_id['<EOS>']

# 初始化模型
model = Seq2SeqModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    encoder_hidden_size=encoder_hidden_size,
    decoder_hidden_size=decoder_hidden_size,
    eos_token_id=eos_token_id
)

# 定义优化器
optimizer = optim.Adam(model.parameters(), lr=0.001)

### model

In [20]:
# 模型参数
embedding_dim = 32
encoder_hidden_size = 64
decoder_hidden_size = 64
eos_token_id = char_to_id['<EOS>']

# 初始化模型
model = Seq2SeqModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    encoder_hidden_size=encoder_hidden_size,
    decoder_hidden_size=decoder_hidden_size,
    eos_token_id=eos_token_id
)

### train

In [21]:
from torch import optim

# 定义优化器
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 加载模型
model.load_state_dict(torch.load('seq2seq_model.pth'))

# 训练循环
num_epochs = 500
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for x_batch, label_batch in dataloader:
        optimizer.zero_grad()
        scores, loss = model(x_batch, label_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

# 保存模型
torch.save(model.state_dict(), 'seq2seq_model.pth')

  model.load_state_dict(torch.load('seq2seq_model.pth'))


FileNotFoundError: [Errno 2] No such file or directory: 'seq2seq_model.pth'

### eval

In [19]:
# 加载模型
model.load_state_dict(torch.load('seq2seq_model.pth'))
model.eval()

# 测试函数
def test_model(model, input_word, char_to_id, id_to_char, max_length=10):
    with torch.no_grad():
        input_ids = word_to_ids(input_word, char_to_id, max_length)
        input_tensor = torch.tensor([input_ids])
        output_ids = model(input_tensor)
        output_word = ''.join([id_to_char[id] for id in output_ids[0].tolist() if id != char_to_id['<PAD>']])
        return output_word

# 测试几个例子
# test_words = ['hello', 'world', 'pytorch']
test_words = data[:5]
test_words = [lowercase_word[0] for lowercase_word in test_words]
for word in test_words:
    print(f"Input: {word} -> Output: {test_model(model, word, char_to_id, id_to_char)}")


Input: vhnkxpyn -> Output: VHNKXPYN<EOS>
Input: hoif -> Output: HOIF<EOS>
Input: lqs -> Output: LQS<EOS>
Input: kaz -> Output: KAZ<EOS>
Input: dxyhqyrfah -> Output: DXYHQYRFAH<EOS>


  model.load_state_dict(torch.load('seq2seq_model.pth'))
