In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

# 假设我们有一个非常小的数据集
corpus = 'hello world this is a simple example corpus'
chars = sorted(list(set(corpus)))
vocab_size = len(chars)
char_to_index = dict((c, i) for i, c in enumerate(chars))
index_to_char = dict((i, c) for i, c in enumerate(chars))

# 准备数据
seq_length = 10
dataX = []
dataY = []
for i in range(0, len(corpus) - seq_length):
    seq_in = corpus[i:i + seq_length]
    seq_out = corpus[i + seq_length]
    dataX.append([char_to_index[char] for char in seq_in])
    dataY.append(char_to_index[seq_out])

# 转换数据为PyTorch张量
tensor_x = torch.tensor(dataX, dtype=torch.long).cuda()
tensor_y = torch.tensor(dataY, dtype=torch.long).cuda()

# 创建LSTM模型
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        output, (hidden, cell) = self.lstm(embedded)
        return self.fc(hidden[-1])

# 设置模型参数
embedding_dim = 32
hidden_dim = 64
output_dim = vocab_size

model = LSTMModel(vocab_size, embedding_dim, hidden_dim, output_dim).cuda()

# 定义损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

# 训练模型
num_epochs = 100
for epoch in range(num_epochs):
    model.zero_grad()
    output = model(tensor_x)
    loss = criterion(output, tensor_y)
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

# 生成文本
def generate_text(model, start_char, gen_size=50):
    model.eval()
    with torch.no_grad():
        input = torch.tensor([[char_to_index[start_char]]], dtype=torch.long).cuda()
        output_chars = [start_char]

        for _ in range(gen_size):
            output = model(input)
            _, top_i = output.topk(1)
            char = index_to_char[top_i.item()]
            output_chars.append(char)

            input = torch.tensor([[top_i.item()]], dtype=torch.long).cuda()

        return ''.join(output_chars)

# 使用模型生成文本
start_char = 'h'
generated_text = generate_text(model, start_char)
print(generated_text)


ValueError: Expected input batch_size (10) to match target batch_size (33).

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim

# 假设我们有一个非常小的数据集
corpus = 'hello world this is a simple example corpus'
chars = sorted(list(set(corpus)))
vocab_size = len(chars)
char_to_index = dict((c, i) for i, c in enumerate(chars))
index_to_char = dict((i, c) for i, c in enumerate(chars))

# 准备数据
seq_length = 10
dataX = []
dataY = []
for i in range(0, len(corpus) - seq_length):
    seq_in = corpus[i:i + seq_length]
    seq_out = corpus[i + seq_length]
    dataX.append([char_to_index[char] for char in seq_in])
    dataY.append(char_to_index[seq_out])

# 转换数据为PyTorch张量
tensor_x = torch.tensor(dataX, dtype=torch.long).cuda()
tensor_y = torch.tensor(dataY, dtype=torch.long).cuda()

# 创建LSTM模型
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        output, (hidden, cell) = self.lstm(embedded)
        return self.fc(hidden[-1])

# 设置模型参数
embedding_dim = 32
hidden_dim = 64
output_dim = vocab_size

model = LSTMModel(vocab_size, embedding_dim, hidden_dim, output_dim).cuda()

# 定义损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

# 训练模型
num_epochs = 100
for epoch in range(num_epochs):
    for i in range(0, tensor_x.size(0), 1):  # Adjusted to loop through one sequence at a time
        model.zero_grad()
        output = model(tensor_x[i].view(-1, 1))  # Adjusted to handle one sequence at a time
        loss = criterion(output, tensor_y[i].view(1))  # Adjusted to handle one target at a time
        loss.backward()
        optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

# 生成文本
def generate_text(model, start_char, gen_size=50):
    model.eval()
    with torch.no_grad():
        input = torch.tensor([[char_to_index[start_char]]], dtype=torch.long).cuda()
        output_chars = [start_char]

        for _ in range(gen_size):
            output = model(input)
            _, top_i = output.topk(1)
            char = index_to_char[top_i.item()]
            output_chars.append(char)

            input = torch.tensor([[top_i.item()]], dtype=torch.long).cuda()

        return ''.join(output_chars)

# 使用模型生成文本
start_char = 'h'
generated_text = generate_text(model, start_char)
print(generated_text)


Epoch [10/100], Loss: 0.6315
Epoch [20/100], Loss: 0.1015
Epoch [30/100], Loss: 0.0368
Epoch [40/100], Loss: 0.0186
Epoch [50/100], Loss: 0.0112
Epoch [60/100], Loss: 0.0074
Epoch [70/100], Loss: 0.0053
Epoch [80/100], Loss: 0.0038
Epoch [90/100], Loss: 0.0029
Epoch [100/100], Loss: 0.0022
his s s s s s s s s s s s s s s s s s s s s s s s s


In [3]:
# 首先，我们需要一些基本的库来构建和训练我们的模型。torch是PyTorch的主库，它提供了构建神经网络所需的所有工具。torch.nn是PyTorch的神经网络库，它包含了许多预先定义好的层和函数，我们可以直接使用它们来构建模型。torch.optim是PyTorch的优化库，它包含了许多优化算法，我们可以使用它们来更新模型的权重。
import torch
import torch.nn as nn
import torch.optim as optim

# 接下来，我们定义了一个非常小的文本数据集，这个数据集就是我们的“语料库”。我们从这个语料库中创建了一个字符到索引的映射，这样我们就可以将文本字符转换为数字，因为神经网络只能处理数字输入。
# 假设我们有一个非常小的数据集
corpus = 'hello world this is a simple example corpus'
chars = sorted(list(set(corpus)))
vocab_size = len(chars)
char_to_index = dict((c, i) for i, c in enumerate(chars))
index_to_char = dict((i, c) for i, c in enumerate(chars))

# 然后，我们准备我们的数据。我们决定每个序列的长度是10个字符，这意味着我们的模型将会每次看到10个字符，并尝试预测这10个字符之后的下一个字符。我们为每个这样的序列创建了一个输入和一个目标输出。
# 准备数据
seq_length = 10
dataX = []
dataY = []
for i in range(0, len(corpus) - seq_length):
    seq_in = corpus[i:i + seq_length]
    seq_out = corpus[i + seq_length]
    dataX.append([char_to_index[char] for char in seq_in])
    dataY.append(char_to_index[seq_out])

# 我们将这些输入和输出转换为PyTorch张量，并将它们移动到GPU上（如果有的话）以加速训练过程。
# 转换数据为PyTorch张量
tensor_x = torch.tensor(dataX, dtype=torch.long).cuda()
tensor_y = torch.tensor(dataY, dtype=torch.long).cuda()

# 接下来，我们定义了我们的LSTM模型。这个模型包含一个嵌入层，它将字符索引转换为向量表示；一个LSTM层，它将处理这些向量序列；和一个全连接层，它将LSTM层的输出转换为每个可能字符的分数
# 创建LSTM模型
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        output, (hidden, cell) = self.lstm(embedded)
        return self.fc(hidden[-1])

# 我们设置了一些模型参数，并创建了一个模型实例。我们还定义了损失函数和优化器，这些都是训练神经网络所必需的。
# 设置模型参数
embedding_dim = 32
hidden_dim = 64
output_dim = vocab_size

model = LSTMModel(vocab_size, embedding_dim, hidden_dim, output_dim).cuda()

# 定义损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

# 然后，我们开始训练模型。我们决定训练100个周期，每个周期我们都将遍历整个数据集一次。在每次迭代中，我们将一个序列输入模型，并计算模型预测的下一个字符与实际下一个字符之间的差异。然后，我们使用这个差异来更新模型的权重。
# 训练模型
num_epochs = 100
for epoch in range(num_epochs):
    for i in range(0, tensor_x.size(0), 1):  # Adjusted to loop through one sequence at a time
        model.zero_grad()
        output = model(tensor_x[i].view(-1, 1))  # Adjusted to handle one sequence at a time
        loss = criterion(output, tensor_y[i].view(1))  # Adjusted to handle one target at a time
        loss.backward()
        optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

# 最后，我们定义了一个函数来生成文本。这个函数接受一个起始字符，并使用模型来预测接下来的50个字符。它将每个预测的字符添加到文本中，并使用这个新的文本片段作为输入来预测下一个字符。
# 生成文本
def generate_text(model, start_char, gen_size=50):
    model.eval()
    with torch.no_grad():
        input = torch.tensor([[char_to_index[start_char]]], dtype=torch.long).cuda()
        output_chars = [start_char]

        for _ in range(gen_size):
            output = model(input)
            _, top_i = output.topk(1)
            char = index_to_char[top_i.item()]
            output_chars.append(char)

            input = torch.tensor([[top_i.item()]], dtype=torch.long).cuda()

        return ''.join(output_chars)

# 使用模型生成文本
start_char = 'h'
generated_text = generate_text(model, start_char)
print(generated_text)


Epoch [10/100], Loss: 1.1415
Epoch [20/100], Loss: 0.2357
Epoch [30/100], Loss: 0.0667
Epoch [40/100], Loss: 0.0308
Epoch [50/100], Loss: 0.0176
Epoch [60/100], Loss: 0.0113
Epoch [70/100], Loss: 0.0077
Epoch [80/100], Loss: 0.0055
Epoch [90/100], Loss: 0.0041
Epoch [100/100], Loss: 0.0031
hrpus corpus corpus corpus corpus corpus corpus cor
