In [9]:
import torch
import torch.nn as nn

class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, n_layers=1, dropout=0.5):
        super(RNNModel, self).__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.rnn = nn.LSTM(hidden_size, hidden_size, n_layers, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, input, hidden):
        embedded = self.embedding(input)
        output, hidden = self.rnn(embedded, hidden)
        output = self.fc(output)
        return output, hidden

    def init_hidden(self, batch_size):
        return (torch.zeros(self.n_layers, batch_size, self.hidden_size).to(device),
                torch.zeros(self.n_layers, batch_size, self.hidden_size).to(device))


In [10]:
import string
char_to_num = {char: i for i, char in enumerate(string.ascii_lowercase, 1)}

# 字符到数字的映射
char_to_num = {char: i for i, char in enumerate(string.ascii_lowercase, 1)}
char_to_num['<pad>'] = 0  # 填充符
char_to_num['-'] = 27  # -
char_to_num['<eos>'] = 28  # 结束符
char_to_num['<bos>'] = 29  # 开始符（如果使用）

num_to_char = {i: j for j, i in char_to_num.items()}

def tokenize(text, bidirectional=False):
    # 先全部变成小写，把大写加进来可能会出现
    lowercase_text = text.lower()
    # 27用来表示结束，如果需要可以在头上加上0，这样就可以随意给出一段文字然后来生成开始和结尾
    numbers = [char_to_num[char] for char in lowercase_text if char in char_to_num] + [28]
    if bidirectional == True:
        numbers = [29] + numbers
    return numbers

def reverse_tokenize(numbers, bidirectional=False):
    # 如果是双向模式，跳过第一个数字（29）
    start_index = 1 if bidirectional and numbers[0] == 29 else 0

    # 转换数字到字符，直到遇到28
    characters = []
    for number in numbers[start_index:]:
        if number == 28:
            break
        characters.append(num_to_char[number])


    # 将字符列表转换为字符串
    return ''.join(characters)


In [11]:
bidirectional = False
batch_size = 32
learning_rate = 0.01
input_size = 29
hidden_size = 100
output_size = 29
n_layers = 1
dropout = 0.5
n_epochs = 10

model = RNNModel(input_size, hidden_size, output_size, n_layers, dropout)
loss_function = nn.CrossEntropyLoss(ignore_index=char_to_num['<pad>'])
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)



In [12]:
from torch.utils.data import DataLoader
import torch.nn.functional as F
import numpy as np
from torch.nn.utils.rnn import pad_sequence

# 读取和处理数据
def load_data(filename):
    names = []
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            name = line.strip()
            name = tokenize(name, bidirectional=bidirectional)
            names.append(torch.tensor(name, dtype=torch.long))
    return names

names = load_data('names.txt')
f_names = load_data('female.txt')
m_names = load_data('male.txt')
dataset = f_names + m_names 


lengths = [len(sequence) for sequence in dataset]
padded_dataset = pad_sequence([torch.tensor(sequence, dtype=torch.long) for sequence in dataset], \
                              batch_first=True, padding_value=0)
data_loader = DataLoader(padded_dataset, batch_size=batch_size, shuffle=True)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")

model = model.to(device)

  padded_dataset = pad_sequence([torch.tensor(sequence, dtype=torch.long) for sequence in dataset], \


In [13]:
model.train()
for epoch in range(n_epochs):
    for batch in data_loader:
        hidden = model.init_hidden(batch.shape[0])
        input, target = batch[:, :-1], batch[:, 1:]    # 移除最后一个字符作为输入，第二个字符到最后一个字符作为目标
        input, target = input.to(device), target.to(device)
        optimizer.zero_grad()
        output, hidden = model(input, hidden)
        hidden = tuple([each.data for each in hidden])  # 分离隐藏状态
        # 计算损失
        loss = loss_function(output.reshape(-1, output_size), target.reshape(-1))
        loss.backward()
        optimizer.step()

        print('Epoch: {}/{}.............'.format(epoch, n_epochs), end=' ')
        print("Loss: {:.4f}".format(loss.item()))

Epoch: 0/10............. Loss: 3.3541
Epoch: 0/10............. Loss: 3.2112
Epoch: 0/10............. Loss: 2.9660
Epoch: 0/10............. Loss: 2.7389
Epoch: 0/10............. Loss: 2.6811
Epoch: 0/10............. Loss: 2.7217
Epoch: 0/10............. Loss: 2.6265
Epoch: 0/10............. Loss: 2.4687
Epoch: 0/10............. Loss: 2.4879
Epoch: 0/10............. Loss: 2.4216
Epoch: 0/10............. Loss: 2.5165
Epoch: 0/10............. Loss: 2.5389
Epoch: 0/10............. Loss: 2.3853
Epoch: 0/10............. Loss: 2.4855
Epoch: 0/10............. Loss: 2.4132
Epoch: 0/10............. Loss: 2.3196
Epoch: 0/10............. Loss: 2.2874
Epoch: 0/10............. Loss: 2.5116
Epoch: 0/10............. Loss: 2.3079
Epoch: 0/10............. Loss: 2.3382
Epoch: 0/10............. Loss: 2.3174
Epoch: 0/10............. Loss: 2.2107
Epoch: 0/10............. Loss: 2.1851
Epoch: 0/10............. Loss: 2.2409
Epoch: 0/10............. Loss: 2.2973
Epoch: 0/10............. Loss: 2.4332
Epoch: 0/10.

In [14]:
torch.save(model.state_dict(), 'model_state_dict.pth')


In [20]:
import torch.nn.functional as F

model.load_state_dict(torch.load('model_state_dict.pth'))
model.eval()  # 设置模型为评估模式

text = "hen" 
text_tokenize = [tokenize(text, bidirectional=bidirectional)[:-1]]

texts_tokenize = torch.tensor(text_tokenize, dtype=torch.long).to(device)#



# 通过模型运行数据
def test(texts_tokenize):
    update = torch.LongTensor(1, texts_tokenize.shape[1] + 1)  # 确保update是长整型
    hidden = model.init_hidden(1)
    output, hidden = model(texts_tokenize, hidden)

    probabilities = F.softmax(output, dim=-1)
    top_probabilities, top_indices = torch.topk(probabilities, 5)
    char_list = [num_to_char[number] for number in top_indices[:, -1].squeeze().tolist()]
    prob_list = []
    for item1, item2 in zip(char_list, top_probabilities[:, -1].squeeze().tolist()):
        item2 = "{:.3f}".format(item2)
        prob_list.append(f"{item1}: {item2}")
    print(prob_list)
    
    max_values, max_indices = torch.max(output, dim=-1)
    max_indices = max_indices.long()  # 确保max_indices是长整型

    if int(max_indices[:, -1]) == 28:
        return torch.cat([texts_tokenize[0], max_indices[:, -1]], dim=0)
    else:
        update[0] = torch.cat([texts_tokenize[0], max_indices[:, -1]], dim=0)
        
        return test(update)


name_a = reverse_tokenize(test(texts_tokenize=texts_tokenize).squeeze().tolist())

print(name_a)


['r: 0.506', 'd: 0.141', 'n: 0.071', 'i: 0.045', 'g: 0.038']
['i: 0.773', 'y: 0.152', 'a: 0.036', 'e: 0.021', 'r: 0.006']
['e: 0.477', 'c: 0.141', 'q: 0.079', 'd: 0.060', 'k: 0.058']
['<eos>: 0.790', 'l: 0.093', 't: 0.051', 'c: 0.017', 's: 0.009']
henrie
