## 读取数据

In [29]:
import os

import torch
from torch import optim

# Parallel corpus filled by read_corpus(): input_texts[i] and target_texts[i]
# are taken from the same bitext entry.
input_texts = []
target_texts = []

# Build the path with os.path.join instead of a backslash literal: '\C' and
# '\双' are invalid escape sequences in a normal string (SyntaxWarning on
# recent Python versions) and a hard-coded '\' only works on Windows.
root_dir = os.path.join('.', 'Classical-Modern-main', '双语数据')
# Entries directly under the corpus root; the main cell reads books[0].
books = os.listdir(root_dir)

def read_corpus(dir):
    """Collect sentence pairs from every 'bitext.txt' found under `dir`.

    Each file is read as UTF-8 and split into entries on blank lines. For
    every entry the first non-blank line (minus its first 3 characters) is
    appended to the global `target_texts`, and the second non-blank line
    (minus its first 4 characters) is appended to the global `input_texts`.
    The stripped characters are presumably the '古文：' / '现代文：' labels of
    the corpus -- verify against the data files.

    Args:
        dir: Directory that is walked recursively with os.walk.

    Returns:
        None. Results are accumulated in `input_texts` / `target_texts`.
    """
    global input_texts
    global target_texts
    for root, _dirs, files in os.walk(dir):
        if 'bitext.txt' not in files:
            continue
        file_path = os.path.join(root, 'bitext.txt')
        with open(file_path, 'r', encoding='utf-8') as file:
            data = file.read()
        for piece in data.split('\n\n'):
            # Drop blank lines so that extra newlines between entries neither
            # shift which line is which nor produce empty sentence pairs.
            lines = [line for line in piece.split('\n') if line.strip()]
            if len(lines) < 2:
                # Empty or incomplete entry: skip it and keep reading. The
                # rest of the file must not be dropped because of one gap.
                continue
            target_texts.append(lines[0][3:])
            input_texts.append(lines[1][4:])
    return

## 数据预处理

In [30]:
from jiayan import load_lm
from jiayan import CharHMMTokenizer
from jieba import lcut
from torch.nn.utils.rnn import pad_sequence

# Module-level state that data_pre_work() rebinds.
# Token -> id tables for the input side and the target side.
word2idx_input, word2idx_target = {}, {}
# Reverse tables (id -> token).
idx2word_input, idx2word_target = {}, {}
# Padded id sequences; placeholders until data_pre_work() replaces them.
padded_input, padded_target = [], []
# Longest sequence length on each side.
maxlen_input = maxlen_target = 0

def _build_vocab(tokenized_sentences):
    """Return a token -> id dict; id 0 is '<PAD>', other ids follow first appearance."""
    vocab = {'<PAD>': 0}
    for sentence in tokenized_sentences:
        for word in sentence:
            # setdefault keeps the id a token received the first time it was
            # seen. Numbering by first appearance (instead of iterating a
            # set) makes the ids reproducible from run to run.
            vocab.setdefault(word, len(vocab))
    return vocab


def _encode(tokenized_sentences, vocab):
    """Turn each tokenized sentence into a 1-D tensor of vocabulary ids."""
    # dtype is explicit: torch.tensor([]) would otherwise be float32 for an
    # empty sentence, which is not a valid index tensor for nn.Embedding.
    return [
        torch.tensor([vocab[word] for word in sentence], dtype=torch.long)
        for sentence in tokenized_sentences
    ]


def data_pre_work(lm_path='jiayan.klm'):
    """Tokenize the loaded corpus and build vocabularies and padded id tensors.

    Reads the globals `input_texts` and `target_texts` and rebinds the globals
    `word2idx_input`, `word2idx_target`, `idx2word_input`, `idx2word_target`,
    `padded_input`, `padded_target`, `maxlen_input` and `maxlen_target`.

    Args:
        lm_path: Path of the language-model file handed to jiayan's load_lm.
            Defaults to 'jiayan.klm' in the current working directory.

    Raises:
        ValueError: If `input_texts` or `target_texts` is empty.
    """
    global word2idx_input, word2idx_target
    global idx2word_input, idx2word_target
    global padded_input, padded_target
    global maxlen_input, maxlen_target

    if not input_texts or not target_texts:
        raise ValueError('input_texts/target_texts are empty; call read_corpus() first')

    # Segment the modern-Chinese side with jieba.
    tokenized_input_data = [lcut(sentence) for sentence in input_texts]

    # Segment the classical-Chinese side with jiayan.
    lm = load_lm(lm_path)
    target_tokenizer = CharHMMTokenizer(lm)
    tokenized_target_data = [list(target_tokenizer.tokenize(sentence)) for sentence in target_texts]

    # Build the vocabularies and their reverse lookups.
    word2idx_input = _build_vocab(tokenized_input_data)
    word2idx_target = _build_vocab(tokenized_target_data)
    idx2word_input = {idx: word for word, idx in word2idx_input.items()}
    idx2word_target = {idx: word for word, idx in word2idx_target.items()}

    # Convert the token lists to id tensors.
    tensor_input_sequence = _encode(tokenized_input_data, word2idx_input)
    tensor_target_sequence = _encode(tokenized_target_data, word2idx_target)

    maxlen_input = max(len(seq) for seq in tensor_input_sequence)
    maxlen_target = max(len(seq) for seq in tensor_target_sequence)

    # Right-pad every sequence to the longest one. pad_sequence fills with 0
    # by default, which is the id reserved for '<PAD>'.
    padded_input = pad_sequence(tensor_input_sequence, batch_first=True)
    padded_target = pad_sequence(tensor_target_sequence, batch_first=True)

## 定义语言风格迁移模型

In [31]:
from torch import nn
class StyleTransferModel(nn.Module):
    """Embedding -> LSTM -> linear head that scores the last time step only.

    Args:
        input_size: Number of rows in the embedding table.
        hidden_size: Embedding width, also used as the LSTM hidden width.
        output_size: Width of the linear head's output.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=hidden_size)
        self.lstm = nn.LSTM(input_size=hidden_size, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, input_seq):
        """Return the linear head applied to the LSTM output at the final step.

        The `[:, -1, :]` indexing below requires a batch-first 3-D LSTM
        output, so `input_seq` must be a 2-D batch of id sequences.
        NOTE(review): if the batch is right-padded, the final step of a short
        sequence is presumably a padding position -- confirm this is intended.
        """
        token_vectors = self.embedding(input_seq)
        step_outputs, _state = self.lstm(token_vectors)
        # Only the last time step is fed to the output layer.
        last_step = step_outputs[:, -1, :]
        return self.fc(last_step)

## 定义训练函数

In [32]:
def train(model, input_sequences, target_sequences, criterion, optimizer, num_epochs=1):
    """Run full-batch optimization steps on `model` and return the last loss.

    Every epoch performs one forward pass over all of `input_sequences`, one
    backward pass and one optimizer step. The loss is printed on every tenth
    epoch.

    Args:
        model: Module called as `model(input_sequences)`.
        input_sequences: Batch passed to the model unchanged.
        target_sequences: Targets passed to `criterion` together with the
            model output.
        criterion: Callable returning a scalar loss tensor.
        optimizer: Optimizer bound to the model's parameters.
        num_epochs: Number of optimization steps. Defaults to 1 so the
            function can also be driven from an outer epoch loop.

    Returns:
        The loss of the final epoch as a Python float, or None when
        `num_epochs` is 0.
    """
    model.train()

    last_loss = None
    for epoch in range(num_epochs):
        optimizer.zero_grad()
        output = model(input_sequences)
        loss = criterion(output, target_sequences)
        loss.backward()
        optimizer.step()
        last_loss = loss.item()

        if (epoch + 1) % 10 == 0:
            print(f'Epoch: {epoch+1}, Loss: {last_loss}')

    return last_loss

## 定义预测函数

In [33]:
def predict(model, input_sequence):
    """Return the argmax over dim 1 of `model(input_sequence)`.

    The model is switched to eval mode (and left there), and the forward
    pass runs with gradient tracking disabled.
    """
    model.eval()
    with torch.no_grad():
        scores = model(input_sequence)
        return scores.argmax(dim=1)

## 主函数

In [34]:
if __name__ == '__main__':
    # Load only the first entry found under the corpus root.
    read_corpus(os.path.join(root_dir, books[0]))

    data_pre_work()

    # Hyperparameters.
    input_size = len(word2idx_input)
    output_size = len(word2idx_target)
    hidden_size = 256
    learning_rate = 0.001
    num_epochs = 100

    # Model, loss function and optimizer.
    model = StyleTransferModel(input_size, hidden_size, output_size)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # NOTE(review): StyleTransferModel.forward returns one logit vector per
    # sequence, so CrossEntropyLoss needs one class index per sequence and
    # cannot take the full (batch, maxlen_target) tensor. Training on the
    # first target token keeps the script runnable with the current model;
    # producing whole sentences needs a real sequence-to-sequence model.
    first_target_tokens = padded_target[:, 0]

    # Train. train() runs all epochs itself and prints the loss every tenth
    # epoch, so no outer epoch loop is needed.
    train(model, padded_input, first_target_tokens, criterion, optimizer, num_epochs)

    # Predict for one user-supplied sentence.
    input_seq = input("请输入进行风格转换的句子:")
    # Words that are not in the training vocabulary are dropped.
    known_ids = [word2idx_input[word] for word in lcut(input_seq) if word in word2idx_input]
    # Keep at most maxlen_input ids so they fit the padded buffer below.
    known_ids = known_ids[:maxlen_input]
    # Fill a fresh buffer: writing into padded_input would overwrite the
    # training data and run prediction on the whole training set.
    tensor_input = torch.zeros(maxlen_input, dtype=torch.long)
    tensor_input[:len(known_ids)] = torch.tensor(known_ids, dtype=torch.long)
    # The model indexes its LSTM output as [:, -1, :], so add a batch dimension.
    output_seq = predict(model, tensor_input.unsqueeze(0))
    print("转换结果为:")
    # Show the predicted ids as target-vocabulary tokens.
    print(''.join(idx2word_target[idx] for idx in output_seq.tolist()))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\WANGLE~1\AppData\Local\Temp\jieba.cache


Loading model cost 0.364 seconds.
Prefix dict has been built successfully.


OSError: Cannot read model 'jiayan.klm' (util\file.cc:74 in util::OpenReadOrThrow threw ErrnoException because `-1 == (ret = _open(name, 0x8000 | 0x0000))'. No such file or directory while opening c:\Users\Wang Lei\PycharmProjects\pythonProject\jiayan.klm)