In [9]:
# When running in Google Colab, make sure the files are uploaded to the
# directory referenced below.
zh_file_path = '/mnt/raw.zh'
en_file_path = '/mnt/raw.en'

# Load the parallel corpus, one sentence per line.
# The trailing newline is stripped from every line: the tokenizers further
# down are created with filters='' (nothing is filtered out), so a retained
# '\n' would stay glued to the last word of each sentence and create a
# separate "word\n" vocabulary entry.
with open(zh_file_path, 'r', encoding='utf-8') as f:
    raw_zh = [line.rstrip('\n') for line in f]

with open(en_file_path, 'r', encoding='utf-8') as f:
    raw_en = [line.rstrip('\n') for line in f]

print(f"Loaded {len(raw_zh)} Chinese sentences and {len(raw_en)} English sentences.")


Loaded 394066 Chinese sentences and 394066 English sentences.


In [14]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Create the tokenizers. filters='' disables Keras' default punctuation
# stripping, so every whitespace-separated piece is kept as a token.
# NOTE(review): tokens are split on spaces only — this presumably expects the
# Chinese text to be pre-segmented; confirm against the corpus.
tokenizer_zh = Tokenizer(filters='')
tokenizer_en = Tokenizer(filters='')

# Fit each tokenizer's vocabulary on its side of the corpus.
tokenizer_zh.fit_on_texts(raw_zh)
tokenizer_en.fit_on_texts(raw_en)

# Convert the sentences to lists of integer token ids.
seqs_zh = tokenizer_zh.texts_to_sequences(raw_zh)
seqs_en = tokenizer_en.texts_to_sequences(raw_en)

# Post-pad with 0 so each language becomes one rectangular array whose width
# is the longest sequence of that language.
seqs_zh = pad_sequences(seqs_zh, padding='post')
seqs_en = pad_sequences(seqs_en, padding='post')

# Split into training and test data: first 80% / last 20%, no shuffling.
train_size = int(0.8 * len(seqs_zh))
X_train, X_test = seqs_zh[:train_size], seqs_zh[train_size:]
y_train, y_test = seqs_en[:train_size], seqs_en[train_size:]

# Labels are kept as integer token ids (NOT one-hot encoded); the models are
# compiled with 'sparse_categorical_crossentropy', which consumes ids
# directly. y_train / y_test are already zero-padded above, so the former
# second pad_sequences() pass over them was a no-op copy and has been removed.
# +1 because word_index starts at 1 and id 0 is reserved for padding.
vocab_size_en = len(tokenizer_en.word_index) + 1


In [15]:
from tensorflow.keras.utils import Sequence
import numpy as np

class DataGenerator(Sequence):
    """Batch generator producing teacher-forcing inputs for a seq2seq model.

    Each batch is ``([encoder_input, decoder_input], decoder_target)`` where
    ``decoder_input`` is the target shifted right by one step (position 0 is
    filled with 0) and ``decoder_target`` is the unshifted target ids.

    Args:
        X: 2-D integer array of source token ids, one row per sample.
        y: 2-D integer array of target token ids, one row per sample.
        batch_size: Number of samples per batch; the last batch may be smaller.

    Raises:
        ValueError: If ``X`` and ``y`` do not contain the same number of rows.
    """

    def __init__(self, X, y, batch_size=16):
        # Initialise the Keras base class; newer Keras versions warn when a
        # Sequence/PyDataset subclass skips this call.
        super().__init__()
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of samples, got {len(X)} and {len(y)}"
            )
        self.X = X
        self.y = y
        self.batch_size = batch_size
        self.indices = np.arange(len(X))
        # Shuffle once up front so the first epoch is randomised as well.
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch (last partial batch included)."""
        return int(np.ceil(len(self.X) / self.batch_size))

    def __getitem__(self, index):
        """Build batch number ``index`` from the current shuffled order."""
        indices = self.indices[index * self.batch_size:(index + 1) * self.batch_size]
        X_batch = self.X[indices]
        y_batch = self.y[indices]
        # Teacher forcing: the decoder sees the target delayed by one step,
        # with id 0 standing in as the start token at position 0.
        decoder_input_data = np.zeros_like(y_batch)
        decoder_input_data[:, 1:] = y_batch[:, :-1]
        # Targets stay as integer ids of shape (batch, time); adding and then
        # removing a trailing axis, as done previously, had no effect.
        return [X_batch, decoder_input_data], y_batch

    def on_epoch_end(self):
        """Reshuffle the sample order in place."""
        np.random.shuffle(self.indices)


In [3]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

# Model hyper-parameters.
# This cell relies on tokenizer_zh, vocab_size_en, X_train/X_test and
# y_train/y_test from the tokenization cell, so that cell must run first.
embedding_dim = 8  # small embedding size to keep the model light
units = 8  # small number of LSTM units
vocab_size_zh = len(tokenizer_zh.word_index) + 1

# Encoder.
encoder_inputs = Input(shape=(None,))
# mask_zero=True makes the LSTM skip the zero padding. The sources are
# post-padded, so without a mask the returned final state would be computed
# after consuming all trailing padding steps rather than at the last real
# token.
# NOTE(review): a sample that is entirely padding (an empty source line)
# yields a fully masked row — confirm the corpus has none, or filter them.
encoder_embedding = Embedding(
    input_dim=vocab_size_zh, output_dim=embedding_dim, mask_zero=True
)(encoder_inputs)
encoder_lstm = LSTM(units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder. No mask here: DataGenerator uses id 0 as the start token at
# position 0 of the decoder input, so masking zeros would hide that step.
# NOTE(review): as a consequence padded target positions still count towards
# the loss and the reported accuracy.
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(input_dim=vocab_size_en, output_dim=embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(vocab_size_en, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Build and compile the model. Targets are integer ids, hence the sparse loss.
rnn_model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
rnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
rnn_model.summary()

# Create the data generators (reused by the later cells).
train_generator = DataGenerator(X_train, y_train, batch_size=16)
val_generator = DataGenerator(X_test, y_test, batch_size=16)

# Train the model.
rnn_model.fit(train_generator, validation_data=val_generator, epochs=1)  # single epoch to keep training time short


NameError: name 'tokenizer_zh' is not defined

In [None]:
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization, Dropout, GlobalAveragePooling1D, Dense

def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0):
    """Apply one Transformer encoder block (self-attention + feed-forward).

    Both sub-layers use the post-norm residual form ``LayerNorm(x + f(x))``.
    The attention sub-layer previously dropped its residual connection, so
    the block output depended on the attention output alone.

    Args:
        inputs: Tensor whose last dimension is the model width.
        head_size: ``key_dim`` passed to ``MultiHeadAttention``.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden feed-forward layer.
        dropout: Dropout rate used inside attention and after the feed-forward.

    Returns:
        Tensor with the same last dimension as ``inputs``.
    """
    # Self-attention sub-layer with residual connection.
    attention = MultiHeadAttention(key_dim=head_size, num_heads=num_heads, dropout=dropout)(inputs, inputs)
    x = LayerNormalization(epsilon=1e-6)(inputs + attention)
    res = x
    # Position-wise feed-forward sub-layer, projected back to the input width
    # so the residual addition is shape-compatible.
    x = Dense(ff_dim, activation="relu")(x)
    x = Dense(inputs.shape[-1])(x)
    x = Dropout(dropout)(x)
    return LayerNormalization(epsilon=1e-6)(x + res)

def _transformer_decoder_block(y, encoder_outputs, head_size, num_heads, ff_dim):
    """Apply one decoder block: causal self-attention, cross-attention, feed-forward."""
    # Causal mask: position t may only attend to positions <= t. The decoder
    # input is the target shifted by one, so without the mask the token to be
    # predicted would be directly visible one position ahead.
    self_attention = MultiHeadAttention(key_dim=head_size, num_heads=num_heads)(
        y, y, use_causal_mask=True
    )
    y = LayerNormalization(epsilon=1e-6)(y + self_attention)

    # Cross-attention: decoder positions query the encoded source sentence.
    cross_attention = MultiHeadAttention(key_dim=head_size, num_heads=num_heads)(
        y, encoder_outputs
    )
    y = LayerNormalization(epsilon=1e-6)(y + cross_attention)

    # Position-wise feed-forward, projected back to the block width.
    hidden = Dense(ff_dim, activation="relu")(y)
    hidden = Dense(y.shape[-1])(hidden)
    return LayerNormalization(epsilon=1e-6)(y + hidden)


def build_transformer_model(input_shape, vocab_size_zh, vocab_size_en, num_heads=4, ff_dim=32, num_transformer_blocks=2):
    """Build an encoder-decoder Transformer for Chinese -> English translation.

    The decoder now attends to the encoder output and uses causally masked
    self-attention. Previously ``encoder_outputs`` was computed but never
    used (the prediction ignored the source sentence) and the decoder could
    see future target tokens.

    Relies on the notebook-level ``embedding_dim`` for the model width.

    NOTE(review): no positional encoding is added to either embedding, and
    padding positions are not masked in attention — consider adding both.

    Args:
        input_shape: Shape of one encoder input sample, e.g. ``(source_len,)``.
        vocab_size_zh: Source vocabulary size (embedding input dimension).
        vocab_size_en: Target vocabulary size (embedding and softmax dimension).
        num_heads: Number of attention heads in every attention layer.
        ff_dim: Hidden width of the feed-forward sub-layers.
        num_transformer_blocks: Number of encoder blocks and of decoder blocks.

    Returns:
        An uncompiled Keras ``Model`` mapping ``[encoder_inputs, decoder_inputs]``
        to per-position softmax distributions over the target vocabulary.
    """
    encoder_inputs = Input(shape=input_shape)
    decoder_inputs = Input(shape=(None,))

    # Encoder stack.
    x = Embedding(vocab_size_zh, embedding_dim)(encoder_inputs)
    for _ in range(num_transformer_blocks):
        x = transformer_encoder(x, head_size=embedding_dim, num_heads=num_heads, ff_dim=ff_dim)
    encoder_outputs = LayerNormalization(epsilon=1e-6)(x)

    # Decoder stack, conditioned on the encoder output.
    y = Embedding(vocab_size_en, embedding_dim)(decoder_inputs)
    for _ in range(num_transformer_blocks):
        y = _transformer_decoder_block(
            y, encoder_outputs, head_size=embedding_dim, num_heads=num_heads, ff_dim=ff_dim
        )

    # Per-position distribution over the target vocabulary.
    decoder_outputs = Dense(vocab_size_en, activation="softmax")(y)

    return Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Build the Transformer model.
# The encoder input length is fixed to the padded width of X_train.
# vocab_size_zh, train_generator and val_generator come from the RNN cell and
# vocab_size_en from the tokenization cell, so both must have been run first.
input_shape = (X_train.shape[1],)
transformer_model = build_transformer_model(input_shape, vocab_size_zh, vocab_size_en)
transformer_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
transformer_model.summary()

# Train the Transformer model.
transformer_model.fit(train_generator, validation_data=val_generator, epochs=3)  # few epochs to keep training time short


In [None]:
def greedy_decode(model, encoder_seqs, max_len, batch_size=16):
    """Greedily decode target ids for every row of ``encoder_seqs``.

    The model takes ``[encoder_input, decoder_input]``, so predictions are
    built one position at a time: the token chosen at step t is written to
    decoder-input position t + 1. Id 0 serves as both start token and
    padding, matching how DataGenerator builds the decoder inputs.

    Args:
        model: Two-input seq2seq model returning (batch, time, vocab) scores.
        encoder_seqs: 2-D integer array of padded source ids.
        max_len: Number of target positions to produce per sample.
        batch_size: Samples decoded together; bounds the size of the
            (batch, time, vocab) output held in memory at any one time.

    Returns:
        Integer array of shape (len(encoder_seqs), max_len), zero-padded.
    """
    n_samples = len(encoder_seqs)
    decoded = np.zeros((n_samples, max_len), dtype=np.int32)
    for start in range(0, n_samples, batch_size):
        enc_batch = encoder_seqs[start:start + batch_size]
        dec_in = np.zeros((len(enc_batch), max_len), dtype=np.int32)
        finished = np.zeros(len(enc_batch), dtype=bool)
        for step in range(max_len):
            probs = model.predict_on_batch([enc_batch, dec_in])
            next_ids = np.argmax(probs[:, step, :], axis=-1)
            # Once a sequence has emitted 0 (padding) it stays padded.
            next_ids[finished] = 0
            finished |= next_ids == 0
            decoded[start:start + len(enc_batch), step] = next_ids
            if finished.all():
                break
            if step + 1 < max_len:
                dec_in[:, step + 1] = next_ids
    return decoded


# Generate pseudo labels with the trained model.
# The model needs both an encoder and a decoder input, so a plain
# predict(X_test) cannot be used; it would also materialise a
# (samples, time, vocab) array for the whole test set at once.
# This runs one forward pass per output position per batch and is slow on a
# large test set.
pseudo_labels = greedy_decode(transformer_model, X_test, max_len=y_train.shape[1], batch_size=16)
pseudo_labels = pseudo_labels.astype(y_train.dtype)

# Combine the original English labels with the pseudo labels.
X_combined = np.concatenate((X_train, X_test), axis=0)
y_combined = np.concatenate((y_train, pseudo_labels), axis=0)

# Create the data generator for the combined set.
combined_generator = DataGenerator(X_combined, y_combined, batch_size=16)

# Retrain the model on the combined data.
# NOTE(review): val_generator is built from X_test, which is now also part of
# the training inputs, so the validation score is no longer held-out.
transformer_model.fit(combined_generator, epochs=3, validation_data=val_generator)  # few epochs to keep training time short


In [None]:
# Evaluate the RNN seq2seq model on the validation generator.
# Both models are compiled with metrics=['accuracy'], so evaluate() yields
# the loss followed by the accuracy.
rnn_test_loss, rnn_test_acc = rnn_model.evaluate(val_generator)
print(f"RNN Seq2Seq Test accuracy: {rnn_test_acc}")

# Evaluate the Transformer model on the same generator.
transformer_test_loss, transformer_test_acc = transformer_model.evaluate(val_generator)
print(f"Transformer Test accuracy: {transformer_test_acc}")

# Demonstrate translation results.
def translate_sentence(sentence, model, tokenizer_input, tokenizer_output, max_output_len=None):
    """Translate one sentence by greedy autoregressive decoding.

    The decoder input starts as all zeros (id 0 is the start/padding token
    used during training) and is filled in one position at a time with the
    model's own predictions. Previously the source ids were passed as the
    decoder input, which feeds source-vocabulary ids into the target
    embedding and never decodes step by step.

    Args:
        sentence: Source sentence as a string.
        model: Two-input seq2seq model returning (batch, time, vocab) scores.
        tokenizer_input: Fitted tokenizer for the source language.
        tokenizer_output: Fitted tokenizer for the target language.
        max_output_len: Maximum number of tokens to generate; defaults to the
            padded source width ``X_train.shape[1]``.

    Returns:
        The translated sentence as a single string (empty if the first
        predicted token is the padding id).
    """
    seq = tokenizer_input.texts_to_sequences([sentence])
    seq = pad_sequences(seq, maxlen=X_train.shape[1], padding='post')
    if max_output_len is None:
        max_output_len = X_train.shape[1]

    decoder_input = np.zeros((1, max_output_len), dtype=seq.dtype)
    predicted_ids = []
    for step in range(max_output_len):
        prediction = model.predict([seq, decoder_input], verbose=0)
        token_id = int(np.argmax(prediction[0, step]))
        # Targets are zero-padded, so predicting 0 marks the end of the sentence.
        if token_id == 0:
            break
        predicted_ids.append(token_id)
        if step + 1 < max_output_len:
            decoder_input[0, step + 1] = token_id

    translated_sentence = tokenizer_output.sequences_to_texts([predicted_ids])
    return translated_sentence[0]

# Example translation with both models.
# NOTE(review): tokenizer_zh presumably splits on whitespace only; this
# sentence contains no spaces, so it may not map to any known token — confirm
# how the training corpus is segmented.
example_sentence = "你好，世界！"
rnn_translated_sentence = translate_sentence(example_sentence, rnn_model, tokenizer_zh, tokenizer_en)
transformer_translated_sentence = translate_sentence(example_sentence, transformer_model, tokenizer_zh, tokenizer_en)

print(f"RNN Seq2Seq Translated: {rnn_translated_sentence}")
print(f"Transformer Translated: {transformer_translated_sentence}")
