In [None]:
# Mount Google Drive at /content/gdrive; the data paths used below live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Read the whole corpus file into memory, then cut it into one string per line.
data_path = "/content/gdrive/MyDrive/Colab Notebooks/cmn.txt"
with open(data_path, mode = "r", encoding = "utf-8") as f:
    corpus_text = f.read()
lines = corpus_text.split("\n")

In [None]:
# Preview the first ten raw lines read from the corpus file.
lines[:10]

['Hi.\t嗨。\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #891077 (Martha)',
 'Hi.\t你好。\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #4857568 (musclegirlxyp)',
 'Run.\t你用跑的。\tCC-BY 2.0 (France) Attribution: tatoeba.org #4008918 (JSakuragi) & #3748344 (egg0073)',
 'Stop!\t住手！\tCC-BY 2.0 (France) Attribution: tatoeba.org #448320 (CM) & #448321 (GlossaMatik)',
 'Wait!\t等等！\tCC-BY 2.0 (France) Attribution: tatoeba.org #1744314 (belgavox) & #4970122 (wzhd)',
 'Wait!\t等一下！\tCC-BY 2.0 (France) Attribution: tatoeba.org #1744314 (belgavox) & #5092613 (mirrorvan)',
 'Begin.\t开始！\tCC-BY 2.0 (France) Attribution: tatoeba.org #6102432 (mailohilohi) & #5094852 (Jin_Dehong)',
 'Hello!\t你好。\tCC-BY 2.0 (France) Attribution: tatoeba.org #373330 (CK) & #4857568 (musclegirlxyp)',
 'I try.\t我试试。\tCC-BY 2.0 (France) Attribution: tatoeba.org #20776 (CK) & #8870261 (will66)',
 'I won!\t我赢了。\tCC-BY 2.0 (France) Attribution: tatoeba.org #2005192 (CK) & #5102367 (mirrorvan)']

In [None]:
import re
import pickle as pkl

def preprocess_cn(sentence):
    """
    Lowercases a Chinese sentence and splits it into single-character tokens
    separated by one space, surrounded by <SOS> and <EOS>.

    Whitespace in the input only separates characters and never becomes a
    token itself, so the result always has exactly one space between tokens.
    Every other character, including apostrophes and punctuation, is kept as
    its own token.

    Args:
        sentence: the raw sentence string.

    Returns:
        A string such as "<SOS> 你 好 。 <EOS>"; "<SOS> <EOS>" for empty input.
    """
    # Lowercase so that any embedded Latin letters are case-normalised.
    sentence = sentence.lower()
    # Keep every non-whitespace character as one token. Filtering here (rather
    # than collapsing spaces and then joining every character) avoids emitting
    # runs of spaces and avoids dropping characters that are not whitespace.
    chars = [ch for ch in sentence if not ch.isspace()]
    # Add the start/end markers so sentence boundaries are explicit.
    return " ".join(["<SOS>", *chars, "<EOS>"])


def preprocess_eng(sentence):
    """
    Lowercases an English sentence, separates punctuation from the preceding
    word, drops unsupported characters and surrounds the result with <SOS>
    and <EOS>.

    Args:
        sentence: the raw sentence string.

    Returns:
        A string of tokens separated by exactly one space, e.g.
        "<SOS> hi . <EOS>"; "<SOS> <EOS>" when nothing survives the filtering.
    """
    sentence = sentence.lower().strip()
    # Insert a space BEFORE each punctuation mark, e.g. "hello!" -> "hello !".
    # No space is added after it, so "don't" becomes "don 't".
    sentence = re.sub(r"([,.!?\"'])", r" \1", sentence)
    # Replace everything that is not a letter or one of the kept punctuation
    # marks (digits, #, @, & ...) with a space.
    sentence = re.sub(r"[^a-zA-Z,.!?\"']", ' ', sentence)
    # Split on whitespace only after the filtering above, so the spaces it
    # introduces are merged too and no leading/trailing blanks remain.
    words = sentence.split()
    return " ".join(["<SOS>", *words, "<EOS>"])

In [None]:
seq_pairs = []

for line in lines:
    # Split once and reuse the result.
    fields = line.split('\t')
    # Keep only lines that carry an English sentence, a Chinese sentence and
    # at least one more field (in the lines previewed above, the attribution).
    if len(fields) < 3:
        continue
    # Take the first two fields by slicing, so a line with more than three
    # tab-separated fields no longer fails to unpack.
    eng_doc, cn_doc = fields[:2]
    seq_pairs.append([preprocess_eng(eng_doc), preprocess_cn(cn_doc)])

In [None]:
# Preview the first ten [English, Chinese] pairs after preprocessing.
seq_pairs[:10]

[['<SOS> hi . <EOS>', '<SOS> 嗨 。 <EOS>'],
 ['<SOS> hi . <EOS>', '<SOS> 你 好 。 <EOS>'],
 ['<SOS> run . <EOS>', '<SOS> 你 用 跑 的 。 <EOS>'],
 ['<SOS> stop ! <EOS>', '<SOS> 住 手 ！ <EOS>'],
 ['<SOS> wait ! <EOS>', '<SOS> 等 等 ！ <EOS>'],
 ['<SOS> wait ! <EOS>', '<SOS> 等 一 下 ！ <EOS>'],
 ['<SOS> begin . <EOS>', '<SOS> 开 始 ！ <EOS>'],
 ['<SOS> hello ! <EOS>', '<SOS> 你 好 。 <EOS>'],
 ['<SOS> i try . <EOS>', '<SOS> 我 试 试 。 <EOS>'],
 ['<SOS> i won ! <EOS>', '<SOS> 我 赢 了 。 <EOS>']]

In [None]:
# Save list seq_pairs to file (uncomment and run once to create the pickle that a later cell loads)
# with open("/content/gdrive/MyDrive/Colab Notebooks/eng-cn.pkl", "wb") as f:
#     pkl.dump(seq_pairs, f)

In [None]:
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [None]:
import os

# Prefer the pickled pairs stored on Drive when they exist.
pairs_pkl_path = "/content/gdrive/MyDrive/Colab Notebooks/eng-cn.pkl"
if os.path.exists(pairs_pkl_path):
    # NOTE(security): unpickling can execute arbitrary code, so only load a
    # pickle that this notebook wrote itself.
    with open(pairs_pkl_path, "rb") as f:
        seq_pairs = pkl.load(f)
# Otherwise keep the seq_pairs already built in memory by the preprocessing
# loop: the cell that writes the pickle is commented out, so on a fresh run the
# file may not exist yet.

# reduce size of seq_pairs
n_samples = 10000
seq_pairs = seq_pairs[:n_samples]

# Split the pairs into parallel lists of English and Chinese sentences.
eng_sentences = [pair[0] for pair in seq_pairs]
chn_sentences = [pair[1] for pair in seq_pairs]

In [None]:
# Show the first preprocessed English sentence.
eng_sentences[0]

'<SOS> hi . <EOS>'

In [None]:
# Show the first preprocessed Chinese sentence.
chn_sentences[0]

'<SOS> 嗨 。 <EOS>'

In [None]:
def create_tokeniser(sentences, n_preview = 3):
    """
    Fit a Tokenizer on `sentences` and convert them to integer id sequences.

    Args:
        sentences: sequence of preprocessed, space-separated sentence strings.
        n_preview: how many (sentence, id sequence) pairs to print as a sanity
            check; fewer are printed when `sentences` is shorter.

    Returns:
        (sequences, tokeniser): the id sequence of every sentence and the
        fitted Tokenizer.
    """
    # filters=' ' filters out only the space character, so punctuation marks
    # are kept as tokens.
    tokeniser = Tokenizer(filters = ' ')
    tokeniser.fit_on_texts(sentences)
    # Convert the corpus once and reuse the result for both the preview and
    # the return value, instead of re-tokenising everything per preview line.
    token_ids = tokeniser.texts_to_sequences(sentences)
    # Slicing keeps the preview safe when there are fewer than n_preview sentences.
    for original, ids in zip(sentences[:n_preview], token_ids[:n_preview]):
        print("original: {} - word tokenised: {}".format(original, ids))
    return token_ids, tokeniser

# Word tokenise the source (English) and target (Chinese) sentences; each call
# returns the id sequences together with the tokeniser fitted on that language.
eng_word_tokenised, eng_tokeniser = create_tokeniser(eng_sentences)
chn_word_tokenised, chn_tokeniser = create_tokeniser(chn_sentences)

original: <SOS> hi . <EOS> - word tokenised: [1, 730, 3, 2]
original: <SOS> hi . <EOS> - word tokenised: [1, 730, 3, 2]
original: <SOS> run . <EOS> - word tokenised: [1, 322, 3, 2]
original: <SOS> 嗨 。 <EOS> - word tokenised: [1, 1284, 3, 2]
original: <SOS> 你 好 。 <EOS> - word tokenised: [1, 6, 25, 3, 2]
original: <SOS> 你 用 跑 的 。 <EOS> - word tokenised: [1, 6, 138, 268, 7, 3, 2]


In [None]:
# Show the id sequence of the first English sentence.
eng_word_tokenised[0]

[1, 730, 3, 2]

In [None]:
print(eng_tokeniser.word_index)  # show the English word -> id mapping
print(chn_tokeniser.word_index)  # show the Chinese word -> id mapping

{'<sos>': 1, '<eos>': 2, '.': 3, 'i': 4, '?': 5, 'you': 6, 'is': 7, 'a': 8, 'tom': 9, 'the': 10, "'s": 11, "'t": 12, 'to': 13, 'it': 14, 'he': 15, "'m": 16, 'my': 17, 'do': 18, 'this': 19, 'me': 20, 'are': 21, 'we': 22, 'that': 23, 'can': 24, 'what': 25, 'have': 26, 'don': 27, 'your': 28, 'she': 29, 'like': 30, 'was': 31, "'re": 32, 'go': 33, 'in': 34, 'not': 35, 'of': 36, 'how': 37, ',': 38, 'want': 39, '!': 40, 'be': 41, 'on': 42, 'at': 43, 'here': 44, "'ll": 45, 'did': 46, 'very': 47, 'please': 48, 'they': 49, 'him': 50, 'for': 51, 'has': 52, 'his': 53, 'let': 54, 'no': 55, 'good': 56, 'know': 57, 'up': 58, 'where': 59, "'ve": 60, 'need': 61, 'there': 62, 'now': 63, 'her': 64, 'come': 65, 'why': 66, 'time': 67, 'all': 68, 'who': 69, 'am': 70, 'see': 71, 'will': 72, 'get': 73, 'too': 74, 'got': 75, 'out': 76, 'isn': 77, 'so': 78, 'car': 79, 'home': 80, 'with': 81, 'an': 82, 'think': 83, 'much': 84, 'book': 85, 'about': 86, 'help': 87, 'work': 88, 'one': 89, 'should': 90, 'eat': 91, '

In [None]:
# word -> id dictionaries learned by each tokeniser
eng_vocab_dict = eng_tokeniser.word_index
chn_vocab_dict = chn_tokeniser.word_index

# word_index ids start at 1, so add one slot to cover id 0 as well.
eng_vocab_size = 1 + len(eng_vocab_dict)
chn_vocab_size = 1 + len(chn_vocab_dict)

print(eng_vocab_size, chn_vocab_size, sep = "\n")

3080
2455


In [None]:
# Length (in tokens) of the longest tokenised sentence in each language.
eng_max_seq_length = max(len(token_ids) for token_ids in eng_word_tokenised)
chn_max_seq_length = max(len(token_ids) for token_ids in chn_word_tokenised)

print(eng_max_seq_length, chn_max_seq_length, sep = "\n")

11
22


In [None]:
# Pad the word-tokenised English and Chinese sentences at the end to one common
# length per language, then append a trailing axis of size 1
# (presumably to match a model input shape; the model is not defined here, confirm).

eng_sentences_padded = pad_sequences(
    eng_word_tokenised, maxlen = eng_max_seq_length, padding = "post"
)[..., None]
chn_sentences_padded = pad_sequences(
    chn_word_tokenised, maxlen = chn_max_seq_length, padding = "post"
)[..., None]

In [None]:
import numpy as np

def encode_input_sequences(tokeniser, max_seq_length, sentences):
    """
    Turn sentences into the integer feature matrix X.

    Each sentence is mapped to its id sequence with `tokeniser`, then the
    sequences are padded at the end to `max_seq_length`.
    """
    token_ids = tokeniser.texts_to_sequences(sentences)
    return pad_sequences(token_ids, maxlen = max_seq_length, padding = "post")


def encode_output_labels(sequences, vocab_size):
    """
    One-hot encode target sequences to create labels y.

    Args:
        sequences: 2-D integer array of shape (n_samples, seq_length) holding
            token ids, e.g. the padded output of encode_input_sequences.
        vocab_size: number of classes; every id must be smaller than this.

    Returns:
        float32 array of shape (n_samples, seq_length, vocab_size) in which
        y[i, t, sequences[i, t]] == 1 and every other entry is 0.

    Raises:
        IndexError: if an id is >= vocab_size.
    """
    sequences = np.asarray(sequences, dtype = np.int64)
    # n_samples: number of sentences; seq_length: padded sentence length.
    n_samples, seq_length = sequences.shape
    # Allocate the result once and fill it in place. The previous approach
    # built one array per sentence in a Python list and then copied them all
    # into a second full-size array.
    y = np.zeros((n_samples, seq_length, vocab_size), dtype = np.float32)
    rows = np.arange(n_samples)[:, None]
    cols = np.arange(seq_length)[None, :]
    # rows/cols broadcast against `sequences`, so this sets exactly one entry
    # per (sentence, position) pair.
    y[rows, cols, sequences] = 1.0
    return y

# create encoder inputs, decoder inputs and decoder outputs
enc_inputs = encode_input_sequences(eng_tokeniser, eng_max_seq_length, eng_sentences) # shape: (n_samples, eng_max_seq_length)
print("Step 1 done")
dec_inputs = encode_input_sequences(chn_tokeniser, chn_max_seq_length, chn_sentences) # shape: (n_samples, chn_max_seq_length)
print("Step 2 done")
# The decoder targets use the same padded id matrix as dec_inputs, so it is
# reused here instead of tokenising and padding the Chinese sentences again.
# NOTE(review): the targets are therefore NOT shifted by one step relative to
# dec_inputs; confirm the shift is applied wherever the model is trained.
print("Step 3 done")
dec_outputs = encode_output_labels(dec_inputs, chn_vocab_size) # shape: (n_samples, chn_max_seq_length, chn_vocab_size)
print("Step 4 done")

Step 1 done
Step 2 done
Step 3 done
Step 4 done


In [None]:
# Check the encoder input shape: (number of sentences, eng_max_seq_length).
enc_inputs.shape

(10000, 11)

In [None]:
# save required data to a compressed file
# np.savez_compressed("/content/gdrive/MyDrive/Colab Notebooks/eng-chn_data.npz", enc_inputs = enc_inputs, dec_inputs = dec_inputs, dec_outputs = dec_outputs, eng_vocab_size = eng_vocab_size)