In [1]:
import os
import numpy as np
import pandas as pd
import pickle
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import jieba

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds

os.environ['CUDA_VISIBLE_DEVICES']='1'
tf.config.set_soft_device_placement(True)

print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
print("GPU is", "available" if tf.config.list_physical_devices("GPU") else "NOT AVAILABLE")

Version:  2.5.0
Eager mode:  True
Hub version:  0.12.0
GPU is available


# Constants

In [2]:
folder_name = '20210925_wmt19_en_zh'

# Load data
Source: [data.statmt.org](http://data.statmt.org/news-commentary/v14/)

In [3]:
corpus = pd.read_csv(
    'news-commentary-v14.en-zh.tsv', 
    sep='\t', 
    error_bad_lines=False,
    skipfooter= 1,
    header = None,
)

  
Skipping line 5803: '	' expected after '"'. Error could possibly be due to parsing errors in the skipped footer rows (the skipfooter keyword is only applied after Python's csv library has parsed all rows).
Skipping line 5804: '	' expected after '"'. Error could possibly be due to parsing errors in the skipped footer rows (the skipfooter keyword is only applied after Python's csv library has parsed all rows).
Skipping line 12524: '	' expected after '"'. Error could possibly be due to parsing errors in the skipped footer rows (the skipfooter keyword is only applied after Python's csv library has parsed all rows).
Skipping line 12525: '	' expected after '"'. Error could possibly be due to parsing errors in the skipped footer rows (the skipfooter keyword is only applied after Python's csv library has parsed all rows).
Skipping line 12526: '	' expected after '"'. Error could possibly be due to parsing errors in the skipped footer rows (the skipfooter keyword is only applied after Python'

Skipping line 207570: '	' expected after '"'. Error could possibly be due to parsing errors in the skipped footer rows (the skipfooter keyword is only applied after Python's csv library has parsed all rows).
Skipping line 211623: '	' expected after '"'. Error could possibly be due to parsing errors in the skipped footer rows (the skipfooter keyword is only applied after Python's csv library has parsed all rows).
Skipping line 211645: '	' expected after '"'. Error could possibly be due to parsing errors in the skipped footer rows (the skipfooter keyword is only applied after Python's csv library has parsed all rows).
Skipping line 214161: '	' expected after '"'. Error could possibly be due to parsing errors in the skipped footer rows (the skipfooter keyword is only applied after Python's csv library has parsed all rows).
Skipping line 214165: '	' expected after '"'. Error could possibly be due to parsing errors in the skipped footer rows (the skipfooter keyword is only applied after Pyt

In [4]:
en = corpus[0].to_numpy(dtype = str)
zh = corpus[1].to_numpy(dtype = str)

In [5]:
print(en[:5])

['1929 or 1989?'
 'PARIS – As the economic crisis deepens and widens, the world has been searching for historical analogies to help us understand what has been happening.'
 'At the start of the crisis, many people likened it to 1982 or 1973, which was reassuring, because both dates refer to classical cyclical downturns.'
 'Today, the mood is much grimmer, with references to 1929 and 1931 beginning to abound, even if some governments continue to behave as if the crisis was more classical than exceptional.'
 'The tendency is either excessive restraint (Europe) or a diffusion of the effort (the United States).']


In [6]:
print(zh[:5])

['1929年还是1989年?' '巴黎-随着经济危机不断加深和蔓延，整个世界一直在寻找历史上的类似事件希望有助于我们了解目前正在发生的情况。'
 '一开始，很多人把这次危机比作1982年或1973年所发生的情况，这样得类比是令人宽心的，因为这两段时期意味着典型的周期性衰退。'
 '如今人们的心情却是沉重多了，许多人开始把这次危机与1929年和1931年相比，即使一些国家政府的表现仍然似乎把视目前的情况为是典型的而看见的衰退。'
 '目前的趋势是，要么是过度的克制（欧洲 ） ， 要么是努力的扩展（美国 ） 。']


# Tokenize

## English

### Convert the text into sequences

In [7]:
en_tkr = tf.keras.preprocessing.text.Tokenizer(
    filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n',
)
en_tkr.fit_on_texts(['<bos>', '<eos>'])
en_tkr.fit_on_texts(en)
en_seq = en_tkr.texts_to_sequences(en)

In [8]:
en_word2idx = eval(en_tkr.get_config()['word_index'])
bosIdx = en_word2idx['<bos>']
eosIdx = en_word2idx['<eos>']
print(bosIdx)
print(eosIdx)

46227
46228


### Count the average sequence length.

In [9]:
en_seq_len = [len(s) for s in en_seq]
print(np.mean(en_seq_len))
print(np.std(en_seq_len))

21.864398210418873
11.40384685075576


In [10]:
en_seq_length = 32

## Chinese

In [11]:
zh = [' '.join(list(jieba.cut(zh_seq, cut_all=False))) for zh_seq in zh]

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.614 seconds.
Prefix dict has been built successfully.


In [12]:
print(zh[:5])

['1929 年 还是 1989 年 ?', '巴黎 - 随着 经济危机 不断 加深 和 蔓延 ， 整个 世界 一直 在 寻找 历史 上 的 类似 事件 希望 有助于 我们 了解 目前 正在 发生 的 情况 。', '一 开始 ， 很多 人 把 这次 危机 比作 1982 年 或 1973 年 所 发生 的 情况 ， 这样 得 类比 是 令人 宽心 的 ， 因为 这 两段 时期 意味着 典型 的 周期性 衰退 。', '如今 人们 的 心情 却是 沉重 多 了 ， 许多 人 开始 把 这次 危机 与 1929 年 和 1931 年 相比 ， 即使 一些 国家 政府 的 表现 仍然 似乎 把视 目前 的 情况 为 是 典型 的 而 看见 的 衰退 。', '目前 的 趋势 是 ， 要么 是 过度 的 克制 （ 欧洲   ）   ，   要么 是 努力 的 扩展 （ 美国   ）   。']


In [13]:
zh_tkr = tf.keras.preprocessing.text.Tokenizer()
zh_tkr.fit_on_texts(zh)
zh_seq = zh_tkr.texts_to_sequences(zh)

In [14]:
zh_seq_len = [len(s) for s in zh_seq]
print(np.mean(zh_seq_len))
print(np.std(zh_seq_len))

23.058609735855537
12.381971197590456


In [15]:
zh_seq_length = 35

# Remove long sentences

In [16]:
keep = [[zh, en] for zh, en in zip(zh_seq, en_seq) if len(zh)<=zh_seq_length and len(en)<=en_seq_length-2]
print(len(keep))

241964


# Add BOS and EOS into decoder

In [17]:
zh_reduce_seq = [pair[0] for pair in keep]
en_reduce_seq = [pair[1] for pair in keep]
for i, seq in enumerate(en_reduce_seq):
    en_reduce_seq[i] = [bosIdx]
    en_reduce_seq[i].extend(seq)
    en_reduce_seq[i].append(eosIdx)

# Build the dictionary and re-tokenize sequences

In [19]:
# English
en_reduce = en_tkr.sequences_to_texts(en_reduce_seq)
en_tkr = tf.keras.preprocessing.text.Tokenizer(
    filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n',
)
en_tkr.fit_on_texts(en_reduce)
en_reduce_seq = en_tkr.texts_to_sequences(en_reduce)
for i, seq in enumerate(en_reduce_seq):
    en_reduce_seq[i].append(0)
en_idx2word = eval(en_tkr.get_config()['index_word'])
en_idx2word['0'] = '<pad>'
en_word2idx = eval(en_tkr.get_config()['word_index'])
en_word2idx['<pad>'] = '0'

# Czech
zh_reduce = zh_tkr.sequences_to_texts(zh_reduce_seq)
zh_tkr = tf.keras.preprocessing.text.Tokenizer(
    filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n',
)
zh_tkr.fit_on_texts(zh_reduce)
zh_reduce_seq = zh_tkr.texts_to_sequences(zh_reduce)
zh_idx2word = eval(zh_tkr.get_config()['index_word'])
zh_idx2word['0'] = '<pad>'
zh_word2idx = eval(zh_tkr.get_config()['word_index'])
zh_word2idx['<pad>'] = '0'

1929 年 还是 1989 年


In [20]:
print([ en_idx2word[str(i)] for i in en_reduce_seq[0]])
print([ zh_idx2word[str(i)] for i in zh_reduce_seq[0]])

['<bos>', '1929', 'or', '1989', '<eos>', '<pad>']
['1929', '年', '还是', '1989', '年']


In [21]:
en_num_words = len(en_word2idx)
zh_num_words = len(zh_word2idx)
print(f'num. of en. word: {en_num_words}')
print(f'num. of cs. word: {zh_num_words}')

num. of en. word: 57298
num. of cs. word: 75420


# Padding

In [22]:
en_pad_seq = tf.keras.preprocessing.sequence.pad_sequences(
    en_reduce_seq,
    dtype='int32',
    padding='post',
)

In [23]:
print(en_pad_seq.shape)

(241964, 33)


In [24]:
zh_pad_seq = tf.keras.preprocessing.sequence.pad_sequences(
    zh_reduce_seq,
    dtype='int32',
    padding='post',
)

In [25]:
print(zh_pad_seq.shape)

(241964, 35)


# Distribute

In [26]:
num_pair = len(en_pad_seq)
encoder_train = zh_pad_seq[int(num_pair*0.1):]
decoder_train = en_pad_seq[int(num_pair*0.1):, :-1]
teacher_train = en_pad_seq[int(num_pair*0.1):, 1:]
encoder_vali  = zh_pad_seq[:int(num_pair*0.1)]
decoder_vali  = en_pad_seq[:int(num_pair*0.1), :-1]
teacher_vali  = en_pad_seq[:int(num_pair*0.1), 1:]

In [27]:
print(encoder_train.shape)
print(decoder_train.shape)
print(teacher_train.shape)

(217768, 35)
(217768, 32)
(217768, 32)


# Word2Vec

## 32 dim. English embedding 

In [28]:
tmp = [list(e) for e in en_pad_seq]
model = Word2Vec(
    sentences=tmp, 
    vector_size=32, 
    window=5, 
    min_count=1, 
    workers=16,
    sg = 1,
    negative = 10,
)
model.build_vocab(tmp)
model.train(tmp, total_examples=model.corpus_count, epochs=10)

(37970183, 79848120)

In [29]:
model.wv[5257]

array([ 0.0179631 , -0.8287212 , -0.51448613,  0.17656817,  0.35869485,
       -0.3403227 ,  0.19687517,  0.3582963 ,  0.05079257,  1.1925439 ,
       -0.24706498, -0.7956718 ,  0.29024014, -0.0999219 , -0.85695   ,
       -0.29522488, -0.8852313 ,  0.29105672, -1.5672745 , -0.08303724,
       -0.21882707, -1.0520015 ,  0.73115075,  0.45493084,  0.48359972,
       -0.4749396 , -0.9445318 ,  0.82705444, -0.7292918 , -0.18514132,
       -0.043488  ,  0.1810614 ], dtype=float32)

In [30]:
en_emb32 = np.array([ model.wv[i] for i in range(en_num_words)])

## 32 dim. Czech embedding

In [31]:
tmp = [list(e) for e in zh_pad_seq]
model = Word2Vec(
    sentences=tmp, 
    vector_size=32, 
    window=5, 
    min_count=1, 
    workers=16,
    sg = 1,
    negative = 10,
)
model.build_vocab(tmp)
model.train(tmp, total_examples=model.corpus_count, epochs=10)

(39123827, 84687400)

In [32]:
zh_emb32 = np.array([ model.wv[i] for i in range(zh_num_words)])

# Save preprocessed data

In [33]:
pickle.dump(encoder_train, open(f'{folder_name}/encoder_train.pkl','wb'))
pickle.dump(decoder_train, open(f'{folder_name}/decoder_train.pkl','wb'))
pickle.dump(teacher_train, open(f'{folder_name}/teacher_train.pkl','wb'))
pickle.dump(encoder_vali,  open(f'{folder_name}/encoder_vali.pkl','wb'))
pickle.dump(decoder_vali,  open(f'{folder_name}/decoder_vali.pkl','wb'))
pickle.dump(teacher_vali,  open(f'{folder_name}/teacher_vali.pkl','wb'))

pickle.dump(en_idx2word, open(f'{folder_name}/en_idx2word.pkl','wb'))
pickle.dump(en_word2idx, open(f'{folder_name}/en_word2idx.pkl','wb'))
pickle.dump(zh_idx2word, open(f'{folder_name}/zh_idx2word.pkl','wb'))
pickle.dump(zh_word2idx, open(f'{folder_name}/zh_word2idx.pkl','wb'))

pickle.dump(en_emb32, open(f'{folder_name}/en_emb32.pkl','wb'))
pickle.dump(zh_emb32, open(f'{folder_name}/zh_emb32.pkl','wb'))

# Load data

In [34]:
encoder_train = pickle.load(open(f'{folder_name}/encoder_train.pkl', 'rb'))
decoder_train = pickle.load(open(f'{folder_name}/decoder_train.pkl', 'rb'))
teacher_train = pickle.load(open(f'{folder_name}/teacher_train.pkl', 'rb'))
encoder_vali  = pickle.load(open(f'{folder_name}/encoder_vali.pkl', 'rb'))
decoder_vali  = pickle.load(open(f'{folder_name}/decoder_vali.pkl', 'rb'))
teacher_vali  = pickle.load(open(f'{folder_name}/teacher_vali.pkl', 'rb'))

en_idx2word   = pickle.load(open(f'{folder_name}/en_idx2word.pkl','rb'))
en_word2idx   = pickle.load(open(f'{folder_name}/en_word2idx.pkl','rb'))
zh_idx2word   = pickle.load(open(f'{folder_name}/zh_idx2word.pkl','rb'))
zh_word2idx   = pickle.load(open(f'{folder_name}/zh_word2idx.pkl','rb'))

en_emb32    = pickle.load(open(f'{folder_name}/en_emb32.pkl', 'rb'))
zh_emb32    = pickle.load(open(f'{folder_name}/zh_emb32.pkl', 'rb'))

In [35]:
def seq2word(seq_tensor, idx2word):
    return np.array([[idx2word[str(i)] for i in seq] for seq in seq_tensor])

In [36]:
seq2word(encoder_train[5:10], zh_idx2word)

array([['权衡', '和', '折衷', '永远', '是', '资本主义', '的', '真谛', '：', '我们', '必须',
        '容忍', '为', '我们', '提供', '有效', '工具', '的', '赚钱', '企业', '的', '不',
        '道德行为', '。', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>',
        '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'],
       ['伊朗人', '可以', '用', '这些', '工具', '来', '反抗', '专政', '，', '西藏', '持',
        '不同', '政见', '者', '也', '可以', '用', '这些', '工具', '来', '拯救', '他们',
        '的', '文化', '。', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>',
        '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'],
       ['这些', '工具', '也', '可以', '用来', '统计', '灭绝', '犹太人', '的', '数量', '，',
        '逮捕', '中国', '的', '持', '不同', '政见', '者', '，', '或者', '破坏', '俄罗斯',
        '的', '人权', '组织', '。', '<pad>', '<pad>', '<pad>', '<pad>',
        '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'],
       ['俄罗斯', '的', '微软', '或者', '中国', '的', '谷歌', '告诉', '我们', '资本主义', '并',
        '不道德', '：', '而是', '更加', '注重', '效率', '。', '<pad>', '<pad>',
        '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>',