In [1]:
import os
import numpy as np
import pandas as pd
import pickle
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds

os.environ['CUDA_VISIBLE_DEVICES']='1'
tf.config.set_soft_device_placement(True)

print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
print("GPU is", "available" if tf.config.list_physical_devices("GPU") else "NOT AVAILABLE")

Version:  2.5.0
Eager mode:  True
Hub version:  0.12.0
GPU is available


# Constants

In [2]:
folder_name = '20210925_wmt19_cs_en'

# Load data
Source: [data.statmt.org](http://data.statmt.org/news-commentary/v14/)

In [3]:
corpus = pd.read_csv(
    'news-commentary-v14.cs-en.tsv', 
    sep='\t', 
    error_bad_lines=False,
    skipfooter= 1,
    header = None,
)

  
Skipping line 11501: '	' expected after '"'. Error could possibly be due to parsing errors in the skipped footer rows (the skipfooter keyword is only applied after Python's csv library has parsed all rows).
Skipping line 31651: '	' expected after '"'. Error could possibly be due to parsing errors in the skipped footer rows (the skipfooter keyword is only applied after Python's csv library has parsed all rows).
Skipping line 36410: '	' expected after '"'. Error could possibly be due to parsing errors in the skipped footer rows (the skipfooter keyword is only applied after Python's csv library has parsed all rows).
Skipping line 38172: '	' expected after '"'. Error could possibly be due to parsing errors in the skipped footer rows (the skipfooter keyword is only applied after Python's csv library has parsed all rows).
Skipping line 48214: '	' expected after '"'. Error could possibly be due to parsing errors in the skipped footer rows (the skipfooter keyword is only applied after Pytho

Skipping line 149100: '	' expected after '"'. Error could possibly be due to parsing errors in the skipped footer rows (the skipfooter keyword is only applied after Python's csv library has parsed all rows).
Skipping line 149235: '	' expected after '"'. Error could possibly be due to parsing errors in the skipped footer rows (the skipfooter keyword is only applied after Python's csv library has parsed all rows).
Skipping line 149237: '	' expected after '"'. Error could possibly be due to parsing errors in the skipped footer rows (the skipfooter keyword is only applied after Python's csv library has parsed all rows).
Skipping line 149251: '	' expected after '"'. Error could possibly be due to parsing errors in the skipped footer rows (the skipfooter keyword is only applied after Python's csv library has parsed all rows).
Skipping line 152487: '	' expected after '"'. Error could possibly be due to parsing errors in the skipped footer rows (the skipfooter keyword is only applied after Pyt

In [4]:
cs = corpus[0].to_numpy(dtype = str)
en = corpus[1].to_numpy(dtype = str)

# Tokenize

## English

### Convert the text into sequences

In [5]:
en_tkr = tf.keras.preprocessing.text.Tokenizer(
    filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n',
)
en_tkr.fit_on_texts(['<bos>', '<eos>'])
en_tkr.fit_on_texts(en)
en_seq = en_tkr.texts_to_sequences(en)

In [6]:
en_word2idx = eval(en_tkr.get_config()['word_index'])
bosIdx = en_word2idx['<bos>']
eosIdx = en_word2idx['<eos>']
print(bosIdx)
print(eosIdx)

41251
41252


### Count the average sequence length.

In [7]:
en_seq_len = [len(s) for s in en_seq]
print(np.mean(en_seq_len))
print(np.std(en_seq_len))

21.316457186469854
11.501927989194174


In [8]:
en_seq_length = 32

## Czech

In [9]:
cs_tkr = tf.keras.preprocessing.text.Tokenizer()
cs_tkr.fit_on_texts(cs)
cs_seq = cs_tkr.texts_to_sequences(cs)

In [10]:
cs_seq_len = [len(s) for s in cs_seq]
print(np.mean(cs_seq_len))
print(np.std(cs_seq_len))

18.807237621993796
10.329988244009906


In [11]:
cs_seq_length = 28

# remove long sentences

In [12]:
keep = [[cs, en] for cs, en in zip(cs_seq, en_seq) if len(cs)<=cs_seq_length and len(en)<=en_seq_length-2]
print(len(keep))

193131


# Add BOS and EOS into decoder

In [13]:
cs_reduce_seq = [pair[0] for pair in keep]
en_reduce_seq = [pair[1] for pair in keep]
for i, seq in enumerate(en_reduce_seq):
    en_reduce_seq[i] = [bosIdx]
    en_reduce_seq[i].extend(seq)
    en_reduce_seq[i].append(eosIdx)

# Build the dictionary and re-tokenize sequences

In [14]:
# English
en_reduce = en_tkr.sequences_to_texts(en_reduce_seq)
en_tkr = tf.keras.preprocessing.text.Tokenizer(
    filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n',
)
en_tkr.fit_on_texts(en_reduce)
en_reduce_seq = en_tkr.texts_to_sequences(en_reduce)
for i, seq in enumerate(en_reduce_seq):
    en_reduce_seq[i].append(0)
en_idx2word = eval(en_tkr.get_config()['index_word'])
en_idx2word['0'] = '<pad>'
en_word2idx = eval(en_tkr.get_config()['word_index'])
en_word2idx['<pad>'] = '0'
# Czech
cs_reduce = cs_tkr.sequences_to_texts(cs_reduce_seq)
cs_tkr = tf.keras.preprocessing.text.Tokenizer(
    filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n',
)
cs_tkr.fit_on_texts(cs_reduce)
cs_reduce_seq = cs_tkr.texts_to_sequences(cs_reduce)
cs_idx2word = eval(cs_tkr.get_config()['index_word'])
cs_idx2word['0'] = '<pad>'
cs_word2idx = eval(cs_tkr.get_config()['word_index'])
cs_word2idx['<pad>'] = '0'

In [15]:
print([ en_idx2word[str(i)] for i in en_reduce_seq[121]])

['<bos>', 'meanwhile', 'in', 'about', '2005', 'i', 'was', 'in', 'south', 'africa', 'with', 'a', 'small', 'group', 'advising', 'former', 'president', 'thabo', 'mbeki', 'and', 'his', 'government', 'about', 'its', 'it', 'policy', '<eos>', '<pad>']


In [16]:
en_num_words = len(en_word2idx)
cs_num_words = len(cs_word2idx)
print(f'num. of en. word: {en_num_words}')
print(f'num. of cs. word: {cs_num_words}')

num. of en. word: 52575
num. of cs. word: 155078


# Padding

In [17]:
en_pad_seq = tf.keras.preprocessing.sequence.pad_sequences(
    en_reduce_seq,
    dtype='int32',
    padding='post',
)

In [18]:
print(en_pad_seq.shape)

(193131, 33)


In [19]:
cs_pad_seq = tf.keras.preprocessing.sequence.pad_sequences(
    cs_reduce_seq,
    dtype='int32',
    padding='post',
)

In [20]:
print(cs_pad_seq.shape)

(193131, 28)


# Distribute

In [21]:
num_pair = len(en_pad_seq)
encoder_train = cs_pad_seq[int(num_pair*0.1):]
decoder_train = en_pad_seq[int(num_pair*0.1):, :-1]
teacher_train = en_pad_seq[int(num_pair*0.1):, 1:]
encoder_vali  = cs_pad_seq[:int(num_pair*0.1)]
decoder_vali  = en_pad_seq[:int(num_pair*0.1), :-1]
teacher_vali  = en_pad_seq[:int(num_pair*0.1), 1:]

In [22]:
print(encoder_train.shape)
print(decoder_train.shape)
print(teacher_train.shape)

(173818, 28)
(173818, 32)
(173818, 32)


# Word2Vec

## 32 dim. English embedding 

In [23]:
tmp = [list(e) for e in en_pad_seq]
model = Word2Vec(
    sentences=tmp, 
    vector_size=32, 
    window=5, 
    min_count=1, 
    workers=16,
    sg = 1,
    negative = 10,
)
model.build_vocab(tmp)
model.train(tmp, total_examples=model.corpus_count, epochs=10)

(29727808, 63733230)

In [24]:
model.wv[5257]

array([-0.5323224 , -0.33435014, -0.00609   ,  0.7025845 ,  0.2990808 ,
       -0.7005396 ,  0.56037045,  0.04551583,  0.2818839 ,  0.31856662,
        0.48854238,  0.2362166 , -0.41161078, -0.15584254,  0.39335993,
       -0.16227524, -0.34686658, -0.18837813, -0.742977  ,  0.18588375,
       -0.28991964,  0.7114349 ,  0.38617498,  0.41311225, -0.10103195,
       -0.8036458 , -1.4665223 ,  0.34539932, -0.52019036, -0.26782683,
        0.18907385,  0.02807016], dtype=float32)

In [25]:
en_emb32 = np.array([ model.wv[i] for i in range(en_num_words)])

## 32 dim. Czech embedding

In [26]:
tmp = [list(e) for e in cs_pad_seq]
model = Word2Vec(
    sentences=tmp, 
    vector_size=32, 
    window=5, 
    min_count=1, 
    workers=16,
    sg = 1,
    negative = 10,
)
model.build_vocab(tmp)
model.train(tmp, total_examples=model.corpus_count, epochs=10)

(28097970, 54076680)

In [27]:
cs_emb32 = np.array([ model.wv[i] for i in range(cs_num_words)])

# Save preprocessed data

In [28]:
pickle.dump(encoder_train, open(f'{folder_name}/encoder_train.pkl','wb'))
pickle.dump(decoder_train, open(f'{folder_name}/decoder_train.pkl','wb'))
pickle.dump(teacher_train, open(f'{folder_name}/teacher_train.pkl','wb'))
pickle.dump(encoder_vali,  open(f'{folder_name}/encoder_vali.pkl','wb'))
pickle.dump(decoder_vali,  open(f'{folder_name}/decoder_vali.pkl','wb'))
pickle.dump(teacher_vali,  open(f'{folder_name}/teacher_vali.pkl','wb'))

pickle.dump(en_idx2word, open(f'{folder_name}/en_idx2word.pkl','wb'))
pickle.dump(en_word2idx, open(f'{folder_name}/en_word2idx.pkl','wb'))
pickle.dump(cs_idx2word, open(f'{folder_name}/cs_idx2word.pkl','wb'))
pickle.dump(cs_word2idx, open(f'{folder_name}/cs_word2idx.pkl','wb'))

pickle.dump(en_emb32, open(f'{folder_name}/en_emb32.pkl','wb'))
pickle.dump(cs_emb32, open(f'{folder_name}/cs_emb32.pkl','wb'))

# Load data

In [29]:
encoder_train = pickle.load(open(f'{folder_name}/encoder_train.pkl', 'rb'))
decoder_train = pickle.load(open(f'{folder_name}/decoder_train.pkl', 'rb'))
teacher_train = pickle.load(open(f'{folder_name}/teacher_train.pkl', 'rb'))
encoder_vali  = pickle.load(open(f'{folder_name}/encoder_vali.pkl', 'rb'))
decoder_vali  = pickle.load(open(f'{folder_name}/decoder_vali.pkl', 'rb'))
teacher_vali  = pickle.load(open(f'{folder_name}/teacher_vali.pkl', 'rb'))

en_idx2word   = pickle.load(open(f'{folder_name}/en_idx2word.pkl','rb'))
en_word2idx   = pickle.load(open(f'{folder_name}/en_word2idx.pkl','rb'))
cs_idx2word   = pickle.load(open(f'{folder_name}/cs_idx2word.pkl','rb'))
cs_word2idx   = pickle.load(open(f'{folder_name}/cs_word2idx.pkl','rb'))

en_emb32    = pickle.load(open(f'{folder_name}/en_emb32.pkl', 'rb'))
cs_emb32    = pickle.load(open(f'{folder_name}/cs_emb32.pkl', 'rb'))