# 1. Packages

In [2]:
import numpy as np
import string
from zhon import hanzi
import collections
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [3]:
vocab_size = 10000
source_seq_len = 40
target_seq_len = 30

# 2. Data Process

In [4]:
# load data
with open('DATA/english.txt', 'r') as f:
    source_text = f.read()
with open('DATA/chinese.txt', 'r') as f:
    target_text = f.read()

In [5]:
# data summary
print('Dataset Stats:')
print('-'*5 + 'English Text As Source Text' + '-'*5)
sentences = source_text.split('\n')
word_count = [len(s.split()) for s in sentences]
print('Num of Sentence: {}'.format(len(sentences)))
print('Average num of words in a sentence: {}'.format(np.average(word_count)))
print('Max num of words in a sentence: {}'.format(np.max(word_count)))
print()
print('-'*5 + 'Chinese Text As Target Text' + '-'*5)
sentences = target_text.split('\n')
word_count = [len(s.split()) for s in sentences]
print('Num of Sentence: {}'.format(len(sentences)))
print('Average num of words in a sentence: {}'.format(np.average(word_count)))
print('Max num of words in a sentence: {}'.format(np.max(word_count)))

Dataset Stats:
-----English Text As Source Text-----
Num of Sentence: 100001
Average num of words in a sentence: 32.89916100838992
Max num of words in a sentence: 60

-----Chinese Text As Target Text-----
Num of Sentence: 100001
Average num of words in a sentence: 24.73931260687393
Max num of words in a sentence: 60


In [27]:
def data_clean(text):
    ## data clean punctuations
    english_punct = string.punctuation
    chinese_punct = hanzi.punctuation
    punct = english_punct + chinese_punct
    text = ''.join(char for char in text if char not in punct)
    return text

In [28]:
def build_dict(text, vocab_size, is_target=False):
    ## builiding vocabulary
    count = [['<UNK>', -1], ['<PAD>', -2]] if not is_target else [['<UNK>', -1], ['<PAD>', -2], ['<GO>', -3], ['<EOS>', -4]]
    counter = collections.Counter(text.split()).most_common(vocab_size-1)
    count.extend(counter)
    vocab = [word for word, _ in count]
    ## building dictionaries
    vocab_to_int = {w: i for i, w in enumerate(vocab)}
    int_to_vocab = {i: w for i, w in enumerate(vocab)}
    return vocab_to_int, int_to_vocab

In [44]:
def text_to_int(text, map_dict, max_length=35, is_target=False):
    # text to list of sentences
    sentences = text.lower().split('\n')
    sentences = [s.split() for s in sentences]
    
    # text to int
    text_to_idx = []
    sent_to_idx = []
    unk_idx = map_dict.get('<UNK>')
    pad_idx = map_dict.get('<PAD>')
    eos_idx = map_dict.get('<EOS>')
    
    for sentence in sentences:
        for word in sentence:
            sent_to_idx.append(map_dict.get(word, unk_idx))
        if len(sent_to_idx) > max_length:
            sent_to_idx = sent_to_idx[:max_length]
        else:
            sent_to_idx = sent_to_idx + [pad_idx] * (max_length - len(sent_to_idx))
        if is_target:
            sent_to_idx.append(eos_idx)
        text_to_idx.append(sent_to_idx)
        sent_to_idx = []
    return text_to_idx

In [47]:
# data clean
source_text = data_clean(source_text)
target_text = data_clean(target_text)

# vocabularies
source_vocab_to_int, source_int_to_vocab = build_dict(source_text, vocab_size)
target_vocab_to_int, target_int_to_vocab = build_dict(target_text, vocab_size, is_target=True)

# text_to_int
source_text_to_int = text_to_int(source_text, source_vocab_to_int, source_seq_len)
target_text_to_int = text_to_int(target_text, target_vocab_to_int, target_seq_len, is_target=True)

In [49]:
random_idx = 77
print('-'*5 + 'English Example' + '-'*5)
print(source_text.split('\n')[random_idx])
print(source_text_to_int[random_idx])
print()
print('-'*5 + 'Chinese Example' + '-'*5)
print(target_text.split('\n')[random_idx])
print(target_text_to_int[random_idx])

-----English Example-----
the general administration of customs alone has investigated and handled 13 officials at the level of provincial department head or equivalent and 50 officials at the level of provincial section head or equivalent who have violated discipline and law 
[2, 159, 262, 4, 1355, 1910, 15, 2807, 3, 1819, 921, 457, 24, 2, 169, 4, 624, 318, 1038, 69, 4213, 3, 722, 457, 24, 2, 169, 4, 624, 2448, 1038, 69, 4213, 103, 20, 2623, 785, 3, 102, 1]

-----Chinese Example-----
仅 中国 海关 就 查处 违纪 违法 的 厅局级 官员 十三 人  处 级 官员 五十 人 
[1014, 7, 1655, 38, 1558, 2922, 1284, 3, 0, 538, 4630, 44, 785, 1126, 538, 1935, 44, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2]


In [50]:
X = np.array(source_text_to_int)
Y = np.array(target_text_to_int)

# 3. Build Model

In [None]:
# Encoder Layer Hyperparameters
encoder_hps = {'source_vocab_size': len(source_vocab_to_int),
               'encoder_embed_size': 100,
               'rnn_size': ,
               'rnn_num_layers': ,
               'source_seq_len': source_seq_len}

# Decoder Layer Hyperparameters
decoder_hps = {'target_vocab_size': len(target_vocab_to_int),
               'encoder_embed_size': 100,
               'rnn_size': ,
               'rnn_num_layers': ,
               'target_seq_len': target_seq_len}