In [1]:
import os
import jieba
import nltk

import tensorflow as tf

In [2]:
def handle_datum2017(source_path, result_path):
    """Tokenize the datum2017 book pairs into two aggregate corpus files.

    For every i in 1..20 this reads ``Book{i}_en.txt`` and ``Book{i}_cn.txt``
    from ``source_path`` and appends one tokenized, space-joined line per input
    line to ``Book_en.txt`` / ``Book_cn.txt`` inside ``result_path``.
    English lines are tokenized with ``nltk.word_tokenize``; Chinese lines are
    segmented with ``jieba.cut(..., cut_all=False)``.

    Args:
        source_path: directory that holds the per-book input files.
        result_path: directory that receives the two aggregate files; it is
            created when it does not exist yet.

    Note:
        The aggregate files are opened in append mode, so calling this function
        again with the same ``result_path`` appends the corpus a second time.
    """
    # An empty result_path means "current directory"; makedirs("") would fail.
    if result_path:
        os.makedirs(result_path, exist_ok=True)

    en_out_path = os.path.join(result_path, "Book_en.txt")
    cn_out_path = os.path.join(result_path, "Book_cn.txt")

    # Explicit UTF-8 everywhere: the results are later read back as UTF-8, and the
    # platform default encoding cannot be relied on for Chinese text.
    with open(en_out_path, mode="a", encoding="utf-8") as f_en, \
            open(cn_out_path, mode="a", encoding="utf-8") as f_cn:
        for i in range(1, 21):
            en_in_path = os.path.join(source_path, "Book{}_en.txt".format(i))
            with open(en_in_path, mode="r", encoding="utf-8") as f:
                # Iterate lazily instead of materializing the book with readlines().
                for line in f:
                    text = nltk.word_tokenize(line)
                    f_en.write(" ".join(text) + "\n")

            cn_in_path = os.path.join(source_path, "Book{}_cn.txt".format(i))
            with open(cn_in_path, mode="r", encoding="utf-8") as f:
                for line in f:
                    # Drop the newline first so it does not become a token of its own.
                    text = jieba.cut(line.replace("\n", ""), cut_all=False)
                    f_cn.write(" ".join(text) + "\n")
                    
# Tokenize the raw datum2017 books and write the aggregate corpus files into ./data/.
# NOTE(review): the source directory is an absolute, machine-specific path.
handle_datum2017("/Users/hdx/data/datum2017", "./data/")

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/b5/lmshx9x92hd8brq1_0jwgsrm0000gn/T/jieba.cache
Loading model cost 0.719 seconds.
Prefix dict has been built succesfully.


# Explore the data

In [8]:
def load_data(path):
    """Read a UTF-8 encoded text file and return its whole content.

    Args:
        path: location of the file to read.

    Returns:
        The complete text of the file as a single string.
    """
    with open(os.path.join(path), mode='r', encoding='utf-8') as text_file:
        return text_file.read()

# Locations of the tokenized English (source) and Chinese (target) corpus files.
source_path, target_path = 'data/Book_en.txt', 'data/Book_cn.txt'

# Read both corpora completely into memory, source first, then target.
source_text, target_text = load_data(source_path), load_data(target_path)

In [6]:
import numpy as np
from collections import Counter

# Vocabulary size: number of distinct whitespace-separated tokens in the English text.
print('Dataset Brief Stats')
print('* number of unique words in English sample sentences: {}'.format(len(Counter(source_text.split()))))
print()

# One list entry per line of the English corpus.
# NOTE(review): split('\n') yields a final empty string when the text ends with a
# newline, so the count and the average below include that empty entry.
english_sentences = source_text.split('\n')
print('* English sentences')
print('\t- number of sentences: {}'.format(len(english_sentences)))
print('\t- avg. number of words in a sentence: {}'.format(np.average([len(sentence.split()) for sentence in english_sentences])))
print()

# Same statistics for the Chinese corpus; the two sentence counts are only printed
# for visual comparison, nothing here enforces that they match.
chinese_sentences = target_text.split('\n')
print('* Chinese sentences')
print('\t- number of sentences: {} [data integrity check / should have the same number]'.format(len(chinese_sentences)))
print('\t- avg. number of words in a sentence: {}'.format(np.average([len(sentence.split()) for sentence in chinese_sentences])))
print()

# Half-open index range [start, stop) of the aligned sentence pairs to display.
sample_sentence_range = (0, 2)
# zip() pairs sentences by position and stops at the shorter of the two lists.
side_by_side_sentences = list(zip(english_sentences, chinese_sentences))[sample_sentence_range[0]:sample_sentence_range[1]]
print('* Sample sentences range from {} to {}'.format(sample_sentence_range[0], sample_sentence_range[1]))

# Print each sampled pair; the displayed label is 1-based (index+1).
for index, sentence in enumerate(side_by_side_sentences):
    en_sent, cn_sent = sentence
    print('[{}-th] sentence'.format(index+1))
    print('\tEN: {}'.format(en_sent))
    print('\tCN: {}'.format(cn_sent))
    print()


Dataset Brief Stats
* number of unique words in English sample sentences: 190061

* English sentences
	- number of sentences: 1000005
	- avg. number of words in a sentence: 27.719305403472983

* Chinese sentences
	- number of sentences: 1000005 [data integrity check / should have the same number]
	- avg. number of words in a sentence: 24.891550542247288

* Sample sentences range from 0 to 2
[1-th] sentence
	EN: 2 . The Committee elected Syed Amjad Ali as Chairman and Mr. Peter Gregg as Vice-Chairman by acclamation .
	CN: 2   .   委员会   以   鼓掌   方式   推选   赛义德 · 阿姆 贾德 · 阿里   为主   席   ,   彼得 · 格雷格   先生   为   副 主席   。    

[2-th] sentence
	EN: Mr. Henrik Amneus also provided valuable assistance as acting Vice-Chairman responsible for leading key sessions of the drafting committee .
	CN:   同时   ,   亨 里克 · 安纽斯   先生   作为   主持   起草   委员会   各个   主要   届   会   的   代理   副 主席   也   提供   了   宝贵   的   援助   。    



# Preprocessing

In [13]:
import copy
import pickle

# Reserved special tokens with fixed ids; ids of ordinary words start at len(CODES).
CODES = {'<PAD>': 0, '<EOS>': 1, '<UNK>': 2, '<GO>': 3 }

def create_lookup_tables(text):
    """Build word<->id lookup tables for a whitespace-tokenized text.

    The reserved tokens in ``CODES`` keep their fixed ids. Every other distinct
    token of ``text`` receives an id starting at ``len(CODES)``, assigned in
    sorted token order so that the mapping is identical on every run (plain set
    iteration order of strings changes between interpreter runs).

    Args:
        text: the corpus as one string; tokens are separated by whitespace.

    Returns:
        A tuple ``(vocab_to_int, int_to_vocab)`` of dictionaries that are
        inverse mappings of each other.
    """
    # Tokens that collide with a reserved token are left out here so they cannot
    # overwrite the reserved id (which would also drop that id from int_to_vocab).
    vocab = sorted({word for word in text.split() if word not in CODES})

    # (1) start from a copy of the special tokens, never from CODES itself
    vocab_to_int = dict(CODES)

    # the 2nd arg of enumerate() makes the ids continue right after the special tokens
    for v_i, v in enumerate(vocab, len(CODES)):
        vocab_to_int[v] = v_i

    # (2) inverse table
    int_to_vocab = {v_i: v for v, v_i in vocab_to_int.items()}

    return vocab_to_int, int_to_vocab

def text_to_ids(source_text, target_text, source_vocab_to_int, target_vocab_to_int):
    """Convert two line-aligned texts into lists of vocabulary ids.

    Both texts are split into sentences on ``"\\n"`` and each sentence into
    tokens on whitespace. Sentences are paired by position; the result has one
    entry per source sentence. Every target sentence gets the ``'<EOS>'`` id
    appended, which marks where a generated sequence should stop.

    A token missing from its lookup table is mapped to that table's ``'<UNK>'``
    id; if the table has no ``'<UNK>'`` entry a ``KeyError`` is raised.

    Args:
        source_text: raw source text, one sentence per line.
        target_text: raw target text, one sentence per line.
        source_vocab_to_int: token -> id table for ``source_text``.
        target_vocab_to_int: token -> id table for ``target_text``; must
            contain ``'<EOS>'``.

    Returns:
        A tuple ``(source_id_text, target_id_text)`` of lists of id lists.

    Raises:
        IndexError: if the target has fewer sentences than the source.
        KeyError: if ``'<EOS>'`` is missing from the target table, or a token is
            unknown and its table has no ``'<UNK>'`` entry.
    """
    source_sentences = source_text.split("\n")
    target_sentences = target_text.split("\n")

    # Fail before doing any work instead of part-way through the conversion.
    if len(target_sentences) < len(source_sentences):
        raise IndexError(
            "target text has {} sentences but source text has {}".format(
                len(target_sentences), len(source_sentences)))

    eos_id = target_vocab_to_int['<EOS>']

    source_text_id = []
    target_text_id = []
    # zip() stops after the last source sentence; surplus target lines are ignored.
    for source_sentence, target_sentence in zip(source_sentences, target_sentences):
        source_text_id.append(_sentence_to_ids(source_sentence, source_vocab_to_int))
        target_text_id.append(_sentence_to_ids(target_sentence, target_vocab_to_int) + [eos_id])

    return source_text_id, target_text_id


def _sentence_to_ids(sentence, vocab_to_int):
    """Map the whitespace-separated tokens of one sentence to their ids."""
    unk_id = vocab_to_int.get('<UNK>')
    token_ids = []
    # split() without arguments never yields empty tokens, so no filtering is needed.
    for token in sentence.split():
        if token in vocab_to_int or unk_id is None:
            # Without an '<UNK>' entry an unknown token still raises KeyError here.
            token_ids.append(vocab_to_int[token])
        else:
            token_ids.append(unk_id)
    return token_ids

def preprocess_and_save_data(source_path, target_path, text_to_ids, output_path='preprocess.p'):
    """Load, lower-case and index a parallel corpus, then pickle the result.

    Args:
        source_path: path of the source-language (English) text file.
        target_path: path of the target-language (Chinese) text file.
        text_to_ids: callable ``(source_text, target_text, source_vocab_to_int,
            target_vocab_to_int)`` returning the two id-encoded sentence lists.
        output_path: file the pickle is written to; defaults to
            ``'preprocess.p'`` in the current working directory.

    The pickle holds a 3-tuple:
        ``((source_ids, target_ids),
           (source_vocab_to_int, target_vocab_to_int),
           (source_int_to_vocab, target_int_to_vocab))``
    """
    # load the original data and fold it to lower case
    source_text = load_data(source_path).lower()
    target_text = load_data(target_path).lower()

    # create lookup tables for the source and target vocabularies
    source_vocab_to_int, source_int_to_vocab = create_lookup_tables(source_text)
    target_vocab_to_int, target_int_to_vocab = create_lookup_tables(target_text)

    # sentences with every word replaced by its vocabulary id
    source_ids, target_ids = text_to_ids(
        source_text, target_text, source_vocab_to_int, target_vocab_to_int)

    # Save data for later use; the with-block guarantees the file is flushed and closed.
    with open(output_path, 'wb') as out_file:
        pickle.dump((
            (source_ids, target_ids),
            (source_vocab_to_int, target_vocab_to_int),
            (source_int_to_vocab, target_int_to_vocab)), out_file)

# Run the preprocessing on the corpus files; the result is pickled to 'preprocess.p'.
preprocess_and_save_data(source_path, target_path, text_to_ids)