In [1]:
import jieba
import nltk
import os

"""
Notebook exploring the weird misalignment between Zh and En corpus. 
Inspecting the files on unix console we expect 227,330 lines in both corpus. 
However, looping through the file line-by-line in python we find:
    EN: 227568
    ZH: 227603  (diff: 35)


On console: 
$ wc -l training/news-commentary-v12.zh-en.zh
227330 training/news-commentary-v12.zh-en.zh

$ wc -l training/news-commentary-v12.zh-en.en
227330 training/news-commentary-v12.zh-en.en


WMT17 training Dataset corpus can be downloaded from:
http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12.tgz



"""
zh_filepath="tmp/wmt17_en_zh/training/news-commentary-v12.zh-en.zh"
en_filepath="tmp/wmt17_en_zh/training/news-commentary-v12.zh-en.en"

In [2]:
""" Counting using splitlines. somehow we get a different count for En. """
def count_splitlines(filename):
    return len(open(filename, mode='rb').read().splitlines())

count_splitlines(zh_filepath), count_splitlines(en_filepath)

(227603, 227568)

In [3]:
""" readlines() matches what we typically expect when reading line-by-line from python. """
def count_readlines(filename):
    return len(open(filename, mode='rb').readlines())

count_readlines(zh_filepath), count_readlines(en_filepath)

(227330, 227330)

In [5]:
""" Count lines that are blank. """
def blank_line_count(filename):
    with open(filename, mode='rb') as fd:
        count = sum(1 for line in fd if len(line.strip()) == 0)
    return count

In [6]:
blank_line_count(zh_filepath), blank_line_count(en_filepath)

(28, 141)

In [53]:
""" find occurences of weird line breaks? \n, \r, \t, \v"""
import re
def cr_count(filename, substr="\r"):
    full = open(filename).read()
    return len(re.findall(substr, full))

cr_count(zh_filepath, "\r"), cr_count(en_filepath, "\r") 

(0, 0)

In [48]:
def report_blank_lines(filename):
    tot = 0
    with open(filename) as f:
        for i, l in enumerate(f):
            if len(l.strip()) < 1:
                print("[%d] %s[END]" % (i, l))
                tot = tot + 1
    print("     total: %d" % tot)

In [49]:
report_blank_lines(zh_filepath)

[27660] 
[END]
[51225] 
[END]
[66871] 
[END]
[75770] 
[END]
[75775] 
[END]
[82330] 
[END]
[89880]     
[END]
[91075] 
[END]
[105145] 
[END]
[119307] 
[END]
[126515] 
[END]
[128127] 
[END]
[137127] 
[END]
[137604] 
[END]
[145516] 
[END]
[146597] 
[END]
[147274] 
[END]
[151833] 
[END]
[166718] 
[END]
[167566] 
[END]
[167574] 
[END]
[167586] 
[END]
[167591] 
[END]
[167598] 
[END]
[172120] 
[END]
[176885] 
[END]
[178064] 
[END]
[178066] 
[END]
[178643]  
[END]
[178983] 
[END]
[178985] 
[END]
[179010] 
[END]
[179817] 
[END]
[180836] 
[END]
[183407] 
[END]
[190958]  
[END]
[193349] 
[END]
[197577] 
[END]
[206624] 
[END]
     total: 39


In [50]:
report_blank_lines(en_filepath)

[4088] 
[END]
[8112] 
[END]
[13275] 
[END]
[13276] 
[END]
[13357] 
[END]
[13358] 
[END]
[13581] 
[END]
[13582] 
[END]
[13783] 
[END]
[13784] 
[END]
[14646] 
[END]
[14740] 
[END]
[15454] 
[END]
[15514] 
[END]
[15515] 
[END]
[16244] 
[END]
[20289] 
[END]
[23012] 
[END]
[24964] 
[END]
[24965] 
[END]
[27670] 
[END]
[31465] 
[END]
[31466] 
[END]
[32799] 
[END]
[35079] 
[END]
[35080] 
[END]
[37662] 
[END]
[37663] 
[END]
[39318] 
[END]
[39321] 
[END]
[45101] 
[END]
[48448] 
[END]
[48450] 
[END]
[48451] 
[END]
[48454] 
[END]
[49221] 
[END]
[49222] 
[END]
[51263] 
[END]
[55062] 
[END]
[64458] 
[END]
[66912] 
[END]
[70704] 
[END]
[74897] 
[END]
[75821] 
[END]
[75826] 
[END]
[75975] 
[END]
[79343] 
[END]
[80313] 
[END]
[80691] 
[END]
[82383] 
[END]
[82385] 
[END]
[83542] 
[END]
[85636] 
[END]
[88839] 
[END]
[91118] 
[END]
[91831] 
[END]
[91832] 
[END]
[92274] 
[END]
[98705] 
[END]
[100113] 
[END]
[102805] 
[END]
[103524] 
[END]
[103525] 
[END]
[103531] 
[END]
[103532] 
[END]
[104059] 
[END]
[1051