In [1]:
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array

In [2]:
'''There is punctuation.
The text contains uppercase and lowercase.
There are special characters in the German.
There are duplicate phrases in English with different translations in German.
The file is ordered by sentence length with very long sentences toward the end of the file.
'''

'There is punctuation.\nThe text contains uppercase and lowercase.\nThere are special characters in the German.\nThere are duplicate phrases in English with different translations in German.\nThe file is ordered by sentence length with very long sentences toward the end of the file.\n'

In [8]:
def load_doc(filename):
    """Read an entire UTF-8 text file and return its contents as one string.

    Args:
        filename: Path of the text file to read.

    Returns:
        The decoded contents of the whole file as a single ``str``.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # The context manager guarantees the handle is closed even when read()
    # raises (for example on a decoding error), which a manual close() does not.
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()

In [4]:
# split the loaded document into one list of tab-separated fields per line

def to_pairs(doc):
    """Break a document into rows, and each row into its tab-separated fields.

    Leading and trailing whitespace of the whole document is stripped first,
    then the text is split on newline characters and every resulting row is
    split on tab characters.

    Args:
        doc: The full document text as a single string.

    Returns:
        A list with one entry per row; each entry is the list of fields
        found on that row.
    """
    rows = doc.strip().split('\n')
    pairs = []
    for row in rows:
        fields = row.split('\t')
        pairs.append(fields)
    return pairs

In [10]:
#clean a list of lines
def clean_pairs(lines):
    """Normalise every sentence in a collection of sentence groups.

    Each sentence is processed as follows:
      1. Unicode-decompose (NFD) and drop every character that cannot be
         encoded as ASCII.
      2. Split on whitespace into tokens.
      3. Lowercase each token.
      4. Delete all characters listed in ``string.punctuation``.
      5. Delete any character not listed in ``string.printable``.
      6. Discard tokens that are not purely alphabetic (this also removes
         tokens that became empty in the previous steps).
      7. Re-join the surviving tokens with single spaces.

    Args:
        lines: An iterable of groups (e.g. pairs) of sentence strings.

    Returns:
        A numpy array built from the list of cleaned groups.
    """
    # Pattern matching any single character outside string.printable
    non_printable = re.compile('[^%s]' % re.escape(string.printable))
    # Translation table that deletes every punctuation character
    punct_remover = str.maketrans('', '', string.punctuation)

    def _clean_sentence(sentence):
        # Decompose accented characters, then keep only the ASCII part
        ascii_bytes = normalize('NFD', sentence).encode('ascii', 'ignore')
        ascii_text = ascii_bytes.decode('UTF-8')
        kept = []
        for token in ascii_text.split():
            token = token.lower()
            token = token.translate(punct_remover)
            token = non_printable.sub('', token)
            # Only purely alphabetic, non-empty tokens survive
            if token.isalpha():
                kept.append(token)
        return ' '.join(kept)

    cleaned = [[_clean_sentence(sentence) for sentence in pair] for pair in lines]
    return array(cleaned)

In [6]:
def save_clean_data(sentences, filename):
    """Pickle ``sentences`` to ``filename`` and print a confirmation message.

    Args:
        sentences: The object to serialise (any picklable object).
        filename: Path of the output file; it is opened in binary write
            mode, so an existing file at that path is overwritten.

    Raises:
        OSError: If the output file cannot be opened or written.
        pickle.PicklingError: If ``sentences`` cannot be pickled.
    """
    # Use a context manager so the data is flushed and the handle closed
    # deterministically, instead of relying on garbage collection.
    with open(filename, 'wb') as out_file:
        dump(sentences, out_file)
    print('Saved: %s' % filename)

In [11]:
# Load the raw text file, split it into per-line field lists, clean every
# sentence, and pickle the cleaned array.
filename = 'all-english-german.txt'
doc = load_doc(filename)

pairs = to_pairs(doc)
# NOTE(review): this rebinds the name `clean_pairs` from the function to the
# array it returns. Later cells index `clean_pairs` as an array, but running
# this cell a second time without re-defining the function would try to call
# the array and fail. Consider a distinct variable name for the result.
clean_pairs = clean_pairs(pairs)

save_clean_data(clean_pairs,'clean-all-english-german.pkl')

Saved: clean-all-english-german.pkl


In [13]:
# Spot-check a single cleaned row (index 10000) of the array
clean_pairs[10000]

array(['i work in a bank', 'ich arbeite bei einer bank'], dtype='<U370')

In [29]:
# Longest first-column string (in characters) among the first 50,000 rows
print(max(map(len, clean_pairs[:50000, 0])))

25


In [30]:
# Take the first 100,000 rows of the cleaned array as the subset to export
large_pairs = clean_pairs[:100000]

In [31]:
# Display the last row of the subset
large_pairs[-1]

array(['youve always been very good to me',
       'du bist immer sehr gut zu mir gewesen'], dtype='<U370')

In [32]:
# Pickle the subset to its own file
save_clean_data(large_pairs, 'english-german-large.pkl')

Saved: english-german-large.pkl
