In [1]:
import string
import re
import pickle as pkl
from unicodedata import normalize
import numpy as np
import os

In [2]:
# Imports for colab
from google.colab import drive
# Mount Google Drive at /content/gdrive so files stored there are reachable
drive.mount('/content/gdrive')
# Base directory for every file this notebook reads or writes; the trailing
# slash matters because later cells build paths by plain string concatenation
dirname = '/content/gdrive/My Drive/Colab Notebooks/'

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Preparing the data

## Load data

In [3]:
def load_doc(filename):
    """Return the full contents of a UTF-8 encoded text file as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The decoded text of the whole file.
    """
    # Read-only text handle; the context manager closes it on exit
    with open(filename, 'rt', encoding='utf-8') as handle:
        return handle.read()

## Split data into pairs

In [4]:
def to_pairs(doc):
    """Split a tab-separated corpus into per-line lists of its first two fields.

    Args:
        doc: Whole corpus as a single string, one record per line, fields
            separated by tab characters.

    Returns:
        A list with one entry per line; each entry is a list holding at most
        the first two tab-separated fields of that line (any further columns
        are discarded).
    """
    pairs = []
    # Surrounding whitespace is trimmed first so a trailing newline does not
    # produce an extra empty record
    for row in doc.strip().split('\n'):
        fields = row.split('\t')
        pairs.append(fields[:2])
    return pairs

## Clean data

In [5]:
def clean_pairs(lines, encoding='unicode', remove_punct=True, remove_nums=False, to_lower=True, unicode_norm=False):
    """Normalize every sentence in a collection of sentence pairs.

    Each sentence is split on whitespace, cleaned token by token and joined
    back with single spaces. Tokens that end up empty after cleaning (for
    example a stand-alone '-' once punctuation is removed) are dropped, so the
    result never contains doubled, leading or trailing spaces.

    Args:
        lines: Iterable of pairs; each pair is an iterable of sentence strings.
        encoding: With the default 'unicode', only control characters (code
            points 0-31 and 127-159) are stripped. Any other value strips every
            character that is not in ``string.printable``.
        remove_punct: Delete the characters in ``string.punctuation`` (ASCII
            punctuation only) from every token.
        remove_nums: Keep only tokens for which ``str.isalpha()`` is true.
        to_lower: Lowercase every token.
        unicode_norm: Apply NFD normalization and discard everything that
            cannot be encoded as ASCII.

    Returns:
        ``np.array`` built from the list of cleaned pairs.
    """
    # Regular expression matching the characters that must be stripped
    if encoding != 'unicode':
        re_print = re.compile('[^%s]' % re.escape(string.printable))
    else:
        control_chars = ''.join(map(chr, list(range(0, 32)) + list(range(127, 160))))
        re_print = re.compile('[%s]' % re.escape(control_chars))

    # Translation table for removing punctuation
    table = str.maketrans('', '', string.punctuation)

    cleaned = []
    for pair in lines:
        clean_pair = []
        for line in pair:
            # Normalize unicode characters (drops all non-ASCII characters)
            if unicode_norm:
                line = normalize('NFD', line).encode('ascii', 'ignore')
                line = line.decode('UTF-8')

            # Split sentence into tokens on white space
            tokens = line.split()

            # Convert to lowercase (if needed)
            if to_lower:
                tokens = [token.lower() for token in tokens]

            # Remove punctuation from each token (if needed)
            if remove_punct:
                tokens = [token.translate(table) for token in tokens]

            # Remove non-printable chars from each token
            tokens = [re_print.sub('', token) for token in tokens]

            # Discard tokens emptied by the steps above; joining them would
            # leave doubled or dangling spaces in the cleaned sentence
            tokens = [token for token in tokens if token]

            # Remove tokens with numbers in them (if needed)
            if remove_nums:
                tokens = [token for token in tokens if token.isalpha()]

            # Store cleaned sentence as string
            clean_pair.append(' '.join(tokens))
        cleaned.append(clean_pair)

    return np.array(cleaned)

## Save cleaned data to pickle file

In [6]:
def save_clean_data(sentences, filename):
    """Pickle `sentences` to `filename` and print a confirmation line.

    Args:
        sentences: Object to serialize.
        filename: Destination path; an existing file is overwritten.
    """
    # The context manager guarantees the handle is flushed and closed even if
    # pickling fails part-way through
    with open(filename, 'wb') as file:
        pkl.dump(sentences, file)
    print('Saved: %s' % filename)

## Running all the procedures from above

In [7]:
# Load dataset
filename = dirname + 'rus.txt'
doc = load_doc(filename)

# Split into english-russian pairs
pairs = to_pairs(doc)

# Clean sentences
cleaned_pairs = clean_pairs(pairs)

# Save cleaned pairs to file
save_clean_data(cleaned_pairs, dirname + 'english-russian.pkl')

# Checking for data
print('Data shape: ', cleaned_pairs.shape)

# Preview at most the first 30 pairs; slicing (instead of indexing 0..29)
# keeps this cell working on corpora that contain fewer than 30 pairs
for english, russian in cleaned_pairs[:30]:
    print('[%s] => [%s]' % (english, russian))

Saved: /content/gdrive/My Drive/Colab Notebooks/english-russian.pkl
Data shape:  (370085, 2)
[go] => [марш]
[go] => [иди]
[go] => [идите]
[hi] => [здравствуйте]
[hi] => [привет]
[hi] => [хай]
[hi] => [здрасте]
[hi] => [здоро́во]
[run] => [беги]
[run] => [бегите]
[run] => [беги]
[run] => [бегите]
[who] => [кто]
[wow] => [вот это да]
[wow] => [круто]
[wow] => [здорово]
[wow] => [ух ты]
[wow] => [ого]
[wow] => [вах]
[fire] => [огонь]
[fire] => [пожар]
[help] => [помогите]
[help] => [на помощь]
[help] => [спасите]
[jump] => [прыгай]
[jump] => [прыгайте]
[jump] => [прыгай]
[jump] => [прыгайте]
[stop] => [стой]
[stop] => [остановитесь]


# Shuffling, reduction and splitting the data

## Load clean data

In [8]:
def load_clean_sentences(filename):
    """Load and return the object stored in the pickle file `filename`.

    Args:
        filename: Path of a pickle file.

    Returns:
        The unpickled object.
    """
    # SECURITY: unpickling can execute arbitrary code; only load files that
    # were produced by a trusted source
    # The context manager closes the handle instead of leaking it
    with open(filename, 'rb') as file:
        return pkl.load(file)

In [13]:
# Reload the cleaned corpus from its pickle file
raw_dataset = load_clean_sentences(dirname + 'english-russian.pkl')

# Set to False to keep the whole corpus instead of a leading subset
reduction = True
# Number of pairs from original corpus
n_sentences = 100000
# `idx` labels the output files with the subset size ('full' when not reduced)
idx = (str(n_sentences // 1000) + 'k') if reduction else 'full'
dataset = raw_dataset[:n_sentences, :] if reduction else raw_dataset

# Random shuffle with a fixed seed; np.random.shuffle permutes the rows of
# `dataset` in place
np.random.seed(0)
np.random.shuffle(dataset)

# Split into train/test
train_share = 0.9
train_len = int(train_share * dataset.shape[0])

train = dataset[:train_len]
test = dataset[train_len:]
# For a train-val-test split, replace the two assignments above with the three
# commented lines below (adjust val_share as needed) and also save `val`
#val_share = 0.05
#val_len = int(dataset.shape[0]*val_share)
#train, val, test = dataset[:train_len], dataset[train_len:train_len + val_len], dataset[train_len + val_len:]

# Save the shuffled subset and both of its splits
for suffix, subset in (('-both.pkl', dataset), ('-train.pkl', train), ('-test.pkl', test)):
    save_clean_data(subset, dirname + 'english-russian-' + idx + suffix)

Saved: /content/gdrive/My Drive/Colab Notebooks/english-russian-100k-both.pkl
Saved: /content/gdrive/My Drive/Colab Notebooks/english-russian-100k-train.pkl
Saved: /content/gdrive/My Drive/Colab Notebooks/english-russian-100k-test.pkl


In [14]:
# Print the first ten train and test pairs, interleaved, as a sanity check
for row in range(10):
    train_src, train_tgt = train[row, 0], train[row, 1]
    print('Train: [%s] => [%s]' % (train_src, train_tgt))
    test_src, test_tgt = test[row, 0], test[row, 1]
    print('Test: [%s] => [%s]' % (test_src, test_tgt))

Train: [i buried it] => [я её закопал]
Test: [tom is very good] => [том очень хороший]
Train: [go and wake up mary] => [пойди разбуди мэри]
Test: [i was born in 1960] => [я родился в 1960]
Train: [she did a good job] => [она проделала хорошую работу]
Test: [do you work in boston] => [вы работаете в бостоне]
Train: [i work at a zoo] => [я работаю в зоопарке]
Test: [look at this picture] => [посмотрите на эту картинку]
Train: [i want them] => [я хочу их]
Test: [theres a problem] => [есть проблема]
Train: [ive been thinking] => [я размышляю]
Test: [tom is my neighbor] => [том  мой сосед]
Train: [we talked about boys] => [мы говорили о мальчиках]
Test: [wasnt he your friend] => [он разве не был тебе другом]
Train: [is that blood] => [это кровь]
Test: [i was playing here] => [я здесь играл]
Train: [dont be so selfish] => [не будь такой эгоисткой]
Test: [its good to dream] => [мечтать хорошо]
Train: [people are stupid] => [люди глупы]
Test: [come by tomorrow] => [заходи завтра]
