In [1]:
import requests
import os
import shutil
import tarfile
import gzip
from tqdm import tqdm

data_dir = 'data'

def download_file(url: str, repeat_download=False, output_path=None):

    print(f"Downloading {url}")
    zip_path = url.split('/')[-1]
    zip_path = os.path.join(data_dir, zip_path)
    if output_path is None:
        output_path = '.'.join(zip_path.split('.')[:-1])

    if not os.path.exists(zip_path) or repeat_download:
        response = requests.get(url, stream=True)

        total_size = int(response.headers.get("content-length", 0))
        block_size = 1024
        with tqdm(total=total_size, unit="B", unit_scale=True) as progress_bar:
            with open(zip_path, "wb") as file:
                for data in response.iter_content(block_size):
                    progress_bar.update(len(data))
                    file.write(data)
        print(f"Downloaded to {zip_path}")

    if not os.path.exists(output_path):
        try:
            file = tarfile.open(zip_path) 
            file.extractall(output_path) 
            file.close()
            print(f"Extracted to {output_path}")
        except Exception:
            try:
                with gzip.open(zip_path, 'rb') as f_in:
                    with open(output_path, 'wb') as f_out:
                        shutil.copyfileobj(f_in, f_out)
                print(f"Extracted to {output_path}")
            except Exception:
                print(f"Failed to extract {zip_path}")
    else:
        print(f"{output_path} exists")

In [2]:
file_urls = [
    # 'http://data.statmt.org/wmt19/translation-task/test.tgz',
    'https://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz',
    'https://s3.amazonaws.com/web-language-models/paracrawl/release1/paracrawl-release1.en-ru.zipporah0-dedup-clean.tgz',
    'https://data.statmt.org/news-commentary/v14/training/news-commentary-v14.en-ru.tsv.gz',
]

if not os.path.exists(data_dir):
    os.makedirs(data_dir)

for url in file_urls:
    download_file(url)

Downloading https://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz
data/training-parallel-commoncrawl exists
Downloading https://s3.amazonaws.com/web-language-models/paracrawl/release1/paracrawl-release1.en-ru.zipporah0-dedup-clean.tgz
data/paracrawl-release1.en-ru.zipporah0-dedup-clean exists
Downloading https://data.statmt.org/news-commentary/v14/training/news-commentary-v14.en-ru.tsv.gz
data/news-commentary-v14.en-ru.tsv exists


In [3]:
import re
import string
import numpy as np
from fast_langdetect import detect_language, DetectError

def NormStr(s: str):
    s = s.strip().lower()
    s = re.sub(r'\s+', ' ', s)
    s = re.sub(r'[“«»”]', '"', s)
    s = re.sub(r'[’‘]', '\'',s)
    s = re.sub(r'[\[\{]', '(', s)
    s = re.sub(r'[\]\}]', ')', s)
    s = re.sub(r'[—–]', '-', s)
    return s

def CheckStr(s: str, lang):
    if len(s) < 10:
        return False
    if re.search(fr'[^a-zа-я\d\s{string.punctuation}]', s) is not None:
        return False
    if len(re.sub(r'\w', '', s)) / len(s) > 0.5:
        return False
    
    detected = False
    for s1 in s.split('.'):
        if len(s1.strip()) < 3:
            continue
        if len(re.sub(r'\w', '', s1)) / len(s1) > 0.8:
            continue
        try:
            if detect_language(s1[:100], low_memory=False) != lang:
                return False
            detected = True
        except DetectError:
            pass
    return detected

def CheckPair(pair):
    en, ru = pair
    en_nums = sorted(re.findall(r'\d+', en))
    ru_nums = sorted(re.findall(r'\d+', ru))
    if en_nums != ru_nums:
        return False
    return True
    
def Normalizer(data):
    return list(map(lambda line: [NormStr(s) for s in line], tqdm(data)))

def StrFilter(data, langs=['EN', 'RU']):
    return list(filter(lambda line: all([CheckStr(s, lang) for s, lang in zip(line, langs)]), tqdm(data)))

def SimplePairFilter(data):
    return list(filter(CheckPair, tqdm(data)))

def LenghthFilter(data):
    lens_en = [len(pair[0]) for pair in data]
    en_max_len = np.percentile(lens_en, 99)
    lens_ru = [len(pair[1]) for pair in data]
    ru_max_len = np.percentile(lens_ru, 99)
    return list(filter(lambda pair: len(pair[0]) <= en_max_len and len(pair[1]) <= ru_max_len, tqdm(data)))

def DedupFilter(data):
    en_sents = set()
    ru_sents = set()
    new_data = []
    for en, ru in tqdm(data):
        if en in en_sents or ru in ru_sents:
            continue
        en_sents.add(en)
        ru_sents.add(ru)
        new_data.append((en, ru))
    return new_data

def apply_filters(data, filters = None):
    if filters is None:
        filters = [
            Normalizer,
            SimplePairFilter,
            StrFilter,
            DedupFilter,
            LenghthFilter,
        ]
    start_count = len(data)
    for f in filters:
        data = f(data)
    print(f"{(1 - len(data) / start_count) * 100:.3f}% deleted, {len(data)} left")
    return data

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import random
import pandas as pd
random.seed(42)

# Build Train
test_pairs = []

def add_data(data, fract=1.0):
    data_clean = apply_filters(data, [Normalizer, SimplePairFilter, StrFilter])
    random.shuffle(data_clean)
    if fract < 1:
        data_clean = data_clean[:int(fract*len(data_clean))]
    test_pairs.extend(data_clean)
    return pd.DataFrame(data_clean[:10], columns=['en', 'ru'])

# News Commentary
print("News Commentary")
with open('data/news-commentary-v14.en-ru.tsv', 'r') as f:
    lines = f.readlines()
    nc_data = [line.strip().split('\t') for line in lines]
    nc_data = [(line[0], line[1]) for line in nc_data if len(line) == 2]
nc_example = add_data(nc_data)

#Yandex
print("Yandex")
with open('data/1mcorpus/corpus.en_ru.1m.en', 'r') as f:
    en = f.readlines()
with open('data/1mcorpus/corpus.en_ru.1m.ru', 'r') as f:
    ru = f.readlines()
ya_example = add_data(list(zip(en, ru)))

print("Deduplication")
random.shuffle(test_pairs)
test_pairs = apply_filters(test_pairs, [DedupFilter, LenghthFilter])

News Commentary


100%|██████████| 281003/281003 [00:10<00:00, 28038.88it/s]
100%|██████████| 281003/281003 [00:02<00:00, 128443.19it/s]
100%|██████████| 268838/268838 [00:39<00:00, 6841.45it/s]


15.958% deleted, 236161 left
Yandex


100%|██████████| 1000000/1000000 [00:33<00:00, 30144.35it/s]
100%|██████████| 1000000/1000000 [00:07<00:00, 138758.15it/s]
100%|██████████| 895520/895520 [01:54<00:00, 7810.35it/s]


14.479% deleted, 855208 left
Deduplication


100%|██████████| 1091369/1091369 [00:02<00:00, 419002.25it/s]
100%|██████████| 1081946/1081946 [00:01<00:00, 554587.64it/s]


2.353% deleted, 1065691 left


In [5]:
nc_example

Unnamed: 0,en,ru
0,why aren't these steps being taken?,так почему эти шаги не предпринимаются?
1,if he opens the economy and adjusts the exchan...,если он откроет экономику и поправит обменный ...
2,all traditional individual rights are already ...,все традиционные права человека уже предусмотр...
3,"for the majority of iranians, economic improve...",для большинства иранцев улучшение экономическо...
4,"remarkably, the imf was slow to learn the lesson.","поразительно то, что мвф потребовалось так мно..."
5,one obvious point is that it is hard to identi...,один из очевидных фактов: трудно установить че...
6,nor does time cushion anemic post-crisis recov...,не защищает время также анемичное посткризисно...
7,to defend post-utopian values in the longer-te...,"при борьбе с такими группами, как аль-каида, о..."
8,given the importance of relationship-building ...,в сми очень важно выстраивать отношения. более...
9,a second function of goals is to create peer p...,вторая функция цели заключается в создании соц...


In [6]:
ya_example

Unnamed: 0,en,ru
0,the mission of herbalife nutrition institute i...,миссией и целью создания института питания her...
1,"""the unforeseen does not exist,"" quietly repli...","- непредвиденного не существует, - спокойно от..."
2,"they have an abundance of skills, but lack the...","они обладали массой умений, но недостатком уве..."
3,there are two common reasons:,есть две основные причины:
4,those who are serious seekers of personal deve...,"тот же, кто решил заняться самосовершенствован..."
5,"""we may expect women to own more luxury items,...","""мы предполагали, что у женщин больше ювелирны..."
6,this uncertainty makes ccamlr's work more comp...,эта неопределенность осложняет работу анткома.
7,"""and i'll... lose my personality?""",а я... потеряю свою личность?
8,"because the phenomenon is catching headlines, ...","благодаря тому, что этот феномен замелькал в х..."
9,it is not known if the assailants belonged to ...,смертельно ранен родственник одного из подозре...


In [7]:
def shuffle_data(data, label_count):
    zip_data = list(zip(data, np.arange(len(data))))
    random.shuffle(zip_data)
    labels = [0] * label_count
    data = []
    for index, pair in enumerate(zip_data):
        s, id = pair
        data.append(s)
        if id < label_count:
            labels[id] = index
    return data, labels

def sample_data(data, label_count):
    en = list(map(lambda x: x[0], data))
    ru = list(map(lambda x: x[1], data))
    en = en[:label_count * 2]
    ru = ru[:label_count] + ru[label_count*2:]
    return en, ru

label_count = len(test_pairs) // 3
en, ru = sample_data(test_pairs, label_count)
en, en_labels = shuffle_data(en, label_count)
ru, ru_labels = shuffle_data(ru, label_count)
print(en[en_labels[0]])
print(ru[ru_labels[0]])

all at once they all go into one great big splash of blood.
внезапно они все слились в одно большое пятно крови.


In [8]:
test_dir = 'data/miner_test'
if not os.path.exists(test_dir):
    os.makedirs(test_dir)

with open(os.path.join(test_dir, 'en_sents'), 'w') as f:
    f.write('\n'.join(en))

with open(os.path.join(test_dir, 'ru_sents'), 'w') as f:
    f.write('\n'.join(ru))

labels_df = pd.DataFrame(list(zip(en_labels, ru_labels)), columns=['en', 'ru'])
labels_df.to_csv(os.path.join(test_dir, 'labels.csv'), index=False)

In [9]:
#ParaCrawl
with open('data/paracrawl-release1.en-ru.zipporah0-dedup-clean/paracrawl-release1.en-ru.zipporah0-dedup-clean.en', 'r') as f:
    en = f.readlines()
with open('data/paracrawl-release1.en-ru.zipporah0-dedup-clean/paracrawl-release1.en-ru.zipporah0-dedup-clean.ru', 'r') as f:
    ru = f.readlines()

pc_data = list(zip(en, ru))
random.shuffle(pc_data)
pc_data_clean = apply_filters(pc_data[:1000000])

100%|██████████| 1000000/1000000 [00:30<00:00, 32506.90it/s]
100%|██████████| 1000000/1000000 [00:05<00:00, 171793.19it/s]
100%|██████████| 679895/679895 [01:04<00:00, 10508.95it/s]
100%|██████████| 439380/439380 [00:01<00:00, 424453.95it/s]
100%|██████████| 428090/428090 [00:00<00:00, 469108.89it/s]

57.801% deleted, 421994 left





In [10]:
en_ru_dir = 'data/en-ru'
if not os.path.exists(en_ru_dir):
    os.makedirs(en_ru_dir)

# CommonCrawl
print("CommonCrawl")
with open('data/training-parallel-commoncrawl/commoncrawl.ru-en.en', 'r') as f:
    en = f.readlines()
with open('data/training-parallel-commoncrawl/commoncrawl.ru-en.ru', 'r') as f:
    ru = f.readlines()
cc_data = apply_filters(list(zip(en, ru)))
random.shuffle(cc_data)
cc_df = pd.DataFrame(cc_data, columns=['en', 'ru'])
cc_df.to_csv(os.path.join(en_ru_dir, 'train.csv'), index=False)

CommonCrawl


100%|██████████| 878386/878386 [00:31<00:00, 27535.08it/s]
100%|██████████| 878386/878386 [00:06<00:00, 140574.51it/s]
100%|██████████| 739184/739184 [01:30<00:00, 8128.97it/s]
100%|██████████| 590889/590889 [00:01<00:00, 459398.67it/s]
100%|██████████| 567663/567663 [00:01<00:00, 512325.04it/s]


36.370% deleted, 558917 left
