In [1]:
import requests
import os
import shutil
import tarfile
import gzip
from tqdm import tqdm

data_dir = 'data'

def download_file(url: str, repeat_download=False, output_path=None):

    print(f"Downloading {url}")
    zip_path = url.split('/')[-1]
    zip_path = os.path.join(data_dir, zip_path)
    if output_path is None:
        output_path = '.'.join(zip_path.split('.')[:-1])

    if not os.path.exists(zip_path) or repeat_download:
        response = requests.get(url, stream=True)

        total_size = int(response.headers.get("content-length", 0))
        block_size = 1024
        with tqdm(total=total_size, unit="B", unit_scale=True) as progress_bar:
            with open(zip_path, "wb") as file:
                for data in response.iter_content(block_size):
                    progress_bar.update(len(data))
                    file.write(data)
        print(f"Downloaded to {zip_path}")

    if not os.path.exists(output_path):
        try:
            file = tarfile.open(zip_path) 
            file.extractall(output_path) 
            file.close()
            print(f"Extracted to {output_path}")
        except Exception:
            try:
                with gzip.open(zip_path, 'rb') as f_in:
                    with open(output_path, 'wb') as f_out:
                        shutil.copyfileobj(f_in, f_out)
                print(f"Extracted to {output_path}")
            except Exception:
                print(f"Failed to extract {zip_path}")
    else:
        print(f"{output_path} exists")

In [2]:
file_urls = [
    'http://data.statmt.org/wmt19/translation-task/test.tgz',
    'https://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz',
    'https://s3.amazonaws.com/web-language-models/paracrawl/release1/paracrawl-release1.en-ru.zipporah0-dedup-clean.tgz',
    'https://data.statmt.org/news-commentary/v14/training/news-commentary-v14.en-ru.tsv.gz',
]

if not os.path.exists(data_dir):
    os.makedirs(data_dir)

for url in file_urls:
    download_file(url)

Downloading http://data.statmt.org/wmt19/translation-task/test.tgz
data/test exists
Downloading https://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz
data/training-parallel-commoncrawl exists
Downloading https://s3.amazonaws.com/web-language-models/paracrawl/release1/paracrawl-release1.en-ru.zipporah0-dedup-clean.tgz
data/paracrawl-release1.en-ru.zipporah0-dedup-clean exists
Downloading https://data.statmt.org/news-commentary/v14/training/news-commentary-v14.en-ru.tsv.gz
data/news-commentary-v14.en-ru.tsv exists


In [3]:
import re
import string
from fast_langdetect import detect_language, DetectError

def NormStr(s: str):
    s = s.strip().lower()
    s = re.sub(r'\s+', ' ', s)
    s = re.sub(r'[“«»”]', '"', s)
    s = re.sub(r'[’‘]', '\'',s)
    s = re.sub(r'[\[\{]', '(', s)
    s = re.sub(r'[\]\}]', ')', s)
    s = re.sub(r'[—–]', '-', s)
    return s

def CheckStr(s: str, lang):
    if re.search(fr'[^a-zа-я\d\s{string.punctuation}]', s) is not None:
        return False
    if len(re.sub(r'\w', '', s)) / len(s) > 0.5:
        return False
    
    detected = False
    for s1 in s.split('.'):
        if len(s1.strip()) < 3:
            continue
        if len(re.sub(r'\w', '', s1)) / len(s1) > 0.8:
            continue
        try:
            if detect_language(s1[:100], low_memory=False) != lang:
                return False
            detected = True
        except DetectError:
            pass
    return detected

def CheckPair(pair):
    en, ru = pair
    en_nums = sorted(re.findall(r'\d+', en))
    ru_nums = sorted(re.findall(r'\d+', ru))
    if en_nums != ru_nums:
        return False
    if not CheckStr(en, 'EN') or not CheckStr(ru, 'RU'):
        return False
    return True
    
def Normalizer(data):
    return list(map(lambda pair: (NormStr(pair[0]), NormStr(pair[1])), tqdm(data)))

def SimpleFilter(data):
    return list(filter(CheckPair, tqdm(data)))

def DedupFilter(data):
    en_sents = set()
    ru_sents = set()
    new_data = []
    for en, ru in tqdm(data):
        if en in en_sents or ru in ru_sents:
            continue
        en_sents.add(en)
        ru_sents.add(ru)
        new_data.append((en, ru))
    return new_data

def apply_filters(data, filters = None):
    if filters is None:
        filters = [
            Normalizer,
            SimpleFilter,
            DedupFilter
        ]
    start_count = len(data)
    for f in filters:
        data = f(data)
    print(f"{(1 - len(data) / start_count) * 100:.3f}% deleted, {len(data)} left")
    return data

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import random
import pandas as pd
random.seed(42)

# Build Train
train_pairs = []

def add_data(data, fract=1.0):
    data_clean = apply_filters(data, [Normalizer, SimpleFilter])
    random.shuffle(data_clean)
    if fract < 1:
        data_clean = data_clean[:int(fract*len(data_clean))]
    train_pairs.extend(data_clean)
    return pd.DataFrame(data_clean[:10], columns=['en', 'ru'])

# CommonCrawl
print("CommonCrawl")
with open('data/training-parallel-commoncrawl/commoncrawl.ru-en.en', 'r') as f:
    en = f.readlines()
with open('data/training-parallel-commoncrawl/commoncrawl.ru-en.ru', 'r') as f:
    ru = f.readlines()
cc_example = add_data(list(zip(en, ru)), 0.1)

# News Commentary
print("News Commentary")
with open('data/news-commentary-v14.en-ru.tsv', 'r') as f:
    lines = f.readlines()
    nc_data = [line.strip().split('\t') for line in lines]
    nc_data = [(line[0], line[1]) for line in nc_data if len(line) == 2]
nc_example = add_data(nc_data)

#Yandex
print("Yandex")
with open('data/1mcorpus/corpus.en_ru.1m.en', 'r') as f:
    en = f.readlines()
with open('data/1mcorpus/corpus.en_ru.1m.ru', 'r') as f:
    ru = f.readlines()
ya_example = add_data(list(zip(en, ru)))

print("Deduplication")
random.shuffle(train_pairs)
train_pairs = apply_filters(train_pairs, [DedupFilter])

CommonCrawl


100%|██████████| 878386/878386 [00:27<00:00, 31651.91it/s]
100%|██████████| 878386/878386 [01:38<00:00, 8934.67it/s] 


32.730% deleted, 590889 left
News Commentary


100%|██████████| 281003/281003 [00:09<00:00, 29502.66it/s]
100%|██████████| 281003/281003 [00:36<00:00, 7749.57it/s]


15.882% deleted, 236375 left
Yandex


100%|██████████| 1000000/1000000 [00:31<00:00, 31607.06it/s]
100%|██████████| 1000000/1000000 [01:58<00:00, 8439.00it/s]


14.421% deleted, 855785 left
Deduplication


100%|██████████| 1151248/1151248 [00:02<00:00, 500097.42it/s]


0.991% deleted, 1139843 left


In [6]:
cc_example

Unnamed: 0,en,ru
0,"when this happens, we try to provide you with ...","поэтому наша задача заключается в том, чтобы к..."
1,above you can see photos and pictures and can ...,вы можете бесплатно скачать фотографии и карти...
2,do not contact the webmaster for issues concer...,"обратите внимание, что на другие вопросы (связ..."
3,"and the forwarding firms, who bore the highest...",на основании этого опыта мы разработали новый ...
4,the guest reviews are submitted by our custome...,следующие отзывы были оставлены нашими клиента...
5,it is called the adeli approach and is based o...,"эта технология, известная как методики ""адели""..."
6,"at the same time, industrial enterprises have ...",в то же время в работе предприятий промышленно...
7,you won't help getting to like hotel ukraine.,вам непременно захочется вернуться в нашу гост...
8,displays files with a pound (#) symbol.,отображает файлы с символом фунта (#).
9,following groups of buttons are allocated for ...,на панели инструментов выделены следующие груп...


In [7]:
nc_example

Unnamed: 0,en,ru
0,"as a result, there are now numerous overlappin...",в результате в настоящее время существует множ...
1,but labour will not win a general election und...,однако лейбористы во главе с корбиными не смог...
2,let us consider each.,рассмотрим оба способа.
3,"at the same time, greece would ensure that the...","в то же время, греция бы гарантировала, что эт..."
4,although the rest of postcommunist europe conf...,хотя остальная посткоммунистическая европа сто...
5,no sooner was the controversy over the creatio...,не успел к всеобщему удовлетворению разрешитьс...
6,"in even the best-performing economies, such as...","даже в наиболее эффективных экономиках, таких ..."
7,global income redistribution by the rich count...,глобальное перераспределение доходов богатыми ...
8,"more recently, south korea's supreme court ord...",а недавно верховный суд южной кореи обязал кру...
9,other gestures to help struggling middle-class...,"другие жесты, которые помогают борющемуся сред..."


In [8]:
ya_example

Unnamed: 0,en,ru
0,"for more information, see how to: enable encry...",дополнительные сведения см. в разделе включени...
1,ii. to sign within one year and ratify within ...,ii. подписать в течение одного и ратифицироват...
2,"thefts, fire, floods, earthquakes, road accide...","кражы, пожары, наводнения, землетрясения, доро..."
3,3. a tearing of the south atlantic rift allowi...,"3. разрыв южной части атлантического разлома, ..."
4,making their way through the guards and the se...,пробираясь через охрану и системы безопасности...
5,sedam communication limited (uk) and the publi...,"sedam communications limited, (великобритания)..."
6,the best among them were sevara dehqonova and ...,лучшими среди них объявлены севара дехканова и...
7,"if it continues on this track, america may one...","игнорировать шос, как это делали сша на протяж..."
8,"as it happens, i've got no problem calling nor...","между прочим, я по-прежнему называю северную к..."
9,"- now ""all plug-ins list"" items (in submenu) a...","- теперь пункты в меню ""все плагины"" сортируют..."


In [9]:
#ParaCrawl
with open('data/paracrawl-release1.en-ru.zipporah0-dedup-clean/paracrawl-release1.en-ru.zipporah0-dedup-clean.en', 'r') as f:
    en = f.readlines()
    en = [text.strip() for text in en]
with open('data/paracrawl-release1.en-ru.zipporah0-dedup-clean/paracrawl-release1.en-ru.zipporah0-dedup-clean.ru', 'r') as f:
    ru = f.readlines()
    ru = [text.strip() for text in ru]

pc_data = list(zip(en, ru))
random.shuffle(pc_data)
pc_data_clean = apply_filters(pc_data[:1000000])

100%|██████████| 1000000/1000000 [00:24<00:00, 41069.38it/s]
100%|██████████| 1000000/1000000 [01:09<00:00, 14298.90it/s]
100%|██████████| 446720/446720 [00:01<00:00, 429188.60it/s]


56.706% deleted, 432939 left


In [10]:
train_pairs_df = pd.DataFrame(train_pairs, columns=['en', 'ru'])

In [11]:
en_ru_dir = 'data/en-ru'
if not os.path.exists(en_ru_dir):
    os.makedirs(en_ru_dir)
train_pairs_df.to_csv(os.path.join(en_ru_dir, 'train.csv'))