In [None]:
import requests
import os
import shutil
import tarfile
import gzip
from tqdm import tqdm

data_dir = 'data_src'

def download_file(url: str, repeat_download=False):
    print(f"Downloading {url}")
    zip_path = url.split('/')[-1]
    zip_path = os.path.join(data_dir, zip_path)
    output_path = '.'.join(zip_path.split('.')[:-1])

    if not os.path.exists(zip_path) or repeat_download:
        response = requests.get(url, stream=True)

        total_size = int(response.headers.get("content-length", 0))
        block_size = 1024
        with tqdm(total=total_size, unit="B", unit_scale=True) as progress_bar:
            with open(zip_path, "wb") as file:
                for data in response.iter_content(block_size):
                    progress_bar.update(len(data))
                    file.write(data)
        print(f"Downloaded to {zip_path}")

    if not os.path.exists(output_path):
        try:
            file = tarfile.open(zip_path) 
            file.extractall(output_path) 
            file.close()
            print(f"Extracted to {output_path}")
        except Exception:
            try:
                with gzip.open(zip_path, 'rb') as f_in:
                    with open(output_path, 'wb') as f_out:
                        shutil.copyfileobj(f_in, f_out)
                print(f"Extracted to {output_path}")
            except Exception:
                print(f"Failed to extract {zip_path}")
    else:
        print(f"{output_path} exists")

In [33]:
file_urls = [
    'http://data.statmt.org/wmt19/translation-task/test.tgz',
    'https://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz',
    'https://s3.amazonaws.com/web-language-models/paracrawl/release1/paracrawl-release1.en-ru.zipporah0-dedup-clean.tgz',
    'https://data.statmt.org/news-commentary/v14/training/news-commentary-v14.en-ru.tsv.gz',
    'https://data.statmt.org/wikititles/v1/wikititles-v1.ru-en.tsv.gz',
]

if not os.path.exists(data_dir):
    os.makedirs(data_dir)

for url in file_urls:
    download_file(url)

Downloading http://data.statmt.org/wmt19/translation-task/test.tgz
data/test exists
Downloading https://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz
data/training-parallel-commoncrawl exists
Downloading https://s3.amazonaws.com/web-language-models/paracrawl/release1/paracrawl-release1.en-ru.zipporah0-dedup-clean.tgz
data/paracrawl-release1.en-ru.zipporah0-dedup-clean exists
Downloading https://data.statmt.org/news-commentary/v14/training/news-commentary-v14.en-ru.tsv.gz
data/news-commentary-v14.en-ru.tsv exists
Downloading https://data.statmt.org/wikititles/v1/wikititles-v1.ru-en.tsv.gz
data/wikititles-v1.ru-en.tsv exists


In [165]:
import re
import string
from langdetect import detect, DetectorFactory, LangDetectException

DetectorFactory.seed = 0

def NormStr(s: str):
    s = s.lower()
    s = re.sub(r'\s+', ' ', s)
    s = re.sub(r'[“«»”]', '"', s)
    s = re.sub(r'[’‘]', '\'',s)
    s = re.sub(r'[\[\{]', '(', s)
    s = re.sub(r'[\]\}]', ')', s)
    s = re.sub(r'[—–]', '-', s)
    return s

def CheckStr(s: str, lang):
    if re.search(fr'[^a-zа-я\d\s{string.punctuation}]', s) is not None:
        return False
    if detect(s) != lang:
        return False
    return True

def CheckPair(pair):
    en, ru = pair
    en = NormStr(en)
    ru = NormStr(ru)
    en_nums = sorted(re.findall(r'\d+', en))
    ru_nums = sorted(re.findall(r'\d+', ru))
    if en_nums != ru_nums:
        return False
    if not CheckStr(en, 'en') or not CheckStr(ru, 'ru'):
        return False
    return True
    

def SimpleFilter(data):
    data = list(filter(CheckPair, tqdm(data)))
    return data

def DedupFilter(data):
    en_sents = set()
    ru_sents = set()
    new_data = []
    for en, ru in tqdm(data):
        if en in en_sents or ru in ru_sents:
            continue
        en_sents.add(en)
        ru_sents.add(ru)
        new_data.append((en, ru))
    return new_data

def apply_filters(data):
    filters = [
        SimpleFilter,
        DedupFilter
    ]
    for f in filters:
        start_count = len(data)
        data = f(data)
        print(f"{(1 - len(data) / start_count) * 100}% deleted, {len(data)} left")
    return data

In [None]:
import random
random.seed(42)

# data_en_ru = 'data/en_ru'

# if not os.path.exists(data_en_ru):
#     os.makedirs(data_en_ru)

# Build Train
train_pairs = []

#CommonCrawl
with open('data/training-parallel-commoncrawl/commoncrawl.ru-en.en', 'r') as f:
    en = f.readlines()
    en = [text.strip() for text in en]
with open('data/training-parallel-commoncrawl/commoncrawl.ru-en.ru', 'r') as f:
    ru = f.readlines()
    ru = [text.strip() for text in ru]
train_pairs.extend(list(zip(en, ru)))

#ParaCrawl
with open('data/training-parallel-commoncrawl/commoncrawl.ru-en.en', 'r') as f:
    en = f.readlines()
    en = [text.strip() for text in en]

100%|██████████| 10000/10000 [00:34<00:00, 287.98it/s]


31.489999999999995% deleted, 6851 left


100%|██████████| 6851/6851 [00:00<00:00, 641453.15it/s]

1.0217486498321438% deleted, 6781 left





In [169]:
with open('data/paracrawl-release1.en-ru.zipporah0-dedup-clean/paracrawl-release1.en-ru.zipporah0-dedup-clean.en', 'r') as f:
    en = f.readlines()
    en = [text.strip() for text in en]

: 

In [168]:
cc_data_clean

[('It is performed on all types of skin and neither anesthesia nor sick leave is required.',
  'Он применяется для всех типов кожи и не требует ни анестезии, ни приостановления профессиональной деятельности.'),
 ('We tried to make lyrics as correct as possible, however if you have any corrections for Katie Wants A Fast One lyrics, please feel free to submit them to us.',
  'Вы так же можете скачать перевод текста песни Wariner Steve Katie Wants A Fast One здесь . Мы стараемся сделать так, чтобы слова песни Katie Wants A Fast One были наиболее точными, поэтому если у вас есть какие-то корректировки текста, пожалуйста отправляйте их нам.'),
 ('Hydrofusion, chromatherapy, music therapy, infra-red heat, vibrant massage – this is far not full list of procedures that you can have in SPA-capsule.',
  'Гидрофузия, хроматерапия, музыкотерапия, инфракрасное тепло, вибромассаж - вот далеко не полный список процедур, которые Вы можете пройти благодаря спа-капсуле.'),
 ('The Cabinet of Ministers re

In [None]:
import re
import string
from collections import Counter

en_valid = string.punctuation + string.ascii_letters + string.digits + string.whitespace
en = list(map(lambda x: x[0], cc_data_clean))
ru = list(map(lambda x: x[1], cc_data_clean))
en_chars = re.sub(fr'[a-zа-я\d\s{string.punctuation}]', '', NormStr(''.join(en)))
en_char_counts = Counter(''.join(en_chars))
ru_char_counts = Counter(''.join(ru))
en_char_counts

Counter({'é': 445,
         '\xad': 384,
         'á': 248,
         '®': 234,
         'ü': 198,
         '�': 193,
         '…': 174,
         '•': 160,
         'ä': 156,
         '™': 150,
         'ö': 148,
         'š': 130,
         'ó': 114,
         '€': 111,
         '°': 97,
         'í': 92,
         '„': 85,
         '´': 75,
         'à': 75,
         '\\': 74,
         'ç': 70,
         '\x97': 69,
         '\x93': 61,
         '²': 59,
         '\x94': 59,
         'ø': 58,
         'ł': 55,
         'ī': 55,
         'č': 54,
         'è': 53,
         'ā': 52,
         'ž': 51,
         'ē': 51,
         'ã': 47,
         'å': 38,
         'ñ': 37,
         '№': 37,
         '©': 36,
         'ı': 36,
         'ô': 34,
         'ý': 33,
         'ê': 33,
         '\x92': 29,
         'ń': 27,
         'õ': 27,
         'ß': 25,
         'â': 24,
         '·': 24,
         '̇': 24,
         'ć': 23,
         '³': 22,
         'ś': 21,
         'æ': 21,
         'ú': 21

In [116]:
import string

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'