In [1]:
import site
site.addsitedir('../tmxutil')
from tmxutil import make_reader, TranslationUnit, TranslationUnitVariant
import unidecode
import string
from xxhash import xxh64
from notebook import table, first
from tqdm.autonotebook import tqdm
from logging import getLogger, ERROR
getLogger().setLevel(ERROR) # Hide warnings while importing

def checksum(unit):
    return xxh64("\t".join(
        unidecode.unidecode(translation.text.lower().replace(" ", "").translate(str.maketrans('', '', string.punctuation + string.digits)))
        for translation in unit.translations.values())).hexdigest()

def read(filename):
    with open(filename, 'rb') as fh:
        for unit in make_reader(fh, progress=True):
            yield unit

  from tqdm.autonotebook import tqdm


In [2]:
files = {
    'paracrawl-7.1': 'en-mt.tmx.gz',
    'paracrawl-8': 'en-mt.cc-2016-30-cc-2017-30-cc-2018-30-cc-2019-18-cc-2019-35-gwb-hieu-marta-philipp-wide00006-wide00015.tmx.gz'
}

In [3]:
hashes = {
    release: frozenset(checksum(unit) for unit in read(filename))
    for release, filename in files.items()
}

en-mt.tmx.gz:   0%|          | 0.00/197M [00:00<?, ?b/s]

en-mt.cc-2016-30-cc-2017-30-cc-2018-30-cc-2019-18-cc-2019-35-gwb-hieu-marta-philipp-wide00006-wide00015.tmx.gz…

# Overlap between releases
How many sentence pairs are in one while not in the other, and how many are in both?

In [4]:
len(hashes['paracrawl-7.1'] & hashes['paracrawl-8']) / len(hashes['paracrawl-8']) # Overlap

0.18467959367509593

In [5]:
len(hashes['paracrawl-8'] - hashes['paracrawl-7.1']) / len(hashes['paracrawl-8']) # Novel

0.8153204063249041

In [6]:
len(hashes['paracrawl-7.1'] - hashes['paracrawl-8']) / len(hashes['paracrawl-7.1']) # Lost

0.6543089586479063

Okay, so about 65% of all sentence pairs in 7.1 do not return in 8? Why? Are these from domains that we marked as likely MT? Are they low quality?

In [7]:
from stats import RandomSample
from collections import Counter
import re

def domain(url):
    """Take the (very broadly defined) domain part of an url, or return the
    full url if that didn't work."""
    match = re.match(r'^(https?:)?(//)?(?P<domain>[^/]+)', url)
    return match.group('domain')

lost_hashes = hashes['paracrawl-7.1'] - hashes['paracrawl-8']

sample = RandomSample(100)

domains = Counter()

collections_overall = Counter()

collections_missing = Counter()

for unit in read(files['paracrawl-7.1']):
    collections_overall.update(unit['collection'])
    if checksum(unit) in lost_hashes:
        collections_missing.update(unit['collection'])
        sample.add(unit)
        for translation in unit.translations.values():
            domains.update(domain(url) for url in translation['source-document'])

en-mt.tmx.gz:   0%|          | 0.00/197M [00:00<?, ?b/s]

In [8]:
import notebook
import importlib
notebook = importlib.reload(notebook)
table = notebook.table

In [9]:
table({
    'score': first(unit['score-bicleaner']),
    'collection': unit['collection'],
    **{lang: translation.text for lang, translation in unit.translations.items()}
} for unit in sample)

score,collection,en,mt
0.720,hieu,Sevelamer carbonate Zentiva 800 mg film-coated tablets,Sevelamer carbonate Zentiva 800 mg pilloli miksija b'rita
hieu,,,
0.800,philipp,It's a bitch and is not so much for being a Bearded Collie.,Huwa kelba u mhux daqshekk kbira li jkun Collie Bearded.
philipp,,,
0.870,philipp,"The images of people, animals and plants always be exaggerated, deformed, or time-warped, or out of order.","L-istampi ta 'nies, annimali u pjanti dejjem ikunu eżaġerati, deformat, jew iż-warped, jew out of order."
philipp,,,
0.822,philipp,"So to finish this module, let’s have a look at the topics that we covered.","Allura biex jintemm dan il-modulu, ejja agħti ħarsa lejn is-suġġetti li aħna koperti."
philipp,,,
0.864,philipp,"I did not mean to disrupt you ""AS Sordid came close enough to touch the boy he grabbed Grunt's arm and pulled him to his feet and said with a calm voice"" A good idea.","Nota I ma jfissirx li Sfratta int ""Kif sordid daħal qrib biżżejjed tmissx il-boy hu driegħ grabbed Grunt u ġenna miġbud għall saqajh u qal b'vuċi kalm"" A idea tajba."
philipp,,,

0
hieu

0
philipp

0
philipp

0
philipp

0
philipp

0
hieu

0
wide00006

0
philipp

0
wide00015

0
hieu

0
philipp

0
philipp

0
philipp

0
philipp

0
philipp

0
philipp

0
philipp

0
hieu
philipp

0
philipp

0
philipp

0
hieu
philipp

0
wide00006

0
hieu

0
wide00006

0
hieu

0
philipp

0
wide00006

0
wide00006

0
philipp

0
wide00015

0
philipp

0
philipp

0
hieu
philipp

0
philipp

0
wide00006

0
philipp

0
philipp

0
hieu
philipp

0
wide00006

0
wide00006-pdf

0
wide00006-pdf

0
philipp

0
philipp

0
philipp

0
hieu

0
philipp

0
philipp

0
philipp

0
philipp

0
philipp

0
philipp

0
hieu
philipp

0
philipp

0
philipp

0
philipp

0
hieu

0
philipp

0
wide00006

0
philipp

0
wide00006-pdf

0
philipp

0
hieu

0
hieu

0
wide00006

0
wide00006-pdf

0
philipp

0
philipp

0
hieu
philipp

0
wide00015

0
wide00006

0
philipp

0
philipp

0
wide00006

0
hieu

0
philipp

0
philipp

0
hieu

0
philipp

0
hieu

0
philipp

0
philipp

0
philipp

0
philipp

0
philipp

0
hieu

0
philipp

0
hieu
philipp

0
philipp

0
hieu

0
philipp

0
philipp

0
philipp

0
wide00006-pdf

0
hieu

0
philipp

0
philipp

0
hieu

0
philipp

0
hieu

0
hieu


# Missing Collections
Is it maybe a particular collection that is missing?


In [10]:
table(collections_overall.items(), headers=['collection', 'count'])

collection,count
wide00006,59626
hieu,181729
philipp,613267
wide00006-pdf,26406
wide00015,33655


In [11]:
table(collections_missing.items(), headers=['collection', 'count'])

collection,count
wide00006,54425
hieu,126298
philipp,375491
wide00006-pdf,24841
wide00015,14459


In [12]:
from operator import itemgetter
table({
    'collection': collection,
    'percentage': '{:0.1f}%'.format(100 * collections_missing[collection] / count),
    'records': collections_missing[collection],
    'total': count
} for collection, count in sorted(collections_overall.items(), key=itemgetter(1), reverse=True))

collection,percentage,records,total
philipp,61.2%,375491,613267
hieu,69.5%,126298,181729
wide00006,91.3%,54425,59626
wide00015,43.0%,14459,33655
wide00006-pdf,94.1%,24841,26406


Hmm, correct, pdfs haven't been processed for this release. Philipp has been processed with a different pipeline. But I have no plausible explanation for Hieu or wide00006/15 in terms of processing. Maybe MT filter?

# Missing web domains
Is there anything particular about the missing sentence pairs? Maybe they're all from domains that got filtered because of MT?

In [13]:
table({
    'domain': f"http://{domain_name}/",
    'count': count
} for domain_name, count in domains.most_common(15))

domain,count
http://straightpoint.com/,481378
http://www.welcome-to-barcelona.com/,195795
http://www.straightpoint.com/,151848
http://transposh.org/,151056
http://2fish.co/,113978
http://builttobrag.com/,112523
http://www.outlookimport.com/,102456
http://blog.simmakers.com/,98508
http://www.conex.net/,88135
http://therefugeecenter.org/,85340


Okay I'm not going to do this by hand let's quickly implement something that matches the behaviour of warc2text's mt filter, import our filter list, and *make the assumption the page we processed is the same as the page that is online today*.

In [14]:
with open('mt-filter-list.txt', 'r') as fh:
    mt_filter_list = fh.read()

In [15]:
from bs4 import BeautifulSoup
from urllib.request import urlopen

def check_for_mt(url, filter_list):
    with urlopen(url, timeout=10) as fh:
        data = fh.read(5 * 1024 * 1024) # limit to 5mb
        soup = BeautifulSoup(data, 'html.parser')
        
        for line in filter_list.split('\n'):
            if line.startswith('#') or line.strip() == '':
                continue
            elment, attribute, pattern = line.split('\t')
            if soup.find_all(elment, attrs={attribute: re.compile(pattern)}):
                return line, fh.url
        return None, fh.url

In [16]:
check_for_mt('http://straightpoint.com/', mt_filter_list)

("a\tonclick\tdoGTranslate\\(\\'.{2}\\|.{2}\\'\\)",
 'https://straightpoint.com/')

So let's first look at the top 30 missing domains

In [17]:
def check_domains(n):
    for domain_name, count in domains.most_common(n):
        try:
            hit, url = check_for_mt(f"http://{domain_name}/", mt_filter_list)
        except Exception as e:
            hit, url = str(e), f'http://{domain_name}'
        yield url, count, hit

table(check_domains(30), headers=['domain', 'count', 'matched filter'])

domain,count,matched filter
https://straightpoint.com/,481378,a	onclick	doGTranslate\(\'.{2}\|.{2}\'\)
http://www.welcome-to-barcelona.com/,195795,
https://www.straightpoint.com/,151848,a	onclick	doGTranslate\(\'.{2}\|.{2}\'\)
http://transposh.org/,151056,meta	name	translation-stats
https://2fish.co/,113978,
https://builttobrag.com/,112523,
https://www.outlookimport.com/,102456,
https://blog.simmakers.com/,98508,
https://www.conex.net/,88135,
http://therefugeecenter.org,85340,HTTP Error 502: Bad Gateway


Of course, we're not targeting home pages in our sentence pairs, but lower level pages. I'm going to use the sampled sentence pairs and look at their urls.

In [18]:
def check_unit_for_mt(unit):
    error = None
    
    for url in set().union(*(translation['source-document'] for translation in unit.translations.values())):
        # Fix for data from philipp and hieu collections: urls have no prefix.
        if not url.startswith('http'):
            url = f'http://{url}'
        
        try:
            hit, url = check_for_mt(url, mt_filter_list)
            if hit:
                return hit, url
        except Exception as e:
            error = str(e)
        
    return error, None


checked_units = [
    (unit, *check_unit_for_mt(unit))
    for unit in tqdm(list(sample))
]


  0%|          | 0/100 [00:00<?, ?it/s]

In [19]:
table({
    'url': url or first(set.union(*(t['source-document'] for t in unit.translations.values()))),
    'filter': hit,
    'collections': unit['collection']
} for unit, hit, url in checked_units)

url,filter,collections
myhealthbox.eu/cs/lék/potactasol/3544604,'ascii' codec can't encode character '\xe9' in position 9: ordinal not in range(128),hieu
hieu,,
http://skolarbete.nu/mt/skolarbeten/2008/02/page/2/,HTTP Error 500: Internal Server Error,philipp
philipp,,
http://www.oil-painting-online.com/mt/symbolism-oil-painting/cao-li-oil-painting-art.html,link	rel	alternate machine-translated-from,philipp
philipp,,
http://kaizenlog.com/80657ae-finance-advanced-microsoft-dynamics-nav-2015/2/,HTTP Error 403: Forbidden,philipp
philipp,,
http://skolarbete.nu/mt/skolarbeten/the-dragon-reborn/,HTTP Error 500: Internal Server Error,philipp
philipp,,

0
hieu

0
philipp

0
philipp

0
philipp

0
philipp

0
hieu

0
wide00006

0
philipp

0
wide00015

0
hieu

0
philipp

0
philipp

0
philipp

0
philipp

0
philipp

0
philipp

0
philipp

0
hieu
philipp

0
philipp

0
philipp

0
hieu
philipp

0
wide00006

0
hieu

0
wide00006

0
hieu

0
philipp

0
wide00006

0
wide00006

0
philipp

0
wide00015

0
philipp

0
philipp

0
hieu
philipp

0
philipp

0
wide00006

0
philipp

0
philipp

0
hieu
philipp

0
wide00006

0
wide00006-pdf

0
wide00006-pdf

0
philipp

0
philipp

0
philipp

0
hieu

0
philipp

0
philipp

0
philipp

0
philipp

0
philipp

0
philipp

0
hieu
philipp

0
philipp

0
philipp

0
philipp

0
hieu

0
philipp

0
wide00006

0
philipp

0
wide00006-pdf

0
philipp

0
hieu

0
hieu

0
wide00006

0
wide00006-pdf

0
philipp

0
philipp

0
hieu
philipp

0
wide00015

0
wide00006

0
philipp

0
philipp

0
wide00006

0
hieu

0
philipp

0
philipp

0
hieu

0
philipp

0
hieu

0
philipp

0
philipp

0
philipp

0
philipp

0
philipp

0
hieu

0
philipp

0
hieu
philipp

0
philipp

0
hieu

0
philipp

0
philipp

0
philipp

0
wide00006-pdf

0
hieu

0
philipp

0
philipp

0
hieu

0
philipp

0
hieu

0
hieu


In [20]:
def count(iterable):
    return sum(1 for _ in iterable)

Number of units that were accessible but did trigger an MT detection rule

In [21]:
count(unit for unit, hit, url in checked_units if url is not None)

17

Number of units that were accessible and did not trigger the MT system

In [22]:
count(unit for unit, hit, url in checked_units if url is None and hit is None)

22

Number of units that were no longer accessible on any of its associated source documents

In [23]:
count(unit for unit, hit, url in checked_units if url is None and hit is not None)

61

In [27]:
notebook.table(Counter(hit for _, hit, url in checked_units if url is None and hit is not None).most_common())

0,1
HTTP Error 403: Forbidden,25
HTTP Error 404: Not Found,17
HTTP Error 500: Internal Server Error,8
"<urlopen error [Errno 8] nodename nor servname provided, or not known>",6
'ascii' codec can't encode character '\xe9' in position 9: ordinal not in range(128),1
<urlopen error timed out>,1
'ascii' codec can't encode character '\xa0' in position 21: ordinal not in range(128),1
HTTP Error 429: Too Many Requests,1
HTTP Error 503: Service Temporarily Unavailable,1
