In [60]:
import site
site.addsitedir('../tmxutil')
from tmxutil import make_reader, TranslationUnit, TranslationUnitVariant
import unidecode
import string
from xxhash import xxh64
from notebook import table, first
from tqdm.autonotebook import tqdm
from logging import getLogger, ERROR
getLogger().setLevel(ERROR) # Hide warnings while importing

def checksum(unit):
    return xxh64("\t".join(
        unidecode.unidecode(translation.text.lower().replace(" ", "").translate(str.maketrans('', '', string.punctuation + string.digits)))
        for translation in unit.translations.values())).hexdigest()

def read(filename):
    with open(filename, 'rb') as fh:
        for unit in make_reader(fh, progress=True):
            yield unit

  from tqdm.autonotebook import tqdm


In [61]:
files = {
    'paracrawl-7.1': 'en-mt.tmx.gz',
    'paracrawl-8': 'en-mt.cc-2016-30-cc-2017-30-cc-2018-30-cc-2019-18-cc-2019-35-gwb-hieu-marta-philipp-wide00006-wide00015.tmx.gz'
}

In [62]:
hashes = {
    release: frozenset(checksum(unit) for unit in read(filename))
    for release, filename in files.items()
}

en-mt.tmx.gz:   0%|          | 0.00/197M [00:00<?, ?b/s]

en-mt.cc-2016-30-cc-2017-30-cc-2018-30-cc-2019-18-cc-2019-35-gwb-hieu-marta-philipp-wide00006-wide00015.tmx.gz…

# Overlap between releases
How many sentence pairs are in one while not in the other, and how many are in both?

In [63]:
len(hashes['paracrawl-7.1'] & hashes['paracrawl-8']) / len(hashes['paracrawl-8']) # Overlap

0.18467959367509593

In [64]:
len(hashes['paracrawl-8'] - hashes['paracrawl-7.1']) / len(hashes['paracrawl-8']) # Novel

0.8153204063249041

In [65]:
len(hashes['paracrawl-7.1'] - hashes['paracrawl-8']) / len(hashes['paracrawl-7.1']) # Lost

0.6543089586479063

Okay, so about 65% of all sentence pairs in 7.1 do not return in 8? Why? Are these from domains that we marked as likely MT? Are they low quality?

In [66]:
from stats import RandomSample
from collections import Counter
import re

def domain(url):
    """Take the (very broadly defined) domain part of an url, or return the
    full url if that didn't work."""
    match = re.match(r'^(https?:)?(//)?(?P<domain>[^/]+)', url)
    return match.group('domain')

lost_hashes = hashes['paracrawl-7.1'] - hashes['paracrawl-8']

sample = RandomSample(1000)

domains = Counter()

collections_overall = Counter()

collections_missing = Counter()

for unit in read(files['paracrawl-7.1']):
    collections_overall.update(unit['collection'])
    if checksum(unit) in lost_hashes:
        collections_missing.update(unit['collection'])
        sample.add(unit)
        for translation in unit.translations.values():
            domains.update(domain(url) for url in translation['source-document'])

en-mt.tmx.gz:   0%|          | 0.00/197M [00:00<?, ?b/s]

In [8]:
import notebook
import importlib
notebook = importlib.reload(notebook)
table = notebook.table

In [67]:
table({
    'score': first(unit['score-bicleaner']),
    'collection': unit['collection'],
    **{lang: translation.text for lang, translation in unit.translations.items()}
} for unit in sample[:100])

TypeError: 'RandomSample' object is not subscriptable

# Missing Collections
Is it maybe a particular collection that is missing?


In [10]:
table(collections_overall.items(), headers=['collection', 'count'])

collection,count
wide00006,59626
hieu,181729
philipp,613267
wide00006-pdf,26406
wide00015,33655


In [11]:
table(collections_missing.items(), headers=['collection', 'count'])

collection,count
wide00006,54425
hieu,126298
philipp,375491
wide00006-pdf,24841
wide00015,14459


In [12]:
from operator import itemgetter
table({
    'collection': collection,
    'percentage': '{:0.1f}%'.format(100 * collections_missing[collection] / count),
    'records': collections_missing[collection],
    'total': count
} for collection, count in sorted(collections_overall.items(), key=itemgetter(1), reverse=True))

collection,percentage,records,total
philipp,61.2%,375491,613267
hieu,69.5%,126298,181729
wide00006,91.3%,54425,59626
wide00015,43.0%,14459,33655
wide00006-pdf,94.1%,24841,26406


Hmm, correct, pdfs haven't been processed for this release. Philipp has been processed with a different pipeline. But I have no plausible explanation for Hieu or wide00006/15 in terms of processing. Maybe MT filter?

# Missing web domains
Is there anything particular about the missing sentence pairs? Maybe they're all from domains that got filtered because of MT?

In [13]:
table({
    'domain': f"http://{domain_name}/",
    'count': count
} for domain_name, count in domains.most_common(15))

domain,count
http://straightpoint.com/,481378
http://www.welcome-to-barcelona.com/,195795
http://www.straightpoint.com/,151848
http://transposh.org/,151056
http://2fish.co/,113978
http://builttobrag.com/,112523
http://www.outlookimport.com/,102456
http://blog.simmakers.com/,98508
http://www.conex.net/,88135
http://therefugeecenter.org/,85340


Okay I'm not going to do this by hand let's quickly implement something that matches the behaviour of warc2text's mt filter, import our filter list, and *make the assumption the page we processed is the same as the page that is online today*.

In [14]:
with open('mt-filter-list.txt', 'r') as fh:
    mt_filter_list = fh.read()

In [15]:
from bs4 import BeautifulSoup
from urllib.request import urlopen

def check_for_mt(url, filter_list):
    with urlopen(url, timeout=10) as fh:
        data = fh.read(5 * 1024 * 1024) # limit to 5mb
        soup = BeautifulSoup(data, 'html.parser')
        
        for line in filter_list.split('\n'):
            if line.startswith('#') or line.strip() == '':
                continue
            elment, attribute, pattern = line.split('\t')
            if soup.find_all(elment, attrs={attribute: re.compile(pattern)}):
                return line, fh.url
        return None, fh.url

In [16]:
check_for_mt('http://straightpoint.com/', mt_filter_list)

("a\tonclick\tdoGTranslate\\(\\'.{2}\\|.{2}\\'\\)",
 'https://straightpoint.com/')

So let's first look at the top 30 missing domains

In [17]:
def check_domains(n):
    for domain_name, count in domains.most_common(n):
        try:
            hit, url = check_for_mt(f"http://{domain_name}/", mt_filter_list)
        except Exception as e:
            hit, url = str(e), f'http://{domain_name}'
        yield url, count, hit

table(check_domains(30), headers=['domain', 'count', 'matched filter'])

domain,count,matched filter
https://straightpoint.com/,481378,a	onclick	doGTranslate\(\'.{2}\|.{2}\'\)
http://www.welcome-to-barcelona.com/,195795,
https://www.straightpoint.com/,151848,a	onclick	doGTranslate\(\'.{2}\|.{2}\'\)
http://transposh.org/,151056,meta	name	translation-stats
https://2fish.co/,113978,
https://builttobrag.com/,112523,
https://www.outlookimport.com/,102456,
https://blog.simmakers.com/,98508,
https://www.conex.net/,88135,
http://therefugeecenter.org,85340,HTTP Error 502: Bad Gateway


Of course, we're not targeting home pages in our sentence pairs, but lower level pages. I'm going to use the sampled sentence pairs and look at their urls.

In [18]:
def check_unit_for_mt(unit):
    error = None
    
    for url in set().union(*(translation['source-document'] for translation in unit.translations.values())):
        # Fix for data from philipp and hieu collections: urls have no prefix.
        if not url.startswith('http'):
            url = f'http://{url}'
        
        try:
            hit, url = check_for_mt(url, mt_filter_list)
            if hit:
                return hit, url
        except Exception as e:
            error = str(e)
        
    return error, None


checked_units = [
    (unit, *check_unit_for_mt(unit))
    for unit in tqdm(list(sample))
]


  0%|          | 0/100 [00:00<?, ?it/s]

In [19]:
table({
    'url': url or first(set.union(*(t['source-document'] for t in unit.translations.values()))),
    'filter': hit,
    'collections': unit['collection']
} for unit, hit, url in checked_units)

url,filter,collections
myhealthbox.eu/cs/lék/potactasol/3544604,'ascii' codec can't encode character '\xe9' in position 9: ordinal not in range(128),hieu
hieu,,
http://skolarbete.nu/mt/skolarbeten/2008/02/page/2/,HTTP Error 500: Internal Server Error,philipp
philipp,,
http://www.oil-painting-online.com/mt/symbolism-oil-painting/cao-li-oil-painting-art.html,link	rel	alternate machine-translated-from,philipp
philipp,,
http://kaizenlog.com/80657ae-finance-advanced-microsoft-dynamics-nav-2015/2/,HTTP Error 403: Forbidden,philipp
philipp,,
http://skolarbete.nu/mt/skolarbeten/the-dragon-reborn/,HTTP Error 500: Internal Server Error,philipp
philipp,,

0
hieu

0
philipp

0
philipp

0
philipp

0
philipp

0
hieu

0
wide00006

0
philipp

0
wide00015

0
hieu

0
philipp

0
philipp

0
philipp

0
philipp

0
philipp

0
philipp

0
philipp

0
hieu
philipp

0
philipp

0
philipp

0
hieu
philipp

0
wide00006

0
hieu

0
wide00006

0
hieu

0
philipp

0
wide00006

0
wide00006

0
philipp

0
wide00015

0
philipp

0
philipp

0
hieu
philipp

0
philipp

0
wide00006

0
philipp

0
philipp

0
hieu
philipp

0
wide00006

0
wide00006-pdf

0
wide00006-pdf

0
philipp

0
philipp

0
philipp

0
hieu

0
philipp

0
philipp

0
philipp

0
philipp

0
philipp

0
philipp

0
hieu
philipp

0
philipp

0
philipp

0
philipp

0
hieu

0
philipp

0
wide00006

0
philipp

0
wide00006-pdf

0
philipp

0
hieu

0
hieu

0
wide00006

0
wide00006-pdf

0
philipp

0
philipp

0
hieu
philipp

0
wide00015

0
wide00006

0
philipp

0
philipp

0
wide00006

0
hieu

0
philipp

0
philipp

0
hieu

0
philipp

0
hieu

0
philipp

0
philipp

0
philipp

0
philipp

0
philipp

0
hieu

0
philipp

0
hieu
philipp

0
philipp

0
hieu

0
philipp

0
philipp

0
philipp

0
wide00006-pdf

0
hieu

0
philipp

0
philipp

0
hieu

0
philipp

0
hieu

0
hieu


In [20]:
def count(iterable):
    return sum(1 for _ in iterable)

Number of units that were accessible but did trigger an MT detection rule

In [21]:
count(unit for unit, hit, url in checked_units if url is not None)

17

Number of units that were accessible and did not trigger the MT system

In [22]:
count(unit for unit, hit, url in checked_units if url is None and hit is None)

22

Number of units that were no longer accessible on any of its associated source documents

In [23]:
count(unit for unit, hit, url in checked_units if url is None and hit is not None)

61

In [27]:
notebook.table(Counter(hit for _, hit, url in checked_units if url is None and hit is not None).most_common())

0,1
HTTP Error 403: Forbidden,25
HTTP Error 404: Not Found,17
HTTP Error 500: Internal Server Error,8
"<urlopen error [Errno 8] nodename nor servname provided, or not known>",6
'ascii' codec can't encode character '\xe9' in position 9: ordinal not in range(128),1
<urlopen error timed out>,1
'ascii' codec can't encode character '\xa0' in position 21: ordinal not in range(128),1
HTTP Error 429: Too Many Requests,1
HTTP Error 503: Service Temporarily Unavailable,1


Just to make sure, we're not seeing anything odd in terms of units that matched across shards, right?

In [101]:
from publicsuffixlist import PublicSuffixList
from urllib.parse import urlparse
import fnv
import re

psl = PublicSuffixList()

def urls(unit):
    return [url for tu in unit.translations.values() for url in tu['source-document']]

def slug(key):
    """https://github.com/paracrawl/giashard/blob/master/shard.go#L88"""
    url = urlparse(key)
    if url.netloc:
        host = re.sub(r':\d+$', '', url.netloc)
    else:
        match = re.match(r'^([a-zA-Z0-9][a-zA-Z0-9\-.]*[a-zA-Z0-9]).*', key)
        if not match:
            raise ValueError(f'Unable to determine host using regexp from {key}')
        host = match.group(1)
    
    public = psl.publicsuffix(host)
    private = psl.privatesuffix(host)
    
    
    if not public or not private:
        match = re.match('^([^/]+).*', host)
        if not match or len(match.group(1)) == 0:
            raise ValueError(f'Unable to determine slug by parsing {host} from {key}')
        slug = match.group(1)
    else:
        slug = re.sub('\.?' + re.escape(public) + '$', '', private)
    
    if not slug:
        raise ValueError(f'Unable to determine slug from {key}')
    
    return slug

def shard_id(key, n=8):
    """https://github.com/paracrawl/giashard/blob/master/shard.go#L120"""
    return fnv.hash(slug(key).encode(), algorithm=fnv.fnv, bits=64) % (1 << n)

In [109]:
weird = [
    unit
    for unit in philipp #read(files['paracrawl-7.1'])
    if len(
        frozenset(shard_id(url, 8) for url in unit.translations['en']['source-document']) ^
        frozenset(shard_id(url, 8) for url in unit.translations['en']['source-document'])
    ) != 0
]

weird

[]

In [107]:
philipp = [unit for unit in read(files['paracrawl-7.1']) if unit['collection'] == set(['philipp'])]

en-mt.tmx.gz:   0%|          | 0.00/197M [00:00<?, ?b/s]

In [108]:
len(philipp)

557395

In [111]:
inside = RandomSample(100)
outside = RandomSample(100)

for unit in philipp:
    if checksum(unit) in hashes['paracrawl-8']:
        inside.add(unit)
    else:
        outside.add(unit)

In [112]:
inside.n, outside.n

(216111, 341284)

In [115]:
table({lang: tuv.text for lang, tuv in unit.translations.items()} for unit in outside)

en,mt
"Later on, he turned to watches which were much smaller and required more attention.","Minn hemm ma damx ma dar għall-arloġġi tal-idejn, li għalkemm kienu iżgħar u b’hekk kien jeħtiġielhom iktar attenzjoni, xorta waħda sab irkaptu tagħhom."
"George took a black girl to prom, and mentored young black children.","George ħa tifla sewda promettenti, u mentored tfal iswed żgħażagħ."
To develop a new variety can take up to 15 years.,Biex tiżviluppa varjetà ġdida tista 'tieħu sa 15-il sena.
"China New Wireless Hearing Aid Amplifier Walmart With Noise Cancellation And Receiver In Canal Ric Hearing Aids Online Manufacturers,Suppliers and Factory - Wholesale Products - EarsMATE","Iċ-Ċina Ġdida Wireless Smigħ Amplifikatur Walmart Bil-Kanċellazzjoni tal-Istorbju u Riċevitur Fil-Canal Ric Għajnuniet tas-Smigħ Online Manifatturi, Fornituri u Fabbrika - Prodotti bl-ingrossa - EarsMATE"
"Last week the President of the Central Bank of the European Union, Jean-Claude Trichet, took the trouble to come to Malta to take part in the opening of the propaganda campaign spread over three months, in favour of the Euro currency, and to speak about it at a conference organized by the National Committee about the Ewro (the NECC) and the Central Bank of Malta, with the help of the European Union.","Se nħallsu prezz għoli għall-Ewro Il-Ġimgħa l-oħra l-President tal-Bank Ċentrali ta’ l-Unjoni Ewropea, Jean-Claude Trichet, qagħad jiskomoda ruħu u jiġi f’Malta ħalli jieħu sehem fil-ftuħ ta’ kampanja ta’ propaganda mifruxa fuq tliet xhur, favur il-munita Ewro, u biex jitkellem f’konferenza dwarha, organizzata mill-Kumitat Nazzjonali dwar l-Ewro (il-NECC) u l-Bank Ċentrali ta’ Malta, bl-għajnuna ta’ l-Unjoni Ewropea."
* The main application window.That Implements interfaces support events,* It-tieqa prinċipali applikazzjoni . Dan implimenti avvenimenti interfaces ta 'appoġġ
"We are developing further and offer our future students even a wider choice: nyob rau hauv 2015 8 specialities within the Faculty of Biology, Geology, Economics and Applied Mathematics and Informatics were submitted to be licensed and accredited .","Aħna qegħdin jiżviluppaw ulterjorment u joffru lill-istudenti futuri tagħna anke għażla aktar wiesgħa: fl 2015 8 speċjalitajiet fi ħdan il-Fakultà tal-Bijoloġija, ġeoloġija, Ekonomija u l-Matematika Applikata u Informatika kienu sottomessi biex tiġi liċenzjata u akkreditata."
Criminology and Sociology,Kriminoloġija u s-Soċjoloġija
"If you decide to go look at an apartment, ask an American friend to go with you to make sure the apartment is not a scam.","Jekk inti tiddeċiedi li tmur tħares lejn f'appartament, titlob lill-Ħabib Amerikan li jmorru miegħek biex tiżgura l-appartament mhuwiex a scam."
"How Does The Tea Packaging Box Reflect Environmental Awareness Oct 05, 2018","Kif il-kaxxa ta ' l-ippakkjar tat-tè jirriflettu kuxjenza ambjentali Oct 05, 2018"


In [119]:
Counter(domain(url) for unit in inside for url in urls(unit)).most_common(50)

[('www.lsherb.com', 22),
 ('mt.lsherb.com', 22),
 ('kaizenlog.com', 20),
 ('www.muhammad.com', 18),
 ('europa.eu', 18),
 ('www.meto-wiper.com', 17),
 ('m.meto-wiper.com', 17),
 ('mt.meto-wiper.com', 17),
 ('m.mt.meto-wiper.com', 17),
 ('en.zazagame.com', 13),
 ('mt.zazagame.com', 13),
 ('www.conex.net', 12),
 ('www.byvisiontech.com', 10),
 ('m.byvisiontech.com', 10),
 ('mt.byvisiontech.com', 10),
 ('m.mt.byvisiontech.com', 10),
 ('www.cnimalta.org', 8),
 ('www.yourwebdoc.org', 8),
 ('www.summerlyquartzstone.com', 7),
 ('mt.summerlyquartzstone.com', 7),
 ('www.yourwebdoc.com', 6),
 ('mt.yourwebdoc.com', 6),
 ('maltaracingclub.com', 6),
 ('www.electric-test.com', 5),
 ('mt.electric-test.com', 5),
 ('www.lexilogos.com', 4),
 ('m.summerlyquartzstone.com', 4),
 ('m.mt.summerlyquartzstone.com', 4),
 ('m.electric-test.com', 4),
 ('m.mt.electric-test.com', 4),
 ('www.medjugorje.ws', 4),
 ('www.kreattivita.org', 4),
 ('www.zhitov.ru', 4),
 ('m.htongsteel.com', 3),
 ('www.htongsteel.com', 3),
 (

In [120]:
Counter(domain(url) for unit in outside for url in urls(unit)).most_common(50)

[('straightpoint.com', 180),
 ('www.welcome-to-barcelona.com', 106),
 ('www.straightpoint.com', 46),
 ('skolarbete.nu', 24),
 ('www.tulipstores.com', 20),
 ('fionavella.com', 18),
 ('mail.straightpoint.com', 18),
 ('www.emltopst.com', 18),
 ('builttobrag.com', 14),
 ('educationbro.com', 14),
 ('www.gfesport.com', 12),
 ('2fish.co', 11),
 ('transposh.org', 10),
 ('mamaclub.info', 8),
 ('www.exacthacks.com', 7),
 ('www.metric-conversions.org', 6),
 ('kaizenlog.com', 6),
 ('theforexclub.eu', 6),
 ('trenboloneresults.com', 6),
 ('www.conex.net', 6),
 ('www.holmbygden.se', 6),
 ('www.cnimalta.org', 4),
 ('www.blogprinvizor.ro', 4),
 ('www.federacionfotovasca.org', 4),
 ('www.muhammad.com', 4),
 ('www.belvini-weinversand.de', 4),
 ('www.extendoffice.com', 4),
 ('m.cncmachinecenters.com', 4),
 ('www.cncmachinecenters.com', 4),
 ('m.mt.cncmachinecenters.com', 4),
 ('mt.cncmachinecenters.com', 4),
 ('yatraveller.com', 4),
 ('www.parisdakar.it', 4),
 ('infotakraw.com', 4),
 ('www.infinitecosmos.

In [157]:
inside_domains, outside_domains = Counter(), Counter()

for unit in philipp:
    if checksum(unit) in hashes['paracrawl-8']:
        inside_urls.update(domain(url) for url in urls(unit))
    else:
        outside_urls.update(domain(url) for url in urls(unit))    
        

In [123]:
len(inside_urls), len(outside_urls)

(604, 927)

In [124]:
len(inside_urls & outside_urls)

596

In [125]:
len(inside_urls ^ outside_urls)

339

# Filtered domains from logs
I also still have the warc2text processing logs of Philipp's dataset. These contain all urls that got filtered. For easy usage, I'm going to collapse them to their domain, and then see how many of the domains in Paracrawl 7.1 and 8 are in this list of filtered domains.

I assume the answer will be many and nearly none respectively. Maybe some will slip through for Paracrawl 8 because the MT filter works at a page level, not a domain level, and a domain could be a mix of MT and non-MT pages.

In [151]:
domains = {
    dataset: Counter(domain(url) for unit in read(path) for url in urls(unit))
    for dataset, path in files.items()
}

en-mt.tmx.gz:   0%|          | 0.00/197M [00:00<?, ?b/s]

en-mt.cc-2016-30-cc-2017-30-cc-2018-30-cc-2019-18-cc-2019-35-gwb-hieu-marta-philipp-wide00006-wide00015.tmx.gz…

That's the number of urls per domain. Now load in the list of filtered urls.

In [131]:
import gzip
with gzip.open('warc2text-philipp.urls.gz') as fh:
    filtered_domains = set(domain(url.strip().decode()) for url in fh)

In [172]:
def percentage(selection_label, selection, unit_label, total):
    return {
        f'{selection_label} {unit_label}': selection,
        f'total {unit_label}': total,
        f'% of {unit_label}': '{:0.2f}%'.format(selection / total * 100)
    }

table({
    'dataset': dataset,
    **percentage('filtered', len(set(counter) & filtered_domains), 'domains', len(counter)),
    **percentage('filtered', sum(counter[url] for url in set(counter) & filtered_domains), 'urls', sum(counter[url] for url in counter))
} for dataset, counter in domains.items())

dataset,filtered domains,total domains,% of domains,filtered urls,total urls,% of urls
paracrawl-7.1,152,2889,5.26%,2897031,6248521,46.36%
paracrawl-8,108,30543,0.35%,173880,18489248,0.94%
unexplained,35,819,4.27%,20510,976197,2.10%


But this is when counting domains. More interesting is the number of units that are now filtered by the domain filter.

I'm taking all units from Paracrawl 7.1 release that originated from Philipp's dataset, and marking them "missing" if they did not end up in Paracrawl 8. I'm also marking them "filtered" if they appeared in the filtered domain logs.

In [161]:
missing_philipp = [unit for unit in philipp if checksum(unit) not in hashes['paracrawl-8']]

In [162]:
len(missing_philipp) / len(philipp)

0.6122839279146746

In [158]:
filtered_philipp = [unit for unit in philipp if all(domain(url) in filtered_domains for url in urls(unit))]

In [160]:
len(filtered_philipp) / len(philipp)

0.35446496649593195

61% of Philipp is missing from Paracrawl 8. 35% of Philipp would not show up at all in Paracrawl 8 because these units originate only from filtered domains.

In [163]:
missing_philipp_set = {id(unit) for unit in missing_philipp}
filtered_philipp_set = {id(unit) for unit in filtered_philipp}
len(missing_philipp_set & filtered_philipp_set) / len(missing_philipp_set)

0.5652389212503369

In [164]:
len(missing_philipp_set & filtered_philipp_set) / len(filtered_philipp_set)

0.9763636455660325

56% of all missing sentence pairs are probably filtered out by MT.

98% of all sentence pairs that appear in Paracrawl 7.1, that match a domain that was filtered out for MT while processing Paracrawl 8, do not appear in Paracrawl 8. Well, not really surprising I think…

But there is still a large set of sentence pairs that are from domains that did not get filtered out for MT reasons. What's up with these then?

In [166]:
unexplained_unit_ids = missing_philipp_set - filtered_philipp_set
unexplained_units = [unit for unit in philipp if id(unit) in unexplained_unit_ids]

In [171]:
domains['unexplained'] = Counter(domain(url) for unit in unexplained_units for url in urls(unit))

In [174]:
len(set(domains['unexplained']) & set(domains['paracrawl-8'])) / len(domains['unexplained'])

0.7326007326007326

73% of the unexplained domains also occur in Paracrawl 8, so we're not excluding whole swaths of domains.

In [186]:
table({
    'domain': d,
    'unexplained': count, # pairs where all domains are filtered
    'paracrawl-7.1': domains['paracrawl-7.1'][d], # pairs where at least some domains are filtered
    'paracrawl-8': domains['paracrawl-8'][d] # pairs in latest release
} for d, count in domains['unexplained'].most_common()[:15])

domain,unexplained,paracrawl-7.1,paracrawl-8
www.yourwebdoc.org,61823,98619,24669
www.conex.net,45103,115743,75459
www.byvisiontech.com,43040,66895,45704
mt.byvisiontech.com,40274,62537,33369
www.muhammad.com,35786,89654,73555
kaizenlog.com,28170,75374,55065
www.metric-conversions.org,22950,40994,0
www.airmalta.com,21780,41055,51236
europa.eu,21392,101565,700276
actioncertification.org,19233,44444,31612
