In [1]:
import site
site.addsitedir('../tmxutil')
from tmxutil import make_reader, TranslationUnit, TranslationUnitVariant
import unidecode
import string
from xxhash import xxh64
from notebook import table, first
from tqdm.autonotebook import tqdm
from logging import getLogger, ERROR
getLogger().setLevel(ERROR) # Hide warnings while importing

# Characters removed before hashing. Built once instead of once per
# translation, since the table never changes.
_CHECKSUM_STRIP_TABLE = str.maketrans('', '', string.punctuation + string.digits)


def checksum(unit):
    """Return a hex digest identifying a translation unit by its normalised text.

    The text of every translation in ``unit.translations`` is lowercased,
    stripped of spaces and of the characters in ``string.punctuation`` and
    ``string.digits``, then passed through ``unidecode``. The results are
    joined with tabs and hashed with xxh64.
    """
    normalised = (
        unidecode.unidecode(
            translation.text.lower().replace(" ", "").translate(_CHECKSUM_STRIP_TABLE))
        for translation in unit.translations.values()
    )
    return xxh64("\t".join(normalised)).hexdigest()

def read(filename):
    """Lazily yield every unit that ``make_reader`` produces for ``filename``.

    The file is opened in binary mode and handed to ``make_reader`` with
    progress reporting enabled; it stays open until the generator is
    exhausted or closed.
    """
    with open(filename, 'rb') as tmx_file:
        yield from make_reader(tmx_file, progress=True)

  from tqdm.autonotebook import tqdm


In [2]:
files = {
    'paracrawl-7.1': 'en-mt.tmx.gz',
    'paracrawl-8': 'en-mt.cc-2016-30-cc-2017-30-cc-2018-30-cc-2019-18-cc-2019-35-gwb-hieu-marta-philipp-wide00006-wide00015.tmx.gz'
}

In [3]:
hashes = {
    release: frozenset(checksum(unit) for unit in read(filename))
    for release, filename in files.items()
}

en-mt.tmx.gz:   0%|          | 0.00/197M [00:00<?, ?b/s]

en-mt.cc-2016-30-cc-2017-30-cc-2018-30-cc-2019-18-cc-2019-35-gwb-hieu-marta-philipp-wide00006-wide00015.tmx.gz…

In [13]:
%%html
<style>
caption {
    font-weight: bold;
    caption-side: bottom;
}
</style>

# Overlap between releases
How many sentence pairs are in one while not in the other, and how many are in both?

I'm using the same hashes bicleaner uses to identify duplicates (when --aggressive-dedup is passed to it).

In [4]:
len(hashes['paracrawl-7.1'] & hashes['paracrawl-8']) / len(hashes['paracrawl-8']) # Overlap

0.18467959367509593

In [5]:
len(hashes['paracrawl-8'] - hashes['paracrawl-7.1']) / len(hashes['paracrawl-8']) # Novel

0.8153204063249041

In [6]:
len(hashes['paracrawl-7.1'] - hashes['paracrawl-8']) / len(hashes['paracrawl-7.1']) # Lost

0.6543089586479063

Okay, so about 65% of all sentence pairs in 7.1 do not return in 8? Why? Are these from domains that we marked as likely MT? Are they low quality?

Time for some sampling to look at. Also count the domains on either side, see whether it's specific domain names that are missing (which would indicate MT filtering).

In [9]:
from stats import RandomSample
from collections import Counter
import re

def domain(url):
    """Take the (very broadly defined) domain part of an url, or return the
    full url if that didn't work.

    An optional ``http:``/``https:`` scheme and an optional ``//`` are
    skipped; everything up to the next ``/`` is the domain.
    """
    match = re.match(r'^(https?:)?(//)?(?P<domain>[^/]+)', url)
    if match is None:
        # Nothing usable before the first slash (e.g. an empty string or a
        # path-only url): fall back to the input as the docstring promises,
        # instead of failing with AttributeError on ``None.group``.
        return url
    return match.group('domain')

# Checksums present in the 7.1 hash set but absent from the 8 hash set.
lost_hashes = hashes['paracrawl-7.1'] - hashes['paracrawl-8']

sample = RandomSample(1000)      # receives every lost unit
domains = Counter()              # source-document domains of the lost units
collections_overall = Counter()  # collection tallies over every 7.1 unit
collections_missing = Counter()  # collection tallies over the lost units only

for unit in read(files['paracrawl-7.1']):
    unit_collections = unit['collection']
    collections_overall.update(unit_collections)
    if checksum(unit) not in lost_hashes:
        continue
    collections_missing.update(unit_collections)
    sample.add(unit)
    for translation in unit.translations.values():
        for url in translation['source-document']:
            domains[domain(url)] += 1

en-mt.tmx.gz:   0%|          | 0.00/197M [00:00<?, ?b/s]

In [7]:
from notebook import table

In [10]:
table({
    'score': first(unit['score-bicleaner']),
    'collection': unit['collection'],
    **{lang: translation.text for lang, translation in unit.translations.items()}
} for unit in sample[:20])

score,collection,en,mt
0.592,hieu,"The events are set up either by the National Book Council or by the participants in the Festival, which include Malta's publishers, booksellers and NGOs and public agencies involved in the book industry.","L-attivitajiet huma mħejjija jew mill-Kunsill Nazzjonali tal-Ktieb jew mill-parteċipanti fil-Festival, li tista' tgħid jinkludu kull pubblikatur, bosta bejjiegħa tal-kotba u NGOs kif ukoll aġenziji pubbliċi involuti fl-industrija tal-ktieb."
hieu,,,
0.572,philipp,Comment bacterial content of ProViva Shot,Kumment kontenut batterjoloġiku fil ProViva Shot
philipp,,,
0.806,philipp,dictionary - German - Slovak,dizzjunarju - Ġermaniż - Slovakk
philipp,,,
0.700,hieu philipp,Ukrainian Admission Center main specialization is admission and education of foreign students.,Ukraina Ammissjoni Ċentru ispeċjalizzazzjoni prinċipali hija ammissjoni u l-edukazzjoni ta 'studenti barranin.
hieu,,,
philipp,,,
0.564,philipp,"A:Yes,itcancommunicatewithcomputer.IthasRS232.","T: Iva, jista 'jikkomunika mal-kompjuter."

0
hieu

0
philipp

0
philipp

0
hieu
philipp

0
philipp

0
philipp

0
wide00006

0
hieu

0
philipp

0
wide00006

0
hieu
philipp

0
wide00006

0
wide00015

0
hieu
philipp

0
philipp

0
wide00006

0
philipp

0
hieu

0
philipp

0
hieu


# Missing Collections
Is it maybe a particular collection that is missing? That might indicate something changed in the way we process things.

I would expect some changes here for the Philipp collection since this is the first time we processed that collection using the same pipeline, instead of merging it with previous work.

What might also have an impact is that for Paracrawl 6 we merged the previous release of Paracrawl into the mix just before deduplicating. I'm not entirely sure we did this for Paracrawl 7 as well. Looking at the files on CSD3, in `/rds/project/rds-48gU72OtDNY/paracrawl-combined-releases/en-mt`, I see files named `raw-philipp-v5a.en-mt.{1..3}` so I assume we did.


In [15]:
table(collections_overall.items(), headers=['collection', 'count'], title="counts for paracrawl 7.1")

collection,count
wide00006,59626
hieu,181729
philipp,613267
wide00006-pdf,26406
wide00015,33655


In [16]:
table(collections_missing.items(), headers=['collection', 'count'], title="counts of paracrawl 7.1 not in 8")

collection,count
wide00006,54425
hieu,126298
philipp,375491
wide00006-pdf,24841
wide00015,14459


In [17]:
from operator import itemgetter
# Per-collection share of Paracrawl 7.1 units whose checksum is absent from
# Paracrawl 8, largest collection first.
table({
    'collection': collection,
    'percentage missing': '{:0.1f}%'.format(100 * collections_missing[collection] / count),
    # collections_missing only tallies 7.1 units that are not in 8.
    'units in 7.1 not in 8': collections_missing[collection],
    'units in 7.1': count
} for collection, count in sorted(collections_overall.items(), key=itemgetter(1), reverse=True))

collection,percentage missing,units not in 7.1 not in 8,units in 7.1
philipp,61.2%,375491,613267
hieu,69.5%,126298,181729
wide00006,91.3%,54425,59626
wide00015,43.0%,14459,33655
wide00006-pdf,94.1%,24841,26406


Hmm, correct, pdfs haven't been processed for this release. Philipp has been processed with a different pipeline. But I have no plausible explanation for Hieu or wide00006/15 in terms of processing. Maybe MT filter?

# Missing web domains
Is there anything particular about the missing sentence pairs? Maybe they're all from domains that got filtered because of MT?

In [18]:
table({
    'domain': f"http://{domain_name}/",
    'count': count
} for domain_name, count in domains.most_common(15))

domain,count
http://straightpoint.com/,481378
http://www.welcome-to-barcelona.com/,195795
http://www.straightpoint.com/,151848
http://transposh.org/,151056
http://2fish.co/,113978
http://builttobrag.com/,112523
http://www.outlookimport.com/,102456
http://blog.simmakers.com/,98508
http://www.conex.net/,88135
http://therefugeecenter.org/,85340


Okay, I'm not going to do this by hand. Let's quickly implement something that matches the behaviour of warc2text's MT filter, import our filter list, and *make the assumption that the page we processed is the same as the page that is online today*.

In [19]:
with open('mt-filter-list.txt', 'r') as fh:
    mt_filter_list = fh.read()

In [20]:
from bs4 import BeautifulSoup
from urllib.request import urlopen

def check_for_mt(url, filter_list):
    """Fetch ``url`` and test the page against the MT filter rules.

    ``filter_list`` is the text of the filter file: one rule per line as
    tab-separated ``element``, ``attribute`` and ``pattern`` (a regular
    expression). Blank lines and lines starting with ``#`` are skipped.

    Returns ``(rule, response_url)`` where ``rule`` is the first rule line
    that matches an element on the page, or ``None`` if no rule matches, and
    ``response_url`` is the ``url`` attribute of the response.
    Errors raised while fetching are not caught here.
    """
    max_bytes = 5 * 1024 * 1024  # only inspect the first 5mb of the response
    with urlopen(url, timeout=10) as response:
        page = BeautifulSoup(response.read(max_bytes), 'html.parser')
        response_url = response.url

        for rule in filter_list.split('\n'):
            is_comment = rule.startswith('#')
            is_blank = rule.strip() == ''
            if is_comment or is_blank:
                continue
            element, attribute, pattern = rule.split('\t')
            matches = page.find_all(element, attrs={attribute: re.compile(pattern)})
            if matches:
                return rule, response_url
        return None, response_url

In [21]:
check_for_mt('http://straightpoint.com/', mt_filter_list)

("a\tonclick\tdoGTranslate\\(\\'.{2}\\|.{2}\\'\\)",
 'https://straightpoint.com/')

So let's first look at the top 30 missing domains

In [22]:
def check_domains(n):
    """Yield ``(url, count, result)`` for the ``n`` most common entries of ``domains``.

    Each domain's home page is checked with ``check_for_mt``. ``result`` is
    the matching filter rule, ``None`` when no rule matched, or the error
    message when the check raised. On error the url yielded is the requested
    one without a trailing slash, not the url reported by the response.
    """
    for domain_name, count in domains.most_common(n):
        home_page = f"http://{domain_name}/"
        try:
            hit, url = check_for_mt(home_page, mt_filter_list)
        except Exception as error:
            # Best effort: report the failure in the table rather than abort.
            hit = str(error)
            url = f'http://{domain_name}'
        yield url, count, hit

table(check_domains(30), headers=['domain', 'count', 'matched filter'])

domain,count,matched filter
https://straightpoint.com/,481378,a	onclick	doGTranslate\(\'.{2}\|.{2}\'\)
http://www.welcome-to-barcelona.com/,195795,
https://www.straightpoint.com/,151848,a	onclick	doGTranslate\(\'.{2}\|.{2}\'\)
http://transposh.org/,151056,meta	name	translation-stats
https://2fish.co/,113978,
https://builttobrag.com/,112523,
https://www.outlookimport.com/,102456,
https://blog.simmakers.com/,98508,
https://www.conex.net/,88135,
http://therefugeecenter.org,85340,HTTP Error 502: Bad Gateway


Of course, we're not targeting home pages in our sentence pairs, but lower level pages. I'm going to use the sampled sentence pairs and look at their urls.

In [24]:
def check_unit_for_mt(unit):
    """Check every source document of ``unit`` against the MT filter rules.

    Returns ``(rule, url)`` for the first document that matches a rule.
    Otherwise returns ``(error, None)``, where ``error`` is the message of
    the last failed check, or ``None`` if no check failed.
    """
    source_documents = set()
    for translation in unit.translations.values():
        source_documents.update(translation['source-document'])

    last_error = None
    for document in source_documents:
        # Fix for data from philipp and hieu collections: urls have no prefix.
        target = document if document.startswith('http') else f'http://{document}'

        try:
            hit, resolved_url = check_for_mt(target, mt_filter_list)
        except Exception as e:
            last_error = str(e)
            continue

        if hit:
            return hit, resolved_url

    return last_error, None


checked_units = [
    (unit, *check_unit_for_mt(unit))
    for unit in tqdm(list(sample[:100]))
]


  0%|          | 0/100 [00:00<?, ?it/s]

In [35]:
table({
    'url': (url or first(set.union(*(t['source-document'] for t in unit.translations.values()))))[:100],
    'filter': hit,
    'collections': ', '.join(unit['collection'])
} for unit, hit, url in checked_units)

url,filter,collections
ktieb.org.mt/mt/applikazzjonijiet-ghall-konkors-letterarju-kitba-ghaz-zghazagh/,HTTP Error 503: Service Temporarily Unavailable,hieu
http://skolarbete.nu/mt/skolarbeten/bakteriehalten-i-proviva-shot/,HTTP Error 500: Internal Server Error,philipp
http://babelx.net/en/dictionary-German/,,philipp
ukraine.admission.center/for-parents/,,"hieu, philipp"
https://www.electric-test.com/info/faq-of-hzjq-1-oil-bdv-tester-oil-dielectric-s-19032802.html,HTTP Error 403: Forbidden,philipp
http://skolarbete.nu/en/skolarbeten/astronomi-5/,HTTP Error 500: Internal Server Error,philipp
http://momsaffiliatemarketing.com/tag/open-education-project/,HTTP Error 403: Forbidden,wide00006
https://builttobrag.com/why-i-got-married-so-young-from-my-wifes-perspective/?lang=mt,meta	name	translation-stats,hieu
http://www.evergrowingcage.com/mt/about-us/,,philipp
http://creatingmybusinessonline.com/john-thornhills-internet-marketing-master-class-coaching-program,HTTP Error 403: Forbidden,wide00006


In [26]:
def count(iterable):
    """Return the number of items produced by ``iterable``, consuming it."""
    total = 0
    for _ in iterable:
        total += 1
    return total

Number of units with an accessible source document that did trigger an MT detection rule

In [27]:
count(unit for unit, hit, url in checked_units if url is not None)

22

Number of units that were accessible and did not trigger the MT system

In [28]:
count(unit for unit, hit, url in checked_units if url is None and hit is None)

20

Number of units that were no longer accessible on at least one of their associated source documents (and for which no document triggered a rule)

In [29]:
count(unit for unit, hit, url in checked_units if url is None and hit is not None)

58

In [34]:
table(Counter(hit for _, hit, url in checked_units if url is None and hit is not None).most_common())

0,1
HTTP Error 404: Not Found,18
HTTP Error 403: Forbidden,15
"<urlopen error [Errno 8] nodename nor servname provided, or not known>",9
HTTP Error 500: Internal Server Error,8
HTTP Error 503: Service Temporarily Unavailable,3
timed out,3
<urlopen error timed out>,1
HTTP Error 502: Bad Gateway,1


# Shard ids for each unit
Just to make sure, we're not seeing anything odd in terms of units that matched across shards, right?

Note that we deduplicated sentence pairs. So each sentence pair can have multiple shard ids. However, the shard ids on both sides of the sentence pair should be exactly equal. We can check for that.

Below, `slug` and `shard_id` are Python implementations of these functions in giashard.

In [36]:
from publicsuffixlist import PublicSuffixList
from urllib.parse import urlparse
import fnv
import re

psl = PublicSuffixList()

def urls(unit):
    """Return a flat list of the source-document urls of all translations in ``unit``.

    Urls are listed translation by translation; duplicates are kept.
    """
    collected = []
    for tu in unit.translations.values():
        collected.extend(tu['source-document'])
    return collected

def slug(key):
    """https://github.com/paracrawl/giashard/blob/master/shard.go#L88

    Reduce ``key`` (a url, with or without scheme) to the slug used for
    sharding: ``psl.privatesuffix(host)`` with the trailing
    ``psl.publicsuffix(host)`` stripped, or the host itself when either
    lookup comes back empty.

    Raises ValueError when no host or no slug can be derived from ``key``.
    """
    url = urlparse(key)
    if url.netloc:
        # Drop an explicit port from the network location.
        host = re.sub(r':\d+$', '', url.netloc)
    else:
        # urlparse found no netloc (e.g. urls without a scheme): take the
        # leading host-like run of characters instead.
        match = re.match(r'^([a-zA-Z0-9][a-zA-Z0-9\-.]*[a-zA-Z0-9]).*', key)
        if not match:
            raise ValueError(f'Unable to determine host using regexp from {key}')
        host = match.group(1)

    public = psl.publicsuffix(host)
    private = psl.privatesuffix(host)

    if not public or not private:
        match = re.match(r'^([^/]+).*', host)
        if not match or len(match.group(1)) == 0:
            raise ValueError(f'Unable to determine slug by parsing {host} from {key}')
        result = match.group(1)
    else:
        # Raw string: '\.' in a plain literal is an invalid escape sequence.
        result = re.sub(r'\.?' + re.escape(public) + '$', '', private)

    if not result:
        raise ValueError(f'Unable to determine slug from {key}')

    return result

def shard_id(key, n=8):
    """https://github.com/paracrawl/giashard/blob/master/shard.go#L120

    Map ``key`` to one of ``1 << n`` shards: the 64-bit ``fnv.hash`` of its
    slug, reduced modulo the shard count.
    """
    digest = fnv.hash(slug(key).encode(), algorithm=fnv.fnv, bits=64)
    shard_count = 1 << n
    return digest % shard_count

# Focus: wide00006 data
I'm focussing on wide6 here. For the large number of missing sentence pairs from Philipp, I expect this can be linked to merging old releases into the current release up to 7.1. I also expect some of Hieu's data to have made it into Philipp's data since it has been around for a while. wide6, as far as I can tell, has only been processed by Edinburgh.

In [37]:
units = [unit for unit in read(files['paracrawl-7.1']) if unit['collection'] == set(['wide00006'])]

en-mt.tmx.gz:   0%|          | 0.00/197M [00:00<?, ?b/s]

In [38]:
len(units)

58266

Anything weird going on with the data? The shard ids on both sides of the sentence pairs should be the same because we're not matching across shards. (However, we deduplicate, so we could have merged two sentence pairs that came from separate shards.)

In [39]:
# Units whose English-side and Maltese-side source documents do not map to
# the same set of shard ids. Each side must be read from its own translation;
# comparing 'en' with itself can never find a mismatch.
weird = [
    unit
    for unit in units
    if frozenset(shard_id(url, 8) for url in unit.translations['en']['source-document'])
    != frozenset(shard_id(url, 8) for url in unit.translations['mt']['source-document'])
]

weird

[]

In [40]:
# Split the wide00006 units by whether their checksum is still in Paracrawl 8
# and feed each group into its own RandomSample.
inside = RandomSample(100)
outside = RandomSample(100)

for unit in units:
    still_present = checksum(unit) in hashes['paracrawl-8']
    target = inside if still_present else outside
    target.add(unit)

In [41]:
inside.n, outside.n

(4553, 53713)

In [42]:
table({lang: tuv.text for lang, tuv in unit.translations.items()} for unit in outside)

en,mt
This will give me some extra time to weed out the ones who are real and really want to be here.,Dan se jagħti me xi żmien addizzjonali biex weed dawk li huma reali u verament irridu li jkun hawn.
Direction - the direction of the emboss effect from 0 degree to 360 degree.,Direzzjoni - id-direzzjoni ta 'l-emboss effett minn 0 sa 360 grad grad.
You cannot generate a million dollars overnight in internet marketing Serious income in Internet marketing takes a while.,Inti ma tistax tiġġenera miljun dollaru matul il-lejl fl-internet marketing Dħul serji fil-Internet marketing jieħu waqt.
Answer: No rush to upgrade. (wait until you have had 4 tripler positions for nearly 75 days) You can use your earnings to upgrade when the time comes.),(Stenna sakemm inti kellek 4 pożizzjonijiet tripler għal kważi 75-il ġurnata) Tista 'tuża l-qligħ tiegħek li jaġġornaw meta jasal iż-żmien.)
Play Ben 10: The alien DNA combiner,10 ben seħħ aljen: runner 'l-univers
"The array of locally owned island shops and dining establishments offer warm hospitality and friendly service, with an atmosphere unlike any other town on the island.","Il-firxa ta 'ħwienet tal-gżejjer proprjetà lokali u stabbilimenti dining joffru ospitalità sħun u favur servizz, ma atmosfera differenti minn kwalunkwe belt oħra fuq il-gżira."
"About MLM, Announcement, Business-Building, Duplicating, Free Stuff, Heavy Hitter, Home Based Business, Home Business, Industry Experts, Leaders, Leadership, MLM, MLM Education, MLM Training, Million-Dollar Income Earner, Network Marketing, Network Marketing Trainer, Personal Growth, Professional Development, Prospecting, Recruiting, Relationship, Relationshipping, Skills, Success, The Greatest Networkers in the World »","Dwar MLM, Tħabbir, Negozju Bini, jidduplikaw, Free imbarazz, Hitter tqal, Home Based Business, Home Business, Esperti Industrija, Mexxejja, Leadership, MLM, MLM Edukazzjoni, MLM Taħriġ, earner Dħul Million Dollar-, Network Marketing, Network Marketing Trainer, Personali Tkabbir, Żvilupp Professjonali, Tfittix, Reklutaġġ, Relazzjoni, Relationshipping, Ħiliet, Success, Il-Networkers Greatest fid-Dinja »"
I hope you can try mix it (berry and lemon) and hopefully it will become sweet .,Nispera li inti tista 'tipprova tħalltu (berry u lumi) u wieħed jittama li se jsiru ħelu.
"Guests can have a different experience with each visit, and I think that's one of the real keys to our success.""","Mistednin jista 'jkollhom esperjenza differenti ma' kull żjara, u naħseb li wieħed mill-imfietaħ reali għas-suċċess tagħna. """
I am working here in Dubai UAE.,I am xogħol hawnhekk fil Dubai UAE.


In [53]:
from itertools import chain

table((list(chain(*row)) for row in zip(
    Counter(domain(url) for unit in inside for url in urls(unit)).most_common(50),
    Counter(domain(url) for unit in outside for url in urls(unit)).most_common(50)
)), headers=['Paracrawl 8 domain', 'count', 'Missing domain', 'count'], title='Top domains of this collection inside Paracrawl 8 and missing from it.')

Paracrawl 8 domain,count,Missing domain,count.1
flashgameport.com,68,thegreatestnetworker.org,73
www.defensereview.com,43,shopate.com,48
www.clearingexchange.com,41,ideonexus.com,32
www.tinnitus-causes.net,26,hitechzilla.com,22
www.xperteleven.com,17,leeroper.com,19
www.ecb.int,16,innoexts.com,14
www.your-free-cams.com,16,amazeall.com,11
www.vipchatroulette.com,15,skolarbete.nu,9
maltese.vipchatroulette.com,15,www.tinnitus-causes.net,9
skolarbete.nu,14,free-money-making-reviews.com,8


In [54]:
# Tally source-document domains separately for units that are still in
# Paracrawl 8 (inside) and units that are not (outside).
inside_domains, outside_domains = Counter(), Counter()

for unit in units:
    tally = inside_domains if checksum(unit) in hashes['paracrawl-8'] else outside_domains
    tally.update(domain(url) for url in urls(unit))
        

In [55]:
len(inside_domains), len(outside_domains)

(200, 533)

Domains that are both found on inside and outside sentence pairs are probably not filtered because of MT

In [58]:
len(set(inside_domains) & set(outside_domains))

173

Domains that are outside, but not inside, could have been filtered due to MT

In [70]:
len(set(outside_domains) - set(inside_domains))

360

Same for disjoint sets: completely inside or completely missing from Paracrawl 8.

In [71]:
len(set(inside_domains) ^ set(outside_domains))

387

# Filtered domains from logs
I also still have the warc2text processing logs of this dataset. These contain all urls that got filtered. For easy usage, I'm going to collapse them to their domain, and then see how many of the domains in Paracrawl 7.1 and 8 are in this list of filtered domains.

I assume the answer will be many and nearly none respectively. Maybe some will slip through for Paracrawl 8 because the MT filter works at a page level, not a domain level, and a domain could be a mix of MT and non-MT pages.

In [60]:
domains = {
    dataset: Counter(domain(url) for unit in read(path) for url in urls(unit))
    for dataset, path in files.items()
}

en-mt.tmx.gz:   0%|          | 0.00/197M [00:00<?, ?b/s]

en-mt.cc-2016-30-cc-2017-30-cc-2018-30-cc-2019-18-cc-2019-35-gwb-hieu-marta-philipp-wide00006-wide00015.tmx.gz…

In [76]:
table({
    'dataset': dataset,
    'domains': sum(count for _, count in counter.items())
} for dataset, counter in domains.items())

dataset,domains
paracrawl-7.1,6248521
paracrawl-8,18489248


That's the total number of source-document urls per dataset (the per-domain counts summed), not the number of distinct domains. Now load in the list of filtered urls.

In [61]:
import gzip
with gzip.open('warc2text-wide00006.urls.gz') as fh:
    filtered_domains = set(domain(url.strip().decode()) for url in fh)

In [62]:
def percentage(selection_label, selection, unit_label, total):
    """Build three table columns describing ``selection`` as a share of ``total``.

    The keys are ``'<selection_label> <unit_label>'``, ``'total <unit_label>'``
    and ``'% of <unit_label>'``; the last holds the share formatted as a
    percentage with two decimals.
    """
    share = selection / total * 100
    columns = {}
    columns[f'{selection_label} {unit_label}'] = selection
    columns[f'total {unit_label}'] = total
    columns[f'% of {unit_label}'] = f'{share:0.2f}%'
    return columns

table({
    'dataset': dataset,
    **percentage('filtered', len(set(counter) & filtered_domains), 'domains', len(counter)),
    **percentage('filtered', sum(counter[url] for url in set(counter) & filtered_domains), 'urls', sum(counter[url] for url in counter))
} for dataset, counter in domains.items())

dataset,filtered domains,total domains,% of domains,filtered urls,total urls,% of urls
paracrawl-7.1,309,2889,10.70%,557389,6248521,8.92%
paracrawl-8,31,30543,0.10%,122893,18489248,0.66%


But this is when counting domains. More interesting is the number of units that are now filtered by the domain filter.

I'm taking all units from the Paracrawl 7.1 release that originated solely from the wide00006 collection (the `units` list built above), and marking them "missing" if they did not end up in Paracrawl 8. I'm also marking them "filtered" if the domains of all their urls appear in the filtered domain logs.

In [63]:
missing = [unit for unit in units if checksum(unit) not in hashes['paracrawl-8']]

In [64]:
len(missing) / len(units)

0.9218583736656025

In [65]:
filtered = [unit for unit in units if all(domain(url) in filtered_domains for url in urls(unit))]

In [66]:
len(filtered) / len(units)

0.8181100470257097

In [67]:
missing_set = {id(unit) for unit in missing}
filtered_set = {id(unit) for unit in filtered}
len(missing_set & filtered_set) / len(missing_set)

0.8585072515033605

In [77]:
len(missing_set & filtered_set) / len(filtered_set)

0.9673785348661575

In [79]:
unexplained_unit_ids = missing_set - filtered_set
unexplained_units = [unit for unit in units if id(unit) in unexplained_unit_ids]

In [80]:
domains['unexplained'] = Counter(domain(url) for unit in unexplained_units for url in urls(unit))

In [81]:
table({
    'dataset': dataset,
    **percentage('filtered', len(set(counter) & filtered_domains), 'domains', len(counter)),
    **percentage('filtered', sum(counter[url] for url in set(counter) & filtered_domains), 'urls', sum(counter[url] for url in counter))
} for dataset, counter in domains.items())

dataset,filtered domains,total domains,% of domains,filtered urls,total urls,% of urls
paracrawl-7.1,309,2889,10.70%,557389,6248521,8.92%
paracrawl-8,31,30543,0.10%,122893,18489248,0.66%
unexplained,44,277,15.88%,3183,29676,10.73%


In [82]:
len(set(domains['unexplained']) & set(domains['paracrawl-8'])) / len(domains['unexplained'])

0.6209386281588448

## Domains that did not end up in Paracrawl 8
These domains did not end up in Paracrawl 8, while they were in Paracrawl 7.1. They are also not filtered out by the MT filtering rules according to the warc2text log files. So what happened to them? Maybe low quality?

In [83]:
# Per-domain url counts for the 15 domains most common among the unexplained units.
table({
    'domain': name,
    # urls on this domain among wide00006 units that are missing from 8 and not explained by the filter log
    'unexplained': unexplained_count,
    'paracrawl-7.1': domains['paracrawl-7.1'][name],  # urls on this domain in the whole 7.1 release
    'paracrawl-8': domains['paracrawl-8'][name]  # urls on this domain in the latest release
} for name, unexplained_count in domains['unexplained'].most_common(15))

domain,unexplained,paracrawl-7.1,paracrawl-8
aceinfowayindia.com,3459,3632,4
mt.glosbe.com,2667,2757,14886
www.sihanoukvilleonline.com,2214,2300,0
konvertitur.kingconv.com,1875,3227,118733
clanbase.ggl.com,1511,1511,0
kingconv.com,1282,2013,166242
mt.zgamz.com,1019,1267,3511
en.zgamz.com,1003,1218,2982
www.ecb.int,753,67535,6903
converter.kingconv.com,651,1285,28376
