    bool filter(const std::string& lc_tag, const char* attr, const char* value, const util::umap_tag_filters_regex& tagFilters) {
        util::umap_tag_filters_regex::const_iterator tag_it = tagFilters.find(lc_tag);
        if (tag_it == tagFilters.cend())
            return true;
        util::umap_attr_filters_regex::const_iterator attr_it = tag_it->second.find(util::toLowerCopy(attr));
        if (attr_it == tag_it->second.cend())
            return true;
        for (const util::umap_attr_regex& filter : attr_it->second){
            if (std::regex_search(value, filter.regex)) {
                BOOST_LOG_TRIVIAL(debug) << "Tag filter " << tag_it->first << "[" << attr_it->first << " ~ " << filter.str << "] matched '" << value << "'";
                return false;
            }
        }
        return true;
    }


1) std::regex_search returns true if any substring of the value matches the regex;

2) attr name is converted to lowercase

3) return true if any substrign of the value matches any of the regular expressions for the (tag, attr) pair

    void readTagFiltersRegex(const std::string& filename, umap_tag_filters_regex& filters) {
        std::ifstream f(filename);
        if (!f)
            throw TagFiltersFileException();
        std::string line;
        std::vector<std::string> fields;
        for (size_t line_i=1; std::getline(f, line); ++line_i) {
            if (boost::algorithm::all(line, boost::algorithm::is_space()) || boost::algorithm::starts_with(line, "#"))
                continue;
            fields.clear();
            boost::algorithm::split(fields, line, [](char c){return c == '\t';});
            if (fields.size() < 3) {
                BOOST_LOG_TRIVIAL(warning) << "Could not parse tag filter at line " << line_i << " of " << filename;
                continue;
            }
            umap_attr_filters_regex& attrs = filters[fields.at(0)];
            std::vector<umap_attr_regex>& values = attrs[fields.at(1)];
            for (unsigned int i = 2; i < fields.size(); ++i)
                values.emplace_back((umap_attr_regex){
                    std::regex(fields.at(i), std::regex::optimize | std::regex::nosubs),
                    fields.at(i)
                });
        }
        f.close();
    }

while loading file with tag filters, skip all lines consisting of spaces only or starting with #

split each line by TAB, first and second items are tag and attr, starting from the third elements is compiled to a regex, nosubs flag makes all subexpressions (expr) treated as non-marking sub-expressions (?:expr) while matching, probably for effeciency reasons;

In [1]:
import zstandard
import trafilatura
import codecs
import json
from pathlib import Path
import ujson as json
import zstandard
import io
from trafilatura.utils import load_html
from collections import defaultdict
from tqdm import tqdm

In [2]:
def load_tagfilters(fpath = "../../mt-filter-list.annotated"):
    tagfilters = defaultdict(list)
    with open(fpath, 'r') as inp:
        for l in inp:
            l = l.strip()
            if l=='' or l.startswith('#'):
                continue
            ff = l.split('\t')
            tagfilters[tuple(ff[:2])].append(ff[-1])
            print(l)
    print(len(tagfilters))
    print(tagfilters)
    return tagfilters

tagfilters = load_tagfilters()

div	id	mqtranslate
div	id	wporg-translate
div	id	wp-translate
div	id	qtranslate
link	id	wporg-translate-css
link	rel	alternate machine-translated-from
script	src	qtranslate-x
a	onclick	doGTranslate\(\'.{2}\|.{2}\'\)
meta	name	translation-stats
a	data-trp-gettext	$^
div	id	weglot
7
defaultdict(<class 'list'>, {('div', 'id'): ['mqtranslate', 'wporg-translate', 'wp-translate', 'qtranslate', 'weglot'], ('link', 'id'): ['wporg-translate-css'], ('link', 'rel'): ['alternate machine-translated-from'], ('script', 'src'): ['qtranslate-x'], ('a', 'onclick'): ["doGTranslate\\(\\'.{2}\\|.{2}\\'\\)"], ('meta', 'name'): ['translation-stats'], ('a', 'data-trp-gettext'): ['$^']})


In [3]:
import re

class TagFilter0:
    """
    Attempts to optimize TagFilter1: for filters where values don't look like regular expressions include them
    to the argument of iterfind()
    """
    def __init__(self, tagfilters):
        ignorecase = True  # ignore case for better matching, though in the original C++ implementation it was not ignored
        self.tagattr2re = [(k, re.compile(t) if re.escape(re.sub('[- ]','',t)) != re.sub('[- ]','',t) else t)
                      for k,v in tagfilters.items() for t in v]
        print(self.tagattr2re)
        
    
    def matches(self, tree):
        for (tag, attr), filt in  self.tagattr2re: 
            if isinstance(filt, str):
                xpath = f'.//{tag}[@{attr}="{filt}"]'
#                 print(xpath)
                for e in tree.iterfind(xpath):
                    val = e.get(attr)
                    return tag, attr, val
            
            else:
                for e in tree.iterfind(f".//{tag}[@{attr}]"):
                    val = e.get(attr)
    #                 print(tag, attr, val)
                    for m in filt.finditer(val):
                        return tag, attr, val
        return None
    
matcher = TagFilter0(tagfilters)

[(('div', 'id'), 'mqtranslate'), (('div', 'id'), 'wporg-translate'), (('div', 'id'), 'wp-translate'), (('div', 'id'), 'qtranslate'), (('div', 'id'), 'weglot'), (('link', 'id'), 'wporg-translate-css'), (('link', 'rel'), 'alternate machine-translated-from'), (('script', 'src'), 'qtranslate-x'), (('a', 'onclick'), re.compile("doGTranslate\\(\\'.{2}\\|.{2}\\'\\)")), (('meta', 'name'), 'translation-stats'), (('a', 'data-trp-gettext'), re.compile('$^'))]


In [4]:
import re

class TagFilter1:
    """
    Iterates over all nodes with the specified tag having the specified attribute with lxml Element.iterfind(), 
    checks the value using pre-compiled regexps with Python re
    """
    def __init__(self, tagfilters):
        ignorecase = True  # ignore case for better matching, though in the original C++ implementation it was not ignored
        self.tagattr2re = {k: re.compile('|'.join(f'({t})' for t in v), flags=re.IGNORECASE if ignorecase else 0) 
                      for k,v in tagfilters.items()}
        print(self.tagattr2re)
        
    
    def matches(self, tree):
        for (tag, attr), regex in  self.tagattr2re.items(): 
            for e in tree.iterfind(f".//{tag}[@{attr}]"):
                val = e.get(attr)
#                 print(tag, attr, val)
                for m in regex.finditer(val):
                    return tag, attr, val
        return None
    
matcher = TagFilter1(tagfilters)

{('div', 'id'): re.compile('(mqtranslate)|(wporg-translate)|(wp-translate)|(qtranslate)|(weglot)', re.IGNORECASE), ('link', 'id'): re.compile('(wporg-translate-css)', re.IGNORECASE), ('link', 'rel'): re.compile('(alternate machine-translated-from)', re.IGNORECASE), ('script', 'src'): re.compile('(qtranslate-x)', re.IGNORECASE), ('a', 'onclick'): re.compile("(doGTranslate\\(\\'.{2}\\|.{2}\\'\\))", re.IGNORECASE), ('meta', 'name'): re.compile('(translation-stats)', re.IGNORECASE), ('a', 'data-trp-gettext'): re.compile('($^)', re.IGNORECASE)}


In [5]:
import re

class TagFilter2:
    """
    Employs lxml Element.xpath() method, for each (tag,attr) calls xpath()
    """
    def __init__(self, tagfilters, use_and=False):
        ignorecase = True  # ignore case for better matching, though in the original C++ implementation it was not ignored
        self.conjunction = ' and ' if use_and else ']['        
        self.tagattr2re = {k: '|'.join(f'({t})' for t in v)
                      for k,v in tagfilters.items()}        

        
    def __str__(self):
        return str(type(self)) + str(self.tagattr2re)
        
    
    def matches(self, tree):
        for (tag, attr), value_regex in  self.tagattr2re.items(): 
            # TODO: will not work if value_regex contains doublequotes
            xpath = f'.//{tag}[@{attr}{self.conjunction}re:match(@{attr},"{value_regex}")]'  
#             print(xpath)
            for m in tree.xpath(xpath, 
                                namespaces={'re': "http://exslt.org/regular-expressions"}, smart_strings=False):
                return tag, attr, m.get(attr)
        return None
    
matcher = TagFilter2(tagfilters)
print(matcher)

<class '__main__.TagFilter2'>{('div', 'id'): '(mqtranslate)|(wporg-translate)|(wp-translate)|(qtranslate)|(weglot)', ('link', 'id'): '(wporg-translate-css)', ('link', 'rel'): '(alternate machine-translated-from)', ('script', 'src'): '(qtranslate-x)', ('a', 'onclick'): "(doGTranslate\\(\\'.{2}\\|.{2}\\'\\))", ('meta', 'name'): '(translation-stats)', ('a', 'data-trp-gettext'): '($^)'}


In [6]:
import re

from lxml import etree

class TagFilter3:
    """
    Creating etree.XPath objects in the constructor, potentially sould compiles search expressions one time and
    reuse for each input tree
    """
    def __init__(self, tagfilters, use_and=False):
        ignorecase = True  # ignore case for better matching, though in the original C++ implementation it was not ignored
        self.tagattr2re = {k: '|'.join(f'({t})' for t in v)
                      for k,v in tagfilters.items()}
        # TODO: will not work if value_regex contains doublequotes
        conjunction = ' and ' if use_and else ']['
        self.matchers = [etree.XPath(f'.//{tag}[@{attr}{conjunction}re:match(@{attr},"{value_regex}")]', 
                                     namespaces={'re': "http://exslt.org/regular-expressions"}, smart_strings=False) 
                         for (tag, attr), value_regex in  self.tagattr2re.items()]
        
        
        
    def __str__(self):
        return str(type(self)) + '\n'.join(str(m) for m in self.matchers)

    
    def matches(self, tree):
        for matcher in self.matchers:
            for m in matcher(tree):
                return m.tag, *m.items()[0]  # TODO: what is returned in m.attrib?

        return None
    
    
matcher = TagFilter3(tagfilters)
print(matcher)

<class '__main__.TagFilter3'>.//div[@id][re:match(@id,"(mqtranslate)|(wporg-translate)|(wp-translate)|(qtranslate)|(weglot)")]
.//link[@id][re:match(@id,"(wporg-translate-css)")]
.//link[@rel][re:match(@rel,"(alternate machine-translated-from)")]
.//script[@src][re:match(@src,"(qtranslate-x)")]
.//a[@onclick][re:match(@onclick,"(doGTranslate\(\'.{2}\|.{2}\'\))")]
.//meta[@name][re:match(@name,"(translation-stats)")]
.//a[@data-trp-gettext][re:match(@data-trp-gettext,"($^)")]


In [7]:
import re

from lxml import etree

class TagFilter31:
    """
    Creating etree.XPath objects in the constructor, potentially sould compiles search expressions one time and
    reuse for each input tree
    """
    def __init__(self, tagfilters, use_and=False):
        ignorecase = True  # ignore case for better matching, though in the original C++ implementation it was not ignored
        self.tagattr2re = {k: '|'.join(f'({t})' for t in v)
                      for k,v in tagfilters.items()}
        # TODO: will not work if value_regex contains doublequotes
        conjunction = ' and ' if use_and else ']['
        self.matchers = [etree.XPath(f'.//{tag}[@{attr}{conjunction}re:match(@{attr},"{value_regex}")]/@{attr}', 
                                     namespaces={'re': "http://exslt.org/regular-expressions"}, smart_strings=True) 
                         for (tag, attr), value_regex in  self.tagattr2re.items()]
        
        
        
    def __str__(self):
        return str(type(self)) + '\n'.join(str(m) for m in self.matchers)

    
    def matches(self, tree):
        for matcher in self.matchers:
            for m in matcher(tree):
                return m.getparent().tag, m.attrname, str(m)

        return None
    
    
matcher = TagFilter31(tagfilters)
print(matcher)


<class '__main__.TagFilter31'>.//div[@id][re:match(@id,"(mqtranslate)|(wporg-translate)|(wp-translate)|(qtranslate)|(weglot)")]/@id
.//link[@id][re:match(@id,"(wporg-translate-css)")]/@id
.//link[@rel][re:match(@rel,"(alternate machine-translated-from)")]/@rel
.//script[@src][re:match(@src,"(qtranslate-x)")]/@src
.//a[@onclick][re:match(@onclick,"(doGTranslate\(\'.{2}\|.{2}\'\))")]/@onclick
.//meta[@name][re:match(@name,"(translation-stats)")]/@name
.//a[@data-trp-gettext][re:match(@data-trp-gettext,"($^)")]/@data-trp-gettext


In [8]:
import re
from lxml import etree

class TagFilter4:
    """
    Creating etree.XPath objects in the constructor, potentially sould compiles search expressions one time and
    reuse for each input tree
    """
    def __init__(self, tagfilters, use_and=False):
        ignorecase = True  # ignore case for better matching, though in the original C++ implementation it was not ignored
        self.tagattr2re = {k: '|'.join(f'({t})' for t in v)
                      for k,v in tagfilters.items()}
        # TODO: will not work if value_regex contains doublequotes
        conjunction = ' and ' if use_and else ']['
        pats = [ f'.//{tag}[@{attr}{conjunction}re:match(@{attr},"{value_regex}")]' 
                for (tag, attr), value_regex in  self.tagattr2re.items()]
        self.matcher = etree.XPath(' | '.join(pats), 
                                     namespaces={'re': "http://exslt.org/regular-expressions"}, smart_strings=False)                         
        
        
    def __str__(self):
        return str(type(self)) + str(self.matcher)
        
    
    def matches(self, tree):
        for m in self.matcher(tree):
            return m.tag, *m.items()[0]  # TODO: what is returned in m.attrib?

        return None
    
matcher = TagFilter4(tagfilters)
print(matcher)

<class '__main__.TagFilter4'>.//div[@id][re:match(@id,"(mqtranslate)|(wporg-translate)|(wp-translate)|(qtranslate)|(weglot)")] | .//link[@id][re:match(@id,"(wporg-translate-css)")] | .//link[@rel][re:match(@rel,"(alternate machine-translated-from)")] | .//script[@src][re:match(@src,"(qtranslate-x)")] | .//a[@onclick][re:match(@onclick,"(doGTranslate\(\'.{2}\|.{2}\'\))")] | .//meta[@name][re:match(@name,"(translation-stats)")] | .//a[@data-trp-gettext][re:match(@data-trp-gettext,"($^)")]


In [10]:
import re
from lxml import etree

class TagFilter5:
    """
    Creating etree.XPath in the constructor, potentially sould compiles search expressions one time and
    reuse for each input tree. All filters in one XPath. Trying to optimize by usiung both 
    @attr="value" and match(@attr, regex).
    """
    def __init__(self, tagfilters):
        ignorecase = True  # ignore case for better matching, though in the original C++ implementation it was not ignored
        # TODO: will not work if value_regex contains doublequotes
        pats = [ f'.//{tag}[@{attr}=""]' if v=='$^' 
                else f'.//{tag}[@{attr}][re:match(@{attr},"{v}")]' if  v.startswith('doGTranslate')
                else f'.//{tag}[@{attr}="{v}"]'
               for (tag, attr), t in tagfilters.items() for v in t]     
        print('\n'.join(pats))
        self.matcher = etree.XPath(' | '.join(pats), 
                                     namespaces={'re': "http://exslt.org/regular-expressions"}, smart_strings=False)                         
        
        
    def __str__(self):
        return str(type(self)) + str(self.matcher)
        
    
    def matches(self, tree):
        for m in self.matcher(tree):
            return m.tag, *m.items()[0]  # TODO: what is returned in m.attrib?

        return None
    
matcher = TagFilter5(tagfilters)
print(matcher)

.//div[@id="mqtranslate"]
.//div[@id="wporg-translate"]
.//div[@id="wp-translate"]
.//div[@id="qtranslate"]
.//div[@id="weglot"]
.//link[@id="wporg-translate-css"]
.//link[@rel="alternate machine-translated-from"]
.//script[@src="qtranslate-x"]
.//a[@onclick][re:match(@onclick,"doGTranslate\(\'.{2}\|.{2}\'\)")]
.//meta[@name="translation-stats"]
.//a[@data-trp-gettext=""]
<class '__main__.TagFilter5'>.//div[@id="mqtranslate"] | .//div[@id="wporg-translate"] | .//div[@id="wp-translate"] | .//div[@id="qtranslate"] | .//div[@id="weglot"] | .//link[@id="wporg-translate-css"] | .//link[@rel="alternate machine-translated-from"] | .//script[@src="qtranslate-x"] | .//a[@onclick][re:match(@onclick,"doGTranslate\(\'.{2}\|.{2}\'\)")] | .//meta[@name="translation-stats"] | .//a[@data-trp-gettext=""]


In [10]:
from lxml import etree
xpath = './/div[@id][re:match(@id,"$^")] | .//script[@src][re:match(@src,"(qtranslate-x)")]'
find = etree.XPath(xpath, 
                  namespaces={'re': "http://exslt.org/regular-expressions"})
%timeit q = find(load_html('<html><div id=""></html>'))
print(q[0].attrib)

19.1 µs ± 756 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


NameError: name 'q' is not defined

In [None]:
%timeit find(h)

In [None]:
%timeit h.xpath(xpath, namespaces={'re': "http://exslt.org/regular-expressions"})

In [11]:
test = [
    ('<a>', False),

    ('<a data-trp-gettext="">', True),
    ('<div id="wp-translate">', True),
    ('''<a href="#" onclick="doGTranslate('en|zh');  return false;">''', True),
    ('<div id="qtranslate">', True),
    
]
print(matcher)
for t, label in test:
    print('\n',t)
    h = load_html(f'<html>{t}</html>')
#     print('HTML:',h)
    m = matcher.matches(h)
    print('Match:', m)
    if label:
        assert m is not None, t
    else:
        assert m is None, t

<class '__main__.TagFilter5'>.//div[@id="mqtranslate"] | .//div[@id="wporg-translate"] | .//div[@id="wp-translate"] | .//div[@id="qtranslate"] | .//div[@id="weglot"] | .//link[@id="wporg-translate-css"] | .//link[@rel="alternate machine-translated-from"] | .//script[@src="qtranslate-x"] | .//a[@onclick][re:match(@onclick,"doGTranslate\(\'.{2}\|.{2}\'\)")] | .//meta[@name="translation-stats"] | .//a[@data-trp-gettext=""]

 <a>
Match: None

 <a data-trp-gettext="">
Match: ('a', 'data-trp-gettext', '')

 <div id="wp-translate">
Match: ('div', 'id', 'wp-translate')

 <a href="#" onclick="doGTranslate('en|zh');  return false;">
Match: ('a', 'href', '#')

 <div id="qtranslate">
Match: ('div', 'id', 'qtranslate')


In [12]:

r=h.xpath('.//div[@id="qtranslate"]/@id', smart_strings=True)[0]

In [13]:
str(r)

'qtranslate'

In [12]:
from timeit import default_timer as timer
import signal
from contextlib import contextmanager, nullcontext

@contextmanager
def time_limit(seconds):
    def signal_handler(signum, frame):
        raise TimeoutError("Timed out!")
    signal.signal(signal.SIGALRM, signal_handler)
    signal.setitimer(signal.ITIMER_REAL, seconds)
    try:
        yield
    finally:
        signal.setitimer(signal.ITIMER_REAL, 0)



In [39]:
from timeit import default_timer as timer
from collections import Counter

def test_matcher(matcher):
    cnt, cnt1 = Counter(), Counter()
    durh, durm = 0.0, 0.0

    for p in tqdm(list(Path('../../../sample0.01_sample0.001').glob('**/all.zst'))[20::20]):
#         print(p, p.stat().st_size/2**30)
        with io.BufferedReader(zstandard.open(p, 'rb')) as inp:
            for byteline in inp:
                l = byteline.decode('utf-8', errors='ignore')
                d = json.loads(l.strip())
                st = timer()
                try:
                    with time_limit(0.5):                    
                        html = load_html(d['h'])                    
                except TimeoutError as e:
                    print('load_html timeout')
                    html = None
                durh += timer() - st
                if html is None:
                    continue
                try:
                    with time_limit(0.5):
                        st = timer()
                        m = matcher.matches(html)
                        durm += timer() - st
                        if m is not None:
#                             print('Match:', m)
                            cnt[m[0]] += 1  
                            cnt1[m] += 1
                except TimeoutError as e:
                    print('matches timeout')                

    print(durm, durh, durm / (durh+durm))
    return durm / (durh+durm), cnt, cnt1


In [40]:
matchers = [tf(tagfilters) for tf in (TagFilter0, TagFilter1, TagFilter5)] + \
    [tf(tagfilters, use_and) for tf in (TagFilter2, TagFilter3, TagFilter31, TagFilter4) 
        for use_and in (False, True)]


for matcher in matchers:
    dur, cnt, cnt1 = test_matcher(matcher)
    print(matcher)    
    print(dur)
    print(cnt1)
    print(cnt)

[(('div', 'id'), 'mqtranslate'), (('div', 'id'), 'wporg-translate'), (('div', 'id'), 'wp-translate'), (('div', 'id'), 'qtranslate'), (('div', 'id'), 'weglot'), (('link', 'id'), 'wporg-translate-css'), (('link', 'rel'), 'alternate machine-translated-from'), (('script', 'src'), 'qtranslate-x'), (('a', 'onclick'), re.compile("doGTranslate\\(\\'.{2}\\|.{2}\\'\\)")), (('meta', 'name'), 'translation-stats'), (('a', 'data-trp-gettext'), re.compile('$^'))]
{('div', 'id'): re.compile('(mqtranslate)|(wporg-translate)|(wp-translate)|(qtranslate)|(weglot)', re.IGNORECASE), ('link', 'id'): re.compile('(wporg-translate-css)', re.IGNORECASE), ('link', 'rel'): re.compile('(alternate machine-translated-from)', re.IGNORECASE), ('script', 'src'): re.compile('(qtranslate-x)', re.IGNORECASE), ('a', 'onclick'): re.compile("(doGTranslate\\(\\'.{2}\\|.{2}\\'\\))", re.IGNORECASE), ('meta', 'name'): re.compile('(translation-stats)', re.IGNORECASE), ('a', 'data-trp-gettext'): re.compile('($^)', re.IGNORECASE)}
.

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [05:19<00:00, 106.57s/it]


66.37461165781133 195.3740188276861 0.25358150502907373
<__main__.TagFilter0 object at 0x7f001bc8dd90>
0.25358150502907373
Counter({('meta', 'name', 'translation-stats'): 33, ('a', 'onclick', "doGTranslate('en|en');return false;"): 6, ('a', 'onclick', "doGTranslate('en|ar');jQuery('div.switcher div.selected a').html(jQuery(this).html());return false;"): 3, ('a', 'onclick', "doGTranslate('ru|ru');return false;"): 3, ('a', 'onclick', "doGTranslate('pt|pt');return false;"): 3, ('link', 'rel', 'alternate machine-translated-from'): 2, ('a', 'onclick', "doGTranslate('es|en');return false;"): 2, ('a', 'onclick', "doGTranslate('es|ar');jQuery('div.switcher div.selected a').html(jQuery(this).html());return false;"): 1, ('a', 'onclick', "doGTranslate('ca|ca');return false;"): 1, ('a', 'onclick', "doGTranslate('vi|en');return false;"): 1, ('a', 'onclick', "doGTranslate('ru|en');return false;"): 1, ('a', 'onclick', "doGTranslate('de|de');return false;"): 1, ('a', 'onclick', "doGTranslate('en|af');

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [04:49<00:00, 96.60s/it]


51.498575427453034 191.93516834639013 0.21155068573934707
<__main__.TagFilter1 object at 0x7f001b7038d0>
0.21155068573934707
Counter({('meta', 'name', 'translation-stats'): 33, ('div', 'id', 'qtranslate'): 7, ('a', 'onclick', "doGTranslate('en|en')"): 6, ('a', 'onclick', "doGTranslate('en|ar')"): 3, ('a', 'onclick', "doGTranslate('ru|ru')"): 3, ('a', 'onclick', "doGTranslate('es|en')"): 3, ('a', 'onclick', "doGTranslate('pt|pt')"): 3, ('link', 'rel', 'alternate machine-translated-from'): 2, ('a', 'onclick', "doGTranslate('vi|en')"): 2, ('a', 'onclick', "doGTranslate('es|ar')"): 1, ('a', 'onclick', "doGTranslate('ca|ca')"): 1, ('a', 'onclick', "doGTranslate('ru|en')"): 1, ('a', 'onclick', "doGTranslate('de|de')"): 1, ('a', 'onclick', "doGTranslate('en|af')"): 1, ('a', 'onclick', "doGTranslate('is|en')"): 1, ('a', 'onclick', "doGTranslate('cs|en')"): 1, ('a', 'onclick', "doGTranslate('es|pt')"): 1, ('a', 'onclick', "doGTranslate('bs|bs')"): 1, ('a', 'onclick', "doGTranslate('es|es')"): 1

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [05:28<00:00, 109.60s/it]


92.25365000532474 190.14838147396222 0.32667488092092983
<class '__main__.TagFilter5'>.//div[@id="mqtranslate"] | .//div[@id="wporg-translate"] | .//div[@id="wp-translate"] | .//div[@id="qtranslate"] | .//div[@id="weglot"] | .//link[@id="wporg-translate-css"] | .//link[@rel="alternate machine-translated-from"] | .//script[@src="qtranslate-x"] | .//a[@onclick][re:match(@onclick,"doGTranslate\(\'.{2}\|.{2}\'\)")] | .//meta[@name="translation-stats"] | .//a[@data-trp-gettext=""]
0.32667488092092983
Counter({('meta', 'name', 'translation-stats'): 33, ('a', 'href', '#'): 29, ('a', 'href', 'http://vsaduidoma.com'): 2, ('link', 'rel', 'alternate machine-translated-from'): 1, ('link', 'href', '2_default8.htm'): 1, ('a', 'href', 'http://specialfinds.com/af'): 1, ('a', 'href', 'http://dalje.com/sq'): 1})
Counter({'a': 33, 'meta': 33, 'link': 2})


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [05:19<00:00, 106.51s/it]


86.54165783745702 190.9718150215922 0.31184668962509093
<class '__main__.TagFilter2'>{('div', 'id'): '(mqtranslate)|(wporg-translate)|(wp-translate)|(qtranslate)|(weglot)', ('link', 'id'): '(wporg-translate-css)', ('link', 'rel'): '(alternate machine-translated-from)', ('script', 'src'): '(qtranslate-x)', ('a', 'onclick'): "(doGTranslate\\(\\'.{2}\\|.{2}\\'\\))", ('meta', 'name'): '(translation-stats)', ('a', 'data-trp-gettext'): '($^)'}
0.31184668962509093
Counter({('meta', 'name', 'translation-stats'): 33, ('a', 'onclick', "doGTranslate('en|en');return false;"): 6, ('div', 'id', 'qtranslate-2'): 4, ('a', 'onclick', "doGTranslate('en|ar');jQuery('div.switcher div.selected a').html(jQuery(this).html());return false;"): 3, ('a', 'onclick', "doGTranslate('ru|ru');return false;"): 3, ('a', 'onclick', "doGTranslate('pt|pt');return false;"): 3, ('link', 'rel', 'alternate machine-translated-from'): 2, ('a', 'onclick', "doGTranslate('es|en');return false;"): 2, ('a', 'onclick', "doGTranslate(

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [05:21<00:00, 107.06s/it]


88.51841830508783 190.77182695094962 0.31694060143038355
<class '__main__.TagFilter2'>{('div', 'id'): '(mqtranslate)|(wporg-translate)|(wp-translate)|(qtranslate)|(weglot)', ('link', 'id'): '(wporg-translate-css)', ('link', 'rel'): '(alternate machine-translated-from)', ('script', 'src'): '(qtranslate-x)', ('a', 'onclick'): "(doGTranslate\\(\\'.{2}\\|.{2}\\'\\))", ('meta', 'name'): '(translation-stats)', ('a', 'data-trp-gettext'): '($^)'}
0.31694060143038355
Counter({('meta', 'name', 'translation-stats'): 33, ('a', 'onclick', "doGTranslate('en|en');return false;"): 6, ('div', 'id', 'qtranslate-2'): 4, ('a', 'onclick', "doGTranslate('en|ar');jQuery('div.switcher div.selected a').html(jQuery(this).html());return false;"): 3, ('a', 'onclick', "doGTranslate('ru|ru');return false;"): 3, ('a', 'onclick', "doGTranslate('pt|pt');return false;"): 3, ('link', 'rel', 'alternate machine-translated-from'): 2, ('a', 'onclick', "doGTranslate('es|en');return false;"): 2, ('a', 'onclick', "doGTranslate

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [05:00<00:00, 100.28s/it]


64.92188449320383 190.2617568167625 0.2544124073155009
<class '__main__.TagFilter3'>.//div[@id][re:match(@id,"(mqtranslate)|(wporg-translate)|(wp-translate)|(qtranslate)|(weglot)")]
.//link[@id][re:match(@id,"(wporg-translate-css)")]
.//link[@rel][re:match(@rel,"(alternate machine-translated-from)")]
.//script[@src][re:match(@src,"(qtranslate-x)")]
.//a[@onclick][re:match(@onclick,"(doGTranslate\(\'.{2}\|.{2}\'\))")]
.//meta[@name][re:match(@name,"(translation-stats)")]
.//a[@data-trp-gettext][re:match(@data-trp-gettext,"($^)")]
0.2544124073155009
Counter({('meta', 'name', 'translation-stats'): 33, ('a', 'href', '#'): 29, ('div', 'id', 'qtranslate-2'): 2, ('a', 'href', 'http://vsaduidoma.com'): 2, ('link', 'rel', 'alternate machine-translated-from'): 1, ('div', 'id', 'qtranslate-6'): 1, ('link', 'href', '2_default8.htm'): 1, ('a', 'href', 'http://specialfinds.com/af'): 1, ('div', 'class', ' bd-block-18 bd-own-margins widget qtranxs_widget'): 1, ('a', 'href', 'http://dalje.com/sq'): 1, 

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [05:02<00:00, 100.89s/it]


66.9458127807593 190.14872187760193 0.26039376087756944
<class '__main__.TagFilter3'>.//div[@id and re:match(@id,"(mqtranslate)|(wporg-translate)|(wp-translate)|(qtranslate)|(weglot)")]
.//link[@id and re:match(@id,"(wporg-translate-css)")]
.//link[@rel and re:match(@rel,"(alternate machine-translated-from)")]
.//script[@src and re:match(@src,"(qtranslate-x)")]
.//a[@onclick and re:match(@onclick,"(doGTranslate\(\'.{2}\|.{2}\'\))")]
.//meta[@name and re:match(@name,"(translation-stats)")]
.//a[@data-trp-gettext and re:match(@data-trp-gettext,"($^)")]
0.26039376087756944
Counter({('meta', 'name', 'translation-stats'): 33, ('a', 'href', '#'): 29, ('div', 'id', 'qtranslate-2'): 2, ('a', 'href', 'http://vsaduidoma.com'): 2, ('link', 'rel', 'alternate machine-translated-from'): 1, ('div', 'id', 'qtranslate-6'): 1, ('link', 'href', '2_default8.htm'): 1, ('a', 'href', 'http://specialfinds.com/af'): 1, ('div', 'class', ' bd-block-18 bd-own-margins widget qtranxs_widget'): 1, ('a', 'href', 'htt

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [05:02<00:00, 100.86s/it]


67.2351847392274 189.8764123650035 0.2615019528347872
<class '__main__.TagFilter31'>.//div[@id][re:match(@id,"(mqtranslate)|(wporg-translate)|(wp-translate)|(qtranslate)|(weglot)")]/@id
.//link[@id][re:match(@id,"(wporg-translate-css)")]/@id
.//link[@rel][re:match(@rel,"(alternate machine-translated-from)")]/@rel
.//script[@src][re:match(@src,"(qtranslate-x)")]/@src
.//a[@onclick][re:match(@onclick,"(doGTranslate\(\'.{2}\|.{2}\'\))")]/@onclick
.//meta[@name][re:match(@name,"(translation-stats)")]/@name
.//a[@data-trp-gettext][re:match(@data-trp-gettext,"($^)")]/@data-trp-gettext
0.2615019528347872
Counter({('meta', 'name', 'translation-stats'): 33, ('a', 'onclick', "doGTranslate('en|en');return false;"): 6, ('div', 'id', 'qtranslate-2'): 4, ('a', 'onclick', "doGTranslate('en|ar');jQuery('div.switcher div.selected a').html(jQuery(this).html());return false;"): 3, ('a', 'onclick', "doGTranslate('ru|ru');return false;"): 3, ('a', 'onclick', "doGTranslate('pt|pt');return false;"): 3, ('lin

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [05:05<00:00, 101.81s/it]


69.55263154290151 190.26708804396912 0.26769573785043904
<class '__main__.TagFilter31'>.//div[@id and re:match(@id,"(mqtranslate)|(wporg-translate)|(wp-translate)|(qtranslate)|(weglot)")]/@id
.//link[@id and re:match(@id,"(wporg-translate-css)")]/@id
.//link[@rel and re:match(@rel,"(alternate machine-translated-from)")]/@rel
.//script[@src and re:match(@src,"(qtranslate-x)")]/@src
.//a[@onclick and re:match(@onclick,"(doGTranslate\(\'.{2}\|.{2}\'\))")]/@onclick
.//meta[@name and re:match(@name,"(translation-stats)")]/@name
.//a[@data-trp-gettext and re:match(@data-trp-gettext,"($^)")]/@data-trp-gettext
0.26769573785043904
Counter({('meta', 'name', 'translation-stats'): 33, ('a', 'onclick', "doGTranslate('en|en');return false;"): 6, ('div', 'id', 'qtranslate-2'): 4, ('a', 'onclick', "doGTranslate('en|ar');jQuery('div.switcher div.selected a').html(jQuery(this).html());return false;"): 3, ('a', 'onclick', "doGTranslate('ru|ru');return false;"): 3, ('a', 'onclick', "doGTranslate('pt|pt');

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [04:59<00:00, 99.67s/it]


63.46736201143358 190.0003077116562 0.2503962816274394
<class '__main__.TagFilter4'>.//div[@id][re:match(@id,"(mqtranslate)|(wporg-translate)|(wp-translate)|(qtranslate)|(weglot)")] | .//link[@id][re:match(@id,"(wporg-translate-css)")] | .//link[@rel][re:match(@rel,"(alternate machine-translated-from)")] | .//script[@src][re:match(@src,"(qtranslate-x)")] | .//a[@onclick][re:match(@onclick,"(doGTranslate\(\'.{2}\|.{2}\'\))")] | .//meta[@name][re:match(@name,"(translation-stats)")] | .//a[@data-trp-gettext][re:match(@data-trp-gettext,"($^)")]
0.2503962816274394
Counter({('meta', 'name', 'translation-stats'): 33, ('a', 'href', '#'): 29, ('div', 'id', 'qtranslate-2'): 2, ('a', 'href', 'http://vsaduidoma.com'): 2, ('link', 'rel', 'alternate machine-translated-from'): 1, ('div', 'id', 'qtranslate-6'): 1, ('link', 'href', '2_default8.htm'): 1, ('a', 'href', 'http://specialfinds.com/af'): 1, ('div', 'class', ' bd-block-18 bd-own-margins widget qtranxs_widget'): 1, ('a', 'href', 'http://dalje.c

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [05:01<00:00, 100.33s/it]

65.42368156393059 190.09681025519967 0.25604084078798905
<class '__main__.TagFilter4'>.//div[@id and re:match(@id,"(mqtranslate)|(wporg-translate)|(wp-translate)|(qtranslate)|(weglot)")] | .//link[@id and re:match(@id,"(wporg-translate-css)")] | .//link[@rel and re:match(@rel,"(alternate machine-translated-from)")] | .//script[@src and re:match(@src,"(qtranslate-x)")] | .//a[@onclick and re:match(@onclick,"(doGTranslate\(\'.{2}\|.{2}\'\))")] | .//meta[@name and re:match(@name,"(translation-stats)")] | .//a[@data-trp-gettext and re:match(@data-trp-gettext,"($^)")]
0.25604084078798905
Counter({('meta', 'name', 'translation-stats'): 33, ('a', 'href', '#'): 29, ('div', 'id', 'qtranslate-2'): 2, ('a', 'href', 'http://vsaduidoma.com'): 2, ('link', 'rel', 'alternate machine-translated-from'): 1, ('div', 'id', 'qtranslate-6'): 1, ('link', 'href', '2_default8.htm'): 1, ('a', 'href', 'http://specialfinds.com/af'): 1, ('div', 'class', ' bd-block-18 bd-own-margins widget qtranxs_widget'): 1, ('a',


