In [1]:
import sys
import importlib
sys.path.append('../../')

import lxml.html as html

import knutils.autoscraper as kas

In [2]:
def make_classifier(root, rel_depth=5, fwditer_num=20):
    """Build a ``kas.AutoScraper`` whose attribute features are derived from ``root``.

    Three classifiers are registered, under the names ``'tags'``, ``'attrs'``
    and ``'iter'``:

    * ``'tags'``  -- ``TagClassifier`` wrapped in ``ContextClassifier`` and
      ``ParentRelClassifier``.
    * ``'attrs'`` -- ``AttribClassifier`` over the (key, value) pairs found in
      ``root``, wrapped the same way.
    * ``'iter'``  -- ``FwdIterClassifier`` over the same tag list.

    Args:
        root: Document handed to ``kas.AutoScraper.get_stats`` to collect the
            attribute key/value statistics.
        rel_depth: Forwarded to ``kas.ParentRelClassifier`` for both the
            ``'tags'`` and ``'attrs'`` classifiers.
        fwditer_num: Forwarded to ``kas.FwdIterClassifier``.

    Returns:
        A ``kas.AutoScraper`` constructed from the three named classifiers.
    """
    # get_stats returns five collections; only the per-key attribute
    # (value, count) statistics are used here.
    _, _, _, _, attrkv_stats = kas.AutoScraper.get_stats(root)

    p_used_tags = ['a', 'td', 'img', 'tr', 'div', 'table', 'script', 'meta', 'span', 'link',
                   'input', 'center', 'param', 'html', 'head', 'title', 'body', 'form',
                   'object', 'embed']
    t_used_attrk = ['class', 'id', 'name', 'title', 'type', 'src']

    # For every attribute key of interest, take each distinct value seen for it.
    # * .get(..., ()) tolerates pages that never use one of the keys (the
    #   previous direct indexing raised KeyError in that case).
    # * dict.fromkeys de-duplicates while keeping first-seen order, so the
    #   feature list is the same on every run instead of following set order.
    p_used_attrkv = [
        (key, value)
        for key in t_used_attrk
        for value in dict.fromkeys(v for v, _count in attrkv_stats.get(key, ()))
    ]

    # Named accessors evaluated relative to the element being classified.
    p_context = [('cur', lambda p: p),
                 ('prev-1', lambda p: kas.AutoScraper.get_sibling(p, -1)),
                 ('next-1', lambda p: kas.AutoScraper.get_sibling(p, +1)),
                 ('child-0', lambda p: kas.AutoScraper.get_child(p, 0)),
                 ('child-1', lambda p: kas.AutoScraper.get_child(p, 1)),
                 ('child-2', lambda p: kas.AutoScraper.get_child(p, 2))]

    tag_classifier = kas.ParentRelClassifier(
        kas.ContextClassifier(kas.TagClassifier(p_used_tags), p_context), rel_depth)
    attr_classifier = kas.ParentRelClassifier(
        kas.ContextClassifier(kas.AttribClassifier(p_used_attrkv), p_context), rel_depth)
    iter_classifier = kas.FwdIterClassifier(kas.TagClassifier(p_used_tags), fwditer_num)
    return kas.AutoScraper({'tags': tag_classifier, 'attrs': attr_classifier, 'iter': iter_classifier})
    
def custom_order(clsf_name, split, split_stat):
    """Score a candidate split: Gini value scaled by per-classifier penalties.

    The base score comes from ``kas.SplitStat.calc_gini``. It is then scaled:

    * ``'iter'``: x0.1 when the split's base tag name is ``'other'``, and
      always x0.25.
    * ``'tags'``: x0.1 when the innermost tag name is ``'other'``.
    * ``'tags'`` / ``'attrs'``: x0.1 when the split's context name is not
      ``'cur'``.

    Any other classifier name gets the unmodified Gini score.
    """
    score = kas.SplitStat.calc_gini(clsf_name, split, split_stat)

    if clsf_name == 'iter':
        if split.base().tag_name() == 'other':
            score = score * 0.1
        return score * 0.25

    if clsf_name not in ('tags', 'attrs'):
        return score

    # For 'tags'/'attrs' the split wraps a context-level split.
    context_split = split.base()
    if clsf_name == 'tags' and context_split.base().tag_name() == 'other':
        score = score * 0.1
    if context_split.name() != 'cur':
        score = score * 0.1
    return score

In [4]:
# Parse the sample listing page. The path is relative to the notebook's
# working directory, so the notebook must be run from its own folder.
root = html.parse('../../../DataSets/SiteSample/S3/List2.html')
# Build the classifier bundle from this page using the default
# rel_depth / fwditer_num values of make_classifier.
fin_classifier = make_classifier(root)

In [6]:
# Hand-pick the training targets from the parsed page.
# URL targets: every <a> whose href contains 'view.php'.
target_url = [
    node for node in root.iter()
    if node.tag == 'a' and 'view.php' in node.attrib.get('href', '')
]
# Description targets: for every element whose tail text contains 'Размер',
# take the element two levels up (its grandparent).
target_dsc = [
    node.getparent().getparent() for node in root.iter()
    if node.tail is not None and 'Размер' in node.tail
]
# Display both counts as the cell output.
len(target_url), len(target_dsc)

(18, 18)

In [7]:
%%time
# Fit one scraper per target set on the same page, both using custom_order
# to rank candidate splits. This is the slow step of the notebook (see the
# wall time reported below).
url_scraper = fin_classifier.fit(root, target_url, order_function=custom_order)
dsc_scraper = fin_classifier.fit(root, target_dsc, order_function=custom_order)

Wall time: 17.5 s


In [8]:
# Inspect the splits learned for the URL targets (rendered as nested tuples).
url_scraper.get_split_tree()

(('attrs', class=box_pic at context cur at rel-depth=1, 1033-0 vs 0-18),
 None,
 None)

In [9]:
# Inspect the splits learned for the description targets (rendered as nested tuples).
dsc_scraper.get_split_tree()

(('attrs', class=box at context cur at rel-depth=4, 997-0 vs 36-18),
 (('tags', <img> at context cur at rel-depth=0, 18-18 vs 18-0),
  None,
  (('attrs', class=box_head at context cur at rel-depth=1, 0-18 vs 18-0),
   None,
   None)),
 None)

In [10]:
# NOTE: the `importlib.reload(kas)` that used to open this cell was removed.
# url_scraper / dsc_scraper were built from the classes of the module as
# imported in the first cell; reloading here would make kas.MultiScraper and
# kas.ScraperNode come from a *new* set of class objects while the scrapers
# keep the old ones. On a fresh "Restart & Run All" the reload is redundant.
# While developing kas, enable `%autoreload` in the first cell instead.


def _dsc_only_entry(dsc_node):
    """Wrap a lone 'dsc' node in an 'entry' node placed at the dsc element's parent."""
    return kas.ScraperNode('entry', dsc_node.elem().getparent(), [dsc_node])


def _collect_entries(node):
    """Build a 'lst' node holding the node's 'entry' children plus wrapped stray 'dsc' children."""
    return kas.ScraperNode(
        'lst', node.elem(),
        list(node.get_list('entry')) + [_dsc_only_entry(dsc) for dsc in node.get_list('dsc')])


def _merge_lists(node):
    """Flatten the contents of all child 'lst' nodes into a single 'lst' node at this element."""
    # Nested comprehension instead of sum(list_of_lists, []), which copies the
    # accumulated list on every step.
    return kas.ScraperNode(
        'lst', node.elem(),
        [item for lst in node.get_list('lst') for item in lst.get_all()])


d_scrapers = {'url': url_scraper, 'dsc': dsc_scraper}
# Each rule pairs a child-kind pattern with either a node name or a builder
# callable. The tuples are presumably occurrence bounds per child kind --
# TODO confirm their exact meaning against kas.MultiScraper.
d_structure = [
    ({'url': (1, 1), 'dsc': (1, 1), 'other': (0, 0)}, 'entry'),
    ({'entry': (0,), 'dsc': (0,), 'other': (0, 0)}, _collect_entries),
    ({'lst': (1,), 'other': (0, 0)}, _merge_lists),
]
fin_scraper = kas.MultiScraper(d_scrapers, d_structure)

In [11]:
%%time
# Run the combined scraper over the same page the scrapers were fitted on.
r = fin_scraper.parse(root)

Wall time: 145 ms


In [14]:
# List everything collected under the parse result; the output below shows
# 'entry' nodes, each holding one url node and one dsc node.
r.get_all()

[entry(<Element table at 0x231ca628d18>) with [url(<Element a at 0x231ca1baea8>), dsc(<Element tr at 0x231ca243bd8>)],
 entry(<Element table at 0x231ca621408>) with [url(<Element a at 0x231ca243b38>), dsc(<Element tr at 0x231ca243c78>)],
 entry(<Element table at 0x231ca621638>) with [url(<Element a at 0x231ca243ae8>), dsc(<Element tr at 0x231ca243d18>)],
 entry(<Element table at 0x231ca621868>) with [url(<Element a at 0x231ca243b88>), dsc(<Element tr at 0x231ca243db8>)],
 entry(<Element table at 0x231ca621a98>) with [url(<Element a at 0x231ca243cc8>), dsc(<Element tr at 0x231ca243e58>)],
 entry(<Element table at 0x231ca621c78>) with [url(<Element a at 0x231ca243d68>), dsc(<Element tr at 0x231ca243ef8>)],
 entry(<Element table at 0x231ca621f98>) with [url(<Element a at 0x231ca243c28>), dsc(<Element tr at 0x231ca243f98>)],
 entry(<Element table at 0x231ca6171d8>) with [url(<Element a at 0x231ca243e08>), dsc(<Element tr at 0x231ca255098>)],
 entry(<Element table at 0x231ca617408>) with [u