In [1]:
import sys
import importlib
# `knutils` lives two directories above this notebook; extend the import path so
# the `knutils.autoscraper` import below resolves.
sys.path.append('../../')

import lxml.html as html

import knutils.autoscraper as kas

In [80]:
# Re-execute knutils.autoscraper so that edits made to the module on disk are
# picked up without restarting the kernel.
importlib.reload(kas)

<module 'knutils.autoscraper' from '../..\\knutils\\autoscraper.py'>

In [36]:
def make_classifier(root, rel_depth=5, fwditer_num=20):
    """Build a ``kas.AutoScraper`` whose attribute features are taken from ``root``.

    Three classifiers are registered on the returned scraper:

    * ``'tags'``  - ``TagClassifier`` over a fixed tag whitelist, evaluated in
      several contexts (the node itself, its previous/next sibling and its
      first three children) and wrapped in ``ParentRelClassifier``.
    * ``'attrs'`` - ``AttribClassifier`` over the ``(key, value)`` attribute
      pairs found in ``root``, with the same context/parent wrapping.
    * ``'iter'``  - ``FwdIterClassifier`` over the same tag whitelist.

    Parameters
    ----------
    root
        Parsed document handed to ``kas.AutoScraper.get_stats``.
    rel_depth : int
        Forwarded to ``kas.ParentRelClassifier`` (presumably the number of
        ancestor levels considered - confirm in ``knutils.autoscraper``).
    fwditer_num : int
        Forwarded to ``kas.FwdIterClassifier``.

    Returns
    -------
    kas.AutoScraper
        Scraper configured with the ``'tags'``, ``'attrs'`` and ``'iter'``
        classifiers.
    """
    # Only the per-key attribute value statistics are used here; the other
    # four results of get_stats are intentionally discarded.
    _, _, _, _, attrkv_stats = kas.AutoScraper.get_stats(root)

    used_tags = ['a', 'td', 'img', 'tr', 'div', 'table', 'script', 'meta', 'span', 'link',
                 'input', 'center', 'param', 'html', 'head', 'title', 'body', 'form',
                 'object', 'embed']
    used_attr_keys = ['class', 'id', 'name', 'title', 'type', 'src']

    # Every distinct value observed for each whitelisted attribute key.
    # * Keys that never occur in the document are skipped instead of being
    #   indexed (which would raise KeyError on a plain dict).
    # * dict.fromkeys de-duplicates while keeping first-seen order, so the
    #   feature list is the same on every run (the previous set-based
    #   construction produced the same pairs in hash order).
    used_attr_pairs = [(key, value)
                       for key in used_attr_keys
                       if key in attrkv_stats
                       for value in dict.fromkeys(z for (z, n) in attrkv_stats[key])]

    # (label, accessor) pairs: which neighbour of a node a base classifier is applied to.
    contexts = [('cur', lambda p: p),
                ('prev-1', lambda p: kas.AutoScraper.get_sibling(p, -1)),
                ('next-1', lambda p: kas.AutoScraper.get_sibling(p, +1)),
                ('child-0', lambda p: kas.AutoScraper.get_child(p, 0)),
                ('child-1', lambda p: kas.AutoScraper.get_child(p, 1)),
                ('child-2', lambda p: kas.AutoScraper.get_child(p, 2))]

    tags_clsf = kas.ParentRelClassifier(
        kas.ContextClassifier(kas.TagClassifier(used_tags), contexts), rel_depth)
    attrs_clsf = kas.ParentRelClassifier(
        kas.ContextClassifier(kas.AttribClassifier(used_attr_pairs), contexts), rel_depth)
    iter_clsf = kas.FwdIterClassifier(kas.TagClassifier(used_tags), fwditer_num)
    return kas.AutoScraper({'tags': tags_clsf, 'attrs': attrs_clsf, 'iter': iter_clsf})
    
def custom_order(clsf_name, split, split_stat):
    """Order function for ``fit``: the Gini score scaled by hand-tuned factors.

    The base value comes from ``kas.SplitStat.calc_gini``.  It is then
    multiplied, in this order, by:

    * ``'iter'``            - 0.1 if ``split.base().tag_name()`` is ``'other'``,
      then always 0.25;
    * ``'tags'``            - 0.1 if ``split.base().base().tag_name()`` is
      ``'other'``;
    * ``'tags'``/``'attrs'`` - 0.1 if ``split.base().name()`` is not ``'cur'``.

    Any other classifier name gets the unmodified Gini score.
    """
    score = kas.SplitStat.calc_gini(clsf_name, split, split_stat)

    if clsf_name == 'iter':
        if split.base().tag_name() == 'other':
            score *= 0.1
        score *= 0.25
        return score

    if clsf_name not in ('tags', 'attrs'):
        return score

    # Catch-all 'other' tag bucket (only the tag classifier has one here).
    if clsf_name == 'tags' and split.base().base().tag_name() == 'other':
        score *= 0.1

    # Splits that look at anything except the 'cur' context are scaled down.
    if split.base().name() != 'cur':
        score *= 0.1

    return score

In [5]:
# Parse the saved sample listing page; `root` is the document that the cells
# below collect targets from, fit on and finally parse.
root = html.parse('../../../DataSets/SiteSample/S3/List1.html')

In [35]:
# Hand-label the training targets by walking every node of the sample page.
target_url, target_dsc = [], []

for node in root.iter():
    # URL targets: <a> elements whose href mentions view.php.
    if node.tag == 'a' and 'view.php' in node.attrib.get('href', ''):
        target_url.append(node)
    # Description targets: the grandparent of any node whose tail text contains 'Размер'.
    if 'Размер' in (node.tail or ''):
        target_dsc.append(node.getparent().getparent())

len(target_url), len(target_dsc)

(27, 40)

In [39]:
# Build the AutoScraper (default rel_depth / fwditer_num) from the statistics of the sample page.
fin_classifier = make_classifier(root)

In [40]:
%%time
# Fit once per target set (view.php links, description blocks), both times
# ranking candidate splits with custom_order.
url_scraper = fin_classifier.fit(root, target_url, order_function=custom_order)
dsc_scraper = fin_classifier.fit(root, target_dsc, order_function=custom_order)

Wall time: 35.7 s


In [41]:
# Inspect the splits learned for the URL targets.
url_scraper.get_split_tree()

(('attrs', class=box_pic at context cur at rel-depth=1, 1817-0 vs 13-27),
 (('attrs',
   src=/images/lock.gif at context child-0 at rel-depth=0,
   0-27 vs 13-0),
  None,
  None),
 None)

In [42]:
# Inspect the splits learned for the description targets.
dsc_scraper.get_split_tree()

(('attrs', class=box at context cur at rel-depth=4, 1737-0 vs 80-40),
 (('tags', <img> at context cur at rel-depth=0, 40-40 vs 40-0),
  None,
  (('attrs', class=box_head at context cur at rel-depth=1, 0-40 vs 40-0),
   None,
   None)),
 None)

In [217]:
importlib.reload(kas)
d_scrapers = {'url': url_scraper, 'dsc': dsc_scraper}
d_structure = [({'url':(1,1), 'dsc':(1,1), 'other':(0,0)}, 'entry'),
               ({'entry':(0,), 'dsc':(0,), 'other':(0,0)}, lambda x: kas.ScraperNode('lst', x.elem(), 
                     [y for y in x.get_list('entry')] +
                    [kas.ScraperNode('entry', y.elem().getparent(), [y]) for y in x.get_list('dsc')])),
               ({'lst':(1,), 'other':(0,0)}, lambda x: kas.ScraperNode('lst', x.elem(),
                    sum([z.get_all() for z in x.get_list('lst')],[]))
               )]
fin_scraper = kas.MultiScraper(d_scrapers, d_structure)

In [219]:
%%time
# Run the combined scraper over the sample page; `r` holds whatever parse() returns.
r = fin_scraper.parse(root)

Wall time: 358 ms
