In [1]:
import sys
import importlib
sys.path.append('../../')

import lxml.html as html

import knutils.autoscraper as kas

In [111]:
# Reload knutils.autoscraper so edits to the module take effect without restarting the kernel.
# NOTE(review): the execution count (111) shows this cell ran out of order. Objects created
# from `kas` before a reload keep their pre-reload classes -- confirm a fresh
# "Restart & Run All" gives the same results.
importlib.reload(kas)

<module 'knutils.autoscraper' from '../..\\knutils\\autoscraper.py'>

In [28]:
def make_classifier(root, rel_depth=5, fwditer_num=20):
    """Build a ``kas.AutoScraper`` with tag, attribute and forward-iteration classifiers.

    Args:
        root: Document passed to ``kas.AutoScraper.get_stats`` to collect the
            per-key attribute value statistics.
        rel_depth: Depth argument forwarded to both ``kas.ParentRelClassifier``
            instances.
        fwditer_num: Count argument forwarded to ``kas.FwdIterClassifier``.

    Returns:
        A ``kas.AutoScraper`` holding the three classifiers under the names
        ``'tags'``, ``'attrs'`` and ``'iter'``.
    """
    # get_stats returns five collections; only the per-key value stats are used here.
    _depth_stats, _tag_stats, _attrk_stats, _attrv_stats, attrkv_stats = kas.AutoScraper.get_stats(root)

    p_used_tags = ['a', 'td', 'img', 'tr', 'div', 'table', 'script', 'meta', 'span', 'link',
                   'input', 'center', 'param', 'html', 'head', 'title', 'body', 'form',
                   'object', 'embed']
    t_used_attrk = ['class', 'id', 'name', 'title', 'type']

    # One (key, value) feature per distinct value observed under each tracked key.
    # Keys that never occur in the document are skipped instead of raising KeyError.
    # dict.fromkeys de-duplicates while keeping first-seen order, so the feature
    # list is the same on every run (iteration order of a set of str is not).
    p_used_attrkv = []
    for key in t_used_attrk:
        if key not in attrkv_stats:
            continue
        distinct_values = dict.fromkeys(entry[0] for entry in attrkv_stats[key])
        p_used_attrkv.extend((key, value) for value in distinct_values)

    # Named accessors for the element itself and its neighbours; each receives an
    # element and returns the related element via the kas.AutoScraper helpers.
    p_context = [('cur', lambda p: p),
                 ('prev-1', lambda p: kas.AutoScraper.get_sibling(p, -1)),
                 ('next-1', lambda p: kas.AutoScraper.get_sibling(p, +1)),
                 ('child-0', lambda p: kas.AutoScraper.get_child(p, 0)),
                 ('child-1', lambda p: kas.AutoScraper.get_child(p, 1)),
                 ('child-2', lambda p: kas.AutoScraper.get_child(p, 2))]

    tag_classifier = kas.ParentRelClassifier(
        kas.ContextClassifier(kas.TagClassifier(p_used_tags), p_context), rel_depth)
    attr_classifier = kas.ParentRelClassifier(
        kas.ContextClassifier(kas.AttribClassifier(p_used_attrkv), p_context), rel_depth)
    iter_classifier = kas.FwdIterClassifier(kas.TagClassifier(p_used_tags), fwditer_num)
    return kas.AutoScraper({'tags': tag_classifier, 'attrs': attr_classifier, 'iter': iter_classifier})
    
def custom_order(clsf_name, split, split_stat):
    """Score a candidate split: the Gini value scaled by hand-tuned factors.

    The base score comes from ``kas.SplitStat.calc_gini``. It is then adjusted
    depending on which classifier produced the split:

    * ``'iter'``: multiplied by 0.1 when the tag is ``'other'``, then always by 0.25.
    * ``'tags'``: multiplied by 0.1 when the tag is ``'other'``.
    * ``'tags'`` / ``'attrs'``: multiplied by 0.1 when the context is not ``'cur'``.
    * ``'attrs'``: forced to 0 when the value name contains ``'news-id-'``.

    ``split`` is only inspected for these three classifier names.
    """
    score = kas.SplitStat.calc_gini(clsf_name, split, split_stat)

    from_iter = clsf_name == 'iter'
    from_tags = clsf_name == 'tags'
    from_attrs = clsf_name == 'attrs'

    # Tag-based adjustments.
    if from_iter:
        if split.base().tag_name() == 'other':
            score *= 0.1
        score *= 0.25
    elif from_tags and split.base().base().tag_name() == 'other':
        score *= 0.1

    # Context-based adjustment: anything other than the element itself is scaled down.
    if (from_tags or from_attrs) and split.base().name() != 'cur':
        score *= 0.1

    # Attribute values containing 'news-id-' are zeroed out entirely.
    if from_attrs and 'news-id-' in split.base().base().value_name():
        return 0

    return score

In [29]:
# Parse the saved sample page with lxml.html; `root` is what the later cells iterate over and scrape.
root = html.parse('../../../DataSets/SiteSample/S2/Data.html')

In [30]:
# Collect the training targets in a single pass over the document:
#   * target_pics -- <a> elements whose href contains 'displayimage'
#   * target_cmts -- the grandparent of every element whose text contains 'Сегодня'
target_pics, target_cmts = [], []
for node in root.iter():
    if node.tag == 'a' and 'displayimage' in node.attrib.get('href', ''):
        target_pics.append(node)
    if 'Сегодня' in (node.text or ''):
        target_cmts.append(node.getparent().getparent())

In [31]:
# Build the scraper for this page, using make_classifier's default rel_depth and fwditer_num.
fin_classifier = make_classifier(root)

In [32]:
%%time
# Fit once per target set on the same page; both fits use custom_order as the order function.
pic_scraper = fin_classifier.fit(root, target_pics, order_function=custom_order)
cmt_scraper = fin_classifier.fit(root, target_cmts, order_function=custom_order)

Wall time: 4 s


In [33]:
# Display the split tree of the scraper fitted on the comment targets.
cmt_scraper.get_split_tree()

(('attrs', id=dle-content at context cur at rel-depth=1, 312-0 vs 6-3),
 (('tags', <div> at context cur at rel-depth=0, 5-0 vs 1-3),
  (('iter', <a> at fwd-iter-id=2, 1-0 vs 0-3), None, None),
  None),
 None)

In [34]:
# Display the split tree of the scraper fitted on the picture targets.
pic_scraper.get_split_tree()

(('attrs', id=fullstory at context cur at rel-depth=3, 302-0 vs 12-7),
 (('tags', <a> at context cur at rel-depth=0, 12-0 vs 0-7), None, None),
 None)

In [112]:
# Fitted scrapers, keyed by the labels that the structure rules refer to.
d_scrapers = {
    'comment': cmt_scraper,
    'picture': pic_scraper,
}

# Structure rules, applied as given to kas.MultiScraper. Each rule pairs a
# {label: tuple} mapping with either a name for the produced node ('pics') or a
# callable that builds the node.
# NOTE(review): the meaning of the tuples is defined by kas.MultiScraper -- confirm there.
d_structure = [
    (
        {'picture': (1,), 'other': (0, 0)},
        'pics',
    ),
    (
        {'comment': (0,), 'pics': (1, 1), 'other': (0, 0)},
        # Merge the comments with the pictures nested under 'pics' into one 'data' node.
        lambda node: kas.ScraperNode(
            'data',
            node.elem(),
            node.get_list('comment') + node.get('pics').get_list('picture'),
        ),
    ),
]

fin_scraper = kas.MultiScraper(d_scrapers, d_structure)

In [114]:
# Run the combined scraper over the parsed page.
res = fin_scraper.parse(root)

In [115]:
# Display everything get_all() returns for the parse result.
res.get_all()

[picture(<Element a at 0x29b3f14ad68>),
 picture(<Element a at 0x29b3f0a7d68>),
 picture(<Element a at 0x29b3f14a818>),
 picture(<Element a at 0x29b3f14acc8>),
 picture(<Element a at 0x29b3f3eb4f8>),
 picture(<Element a at 0x29b3f3eb458>),
 picture(<Element a at 0x29b3f3eb228>),
 comment(<Element div at 0x29b3f161368>),
 comment(<Element div at 0x29b3f1616d8>),
 comment(<Element div at 0x29b3f161728>)]