In [1]:
import sys
import importlib
sys.path.append('../../')

import lxml.html as html

import knutils.autoscraper as kas

In [2]:
def make_classifier(root, rel_depth=5, fwditer_num=20):
    """Build an AutoScraper configured from the attribute statistics of ``root``.

    Three classifiers are registered on the returned scraper:

    * ``'tags'``  - ``TagClassifier`` over a fixed tag whitelist, wrapped in a
      ``ContextClassifier`` (current node, previous/next sibling, first three
      children) and then in ``ParentRelClassifier(…, rel_depth)``.
    * ``'attrs'`` - same wrapping, but around an ``AttribClassifier`` built from
      the (attribute, value) pairs actually observed in ``root``.
    * ``'iter'``  - ``FwdIterClassifier(TagClassifier(…), fwditer_num)``.

    Args:
        root: parsed document handed to ``kas.AutoScraper.get_stats``.
        rel_depth: forwarded to both ``ParentRelClassifier`` instances.
        fwditer_num: forwarded to ``FwdIterClassifier``.

    Returns:
        A ``kas.AutoScraper`` holding the three classifiers above.
    """
    # get_stats yields five collections; only the per-attribute value
    # statistics (attribute name -> iterable of (value, count)) are used here.
    _, _, _, _, attrkv_stats = kas.AutoScraper.get_stats(root)

    p_used_tags = ['a', 'td', 'img', 'tr', 'div', 'table', 'script', 'meta', 'span', 'link', 'input', 'center',
                   'param', 'html', 'head', 'title', 'body', 'form', 'object', 'embed']
    t_used_attrk = ['class', 'id', 'name', 'title', 'type', 'src']

    # Keep only the whitelisted attributes that really occur in this document.
    # Going through .items() means an attribute absent from the page is simply
    # skipped instead of being looked up (and possibly raising KeyError).
    observed = {k: v for (k, v) in attrkv_stats.items() if k in t_used_attrk}

    # One (attribute, value) pair per distinct value, in whitelist order and
    # then in the order the statistics list them. This is deterministic across
    # runs, unlike iterating over a set of strings.
    p_used_attrkv = []
    for attr_key in t_used_attrk:
        seen_values = set()
        for (attr_value, _count) in observed.get(attr_key, ()):
            if attr_value not in seen_values:
                seen_values.add(attr_value)
                p_used_attrkv.append((attr_key, attr_value))

    # Named accessors for the nodes examined around the current one.
    p_context = [('cur', lambda p: p),
                 ('prev-1', lambda p: kas.AutoScraper.get_sibling(p, -1)),
                 ('next-1', lambda p: kas.AutoScraper.get_sibling(p, +1)),
                 ('child-0', lambda p: kas.AutoScraper.get_child(p, 0)),
                 ('child-1', lambda p: kas.AutoScraper.get_child(p, 1)),
                 ('child-2', lambda p: kas.AutoScraper.get_child(p, 2))]

    classifier1 = kas.ParentRelClassifier(kas.ContextClassifier(kas.TagClassifier(p_used_tags), p_context), rel_depth)
    classifier2 = kas.ParentRelClassifier(kas.ContextClassifier(kas.AttribClassifier(p_used_attrkv), p_context), rel_depth)
    classifier3 = kas.FwdIterClassifier(kas.TagClassifier(p_used_tags), fwditer_num)
    return kas.AutoScraper({'tags': classifier1, 'attrs': classifier2, 'iter': classifier3})
    
def custom_order(clsf_name, split, split_stat):
    """Score a candidate split: its Gini value scaled by heuristic penalties.

    Penalties are multiplied in, in this order:

    * ``'iter'`` splits: x0.1 when the wrapped tag name is ``'other'``, then
      always x0.25.
    * ``'tags'`` splits: x0.1 when the innermost tag name is ``'other'``.
    * ``'tags'`` / ``'attrs'`` splits: x0.1 when the context name is not
      ``'cur'``.

    Any other classifier name gets the unmodified Gini value.

    NOTE(review): presumably a smaller score makes a split less preferred
    during fitting - confirm against how ``order_function`` is consumed.
    """
    score = kas.SplitStat.calc_gini(clsf_name, split, split_stat)

    # Collect the applicable factors first, then apply them in sequence.
    factors = []

    if clsf_name == 'iter':
        if split.base().tag_name() == 'other':
            factors.append(0.1)
        factors.append(0.25)
    elif clsf_name == 'tags' and split.base().base().tag_name() == 'other':
        factors.append(0.1)

    if clsf_name in ('tags', 'attrs') and split.base().name() != 'cur':
        factors.append(0.1)

    for factor in factors:
        score *= factor

    return score

In [5]:
# Parse the sample page from disk with lxml.html. The path is relative to the
# notebook's working directory. Later cells iterate over `root` and fit the
# scraper against it.
root = html.parse('../../../DataSets/SiteSample/S3/Data1.html')

In [24]:
def _ancestor(node, levels):
    """Return the ancestor ``levels`` steps above ``node``, or None if the tree is too shallow."""
    for _ in range(levels):
        if node is None:
            return None
        node = node.getparent()
    return node


# Hand-picked training targets for the two scrapers.
target_albums = []  # <a> elements whose href mentions 'album'
target_descr = []   # great-grandparents of elements whose text contains the marker substring
for x in root.iter():
    if x.tag == 'a' and 'album' in x.attrib.get('href', ''):
        target_albums.append(x)
    if x.text is not None and 'сложе' in x.text:
        # The target is the container three levels above the matching text.
        # A match sitting too close to the document root has no such
        # container, so it is skipped instead of appending None or failing.
        container = _ancestor(x, 3)
        if container is not None:
            target_descr.append(container)
# Several matches can share one container: drop duplicates while keeping
# document order, so the target list is the same on every run.
target_descr = list(dict.fromkeys(target_descr))

In [25]:
# Build the scraper for this page, using make_classifier's default
# rel_depth and fwditer_num.
fin_classifier = make_classifier(root)

In [26]:
%%time
# Fit once per target list (album links, then description containers).
# custom_order is passed as the order_function for both fits.
album_scraper = fin_classifier.fit(root, target_albums, order_function=custom_order)
descr_scraper = fin_classifier.fit(root, target_descr, order_function=custom_order)

Wall time: 11.6 s


In [27]:
# Display the split tree produced by the fit on the album-link targets.
album_scraper.get_split_tree()

(('attrs', class=data_list at context cur at rel-depth=2, 636-0 vs 0-1),
 None,
 None)

In [28]:
# Display the split tree produced by the fit on the description-container targets.
descr_scraper.get_split_tree()

(('attrs', id=main_info at context cur at rel-depth=0, 636-0 vs 0-1),
 None,
 None)