forked from microformats/mf2py
-
Notifications
You must be signed in to change notification settings - Fork 2
/
parse_property.py
75 lines (56 loc) · 2.01 KB
/
parse_property.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
from bs4 import Tag
from dom_helpers import get_attr
from urlparse import urljoin
## functions to parse the properties of elements
def text(el):
    """Return the plain-text value for a property element.

    A fixed sequence of ``get_attr`` lookups is tried in order:
    ``title`` with ``check_name="abbr"``, ``value`` with
    ``check_name=("data", "input")`` and ``alt`` with
    ``check_name=("img", "area")``.  The first lookup whose result is not
    ``None`` is returned as-is.  If every lookup yields ``None`` the
    result of ``el.get_text()`` is returned, unstripped.

    NOTE(review): ``get_attr`` comes from ``dom_helpers`` and is not shown
    here; presumably it only returns the attribute when the element's tag
    name matches ``check_name`` -- confirm against that module.
    """
    # TODO: add value-class-pattern
    lookups = (
        ("title", "abbr"),
        ("value", ("data", "input")),
        ("alt", ("img", "area")),
    )
    for attr_name, tag_names in lookups:
        found = get_attr(el, attr_name, check_name=tag_names)
        if found is not None:
            return found

    # TODO: see if get_text() replaces img with alts
    # TODO: strip here?
    return el.get_text()
def url(el, base_url=''):
    """Return the URL value for a property element.

    Lookups are made through ``get_attr`` in this order; the first result
    that is not ``None`` wins:

    1. ``href`` with ``check_name=("a", "area")``, ``src`` with
       ``check_name="img"``, ``data`` with ``check_name="object"`` --
       the value is resolved against ``base_url`` with ``urljoin``.
    2. ``title`` with ``check_name="abbr"``, then ``value`` with
       ``check_name=("data", "input")`` -- the value is returned verbatim.
    3. Otherwise ``el.get_text()`` is returned, unstripped.

    :param el: element to read the property from (imported ``Tag`` from
        bs4 suggests a BeautifulSoup tag -- confirm against callers).
    :param base_url: base used by ``urljoin`` for step 1; defaults to ``''``.
    :returns: the first non-``None`` lookup result, or the element text.

    NOTE(review): ``get_attr`` comes from ``dom_helpers`` and is not shown
    here; presumably it only returns the attribute when the element's tag
    name matches ``check_name`` -- confirm against that module.
    """
    # Attributes that carry a URL: normalise to an absolute URL.
    url_lookups = (
        ("href", ("a", "area")),
        ("src", "img"),
        ("data", "object"),
    )
    for attr_name, tag_names in url_lookups:
        prop_value = get_attr(el, attr_name, check_name=tag_names)
        if prop_value is not None:
            return urljoin(base_url, prop_value)

    # TODO: add value-class-pattern

    prop_value = get_attr(el, "title", check_name="abbr")
    if prop_value is not None:
        return prop_value

    # This step used to repeat the object[data] lookup, which could never
    # succeed because the same lookup already returned above.  The sibling
    # parsers fall back to the ``value`` of data/input here, so do the same.
    prop_value = get_attr(el, "value", check_name=("data", "input"))
    if prop_value is not None:
        return prop_value

    # TODO: strip here?
    return el.get_text()
def datetime(el):
    """Return the date/time value for a property element.

    A fixed sequence of ``get_attr`` lookups is tried in order:
    ``datetime`` with ``check_name=("time", "ins", "del")``, ``title``
    with ``check_name="abbr"`` and ``value`` with
    ``check_name=("data", "input")``.  The first lookup whose result is
    not ``None`` is returned as-is; no date parsing or validation happens
    here.  If every lookup yields ``None`` the result of ``el.get_text()``
    is returned, unstripped.

    NOTE(review): ``get_attr`` comes from ``dom_helpers`` and is not shown
    here; presumably it only returns the attribute when the element's tag
    name matches ``check_name`` -- confirm against that module.
    """
    # TODO: add value-class-pattern
    sources = [
        ("datetime", ("time", "ins", "del")),
        ("title", "abbr"),
        ("value", ("data", "input")),
    ]
    for attribute, allowed_tags in sources:
        candidate = get_attr(el, attribute, check_name=allowed_tags)
        if candidate is None:
            continue
        return candidate

    # TODO: strip here?
    return el.get_text()
def embedded(el):
    """Return the embedded-markup value for a property element.

    The result is a dict with two keys:

    * ``'html'``  -- the ``unicode()`` form of every item in
      ``el.children``, concatenated in order.
    * ``'value'`` -- the result of ``el.get_text()``, unstripped.
    """
    # Serialise the children first, then take the text, matching the
    # order in which the two values were originally computed.
    inner_markup = ''.join(unicode(child) for child in el.children)
    plain_text = el.get_text()  # TODO: strip here?
    return {'html': inner_markup, 'value': plain_text}