/
wiktionary_parser.py
125 lines (102 loc) · 4.08 KB
/
wiktionary_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# simple parser for English Wiktionary
from HTMLParser import HTMLParser
import re
import sys
from xml_parser import XMLParser
class WiktParser(XMLParser):
    """Extract headwords and their definitions from English Wiktionary XML.

    Every method is static; the class serves as a namespace for the compiled
    patterns below and the helpers that use them.  ``iter_sections`` and
    ``get_section`` are not defined here and are expected to be provided by
    the ``XMLParser`` base class.
    """

    # Used only for its ``unescape`` method (HTML/XML entity decoding).
    html_parser = HTMLParser()

    # A wiki heading line such as "===Noun==="; group 1 is the heading text.
    header_regex = re.compile(r"^=+([^=]*?)=+$", re.M)
    # Everything from the "==English==" heading to the end of the page text.
    lang_section_regex = re.compile(r'==English==$.*', re.M | re.S)
    # A heading line, then text containing no '=', then a '#' list, up to the
    # '=' that opens the next heading line.
    # NOTE(review): because the pattern ends in "^=", a definition list that
    # is not followed by another heading line is never matched.
    defs_section_regex = re.compile(
        r"^=+[^=$]*?=+$[^=]*?^#.*?^=", re.M | re.S)
    # A '#' list item whose second character is not '#', ':' or '*', i.e. the
    # pattern skips "##", "#:" and "#*" lines.
    def_regex = re.compile(r"^#([^#:\*].*)", re.M)
    # A {{template}} call (non-nested, single line).
    double_curly_regex = re.compile(r"{{.*?}}")

    # Piped links "[[target|label]]" are replaced by their label.  The target
    # group must not cross brackets or pipes; otherwise an earlier unpiped
    # link on the same line ("[[a]] [[b|c]]") would be swallowed whole.
    replacements = [(re.compile(pattern), subst) for pattern, subst in [
        (r"\[\[([^\[\]|]*?)\|(.*?)\]\]", r"\2")]]

    # Markup that is simply deleted.  Order matters: bold (''') has to be
    # removed before italic ('') so that no stray apostrophe is left behind,
    # and the <ref> pattern is non-greedy so that text between two separate
    # references on one line survives.
    patterns_to_remove = [re.compile(pattern) for pattern in [
        r"\[\[", r"\]\]", r"<ref>.*?</ref>", "'''", "''"]]

    pos_name_map = {  # entries with categories not listed shall be omitted
        'noun': 'n', 'proper noun': 'n', 'verb': 'v', 'adjective': 'adj',
        'adverb': 'adv', 'initialism': 'n', 'pronoun': 'n',
        'abbreviation': 'n', 'numeral': 'num', 'interjection': 'interj',
        'definitions': 'n',  # this means the POS is unknown
        'preposition': 'prp', 'conjunction': 'conj', 'acronym': 'n',
        'cardinal numeral': 'num', 'cardinal number': 'num', 'number': 'num',
        'article': 'art', 'particle': 'part', 'determiner': 'det', }

    @staticmethod
    def get_pages(text):
        """Return the 'page' sections of ``text`` via ``iter_sections``."""
        return WiktParser.iter_sections('page', text)

    @staticmethod
    def get_pos(section):
        """Map the heading that opens ``section`` to a part-of-speech tag.

        Returns the tag from ``pos_name_map``, or ``False`` when the section
        does not start with a heading or the heading is not in the map.
        """
        match = WiktParser.header_regex.match(section)
        if match is None:
            return False
        # Wiki headings may pad the title with spaces ("== Noun ==").
        header = match.group(1).strip().lower()
        return WiktParser.pos_name_map.get(header, False)

    @staticmethod
    def parse_definition(definition):
        """Turn one raw definition line into plain text.

        Entities are decoded and wiki markup (templates, links, references,
        bold/italic quotes) is removed.  Only the part before the first
        semicolon is kept, and the result is capped at 300 characters.
        Returns the cleaned, stripped string, which may be empty.
        """
        d = definition.strip()
        # Decode entities and strip markup *before* splitting on ';',
        # otherwise the semicolon that terminates an entity such as "&amp;",
        # or one inside a template, would truncate the definition.
        d = WiktParser.html_parser.unescape(d)
        d = WiktParser.double_curly_regex.sub('', d)
        for pattern, subst in WiktParser.replacements:
            d = pattern.sub(subst, d)
        for pattern in WiktParser.patterns_to_remove:
            d = pattern.sub("", d)
        # semi-colons usually separate two definitions on the same line
        d = d.split(';')[0]
        # if a definition is longer than 300 characters, that's probably a bug
        # and it will cause memory errors when parsing
        d = d[:300]
        return d.strip()

    @staticmethod
    def get_definitions(section):
        """Return the list of non-empty cleaned definitions in ``section``."""
        parsed = (WiktParser.parse_definition(raw)
                  for raw in WiktParser.def_regex.findall(section))
        # Build a real list: parse_page relies on its truthiness.
        return [d for d in parsed if d]

    @staticmethod
    def parse_page(page):
        """Parse one page into a headword entry.

        Returns ``None`` when the page has no usable title, the title
        contains ':', there is no English section, no definitions section,
        the section heading is not a known part of speech, or no definition
        survives cleaning.  Otherwise returns::

            {"hw": headword,
             "senses": [{"full_form": headword, "pos": pos,
                         "definition": text}, ...]}

        Only the first definitions section found is used.
        """
        headword = WiktParser.get_section('title', page)
        # NOTE(review): get_section is defined outside this file; the
        # emptiness check guards against it returning None or "" - confirm.
        if not headword or ":" in headword:
            return None
        lang_section = WiktParser.lang_section_regex.search(page)
        if lang_section is None:
            return None
        defs_section = WiktParser.defs_section_regex.search(
            lang_section.group())
        if defs_section is None:
            return None
        section_text = defs_section.group()
        pos = WiktParser.get_pos(section_text)
        if pos is False:
            return None
        definitions = WiktParser.get_definitions(section_text)
        if not definitions:
            return None
        return {
            "hw": headword,
            "senses": [{
                "full_form": headword,
                "pos": pos,
                "definition": definition}
                for definition in definitions]}

    @staticmethod
    def parse_xml(xml):
        """Yield the parsed entry of every page in ``xml`` that yields one."""
        for page in WiktParser.get_pages(xml):
            parsed_page = WiktParser.parse_page(page)
            if parsed_page is not None:
                yield parsed_page
def test():
    """Read XML from standard input and print every parsed entry.

    Each entry is the dictionary produced by ``WiktParser.parse_xml``.
    """
    xml = sys.stdin.read()
    for entry in WiktParser.parse_xml(xml):
        # Parenthesised single-argument form: prints the same on Python 2
        # and is also valid syntax on Python 3.
        print(entry)
def print_defs():
    """Read XML from standard input and print its definitions.

    Writes one line per sense with a non-empty definition, formatted as
    the headword, a tab character, and the definition text.
    """
    xml = sys.stdin.read()
    for entry in WiktParser.parse_xml(xml):
        hw = entry['hw']
        # An empty sense list simply produces no output.
        for sense in entry['senses']:
            d = sense['definition']
            if d:
                # Parenthesised single-argument form: prints the same on
                # Python 2 and is also valid syntax on Python 3.
                print("{0}\t{1}".format(hw, d))
if __name__ == "__main__":
    # Run as a script: dump "headword<TAB>definition" lines for the XML
    # supplied on standard input.
    print_defs()