/
nszt_parser.py
150 lines (132 loc) · 5.62 KB
/
nszt_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/bin/python
# -*- coding: utf-8 -*-
import io
import re
import sys
import textwrap
# import json
class NSzTParser(object):
    """Regex-based extractor for dictionary entries written in XML-like markup.

    Every method is static; the class only acts as a namespace.  An entry is
    the text matched from ``<entry`` up to ``</entry`` and is converted into a
    dict holding the headword under ``'hw'`` plus either ``'senses'`` (a list
    of sense dicts) or, for ``<entryxr`` entries, ``'redirect'``.
    """

    # Inline markup stripped from headwords.
    _HEADWORD_TAGS = (r'<hom>[1-9]</hom>', r'</?deduced>', r'</?reflex>')

    # Tag bodies blanked out of definitions; each one is wrapped as
    # '</?' + body + '>' before use, so 'tr>.+?</tr' removes a whole
    # <tr>...</tr> element including its content.
    _DEFINITION_TAGS = ('gloss', 'mention', 'syn', 'tr>.+?</tr',
                        'hom>[1-9]</hom', 'sub', 'syn special="no"', 'mean')

    # Ordered (pattern, replacement) pairs applied to a definition: markup
    # first, then abbreviation expansion, then whitespace cleanup.  The last
    # two pairs must stay last, in this order.
    _REPLACEMENTS = (
        (r'</?hint>', ''),
        (r'<syn special="semicolon">', '; '),
        (r'<syn special="comma">', ', '),
        (r'<syn special="ill">', ' illetve '),
        (r'<syn special="v">', ' vagy '),
        (r' es\.', ' esetleg'),
        (r' gyakr\.', ' gyakran'),
        (r' haszn\.', u' haszn\xe1lt'),
        (r' ill\.', ' illetve'),
        (r' kapcs\.', ' kapcsolatos'),
        # NOTE(review): "k\xf6l." is kept exactly as in the original table;
        # it may have been meant as "k\xfcl." -- confirm against the data.
        (u' k\xf6l\\.', u' k\xfcl\xf6n\xf6sen'),
        (r' rendsz\.', ' rendszerint'),
        (r' ritk\.', u' ritk\xe1bban'),
        (r' v\.', ' vagy'),
        (r' vonatk\.', u' vonatkoz\xf3'),
        (u' \xe1lt\\.', u' \xe1ltal\xe1ban'),
        (r' vm', ' valam'),
        (r' vki', ' valaki'),
        (r' mn ', u' mell\xe9kn\xe9v '),
        (r' fn ', u' f\u0151n\xe9v '),
        # The dots below are escaped on purpose: an unescaped '.' matches any
        # character and would rewrite ordinary words starting with "pl".
        (r' pl\.', u' p\xe9ld\xe1ul'),
        (u' \xfan\\.', u' \xfagynevezett'),
        (r' {2,}', ' '),
        (r' ,', ','),
    )

    @staticmethod
    def _write_line(text=u''):
        """Write *text* and a newline to standard output as UTF-8 bytes."""
        payload = (text + u'\n').encode('utf-8')
        binary_out = getattr(sys.stdout, 'buffer', None)
        if binary_out is not None:
            # Python 3 text stream: bypass its encoding, emit UTF-8 directly.
            binary_out.write(payload)
        elif str is bytes:
            # Python 2: stdout accepts byte strings.
            sys.stdout.write(payload)
        else:
            # Python 3 stream without a binary layer (e.g. io.StringIO).
            sys.stdout.write(payload.decode('utf-8'))

    @staticmethod
    def _write_wrapped(text):
        """Write *text* wrapped by ``textwrap.fill`` with a one-space indent."""
        NSzTParser._write_line(textwrap.fill(
            text, initial_indent=' ', subsequent_indent=' '))

    @staticmethod
    def print_definitions(definitions):
        """Print every parsed entry to standard output, UTF-8 encoded.

        For each entry a blank line and the headword are printed, followed by
        the redirect target (if any) and, for every sense, the optional latin
        name and the definition.

        Args:
            definitions: iterable of entry dicts as built by ``parse_entry``.
        """
        for section in definitions:
            NSzTParser._write_line()
            NSzTParser._write_line(section['hw'])
            if 'redirect' in section:
                NSzTParser._write_wrapped('redirect: ' + section['redirect'])
            if 'senses' in section:
                for sense in section['senses']:
                    if 'latin' in sense:
                        NSzTParser._write_wrapped('latin: ' + sense['latin'])
                    NSzTParser._write_wrapped(sense['definition'])

    @staticmethod
    def parse_file(input_file):
        """Yield one parsed entry dict per entry found in *input_file*.

        The file is read as UTF-8 in one go.  Entries whose ``<lemma>`` is
        empty are not matched and therefore skipped.

        Args:
            input_file: path of the file to read.

        Yields:
            The dict returned by ``parse_entry`` for each matched entry.
        """
        # The context manager closes the handle as soon as the text is read.
        with io.open(input_file, encoding='utf-8') as handle:
            text = handle.read().strip()
        # '.' does not match newlines here, so a match never spans lines.
        for match in re.finditer(r'<entry.+?<lemma>.+?</lemma>.*?</entry',
                                 text):
            yield NSzTParser.parse_entry(match.group(0))

    @staticmethod
    def parse_entry(entry):
        """Convert the raw text of one entry into a dict.

        Args:
            entry: entry text starting at ``<entry``.

        Returns:
            ``{'hw': ..., 'redirect': ...}`` when the text starts with
            ``<entryxr``, otherwise ``{'hw': ..., 'senses': [...]}``.
        """
        entry_dict = {'hw': NSzTParser.get_hw(entry)}
        if entry.startswith('<entryxr'):
            entry_dict['redirect'] = NSzTParser.get_xr(entry)
        else:
            entry_dict['senses'] = NSzTParser.get_senses(entry)
        return entry_dict

    @staticmethod
    def get_hw(entry):
        """Return the headword: the first ``<lemma>`` content minus inline tags.

        Raises:
            AttributeError: if *entry* has no non-empty ``<lemma>`` element.
        """
        hw = re.search(r'<lemma>(.+?)</lemma>', entry, re.S).group(1)
        for tag in NSzTParser._HEADWORD_TAGS:
            hw = re.sub(tag, '', hw)
        return hw

    @staticmethod
    def get_senses(entry):
        """Return the list of sense dicts found in *entry*.

        A headword that starts or ends with ``-`` (prefix/suffix entry) gives
        a single sense built from the first ``<def>`` of the entry.  Otherwise
        every ``<def>`` inside a ``<mainsens>`` becomes one sense, except
        those consisting only of ``<same/>``.  A sense containing ``<tr>``
        also gets a ``'latin'`` key.

        Raises:
            AttributeError: if a prefix/suffix entry has no ``<def>`` element.
        """
        hw = NSzTParser.get_hw(entry)
        # startswith/endswith are safe when the headword is empty after tag
        # stripping, where indexing hw[0] would raise IndexError.
        if hw.startswith('-') or hw.endswith('-'):
            raw_definition = re.search(r'<def>(.+?)</def>', entry).group(1)
            return [
                {'definition': NSzTParser.clean_definition(raw_definition)}]
        senses = []
        for raw_sense in re.findall(
                r'<mainsens>.*?<def>(.*?)</def>.*?</mainsens>', entry):
            if raw_sense == '<same/>':
                continue
            sense = {'definition': NSzTParser.clean_definition(raw_sense)}
            if '<tr>' in raw_sense:
                sense['latin'] = NSzTParser.get_latin(raw_sense)
            senses.append(sense)
        return senses

    @staticmethod
    def get_xr(entry):
        """Return the first ``<xr>`` content without ``<hom>`` numbering.

        Raises:
            AttributeError: if *entry* has no non-empty ``<xr>`` element.
        """
        redirect = re.search(r'<xr>(.+?)</xr>', entry).group(1)
        return re.sub(r'<hom>[1-9]</hom>', '', redirect)

    @staticmethod
    def get_latin(sense):
        """Return the first ``<tr>`` content of *sense* without ``<sub>`` tags.

        Raises:
            AttributeError: if *sense* has no non-empty ``<tr>`` element.
        """
        latin = re.search(r'<tr>(.+?)</tr>', sense).group(1)
        return re.sub(r'</?sub>', '', latin)

    @staticmethod
    def clean_definition(definition):
        """Strip markup from *definition* and expand its abbreviations.

        Known tags are replaced by a space, the ordered replacement table is
        applied, and surrounding whitespace is removed.

        Args:
            definition: raw content of a ``<def>`` element.

        Returns:
            The cleaned definition text.
        """
        for tag in NSzTParser._DEFINITION_TAGS:
            definition = re.sub('</?' + tag + '>', ' ', definition)
        # A leading space lets the ' abbr.' patterns match at the very start.
        definition = ' ' + definition
        for pattern, replacement in NSzTParser._REPLACEMENTS:
            definition = re.sub(pattern, replacement, definition)
        return definition.strip()

    @staticmethod
    def sub(string, pattern, repl):
        """Unimplemented placeholder; accepts its arguments and returns None."""
        pass
if __name__ == "__main__":
    # Script entry point: every command-line argument is treated as an input
    # file, parsed lazily and printed in the order given.
    cli_paths = sys.argv[1:]
    for input_file in cli_paths:
        parsed_entries = NSzTParser.parse_file(input_file)
        NSzTParser.print_definitions(parsed_entries)