-
Notifications
You must be signed in to change notification settings - Fork 6
/
__init__.py
127 lines (101 loc) · 3.73 KB
/
__init__.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
from collections import OrderedDict
import importlib
import os.path
import yaml
from .models import Classifier, ExampleSentence
from .version import __version__
class VocabWord:
    """A single vocabulary entry: written forms, pinyin, definitions and extras."""

    def __init__(self, trad, simp, pinyin, defs, tw_pinyin=None, clfrs=None, example_sentences=None):
        """
        :param str trad: traditional form
        :param str simp: simplified form
        :param str pinyin: pinyin, e.g. 'nǐ hǎo' (not 'ni3 hao3')
        :param list[str] defs: list of definitions
        :param str|None tw_pinyin: Taiwanese pinyin, or None
        :param list[Classifier]|None clfrs: list of classifiers
        :param list[ExampleSentence]|None example_sentences: list of example sentences
        """
        self.trad = trad
        self.simp = simp
        self.pinyin = pinyin
        self.defs = defs
        self.tw_pinyin = tw_pinyin
        # A missing (or empty) list becomes a fresh list owned by this instance.
        self.clfrs = clfrs or []
        self.example_sentences = example_sentences or []

    def __repr__(self):
        return (
            '{}(trad={!r}, simp={!r}, pinyin={!r}, defs={!r}, tw_pinyin={!r}, '
            'clfrs={!r}, example_sentences={!r})'
        ).format(
            self.__class__.__name__,
            self.trad,
            self.simp,
            self.pinyin,
            self.defs,
            self.tw_pinyin,
            self.clfrs,
            self.example_sentences,
        )

    def to_dict(self):
        """
        Serialize this word to an ordered mapping.

        Fields with a falsy value (None, empty string, empty list) are left
        out, classifiers and example sentences are converted with their own
        ``to_dict``, and 'simp' is dropped when it equals 'trad'.

        :return: OrderedDict with keys in the order trad, simp, pinyin, defs,
            tw_pinyin, clfrs, example_sentences
        """
        rv = OrderedDict()
        for field in ('trad', 'simp', 'pinyin', 'defs', 'tw_pinyin', 'clfrs', 'example_sentences'):
            val = getattr(self, field)
            if not val:
                continue
            if field in ('clfrs', 'example_sentences'):
                val = [v.to_dict() for v in val]
            rv[field] = val
        # 'simp' (or 'trad') may have been skipped above because it was empty,
        # so check for presence instead of indexing blindly.
        if 'simp' in rv and rv['simp'] == rv.get('trad'):
            del rv['simp']
        return rv

    @classmethod
    def from_dict(cls, d):
        """
        Build a word from a mapping such as the one produced by ``to_dict``.

        'trad', 'pinyin' and 'defs' are required; 'simp' defaults to 'trad'.
        The mapping passed in is not modified.

        :param dict d: serialized word
        :raises KeyError: if a required key is missing
        """
        # Convert nested items into local lists rather than writing them back
        # into ``d``; the caller's data stays reusable.
        clfrs = d.get('clfrs')
        if clfrs is not None:
            clfrs = [Classifier.from_dict(item) for item in clfrs]
        example_sentences = d.get('example_sentences')
        if example_sentences is not None:
            example_sentences = [ExampleSentence.from_dict(item) for item in example_sentences]
        return cls(
            trad=d['trad'],
            simp=d.get('simp', d['trad']),
            pinyin=d['pinyin'],
            defs=d['defs'],
            tw_pinyin=d.get('tw_pinyin'),
            clfrs=clfrs,
            example_sentences=example_sentences,
        )

    def __eq__(self, other):
        """Two words are equal when their ``to_dict`` serializations are equal."""
        other_to_dict = getattr(other, 'to_dict', None)
        if not callable(other_to_dict):
            # Let Python fall back to its default comparison instead of
            # raising AttributeError for unrelated objects.
            return NotImplemented
        return self.to_dict() == other_to_dict()

    # Defining __eq__ already makes instances unhashable; spell it out.
    __hash__ = None
# taken from https://stackoverflow.com/questions/16782112/can-pyyaml-dump-dict-items-in-non-alphabetical-order
def represent_ordereddict(dumper, data):
    """
    Build a YAML mapping node from ``data``, keeping its iteration order.

    :param dumper: object whose ``represent_data`` turns a key or value into a node
    :param data: mapping to represent; pairs are read from ``data.items()``
    :return: a ``yaml.nodes.MappingNode`` tagged as a plain YAML map
    """
    # Key is represented before value for every pair, in iteration order.
    pairs = [
        (dumper.represent_data(key), dumper.represent_data(val))
        for key, val in data.items()
    ]
    return yaml.nodes.MappingNode(u'tag:yaml.org,2002:map', pairs)
# Register the function above with PyYAML so that OrderedDict values are
# emitted as ordinary mappings, in their iteration order.
yaml.add_representer(OrderedDict, represent_ordereddict)
class VocabList:
    """A list of vocabulary words with lookup tables by simplified and traditional form."""

    @classmethod
    def load(cls):
        """
        Return the ``vocab_list`` object defined in the ``vocab_list_data``
        submodule of this package.

        The submodule is imported when this method is called, not when this
        module is imported.
        """
        # import_module returns the submodule itself, so this also works when
        # the package is nested inside another package.
        vocab_list_data = importlib.import_module(f'{__name__}.vocab_list_data')
        return vocab_list_data.vocab_list

    @classmethod
    def load_from_yaml_str(cls, yaml_str):
        """
        Build a list from a YAML document containing a sequence of word mappings.

        :param str yaml_str: YAML text; an empty document yields an empty list
        """
        # NOTE(review): yaml.full_load is not meant for untrusted input;
        # consider yaml.safe_load if this text can come from outside.
        # An empty document loads as None, hence the ``or []``.
        records = yaml.full_load(yaml_str) or []
        return cls([VocabWord.from_dict(d) for d in records])

    @classmethod
    def load_from_yaml_file(cls, yaml_file_path):
        """
        Build a list from a UTF-8 encoded YAML file.

        :param yaml_file_path: path of the file to read
        """
        with open(yaml_file_path, encoding='utf-8') as h:
            # NOTE(review): same caveat as load_from_yaml_str regarding
            # yaml.full_load and untrusted input.
            records = yaml.full_load(h) or []
        return cls([VocabWord.from_dict(d) for d in records])

    def __init__(self, words):
        """
        :param list[VocabWord] words: the words, in order
        """
        self.words = words
        # When several words share a form, the last one in ``words`` wins.
        self.simp_to_word = {word.simp: word for word in self.words}
        self.trad_to_word = {word.trad: word for word in self.words}

    def dump_to_yaml_file(self, yaml_file_path):
        """
        Write every word's ``to_dict()`` form to a YAML file.

        :param yaml_file_path: path of the file to create or overwrite
        """
        data = [word.to_dict() for word in self.words]
        # allow_unicode=True writes non-ASCII characters verbatim, so the file
        # must be opened as UTF-8 explicitly instead of relying on the
        # platform default; this also matches load_from_yaml_file.
        with open(yaml_file_path, 'w', encoding='utf-8') as h:
            yaml.dump(data, h, allow_unicode=True, default_flow_style=False)

    def __repr__(self):
        return 'VocabList(words={!r})'.format(self.words)