import os, os.path, re, time
import nltk.data
from nltk.corpus.util import LazyCorpusLoader
from nltk.misc import babelfish
from nltk.tag.simplify import simplify_wsj_tag
from nltk_trainer.tagging.readers import NumberedTaggedSentCorpusReader

try:
    import cPickle as pickle
except ImportError:
    import pickle

def dump_object(obj, fname, trace=1):
    '''Pickle obj to fname, creating the parent directory if it does not exist.'''
    dirname = os.path.dirname(fname)

    if dirname and not os.path.exists(dirname):
        if trace:
            print 'creating directory %s' % dirname

        os.makedirs(dirname)

    if trace:
        print 'dumping %s to %s' % (obj.__class__.__name__, fname)

    f = open(fname, 'wb')
    pickle.dump(obj, f)
    f.close()

def load_model(path):
    '''Load a pickled object, first via nltk.data, falling back to a plain file path.'''
    try:
        return nltk.data.load(path)
    except LookupError:
        return pickle.load(open(os.path.expanduser(path)))
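
# Round-trip sketch (illustrative only; 'tagger' and the path below are hypothetical):
#   dump_object(tagger, 'taggers/example.pickle')
#   tagger = load_model('taggers/example.pickle')
# dump_object creates the 'taggers' directory if needed, and load_model falls back to
# a plain pickle.load when nltk.data.load cannot resolve the path.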

def import_attr(path):
    '''Import and return the attribute named by a dotted path, e.g. "package.module.name".'''
    basepath, name = path.rsplit('.', 1)
    mod = __import__(basepath, globals(), locals(), [name])
    return getattr(mod, name)
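
# Example (assumes NLTK is installed; the dotted path is just an illustration):
#   tokenizer_cls = import_attr('nltk.tokenize.PunktSentenceTokenizer')
# returns the class object from nltk.tokenize, which callers can then instantiate.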

def load_corpus_reader(corpus, reader=None, fileids=None, sent_tokenizer=None, word_tokenizer=None, **kwargs):
    '''Return a built-in NLTK corpus by name, or construct a custom reader for a corpus path.'''
    if corpus == 'timit':
        return LazyCorpusLoader('timit', NumberedTaggedSentCorpusReader,
            r'.+\.tags', tag_mapping_function=simplify_wsj_tag)

    real_corpus = getattr(nltk.corpus, corpus, None)

    if not real_corpus:
        if not reader:
            raise ValueError('you must specify a corpus reader')

        if not fileids:
            fileids = '.*'

        root = os.path.expanduser(corpus)

        if not os.path.isdir(root):
            if not corpus.startswith('corpora/'):
                path = 'corpora/%s' % corpus
            else:
                path = corpus

            try:
                root = nltk.data.find(path)
            except LookupError:
                raise ValueError('cannot find corpus path for %s' % corpus)

        if sent_tokenizer and isinstance(sent_tokenizer, basestring):
            kwargs['sent_tokenizer'] = nltk.data.load(sent_tokenizer)

        if word_tokenizer and isinstance(word_tokenizer, basestring):
            kwargs['word_tokenizer'] = import_attr(word_tokenizer)()

        reader_cls = import_attr(reader)
        real_corpus = reader_cls(root, fileids, **kwargs)

    return real_corpus
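
# Usage sketches (assumed invocations, not exercised by this module's doctests):
#   load_corpus_reader('treebank')  # a built-in NLTK corpus, returned directly
#   load_corpus_reader('~/my_corpus', reader='nltk.corpus.reader.PlaintextCorpusReader',
#       fileids=r'.*\.txt')  # a custom directory read with an explicit reader class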

# the main punctuation this doesn't handle is '"- but that's probably fine
spacepunct_re = re.compile(r'\s([%s])' % re.escape('!.,;:%?)}]'))
punctspace_re = re.compile(r'([%s])\s' % re.escape('{([#$'))

def join_words(words):
    '''
    >>> join_words(['Hello', ',', 'my', 'name', 'is', '.'])
    'Hello, my name is.'
    >>> join_words(['A', 'test', '(', 'for', 'parens', ')', '!'])
    'A test (for parens)!'
    '''
    return punctspace_re.sub(r'\1', spacepunct_re.sub(r'\1', ' '.join(words)))

def translate(text, source, target, trace=1, sleep=1, retries=1):
    '''Translate text with babelfish, retrying after IO errors while retries remain.'''
    try:
        return babelfish.translate(text, source, target)
    except babelfish.BabelizerIOError as exc:
        if retries:
            if trace:
                print 'IO error in translation, trying again after %ss' % sleep

            time.sleep(sleep)
            # pass trace and sleep by keyword so sleep is not mistaken for trace
            return translate(text, source, target, trace=trace, sleep=sleep, retries=retries-1)
        else:
            raise exc
    except babelfish.BabelfishChangedError as exc:
        if trace:
            print 'error getting translation for:', text, '::', exc

        return ''
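
# Example (assumes the babelfish service is reachable; language names follow nltk.misc.babelfish):
#   translate('hello world', 'english', 'spanish')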

if __name__ == '__main__':
    import doctest
    doctest.testmod()