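"""Extract person-location and person-GPE relationships from raw text using
NLTK's named-entity chunker.

Note: this script targets Python 2 and the NLTK 2.x API (it relies on
``Tree.node``, ``batch_tag``/``batch_parse``, and ``dict.iteritems``).
"""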
import argparse
import logging
import operator
import re
import sys
from collections import defaultdict

import nltk
from nltk.sem import relextract

LOG = logging.getLogger(__name__)

NE_TYPES = (
    "ORGANIZATION",  # Georgia-Pacific Corp., WHO
    "PERSON",        # Eddy Bonte, President Obama
    "LOCATION",      # Murray River, Mount Everest
    "DATE",          # June, 2008-06-29
    "TIME",          # two fifty a m, 1:30 p.m.
    "MONEY",         # 175 million Canadian Dollars, GBP 10.40
    "PERCENT",       # twenty pct, 18.75 %
    "FACILITY",      # Washington Monument, Stonehenge
    "GPE",           # South East Asia, Midlothian
)


def is_single_item(item):
    """Return True if item is not iterable. In Python 2 a bare string such
    as 'PERSON' has no __iter__, so it counts as a single item."""
    return not hasattr(item, "__iter__")


def iter_nodes(tree):
    """Yield the NE label of each chunked subtree in a sentence tree; plain
    (word, tag) leaf tuples have no .node attribute and are skipped."""
    for elem in tree:
        try:
            yield elem.node
        except AttributeError:
            pass
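
# Example (illustrative): for a chunked sentence tree containing one PERSON
# and one GPE subtree, list(iter_nodes(sent)) yields ['PERSON', 'GPE'].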


class NECorpus:
    """A text split into sentences and chunked into named-entity trees."""

    def __init__(self, text):
        self._sents = self.tokenize_sentences(text)
        self._sents = self.tokenize_words(self._sents)
        self._sents = self.tag_nes(self._sents)
        self._postprocess()

    @staticmethod
    def tokenize_sentences(text):
        """Split raw text into sentences with the Punkt tokenizer."""
        tokenizer_url = 'nltk:tokenizers/punkt/english.pickle'
        sentence_tokenizer = nltk.data.load(tokenizer_url)
        sents = sentence_tokenizer.tokenize(text)
        return sents

    @staticmethod
    def tokenize_words(sents):
        """Tokenize each sentence into words."""
        word_tokenizer = nltk.tokenize.treebank.TreebankWordTokenizer()
        tokenized_sents = [word_tokenizer.tokenize(sent) for sent in sents]
        return tokenized_sents

    @staticmethod
    def tag_nes(tokenized_sents):
        """POS-tag the tokenized sentences, then chunk them into NE trees."""
        tagger_url = 'nltk:taggers/maxent_treebank_pos_tagger/english.pickle'
        tagger = nltk.data.load(tagger_url)
        tagged = tagger.batch_tag(tokenized_sents)
        ne_chunker_url = 'nltk:chunkers/maxent_ne_chunker/english_ace_multiclass.pickle'
        ne_chunker = nltk.data.load(ne_chunker_url)
        nes = ne_chunker.batch_parse(tagged)
        return nes
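
    # Each chunked sentence is an nltk.Tree whose children are either plain
    # (word, tag) tuples or NE subtrees, e.g. (illustrative):
    #   Tree('S', [..., ('by', 'IN'), Tree('PERSON', [('James', 'NNP'), ('Joyce', 'NNP')]), ...])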

    def _postprocess(self):
        """Perform postprocessing to improve accuracy and recall."""
        # Normalize NE label choices throughout the text: e.g. if "Billy
        # Flannigan" is usually tagged as a PERSON, make him always a PERSON.
        def symbolize(tree):
            return '_'.join(word.lower() for word, tag in tree.leaves())

        nes = self.nes()
        counts = defaultdict(lambda: defaultdict(int))
        for sentence_no, ne in nes:
            sym = symbolize(ne)
            choice = ne.node
            counts[sym][choice] += 1
        normalized = dict()
        for sym, choices in counts.iteritems():
            if len(choices) < 2:
                continue
            choice, _ = max(choices.iteritems(), key=operator.itemgetter(1))
            normalized[sym] = choice
            LOG.debug("Normalizing NE '%s' from choices %s => %s",
                      sym, choices.items(), choice)
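        # Example (illustrative): if counts['billy_flannigan'] ended up as
        # {'PERSON': 3, 'GPE': 1}, normalized maps it to 'PERSON', and every
        # "Billy Flannigan" subtree below is relabeled accordingly.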
        for sentence_no, ne in nes:
            sym = symbolize(ne)
            ne.node = normalized.get(sym, ne.node)

    @staticmethod
    def rejoin_sent(tree):
        return ' '.join(word for word, tag in tree.leaves())

    def sents(self):
        """Get the sentences as split by this corpus. Note that the original
        text is not saved, so the rejoined tokenized version may differ
        slightly from it.

        >>> text = 'The Project Gutenberg EBook of Ulysses, by James Joyce. Use this with care.'
        >>> corpus = NECorpus(text)
        >>> corpus.sents()
        ['The Project Gutenberg EBook of Ulysses , by James Joyce .', 'Use this with care .']
        """
        return [self.rejoin_sent(sent) for sent in self._sents]

    def parsed_sents(self):
        """Get sentences as parsed, tokenized, and tagged by this corpus."""
        return self._sents

    def nes(self, nes=NE_TYPES):
        """Get all NEs of the specified types in the text, or every NE if not
        specified, as (sentence_index, ne_subtree) pairs. For the doctest
        text above, corpus.nes('PERSON') returns a pair like
        (0, Tree('PERSON', [('James', 'NNP'), ('Joyce', 'NNP')])).
        """
        if is_single_item(nes):
            nes = [nes]
        nes = set(nes)
        result = []
        for index, sent in enumerate(self._sents):
            for elem in sent:
                try:
                    if elem.node in nes:
                        result.append((index, elem))
                except AttributeError:
                    pass
        return result

    def ne_sents(self, nes=NE_TYPES, match_all=False):
        """Get sentences containing any of the specified NEs, or any NE if
        not specified.

        >>> text = 'The Project Gutenberg EBook of Ulysses, by James Joyce. Use this with care.'
        >>> corpus = NECorpus(text)
        >>> corpus.ne_sents('PERSON')
        ['The Project Gutenberg EBook of Ulysses , by James Joyce .']
        """
        return [self.rejoin_sent(sent) for sent in self.ne_parsed_sents(nes, match_all)]

    def ne_parsed_sents(self, nes=NE_TYPES, match_all=False):
        """Get parsed sentences containing any of the specified NEs, or any
        NE if not specified. With match_all, require every specified NE type
        to appear in the sentence."""
        if is_single_item(nes):
            nes = [nes]
        nes = set(nes)
        filterfn = self._contains_all_nodes if match_all else self._contains_any_node
        return [sent for sent in self._sents if filterfn(sent, nes)]

    @staticmethod
    def _contains_all_nodes(tree, nodes):
        # Every requested NE type must appear in the sentence.
        nes = set(iter_nodes(tree))
        return nodes.issubset(nes)

    @staticmethod
    def _contains_any_node(tree, nodes):
        # At least one requested NE type must appear in the sentence.
        nes = set(iter_nodes(tree))
        return bool(nodes.intersection(nes))

    def extract_rels(self, subj, obj):
        """Extract relationships between the given subj and obj named-entity
        types."""
        return self._naive_extract(subj, obj)

    def _naive_extract(self, subj, obj):
        """Get sentences containing both subj and obj named entities, as
        (sentence_index, matching_ne_subtrees) pairs."""
        # Like self.ne_parsed_sents([subj, obj], match_all=True), but also
        # collects the matching NE subtrees for each sentence.
        cond = set((subj, obj))
        result = []
        for index, sent in enumerate(self._sents):
            nes = [elem for elem in sent if hasattr(elem, 'node')]
            nodes = set(elem.node for elem in nes)
            if cond.issubset(nodes):
                matching_nes = [elem for elem in nes if elem.node in cond]
                result.append((index, matching_nes))
        return result

    def _nltk_extract(self, subj, obj):
        """Use NLTK's built-in relationship extractor to get subj and obj
        named-entity relationships and context."""
        match_anything = re.compile(".*")  # accept any filler between the NEs
        result = []
        for sent in self._sents:
            extraction = relextract.extract_rels(
                subj,
                obj,
                sent,
                pattern=match_anything,
            )
            if extraction:
                result.append(extraction)
        return result


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Extract person-location and person-GPE relationships.')
    parser.add_argument('infile', nargs='?', type=argparse.FileType('r'),
                        help='filename to process, or - for stdin')
    parser.add_argument('--tests', dest='run_tests', action='store_true',
                        help='run tests')
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG)

    did_something = False
    if args.run_tests:
        did_something = True
        import doctest
        doctest.testmod()

    text = args.infile.read() if args.infile else ''
    if not text:
        if not did_something:
            parser.print_usage()
        sys.exit()

    corpus = NECorpus(text)
    sents = corpus.sents()
    # Report each sentence that contains one of the requested relationships.
    for subj, obj in (('PERSON', 'LOCATION'), ('PERSON', 'GPE')):
        for index, matched_nes in corpus.extract_rels(subj, obj):
            names = ', '.join(' '.join(w for w, t in ne.leaves())
                              for ne in matched_nes)
            print '%s-%s (%s): %s' % (subj, obj, names, sents[index])