Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

tagset mapping support for treebank_chunk, ChunkedCorpusReader, tagstr2tree
  • Loading branch information...
commit 4bade2daa08d3775f636db8981a9409cc41e3b9a 1 parent 2adaf6f
@japerk authored
View
1  .gitignore
@@ -20,6 +20,7 @@ nltk/test/*.html
# editor temporary files
*.*.sw[op]
.idea
+*~
# git mergetools backups
*.orig
View
9 nltk/chunk/util.py
@@ -10,6 +10,7 @@
import re
from nltk.tree import Tree
+from nltk.tag.mapping import map_tag
from nltk.tag.util import str2tuple
from nltk.compat import python_2_unicode_compatible
@@ -307,7 +308,8 @@ def _chunksets(t, count, chunk_label):
return set(chunks)
-def tagstr2tree(s, chunk_label="NP", root_label="S", sep='/'):
+def tagstr2tree(s, chunk_label="NP", root_label="S", sep='/',
+ source_tagset=None, target_tagset=None):
"""
Divide a string of bracketted tagged text into
chunks and unchunked tokens, and produce a Tree.
@@ -344,7 +346,10 @@ def tagstr2tree(s, chunk_label="NP", root_label="S", sep='/'):
if sep is None:
stack[-1].append(text)
else:
- stack[-1].append(str2tuple(text, sep))
+ word, tag = str2tuple(text, sep)
+ if source_tagset and target_tagset:
+ tag = map_tag(source_tagset, target_tagset, tag)
+ stack[-1].append((word, tag))
if len(stack) != 1:
raise ValueError('Expected ] at char %d' % len(s))
View
2  nltk/corpus/__init__.py
@@ -197,7 +197,7 @@
treebank_chunk = LazyCorpusLoader(
'treebank/tagged', ChunkedCorpusReader, r'wsj_.*\.pos',
sent_tokenizer=RegexpTokenizer(r'(?<=/\.)\s*(?![^\[]*\])', gaps=True),
- para_block_reader=tagged_treebank_para_block_reader, encoding='ascii')
+ para_block_reader=tagged_treebank_para_block_reader, tagset='wsj', encoding='ascii')
treebank_raw = LazyCorpusLoader(
'treebank/raw', PlaintextCorpusReader, r'wsj_.*', encoding='ISO-8859-2')
udhr = LazyCorpusLoader(
View
36 nltk/corpus/reader/chunked.py
@@ -37,14 +37,13 @@ def __init__(self, root, fileids, extension='',
str2chunktree=tagstr2tree,
sent_tokenizer=RegexpTokenizer('\n', gaps=True),
para_block_reader=read_blankline_block,
- encoding='utf8'):
+ encoding='utf8', tagset=None):
"""
:param root: The root directory for this corpus.
:param fileids: A list or regexp specifying the fileids in this corpus.
"""
CorpusReader.__init__(self, root, fileids, encoding)
-
- self._cv_args = (str2chunktree, sent_tokenizer, para_block_reader)
+ self._cv_args = (str2chunktree, sent_tokenizer, para_block_reader, tagset)
"""Arguments for corpus views generated by this corpus: a tuple
(str2chunktree, sent_tokenizer, para_block_tokenizer)"""
@@ -86,37 +85,37 @@ def paras(self, fileids=None):
return concat([ChunkedCorpusView(f, enc, 0, 1, 1, 0, *self._cv_args)
for (f, enc) in self.abspaths(fileids, True)])
- def tagged_words(self, fileids=None):
+ def tagged_words(self, fileids=None, tagset=None):
"""
:return: the given file(s) as a list of tagged
words and punctuation symbols, encoded as tuples
``(word,tag)``.
:rtype: list(tuple(str,str))
"""
- return concat([ChunkedCorpusView(f, enc, 1, 0, 0, 0, *self._cv_args)
+ return concat([ChunkedCorpusView(f, enc, 1, 0, 0, 0, *self._cv_args, target_tagset=tagset)
for (f, enc) in self.abspaths(fileids, True)])
- def tagged_sents(self, fileids=None):
+ def tagged_sents(self, fileids=None, tagset=None):
"""
:return: the given file(s) as a list of
sentences, each encoded as a list of ``(word,tag)`` tuples.
:rtype: list(list(tuple(str,str)))
"""
- return concat([ChunkedCorpusView(f, enc, 1, 1, 0, 0, *self._cv_args)
+ return concat([ChunkedCorpusView(f, enc, 1, 1, 0, 0, *self._cv_args, target_tagset=tagset)
for (f, enc) in self.abspaths(fileids, True)])
- def tagged_paras(self, fileids=None):
+ def tagged_paras(self, fileids=None, tagset=None):
"""
:return: the given file(s) as a list of
paragraphs, each encoded as a list of sentences, which are
in turn encoded as lists of ``(word,tag)`` tuples.
:rtype: list(list(list(tuple(str,str))))
"""
- return concat([ChunkedCorpusView(f, enc, 1, 1, 1, 0, *self._cv_args)
+ return concat([ChunkedCorpusView(f, enc, 1, 1, 1, 0, *self._cv_args, target_tagset=tagset)
for (f, enc) in self.abspaths(fileids, True)])
- def chunked_words(self, fileids=None):
+ def chunked_words(self, fileids=None, tagset=None):
"""
:return: the given file(s) as a list of tagged
words and chunks. Words are encoded as ``(word, tag)``
@@ -125,10 +124,10 @@ def chunked_words(self, fileids=None):
trees over ``(word,tag)`` tuples or word strings.
:rtype: list(tuple(str,str) and Tree)
"""
- return concat([ChunkedCorpusView(f, enc, 1, 0, 0, 1, *self._cv_args)
+ return concat([ChunkedCorpusView(f, enc, 1, 0, 0, 1, *self._cv_args, target_tagset=tagset)
for (f, enc) in self.abspaths(fileids, True)])
- def chunked_sents(self, fileids=None):
+ def chunked_sents(self, fileids=None, tagset=None):
"""
:return: the given file(s) as a list of
sentences, each encoded as a shallow Tree. The leaves
@@ -137,10 +136,10 @@ def chunked_sents(self, fileids=None):
tags).
:rtype: list(Tree)
"""
- return concat([ChunkedCorpusView(f, enc, 1, 1, 0, 1, *self._cv_args)
+ return concat([ChunkedCorpusView(f, enc, 1, 1, 0, 1, *self._cv_args, target_tagset=tagset)
for (f, enc) in self.abspaths(fileids, True)])
- def chunked_paras(self, fileids=None):
+ def chunked_paras(self, fileids=None, tagset=None):
"""
:return: the given file(s) as a list of
paragraphs, each encoded as a list of sentences, which are
@@ -149,7 +148,7 @@ def chunked_paras(self, fileids=None):
has tags) or word strings (if the corpus has no tags).
:rtype: list(list(Tree))
"""
- return concat([ChunkedCorpusView(f, enc, 1, 1, 1, 1, *self._cv_args)
+ return concat([ChunkedCorpusView(f, enc, 1, 1, 1, 1, *self._cv_args, target_tagset=tagset)
for (f, enc) in self.abspaths(fileids, True)])
def _read_block(self, stream):
@@ -158,7 +157,7 @@ def _read_block(self, stream):
class ChunkedCorpusView(StreamBackedCorpusView):
def __init__(self, fileid, encoding, tagged, group_by_sent,
group_by_para, chunked, str2chunktree, sent_tokenizer,
- para_block_reader):
+ para_block_reader, source_tagset=None, target_tagset=None):
StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)
self._tagged = tagged
self._group_by_sent = group_by_sent
@@ -167,13 +166,16 @@ def __init__(self, fileid, encoding, tagged, group_by_sent,
self._str2chunktree = str2chunktree
self._sent_tokenizer = sent_tokenizer
self._para_block_reader = para_block_reader
+ self._source_tagset = source_tagset
+ self._target_tagset = target_tagset
def read_block(self, stream):
block = []
for para_str in self._para_block_reader(stream):
para = []
for sent_str in self._sent_tokenizer.tokenize(para_str):
- sent = self._str2chunktree(sent_str)
+ sent = self._str2chunktree(sent_str, source_tagset=self._source_tagset,
+ target_tagset=self._target_tagset)
# If requested, throw away the tags.
if not self._tagged:
Please sign in to comment.
Something went wrong with that request. Please try again.