Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

PatternChunker, supported by analyze_chunker_coverage

  • Loading branch information...
commit d210eaddeaa8f576477504d8f33dcbd69c27be96 1 parent 90ddd46
@japerk authored
View
14 analyze_chunker_coverage.py
@@ -4,7 +4,9 @@
from nltk.corpus.util import LazyCorpusLoader
from nltk.probability import FreqDist
from nltk.tag.simplify import simplify_wsj_tag
-from nltk_trainer import load_corpus_reader
+from nltk_trainer import load_corpus_reader, load_model
+from nltk_trainer.chunking import chunkers
+from nltk_trainer.tagging import taggers
########################################
## command options & argument parsing ##
@@ -55,12 +57,18 @@
if args.trace:
print 'loading tagger %s' % args.tagger
-tagger = nltk.data.load(args.tagger)
+if args.tagger == 'pattern':
+ tagger = taggers.PatternTagger()
+else:
+ tagger = load_model(args.tagger)
if args.trace:
print 'loading chunker %s' % args.chunker
-chunker = nltk.data.load(args.chunker)
+if args.chunker == 'pattern':
+ chunker = chunkers.PatternChunker()
+else:
+ chunker = load_model(args.chunker)
#######################
## coverage analysis ##
View
15 nltk_trainer/chunking/chunkers.py
@@ -115,3 +115,18 @@ def parse(self, tagged_sent):
if not tagged_sent: return None
chunks = self.tagger.tag(tagged_sent)
return conlltags2tree([(w,t,c) for ((w,t),c) in chunks])
+
+#############
+## pattern ##
+#############
+
+class PatternChunker(ChunkParserI):
+ def parse(self, tagged_sent):
+ # import lazily instead of at module top, so that importing this module does not fail when pattern is not installed
+ from pattern.en import parse
+ s = ' '.join([word for word, tag in tagged_sent])
+ # tokenize=False: not tokenizing ensures that the number of tagged tokens
+ # returned is the same as the number of input tokens
+ sents = parse(s, tokenize=False).split()
+ if not sents: return None
+ return conlltags2tree([(w, t, c) for w, t, c, p in sents[0]])  # only the first sentence is used; the 4th field (p) is dropped
View
8 nltk_trainer/tagging/taggers.py
@@ -49,7 +49,7 @@ def choose_tag(self, tokens, index, history):
class PatternTagger(TaggerI):
def tag(self, tokens):
# don't import at top since don't want to fail if not installed
- from pattern.en import parse
- # we don't want chunk tags, and not tokenizing ensures that the number
- # of tagged tokens returned is the same as the number of input tokens
- return [str2tuple(s) for s in parse(u' '.join(tokens), chunks=False, tokenize=False).split(u' ')]
+ from pattern.en import tag
+ # not tokenizing ensures that the number of tagged tokens returned is
+ # the same as the number of input tokens
+ return tag(u' '.join(tokens), tokenize=False)
Please sign in to comment.
Something went wrong with that request. Please try again.