Skip to content
Browse files

PatternTagger, supported by analyze_tagger_coverage

  • Loading branch information...
1 parent 804fce9 commit 90ddd4669efd3200bea112a43eb7330ae0fbbd8e @japerk committed Oct 26, 2012
Showing with 27 additions and 13 deletions.
  1. +6 −10 analyze_tagger_coverage.py
  2. +11 −1 nltk_trainer/__init__.py
  3. +10 −2 nltk_trainer/tagging/taggers.py
View
16 analyze_tagger_coverage.py
@@ -4,7 +4,8 @@
from nltk.corpus.util import LazyCorpusLoader
from nltk.probability import FreqDist
from nltk.tag.simplify import simplify_wsj_tag
-from nltk_trainer import load_corpus_reader
+from nltk_trainer import load_corpus_reader, load_model
+from nltk_trainer.tagging import taggers
########################################
## command options & argument parsing ##
@@ -62,15 +63,10 @@
if args.trace:
print 'loading tagger %s' % args.tagger
-try:
- tagger = nltk.data.load(args.tagger)
-except LookupError:
- try:
- import cPickle as pickle
- except ImportError:
- import pickle
-
- tagger = pickle.load(open(os.path.expanduser(args.tagger)))
+if args.tagger == 'pattern':
+ tagger = taggers.PatternTagger()
+else:
+ tagger = load_model(args.tagger)
#######################
## coverage analysis ##
View
12 nltk_trainer/__init__.py
@@ -1,11 +1,15 @@
import os, os.path, re, time
-import cPickle as pickle
import nltk.data
from nltk.corpus.util import LazyCorpusLoader
from nltk.misc import babelfish
from nltk.tag.simplify import simplify_wsj_tag
from nltk_trainer.tagging.readers import NumberedTaggedSentCorpusReader
+try:
+ import cPickle as pickle
+except ImportError:
+ import pickle
+
def dump_object(obj, fname, trace=1):
dirname = os.path.dirname(fname)
@@ -22,6 +26,12 @@ def dump_object(obj, fname, trace=1):
pickle.dump(obj, f)
f.close()
def load_model(path):
    """Load a previously trained model (e.g. a tagger) from ``path``.

    ``path`` is first passed to ``nltk.data.load``. If that raises
    ``LookupError``, ``path`` is treated as a filesystem path (with ``~``
    expanded) pointing at a pickle file, which is unpickled and returned.

    NOTE: unpickling can execute arbitrary code, so only load files that
    come from a trusted source.
    """
    try:
        return nltk.data.load(path)
    except LookupError:
        # Pickles are binary data, so open in 'rb'; the with-block makes sure
        # the file handle is closed even if unpickling fails.
        with open(os.path.expanduser(path), 'rb') as f:
            return pickle.load(f)
+
def import_attr(path):
basepath, name = path.rsplit('.', 1)
mod = __import__(basepath, globals(), locals(), [name])
View
12 nltk_trainer/tagging/taggers.py
@@ -1,6 +1,6 @@
from nltk.tag.sequential import SequentialBackoffTagger
from nltk.probability import FreqDist
-from nltk.tag import ClassifierBasedPOSTagger
+from nltk.tag import ClassifierBasedPOSTagger, TaggerI, str2tuple
from nltk_trainer.featx import phonetics
from nltk_trainer.featx.metaphone import dm
@@ -44,4 +44,12 @@ def choose_tag(self, tokens, index, history):
for tagger in self._taggers:
tags.inc(tagger.choose_tag(tokens, index, history))
- return tags.max()
+ return tags.max()
+
class PatternTagger(TaggerI):
    """Tagger that delegates the tagging work to ``pattern.en.parse``."""

    def tag(self, tokens):
        """Tag ``tokens`` and return one ``str2tuple`` result per token.

        The tokens are joined with single spaces, handed to
        ``pattern.en.parse``, and every space-separated item of the parse
        output is converted with ``str2tuple``.
        """
        if not tokens:
            # Splitting any string on u' ' yields at least one item, so an
            # empty input would otherwise come back with a bogus entry.
            return []
        # don't import at top since don't want to fail if not installed
        from pattern.en import parse
        # we don't want chunk tags, and not tokenizing ensures that the number
        # of tagged tokens returned is the same as the number of input tokens
        tagged = parse(u' '.join(tokens), chunks=False, tokenize=False)
        return [str2tuple(s) for s in tagged.split(u' ')]

0 comments on commit 90ddd46

Please sign in to comment.
Something went wrong with that request. Please try again.