merge

commit 89c11599048f3cda9c49ad263d597726c13f9ae9 (2 parents: 804fce9 + c051227)
authored by @japerk
analyze_chunker_coverage.py (14 changed lines)
@@ -4,7 +4,9 @@
from nltk.corpus.util import LazyCorpusLoader
from nltk.probability import FreqDist
from nltk.tag.simplify import simplify_wsj_tag
-from nltk_trainer import load_corpus_reader
+from nltk_trainer import load_corpus_reader, load_model
+from nltk_trainer.chunking import chunkers
+from nltk_trainer.tagging import taggers
########################################
## command options & argument parsing ##
@@ -55,12 +57,18 @@
if args.trace:
    print 'loading tagger %s' % args.tagger
-tagger = nltk.data.load(args.tagger)
+if args.tagger == 'pattern':
+    tagger = taggers.PatternTagger()
+else:
+    tagger = load_model(args.tagger)
if args.trace:
    print 'loading chunker %s' % args.chunker
-chunker = nltk.data.load(args.chunker)
+if args.chunker == 'pattern':
+    chunker = chunkers.PatternChunker()
+else:
+    chunker = load_model(args.chunker)
#######################
## coverage analysis ##
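Taken together with the new imports above, these hunks let analyze_chunker_coverage.py accept either the literal name pattern or a model path for --tagger and --chunker. A minimal sketch of the same dispatch outside the script (not part of the diff; the resolve_tagger helper is hypothetical, while load_model and PatternTagger are added in this commit):

from nltk_trainer import load_model
from nltk_trainer.tagging import taggers

def resolve_tagger(name):
    # hypothetical helper mirroring the inline logic above: the literal
    # name 'pattern' selects the pattern.en-backed tagger, anything else
    # is treated as an nltk.data path or pickle file
    if name == 'pattern':
        return taggers.PatternTagger()
    return load_model(name)

tagger = resolve_tagger('pattern')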
analyze_classifier_coverage.py (43 changed lines)
@@ -1,5 +1,5 @@
#!/usr/bin/python
-import argparse, collections, itertools, operator, re, string
+import argparse, collections, itertools, operator, re, string, time
import cPickle as pickle
import nltk.data
from nltk.classify.util import accuracy
@@ -22,6 +22,8 @@
    help='How much trace output you want, defaults to 1. 0 is no trace output.')
parser.add_argument('--metrics', action='store_true', default=False,
    help='Use classified instances to determine classifier accuracy, precision & recall')
+parser.add_argument('--speed', action='store_true', default=False,
+    help='Determine average instance classification speed.')
corpus_group = parser.add_argument_group('Corpus Reader Options')
corpus_group.add_argument('--reader',
@@ -46,7 +48,7 @@
feat_group = parser.add_argument_group('Feature Extraction',
    'The default is to lowercase every word, strip punctuation, and use stopwords')
-feat_group.add_argument('--ngrams', action='append', type=int,
+feat_group.add_argument('--ngrams', nargs='+', type=int,
    help='use n-grams as features.')
feat_group.add_argument('--no-lowercase', action='store_true', default=False,
    help="don't lowercase every word")
@@ -114,11 +116,18 @@ def norm_words(words):
## text extraction ##
#####################
+if args.speed:
+    load_start = time.time()
+
try:
    classifier = nltk.data.load(args.classifier)
except LookupError:
    classifier = pickle.load(open(args.classifier))
+if args.speed:
+    load_secs = time.time() - load_start
+    print 'loading time: %dsecs' % load_secs
+
if args.metrics:
    label_instance_function = {
        'sents': corpus.category_sent_words,
@@ -149,21 +158,35 @@ def norm_words(words):
        print '%s recall: %f' % (label, recall(ref, test) or 0)
        print '%s f-measure: %f' % (label, f_measure(ref, test) or 0)
else:
-    instance_function = {
-        'sents': categorized_corpus.sents,
-        'paras': lambda: [itertools.chain(*para) for para in categorized_corpus.paras()]
-        # TODO: support files
-    }
+    if args.instances == 'sents':
+        texts = categorized_corpus.sents()
+        total = len(texts)
+    elif args.instances == 'paras':
+        texts = (itertools.chain(*para) for para in categorized_corpus.paras())
+        total = len(categorized_corpus.paras())
+    elif args.instances == 'files':
+        texts = (categorized_corpus.words(fileids=[fid]) for fid in categorized_corpus.fileids())
+        total = len(categorized_corpus.fileids())
-    texts = instance_function[args.instances]()
-    stop = int(len(texts)*args.fraction)
-    feats = [bag_of_words(norm_words(i)) for i in texts[:stop]]
+    stop = int(total * args.fraction)
+    feats = (bag_of_words(norm_words(i)) for i in itertools.islice(texts, stop))
label_counts = collections.defaultdict(int)
+if args.speed:
+    time_start = time.time()
+
for feat in feats:
    label = classifier.classify(feat)
    label_counts[label] += 1
+if args.speed:
+    time_end = time.time()
+
for label in sorted(label_counts.keys()):
    print label, label_counts[label]
+
+if args.speed:
+    secs = (time_end - time_start)
+    nfeats = sum(label_counts.values())
+    print 'average time per classify: %dsecs / %d feats = %f ms/feat' % (secs, nfeats, (float(secs) / nfeats) * 1000)
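The rewritten instance selection builds texts as a lazy sequence or generator and slices it with itertools.islice, so feature sets are produced on demand rather than held in one list. The new --speed flag then brackets model loading and the classify loop with time.time() calls and reports an average per-instance cost; the final print is just total seconds divided by the number of classified feature sets, converted to milliseconds. A standalone sketch of that arithmetic with hypothetical numbers (not part of the diff):

secs = 2.5    # hypothetical total time spent in the classify loop
nfeats = 500  # hypothetical number of classified feature sets
print 'average time per classify: %dsecs / %d feats = %f ms/feat' % (
    secs, nfeats, (float(secs) / nfeats) * 1000)
# prints: average time per classify: 2secs / 500 feats = 5.000000 ms/feat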
analyze_tagger_coverage.py (16 changed lines)
@@ -4,7 +4,8 @@
from nltk.corpus.util import LazyCorpusLoader
from nltk.probability import FreqDist
from nltk.tag.simplify import simplify_wsj_tag
-from nltk_trainer import load_corpus_reader
+from nltk_trainer import load_corpus_reader, load_model
+from nltk_trainer.tagging import taggers
########################################
## command options & argument parsing ##
@@ -62,15 +63,10 @@
if args.trace:
    print 'loading tagger %s' % args.tagger
-try:
-    tagger = nltk.data.load(args.tagger)
-except LookupError:
-    try:
-        import cPickle as pickle
-    except ImportError:
-        import pickle
-
-    tagger = pickle.load(open(os.path.expanduser(args.tagger)))
+if args.tagger == 'pattern':
+    tagger = taggers.PatternTagger()
+else:
+    tagger = load_model(args.tagger)
#######################
## coverage analysis ##
nltk_trainer/__init__.py (12 changed lines)
@@ -1,11 +1,15 @@
import os, os.path, re, time
-import cPickle as pickle
import nltk.data
from nltk.corpus.util import LazyCorpusLoader
from nltk.misc import babelfish
from nltk.tag.simplify import simplify_wsj_tag
from nltk_trainer.tagging.readers import NumberedTaggedSentCorpusReader
+try:
+    import cPickle as pickle
+except ImportError:
+    import pickle
+
def dump_object(obj, fname, trace=1):
dirname = os.path.dirname(fname)
@@ -22,6 +26,12 @@ def dump_object(obj, fname, trace=1):
    pickle.dump(obj, f)
    f.close()
+def load_model(path):
+    try:
+        return nltk.data.load(path)
+    except LookupError:
+        return pickle.load(open(os.path.expanduser(path)))
+
def import_attr(path):
    basepath, name = path.rsplit('.', 1)
    mod = __import__(basepath, globals(), locals(), [name])
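The new load_model() helper centralizes the two loading strategies the analyze scripts previously duplicated: try nltk.data.load() first, then fall back to unpickling a filesystem path (with ~ expanded). A hedged usage sketch (not part of the diff; the first path is NLTK 2.x's standard English tagger resource, the second is purely illustrative):

from nltk_trainer import load_model

# resolved through nltk.data, searching the nltk_data directories
tagger = load_model('taggers/maxent_treebank_pos_tagger/english.pickle')

# falls back to pickle.load() for a plain file path; ~ is expanded
# custom_tagger = load_model('~/models/my_tagger.pickle')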
nltk_trainer/chunking/chunkers.py (15 changed lines)
@@ -115,3 +115,18 @@ def parse(self, tagged_sent):
        if not tagged_sent: return None
        chunks = self.tagger.tag(tagged_sent)
        return conlltags2tree([(w,t,c) for ((w,t),c) in chunks])
+
+#############
+## pattern ##
+#############
+
+class PatternChunker(ChunkParserI):
+    def parse(self, tagged_sent):
+        # don't import at top since don't want to fail if not installed
+        from pattern.en import parse
+        s = ' '.join([word for word, tag in tagged_sent])
+        # not tokenizing ensures that the number of tagged tokens returned is
+        # the same as the number of input tokens
+        sents = parse(s, tokenize=False).split()
+        if not sents: return None
+        return conlltags2tree([(w, t, c) for w, t, c, p in sents[0]])
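PatternChunker defers the pattern.en import to parse() so nltk-trainer still imports cleanly when the pattern library isn't installed. A hedged usage sketch (not part of the diff; requires pattern, and the chunk labels in the comment are illustrative):

from nltk_trainer.chunking.chunkers import PatternChunker

chunker = PatternChunker()
tree = chunker.parse([('The', 'DT'), ('little', 'JJ'), ('dog', 'NN'), ('barked', 'VBD')])
# parse() returns an nltk Tree built from IOB chunk tags, roughly
# (S (NP The/DT little/JJ dog/NN) (VP barked/VBD)), or None for empty input
print tree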
nltk_trainer/tagging/taggers.py (12 changed lines)
@@ -1,6 +1,6 @@
from nltk.tag.sequential import SequentialBackoffTagger
from nltk.probability import FreqDist
-from nltk.tag import ClassifierBasedPOSTagger
+from nltk.tag import ClassifierBasedPOSTagger, TaggerI, str2tuple
from nltk_trainer.featx import phonetics
from nltk_trainer.featx.metaphone import dm
@@ -44,4 +44,12 @@ def choose_tag(self, tokens, index, history):
        for tagger in self._taggers:
            tags.inc(tagger.choose_tag(tokens, index, history))
-        return tags.max()
+        return tags.max()
+
+class PatternTagger(TaggerI):
+    def tag(self, tokens):
+        # don't import at top since don't want to fail if not installed
+        from pattern.en import tag
+        # not tokenizing ensures that the number of tagged tokens returned is
+        # the same as the number of input tokens
+        return tag(u' '.join(tokens), tokenize=False)
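PatternTagger takes the same approach: it implements TaggerI.tag() by joining the tokens and calling pattern.en.tag() with tokenize=False, so the output pairs line up one-to-one with the input tokens. A hedged usage sketch (not part of the diff; requires pattern, and the tags shown in the comment are illustrative):

from nltk_trainer.tagging.taggers import PatternTagger

tagger = PatternTagger()
print tagger.tag(['The', 'little', 'dog', 'barked'])
# e.g. [(u'The', u'DT'), (u'little', u'JJ'), (u'dog', u'NN'), (u'barked', u'VBD')]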