Permalink
Browse files

initial script to combine classifiers into AvgProbClassifier, make other scripts executable
  • Loading branch information...
1 parent 2ad5055 commit 8cb9fbc1742f66adbe0f0176c277a5c597bcd6b6 @japerk committed Jun 12, 2011
View
0 analyze_chunked_corpus.py 100644 → 100755
No changes.
View
0 analyze_chunker_coverage.py 100644 → 100755
No changes.
View
3 categorized_corpus2csv.py 100644 → 100755
@@ -1,3 +1,4 @@
+#!/usr/bin/python
import argparse, csv, os.path
import nltk_trainer.classification.corpus
from nltk_trainer import load_corpus_reader
@@ -63,4 +64,4 @@
w = csv.writer(f, quoting=csv.QUOTE_ALL)
for cat, text in cat_instances:
- w.writerow([cat, text])
+ w.writerow([cat, text])
View
@@ -0,0 +1,42 @@
+#!/usr/bin/python
+"""Load pickled NLTK classifiers, combine them into an AvgProbClassifier, pickle it."""
+import argparse, os.path
+import nltk.data
+from nltk_trainer import dump_object
+from nltk_trainer.classification import multi
+
+########################################
+## command options & argument parsing ##
+########################################
+
+parser = argparse.ArgumentParser(description='Combine NLTK Classifiers')
+parser.add_argument('classifiers', nargs='+',
+    help='one or more pickled classifiers to load and combine')
+# A positional without nargs is always required, so argparse never applied the
+# former default=...; the help no longer advertises a default that cannot occur.
+parser.add_argument('filename', help='Filename to pickle combined classifier (required)')
+parser.add_argument('--trace', default=1, type=int,
+    help='How much trace output you want, defaults to 1. 0 is no trace output.')
+
+args = parser.parse_args()
+
+#########################
+## combine classifiers ##
+#########################
+
+# TODO: support MaxVote combining, Hierarchical combinations
+classifiers = []
+for name in args.classifiers:
+    if args.trace:
+        print('loading %s' % name)  # one parenthesised argument: same output on Python 2 and 3
+    # NOTE(review): inputs are described as pickled classifiers - only load files you trust
+    classifiers.append(nltk.data.load(name))
+
+combined = multi.AvgProbClassifier(classifiers)
+
+##############################
+## dump combined classifier ##
+##############################
+
+# expand a leading ~ so paths such as ~/nltk_data/classifiers/combined.pickle work as typed
+dump_object(combined, os.path.expanduser(args.filename), trace=args.trace)
@@ -1,5 +1,33 @@
-import collections
-from nltk.classify import MultiClassifierI
+import collections, itertools
+from nltk.classify import ClassifierI, MultiClassifierI
+from nltk.probability import DictionaryProbDist
+
+class AvgProbClassifier(ClassifierI):
+    """Classifier that averages the label probabilities of several classifiers."""
+    def __init__(self, classifiers):
+        """Store ``classifiers`` and the sorted union of their ``labels()``."""
+        self._classifiers = classifiers
+        label_lists = [c.labels() for c in classifiers]
+        self._labels = sorted(set(itertools.chain.from_iterable(label_lists)))
+
+    def labels(self):
+        """Return the sorted label list gathered at construction."""
+        return self._labels
+
+    def classify(self, feat):
+        """Return the result of ``max()`` on the averaged distribution for ``feat``."""
+        avg_dist = self.prob_classify(feat)
+        return avg_dist.max()
+
+    def prob_classify(self, feat):
+        """Return a DictionaryProbDist of each label's mean reported probability."""
+        seen = collections.defaultdict(list)
+        for clf in self._classifiers:
+            dist = clf.prob_classify(feat)
+            for sample in dist.samples():
+                seen[sample].append(dist.prob(sample))
+        # a label is averaged only over the classifiers whose distribution lists it
+        return DictionaryProbDist(dict((k, float(sum(v)) / len(v)) for k, v in seen.items()))
class MultiBinaryClassifier(MultiClassifierI):
def __init__(self, label_classifiers):

0 comments on commit 8cb9fbc

Please sign in to comment.