Permalink
Browse files

Support the --simplify_tags option for analyze_tagger_coverage.py metrics

  • Loading branch information...
1 parent 43cb97d · commit ca62cf99b495e5913147496817b311d1e3f8b1a7 · @japerk committed Aug 7, 2011
Showing with 24 additions and 6 deletions.
  1. +1 −3 analyze_tagged_corpus.py
  2. +10 −1 analyze_tagger_coverage.py
  3. +13 −2 tests/analyze_tagger_coverage.sh
@@ -57,9 +57,7 @@
tag_counts = FreqDist()
word_set = set()
-if args.corpus in ['conll2000', 'switchboard']:
- kwargs = {}
-elif args.simplify_tags:
+if args.simplify_tags and args.corpus not in ['conll2000', 'switchboard']:
kwargs = {'simplify_tags': True}
else:
kwargs = {}
@@ -33,6 +33,8 @@
help='Specify fileids to load from corpus')
corpus_group.add_argument('--fraction', default=1.0, type=float,
help='''The fraction of the corpus to use for testing coverage''')
+corpus_group.add_argument('--simplify_tags', action='store_true', default=False,
+ help='Use simplified tags. Requires the --metrics option.')
args = parser.parse_args()
@@ -42,6 +44,13 @@
corpus = load_corpus_reader(args.corpus, reader=args.reader, fileids=args.fileids)
+kwargs = {'fileids': args.fileids}
+
+if args.simplify_tags and not args.metrics:
+ raise ValueError('simplify_tags can only be used with the --metrics option')
+elif args.simplify_tags and args.corpus not in ['conll2000', 'switchboard']:
+ kwargs['simplify_tags'] = True
+
# TODO: support corpora with alternatives to tagged_sents that work just as well
if args.metrics and not hasattr(corpus, 'tagged_sents'):
raise ValueError('%s does not support metrics' % args.corpus)
@@ -79,7 +88,7 @@
tag_test = []
tag_word_refs = collections.defaultdict(set)
tag_word_test = collections.defaultdict(set)
- tagged_sents = corpus.tagged_sents(fileids=args.fileids)
+ tagged_sents = corpus.tagged_sents(**kwargs)
if args.fraction != 1.0:
cutoff = int(math.ceil(len(tagged_sents) * args.fraction))
@@ -33,12 +33,23 @@ analyzing tag coverage of corpora/treebank/tagged with ClassifierBasedPOSTagger
}
it_does_not_support_metrics() {
- last_line=$(./analyze_chunker_coverage.py treebank --score 2>&1 | tail -n 1)
- test "$last_line" "=" "ValueError: treebank does not support scoring"
+ last_line=$(./analyze_tagger_coverage.py movie_reviews --metrics 2>&1 | tail -n 1)
+ test "$last_line" "=" "ValueError: movie_reviews does not support metrics"
}
it_analyzes_treebank_metrics() {
two_lines=$(./analyze_tagger_coverage.py treebank --metrics --fraction 0.5 2>&1 | head -n 5 | tail -n 2)
echo "$two_lines" | grep -q "Accuracy:"
echo "$two_lines" | grep -q "Unknown words:"
+}
+
+it_requires_metrics_with_simplify_tags() {
+ last_line=$(./analyze_tagger_coverage.py treebank --simplify_tags 2>&1 | tail -n 1)
+ test "$last_line" "=" "ValueError: simplify_tags can only be used with the --metrics option"
+}
+
+it_analyzes_treebank_simplify_tags_metrics() {
+ two_lines=$(./analyze_tagger_coverage.py treebank --simplify_tags --metrics --fraction 0.5 2>&1 | head -n 5 | tail -n 2)
+ echo "$two_lines" | grep -q "Accuracy:"
+ echo "$two_lines" | grep -q "Unknown words:"
}

0 comments on commit ca62cf9

Please sign in to comment.