Skip to content

Commit

Permalink
support simplify_tags for analyze_tagger_coverage metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
japerk committed Aug 7, 2011
1 parent 43cb97d commit ca62cf9
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 6 deletions.
4 changes: 1 addition & 3 deletions analyze_tagged_corpus.py
Expand Up @@ -57,9 +57,7 @@
tag_counts = FreqDist() tag_counts = FreqDist()
word_set = set() word_set = set()


if args.corpus in ['conll2000', 'switchboard']: if args.simplify_tags and args.corpus not in ['conll2000', 'switchboard']:
kwargs = {}
elif args.simplify_tags:
kwargs = {'simplify_tags': True} kwargs = {'simplify_tags': True}
else: else:
kwargs = {} kwargs = {}
Expand Down
11 changes: 10 additions & 1 deletion analyze_tagger_coverage.py
Expand Up @@ -33,6 +33,8 @@
help='Specify fileids to load from corpus') help='Specify fileids to load from corpus')
corpus_group.add_argument('--fraction', default=1.0, type=float, corpus_group.add_argument('--fraction', default=1.0, type=float,
help='''The fraction of the corpus to use for testing coverage''') help='''The fraction of the corpus to use for testing coverage''')
corpus_group.add_argument('--simplify_tags', action='store_true', default=False,
help='Use simplified tags. Requires the --metrics option.')


args = parser.parse_args() args = parser.parse_args()


Expand All @@ -42,6 +44,13 @@


corpus = load_corpus_reader(args.corpus, reader=args.reader, fileids=args.fileids) corpus = load_corpus_reader(args.corpus, reader=args.reader, fileids=args.fileids)


# Keyword arguments later expanded into corpus.tagged_sents(**kwargs).
kwargs = {'fileids': args.fileids}

if args.simplify_tags:
    # Simplified tags only affect the metrics computation, so the flag is
    # rejected when --metrics was not requested.
    if not args.metrics:
        raise ValueError('simplify_tags can only be used with the --metrics option')

    # conll2000 and switchboard are deliberately left out; presumably their
    # readers do not accept a simplify_tags argument -- TODO confirm.
    if args.corpus not in ('conll2000', 'switchboard'):
        kwargs['simplify_tags'] = True

# TODO: support corpora with alternatives to tagged_sents that work just as well # TODO: support corpora with alternatives to tagged_sents that work just as well
if args.metrics and not hasattr(corpus, 'tagged_sents'): if args.metrics and not hasattr(corpus, 'tagged_sents'):
raise ValueError('%s does not support metrics' % args.corpus) raise ValueError('%s does not support metrics' % args.corpus)
Expand Down Expand Up @@ -79,7 +88,7 @@
tag_test = [] tag_test = []
tag_word_refs = collections.defaultdict(set) tag_word_refs = collections.defaultdict(set)
tag_word_test = collections.defaultdict(set) tag_word_test = collections.defaultdict(set)
tagged_sents = corpus.tagged_sents(fileids=args.fileids) tagged_sents = corpus.tagged_sents(**kwargs)


if args.fraction != 1.0: if args.fraction != 1.0:
cutoff = int(math.ceil(len(tagged_sents) * args.fraction)) cutoff = int(math.ceil(len(tagged_sents) * args.fraction))
Expand Down
15 changes: 13 additions & 2 deletions tests/analyze_tagger_coverage.sh
Expand Up @@ -33,12 +33,23 @@ analyzing tag coverage of corpora/treebank/tagged with ClassifierBasedPOSTagger
} }


it_does_not_support_metrics() { it_does_not_support_metrics() {
last_line=$(./analyze_chunker_coverage.py treebank --score 2>&1 | tail -n 1) last_line=$(./analyze_tagger_coverage.py movie_reviews --metrics 2>&1 | tail -n 1)
test "$last_line" "=" "ValueError: treebank does not support scoring" test "$last_line" "=" "ValueError: movie_reviews does not support metrics"
} }


it_analyzes_treebank_metrics() { it_analyzes_treebank_metrics() {
two_lines=$(./analyze_tagger_coverage.py treebank --metrics --fraction 0.5 2>&1 | head -n 5 | tail -n 2) two_lines=$(./analyze_tagger_coverage.py treebank --metrics --fraction 0.5 2>&1 | head -n 5 | tail -n 2)
echo "$two_lines" | grep -q "Accuracy:" echo "$two_lines" | grep -q "Accuracy:"
echo "$two_lines" | grep -q "Unknown words:" echo "$two_lines" | grep -q "Unknown words:"
}

# Runs the coverage script on treebank with --simplify_tags but without
# --metrics, and checks that the last line of the combined stdout/stderr
# output is the expected ValueError message.
it_requires_metrics_with_simplify_tags() {
    last_line=$(
        ./analyze_tagger_coverage.py treebank --simplify_tags 2>&1 |
            tail -n 1
    )
    [ "$last_line" = "ValueError: simplify_tags can only be used with the --metrics option" ]
}

# Runs the coverage script on half of treebank with both --simplify_tags and
# --metrics, keeps the last two of the first five output lines (stdout and
# stderr combined), and greps them for the "Accuracy:" and "Unknown words:"
# labels.
# NOTE(review): a failing first grep only fails the test if the harness aborts
# on a non-zero status (e.g. errexit); otherwise the function's status is that
# of the last grep -- confirm against the test runner.
it_analyzes_treebank_simplify_tags_metrics() {
    two_lines=$(
        ./analyze_tagger_coverage.py treebank --simplify_tags --metrics --fraction 0.5 2>&1 |
            head -n 5 |
            tail -n 2
    )
    echo "$two_lines" | grep -q "Accuracy:"
    echo "$two_lines" | grep -q "Unknown words:"
}

0 comments on commit ca62cf9

Please sign in to comment.