# Patch summary (leading text before the first "diff --git" header is not part of any hunk).
#
# Files touched:
#   analyze_tagged_corpus.py
#     - Folds the if/elif/else that builds `kwargs` into a single condition:
#       kwargs is {'simplify_tags': True} only when args.simplify_tags is set
#       and args.corpus is not 'conll2000' or 'switchboard'; otherwise {}.
#       The old and new branches produce the same kwargs for every input.
#   analyze_tagger_coverage.py
#     - Adds a --simplify_tags flag (store_true, default False) to corpus_group.
#     - Builds kwargs = {'fileids': args.fileids} and adds 'simplify_tags': True
#       under the same corpus exclusion as above.
#     - Raises ValueError('simplify_tags can only be used with the --metrics
#       option') when --simplify_tags is given without --metrics.
#     - Calls corpus.tagged_sents(**kwargs) instead of passing fileids directly.
#   tests/analyze_tagger_coverage.sh
#     - it_does_not_support_metrics previously ran ./analyze_chunker_coverage.py
#       with --score; it now runs ./analyze_tagger_coverage.py movie_reviews
#       --metrics and expects "ValueError: movie_reviews does not support metrics".
#     - Adds it_requires_metrics_with_simplify_tags and
#       it_analyzes_treebank_simplify_tags_metrics.
#
# NOTE(review): in the second analyze_tagger_coverage.py hunk the new
#   --simplify_tags/--metrics check is added after the
#   `corpus = load_corpus_reader(...)` context line, so the argument error is
#   only reported once that call has returned - consider validating earlier.
# NOTE(review): the "\ No newline at end of file" marker shows the shell test
#   file still ends without a trailing newline after this patch.
# NOTE(review): this copy of the patch appears to have had its line breaks
#   collapsed; presumably it must be restored from the original commit before
#   it can be applied - confirm.
diff --git a/analyze_tagged_corpus.py b/analyze_tagged_corpus.py index 530847c..7a4d51c 100755 --- a/analyze_tagged_corpus.py +++ b/analyze_tagged_corpus.py @@ -57,9 +57,7 @@ tag_counts = FreqDist() word_set = set() -if args.corpus in ['conll2000', 'switchboard']: - kwargs = {} -elif args.simplify_tags: +if args.simplify_tags and args.corpus not in ['conll2000', 'switchboard']: kwargs = {'simplify_tags': True} else: kwargs = {} diff --git a/analyze_tagger_coverage.py b/analyze_tagger_coverage.py index e679d6f..0d3782e 100755 --- a/analyze_tagger_coverage.py +++ b/analyze_tagger_coverage.py @@ -33,6 +33,8 @@ help='Specify fileids to load from corpus') corpus_group.add_argument('--fraction', default=1.0, type=float, help='''The fraction of the corpus to use for testing coverage''') +corpus_group.add_argument('--simplify_tags', action='store_true', default=False, + help='Use simplified tags. Requires the --metrics option.') args = parser.parse_args() @@ -42,6 +44,13 @@ corpus = load_corpus_reader(args.corpus, reader=args.reader, fileids=args.fileids) +kwargs = {'fileids': args.fileids} + +if args.simplify_tags and not args.metrics: + raise ValueError('simplify_tags can only be used with the --metrics option') +elif args.simplify_tags and args.corpus not in ['conll2000', 'switchboard']: + kwargs['simplify_tags'] = True + # TODO: support corpora with alternatives to tagged_sents that work just as well if args.metrics and not hasattr(corpus, 'tagged_sents'): raise ValueError('%s does not support metrics' % args.corpus) @@ -79,7 +88,7 @@ tag_test = [] tag_word_refs = collections.defaultdict(set) tag_word_test = collections.defaultdict(set) - tagged_sents = corpus.tagged_sents(fileids=args.fileids) + tagged_sents = corpus.tagged_sents(**kwargs) if args.fraction != 1.0: cutoff = int(math.ceil(len(tagged_sents) * args.fraction)) diff --git a/tests/analyze_tagger_coverage.sh b/tests/analyze_tagger_coverage.sh index f45a07f..56c4c2d 100755 --- 
a/tests/analyze_tagger_coverage.sh +++ b/tests/analyze_tagger_coverage.sh @@ -33,12 +33,23 @@ analyzing tag coverage of corpora/treebank/tagged with ClassifierBasedPOSTagger } it_does_not_support_metrics() { - last_line=$(./analyze_chunker_coverage.py treebank --score 2>&1 | tail -n 1) - test "$last_line" "=" "ValueError: treebank does not support scoring" + last_line=$(./analyze_tagger_coverage.py movie_reviews --metrics 2>&1 | tail -n 1) + test "$last_line" "=" "ValueError: movie_reviews does not support metrics" } it_analyzes_treebank_metrics() { two_lines=$(./analyze_tagger_coverage.py treebank --metrics --fraction 0.5 2>&1 | head -n 5 | tail -n 2) echo "$two_lines" | grep -q "Accuracy:" echo "$two_lines" | grep -q "Unknown words:" +} + +it_requires_metrics_with_simplify_tags() { + last_line=$(./analyze_tagger_coverage.py treebank --simplify_tags 2>&1 | tail -n 1) + test "$last_line" "=" "ValueError: simplify_tags can only be used with the --metrics option" +} + +it_analyzes_treebank_simplify_tags_metrics() { + two_lines=$(./analyze_tagger_coverage.py treebank --simplify_tags --metrics --fraction 0.5 2>&1 | head -n 5 | tail -n 2) + echo "$two_lines" | grep -q "Accuracy:" + echo "$two_lines" | grep -q "Unknown words:" } \ No newline at end of file