Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

dynamic formatting for tag columns

  • Loading branch information...
commit f5cc16a860d616505a37b183b5472dcc7fe8bbd8 1 parent faa2a65
@japerk authored
Showing 2 changed files with 27 additions and 13 deletions.
  1. +9 −4 analyze_tagged_corpus.py
  2. +18 −9 analyze_tagger_coverage.py
View
13 analyze_tagged_corpus.py
@@ -55,6 +55,7 @@
wc = 0
tag_counts = FreqDist()
+taglen = 7
word_set = set()
if args.simplify_tags and args.corpus not in ['conll2000', 'switchboard']:
@@ -63,6 +64,9 @@
kwargs = {}
for word, tag in tagged_corpus.tagged_words(fileids=args.fileids, **kwargs):
+ if len(tag) > taglen:
+ taglen = len(tag)
+
if args.corpus in ['conll2000', 'switchboard'] and args.simplify_tags:
tag = simplify_wsj_tag(tag)
@@ -85,11 +89,12 @@
else:
raise ValueError('%s is not a valid sort option' % args.sort)
+countlen = max(len(str(tag_counts[tag_counts.max()])) + 2, 9)
# simple reSt table format
-print ' Tag Count '
-print '======= ========='
+print ' '.join(['Tag'.center(taglen), 'Count'.center(countlen)])
+print ' '.join(['='*taglen, '='*(countlen)])
for tag, count in sorted(tag_counts.items(), key=sort_key, reverse=args.reverse):
- print ' '.join([tag.ljust(7), str(count).rjust(9)])
+ print ' '.join([tag.ljust(taglen), str(count).rjust(countlen)])
-print '======= ========='
+print ' '.join(['='*taglen, '='*(countlen)])
View
27 analyze_tagger_coverage.py
@@ -89,6 +89,7 @@
tag_word_refs = collections.defaultdict(set)
tag_word_test = collections.defaultdict(set)
tagged_sents = corpus.tagged_sents(**kwargs)
+ taglen = 7
if args.fraction != 1.0:
cutoff = int(math.ceil(len(tagged_sents) * args.fraction))
@@ -99,6 +100,9 @@
tags_actual.inc(tag)
tag_refs.append(tag)
tag_word_refs[tag].add(word)
+
+ if len(tag) > taglen:
+ taglen = len(tag)
for word, tag in tagger.tag(nltk.tag.untag(tagged_sent)):
tags_found.inc(tag)
@@ -111,24 +115,26 @@
print 'Accuracy: %f' % nltk.metrics.accuracy(tag_refs, tag_test)
print 'Unknown words: %d' % len(unknown_words)
- if args.trace:
+ if args.trace and unknown_words:
print ', '.join(sorted(unknown_words))
print ''
- print ' Tag Found Actual Precision Recall '
- print '======= ========= ========== ============= =========='
+ print ' '.join(['Tag'.center(taglen), 'Found'.center(9), 'Actual'.center(10),
+ 'Precision'.center(13), 'Recall'.center(10)])
+ print ' '.join(['='*taglen, '='*9, '='*10, '='*13, '='*10])
for tag in sorted(set(tags_found.keys()) | set(tags_actual.keys())):
found = tags_found[tag]
actual = tags_actual[tag]
precision = nltk.metrics.precision(tag_word_refs[tag], tag_word_test[tag])
recall = nltk.metrics.recall(tag_word_refs[tag], tag_word_test[tag])
- print ' '.join([tag.ljust(7), str(found).rjust(9), str(actual).rjust(10),
+ print ' '.join([tag.ljust(taglen), str(found).rjust(9), str(actual).rjust(10),
str(precision).ljust(13)[:13], str(recall).ljust(10)[:13]])
- print '======= ========= ========== ============= =========='
+ print ' '.join(['='*taglen, '='*9, '='*10, '='*13, '='*10])
else:
sents = corpus.sents()
+ taglen = 7
if args.fraction != 1.0:
cutoff = int(math.ceil(len(sents) * args.fraction))
@@ -137,11 +143,14 @@
for sent in sents:
for word, tag in tagger.tag(sent):
tags_found.inc(tag)
+
+ if len(tag) > taglen:
+ taglen = len(tag)
- print ' Tag Found '
- print '======= ========='
+ print ' '.join(['Tag'.center(taglen), 'Count'.center(9)])
+ print ' '.join(['='*taglen, '='*9])
for tag in sorted(tags_found.samples()):
- print ' '.join([tag.ljust(7), str(tags_found[tag]).rjust(9)])
+ print ' '.join([tag.ljust(taglen), str(tags_found[tag]).rjust(9)])
- print '======= ========='
+ print ' '.join(['='*taglen, '='*9])
Please sign in to comment.
Something went wrong with that request. Please try again.