Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Newer
Older
100755 112 lines (88 sloc) 3.318 kb
3c28516 Correctly pull in the environment python
Keoki Seu authored
1 #!/usr/bin/env python
2ca3b0d @japerk analyze chunked corpus works for v3
authored
2 import argparse, collections
ad64c1e @japerk initial chunked corpus analyzer
authored
3 import nltk.corpus
4 from nltk.tree import Tree
5 from nltk.corpus.util import LazyCorpusLoader
94a5143 @japerk simplify tags is optional
authored
6 from nltk_trainer import load_corpus_reader, simplify_wsj_tag
2ca3b0d @japerk analyze chunked corpus works for v3
authored
7 from nltk_trainer.chunking.transforms import node_label
ad64c1e @japerk initial chunked corpus analyzer
authored
8
########################################
## command options & argument parsing ##
########################################

# RawTextHelpFormatter keeps the explicit line breaks in the help strings below.
parser = argparse.ArgumentParser(description='Analyze a chunked corpus',
    formatter_class=argparse.RawTextHelpFormatter)

parser.add_argument('corpus',
    help=('The name of a chunked corpus included with NLTK, such as\n'
        'treebank_chunk or conll2002, or the root path to a corpus directory,\n'
        'which can be either an absolute path or relative to a nltk_data directory.'))
parser.add_argument('--trace', default=1, type=int,
    help='How much trace output you want, defaults to %(default)d. 0 is no trace output.')

corpus_group = parser.add_argument_group('Corpus Reader Options')
corpus_group.add_argument('--reader', default=None,
    help=('Full module path to a corpus reader class, such as\n'
        'nltk.corpus.reader.chunked.ChunkedCorpusReader'))
corpus_group.add_argument('--fileids', default=None,
    help='Specify fileids to load from corpus')

# The option is only offered when nltk_trainer exposes a truthy simplify_wsj_tag.
# NOTE(review): args.simplify_tags is not read anywhere in the visible script --
# confirm whether the option is still meant to have an effect.
if simplify_wsj_tag:
    corpus_group.add_argument('--simplify_tags', action='store_true', default=False,
        help='Use simplified tags')

sort_group = parser.add_argument_group('Tag Count Sorting Options')
sort_group.add_argument('--sort', default='tag', choices=['tag', 'count'],
    help='Sort key, defaults to %(default)s')
# Help text previously read "revere order"; corrected to "reverse".
sort_group.add_argument('--reverse', action='store_true', default=False,
    help='Sort in reverse order')

args = parser.parse_args()
41
###################
## corpus reader ##
###################

chunked_corpus = load_corpus_reader(args.corpus, reader=args.reader, fileids=args.fileids)

if not chunked_corpus:
    # Interpolate the corpus name: the message used to be raised with a
    # literal, unformatted '%s' because the format argument was missing.
    raise ValueError('%s is an unknown corpus' % args.corpus)

# Any non-zero --trace value enables the progress message.
if args.trace:
    print('loading %s' % args.corpus)
ad64c1e @japerk initial chunked corpus analyzer
authored
53
##############
## counting ##
##############

# Accumulators read by the output section:
#   wc             - total number of (word, tag) tokens seen
#   word_set       - distinct words
#   tag_counts     - tokens per tag
#   iob_counts     - chunks (Tree objects) per chunk label
#   tag_iob_counts - tokens per tag, split by the label of the enclosing chunk
word_set = set()
tag_counts = collections.defaultdict(int)
iob_counts = collections.defaultdict(int)
tag_iob_counts = collections.defaultdict(lambda: collections.defaultdict(int))
wc = 0

for item in chunked_corpus.chunked_words():
    in_chunk = isinstance(item, Tree)

    if in_chunk:
        # A Tree is one chunk: count its label once, then walk its leaves.
        label = node_label(item)
        iob_counts[label] += 1
        tagged_words = item.leaves()
    else:
        # Anything else is unpacked as a single (word, tag) pair outside a chunk.
        tagged_words = [item]

    for word, tag in tagged_words:
        wc += 1
        word_set.add(word)
        tag_counts[tag] += 1

        # Only tokens inside a chunk contribute to the per-label breakdown.
        if in_chunk:
            tag_iob_counts[tag][label] += 1
ad64c1e @japerk initial chunked corpus analyzer
authored
79
############
## output ##
############

# Corpus-wide summary, one line per statistic; the trailing newline on the
# last template leaves a blank line after the summary.
summary_lines = (
    ('%d total words', wc),
    ('%d unique words', len(word_set)),
    ('%d tags', len(tag_counts)),
    ('%d IOBs\n', len(iob_counts)),
)

for template, value in summary_lines:
    print(template % value)

# Sort key functions; each receives an indexable pair and picks element 0
# (for --sort tag) or element 1 (for --sort count).
sort_keys = {
    'tag': lambda tc: tc[0],
    'count': lambda tc: tc[1],
}

if args.sort not in sort_keys:
    raise ValueError('%s is not a valid sort option' % args.sort)

sort_key = sort_keys[args.sort]
95
46347b5 @japerk count IOBs by tag and print conditional counts in IOB columns
authored
# Tag/count table: one row per tag, followed by one column per chunk label
# holding how many tokens with that tag occurred inside chunks of that label.
#
# Column widths are defined once and shared by the header, the '=' rules and
# the data rows, so the three cannot drift out of alignment (the header was
# previously built from fixed literals that did not match the row widths).
tag_width = 7
count_width = 9

iobs = sorted(iob_counts.keys())
iob_widths = [4 + len(iob) for iob in iobs]

titles = ['Tag', 'Count'] + iobs
widths = [tag_width, count_width] + iob_widths

line1 = ' '.join(title.center(width) for title, width in zip(titles, widths))
line2 = ' '.join('=' * width for width in widths)

print(line1)
print(line2)

for tag, count in sorted(tag_counts.items(), key=sort_key, reverse=args.reverse):
    # Use a dedicated name for the row cells instead of rebinding iob_counts,
    # which would replace the label-count dict with a list of strings.
    cells = [tag.ljust(tag_width), str(count).rjust(count_width)]
    cells.extend(str(tag_iob_counts[tag][iob]).rjust(width)
        for iob, width in zip(iobs, iob_widths))
    print(' '.join(cells))

print(line2)
Something went wrong with that request. Please try again.