/
nltk_term_index.py
executable file
·107 lines (85 loc) · 3.09 KB
/
nltk_term_index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
from __future__ import print_function

import collections
import re
import sys

import nltk

import epydoc.cli
import epydoc.docbuilder
from epydoc import log
# Words that should never become index terms (one per whitespace-separated
# token in the file); path is relative to the book's doc directory.
STOPLIST = '../../tools/nltk_term_index.stoplist'
# The book's chapter sources: ch00.xml .. ch12.xml.
FILENAMES = ['ch%02d.xml' % n for n in range(13)]
# Directory where annotated copies of the chapters are written.
TARGET_DIR = 'nlp/'
#FILENAMES = ['../doc/book/ll.xml']

# Route epydoc's log messages to the console at maximum verbosity.
logger = epydoc.cli.ConsoleLogger(0)
logger._verbosity = 5
log.register_logger(logger)
def find_all_names(stoplist):
    """Build an index of dotted names exported by the ``nltk`` package.

    Runs epydoc over ``nltk`` (including submodules) and, for every public,
    non-imported value whose canonical name starts with ``nltk``, records
    every dotted suffix of that name (e.g. ``nltk.probability.FreqDist``,
    ``probability.FreqDist``, ``FreqDist``).

    :param stoplist: collection of name strings to exclude from the index.
    :return: dict mapping each dotted-name string to the list of epydoc
        value docs it may refer to (more than one entry means the name is
        ambiguous).
    """
    ROOT = ['nltk']
    logger._verbosity = 0  # silence epydoc's very chatty build phase
    docindex = epydoc.docbuilder.build_doc_index(ROOT, add_submodules=True)
    valdocs = sorted(docindex.reachable_valdocs(
        imports=False,
        #packages=False, bases=False, submodules=False,
        # subclasses=False,
        private=False))
    logger._verbosity = 5  # restore verbose logging for the scan phase
    names = nltk.defaultdict(list)
    n = 0
    for valdoc in valdocs:
        name = valdoc.canonical_name
        # NOTE(review): relies on epydoc.apidoc being reachable as an
        # attribute even though only epydoc.docbuilder/.cli are imported
        # above — presumably pulled in transitively; confirm.
        if (name is not epydoc.apidoc.UNKNOWN and
            name is not None and name[0] == 'nltk'):
            n += 1
            for i in range(len(name)):
                key = str(name[i:])
                # Skip one-character keys and stoplisted words: both are
                # too ambiguous to be useful index terms.
                if len(key) == 1:
                    continue
                if key in stoplist:
                    continue
                names[key].append(valdoc)
    log.info('Found %s names from %s objects' % (len(names), n))
    return names
# Regions of the XML that may mention NLTK names: code listings and
# inline <literal> spans.  Raw strings so \s, \S, \w are regex escapes,
# not (invalid) string escapes.
SCAN_RE1 = r"<programlisting>[\s\S]*?</programlisting>"
SCAN_RE2 = r"<literal>[\s\S]*?</literal>"
SCAN_RE = re.compile("(%s)|(%s)" % (SCAN_RE1, SCAN_RE2))

# A candidate dotted name, e.g. ``nltk.probability.FreqDist``.
TOKEN_RE = re.compile(r'[\w\.]+')
# One line of text (used to process a matched region line by line).
LINE_RE = re.compile('.*')

# Template for the index entry appended after a line that mentions a name.
INDEXTERM = '<indexterm type="nltk"><primary>%s</primary></indexterm>'
def scan_xml(filenames, names):
    """Annotate XML chapter files with NLTK index terms.

    For each file, every line inside a <programlisting> or <literal>
    region that mentions a known NLTK name gets an <indexterm> element
    appended.  Annotated copies are written to TARGET_DIR, and a report
    of index-term frequencies is printed to stdout.

    :param filenames: XML file paths to process.
    :param names: mapping from dotted-name string to the list of epydoc
        value docs it may refer to (from find_all_names).
    """
    # Counter replaces the removed nltk.FreqDist.inc() API; counting
    # semantics are identical.
    fdist = collections.Counter()

    def linesub(match):
        # Append an <indexterm> for every known name found on this line.
        line = match.group()
        for token in TOKEN_RE.findall(line):
            if token in names:
                targets = names[token]
                fdist[token] += 1
                if len(targets) > 1:
                    log.warning('%s is ambiguous: %s' % (
                        token, ', '.join(str(v.canonical_name)
                                         for v in targets)))
                line += INDEXTERM % token
                #line += INDEXTERM % names[token][0].canonical_name
        return line

    def scansub(match):
        # Process each line of the matched listing/literal region.
        return LINE_RE.sub(linesub, match.group())

    for filename in filenames:
        log.info(' %s' % filename)
        # Text mode (not 'rb') so the str-typed regexes work on Python 3;
        # 'with' guarantees the handles are closed.
        with open(filename) as infile:
            src = infile.read()
        src = SCAN_RE.sub(scansub, src)
        # out = open(filename[:-4]+'.li.xml', 'w')
        with open(TARGET_DIR + filename, 'w') as out:
            out.write(src)

    # Report index terms, most frequent first (matches FreqDist's old
    # iteration order).
    for word, count in fdist.most_common():
        namestr = ('\n' + 38 * ' ').join([str(v.canonical_name[:-1])
                                          for v in names[word][:1]])
        print('[%3d] %-30s %s' % (count, word, namestr))
        sys.stdout.flush()
def main():
    """Load the stoplist, build the NLTK name index, and annotate the
    book's XML chapter files."""
    log.info('Loading stoplist...')
    # 'with' ensures the stoplist file handle is closed promptly
    # (the original leaked it).
    with open(STOPLIST) as f:
        stoplist = f.read().split()
    log.info(' Stoplist contains %d words' % len(stoplist))
    log.info('Running epydoc to build a name index...')
    names = find_all_names(stoplist)
    log.info('Scanning xml files...')
    scan_xml(FILENAMES, names)


# Guard so importing this module doesn't trigger a full epydoc run.
if __name__ == '__main__':
    main()