# load.py
#
# Builds db.marshal from data/cmudict.txt, data/wordnet/data.* and the
# corpora (directories under corpora/) named on the command line.
import os
import re
import gc
import sys
import glob
import marshal
from poemy import soundparts, soundparts_left
# "READ(1)" -> "READ": cmudict marks alternate pronunciations with "(N)".
_VARIANT_SUFFIX = re.compile(r'\(\d+\)$')

# Syllable counts whose words are used to train the stress tables.
_MULTI_SYLLABLE_COUNTS = range(2, 8)

# wordnet ss_type letter -> name of the db set that collects those words.
_WORDNET_SETS = {
    'r': 'adverbs',
    'n': 'nouns',
    'a': 'adjectives',
    's': 'adjectives',
    'v': 'verbs',
}


def _new_db():
    """Return the empty database skeleton that the loaders below fill in."""
    return {
        'cmudict': {},        # delisted -> ['D IY1 L IH1 S T IH0 D', ...]
        'sounds': {},         # delisted -> ['D IY L IH S T IH D', ...]
        'meters': {},         # delisted -> ['110', ...]
        'rhyme': {},          # IH D -> wretched, winded, wielded, ...
        'brhyme': {},         # IY -> regal, eagle, ...
        'frhyme': {},         # EY N T AH D -> painted, acquainted, ...
        'meterwords': {},     # 110 -> delisted, digested, discounted, ...
        'syl2words': {},      # 1 -> cat, hat, log, dog, bam, doh, ...
        'front': {},          # T -> typo, tycoon, tye, ...
        'back': {},           # T -> zealot, what, hat, zapped, ...
        'lex2words': {},      # 8 -> womb, tissue, thumb, scab, ...
        'word2lex': {},       # tissue -> 8, 27, 36
        'adjectives': set(),  # all english adjectives from wordnet
        'adverbs': set(),     # all english adverbs from wordnet
        'nouns': set(),       # all english nouns from wordnet
        'verbs': set(),       # all english verbs from wordnet
        'words': set(),       # all words from specified corpora
        'chain': {},          # markov chain built from specified corpora
    }


def _split_pron(pron):
    """Split a cmudict pronunciation into (sound, meter).

    sound is the pronunciation with the stress digits removed; meter is the
    string of stress digits alone, with secondary stress ('2') folded into
    primary stress ('1').
    """
    sound = re.sub(r'\d', '', pron)
    meter = re.sub(r'\D', '', pron).replace('2', '1')
    return sound, meter


def _parse_cmudict_line(line):
    """Parse one cmudict line into (word, pron), or None for a skipped line.

    Blank lines and ';;;' comment lines yield None.  The word is lowercased,
    loses any trailing "(N)" variant marker and has '_' turned into ' '.
    """
    if not line.strip() or line.startswith(';;;'):
        return None
    # Split once, on the first run of whitespace: the pronunciation itself
    # contains spaces, so an unbounded split cannot be unpacked into two names.
    word, pron = line.split(None, 1)
    word = _VARIANT_SUFFIX.sub('', word)
    return word.lower().replace('_', ' '), pron.strip()


def _edge_sounds(sound):
    """Return the (start, end) lookup keys used by the stress tables.

    Both values come from poemy's soundparts_left / soundparts helpers; only
    the indexing used here is known from this file.
    """
    return soundparts_left(sound)[0][0], soundparts(sound)[1][-1]


def _load_cmudict(db, path='data/cmudict.txt'):
    """Fill the pronunciation, meter and rhyme tables of db from cmudict."""
    with open(path) as handle:
        for line in handle:
            entry = _parse_cmudict_line(line)
            if entry is None:
                continue
            word, pron = entry
            sound, meter = _split_pron(pron)
            db['cmudict'].setdefault(word, []).append(pron)
            db['sounds'].setdefault(word, []).append(sound)
            db['syl2words'].setdefault(len(meter), set()).add(word)
            # Single syllable meters are guessed later, see _stress_tables().
            if len(meter) > 1:
                db['meters'].setdefault(word, []).append(meter)
                db['meterwords'].setdefault(meter, set()).add(word)
            snds = sound.split()
            db['front'].setdefault(snds[0], set()).add(word)
            db['back'].setdefault(snds[-1], set()).add(word)
            _, syls = soundparts(sound)
            db['rhyme'].setdefault(syls[-1], set()).add(word)
            db['brhyme'].setdefault(syls[0], set()).add(word)
            # Index the word under every suffix of two or more parts.
            while len(syls) >= 2:
                db['frhyme'].setdefault(' '.join(syls), set()).add(word)
                syls = syls[1:]


def _stressed_ratio(counts):
    """Turn {key: [unstressed, stressed]} counts into {key: P(stressed)}."""
    return {key: v[1] / (v[0] + v[1]) for key, v in counts.items()}


def _stress_tables(db):
    """Return (starters, enders): probability of stress per start/end sound.

    ok so here's the deal: cmudict does a great job telling us which
    syllables are stressed in words with more than one syllable, but when it
    comes to single syllable words, it always marks them as stressed.  this
    is incorrect because some words are highly stressed (like: cat, dog,
    bog) and some words are almost never stressed (like: on, a, the).  we
    need to be able to tell the difference in order to maintain a consistent
    meter.

    so what we do is we take all the multi-syllable words and build two
    tables that tell us the probability that a word starting or ending with
    a particular sound will be stressed.  then we go through each single
    syllable word and compare its start/end sounds to the probability tables
    to determine the likelihood that the word is stressed.
    """
    starters = {}
    enders = {}
    training_words = set()
    for count in _MULTI_SYLLABLE_COUNTS:
        # .get: a small dictionary may have no words of some length.
        training_words |= db['syl2words'].get(count, set())
    for word in training_words:
        # Derive sound and meter from the same pronunciation.  db['sounds']
        # holds every variant but db['meters'] only the multi-syllable ones,
        # so zipping those two lists can pair a sound with the wrong meter.
        for pron in db['cmudict'][word]:
            sound, meter = _split_pron(pron)
            if len(meter) < 2:
                continue
            start, end = _edge_sounds(sound)
            starters.setdefault(start, [0.0, 0.0])[int(meter[0])] += 1.0
            enders.setdefault(end, [0.0, 0.0])[int(meter[-1])] += 1.0
    return _stressed_ratio(starters), _stressed_ratio(enders)


def _guess_single_syllable_stress(db, starters, enders):
    """Assign a guessed meter to every single syllable word.

    The score is the mean of the start and end probabilities (0.6 when a
    sound is not in a table).  Below 0.5 the word is unstressed ('0'), below
    0.7 it may be either, otherwise it is stressed ('1').
    """
    db['meterwords']['0'] = set()
    db['meterwords']['1'] = set()
    for word in db['syl2words'].get(1, ()):
        # Use the single syllable variant, which need not be listed first.
        sound = next(s for s, m in map(_split_pron, db['cmudict'][word])
                     if len(m) == 1)
        start, end = _edge_sounds(sound)
        p = (starters.get(start, 0.6) + enders.get(end, 0.6)) / 2.0
        if p < 0.5:
            meters = ['0']
        elif p < 0.7:
            meters = ['0', '1']
        else:
            meters = ['1']
        # NOTE(review): this replaces any multi-syllable meters already stored
        # for a word that also has a longer variant -- kept as in the original;
        # confirm against how poemy reads db['meters'].
        db['meters'][word] = meters
        for meter in meters:
            db['meterwords'][meter].add(word)


def _parse_wordnet_line(line):
    """Parse a wordnet data line into (lex, wtype, words), or None to skip.

    Blank lines and lines starting with a space are skipped.  The second
    field is read as a decimal int, the third is the type letter and the
    fourth is the hexadecimal word count; words occupy every other field
    after that, with '_' turned into ' '.
    """
    if not line.strip() or line.startswith(' '):
        return None
    toks = line.split()
    lex = int(toks[1])
    wtype = toks[2]
    count = int(toks[3], 16)
    words = [toks[4 + n * 2].replace('_', ' ') for n in range(count)]
    return lex, wtype, words


def _load_wordnet(db, pattern='data/wordnet/data.*'):
    """Fill the lex2words/word2lex maps and part-of-speech sets of db."""
    for path in glob.glob(pattern):
        with open(path) as handle:
            for line in handle:
                entry = _parse_wordnet_line(line)
                if entry is None:
                    continue
                lex, wtype, words = entry
                bucket = _WORDNET_SETS.get(wtype)
                for word in words:
                    db['lex2words'].setdefault(lex, set()).add(word)
                    db['word2lex'].setdefault(word, set()).add(lex)
                    if bucket is not None:
                        db[bucket].add(word)


def _clean_corpus_text(data):
    """Lowercase corpus text and reduce it to words, one sentence per line."""
    data = data.lower()
    data = re.sub(r'\.', '\n', data)                 # full stop ends a line
    data = re.sub(r"[^-'\n a-z]", r' ', data)        # drop everything else
    data = re.sub(r"([a-z])-+(\s)", r'\1\2', data)   # trailing hyphens
    data = re.sub(r"(\s)-+([a-z])", r'\1\2', data)   # leading hyphens
    return data


def _add_line_to_chain(chain, line):
    """Record, for each pair of adjacent words in line, the word that follows."""
    words = line.split()
    for w1, w2, w3 in zip(words, words[1:], words[2:]):
        chain.setdefault((w1, w2), []).append(w3)


def _load_corpus(db, corpus):
    """Add every corpora/<corpus>/*.txt file to db['words'] and db['chain']."""
    for path in glob.glob('corpora/%s/*.txt' % (corpus)):
        with open(path) as handle:
            data = _clean_corpus_text(handle.read())
        db['words'] |= set(data.split())
        for line in data.splitlines():
            _add_line_to_chain(db['chain'], line)


def _finalize_chain(chain):
    """Deduplicate each follower list and order it longest word first."""
    for key in list(chain):
        chain[key] = sorted(set(chain[key]), key=len, reverse=True)


def main(argv=None):
    """Build db.marshal from cmudict, wordnet and the corpora named in argv.

    argv defaults to sys.argv; argv[1:] are corpus names, each of which must
    exist under corpora/.  Exits with status 1 when no corpus is given or a
    corpus is missing.
    """
    argv = sys.argv if argv is None else argv
    corpora = argv[1:]
    if not corpora:
        print("please specify at least one corpus")
        sys.exit(1)
    for corpus in corpora:
        if not os.path.exists('corpora/' + corpus):
            print("'%s' corpus not found" % (corpus))
            sys.exit(1)

    db = _new_db()
    gc_was_enabled = gc.isenabled()
    gc.disable()
    try:
        print('loading cmudict.txt...')
        _load_cmudict(db)

        print('calculating probability of syllable stress...')
        starters, enders = _stress_tables(db)

        print('guessing which single syllable words are stressed...')
        _guess_single_syllable_stress(db, starters, enders)

        print('loading wordnet...')
        _load_wordnet(db)

        for corpus in corpora:
            print("loading '%s' corpus..." % (corpus))
            _load_corpus(db, corpus)
        _finalize_chain(db['chain'])

        print('marshaling...')
        # marshal writes binary data, so the file must be opened in 'wb'.
        with open('db.marshal', 'wb') as handle:
            marshal.dump(db, handle)
    finally:
        if gc_was_enabled:
            gc.enable()


if __name__ == '__main__':
    main()