-
Notifications
You must be signed in to change notification settings - Fork 0
/
markov.py
342 lines (285 loc) · 9.38 KB
/
markov.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
import sys, os, re, random, operator, datetime
##############################
directory_in = ""
directory_out = ""
clump_size = 2
num_words = 20000
line_width = 80
n = datetime.datetime.now()
run_id = "%04d" % n.year
run_id += "%02d" % n.month
run_id += "%02d" % n.day
run_id += "_" + "%02d" % n.hour
run_id += "%02d" % n.minute
run_id += "%02d" % n.second
##############################
# Usage text printed by print_help().  The Syntax line now lists both
# optional flags; -clump_size was previously missing from it even though
# it is documented below.
help_text = """
This is a tool for generating output text that is
statistically similar to input text using a Markov
chain.
Syntax:
markov.py <input directory> <output directory> (-clump_size <#>) (-num_words <#>)
<input directory>
Path to a directory with at least one input text file. For example:
C:\\data\\text
<output directory>
Path where the output will be stored. For example:
C:\\data\\output
-clump_size <#>
Optional argument to specify how many words should be grouped
together when building the Markov chain. For example:
-clump_size 2
A value of "1" will pair single adjacent words together:
"To" -> "be"
A value of "2" (default) will pair couplets of words together:
"To be" -> "or not"
Etc.
-num_words <#>
Optional argument to specify the number of output clumps
to generate (default 20000). For example:
-num_words 10000
"""
# Banner printed once at startup by parse_args().
lead_text = """
Markov Chain Text Generator
by Justin Libby 2014
"""
#####################################################
def print_help():
    """Print the usage text and terminate the script with exit status 0."""
    global help_text
    print(help_text)
    # sys.exit is always available; the builtin exit() is injected by the
    # site module and may be missing (e.g. when run with python -S).
    sys.exit(0)
#####################################################
def parse_args(args):
    """Parse command-line arguments into the module-level configuration.

    args -- sys.argv-style list: [script, <input dir>, <output dir>,
            optional "-clump_size <#>" and "-num_words <#>" pairs].

    Prints the startup banner and the effective settings.  Exits with an
    error message when a required directory is missing or invalid.
    """
    global lead_text
    global directory_in
    global directory_out
    global num_words
    global clump_size
    if len(args) < 2:
        print_help()
    i = 0
    while i < len(args):
        # first arg is the name of the script
        if i > 0:
            # Exact flag comparison (re.match would also accept any string
            # merely *starting* with the flag name).  Each flag consumes the
            # following value, so advance by 2 -- the old code advanced by 1
            # and re-visited the value on the next iteration.  The bounds
            # check avoids an IndexError on a flag with no value.
            if args[i] == '-num_words' and i + 1 < len(args):
                num_words = int(args[i + 1])
                i += 2
                continue
            if args[i] == '-clump_size' and i + 1 < len(args):
                clump_size = int(args[i + 1])
                i += 2
                continue
            if i == 1:
                directory_in = args[i]
            if i == 2:
                directory_out = args[i]
        i += 1
    if len(directory_in) == 0:
        print("Error: missing input directory")
        exit("Error: missing input directory")
    if len(directory_out) == 0:
        print("Error: missing output directory")
        exit("Error: missing output directory")
    if not os.path.exists(directory_in):
        print("Error: input directory " + directory_in + " does not exist")
        exit("Error: input directory " + directory_in + " does not exist")
    if num_words < 0:
        print("Error: number of generated words can not be negative")
        exit("Error: number of generated words can not be negative")
    print(lead_text)
    print("Input directory = " + directory_in)
    print("Output directory = " + directory_out)
    print("Clump size = " + str(clump_size))
    print("Number of words to generate = " + str(num_words))
################################
def generate_markov_table(text, num_next):
    """Build a Markov transition table from a list of words.

    text     -- list of word strings
    num_next -- clump size: number of consecutive words per state key

    Returns {state: {next_state: count}} where a state is num_next
    consecutive words joined by single spaces and next_state is the
    clump immediately following it in the input.
    """
    ret = {}
    i = 0
    print("Creating Markov table from " + str(len(text)) + " words")
    # NOTE(review): this bound stops num_next*2 + 1 words before the end, so
    # the last few clump pairs are never recorded -- kept as-is to preserve
    # the original table contents.
    while i < len(text) - 1 - (num_next * 2):
        s = ""
        spp = ""
        for j in range(num_next):
            s += text[i + j]
            spp += text[i + j + num_next]
            if j < num_next - 1:
                s += " "
                spp += " "
        # Add this instance of clump (s) followed by clump (spp)
        if s in ret:
            if spp in ret[s]:
                ret[s][spp] += 1
            else:
                ret[s][spp] = 1
        else:
            ret[s] = {spp: 1}
        i += 1
    # print some stats: total occurrences of each state across all successors
    key_freq = []
    for k in ret:
        total = 0
        for sk in ret[k]:
            total += ret[k][sk]
        key_freq.append([k, total])
    # sort ascending by the frequency (second item)
    key_freq.sort(key=operator.itemgetter(1))
    print("Most numerous keys:")
    # Fix: take the last 10 entries.  The old slice [len-11 : len-1] dropped
    # the single most frequent key and reported ranks 2-11 instead of 1-10.
    top_keys = key_freq[-10:]
    top_keys.reverse()
    for s in top_keys:
        print(" \"" + s[0] + "\" appears " + str(s[1]) + " times")
    return ret
################################
def get_weighted_word(tbl, parent_key):
    """Pick one successor clump from tbl at random, weighted by its count.

    tbl        -- {word_clump: count} mapping of possible successors
    parent_key -- the state these successors belong to (used only in the
                  failure message)

    Returns the chosen clump, or "" when tbl is empty.
    """
    total = 0
    for k in tbl:
        total += tbl[k]
    # total is the total number of instances we have to choose from
    if total == 0:
        # Empty table: randint(0, -1) below would raise, so bail out here.
        print("Failed to get weighted output word for parent key: " + parent_key)
        return ""
    if total == 1:
        # only one entry, so pick it.  next(iter(...)) works on both
        # Python 2 key lists and Python 3 dict views, unlike tbl.keys()[0].
        return next(iter(tbl))
    rnd_idx = random.randint(0, total - 1)
    cur_idx = 0
    # Walk the counts until we land on the randomly chosen instance.
    for k in tbl:
        for i in range(tbl[k]):
            if rnd_idx == cur_idx:
                return k
            cur_idx += 1
    # Unreachable: rnd_idx always falls inside [0, total).
    print("Failed to get weighted output word for parent key: " + parent_key)
    return ""
################################
def generate_markov_text(tbl, num_out):
    """Generate text by walking the Markov table from a random start state.

    tbl     -- {clump: {next_clump: count}} from generate_markov_table
    num_out -- number of clumps to append after the starting clump

    Returns one string with newlines inserted roughly every line_width
    characters.  When the walk hits a clump with no successors, a fresh
    random clump is chosen (counted and reported at the end).
    """
    global line_width
    # list() so the keys are indexable on Python 3 as well as Python 2
    keys = list(tbl.keys())
    num_keys = len(keys)
    print("Generating text from " + str(num_keys) + " source keys")
    rnd_idx = random.randint(0, num_keys - 1)
    print("Starting at random index " + str(rnd_idx))
    rnd_key = keys[rnd_idx]
    print("First word clump: " + str(rnd_key))
    ret = rnd_key
    word = rnd_key
    num_random = 0   # times we had to fall back to a random clump
    line_count = 0   # characters emitted since the last newline
    for i in range(num_out):
        if word in tbl:
            new_word = get_weighted_word(tbl[word], word)
        else:
            # Dead end: current clump never appeared as a state key.
            new_word = keys[random.randint(0, num_keys - 1)]
            num_random += 1
        if len(new_word) > 0:
            word = new_word
            ret += " " + new_word
            line_count += len(new_word) + 1
        else:
            word = keys[random.randint(0, num_keys - 1)]
            num_random += 1
        # sprinkle in some new line characters
        if line_count > line_width:
            ret += "\n"
            line_count = 0
    print("Had to choose " + str(num_random) + " random word clumps out of " + str(num_out) + " output word clumps")
    return ret
################################
def get_file_list():
    """Return full paths of all .txt files directly inside directory_in."""
    global directory_in
    ret = []
    print("Found the following input files:")
    for one_file in os.listdir(directory_in):
        if one_file.endswith(".txt"):
            print(" " + one_file)
            # os.path.join is portable; the old "\\" joined paths that only
            # worked on Windows
            ret.append(os.path.join(directory_in, one_file))
    return ret
################################
def read_input():
    """Read every input file and return its contents as a list of words.

    Concatenates all files from get_file_list(), replaces control and
    non-ASCII characters plus selected punctuation with spaces, then
    splits on spaces and returns the non-empty words in order.
    """
    files_in = get_file_list()
    text_in_unclean = ""
    for f in files_in:
        # with-statement guarantees the file is closed even on a read error
        with open(f, 'r') as fi:
            text_in_unclean += fi.read() + " "
    print("Characters of input text : " + str(len(text_in_unclean)))
    # remove irrelevant characters: anything outside printable ASCII
    # (32..126) becomes a space -- this also covers \n and \t, so the
    # old explicit substitutions for those were redundant
    text_in = ""
    for c in text_in_unclean:
        if 32 <= ord(c) <= 126:
            text_in += c
        else:
            text_in += " "
    # strip punctuation that tends to glue words together
    # text_in = re.sub('\.', ' ', text_in)
    # text_in = re.sub(',', ' ', text_in)
    # text_in = re.sub('\?', ' ', text_in)
    # text_in = re.sub('!', ' ', text_in)
    text_in = re.sub('"', ' ', text_in)
    text_in = re.sub(';', ' ', text_in)
    text_in = re.sub('=', ' ', text_in)
    text_in = re.sub('~', ' ', text_in)
    text_in = re.sub('\\\\', ' ', text_in)
    text_in = re.sub('/', ' ', text_in)
    print("Characters of cleaned text : " + str(len(text_in)))
    # Split out words based on whitespace
    words = re.split(' ', text_in)
    fixed_words = []
    for word in words:
        w = word.strip()
        if not w:
            continue
        fixed_words.append(w)
    # Print out some samples of the input (skipped for empty input, which
    # would otherwise crash randint below)
    if fixed_words:
        print("Example input words: ")
        for i in range(10):
            # Fix: randint's upper bound is inclusive, so the old
            # randint(0, len(fixed_words)) could return an out-of-range
            # index and raise IndexError
            print(" " + fixed_words[random.randint(0, len(fixed_words) - 1)])
    return fixed_words
################################
def write_output(text_out):
    """Write the generated text to <directory_out>/<run_id>_output.txt."""
    global directory_out
    global run_id
    # os.path.join is portable; the old "\\" join only worked on Windows
    file_out = os.path.join(directory_out, run_id + "_output.txt")
    print("Writing output to file: " + file_out)
    # put periods before capital letters
    # XXX - make this a post-processing step?
    # XXX - todo: track punctuation placement from input words
    # text_out = re.sub(r"([A-Z][a-z])", r". \1", text_out)
    # fix periods floating away from words
    # text_out = re.sub(r"( \.)", r".", text_out)
    # with-statement guarantees the file is flushed and closed
    with open(file_out, 'w') as fo:
        fo.write(text_out)
################################
def write_log(markov_table):
    """Dump the full Markov table as TSV to <directory_out>/<run_id>_log.txt.

    markov_table -- {state: {next_state: count}} as returned by
                    generate_markov_table

    Columns written: FirstWord, SecondWord, Frequency.
    """
    global directory_out
    global run_id
    # os.path.join is portable; the old "\\" join only worked on Windows
    log_out = os.path.join(directory_out, run_id + "_log.txt")
    print("Writing log to file: " + log_out)
    # write out statistics about the markov table; with-statement
    # guarantees the file is closed even if a write fails
    with open(log_out, 'w') as fo:
        fo.write("FirstWord\tSecondWord\tFrequency\n")
        for k in markov_table:
            for s in markov_table[k]:
                fo.write(k + "\t" + s + "\t" + str(markov_table[k][s]) + "\n")
##################################
# Script entry point: parse CLI arguments, read and clean the input corpus,
# build the Markov table, then emit the generated text and a TSV log.
parse_args(sys.argv)
input_words = read_input()
markov_table = generate_markov_table(input_words, clump_size)
generated_text = generate_markov_text(markov_table, num_words)
write_output(generated_text)
write_log(markov_table)