'''
SPAM NOVEL
Jeff Thompson | 2014 | www.jeffreythompson.org
'''
import Markov, random, re
# input files
comments_file = 'Comments.txt'
authors_file = 'Authors.txt'

# target length of the generated novel, in words
num_words_in_novel = 50000

# random chances (compared against random.random(), so 0.0-1.0) of
# inserting a chapter, paragraph, period or question mark
chance_chapter = 0.02
chance_paragraph = 0.01
chance_period = 0.1
chance_question = 0.001

# option: wrap what looks like dialog in quotes during cleanup
add_dialog_quotes = True

# state that changes while the text is generated
capitalize = True       # title-case the next generated word
in_dialog = False       # NOTE(review): not referenced by the code shown here
chapter = 1             # current chapter number (also reported at the end)

# words for formatting sentences (regex alternation groups)
pronouns = '(he|she|it)'
pronouns_upper = '(He|She|It)'
said = '(said|whispered|yelled|commanded|urged|plead|muttered)'
asked = '(asked|queried|inquired|demanded|begged)'

# not really all articles, but basically words we don't want to end a sentence with
articles = ['a', 'an', 'the', 'and', 'or', 'if', 'of', 'by', 'as']

# words to capitalize - list in lowercase! (could include names, places, etc)
words_to_capitalize = ['i']

# list of punctuation marks to look for
punctuation = ['.', ',', '?', '!']
print('SPAM NOVEL')

# load markov chain: the open comments file is handed straight to Markov.Markov
# (the progress message is printed once the file has been opened)
with open(comments_file) as comments_source:
    print('- parsing file (may take a while)...')
    markov = Markov.Markov(comments_source)
# generate text
print('- generating output...')
pieces = []         # fragments of the book, joined once after the loop
word_count = 0
while word_count < num_words_in_novel:
    # generate a long run of words; the loop below stops as soon as the
    # target word count is reached, so the rest of the run is discarded
    markov_output = markov.generate_markov_text(size=1000)
    for word in markov_output.split(' '):

        # title-case the first word of a sentence, plus any word listed
        # (in lowercase) in words_to_capitalize
        if capitalize or word.lower() in words_to_capitalize:
            word = word.title()
        capitalize = False

        # never end a sentence or a paragraph on an article-like word
        can_break = word.lower() not in articles

        # randomly break sentence; the punctuated word must still fall
        # through to be emitted and counted below - skipping ahead here
        # would throw the word away together with its punctuation
        sentence_ended = False
        if random.random() < chance_period and can_break:
            word += '.'
            sentence_ended = True
        elif random.random() < chance_question and can_break:
            word += '?'
            sentence_ended = True

        # every emitted word counts towards the target length
        pieces.append(word)
        word_count += 1
        capitalize = sentence_ended

        # random new paragraph and chapter
        if random.random() < chance_paragraph and can_break:
            if not sentence_ended:
                pieces.append('.')      # be sure to add a period first
            pieces.append('\n\n')
            if random.random() < chance_chapter:    # random chapter
                chapter += 1
                pieces.append('\nCHAPTER ' + str(chapter))
                pieces.append('\n\n')
            capitalize = True
        else:
            pieces.append(' ')

        # stop exactly at the requested length
        if word_count >= num_words_in_novel:
            break

book = ''.join(pieces)
# run general text cleanup
print('- cleaning up the text...')

# add a period at the end, unless the last visible character already is
# punctuation (trailing spaces/newlines are ignored for this check; any
# whitespace left in front of the new period is removed by the rules below)
if not book.rstrip().endswith(tuple(punctuation)):
    book += '.'

# clean up any weirdness (easier than fixing in the generation code... a hack)
# the rules are applied strictly in this order - later ones rely on earlier ones
cleanup_rules = [
    (r',+\.+', '.'),                # , followed by .
    (r'\?+\.+', '?'),               # ? followed by .
    (r'\s+\.+', '.'),               # space before .
    (r'\s+,+', ','),                # ditto ,
    (r'\s+;+', ';'),                # ditto ;
    (r'\s+:+', ':'),                # ditto :
    (r',{2,}', ', '),               # more than 1 ,
    (r'\.{2,}', '.'),               # ditto .
    (r'[^\S\r\n]{2,}', ' '),        # 2 or more spaces (ignore \n and \r)
]
for pattern, replacement in cleanup_rules:
    book = re.sub(pattern, replacement, book)

# wow, super ugly: remove extra space at the start of paragraphs and capitalize as needed
# (everything between a newline and the first letter on that line is dropped)
book = re.sub(r'\n.*?(\b[a-zA-Z])', lambda pat: '\n' + pat.group(1).upper(), book)

# also ugly: make sure all sentences are capitalized (may be wrong after some of the regex above...)
book = re.sub(r'(\.|\?) ([a-z])', lambda pat: pat.group(1) + ' ' + pat.group(2).upper(), book)

# fix oddly capitalized letters after apostrophes (catches instances of things like he'Ll too)
book = re.sub(r'\'([A-Z].*?)\b', lambda pat: '\'' + pat.group(1).lower(), book)

# fix any missing end-of-paragraph periods
book = re.sub(r'(\b[^\.]\n+)', r'.\1', book)

# add quotes around what seems like dialog: a sentence ending in
# "<pronoun> <verb>." or starting with "<Pronoun> <verb> ..." is moved to
# its own paragraph with the spoken part wrapped in double quotes
if add_dialog_quotes:
    book = re.sub(r'\.\W([^\.]*?) ' + pronouns + ' ' + said + r'\.', r'.\n\n"\1," \2 \3.\n\n', book)
    book = re.sub(r'\.\W' + pronouns_upper + ' ' + said + r' ([^\.]*?)\.', r'.\n\n\1 \2, "\3."\n\n', book)
    book = re.sub(r'\.\W([^\.]*?) ' + pronouns + ' ' + asked + r'\.', r'.\n\n"\1?" \2 \3.\n\n', book)
    book = re.sub(r'\.\W' + pronouns_upper + ' ' + asked + r' ([^\.]*?)\.', r'.\n\n\1 \2, "\3?"\n\n', book)
    # capitalize the first letter right after an opening quote
    book = re.sub(r'"(\b[a-z])', lambda pat: '"' + pat.group(1).upper(), book)
# end of book! add contributors
book += '\n\nEND\n\n\n\nCONTRIBUTORS\nThis book was made using spam comments by the following authors:\n'
with open(authors_file) as f:
    # one author per line; the set removes duplicate names
    authors_set = set(line.strip() for line in f)
# blank lines in the file are not authors: keep them out of the list and the count
authors_set.discard('')
authors = sorted(authors_set)
num_authors = len(authors)
book += '\n'.join(authors)
# done! write to file
print('- writing to file...')

# title page, then the heading of the first chapter (later chapter headings
# are already part of the book text), then the book itself
title_page = ''.join([
    'SPAM NOVEL', '\n',
    'Coded by Jeff Thompson and written by ', str(num_authors), ' spam authors', '\n',
    '2014',
])
with open('output.txt', 'w') as out:
    for section in (title_page, '\n\n\nCHAPTER 1\n\n', book):
        out.write(section)

print('- chapters created: ' + str(chapter))
print('ALL DONE!')