Skip to content

Commit 6059c2d

Browse files
committed
update
1 parent dafca78 commit 6059c2d

File tree

1 file changed

+293
-0
lines changed

1 file changed

+293
-0
lines changed

nlp_class/cipher_evolve.py

Lines changed: 293 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,293 @@
1+
# https://deeplearningcourses.com/c/data-science-natural-language-processing-in-python
2+
# https://www.udemy.com/data-science-natural-language-processing-in-python
3+
4+
# Author: http://lazyprogrammer.me
5+
6+
import numpy as np
7+
import matplotlib.pyplot as plt
8+
9+
import string
10+
import random
11+
import re
12+
import requests
13+
import os
14+
15+
16+
### create substitution cipher
17+
18+
# one will act as the key, other as the value
19+
# Plain alphabet (keys) and a second copy that will become the cipher values.
letters1 = list(string.ascii_lowercase)
letters2 = list(string.ascii_lowercase)

# Scramble the second alphabet so each plain letter maps to a random one.
random.shuffle(letters2)

# The secret substitution table: plain letter -> cipher letter.
true_mapping = dict(zip(letters1, letters2))
30+
31+
32+
33+
### the language model
34+
35+
# initialize Markov matrix
36+
# Number of letters in the English alphabet.
N_LETTERS = 26

# Bigram (letter -> letter) transition counts. Starting at 1 gives
# add-one smoothing, so np.log never sees a zero entry later.
M = np.ones((N_LETTERS, N_LETTERS))

# Counts of which letter begins a token (initial state distribution);
# normalized into probabilities after training.
pi = np.zeros(N_LETTERS)
40+
41+
# a function to update the Markov matrix
42+
def update_transition(ch1, ch2):
    """Record one observed bigram ch1 -> ch2 in the count matrix M."""
    # Letters map to indices 0-25 by their offset from 'a'.
    row = ord(ch1) - ord('a')
    col = ord(ch2) - ord('a')
    M[row, col] += 1
47+
48+
# a function to update the initial state distribution
49+
def update_pi(ch):
    """Count ch as the first letter of a token in the initial distribution pi."""
    pi[ord(ch) - ord('a')] += 1
52+
53+
# get the log-probability of a word / token
54+
def get_word_prob(word):
    """Return the log-probability of a single token under the bigram model."""
    # Probability of the first letter comes from the initial distribution.
    prev = ord(word[0]) - ord('a')
    logp = np.log(pi[prev])

    # Each subsequent letter contributes one transition term (adding
    # logs = multiplying probabilities).
    for ch in word[1:]:
        cur = ord(ch) - ord('a')
        logp += np.log(M[prev, cur])
        prev = cur  # advance the Markov state

    return logp
65+
66+
# get the probability of a sequence of words
67+
def get_sequence_prob(words):
    """Return the total log-probability of a sequence of tokens.

    words may be a whitespace-separated string or an iterable of tokens.
    Returns 0 for an empty sequence.
    """
    # Accept a raw string for convenience; split it into tokens.
    # isinstance (not type ==) also accepts str subclasses.
    if isinstance(words, str):
        words = words.split()

    # Tokens are treated as independent, so their log-probs add.
    return sum(get_word_prob(word) for word in words)
76+
77+
78+
### create a markov model based on an English dataset
79+
# is an edit of https://www.gutenberg.org/ebooks/2701
80+
# (I removed the front and back matter)
81+
82+
# download the file
83+
# download the training text if we don't already have it
if not os.path.exists('moby_dick.txt'):
    print("Downloading moby dick...")
    r = requests.get('https://lazyprogrammer.me/course_files/moby_dick.txt')
    with open('moby_dick.txt', 'w') as f:
        f.write(r.content.decode())

# for replacing non-alpha characters
regex = re.compile('[^a-zA-Z]')

# Train the model: accumulate first-letter and bigram counts from the text.
# A with-block (rather than a bare open() in the for statement) guarantees
# the file handle is closed deterministically.
with open('moby_dick.txt') as f:
    for line in f:
        line = line.rstrip()

        # there are blank lines in the file
        if line:
            line = regex.sub(' ', line)  # replace all non-alpha characters with space

            # split the tokens in the line and lowercase
            tokens = line.lower().split()

            for token in tokens:
                # first letter of the token updates the initial distribution
                ch0 = token[0]
                update_pi(ch0)

                # each adjacent letter pair updates the transition counts
                for ch1 in token[1:]:
                    update_transition(ch0, ch1)
                    ch0 = ch1

# normalize the counts into probabilities (rows of M sum to 1)
pi /= pi.sum()
M /= M.sum(axis=1, keepdims=True)
118+
119+
120+
### encode a message
121+
122+
# this is a random excerpt from Project Gutenberg's
123+
# The Adventures of Sherlock Holmes, by Arthur Conan Doyle
124+
# https://www.gutenberg.org/ebooks/1661
125+
126+
original_message = '''I then lounged down the street and found,
127+
as I expected, that there was a mews in a lane which runs down
128+
by one wall of the garden. I lent the ostlers a hand in rubbing
129+
down their horses, and received in exchange twopence, a glass of
130+
half-and-half, two fills of shag tobacco, and as much information
131+
as I could desire about Miss Adler, to say nothing of half a dozen
132+
other people in the neighbourhood in whom I was not in the least
133+
interested, but whose biographies I was compelled to listen to.
134+
'''
135+
136+
# Away they went, and I was just wondering whether I should not do well
137+
# to follow them when up the lane came a neat little landau, the coachman
138+
# with his coat only half-buttoned, and his tie under his ear, while all
139+
# the tags of his harness were sticking out of the buckles. It hadn't
140+
# pulled up before she shot out of the hall door and into it. I only
141+
# caught a glimpse of her at the moment, but she was a lovely woman, with
142+
# a face that a man might die for.
143+
144+
# My cabby drove fast. I don't think I ever drove faster, but the others
145+
# were there before us. The cab and the landau with their steaming horses
146+
# were in front of the door when I arrived. I paid the man and hurried
147+
# into the church. There was not a soul there save the two whom I had
148+
# followed and a surpliced clergyman, who seemed to be expostulating with
149+
# them. They were all three standing in a knot in front of the altar. I
150+
# lounged up the side aisle like any other idler who has dropped into a
151+
# church. Suddenly, to my surprise, the three at the altar faced round to
152+
# me, and Godfrey Norton came running as hard as he could towards me.
153+
154+
155+
156+
# a function to encode a message
157+
def encode_message(msg):
    """Encipher msg with the true substitution mapping.

    The message is lowercased and every non-letter is squashed to a
    space first; characters not in the mapping (i.e. spaces) pass
    through unchanged.
    """
    # Normalize: lowercase, then replace each non-alpha char with a space.
    msg = regex.sub(' ', msg.lower())

    # Substitute letter-by-letter; .get(ch, ch) leaves spaces untouched.
    return ''.join(true_mapping.get(ch, ch) for ch in msg)
173+
174+
175+
# encipher the excerpt with the (secret) true mapping
encoded_message = encode_message(original_message)
176+
177+
178+
# a function to decode a message
179+
def decode_message(msg, word_map):
    """Apply word_map (cipher letter -> plain letter) to each char of msg.

    Characters absent from word_map (e.g. spaces) are kept unchanged.
    """
    return ''.join(word_map.get(ch, ch) for ch in msg)
188+
189+
190+
191+
### run an evolutionary algorithm to decode the message
192+
193+
# this is our initialization point
194+
# Seed the search with 20 random candidate keys, each a random
# permutation of the alphabet.
def _random_key():
    # Helper: one shuffled copy of the alphabet as a list of chars.
    key = list(string.ascii_lowercase)
    random.shuffle(key)
    return key

dna_pool = [_random_key() for _ in range(20)]
199+
200+
201+
def evolve_offspring(dna_pool, n_children):
    """Produce n_children mutated copies of each parent key.

    A mutation swaps two (possibly identical) random positions in the
    key. Returns the children followed by the untouched parents, so
    every parent survives into the next generation.
    """
    offspring = []
    for parent in dna_pool:
        for _ in range(n_children):
            child = parent.copy()
            # Pick two positions at random and swap them.
            a = np.random.randint(len(child))
            b = np.random.randint(len(child))
            child[a], child[b] = child[b], child[a]
            offspring.append(child)

    return offspring + dna_pool
218+
219+
220+
221+
num_iters = 1000

# average generation score per iteration, for the plot at the end
scores = np.zeros(num_iters)

# best candidate seen across all generations
best_dna = None
best_map = None
best_score = float('-inf')

for i in range(num_iters):
    if i > 0:
        # get offspring from the current dna pool
        dna_pool = evolve_offspring(dna_pool, 3)

    # calculate score for each dna
    dna2score = {}
    for dna in dna_pool:
        # candidate mapping: cipher letter (via dna) -> plain letter
        current_map = dict(zip(letters1, dna))

        decoded_message = decode_message(encoded_message, current_map)
        score = get_sequence_prob(decoded_message)

        # store it (needs to be a string to be a dict key)
        dna2score[''.join(dna)] = score

        # record the best so far
        if score > best_score:
            best_dna = dna
            best_map = current_map
            best_score = score

    # average score for this generation
    scores[i] = np.mean(list(dna2score.values()))

    # keep the best 5 dna as survivors,
    # turned back into lists of single chars
    sorted_dna = sorted(dna2score.items(), key=lambda x: x[1], reverse=True)
    dna_pool = [list(k) for k, v in sorted_dna[:5]]

    if i % 200 == 0:
        print("iter:", i, "score:", scores[i], "best so far:", best_score)


# use best score
decoded_message = decode_message(encoded_message, best_map)

print("LL of decoded message:", get_sequence_prob(decoded_message))
print("LL of true message:", get_sequence_prob(regex.sub(' ', original_message.lower())))


# which letters are wrong?
for true, v in true_mapping.items():
    pred = best_map[v]
    if true != pred:
        print("true: %s, pred: %s" % (true, pred))


# print the final decoded message
print("Decoded message:\n", decoded_message)

print("\nTrue message:\n", original_message)

plt.plot(scores)
plt.show()

0 commit comments

Comments
 (0)