In [1]:
import numpy as np
import pickle
import matplotlib.pyplot as plt
import copy

import random


In [2]:
# Read the whole book and collapse every run of whitespace (spaces, tabs,
# line breaks) into a single space. The context manager guarantees the file
# handle is closed even if reading raises.
with open('alice_in_wonderland.txt','r') as f:
    corpus = ' '.join(f.read().split())

def clean_word(word):
    """Lower-case ``word`` and keep only the text before its first punctuation mark.

    The punctuation characters are applied one after another; for each one,
    everything from its first occurrence onwards is discarded. A word that
    starts with one of these characters therefore becomes the empty string.
    """
    cleaned = word.lower()
    for mark in ('"', "'", '.', ',', '-', '?', '!', ';', ':', '—', '(', ')', '[', ']'):
        # partition() yields the text before the first ``mark`` (or the whole
        # string when ``mark`` is absent).
        cleaned, _, _ = cleaned.partition(mark)
    return cleaned



# Normalise every whitespace-separated token and drop the ones that end up
# empty after cleaning. From here on `corpus` is a list of words.
corpus = [cleaned for cleaned in map(clean_word, corpus.split()) if len(cleaned) > 0]
print(corpus[:25])
D = len(corpus)
print('corpus len: ',D)

['alice', 'adventures', 'in', 'wonderland', 'by', 'lewis', 'carroll', 'the', 'millennium', 'fulcrum', 'edition', '3', 'contents', 'chapter', 'i', 'down', 'the', 'rabbit', 'chapter', 'ii', 'the', 'pool', 'of', 'tears', 'chapter']
corpus len:  25320


In [3]:
# Give each distinct word an integer id in order of first appearance.
# `tokenize` maps word -> id and `wordlist` maps id -> word.
tokenize = {}
wordlist = []
for word in corpus:
    if word in tokenize:
        continue
    tokenize[word] = len(wordlist)
    wordlist.append(word)
token = len(wordlist)  # next unused id

V = len(wordlist)
print('word list size (number of distinct words): ', V)


word list size (number of distinct words):  2501


In [15]:
# bin how many times a word follows another word
# counts_2gram[a, b] = number of times word a appears directly after word b
counts_2gram = np.zeros((V,V))
for previous_word, current_word in zip(corpus, corpus[1:]):
    counts_2gram[tokenize[current_word], tokenize[previous_word]] += 1
print(counts_2gram)

[[0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [9. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [5]:
# NOTE(review): this cell scales each row of counts_2gram by 1/prior rather
# than by the row total (row_totals is computed but not used here). The next
# cell recomputes posterior_1word with row_totals -- confirm whether this
# cell is still needed.
row_totals = counts_2gram.sum(axis=1)

# Relative frequency of every word in the corpus.
count = np.zeros(V)
for word in corpus:
    count[tokenize[word]] += 1
prior = count / len(corpus)

# Divide row i of the pair counts by prior[i] (broadcast over columns).
posterior_1word = counts_2gram / prior[:, np.newaxis]

x = tokenize['alice']
print(posterior_1word[x])
likelihood = posterior_1word[x] * prior[x]

[0. 0. 0. ... 0. 0. 0.]


In [6]:
#past word as feature

# Number of times each word appears as the second element of an adjacent pair.
row_totals = counts_2gram.sum(axis=1)

# prior[i]: relative frequency of word i in the corpus.
count = np.zeros(V)
for word in corpus:
    count[tokenize[word]] += 1
prior = count / len(corpus)

# posterior_1word[i, j]: fraction of the occurrences of word i (as a
# successor) that were directly preceded by word j.
posterior_1word = counts_2gram / row_totals[:, np.newaxis]

def get_likelihood_2gram(word):
    """Score every vocabulary word as a candidate successor of ``word``.

    Returns the column of ``posterior_1word`` selected by ``tokenize[word]``,
    multiplied element-wise by ``prior``. Raises ``KeyError`` when ``word`` is
    not in ``tokenize``.
    """
    previous_column = posterior_1word[:, tokenize[word]]
    return previous_column * prior
def pred_2gram(word):
    """Return ``(best_word, score)`` for the highest-scoring successor of ``word``."""
    scores = get_likelihood_2gram(word)
    best = np.argmax(scores)
    return wordlist[best], scores[best]
# Spot-check the one-word-context predictor on a few familiar words.
for previous_word in ('alice', 'the', 'cheshire', 'mock'):
    print(pred_2gram(previous_word))
    

('was', 0.0007109004739336493)
('queen', 0.0027646129541864135)
('cat', 0.00019747235387045816)
('turtle', 0.0022511848341232226)


In [7]:
def get_counts_kgram(k, tokens=None, vocab_size=None):
    """Count how often each word occurs exactly ``k`` positions after another.

    Parameters
    ----------
    k : int
        Distance between the two positions; must be at least 1
        (``k=1`` counts directly adjacent pairs).
    tokens : sequence of int, optional
        Token ids to count over. Defaults to the notebook-level ``corpus``
        mapped through ``tokenize``.
    vocab_size : int, optional
        Number of distinct token ids. Defaults to the notebook-level ``V``.

    Returns
    -------
    numpy.ndarray
        ``(vocab_size, vocab_size)`` float array whose entry ``[a, b]`` is the
        number of positions ``i >= k`` with ``tokens[i] == a`` and
        ``tokens[i - k] == b``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError('k must be at least 1')
    if tokens is None:
        tokens = [tokenize[word] for word in corpus]
    if vocab_size is None:
        vocab_size = V

    counts_kgram = np.zeros((vocab_size, vocab_size))
    # Start at i = k: for smaller i the index i - k is negative and would
    # silently wrap around to the end of the sequence, pairing the first
    # words with the last ones.
    for i in range(k, len(tokens)):
        counts_kgram[tokens[i], tokens[i - k]] += 1
    return counts_kgram

In [8]:
# Co-occurrence counts for words that are two positions apart.
counts_3gram = get_counts_kgram(k=2)
# The normalising constants are taken from the adjacent-pair counts.
row_totals = np.sum(counts_2gram, axis=1)

In [9]:
#past 2 words as features

# Scale row i of the two-apart counts by 1 / row_totals[i] (broadcast over columns).
posterior_2words = counts_3gram / row_totals[:, np.newaxis]

# Both posterior tables stacked vertically: one-back rows first, two-back rows after.
posterior_2gram = np.vstack((posterior_1word, posterior_2words))

def get_likelihood_3gram(word2ago,word1ago):
    """Score every vocabulary word given the two preceding words.

    The score is ``prior`` times the ``posterior_1word`` column for
    ``word1ago`` times the ``posterior_2words`` column for ``word2ago``.
    Raises ``KeyError`` when either word is not in ``tokenize``.
    """
    one_back_column = posterior_1word[:, tokenize[word1ago]]
    two_back_column = posterior_2words[:, tokenize[word2ago]]
    return (one_back_column * prior) * two_back_column
def pred_3gram(word2ago,word1ago):
    """Return ``(best_word, score)`` for the top-scoring word after the two given words."""
    scores = get_likelihood_3gram(word2ago, word1ago)
    best = np.argmax(scores)
    return wordlist[best], scores[best]
# Spot-check the two-word-context predictor.
for context in (('pack','of'), ('the','mad'), ('she','jumped'), ('four','thousand')):
    print(pred_3gram(*context))

('cards', 0.00011848341232227489)
('you', 5.7060000268517644e-06)
('up', 2.1392838335966295e-05)
('miles', 1.3164823591363875e-05)


In [10]:
def get_likelihood_kgram(word_lst):
    """Score every vocabulary word as the continuation of ``word_lst``.

    Starts from the unigram prior and, for each context word, multiplies in
    the fraction of each candidate's occurrences that had that context word
    at the matching distance (last element of ``word_lst`` = 1 position back,
    second to last = 2 positions back, and so on).

    Parameters
    ----------
    word_lst : sequence of str
        Context words in reading order. Every word must be in ``tokenize``.
        An empty sequence returns the prior itself.

    Returns
    -------
    numpy.ndarray
        Length-``V`` array of unnormalised scores, indexed by token id.

    Raises
    ------
    KeyError
        If a context word is not in ``tokenize``.
    """
    # Unigram prior: relative frequency of every word in the corpus.
    token_ids = [tokenize[word] for word in corpus]
    likelihood = np.bincount(token_ids, minlength=V) / len(corpus)

    # Walk the context from the most recent word backwards.
    for lag, word in enumerate(list(word_lst)[::-1], start=1):
        index = tokenize[word]
        counts_lag = get_counts_kgram(lag)
        # Normalise with this lag's own row sums instead of a notebook-global
        # array computed from a different count table.
        row_sums = counts_lag.sum(axis=1)
        # Only the column for the context word is needed. Rows with no counts
        # contribute 0 rather than a 0/0 NaN.
        column = np.divide(
            counts_lag[:, index],
            row_sums,
            out=np.zeros(V),
            where=row_sums > 0,
        )
        likelihood *= column
    return likelihood
def pred_kgram(word_lst):
    """Return ``(best_word, score)`` for the top-scoring continuation of ``word_lst``."""
    scores = get_likelihood_kgram(word_lst)
    best = np.argmax(scores)
    return wordlist[best], scores[best]

# Spot-check the variable-length-context predictor.
for context in (
    ['before', 'she', 'found', 'herself', 'falling', 'down', 'a', 'very', 'deep'],
    ['falling', 'down', 'a', 'very', 'deep'],
    ['what','an', 'ignorant', 'little'],
    ['four', 'thousand'],
):
    print(pred_kgram(context))

('well', 2.799924290061186e-17)
('well', 4.702797636375411e-11)
('girl', 1.8513033175355449e-06)
('miles', 1.3164823591363875e-05)


In [11]:
# 4(c)
# Greedy generation: slide a three-word window over the text, always
# appending the top-scoring next word and emitting the oldest word.
text = ['the','mad','hatter']
emitted = []
for step in range(25):
    next_word = pred_kgram(text)[0]
    text.append(next_word)
    emitted.append(text.pop(0))
result = ''.join(word + ' ' for word in emitted)
print(result)

the mad hatter with this as she could guess she was now about two feet high even then they walked off together alice heard a 


In [12]:
# 4(d)
# Sampled generation: same sliding window as 4(c), but the next word is drawn
# in proportion to its score instead of always taking the top one.
# A dedicated seeded generator makes the sampled sentence reproducible on
# Restart & Run All without touching the global `random` state.
SAMPLING_SEED = 0
rng = random.Random(SAMPLING_SEED)

text = ['the','mad','hatter']
result_2 = ''
for i in range(0,25):
    weights = get_likelihood_kgram(text)
    text.append(rng.choices(wordlist, weights=weights, k=1)[0])
    result_2 += text.pop(0) + ' '
print(result_2)

the mad hatter all this way was a time she said waste it in asking riddles that have no answers you knew time it was 
