In [1]:
import os
import numpy as np
from IPython.display import HTML
import string

from HMM import unsupervised_HMM
from HMM_helper import (
    text_to_wordcloud,
    states_to_wordclouds,
    parse_observations,
    sample_sentence,
    sample_sentence_backwards,
    visualize_sparsities,
    animate_emission
)

In [2]:
text = open(os.path.join(os.getcwd(), 'data/shakespeare.txt')).read()

Pre-processing V2

In [4]:
# remove numbers denoting each sonnet
# remove capitalization
# remove apostrophes that are suffixes to a word, not part of the word
import nltk
import re
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import word_tokenize 
from nltk.tokenize import TweetTokenizer

### RUN THIS FOR THE FIRST TIME YOU'RE USING THIS NOTEBOOK
# nltk.download('wordnet') 
# nltk.download('punkt')


def text_lowercase(text): 
    return text.lower() 

def remove_numbers(text): 
    result = re.sub(r'\d+', '', text) 
    return result 


def remove_punctuation(text): 
    punctuation_set = ':' + ';' + ',' + '.' + '!' + '?' + '(' + ')'
    translator = str.maketrans('', '', punctuation_set)
    return text.translate(translator).replace("' ", " ")

lemmatizer = WordNetLemmatizer() 
tknzr = TweetTokenizer()
def lemmatize_word(text): 
    word_tokens = tknzr.tokenize(text) 
    # provide context i.e. part-of-speech 
    lemmas = [lemmatizer.lemmatize(word, pos ='v') for word in word_tokens] 
    return lemmas 
print(remove_punctuation(remove_numbers(text)))

def parse_observations(text):
    # Convert text to dataset.
    lines = [line.split() for line in text.split('\n') if line.split()]

    obs_counter = 0
    obs = []
    obs_map = {}

    for line in lines:
        obs_elem = []
        
        for word in line:
#             word = re.sub(r'[^\w]', '', word).lower()
            if word not in obs_map:
                # Add unique words to the observations map.
                obs_map[word] = obs_counter
                obs_counter += 1
            
            # Add the encoded word.
            obs_elem.append(obs_map[word])
        
        # Add the encoded sequence.
        obs.append(obs_elem)

    return obs, obs_map

obs, obs_map = parse_observations(text_lowercase(remove_punctuation(remove_numbers(text))))
print(obs_map)


                   
From fairest creatures we desire increase
That thereby beauty's rose might never die
But as the riper should by time decease
His tender heir might bear his memory
But thou contracted to thine own bright eyes
Feed'st thy light's flame with self-substantial fuel
Making a famine where abundance lies
Thy self thy foe to thy sweet self too cruel
Thou that art now the world's fresh ornament
And only herald to the gaudy spring
Within thine own bud buriest thy content
And tender churl mak'st waste in niggarding
  Pity the world or else this glutton be
  To eat the world's due by the grave and thee


                   
When forty winters shall besiege thy brow
And dig deep trenches in thy beauty's field
Thy youth's proud livery so gazed on now
Will be a tattered weed of small worth held
Then being asked where all thy beauty lies
Where all the treasure of thy lusty days
To say within thine own deep sunken eyes
Were an all-eating shame and thriftless praise
How much more prai

Training

In [5]:
hmm10 = unsupervised_HMM(obs, 10, 100)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


Syllable and rhyme dictionaries

In [6]:
# syllable dictionary

# format the syllable dictionary
syll_dict = open(os.path.join(os.getcwd(), 'data/Syllable_dictionary.txt')).read()
syll_dict = syll_dict.splitlines()
syll_dict2 = []
for i in range(len(syll_dict)):
    syll_dict2.append(syll_dict[i].split())
syll_dict = syll_dict2



Generate sonnets (no rhyme, 10 syllables per line)

In [7]:
syllables = 0
i = 0
num_lines = 0
poem = ''
add_last_syll = 0
# string = sample_sentence(hmm10, obs_map, n_words=10)[0:-3].lower() + ' ' 
# each line is unlikely to have >10 words
# print(string)

while num_lines < 14: # ensure it's 14 lines in total
    # generate 10 words cuz 10 syllables/line. might not use all 10 words
    string = sample_sentence(hmm10, obs_map, n_words=10)[0:-3].lower() 
    syllables = 0
    i = 0
    line = ''
    add_last_syll = 0
#     print(string)
    # generates one line
    while syllables < 10:
        word = string.split()[i]
        index = [i[0] for i in syll_dict].index(word)
        add_syll = int(syll_dict[index][-1])
        # special case for last word. ref. to syllable_dictionary.txt
        if np.shape(syll_dict[index]) == 3:
            add_last_syll = int(syll_dict[index].split()[1][1][1])
        # if last word is a special case
        if syllables + add_last_syll == 10:
            line += word
            syllables += add_last_syll
        # normal cases
        elif syllables + add_syll <= 10:
            line += word + ' '
            syllables += add_syll
            add_last_syll = 0
        # no. of syllables of last word would cause line to exceed 10 syllables. redo the whole line.
        else: 
            string = sample_sentence(hmm10, obs_map, n_words=10)[0:-3].lower() # regenerate string
            syllables = 0
            i = -1
            line = ''
            add_last_syll = 0
            print('restart')
        i += 1
    num_lines += 1
    poem += line + "\n"
    print(num_lines)
print(poem)


1
2
3
4
5
6
7
8
9
restart
10
11
12
restart
13
14
delight knew sick age autumn worth found true 
untutored to profit thee bear untrue 
yet with pen the all-tyrant that grace due 
down-rased far as dwells with this store thee not 
shame in fire paid thou usurer longer 
not new reeks yet forbid my i virtue 
thee woe and other stain mourners proud too 
judgement hence swift-footed wilt my now on 
prize of mine devil complain moan solemn 
time to to past mine deserts to i new 
the tell what nor haply in it come fair 
used show thy side note lasting my worth 
in my to repent contented a will 
lour'st i walks with doth that deserving not 



Generate sonnets (with rhyme, 10 syllables per line)
for the 1st 12 lines, every alternate line rhymes with each other, and the last 2 lines rhyme

In [8]:
# rhyme dictionary https://stackoverflow.com/questions/25714531/find-rhyme-using-nltk-in-python
import nltk
# nltk.download('cmudict')
def rhyme(inp, level):
    entries = nltk.corpus.cmudict.entries()
    syllables2 = [(word, syl) for word, syl in entries if word == inp]
    rhymes = []
    for (word, syllable) in syllables2:
        rhymes += [word for word, pron in entries if pron[-level:] == syllable[-level:]]
    return set(rhymes)
def doTheyRhyme(word1, word2):
  # first, we don't want to report 'glue' and 'unglue' as rhyming words
  # those kind of rhymes are LAME
    if word1.find(word2) == len(word1) - len(word2):
        return False
    if word2.find(word1) == len(word2) - len(word1): 
        return False

    return word1 in rhyme(word2, 1)
print(rhyme('may',1))


{'cluj', 'piaget', 'zepa', 'waye', 'stray', 'raye', 'mcgray', 'whey', 'lyonnais', 'waga', 'cia', 'repay', 'away', 're', 'lait', 'liberte', 'mccrea', 'hevey', 'mcray', 'underpay', 'sergei', 'brey', 'cama', 'ray', 'pray', 'beaupre', 'mcshea', 'fe', 'faberge', 'nay', 'hwe', 'dray', 'jonbenet', 'nej', 'mulvey', 'abbe', 'spey', 'bouquet', 'crochet', 'sochet', "o'shea", 'jay', 'j.', 'rene', 'slay', 'bay', 'moutray', 'shea', 'dey', 'mcclay', 'portray', 'flay', 'quay', 'yay', 'fray', 'kley', 'millay', 'banpais', 'maye', 'deseret', 'mei', 'spray', 'mccay', 'pinochet', 'toray', 'nisei', 'smay', 'cafe', 'puree', 'bombay', 'vadnais', 'roget', 'dossier', 'palais', 'prepay', 'hooray', 'stay', 'perrier', 'sevey', 'stupay', 'hay', 'shay', 'hongwei', 'faraway', 'schey', 'b-j', 'ay', 'hefei', 'shinsei', "'kay", 'wray', 'usa', 'buffet', 'morais', 'rae', 'mcnay', 'ha', 'servais', 'macknay', 'kamei', 'brae', 'lurvey', 'torme', 'frey', 'sta', 'paye', 'saye', 'benet', 'lihue', 'replay', 'blay', 'sleigh', 'le

In [10]:
obs_map.get('may')
# sample_sentence_backwards(hmm10, obs_map, 10, 23)[0:-3].lower()

446

In [None]:
sample_sentence(hmm10, obs_map, n_words=10)[0:-3].lower()


In [None]:
def generate_line(num_words)
    string = sample_sentence(hmm10, obs_map, n_words=num_words)[0:-3].lower()
    syllables = 0
    i = 0
    line = ''
    add_last_syll = 0
    while syllables < num_words:
        word = string.split()[i]
        index = [i[0] for i in syll_dict].index(word)
        add_syll = int(syll_dict[index][-1])
        if np.shape(syll_dict[index]) == 3: # special case for last word
            add_last_syll = int(syll_dict[index].split()[1][1][1])
        
        if syllables + add_last_syll == 10:
            line += word
            syllables += add_last_syll
        elif syllables + add_syll <= 10:
            line += word + ' '
            syllables += add_syll
            add_last_syll = 0
        else: # regenerate string
            string = sample_sentence(hmm10, obs_map, n_words=10)[0:-3].lower()
            syllables = 0
            i = -1
            line = ''
            add_last_syll = 0
            print('restart')
        i += 1
    return line

syllables = 0
i = 0
num_lines = 0
poem = ''
add_last_syll = 0
# string = sample_sentence(hmm10, obs_map, n_words=10)[0:-3].lower() + ' ' 
# each line is unlikely to have >10 words
# print(string)

while num_lines < 14:
    if num_lines == 0:
        string = sample_sentence(hmm10, obs_map, n_words=10)[0:-3].lower()