In [1]:
import os
import numpy as np
from IPython.display import HTML
from itertools import groupby
import re
from Naive_HMM import unsupervised_HMM as naive_unsupervised_HMM
import Naive_HMM_helper

In [2]:
shakespeare = open("data/shakespeare.txt", 'r')

poems = shakespeare.readlines()
split_at = "\n"
final_poems = [list(g)[1:] for k, g in groupby(poems, lambda x: x != split_at) if k]
print("Initial number of poems: {}".format(len(final_poems)))
poem_lengths = [len(poem) for poem in final_poems] 
bad_poems = np.where(np.array(poem_lengths)!= 14)[0]
print ("Sonnets {} and {} are not 14 lines long so we remove them from our list.".format(bad_poems[0], bad_poems[1]))

final_poems = [final_poems[i] for i in np.delete(np.arange(len(final_poems)), bad_poems)]
print("Final number of poems: {}".format(len(final_poems)))
final_poems = [''.join([line.strip(' ') for line in poem]) for poem in final_poems]
final_poems

Initial number of poems: 154
Sonnets 98 and 125 are not 14 lines long so we remove them from our list.
Final number of poems: 152


["From fairest creatures we desire increase,\nThat thereby beauty's rose might never die,\nBut as the riper should by time decease,\nHis tender heir might bear his memory:\nBut thou contracted to thine own bright eyes,\nFeed'st thy light's flame with self-substantial fuel,\nMaking a famine where abundance lies,\nThy self thy foe, to thy sweet self too cruel:\nThou that art now the world's fresh ornament,\nAnd only herald to the gaudy spring,\nWithin thine own bud buriest thy content,\nAnd tender churl mak'st waste in niggarding:\nPity the world, or else this glutton be,\nTo eat the world's due, by the grave and thee.\n",
 "When forty winters shall besiege thy brow,\nAnd dig deep trenches in thy beauty's field,\nThy youth's proud livery so gazed on now,\nWill be a tattered weed of small worth held:\nThen being asked, where all thy beauty lies,\nWhere all the treasure of thy lusty days;\nTo say within thine own deep sunken eyes,\nWere an all-eating shame, and thriftless praise.\nHow much

In [3]:
# token_map maps words to numbers
# tokenized_poems replaces the words in poems with their corresponding number
tokenized_poems, token_map = Naive_HMM_helper.parse_observations(final_poems)

In [4]:
# Helpful lists
# Syllables
syllable_file = open("data/Syllable_dictionary.txt", 'r')
syllables = syllable_file.readlines()
syllables = [x.split() for x in syllables]
syllable_dict = {}
"""
for syllable in syllables:
    word = re.sub(r'[^\w]', '', syllable[0])
    syllable_dict[word] = syllable[1:]
"""
# We choose to map words to tuples of lists
# the first list corresponds to the number of syllables if the word were at the end (E)
# the second list corresponds to the number of syllables the word can take anywhere
# E.g. "test": ['E1', '2', '3'] <-> "test": [([1], [2, 3])]
for syllable in syllables:
    word = re.sub(r'[^\w]', '', syllable[0])
    end_syllable_list = []
    regular_syllable_list = []
    for item in syllable[1:]:
        if item[0] == "E":
            end_syllable_list.append(int(item[1:]))
        else:
            regular_syllable_list.append(int(item))
    syllable_dict[word] = (end_syllable_list, regular_syllable_list)
    
syllable_dict

{'gainst': ([], [1]),
 'greeing': ([1], [2]),
 'scaped': ([], [1]),
 'tis': ([], [1]),
 'twixt': ([], [1]),
 'a': ([], [1]),
 'adoting': ([2], [3]),
 'abhor': ([], [2]),
 'abide': ([], [2]),
 'able': ([], [2]),
 'about': ([], [2]),
 'above': ([], [2]),
 'absence': ([], [2]),
 'absent': ([], [2]),
 'abundance': ([], [3]),
 'abundant': ([], [3]),
 'abuse': ([], [2]),
 'abused': ([], [2]),
 'abuses': ([], [3]),
 'abysm': ([], [2]),
 'accents': ([], [2]),
 'acceptable': ([], [4]),
 'acceptance': ([], [3]),
 'accessary': ([], [4]),
 'accident': ([], [3]),
 'accidents': ([], [3]),
 'account': ([], [2]),
 'accumulate': ([], [4]),
 'accuse': ([], [2]),
 'accusing': ([], [3]),
 'achieve': ([], [2]),
 'acknowledge': ([], [3]),
 'acquaintance': ([], [3]),
 'acquainted': ([2], [3]),
 'act': ([], [1]),
 'action': ([], [2]),
 'active': ([], [2]),
 'actor': ([], [2]),
 'add': ([], [1]),
 'added': ([], [2]),
 'adders': ([], [2]),
 'addeth': ([], [2]),
 'adding': ([], [2]),
 'addition': ([], [3]),
 'ad

In [5]:
tokenized_syllable_dict = {}
for key in syllable_dict.keys():
    # If the word in syllable_dict is in our token map, add it to our tokenized_syllable_dict
    try:
        tokenized_syllable_dict[token_map[key]] = syllable_dict[key]
    except KeyError:
        pass
tokenized_syllable_dict

{476: ([], [1]),
 2594: ([1], [2]),
 2251: ([], [1]),
 925: ([], [1]),
 2016: ([], [1]),
 41: ([], [1]),
 841: ([2], [3]),
 3088: ([], [2]),
 1015: ([], [2]),
 2188: ([], [2]),
 2562: ([], [2]),
 2199: ([], [2]),
 1343: ([], [2]),
 1374: ([], [2]),
 44: ([], [3]),
 2359: ([], [3]),
 216: ([], [2]),
 2152: ([], [2]),
 2719: ([], [3]),
 2552: ([], [2]),
 1923: ([], [2]),
 234: ([], [4]),
 2905: ([], [3]),
 1272: ([], [4]),
 2754: ([], [3]),
 2605: ([], [3]),
 1120: ([], [2]),
 2648: ([], [4]),
 2638: ([], [2]),
 1721: ([], [3]),
 1888: ([], [2]),
 1293: ([], [3]),
 2066: ([], [3]),
 815: ([2], [3]),
 3100: ([], [1]),
 1842: ([], [2]),
 1303: ([], [2]),
 890: ([], [2]),
 1929: ([], [1]),
 2079: ([], [2]),
 2555: ([], [2]),
 2906: ([], [2]),
 844: ([], [2]),
 842: ([], [3]),
 1703: ([], [2]),
 2269: ([], [2]),
 2743: ([], [2]),
 2175: ([], [3]),
 1749: ([], [3]),
 2620: ([], [2]),
 2908: ([], [3]),
 1622: ([], [3]),
 358: ([], [2]),
 2713: ([], [3]),
 2087: ([], [2]),
 1824: ([], [3]),
 12

We deliberately do not make a 140 syllable line and then split them (which would allow for some extra continuity) because Shakespearean lines are typically treated as new sentences.

In [6]:
# Flattens 3-dimensional list of list of lists to 2-dimensional list of lists
# (where each inner list corresponds to a line of a poem)
flattened_tokenized_poems = [val for sublist in tokenized_poems for val in sublist]
hmm = naive_unsupervised_HMM(flattened_tokenized_poems, 16, 100)
print('Naive Sonnet:\n====================')
for i in range(14):
    print(Naive_HMM_helper.sample_sentence(hmm, token_map, tokenized_syllable_dict, num_syllables=10))

Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
Iteration: 60
Iteration: 70
Iteration: 80
Iteration: 90
Iteration: 100
Naive Sonnet:
Churls the vermilion of fast to to my;
I time it hath doing taste my ragged;
On to when have praise of his nobler accuse;
To of hast and no no thee abundance;
Brass me this seeming free blush should to but;
Thrice that more fair huge wink of self is it;
Walks are allow still an never shaken;
Thy swift back proved me and argument torn;
And but therefore of self thou kind use than;
Folly substance did be to thou from now;
Gilding it very a all so make spacious;
Where such sum second stay seen her be thine;
Have me him gluttoning i among if;
Fire in and love is within not blow of;


In [23]:
for i in range(14):
    print(Naive_HMM_helper.sample_sentence(hmm, token_map, tokenized_syllable_dict, num_syllables=10))

Eat that is not remember thou days loves;
And stopped penury this wanting thy beauty;
Why see world will in with one thou youth thou;
Be truth no her hour ride of pen happy;
Still boy gentle form desire is than as;
Heaven first allayed sheds of truth halt barrenly;
Now away be i witness compounds war;
More thou no thy pleasant most despise most;
Untrimmed and be single him love write so;
With virtue vices glass fell nor self the;
And every beauty and bright your rare skill;
And do dumb water sake excuse heart than;
Why interest feeble that woods the play where;
False to call politic like world prisoner;
