In [1]:
import os
import numpy as np
from IPython.display import HTML
from itertools import groupby
import re

import Overall_HMM_helper
from Naive_HMM import unsupervised_HMM
import Naive_HMM_helper

In [2]:
# Read the raw sonnet file. The context manager closes the handle as soon as
# the lines are in memory (the bare open() previously left it open).
with open("data/shakespeare.txt", 'r') as shakespeare:
    poems = shakespeare.readlines()

# Lines equal to "\n" separate the sonnets: group consecutive non-separator
# lines into one poem each, and drop the first line of every group with [1:]
# (presumably the sonnet-number heading -- confirm against the data file).
split_at = "\n"
final_poems = [list(g)[1:] for k, g in groupby(poems, lambda x: x != split_at) if k]
print("Initial number of poems: {}".format(len(final_poems)))

# Indices of the poems that do not have exactly 14 lines.
poem_lengths = [len(poem) for poem in final_poems]
bad_poems = np.where(np.array(poem_lengths) != 14)[0]
if len(bad_poems) > 0:
    # Report every offending index. Indexing bad_poems[0] and bad_poems[1]
    # directly raised IndexError unless at least two poems were irregular;
    # for exactly two the printed text is unchanged.
    print("Sonnets {} are not 14 lines long so we remove them from our list.".format(
        " and ".join(str(i) for i in bad_poems)))

# Keep only the poems whose index is not listed in bad_poems.
final_poems = [final_poems[i] for i in np.delete(np.arange(len(final_poems)), bad_poems)]
print("Final number of poems: {}".format(len(final_poems)))
# Collapse each poem into a single string. strip(' ') trims only spaces, so
# each line keeps the trailing newline that readlines() left on it.
final_poems = [''.join([line.strip(' ') for line in poem]) for poem in final_poems]

Initial number of poems: 154
Sonnets 98 and 125 are not 14 lines long so we remove them from our list.
Final number of poems: 152


In [4]:
# Encode the poems as integers:
#   token_map       -- maps words to numbers
#   tokenized_poems -- the poems with each word replaced by its number
parsed_observations = Overall_HMM_helper.parse_observations(final_poems)
tokenized_poems, token_map = parsed_observations
# Inverse lookup built by obs_map_reverser
# (presumably number -> word; confirm in Overall_HMM_helper).
token_map_r = Overall_HMM_helper.obs_map_reverser(token_map)
# Remove one level of nesting by concatenating the entries of every poem.
flattened_tokenized_poems = []
for poem_tokens in tokenized_poems:
    flattened_tokenized_poems.extend(poem_tokens)

In [5]:
# Helpful lists
# Syllables
# Read the syllable dictionary. The context manager closes the handle once the
# lines are in memory (the bare open() previously left it open).
with open("data/Syllable_dictionary.txt", 'r') as syllable_file:
    syllables = syllable_file.readlines()
# Each line becomes a list of whitespace-separated fields: the word first,
# followed by its syllable-count entries.
syllables = [x.split() for x in syllables]
syllable_dict = {}

# We choose to map each word to a tuple of two lists:
# the first list holds the syllable counts that apply when the word is at the end (E),
# the second list holds the syllable counts the word can take anywhere.
# E.g. the fields ['test', 'E1', '2', '3'] give "test": ([1], [2, 3])
for syllable in syllables:
    if not syllable:
        # A blank line splits into an empty list; skip it instead of
        # raising IndexError on syllable[0].
        continue
    # Strip every non-word character (e.g. punctuation) from the word.
    # Words that differ only in such characters end up sharing one key,
    # and the later entry overwrites the earlier one.
    word = re.sub(r'[^\w]', '', syllable[0])
    end_syllable_list = []
    regular_syllable_list = []
    for item in syllable[1:]:
        if item.startswith("E"):
            # "E<n>": count that is only valid at the end of a line.
            end_syllable_list.append(int(item[1:]))
        else:
            regular_syllable_list.append(int(item))
    syllable_dict[word] = (end_syllable_list, regular_syllable_list)
    
# syllable_dict

In [6]:
# Re-key the syllable data by token number instead of by word. Words from
# syllable_dict that have no entry in token_map are left out.
tokenized_syllable_dict = {
    token_map[word]: syllable_counts
    for word, syllable_counts in syllable_dict.items()
    if word in token_map
}
# tokenized_syllable_dict

## Syllable Analysis (implemented in `generate_emissions` in the HMM_helper file)

We deliberately do not generate a single 140-syllable sequence and then split it into 14 lines (which would allow for some extra continuity between lines), because each line of a Shakespearean sonnet is typically treated as a new sentence.

In [9]:
# Shape of the generated sonnet.
LINES_PER_SONNET = 14
SYLLABLES_PER_LINE = 10

# Train the HMM on the flattened, tokenized poems together with the
# token-keyed syllable dictionary.
# NOTE(review): the positional arguments 2 and 10 are presumably the number of
# hidden states and the number of training iterations -- confirm against
# Naive_HMM.unsupervised_HMM.
hmm = unsupervised_HMM(
    flattened_tokenized_poems,
    2,
    10,
    tokenized_syllable_dict,
)

# Sample each line independently and print it as soon as it is generated.
print('Naive Sonnet:\n====================')
for line_idx in range(LINES_PER_SONNET):
    sonnet_line = Naive_HMM_helper.sample_sentence(
        hmm, token_map, num_syllables=SYLLABLES_PER_LINE
    )
    print(sonnet_line)

Iteration: 10
Naive Sonnet:
Complexion but purging allayed my half;
Admired shadow to thy what of once;
One count proud of in keep soil one loving;
Sweet i be end still is god and the fears;
Thy well again amends bosoms sits so;
Are love his that interest show all fear knows;
But all fingers due anon gracious bids;
Sea at thy side wrecked of motion love too;
That eye and true and i am nothing eyes;
Minds death hateth secret so upon in;
Since eye picture less doth in if with be;
Where time far to title but mind next break;
Besmeared stays let the hold the mud so sun;
And my tell his i blessedfair have thou;
