In [1]:
import os
import numpy as np
from IPython.display import HTML
from itertools import groupby
import re

import Overall_HMM_helper
from Meter_Rhyme_HMM import unsupervised_HMM
import Meter_Rhyme_HMM_helper

In [2]:
# Read every line of the sonnet file; the with-block closes the handle once reading is done.
with open("data/shakespeare.txt", 'r') as shakespeare:
    poems = shakespeare.readlines()

# Lines equal to "\n" act as separators: each run of other lines is grouped into one poem.
split_at = "\n"
# [1:] discards the first line of each group (presumably the sonnet's number heading - TODO confirm against the data file).
final_poems = [list(g)[1:] for k, g in groupby(poems, lambda x: x != split_at) if k]
print("Initial number of poems: {}".format(len(final_poems)))

poem_lengths = [len(poem) for poem in final_poems]
# Indices (into final_poems) of the poems that are not exactly 14 lines long.
bad_poems = np.where(np.array(poem_lengths) != 14)[0]
if len(bad_poems) > 0:
    # Joining the indices keeps the message correct for any number of malformed poems,
    # instead of assuming there are exactly two of them.
    print ("Sonnets {} are not 14 lines long so we remove them from our list."
           .format(" and ".join(str(i) for i in bad_poems)))

final_poems = [final_poems[i] for i in np.delete(np.arange(len(final_poems)), bad_poems)]
print("Final number of poems: {}".format(len(final_poems)))
# Collapse each poem into a single string; only spaces are stripped, so newline characters are kept.
final_poems = [''.join([line.strip(' ') for line in poem]) for poem in final_poems]

Initial number of poems: 154
Sonnets 98 and 125 are not 14 lines long so we remove them from our list.
Final number of poems: 152


In [3]:
# parse_observations hands back two things:
#   tokenized_poems - the poems with every word replaced by its number
#   token_map       - the word -> number mapping used for that replacement
tokenized_poems, token_map = Overall_HMM_helper.parse_observations(final_poems)
# token_map_r is used later to turn numbers back into words
token_map_r = Overall_HMM_helper.obs_map_reverser(token_map)
# Remove one level of nesting by concatenating the entries of tokenized_poems
flattened_tokenized_poems = []
for tokenized_poem in tokenized_poems:
    flattened_tokenized_poems.extend(tokenized_poem)

## Syllable Analysis (implemented in the HMM_helper file, see `generate_emissions`)

In [4]:
# Syllables
# Read the syllable dictionary; the with-block closes the file once it has been read.
with open("data/Syllable_dictionary.txt", 'r') as syllable_file:
    # Each entry is the whitespace-separated fields of one line: the word, then its syllable counts.
    syllables = [x.split() for x in syllable_file.readlines()]
syllable_dict = {}

# We choose to map words to tuples of lists
# the first list corresponds to the number of syllables if the word were at the end (E)
# the second list corresponds to the number of syllables the word can take anywhere
# E.g. "test": ['E1', '2', '3'] <-> "test": ([1], [2, 3])
for syllable in syllables:
    if not syllable:
        # A blank line has no word field; skip it rather than fail on syllable[0].
        continue
    # Strip every non-word character from the word so it can be matched against other keys.
    word = re.sub(r'[^\w]', '', syllable[0])
    end_syllable_list = []
    regular_syllable_list = []
    for item in syllable[1:]:
        if item[0] == "E":
            # "E<n>": the count n applies only when the word ends a line.
            end_syllable_list.append(int(item[1:]))
        else:
            regular_syllable_list.append(int(item))
    syllable_dict[word] = (end_syllable_list, regular_syllable_list)
    
# syllable_dict

In [5]:
# Re-key the syllable dictionary by token number instead of by word.
# Words that have no entry in token_map are left out.
tokenized_syllable_dict = {
    token_map[word]: syllable_counts
    for word, syllable_counts in syllable_dict.items()
    if word in token_map
}
# tokenized_syllable_dict

## Rhyme Analysis

In [6]:
def get_rhyme_pairs(poem):
    """Return the seven rhyming end-word pairs of a 14-line sonnet.

    The poem is split on newlines. For every non-blank line, the last
    whitespace-separated token is stripped of non-word characters and
    lower-cased. Blank (or whitespace-only) lines are ignored wherever they
    occur, and trailing whitespace on a line does not hide its last word.

    Args:
        poem: the sonnet as one newline-separated string.

    Returns:
        A list of seven (word, word) tuples pairing lines
        1-3, 2-4, 5-7, 6-8, 9-11, 10-12 and 13-14.

    Raises:
        IndexError: if the poem has fewer than 14 non-blank lines.
    """
    # 0-based indices of the lines whose last words are paired (ABAB CDCD EFEF GG).
    rhyme_scheme = ((0, 2), (1, 3), (4, 6), (5, 7), (8, 10), (9, 11), (12, 13))

    last_words = []
    for line in poem.split("\n"):
        tokens = line.split()
        if not tokens:
            # Skip blank lines by position; removing '' by value could drop the wrong
            # entry and shift every later index.
            continue
        last_words.append(re.sub(r'[^\w]', '', tokens[-1]).lower())

    return [(last_words[first], last_words[second]) for first, second in rhyme_scheme]

# Gather the rhyme pairs of every poem into one flat list
rhyming_dict = [
    rhyme_pair
    for sonnet in final_poems
    for rhyme_pair in get_rhyme_pairs(sonnet)
]

# print(rhyming_dict)

## Meter Analysis
### Note: we call words that can have different numbers of syllables `ambiguous_syllabic_words`.

In [7]:
# word -> one list holding all of its possible syllable counts
# (the end-of-line list followed by the regular list)
flattened_syllable_dict = {
    word: end_counts + regular_counts
    for word, (end_counts, regular_counts) in syllable_dict.items()
}

# Same merge, keyed by token number instead of by word
flattened_tokenized_syllable_dict = {
    token: end_counts + regular_counts
    for token, (end_counts, regular_counts) in tokenized_syllable_dict.items()
}

# Largest number of alternative syllable counts recorded for a single word
most_options = max(len(options) for options in flattened_syllable_dict.values())
print("Maximum number of possible syllables for any word: {}".format(most_options))
# flattened_syllable_dict

Maximum number of possible syllables for any word: 2


In [8]:
# Keep only the words that have more than one possible syllable count,
# mapped to their original (end list, regular list) tuple
ambiguous_syllabic_words = {
    word: syllable_dict[word]
    for word, options in flattened_syllable_dict.items()
    if len(options) > 1
}
# ambiguous_syllabic_words

In [9]:
# Token numbers of the ambiguous-syllabic words. A word with no entry in token_map
# cannot occur in any tokenized line, so it is skipped instead of raising a KeyError.
ambiguous_syllabic_word_tokens = {
    token_map[word] for word in ambiguous_syllabic_words if word in token_map
}

unambiguous_lines = [] # list of lines that contain 1 or fewer words that can have many numbers of syllables
ambiguous_lines = [] # list of lines that contain 2 or more words that can have many numbers of syllables
for line in flattened_tokenized_poems:
    # Number of token occurrences in this line that belong to an ambiguous-syllabic word
    count = sum(1 for token in line if token in ambiguous_syllabic_word_tokens)
    if count <= 1:
        unambiguous_lines.append(line)
    else:
        ambiguous_lines.append(line)
print("Number of 'unambiguous lines': {}".format(len(unambiguous_lines)))
print("Number of 'ambiguous lines': {}".format(len(ambiguous_lines)))

Number of 'unambiguous lines': 2088
Number of 'ambiguous lines': 40


We now have a list of 'unambiguous' lines from the poems that have at most one ambiguous-syllabic word. It is easy to use this to figure out whether a word starts with a stressed or unstressed syllable in the 'unambiguous' lines. We manually identify all the remaining words in the 'ambiguous' lines.

In [10]:
# Note: Some Shakespeare lines do not have 10 syllables! We assume that they end in an unstressed syllable
# (the unstressed-stressed pattern continues)
# A line is collected in `weird` when neither way of counting gives 10 syllables:
# taking the first listed count of every word, or taking the last listed count of every word.
weird = []
for line in unambiguous_lines:
    options_per_word = [flattened_tokenized_syllable_dict[word] for word in line]
    syllable_count_1 = sum(options[0] for options in options_per_word)
    syllable_count_2 = sum(options[-1] for options in options_per_word)
    if not (syllable_count_1 == 10 or syllable_count_2 == 10):
        weird.append(line)

print("Lines that do not have 10 syllables:\n")
for line in weird:
    # Turn the tokens back into words, each followed by a space
    print("".join(token_map_r[word] + " " for word in line))

Lines that do not have 10 syllables:

shifts but his place for still the world enjoys it 
and kept unused the user so destroys it 
in thy souls thought all naked will bestow it 
then may i dare to boast how i do love thee 
till then not show my head where thou mayst prove me 
thou dost love her because thou knowst i love her 
and for my sake even so doth she abuse me 
suffring my friend for my sake to approve her 
pity me then dear friend and i assure ye 
even that your pity is enough to cure me 
divert strong minds to the course of altring things 
be it lawful i love thee as thou lovst those 
those lips that loves own hand did make 
breathed forth the sound that said i hate 
to me that languished for her sake 
but when she saw my woeful state 
straight in her heart did mercy come 
chiding that tongue that ever sweet 
was used in giving gentle doom 
and taught it thus anew to greet 
i hate she altered with an end 
that followed it as gentle day 
doth follow night who like a fiend 
from

In [11]:
# Both sets start empty and are filled from the unambiguous lines below.
starts_stressed = set() # words that start with a stressed syllable
starts_unstressed = set() # words that start with an unstressed syllable
# We exploit that each unambiguous line has at most one ambiguous-syllabic word and that each ambiguous-syllabic
# word has at most 2 numbers of syllables.
# Note: Some Shakespeare lines do not have 10 syllables! We assume that they end in an unstressed syllable
# (the unstressed-stressed pattern continues)
for line in unambiguous_lines:
    # First attempt: use the first listed syllable count of every word and collect the
    # classification in temporary sets, committed only if the line adds up to 10.
    temp_starts_unstressed = set()
    temp_starts_stressed = set()

    syllable_count = 0 # generally ends at 10 (iambic pentameter) at the end of the line
    for word in line:
        # A word beginning at an even syllable offset starts unstressed, at an odd offset stressed
        target = temp_starts_unstressed if syllable_count % 2 == 0 else temp_starts_stressed
        target.add(word)
        syllable_count += flattened_tokenized_syllable_dict[word][0]

    if syllable_count == 10:
        starts_stressed |= temp_starts_stressed
        starts_unstressed |= temp_starts_unstressed
    else:
        # Second attempt: use the last listed syllable count of every word and record the
        # classification directly, whatever total the line reaches.
        syllable_count = 0
        for word in line:
            target = starts_unstressed if syllable_count % 2 == 0 else starts_stressed
            target.add(word)
            syllable_count += flattened_tokenized_syllable_dict[word][-1]

Finding the stressed/unstressed syllables per word is pretty difficult for the ambiguous lines, so we manually classify the words we have not seen yet.

In [12]:
# Tokens that have syllable information but were put in neither stress set
classified_tokens = starts_stressed.union(starts_unstressed)
unseen_words = set(tokenized_syllable_dict.keys()).difference(classified_tokens)
for token in unseen_words:
    print(token_map_r[token], token)
print (len(unseen_words))

deformedst 2574
creature 2575
secret 658
fulness 1683
wooed 1941
dulness 1686
victor 1948
charged 1949
maintain 2722
weakness 2228
novel 2740
staineth 1207
gathered 2752
tempests 2627
inviting 2762
mended 1744
crawls 1759
maturity 1760
wherewith 1761
neercloying 2661
chary 881
sweetness 2296
rightly 2301
inherit 2302
24


In [13]:
# Manual classification of the words that the unambiguous lines did not cover.
# The words are looked up through token_map rather than hard-coding their token numbers,
# so the classification stays correct if the tokenization assigns different numbers.
# (Assumes token_map is keyed by these lower-cased words, as the token_map_r printout
# of the unseen words suggests - TODO confirm.)
manually_stressed_words = [
    'creature', 'secret', 'fulness', 'wooed', 'dulness', 'victor', 'charged',
    'weakness', 'novel', 'staineth', 'gathered', 'tempests', 'mended', 'crawls',
    'wherewith', 'neercloying', 'chary', 'sweetness', 'rightly',
]
manually_unstressed_words = ['deformedst', 'maintain', 'inviting', 'maturity', 'inherit']

# A word missing from token_map raises KeyError here, which flags a stale manual list.
starts_stressed = starts_stressed.union(
    token_map[word] for word in manually_stressed_words)
starts_unstressed = starts_unstressed.union(
    token_map[word] for word in manually_unstressed_words)

In [14]:
# Summary of the classification, printed in this order:
#   tokens with syllable information that are still unclassified,
#   number of tokens in both sets, number of tokens in either set,
#   number that are only stressed, number that are only unstressed
all_classified = starts_stressed | starts_unstressed
print(set(tokenized_syllable_dict.keys()) - all_classified)
print(len(starts_stressed & starts_unstressed))
print(len(all_classified))
print(len(starts_stressed - starts_unstressed))
print(len(starts_unstressed - starts_stressed))

set()
460
3148
1883
805


In [15]:
# Let's see how well this analysis worked
# Words that only start stressed:
only_stressed_tokens = starts_stressed - starts_unstressed
[token_map_r[token] for token in only_stressed_tokens]

['fairest',
 'creatures',
 'rose',
 'riper',
 'tender',
 'heir',
 'memory',
 'flame',
 'selfsubstantial',
 'fuel',
 'famine',
 'foe',
 'cruel',
 'ornament',
 'herald',
 'gaudy',
 'spring',
 'buriest',
 'niggarding',
 'world',
 'else',
 'glutton',
 'grave',
 'forty',
 'winters',
 'brow',
 'dig',
 'trenches',
 'field',
 'youths',
 'livery',
 'gazed',
 'tattered',
 'weed',
 'held',
 'asked',
 'treasure',
 'lusty',
 'sunken',
 'shame',
 'thriftless',
 'answer',
 'child',
 'sum',
 'blood',
 'feelst',
 'viewest',
 'mother',
 'uneared',
 'womb',
 'tillage',
 'husbandry',
 'fond',
 'tomb',
 'stop',
 'mothers',
 'back',
 'lovely',
 'april',
 'prime',
 'windows',
 'age',
 'wrinkles',
 'golden',
 'single',
 'image',
 'dies',
 'loveliness',
 'spend',
 'legacy',
 'lend',
 'frank',
 'lends',
 'free',
 'niggard',
 'bounteous',
 'largess',
 'given',
 'usurer',
 'sums',
 'traffic',
 'nature',
 'acceptable',
 'audit',
 'leave',
 'tombed',
 'used',
 'hours',
 'work',
 'frame',
 'gaze',
 'every',
 'dwell'

In [16]:
# Words that only start unstressed:
only_unstressed_tokens = starts_unstressed - starts_stressed
[token_map_r[token] for token in only_unstressed_tokens]

['imprint',
 'desire',
 'increase',
 'eternity',
 'contain',
 'commit',
 'delivered',
 'acquaintance',
 'decease',
 'enrich',
 'invoked',
 'assistance',
 'contracted',
 'disperse',
 'aloft',
 'feedst',
 'lights',
 'compile',
 'advance',
 'decayed',
 'deserves',
 'abundance',
 'behaviour',
 'afford',
 'thereof',
 'spends',
 'within',
 'inferior',
 'broad',
 'content',
 'afloat',
 'tall',
 'forgotten',
 'immortal',
 'entombed',
 'oerread',
 'attaint',
 'besiege',
 'oerlook',
 'finding',
 'enforced',
 'anew',
 'timebettering',
 'devised',
 'plain',
 'truetelling',
 'gross',
 'abused',
 'exceed',
 'impute',
 'impair',
 'alleating',
 'confine',
 'immured',
 'example',
 'lean',
 'deserved',
 'couldst',
 'admired',
 'richly',
 'compiled',
 'unlettered',
 'proving',
 'succession',
 'amen',
 'affords',
 'warm',
 'refined',
 'hearing',
 'inhearse',
 'above',
 'compeers',
 'another',
 'giving',
 'repair',
 'astonished',
 'renewest',
 'familiar',
 'beguile',
 'unbless',
 'intelligence',
 'enfeeble

We deliberately do not generate a single 140-syllable line and then split it (which would allow for some extra continuity), because Shakespearean lines are typically treated as new sentences.

In [18]:
# Fit the HMM on the flattened tokenized poems, passing along the syllable dictionary
# and the two stress sets.
# NOTE(review): the positional 2 and 10 are presumably the number of hidden states and
# the number of training iterations - confirm against unsupervised_HMM.
hmm = unsupervised_HMM(
    flattened_tokenized_poems, 2, 10,
    tokenized_syllable_dict, starts_stressed, starts_unstressed,
)
print('Rhyming Sonnet:\n====================')
# Seven sampled pairs, each requested with num_syllables=10
for pair_index in range(7):
    sampled_pair = Meter_Rhyme_HMM_helper.sample_pair(hmm, token_map, rhyming_dict, num_syllables=10)
    print(sampled_pair)

Iteration: 10
Rhyming Sonnet:
('Praise who birth thy car in eyes to thou grows;', 'Shalt sum of in be tongue spoil right at shows;')
('Praise me it vilest assured all fly;', 'Me the winters a thee one majesty;')
('Thee i five her love is not new thou owst;', 'Then shall for put tied be my doth now growst;')
('Thou my in and tibey husband thee if got;', 'As truth i page got guess by which in blot;')
('More my i holy keep works muse acquainted;', 'By double sinful him saw whether attainted;')
('I thy in destroys the to devised there;', 'Respect a times proud me that everywhere;')
('Hell doth manys sweet thou their prophecies;', 'True such every is the view more though eyes;')


In [46]:
# Most of the sampled meters look right, but "winters" does not: it can only start
# stressed, yet it starts unstressed in the sample above.
# Check whether its token was classified as starting unstressed.
winters_token = token_map["winters"]
winters_token in starts_unstressed

False