In [24]:
import os
import numpy as np
from IPython.display import HTML
from itertools import groupby
import re
import pickle

import Overall_HMM_helper
from Meter_Rhyme_HMM import unsupervised_HMM
import Meter_Rhyme_HMM_helper

In [25]:
# Read the raw corpus. The context manager closes the file handle as soon as
# the lines have been read instead of leaving it open for the kernel's lifetime.
with open("data/shakespeare.txt", 'r') as shakespeare:
    poems = shakespeare.readlines()

# Blank lines separate the poems. groupby yields one group per run of
# non-blank lines; [1:] drops the first line of each group (presumably the
# sonnet-number heading -- verify against the data file).
split_at = "\n"
final_poems = [list(g)[1:] for k, g in groupby(poems, lambda x: x != split_at) if k]
print("Initial number of poems: {}".format(len(final_poems)))

# Indices of poems that are not exactly 14 lines long.
poem_lengths = [len(poem) for poem in final_poems]
bad_poems = np.where(np.array(poem_lengths) != 14)[0]
# Report every malformed poem, however many there are. For exactly two
# offenders this prints the same sentence as before.
if len(bad_poems) > 0:
    print("Sonnets {} are not 14 lines long so we remove them from our list."
          .format(" and ".join(str(index) for index in bad_poems)))

# Keep only the well-formed 14-line poems.
final_poems = [poem for poem in final_poems if len(poem) == 14]
print("Final number of poems: {}".format(len(final_poems)))
# Strip the space padding from each line and join each poem into one string
# (the lines keep their trailing "\n").
final_poems = [''.join([line.strip(' ') for line in poem]) for poem in final_poems]

Initial number of poems: 154
Sonnets 98 and 125 are not 14 lines long so we remove them from our list.
Final number of poems: 152


In [26]:
# parse_observations returns the poems encoded as integer tokens together with
# token_map, the word -> token-id lookup; token_map_r is the reverse lookup
# built from it by obs_map_reverser.
tokenized_poems, token_map = Overall_HMM_helper.parse_observations(final_poems)
token_map_r = Overall_HMM_helper.obs_map_reverser(token_map)

# Remove one level of nesting from tokenized_poems.
flattened_tokenized_poems = []
for sublist in tokenized_poems:
    flattened_tokenized_poems.extend(sublist)

In [27]:
# Inspect the cleaned poems: one newline-joined string per sonnet.
final_poems

["From fairest creatures we desire increase,\nThat thereby beauty's rose might never die,\nBut as the riper should by time decease,\nHis tender heir might bear his memory:\nBut thou contracted to thine own bright eyes,\nFeed'st thy light's flame with self-substantial fuel,\nMaking a famine where abundance lies,\nThy self thy foe, to thy sweet self too cruel:\nThou that art now the world's fresh ornament,\nAnd only herald to the gaudy spring,\nWithin thine own bud buriest thy content,\nAnd tender churl mak'st waste in niggarding:\nPity the world, or else this glutton be,\nTo eat the world's due, by the grave and thee.\n",
 "When forty winters shall besiege thy brow,\nAnd dig deep trenches in thy beauty's field,\nThy youth's proud livery so gazed on now,\nWill be a tattered weed of small worth held:\nThen being asked, where all thy beauty lies,\nWhere all the treasure of thy lusty days;\nTo say within thine own deep sunken eyes,\nWere an all-eating shame, and thriftless praise.\nHow much

In [28]:
# Inspect the flattened token sequences.
# NOTE(review): in the recorded output the first entry holds 13 tokens, which
# matches the first TWO verse lines of sonnet 1 -- confirm how
# parse_observations groups lines before relying on "one entry == one line".
flattened_tokenized_poems

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
 [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 10, 24, 21, 25],
 [13, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39],
 [40, 41, 42, 43, 44, 45, 34, 46, 34, 47, 28, 34, 48, 46, 49, 50],
 [26, 6, 51, 52, 15, 53, 54, 55, 56, 57, 58, 28, 15, 59, 60],
 [61, 29, 30, 62, 63, 34, 64, 56, 22, 65, 66, 67, 68, 69],
 [70, 15, 71, 72, 73, 74, 75, 76, 28, 77, 15, 53, 78, 18, 15, 79, 56, 80],
 [81, 82, 83, 84, 85, 34, 86, 56, 87, 88, 89, 68, 34, 8, 90],
 [34, 91, 92, 93, 94, 95, 96, 52, 97, 76, 41, 98, 99, 100, 101, 102, 103],
 [104, 105, 106, 43, 107, 34, 108, 45, 43, 107, 15, 109, 100, 34, 110, 111],
 [28, 112, 61, 29, 30, 88, 113, 32, 114, 115, 116, 117, 56, 118, 119],
 [120,
  121,
  122,
  119,
  123,
  34,
  8,
  124,
  125,
  26,
  126,
  127,
  74,
  128,
  129,
  100,
  130],
 [84, 131, 132, 133, 56, 134, 132, 135, 136, 137, 21, 108, 18, 138, 29],
 [74,
  114,
  28,
  76,
  139,
  140,
  81,
  26,
  51,
  135,
  56,
  141,
  34,
  142,
  1

## Syllable Analysis (implemented in `generate_emissions` in the HMM_helper file)

In [29]:
# Syllables
# Read the syllable dictionary; the context manager closes the file once the
# lines have been split into whitespace-separated fields.
with open("data/Syllable_dictionary.txt", 'r') as syllable_file:
    syllables = [x.split() for x in syllable_file.readlines()]
syllable_dict = {}

# We choose to map each word to a tuple of two lists:
# the first list holds the syllable counts the word can take at the end of a line ("E" entries),
# the second list holds the syllable counts the word can take anywhere.
# E.g. fields ['test', 'E1', '2', '3'] become syllable_dict['test'] == ([1], [2, 3])
for syllable in syllables:
    if not syllable:
        # A blank line in the file has no word field; skip it rather than
        # failing on syllable[0].
        continue
    # Drop punctuation so the key matches the punctuation-free words in token_map.
    word = re.sub(r'[^\w]', '', syllable[0])
    end_syllable_list = []
    regular_syllable_list = []
    for item in syllable[1:]:
        if item.startswith("E"):
            end_syllable_list.append(int(item[1:]))
        else:
            regular_syllable_list.append(int(item))
    syllable_dict[word] = (end_syllable_list, regular_syllable_list)
    
# syllable_dict

In [30]:
# Re-key the syllable information by token id. Dictionary words that never
# occur in the poems have no entry in token_map and are left out.
tokenized_syllable_dict = {
    token_map[word]: counts
    for word, counts in syllable_dict.items()
    if word in token_map
}
# tokenized_syllable_dict

## Rhyme Analysis

In [31]:
def get_rhyme_pairs(poem):
    """Return the seven rhyming word pairs of a 14-line sonnet.

    Parameters
    ----------
    poem : str
        The sonnet as a single string with lines separated by "\\n".

    Returns
    -------
    list of (str, str)
        Lower-cased, punctuation-free last words paired according to the
        ABAB CDCD EFEF GG scheme (lines 1/3, 2/4, 5/7, 6/8, 9/11, 10/12, 13/14).

    Raises
    ------
    IndexError
        If the poem contains fewer than 14 lines with at least one word.
    """
    last_words = []
    for line in poem.split("\n"):
        # Split on any whitespace so trailing spaces cannot produce an empty
        # "last word", then strip punctuation from every token.
        cleaned = [re.sub(r'[^\w]', '', token).lower() for token in line.split()]
        cleaned = [token for token in cleaned if token]
        # Blank (or punctuation-only) lines contribute nothing; skipping all of
        # them keeps the line indices below aligned with the verse lines.
        if cleaned:
            last_words.append(cleaned[-1])

    # 0-based indices of the lines that rhyme with each other.
    scheme = [(0, 2), (1, 3), (4, 6), (5, 7), (8, 10), (9, 11), (12, 13)]
    return [(last_words[first], last_words[second]) for first, second in scheme]

# Collect the rhyme pairs of every poem into one flat list
# (despite its name, rhyming_dict is a list of (word, word) tuples).
rhyming_dict = [pair for poem in final_poems for pair in get_rhyme_pairs(poem)]

# print(rhyming_dict)

## Meter Analysis
### Note: we call words that can have different numbers of syllables `ambiguous_syllabic_words`

In [32]:
# word -> single list of every syllable count the word can take
# (the end-of-line counts followed by the regular counts)
flattened_syllable_dict = {
    word: [count for group in groups for count in group]
    for word, groups in syllable_dict.items()
}

# Same flattening, keyed by token id instead of by word
flattened_tokenized_syllable_dict = {
    token: [count for group in groups for count in group]
    for token, groups in tokenized_syllable_dict.items()
}

# Largest number of alternative syllable counts recorded for a single word
most_alternatives = max(len(counts) for counts in flattened_syllable_dict.values())
print("Maximum number of possible syllables for any word: {}".format(most_alternatives))
# flattened_syllable_dict

Maximum number of possible syllables for any word: 2


In [33]:
# Words with more than one possible syllable count, mapped to their original
# (end-of-line counts, regular counts) tuple from syllable_dict
ambiguous_syllabic_words = {
    word: syllable_dict[word]
    for word, counts in flattened_syllable_dict.items()
    if len(counts) > 1
}
# ambiguous_syllabic_words

In [34]:
# Token ids of the ambiguous-syllabic words. A word from the syllable
# dictionary that never occurs in the poems has no token id; skip it instead of
# raising KeyError, mirroring how tokenized_syllable_dict is built.
ambiguous_syllabic_word_tokens = {
    token_map[word] for word in ambiguous_syllabic_words if word in token_map
}

unambiguous_lines = []  # lines containing at most one ambiguous-syllabic word
ambiguous_lines = []  # lines containing two or more ambiguous-syllabic words
for line in flattened_tokenized_poems:
    # Count occurrences (not distinct words) of ambiguous tokens in the line.
    count = sum(1 for token in line if token in ambiguous_syllabic_word_tokens)
    if count <= 1:
        unambiguous_lines.append(line)
    else:
        ambiguous_lines.append(line)
print("Number of 'unambiguous lines': {}".format(len(unambiguous_lines)))
print("Number of 'ambiguous lines': {}".format(len(ambiguous_lines)))

Number of 'unambiguous lines': 960
Number of 'ambiguous lines': 104


In [35]:
# Inspect the word -> token id mapping.
token_map

{'from': 0,
 'fairest': 1,
 'creatures': 2,
 'we': 3,
 'desire': 4,
 'increase': 5,
 'that': 6,
 'thereby': 7,
 'beautys': 8,
 'rose': 9,
 'might': 10,
 'never': 11,
 'die': 12,
 'but': 13,
 'as': 14,
 'the': 15,
 'riper': 16,
 'should': 17,
 'by': 18,
 'time': 19,
 'decease': 20,
 'his': 21,
 'tender': 22,
 'heir': 23,
 'bear': 24,
 'memory': 25,
 'thou': 26,
 'contracted': 27,
 'to': 28,
 'thine': 29,
 'own': 30,
 'bright': 31,
 'eyes': 32,
 'feedst': 33,
 'thy': 34,
 'lights': 35,
 'flame': 36,
 'with': 37,
 'selfsubstantial': 38,
 'fuel': 39,
 'making': 40,
 'a': 41,
 'famine': 42,
 'where': 43,
 'abundance': 44,
 'lies': 45,
 'self': 46,
 'foe': 47,
 'sweet': 48,
 'too': 49,
 'cruel': 50,
 'art': 51,
 'now': 52,
 'worlds': 53,
 'fresh': 54,
 'ornament': 55,
 'and': 56,
 'only': 57,
 'herald': 58,
 'gaudy': 59,
 'spring': 60,
 'within': 61,
 'bud': 62,
 'buriest': 63,
 'content': 64,
 'churl': 65,
 'makst': 66,
 'waste': 67,
 'in': 68,
 'niggarding': 69,
 'pity': 70,
 'world': 71,


We now have a list of 'unambiguous' lines from the poems that have at most one ambiguous-syllabic word. It is easy to use this to figure out whether a word starts with a stressed or unstressed syllable in the 'unambiguous' lines. We manually identify all the remaining words in the 'ambiguous' lines.

In [36]:
# Collect the lines whose syllable total is not 10 (iambic pentameter) under
# either reading: taking every word's first listed syllable count, or taking
# every word's last listed count.
# NOTE(review): the recorded output shows entries that span two verse lines,
# which would make this check flag them all -- confirm how lines are grouped
# by parse_observations.
weird = []
for line in unambiguous_lines:
    first_choice_total = sum(flattened_tokenized_syllable_dict[token][0] for token in line)
    last_choice_total = sum(flattened_tokenized_syllable_dict[token][-1] for token in line)
    if not (first_choice_total == 10 or last_choice_total == 10):
        weird.append(line)

print("Lines that do not have 10 syllables:\n")
for line in weird:
    # Decode the tokens back to words; every word is followed by a space.
    print("".join(token_map_r[token] + " " for token in line))

Lines that do not have 10 syllables:

from fairest creatures we desire increase that thereby beautys rose might never die 
but as the riper should by time decease his tender heir might bear his memory 
but thou contracted to thine own bright eyes feedst thy lights flame with selfsubstantial fuel 
thou that art now the worlds fresh ornament and only herald to the gaudy spring 
within thine own bud buriest thy content and tender churl makst waste in niggarding 
pity the world or else this glutton be to eat the worlds due by the grave and thee 
when forty winters shall besiege thy brow and dig deep trenches in thy beautys field 
thy youths proud livery so gazed on now will be a tattered weed of small worth held 
to say within thine own deep sunken eyes were an alleating shame and thriftless praise 
how much more praise deserved thy beautys use if thou couldst answer this fair child of mine 
shall sum my count and make my old excuse proving his beauty by succession thine 
this were to be n

In [37]:
# Tokens seen starting on a stressed syllable / on an unstressed syllable.
# Both sets start empty and are filled from the unambiguous lines below.
starts_stressed = set()
starts_unstressed = set()

# The meter is assumed to alternate unstressed, stressed, unstressed, ... so a
# word that begins after an even number of syllables starts unstressed and a
# word that begins after an odd number starts stressed.
# Each line is first read with every word's first listed syllable count. If
# that reading does not total 10 syllables, the line is re-read with every
# word's last listed count and that reading is accepted whatever its total
# (lines that are not 10 syllables long are assumed to simply continue the
# alternating pattern).
for line in unambiguous_lines:
    for choice in (0, -1):
        unstressed_here = set()
        stressed_here = set()
        position = 0  # syllables consumed before the current word
        for token in line:
            if position % 2 == 0:
                unstressed_here.add(token)
            else:
                stressed_here.add(token)
            position += flattened_tokenized_syllable_dict[token][choice]
        # Accept the first reading only when it totals 10 syllables; the
        # second reading is always accepted.
        if choice == -1 or position == 10:
            starts_stressed |= stressed_here
            starts_unstressed |= unstressed_here
            break

Finding the stressed/unstressed syllables per word is difficult for the ambiguous lines, so we manually classify the words we have not seen yet.

In [38]:
# Tokens that have syllable data but were never placed in either stress set
already_classified = starts_stressed | starts_unstressed
unseen_words = set(tokenized_syllable_dict) - already_classified
for token in unseen_words:
    print(token_map_r[token], token)
print(len(unseen_words))

rudst 2572
gentlest 2573
deformedst 2574
creature 2575
monarchs 2581
swearing 3099
flatter 1055
swartcomplexioned 1056
sparkling 1057
twire 1058
gildst 1059
kingly 2591
drinks 2592
length 1064
decayed 2089
famine 42
knew 2600
afterwards 2602
foe 47
faint 2100
lark 1088
arising 1090
everfixed 2626
tempests 2627
gate 1093
east 2864
weeks 2636
worthiness 1615
eager 2653
neercloying 2661
sauces 2662
welfare 2663
abused 2152
meetness 2664
asked 106
diseased 2665
reigns 1132
needing 2666
impute 2161
committed 2685
spheres 2686
fitted 2687
distraction 2688
madding 2689
fever 2690
dearer 1160
march 1163
huge 652
ranks 1164
presenteth 654
equipage 1166
weigh 2700
suffered 2701
secret 658
glorious 1170
morning 1171
fulness 1683
tops 1174
viewest 151
dulness 1686
hits 2704
reproach 2711
renewest 158
beguile 160
unbless 161
maintain 2722
badness 2723
granting 2217
riches 2218
deserving 2219
patent 2220
swerving 2221
mistaking 2222
crave 1711
misprision 2223
oblivion 2730
waking 2226
missed 2731
wh

In [39]:
# Hand-classified token ids, chosen from the unseen-word listing.
# NOTE(review): these are raw ids from token_map, so they are only meaningful
# while the tokenization assigns the same ids -- verify if the corpus or the
# parser changes.
starts_stressed = starts_stressed | {
    2575, 658, 1683, 1941, 1686, 1948, 1949, 2228, 2740, 1207,
    2752, 2627, 1744, 1759, 1761, 2661, 881, 2296, 2301,
}
starts_unstressed = starts_unstressed | {2574, 2722, 2762, 1760, 2302}

In [40]:
# Summary of the classification, printed in this order:
#   tokens with syllable data that are still unclassified,
#   number of tokens in both sets,
#   number of tokens in at least one set,
#   number of tokens only in starts_stressed,
#   number of tokens only in starts_unstressed.
classified_tokens = starts_stressed | starts_unstressed
print(set(tokenized_syllable_dict) - classified_tokens)
print(len(starts_stressed & starts_unstressed))
print(len(classified_tokens))
print(len(starts_stressed - starts_unstressed))
print(len(starts_unstressed - starts_stressed))

{2572, 2573, 2581, 3099, 1055, 1056, 1057, 1058, 1059, 2591, 2592, 1064, 2089, 42, 2600, 2602, 47, 2100, 1088, 1090, 2626, 1093, 2864, 2636, 1615, 2653, 2662, 2663, 2152, 2664, 106, 2665, 1132, 2666, 2161, 2685, 2686, 2687, 2688, 2689, 2690, 1160, 1163, 652, 1164, 654, 1166, 2700, 2701, 1170, 1171, 2702, 2703, 1174, 151, 2704, 2711, 158, 160, 161, 2723, 2217, 2218, 2219, 2220, 2221, 2222, 1711, 2223, 2730, 2226, 2731, 1204, 1205, 694, 2229, 696, 697, 2230, 2741, 700, 1732, 1733, 2759, 2760, 204, 209, 1745, 1757, 2271, 239, 244, 2295, 1286, 2828, 2829, 2830, 813, 815, 304, 816, 306, 818, 819, 821, 2863, 823, 824, 825, 2865, 2866, 828, 1340, 830, 2875, 2876, 833, 2375, 1352, 841, 1354, 843, 844, 1356, 846, 1357, 1359, 1364, 1365, 855, 856, 2391, 2392, 861, 862, 864, 882, 884, 2426, 2427, 1923, 391, 392, 394, 2964, 1947, 422, 1459, 1460, 1462, 3004, 3005, 3006, 1471, 3007, 473, 2523, 3044, 3045, 3046}
545
2988
1662
781


In [41]:
# Sanity check of the analysis: decode the tokens that were only ever seen
# starting on a stressed syllable.
stressed_only_tokens = starts_stressed - starts_unstressed
[token_map_r[token] for token in stressed_only_tokens]

['fairest',
 'creatures',
 'rose',
 'riper',
 'heir',
 'memory',
 'flame',
 'selfsubstantial',
 'fuel',
 'cruel',
 'herald',
 'gaudy',
 'buriest',
 'niggarding',
 'else',
 'glutton',
 'grave',
 'forty',
 'winters',
 'dig',
 'trenches',
 'field',
 'youths',
 'livery',
 'gazed',
 'tattered',
 'weed',
 'held',
 'treasure',
 'lusty',
 'sunken',
 'shame',
 'thriftless',
 'answer',
 'child',
 'sum',
 'feelst',
 'mother',
 'uneared',
 'womb',
 'tillage',
 'husbandry',
 'stop',
 'mothers',
 'back',
 'lovely',
 'april',
 'prime',
 'windows',
 'golden',
 'loveliness',
 'spend',
 'legacy',
 'natures',
 'lend',
 'lends',
 'free',
 'niggard',
 'bounteous',
 'largess',
 'given',
 'usurer',
 'sums',
 'traffic',
 'nature',
 'acceptable',
 'audit',
 'used',
 'hours',
 'work',
 'frame',
 'gaze',
 'play',
 'tyrants',
 'very',
 'fairly',
 'neverresting',
 'summer',
 'hideous',
 'frost',
 'bareness',
 'distillation',
 'liquid',
 'prisoner',
 'walls',
 'flowers',
 'leese',
 'ragged',
 'hand',
 'place',
 'us

In [42]:
# Decode the tokens that were only ever seen starting on an unstressed syllable.
unstressed_only_tokens = starts_unstressed - starts_stressed
[token_map_r[token] for token in unstressed_only_tokens]

['imprint',
 'increase',
 'contain',
 'commit',
 'delivered',
 'decease',
 'enrich',
 'invoked',
 'assistance',
 'contracted',
 'disperse',
 'aloft',
 'feedst',
 'lights',
 'compile',
 'advance',
 'deserves',
 'abundance',
 'behaviour',
 'afford',
 'knowing',
 'thereof',
 'spends',
 'inferior',
 'broad',
 'content',
 'afloat',
 'tall',
 'forgotten',
 'immortal',
 'entombed',
 'oerread',
 'mouths',
 'attaint',
 'besiege',
 'oerlook',
 'finding',
 'enforced',
 'timebettering',
 'devised',
 'plain',
 'truetelling',
 'gross',
 'exceed',
 'extant',
 'impair',
 'alleating',
 'confine',
 'immured',
 'example',
 'lean',
 'deserved',
 'couldst',
 'admired',
 'richly',
 'compiled',
 'excuse',
 'proving',
 'succession',
 'unlettered',
 'amen',
 'polished',
 'warm',
 'hearing',
 'inhearse',
 'struck',
 'compeers',
 'another',
 'giving',
 'repair',
 'astonished',
 'familiar',
 'intelligence',
 'enfeebled',
 'farewell',
 'estimate',
 'releasing',
 'disdains',
 'selflove',
 'disposed',
 'despite',
 '

We deliberately do not generate a single 140-syllable line and then split it into 14 lines (which would allow for some extra continuity), because Shakespearean lines are typically treated as new sentences.

In [44]:
# Train the HMM on the flattened token sequences, passing along the per-token
# syllable counts and the two stress sets.
# NOTE(review): 16 and 30 are presumably the number of hidden states and the
# number of training iterations (the recorded output logs iterations up to 30)
# -- confirm against unsupervised_HMM's signature.
hmm = unsupervised_HMM(
    flattened_tokenized_poems,
    16,
    30,
    tokenized_syllable_dict,
    starts_stressed,
    starts_unstressed,
)
print('Rhyming Sonnet:\n====================')

# Sample seven rhyming line pairs (a sonnet has seven rhyme pairs), each
# requested with num_syllables=10.
pairs_list = [
    Meter_Rhyme_HMM_helper.sample_pair(hmm, token_map, rhyming_dict, num_syllables=10)
    for _ in range(7)
]

# File the trained model is pickled to later in the notebook.
pickle_filename = 'hmm.p'


Iteration: 10
Iteration: 20
Iteration: 30
Rhyming Sonnet:


In [None]:
def print_sonnet(pairs):
    """Print seven rhyming line pairs laid out as an ABAB CDCD EFEF GG sonnet.

    pairs must be indexable and hold at least seven items, each with a first
    line at index 0 and its rhyming partner at index 1.
    """
    print('Rhymed Sonnet:\n====================')
    # Three quatrains: the two pairs of a quatrain are interleaved, first
    # lines before rhyming partners.
    for first_pair in (0, 2, 4):
        second_pair = first_pair + 1
        for position in (0, 1):
            print(pairs[first_pair][position])
            print(pairs[second_pair][position])
    # Closing couplet.
    print(pairs[6][0])
    print(pairs[6][1])

# Print the seven sampled rhyming pairs in sonnet order.
print_sonnet(pairs_list)

In [45]:
# Persist the trained model. The context manager flushes and closes the file
# even if pickling fails, instead of leaving an open write handle behind.
with open(pickle_filename, 'wb') as pickle_file:
    pickle.dump(hmm, pickle_file)

In [None]:
# Most of the meters seem correct, but "winters" does not: it can only start
# stressed, yet it starts unstressed above. Check whether its token ended up
# in starts_unstressed.
token_map["winters"] in starts_unstressed