In [175]:
import os
import numpy as np
from IPython.display import HTML
from itertools import groupby
import re
from Naive_HMM import unsupervised_HMM as naive_unsupervised_HMM
import Naive_HMM_helper

In [177]:
# Load the raw sonnet text. The context manager guarantees the file handle is
# closed even if reading fails.
with open("data/shakespeare.txt", 'r') as shakespeare:
    poems = shakespeare.readlines()

# Poems are separated by blank lines: group consecutive non-blank lines together.
# The [1:] slice drops the first line of every group
# (NOTE(review): presumably the sonnet-number heading - confirm against the data file).
split_at = "\n"
final_poems = [list(g)[1:] for k, g in groupby(poems, lambda x: x != split_at) if k]
print("Initial number of poems: {}".format(len(final_poems)))

# Keep only the poems that have exactly 14 lines.
poem_lengths = [len(poem) for poem in final_poems]
bad_poems = np.where(np.array(poem_lengths) != 14)[0]
# Join every offending index so the message works for any number of bad poems
# (the text is unchanged when there are exactly two of them).
print("Sonnets {} are not 14 lines long so we remove them from our list."
      .format(" and ".join(str(index) for index in bad_poems)))

bad_poem_indices = set(bad_poems.tolist())
final_poems = [poem for index, poem in enumerate(final_poems) if index not in bad_poem_indices]
print("Final number of poems: {}".format(len(final_poems)))

# Collapse each poem into one string. Only spaces are stripped from the ends of
# each line, so the trailing newlines that separate the lines are preserved.
final_poems = [''.join([line.strip(' ') for line in poem]) for poem in final_poems]

Initial number of poems: 154
Sonnets 98 and 125 are not 14 lines long so we remove them from our list.
Final number of poems: 152


In [178]:
# Encode the poems as integers:
#   token_map       - maps each word to a number
#   tokenized_poems - the poems with every word replaced by its number
#   token_map_r     - reverse lookup built from token_map (used later to turn
#                     a number back into its word)
(tokenized_poems, token_map) = Naive_HMM_helper.parse_observations(final_poems)
token_map_r = Naive_HMM_helper.obs_map_reverser(token_map)

In [182]:
# Helpful lists
# Syllables
# Each line of the syllable dictionary file is "<word> <count> [<count> ...]",
# where a count prefixed with "E" applies when the word is at the end of a line.
with open("data/Syllable_dictionary.txt", 'r') as syllable_file:
    syllables = [x.split() for x in syllable_file.readlines()]

# We choose to map words to tuples of lists
# the first list corresponds to the number of syllables if the word were at the end (E)
# the second list corresponds to the number of syllables the word can take anywhere
# E.g. "test": ['E1', '2', '3'] <-> "test": ([1], [2, 3])
syllable_dict = {}
for syllable in syllables:
    if not syllable:
        # Blank line in the data file: nothing to record.
        continue
    # Strip punctuation (e.g. apostrophes) so the keys contain word characters only.
    word = re.sub(r'[^\w]', '', syllable[0])
    end_syllable_list = []
    regular_syllable_list = []
    for item in syllable[1:]:
        if item.startswith("E"):
            end_syllable_list.append(int(item[1:]))
        else:
            regular_syllable_list.append(int(item))
    syllable_dict[word] = (end_syllable_list, regular_syllable_list)

syllable_dict

{'gainst': ([], [1]),
 'greeing': ([1], [2]),
 'scaped': ([], [1]),
 'tis': ([], [1]),
 'twixt': ([], [1]),
 'a': ([], [1]),
 'adoting': ([2], [3]),
 'abhor': ([], [2]),
 'abide': ([], [2]),
 'able': ([], [2]),
 'about': ([], [2]),
 'above': ([], [2]),
 'absence': ([], [2]),
 'absent': ([], [2]),
 'abundance': ([], [3]),
 'abundant': ([], [3]),
 'abuse': ([], [2]),
 'abused': ([], [2]),
 'abuses': ([], [3]),
 'abysm': ([], [2]),
 'accents': ([], [2]),
 'acceptable': ([], [4]),
 'acceptance': ([], [3]),
 'accessary': ([], [4]),
 'accident': ([], [3]),
 'accidents': ([], [3]),
 'account': ([], [2]),
 'accumulate': ([], [4]),
 'accuse': ([], [2]),
 'accusing': ([], [3]),
 'achieve': ([], [2]),
 'acknowledge': ([], [3]),
 'acquaintance': ([], [3]),
 'acquainted': ([2], [3]),
 'act': ([], [1]),
 'action': ([], [2]),
 'active': ([], [2]),
 'actor': ([], [2]),
 'add': ([], [1]),
 'added': ([], [2]),
 'adders': ([], [2]),
 'addeth': ([], [2]),
 'adding': ([], [2]),
 'addition': ([], [3]),
 'ad

In [183]:
# Re-key the syllable dictionary by token number instead of by word.
# Words of syllable_dict that have no entry in token_map are skipped.
tokenized_syllable_dict = {}
for word, syllable_counts in syllable_dict.items():
    try:
        token = token_map[word]
    except KeyError:
        continue
    tokenized_syllable_dict[token] = syllable_counts
tokenized_syllable_dict

{476: ([], [1]),
 2594: ([1], [2]),
 2251: ([], [1]),
 925: ([], [1]),
 2016: ([], [1]),
 41: ([], [1]),
 841: ([2], [3]),
 3088: ([], [2]),
 1015: ([], [2]),
 2188: ([], [2]),
 2562: ([], [2]),
 2199: ([], [2]),
 1343: ([], [2]),
 1374: ([], [2]),
 44: ([], [3]),
 2359: ([], [3]),
 216: ([], [2]),
 2152: ([], [2]),
 2719: ([], [3]),
 2552: ([], [2]),
 1923: ([], [2]),
 234: ([], [4]),
 2905: ([], [3]),
 1272: ([], [4]),
 2754: ([], [3]),
 2605: ([], [3]),
 1120: ([], [2]),
 2648: ([], [4]),
 2638: ([], [2]),
 1721: ([], [3]),
 1888: ([], [2]),
 1293: ([], [3]),
 2066: ([], [3]),
 815: ([2], [3]),
 3100: ([], [1]),
 1842: ([], [2]),
 1303: ([], [2]),
 890: ([], [2]),
 1929: ([], [1]),
 2079: ([], [2]),
 2555: ([], [2]),
 2906: ([], [2]),
 844: ([], [2]),
 842: ([], [3]),
 1703: ([], [2]),
 2269: ([], [2]),
 2743: ([], [2]),
 2175: ([], [3]),
 1749: ([], [3]),
 2620: ([], [2]),
 2908: ([], [3]),
 1622: ([], [3]),
 358: ([], [2]),
 2713: ([], [3]),
 2087: ([], [2]),
 1824: ([], [3]),
 12

## Syllable Analysis (implemented in `generate_emissions` in the HMM_helper file)

We deliberately do not generate a single 140-syllable sequence and then split it into 14 lines (which would allow for some extra continuity between lines), because Shakespearean lines are typically treated as new sentences.

In [7]:
# tokenized_poems is a list of poems, each poem a list of lines, each line a
# list of tokens. Drop the poem level so that every training sequence is one
# line of a poem.
flattened_tokenized_poems = []
for tokenized_poem in tokenized_poems:
    flattened_tokenized_poems.extend(tokenized_poem)

# NOTE(review): the positional arguments 2 and 10 are presumably the number of
# hidden states and the number of training iterations - confirm against
# Naive_HMM.unsupervised_HMM.
hmm = naive_unsupervised_HMM(flattened_tokenized_poems, 2, 10)

# Sample a 14-line sonnet, requesting 10 syllables for every line.
print('Naive Sonnet:\n====================')
for line_number in range(14):
    sonnet_line = Naive_HMM_helper.sample_sentence(
        hmm, token_map, tokenized_syllable_dict, num_syllables=10)
    print(sonnet_line)

Iteration: 10
Naive Sonnet:
But from move physic of makst put being;
Or it do for as with lovely my nor;
Tonguetied bow pen your deep worse assure in;
Black show place bear shalt they false speaking in;
Inward oft mayst stay appear which cries live;
Says of best make potions eye seen thy my;
Heart as be thou heart not with me when nor;
Needs and what lovst our is bound to give;
Sits i touches with front stoln in thee still;
Attending and love and slave win thy mow;
Silence and divert but give summer of;
Chopt minutes beside age loves if think and;
That him crowned the though takes a dwell to;
Outgoing controlling pitying me thought;


## Rhyme Analysis

In [132]:
# TODO(Karthik): implement the rhyme analysis here

## Meter Analysis
### Note: we call words that can have different numbers of syllables `ambiguous_syllabic_words`

In [296]:
# Word -> one flat list of every syllable count the word can take
# (the end-of-line counts first, followed by the regular counts).
flattened_syllable_dict = {
    word: [count for counts in syllable_dict[word] for count in counts]
    for word in syllable_dict
}

# Same flattening for the dictionary keyed by token number.
flattened_tokenized_syllable_dict = {
    token: [count for counts in tokenized_syllable_dict[token] for count in counts]
    for token in tokenized_syllable_dict
}

# The value printed here is the largest number of *alternative* syllable counts
# recorded for a single word (the length of its flattened list).
max_alternatives = max(len(counts) for counts in flattened_syllable_dict.values())
print("Maximum number of possible syllables for any word: {}".format(max_alternatives))
flattened_syllable_dict

Maximum number of possible syllables for any word: 2


{'gainst': [1],
 'greeing': [1, 2],
 'scaped': [1],
 'tis': [1],
 'twixt': [1],
 'a': [1],
 'adoting': [2, 3],
 'abhor': [2],
 'abide': [2],
 'able': [2],
 'about': [2],
 'above': [2],
 'absence': [2],
 'absent': [2],
 'abundance': [3],
 'abundant': [3],
 'abuse': [2],
 'abused': [2],
 'abuses': [3],
 'abysm': [2],
 'accents': [2],
 'acceptable': [4],
 'acceptance': [3],
 'accessary': [4],
 'accident': [3],
 'accidents': [3],
 'account': [2],
 'accumulate': [4],
 'accuse': [2],
 'accusing': [3],
 'achieve': [2],
 'acknowledge': [3],
 'acquaintance': [3],
 'acquainted': [2, 3],
 'act': [1],
 'action': [2],
 'active': [2],
 'actor': [2],
 'add': [1],
 'added': [2],
 'adders': [2],
 'addeth': [2],
 'adding': [2],
 'addition': [3],
 'adieu': [2],
 'adjunct': [2],
 'admire': [2],
 'admired': [3],
 'admiring': [3],
 'admit': [2],
 'admitted': [3],
 'adonis': [3],
 'adore': [2],
 'adulterate': [3],
 'advance': [2],
 'advantage': [3],
 'adverse': [2],
 'advised': [2],
 'advocate': [3],
 'afar'

In [297]:
# Words from syllable_dict that can take more than one number of syllables.
# A word is ambiguous when its flattened list of syllable counts has more than
# one entry; the stored value keeps the original (end counts, regular counts)
# tuple from syllable_dict.
# The length check reads flattened_syllable_dict directly, so the cell no longer
# depends on a name that is not defined anywhere in the notebook.
ambiguous_syllabic_words = {
    word: syllable_dict[word]
    for word, counts in flattened_syllable_dict.items()
    if len(counts) > 1
}
ambiguous_syllabic_words

{'acquainted': ([2], [3]),
 'adoting': ([2], [3]),
 'amazeth': ([2], [3]),
 'another': ([2], [3]),
 'any': ([1], [2]),
 'arising': ([2], [3]),
 'assemble': ([2], [3]),
 'assured': ([2], [3]),
 'attainted': ([2], [3]),
 'bearing': ([1], [2]),
 'being': ([], [1, 2]),
 'beloved': ([], [2, 3]),
 'better': ([1], [2]),
 'bevel': ([1], [2]),
 'blindness': ([1], [2]),
 'broken': ([1], [2]),
 'buried': ([3], [2]),
 'chary': ([1], [2]),
 'cherish': ([1], [2]),
 'clearer': ([1], [2]),
 'comment': ([1], [2]),
 'committed': ([2], [3]),
 'controlling': ([2], [3]),
 'convertest': ([2], [3]),
 'correction': ([2], [3]),
 'cover': ([1], [2]),
 'created': ([2], [3]),
 'creature': ([1], [2]),
 'crowned': ([], [1, 2]),
 'cruel': ([1], [2]),
 'dearer': ([1], [2]),
 'dearly': ([1], [2]),
 'deceived': ([2], [3]),
 'deceivest': ([2], [3]),
 'defeated': ([2], [3]),
 'delighted': ([2], [3]),
 'departest': ([2], [3]),
 'deserving': ([2], [3]),
 'desired': ([2], [3]),
 'despised': ([2], [3]),
 'despising': ([2], [

In [298]:
# Token numbers of the ambiguous-syllabic words.
# A word without an entry in token_map has no token number, so it cannot match
# any token in the tokenized lines below. Such words are skipped instead of
# raising a KeyError, consistent with how tokenized_syllable_dict is built.
ambiguous_syllabic_word_tokens = {
    token_map[word] for word in ambiguous_syllabic_words if word in token_map
}

unambiguous_lines = [] # list of lines that contain 1 or fewer words that can have many numbers of syllables
ambiguous_lines = [] # list of lines that contain 2 or more words that can have many numbers of syllables
for line in flattened_tokenized_poems:
    # Number of ambiguous-syllabic tokens in this line (repeats are counted each time).
    count = sum(1 for token in line if token in ambiguous_syllabic_word_tokens)
    if count <= 1:
        unambiguous_lines.append(line)
    else:
        ambiguous_lines.append(line)
print("Number of 'unambiguous lines': {}".format(len(unambiguous_lines)))
print("Number of 'ambiguous lines': {}".format(len(ambiguous_lines)))

Number of 'unambiguous lines': 2088
Number of 'ambiguous lines': 40


We now have a list of 'unambiguous' lines from the poems, each containing at most one ambiguous-syllabic word. This makes it easy to work out whether each word in an 'unambiguous' line starts with a stressed or an unstressed syllable. The remaining words, which appear only in 'ambiguous' lines, are classified manually.

In [299]:
# Collect the 'unambiguous' lines whose syllable total is not 10 under either
# reading of their words.
# Note: Some Shakespeare lines do not have 10 syllables! We assume that they end
# in an unstressed syllable (the unstressed-stressed pattern continues).
weird = []
for line in unambiguous_lines:
    # Total when every word uses the first entry of its syllable-count list ...
    first_choice_total = sum(flattened_tokenized_syllable_dict[token][0] for token in line)
    # ... and when every word uses the last entry instead.
    last_choice_total = sum(flattened_tokenized_syllable_dict[token][-1] for token in line)
    # Iambic pentameter expects 10 syllables; keep the line if neither total matches.
    if not (first_choice_total == 10 or last_choice_total == 10):
        weird.append(line)

print("Lines that do not have 10 syllables:\n")
for line in weird:
    # Turn the tokens back into words; every word is followed by one space.
    print("".join(token_map_r[token] + " " for token in line))

Lines that do not have 10 syllables:

shifts but his place for still the world enjoys it 
and kept unused the user so destroys it 
in thy souls thought all naked will bestow it 
then may i dare to boast how i do love thee 
till then not show my head where thou mayst prove me 
thou dost love her because thou knowst i love her 
and for my sake even so doth she abuse me 
suffring my friend for my sake to approve her 
pity me then dear friend and i assure ye 
even that your pity is enough to cure me 
divert strong minds to the course of altring things 
be it lawful i love thee as thou lovst those 
those lips that loves own hand did make 
breathed forth the sound that said i hate 
to me that languished for her sake 
but when she saw my woeful state 
straight in her heart did mercy come 
chiding that tongue that ever sweet 
was used in giving gentle doom 
and taught it thus anew to greet 
i hate she altered with an end 
that followed it as gentle day 
doth follow night who like a fiend 
from

In [300]:
def _split_by_initial_stress(line, syllable_options, choice):
    """Walk one line and sort its tokens by the stress of their first syllable.

    The running syllable position starts at 0. A token that begins on an even
    position is recorded as starting unstressed, one that begins on an odd
    position as starting stressed (the unstressed-stressed pattern of the line).

    line             -- sequence of tokens
    syllable_options -- maps a token to its list of possible syllable counts
    choice           -- index into that list (0 = first option, -1 = last option)

    Returns (unstressed_tokens, stressed_tokens, total_syllables).
    """
    unstressed_tokens, stressed_tokens = set(), set()
    position = 0
    for token in line:
        if position % 2:
            stressed_tokens.add(token)
        else:
            unstressed_tokens.add(token)
        position += syllable_options[token][choice]
    return unstressed_tokens, stressed_tokens, position


# Both sets start empty and are filled from the 'unambiguous' lines.
starts_stressed = set()    # words that start with a stressed syllable
starts_unstressed = set()  # words that start with an unstressed syllable

# We exploit that each unambiguous line has at most one ambiguous-syllabic word
# and that each ambiguous-syllabic word has at most 2 numbers of syllables.
# Note: Some Shakespeare lines do not have 10 syllables! We assume that they end
# in an unstressed syllable (the unstressed-stressed pattern continues).
for line in unambiguous_lines:
    # First try the first syllable count of every word.
    unstressed, stressed, total = _split_by_initial_stress(
        line, flattened_tokenized_syllable_dict, 0)
    if total != 10:
        # Otherwise fall back to the last syllable count of every word; this
        # reading is accepted whatever total it produces.
        unstressed, stressed, total = _split_by_initial_stress(
            line, flattened_tokenized_syllable_dict, -1)
    starts_unstressed |= unstressed
    starts_stressed |= stressed

Finding the stressed/unstressed syllables per word is pretty difficult for the ambiguous lines, so we manually classify the words we have not seen yet.

In [301]:
# Tokens that have a syllable entry but were not classified as starting
# stressed or unstressed by the pass over the unambiguous lines.
classified_tokens = starts_stressed.union(starts_unstressed)
unseen_words = set(tokenized_syllable_dict.keys()) - classified_tokens

# Show each unseen word next to its token number, then how many there are.
for token in unseen_words:
    print(token_map_r[token], token)
print(len(unseen_words))

deformedst 2574
creature 2575
secret 658
fulness 1683
wooed 1941
dulness 1686
victor 1948
charged 1949
maintain 2722
weakness 2228
novel 2740
staineth 1207
gathered 2752
tempests 2627
inviting 2762
mended 1744
crawls 1759
maturity 1760
wherewith 1761
neercloying 2661
chary 881
sweetness 2296
rightly 2301
inherit 2302
24


In [302]:
# Manual stress labels for the words that the automatic pass did not classify.
# The words are listed by name and converted through token_map, rather than
# hard-coding token numbers, so the labels stay attached to the right words if
# the tokenization ever assigns different numbers. A word that is missing from
# token_map raises a KeyError here instead of silently labelling another word.
manually_stressed_words = [
    'creature', 'secret', 'fulness', 'wooed', 'dulness', 'victor', 'charged',
    'weakness', 'novel', 'staineth', 'gathered', 'tempests', 'mended',
    'crawls', 'wherewith', 'neercloying', 'chary', 'sweetness', 'rightly',
]
manually_unstressed_words = [
    'deformedst', 'maintain', 'inviting', 'maturity', 'inherit',
]

# union() returns new sets, so re-running this cell gives the same result.
starts_stressed = starts_stressed.union(
    token_map[word] for word in manually_stressed_words)
starts_unstressed = starts_unstressed.union(
    token_map[word] for word in manually_unstressed_words)

In [303]:
# Sanity checks on the stress classification.
all_tokens = set(tokenized_syllable_dict.keys())
classified_tokens = starts_stressed.union(starts_unstressed)

# Tokens with a syllable entry that are still unclassified.
print(all_tokens - classified_tokens)
# Tokens recorded as starting both stressed and unstressed.
print(len(starts_stressed & starts_unstressed))
# Tokens classified at all.
print(len(classified_tokens))
# Tokens recorded only as starting stressed.
print(len(starts_stressed - starts_unstressed))
# Tokens recorded only as starting unstressed.
print(len(starts_unstressed - starts_stressed))

set()
460
3148
1883
805


In [309]:
# Let's see how well this analysis worked.
# Words that were only ever recorded as starting with a stressed syllable:
only_stressed_tokens = starts_stressed - starts_unstressed
[token_map_r[token] for token in only_stressed_tokens]

['fairest',
 'creatures',
 'rose',
 'riper',
 'tender',
 'heir',
 'memory',
 'flame',
 'selfsubstantial',
 'fuel',
 'famine',
 'foe',
 'cruel',
 'ornament',
 'herald',
 'gaudy',
 'spring',
 'buriest',
 'niggarding',
 'world',
 'else',
 'glutton',
 'grave',
 'forty',
 'winters',
 'brow',
 'dig',
 'trenches',
 'field',
 'youths',
 'livery',
 'gazed',
 'tattered',
 'weed',
 'held',
 'asked',
 'treasure',
 'lusty',
 'sunken',
 'shame',
 'thriftless',
 'answer',
 'child',
 'sum',
 'blood',
 'feelst',
 'viewest',
 'mother',
 'uneared',
 'womb',
 'tillage',
 'husbandry',
 'fond',
 'tomb',
 'stop',
 'mothers',
 'back',
 'lovely',
 'april',
 'prime',
 'windows',
 'age',
 'wrinkles',
 'golden',
 'single',
 'image',
 'dies',
 'loveliness',
 'spend',
 'legacy',
 'lend',
 'frank',
 'lends',
 'free',
 'niggard',
 'bounteous',
 'largess',
 'given',
 'usurer',
 'sums',
 'traffic',
 'nature',
 'acceptable',
 'audit',
 'leave',
 'tombed',
 'used',
 'hours',
 'work',
 'frame',
 'gaze',
 'every',
 'dwell'

In [311]:
# Words that were only ever recorded as starting with an unstressed syllable:
only_unstressed_tokens = starts_unstressed - starts_stressed
[token_map_r[token] for token in only_unstressed_tokens]

['imprint',
 'desire',
 'increase',
 'eternity',
 'contain',
 'commit',
 'delivered',
 'acquaintance',
 'decease',
 'enrich',
 'invoked',
 'assistance',
 'contracted',
 'disperse',
 'aloft',
 'feedst',
 'lights',
 'compile',
 'advance',
 'decayed',
 'deserves',
 'abundance',
 'behaviour',
 'afford',
 'thereof',
 'spends',
 'within',
 'inferior',
 'broad',
 'content',
 'afloat',
 'tall',
 'forgotten',
 'immortal',
 'entombed',
 'oerread',
 'attaint',
 'besiege',
 'oerlook',
 'finding',
 'enforced',
 'anew',
 'timebettering',
 'devised',
 'plain',
 'truetelling',
 'gross',
 'abused',
 'exceed',
 'impute',
 'impair',
 'alleating',
 'confine',
 'immured',
 'example',
 'lean',
 'deserved',
 'couldst',
 'admired',
 'richly',
 'compiled',
 'unlettered',
 'proving',
 'succession',
 'amen',
 'affords',
 'warm',
 'refined',
 'hearing',
 'inhearse',
 'above',
 'compeers',
 'another',
 'giving',
 'repair',
 'astonished',
 'renewest',
 'familiar',
 'beguile',
 'unbless',
 'intelligence',
 'enfeeble