# CS155 Project 3: Shakespearean Sonnets

In [234]:
import random
import numpy as np
import os
import nltk
from nltk.corpus import cmudict
from HMM_Project3 import unsupervised_HMM
from nltk.tokenize import TweetTokenizer
# Shared tokenizer instance; the preprocessing functions in this notebook
# call tknzr.tokenize() on each line of the sonnet text.
tknzr = TweetTokenizer()

## Preprocessing:

#### Initial Attempt:
- Process the words line by line
- Remove lines containing the poem numbering (1, 2, 3, etc.)
- Change all the words to lowercase
- Uses TweetTokenizer to separate words (retains apostrophes and hyphens)
- Remove all punctuation

In [89]:
def preprocess_init(text):
    """Encode the sonnet text as one sequence of integer word ids per line.

    Blank lines and lines whose first token is numeric (the poem numbering)
    are skipped.  Tokens that match the punctuation string are dropped and
    every remaining token is lowercased before it is assigned an id.

    Args:
        text: the raw text, with lines separated by newline characters.

    Returns:
        A pair (obs, obs_map).  obs is a list holding one list of word ids
        per kept line; obs_map maps each lowercased token to its id, with
        ids handed out in order of first appearance.
    """
    # Tokens matched against this string (substring test) are discarded.
    punct = ".',':;!?()"

    obs = []
    obs_map = {}

    for raw_line in text.split('\n'):
        tokens = tknzr.tokenize(raw_line)

        # Nothing to encode on blank lines or poem-number lines.
        if tokens == [] or tokens[0].isnumeric():
            continue

        encoded = []
        for token in tokens:
            if token in punct:
                continue

            token = token.lower()
            if token not in obs_map:
                # The next free id is always the current size of the map.
                obs_map[token] = len(obs_map)

            encoded.append(obs_map[token])

        obs.append(encoded)

    return obs, obs_map

In [90]:
# Read the raw sonnet text; the with-block closes the file handle as soon as
# the contents have been read instead of leaving it open.
with open(os.path.join(os.getcwd(), 'data/shakespeare.txt')) as sonnet_file:
    text = sonnet_file.read()
obs, obs_map = preprocess_init(text)

## Unsupervised Learning and Poetry Generation with HMMs:

If we were to do a training/testing split and use validation to determine the number of states, we wouldn't be able to guarantee that every word would end up in the training set, since some words only appear once in all of the poems. So, we will instead generate some sample poems and subjectively judge the best number of states, as suggested on Piazza.

In [92]:
def obs_map_reverser(obs_map):
    """Return the inverse of obs_map: a dict from each id back to its word.

    If several words share the same id, the one iterated last wins.
    """
    return {index: word for word, index in obs_map.items()}

#### Initial Attempt:
- Determine number of words in each line by sampling randomly from all of the line lengths
- Dictate all of the end-line punctuation to be commas except for the final line, which ends with a period.
- Use the characteristic 14-line structure.

In [101]:
# Number of encoded words in each Shakespeare line held in obs
line_lens = list(map(len, obs))

In [99]:
def generate_poem_init(hmm, obs_map, line_lens):
    """Generate a 14-line poem from an HMM.

    For each line a word count is drawn at random from line_lens, the HMM is
    asked for an emission of that length, and the emitted ids are mapped
    back to words through the reversed obs_map.  Each line is capitalized;
    the first 13 lines end with a comma and a newline, the last one with a
    period.

    Args:
        hmm: object exposing generate_emission(n), which returns a pair
            (emission, states).
        obs_map: dict mapping words to observation ids.
        line_lens: sequence of candidate line lengths (in words).

    Returns:
        The poem as a single string.
    """
    id_to_word = obs_map_reverser(obs_map)
    total_lines = 14

    verses = []
    for _ in range(total_lines):
        # Draw the length first, then request that many observations.
        length = random.choice(line_lens)
        emission, _states = hmm.generate_emission(length)
        words = [id_to_word[obs_id] for obs_id in emission]
        verses.append(' '.join(words).capitalize())

    # Comma + newline between lines, a period after the final one.
    return ",\n".join(verses) + "."

Generate poems for 2, 4, 8, and 16 hidden states to assess coherency:

In [105]:
# Train an HMM with 2 hidden states on the encoded lines and print one sample
# poem.  (The third argument, 100, is presumably the number of training
# iterations -- confirm against HMM_Project3.)
hmm2 = unsupervised_HMM(obs, 2, 100)
print('\nSample Poem:\n====================')
print(generate_poem_init(hmm2, obs_map, line_lens))


Sample Poem:
Of climbed so same shop and they place from,
Walls of each what my her state be,
Dear me posterity within in ) love's thy,
Poet white verse thy the gone dote my,
The me to that within heaven dispense i,
From me or it beated look papers sharpened,
Like set here's all insults adder's jacks that esteeming,
For upon the all of the to fool,
Are or course the rest swear your o makes,
That bett'ring a thee beauty me for poor,
Kind are anew tyrant worth lose ocean change should i,
You more put my as which strive,
And seeking and then and that,
Truth the watchman nor his for use they.


In [106]:
# Train an HMM with 4 hidden states on the encoded lines and print one sample
# poem.  (The third argument, 100, is presumably the number of training
# iterations -- confirm against HMM_Project3.)
hmm4 = unsupervised_HMM(obs, 4, 100)
print('\nSample Poem:\n====================')
print(generate_poem_init(hmm4, obs_map, line_lens))


Sample Poem:
Fair hide share beauty's tongue and of will,
Both with fear are his that,
Of is were but what others hate thy lest with,
In but dear thee mistress hath,
His sweetest send thy to votary and cast,
Long give and wand'ring still whereto to eyes cherubins,
To gave methinks my argument side my is enmity,
Vexed o was on i and you to,
Thy quite eyes your to paying indeed conspire which,
Memory thou laid impute niggard my ushers happy and,
Particulars power and tell aspect flatter his full none,
Heaven the sight saucy state a pattern beauties sweet hung left,
And thee mine your buds,
No frown this my black to this tell best.


In [112]:
# Train an HMM with 8 hidden states on the encoded lines and print one sample
# poem.  (The third argument, 100, is presumably the number of training
# iterations -- confirm against HMM_Project3.)
hmm8 = unsupervised_HMM(obs, 8, 100)
print('\nSample Poem:\n====================')
print(generate_poem_init(hmm8, obs_map, line_lens))


Sample Poem:
Like found in importune the with this waste,
Title a as a interest beauty's sickness pent thee,
Made by in up in my pierced in strong,
Better burthen leaves with clouds so excellent,
Be gainst at much so perish who of,
See that nymphs let unfair of,
All jacks those grief's moment yours touches work,
Nor it think if thee for,
That or we have thy straight is thy,
Or and but boast my slave or,
Bars breast both sun dead thy in fears,
Despise the every dear-purchased decay of my,
Dateless all the expiate sweets upon that form,
Her appetite life that find me seal from self.


In [120]:
# Train an HMM with 16 hidden states on the encoded lines and print one sample
# poem.  (The third argument, 100, is presumably the number of training
# iterations -- confirm against HMM_Project3.)
hmm16 = unsupervised_HMM(obs, 16, 100)
print('\nSample Poem:\n====================')
print(generate_poem_init(hmm16, obs_map, line_lens))


Sample Poem:
Merits had they thought acquaintance in boast bud,
If that them again from kind,
And and much to fair sounds renewed from,
Of on hast golden might their for their store,
Morning then sing when nor respect and jaws,
Or best use the music world is,
Gems tables dull broke holds me it mind when suffered,
Some why for pace than and majesty prisoner,
My no bonds from my,
Am see courses hath the sweet moon as,
And and mother it there upon ill,
Taste called live i use death's heart with fortune,
World soil of my brave treasure and thy tears,
Brave sweet grief untold eye to seeming why heaven lost.


The poems are all generally pretty nonsensical, but grammatically the poem with 8 hidden states and the poem with 16 hidden states performed significantly better. Since their performance grammatically was relatively similar, and both are still relatively thematically uncoordinated, for the sake of the time tradeoff we will use 8 hidden states for further generation/improvements.

## Additional Goals:

In the following preprocessing and generation functions, we modified them to attempt to include the following aspects from the actual Shakespearean poetry:

#### Rhyme
We implement the *abab cdcd efef gg* rhyme scheme by making a dictionary of all rhyming end pairs during pre-processing, and by seeding each paired phrase with a randomly generated pair from the dictionary and generating the poetry in reverse.

#### Syllable Count (10)
We implement the 10 syllable count by counting as we generate an emission and limiting the possibilities for words as we reach the end so that we end up at 10 syllables.

#### Punctuation
Since end-of-line punctuation has more to do with poetic structure than with the preceding word, we generate it by building, for each line number, a distribution of the punctuation that usually ends that line and then sampling from that distribution (with the exception of the final line, which always ends with a period).

To do this, we first need to parse the syllable counts:

In [216]:
def parse_syll_text(syll_text):
    """Parse the syllable dictionary text into a dict.

    Each non-blank line is split on whitespace: the first field, lowercased,
    becomes the key and the remaining fields (kept as strings) become the
    value.  A word that appears on several lines keeps its last entry.

    Args:
        syll_text: contents of the syllable dictionary, one word per line.

    Returns:
        dict mapping each lowercased word to its list of syllable fields.
    """
    syll_dict = {}

    for raw_line in syll_text.split('\n'):
        fields = raw_line.split()
        # Blank or whitespace-only lines carry no entry.
        if not fields:
            continue

        head, *counts = fields
        syll_dict[head.lower()] = counts

    return syll_dict

For the purpose of getting the words to match up, the Syllable_dictionary file was changed so that words with a leading apostrophe (i.e. 'gainst) were modified to not have the leading apostrophe, due to the way TweetTokenizer parses the words.

In [217]:
# Read the syllable dictionary; the with-block closes the file handle as soon
# as the contents have been read instead of leaving it open.
with open(os.path.join(os.getcwd(), 'data/Syllable_dictionary.txt')) as syll_file:
    syll_text = syll_file.read()
syll_dict_text = parse_syll_text(syll_text)

We also wrote a helper function for making the dictionary key for the rhyme_dict that returns the key and the new exception count.

In [218]:
def get_rhyme_key(rhyme, pair, excep):
    """Build the rhyme_dict key for two words from their pronunciations.

    Every pronunciation in rhyme is compared with every pronunciation in
    pair, counting how many trailing phonemes they share.  The key is the
    longest shared ending found (phonemes joined by spaces, taken from the
    rhyme pronunciation); on ties the first combination encountered wins.

    Args:
        rhyme: list of pronunciations (each a sequence of phonemes) for the
            first word.
        pair: list of pronunciations for the second word.
        excep: running count of pairs for which no shared ending was found.

    Returns:
        A pair (key, excep).  When no shared ending exists (for example when
        either pronunciation list is empty) the key is 'excep' followed by
        the incoming counter, and the returned counter is incremented.
    """
    best_len = 0
    best_key = ''

    for first in rhyme:
        for second in pair:
            # Walk both pronunciations backwards until they diverge.
            shared = 0
            for ph_a, ph_b in zip(reversed(first), reversed(second)):
                if ph_a != ph_b:
                    break
                shared += 1

            # Strictly greater, so the earliest best match is kept.
            if shared > best_len:
                best_len = shared
                best_key = ' '.join(first[-shared:])

    # No usable shared ending: fall back to a unique 'excep#' key.
    if best_key == '':
        return 'excep' + str(excep), excep + 1

    return best_key, excep

Finally, we can write our updated pre-processing function; note that sonnets 99 and 126 have irregular line counts/rhyme schemes, so sonnet 99 is excluded from rhyme processing and sonnet 126 is parsed as rhyming couplets:

In [221]:
def _has_letter(token):
    # True when the token contains at least one alphabetic character.
    return any(ch.isalpha() for ch in token)


def _add_rhyme_pair(rhyme_dict, obs_map, opener, word, word_prons, excep):
    # File the ids of the opening word and of `word` under their shared rhyme
    # key in rhyme_dict and return the updated exception counter.
    if opener is None:
        # No opening line has been seen in this poem; nothing to pair with.
        return excep

    opener_word, opener_prons = opener
    common, excep = get_rhyme_key(opener_prons, word_prons, excep)
    rhyme_dict.setdefault(common, []).extend(
        [obs_map[opener_word], obs_map[word]])
    return excep


def preprocess(text, syll_dict_text):
    """Encode the sonnets and collect rhyme, syllable and punctuation data.

    Lines whose first token is numeric start a new poem (the line counter is
    reset and the poem number recorded).  Every other non-blank line is
    encoded as a list of word ids, and on the way:

    * a punctuation token that ends a line is recorded in punct_dict under
      that line's index within the poem;
    * the first time a word is seen, its id is added to syll_dict under each
      purely numeric syllable count listed for it in syll_dict_text (the
      non-numeric "ending" counts are skipped);
    * the last word of the line is paired with the last word of the line it
      rhymes with and both ids are stored in rhyme_dict under the key
      produced by get_rhyme_key.  Regular sonnets are read as
      abab cdcd efef gg, sonnet 126 as couplets, and sonnet 99 is skipped.

    Args:
        text: raw sonnet text, lines separated by newline characters.
        syll_dict_text: dict mapping lowercased words to their list of
            syllable-count strings.

    Returns:
        A tuple (obs, obs_map, rhyme_dict, syll_dict, punct_dict):
        obs is a list of lists of word ids (one per encoded line), obs_map
        maps words to ids, rhyme_dict maps rhyme keys to lists of word ids,
        syll_dict maps a syllable count to the ids of words with that count,
        and punct_dict maps a line index to the end-of-line punctuation
        tokens seen at that index.
    """
    # Convert text to dataset.
    lines = text.split('\n')

    obs_counter = 0
    obs = []
    obs_map = {}
    rhyme_dict = {}
    syll_dict = {k: [] for k in range(6)}  # Maximum syllables in shakespeare.txt is 5
    punct_dict = {k: [] for k in range(15)}

    punct = ".',:;!?()"

    # Word -> list of pronunciations.  Built once so each end word costs one
    # dictionary lookup instead of a scan over every cmudict entry.
    pron_lookup = cmudict.dict()

    line_num = 0
    poem_num = 0

    # (word, pronunciations) of the line that opens the current a / b rhyme.
    a_rhyme = None
    b_rhyme = None
    excep = 0

    for line in lines:
        # Separate into words using TweetTokenizer
        sentence = tknzr.tokenize(line)
        # Skip if line is empty
        if not sentence:
            continue

        # If the line is a new poem, restart the numbering and forget the
        # rhyme words of the previous poem.
        if sentence[0].isnumeric():
            line_num = 0
            poem_num = int(sentence[0])
            a_rhyme = None
            b_rhyme = None
            continue

        # Locate the one token that is the line's last word, so the rhyme
        # bookkeeping runs exactly once per line whether or not the line
        # ends in punctuation.
        last_word_idx = -1
        for idx, token in enumerate(sentence):
            if token not in punct and _has_letter(token):
                last_word_idx = idx

        obs_elem = []

        for i, word in enumerate(sentence):
            if word in punct:
                # Punctuation is not encoded; only the token that ends the
                # line is remembered, under this line's index.
                if i == len(sentence) - 1:
                    punct_dict.setdefault(line_num, []).append(word)
                continue

            # Turn to lowercase
            word = word.lower()
            if word not in obs_map:
                # Add unique words to the observations map.
                obs_map[word] = obs_counter
                obs_counter += 1
                # Register the word under each of its syllable counts.
                for syll_num in syll_dict_text.get(word, ()):
                    # Skip the ending-only counts (since we know we will
                    # only use rhyme words for ending words).
                    if syll_num.isnumeric():
                        syll_dict.setdefault(int(syll_num), []).append(obs_map[word])

            if i == last_word_idx:
                word_prons = pron_lookup.get(word, [])

                if poem_num == 126:  # Irregular because aabbccddeeff
                    if line_num % 2 == 0:
                        # First line of the couplet
                        a_rhyme = (word, word_prons)
                    else:
                        # Second line of the couplet rhymes with the first
                        excep = _add_rhyme_pair(
                            rhyme_dict, obs_map, a_rhyme, word, word_prons, excep)

                elif poem_num != 99:  # Excluded because ababa cdcd efef gg
                    # Since the quatrains all have the same abab structure,
                    # we can parse modulo 4; line 13 closes the final couplet.
                    if line_num % 4 == 0:
                        a_rhyme = (word, word_prons)
                    elif line_num % 4 == 1 and line_num != 13:
                        b_rhyme = (word, word_prons)
                    elif line_num % 4 == 2 or line_num == 13:
                        excep = _add_rhyme_pair(
                            rhyme_dict, obs_map, a_rhyme, word, word_prons, excep)
                    else:
                        excep = _add_rhyme_pair(
                            rhyme_dict, obs_map, b_rhyme, word, word_prons, excep)

            # Add the encoded word.
            obs_elem.append(obs_map[word])

        # Add the encoded sequence.
        obs.append(obs_elem)

        # Increment the line numbering
        line_num += 1

    return obs, obs_map, rhyme_dict, syll_dict, punct_dict

In [222]:
# Read the raw sonnet text; the with-block closes the file handle as soon as
# the contents have been read instead of leaving it open.
with open(os.path.join(os.getcwd(), 'data/shakespeare.txt')) as sonnet_file:
    text = sonnet_file.read()
obs, obs_map, rhyme_dict, syll_dict, punct_dict = preprocess(text, syll_dict_text)

Since rhyme_dict has duplicates, we go through and use np.unique to eliminate them for better processing:

In [229]:
# Replace each rhyme group by its distinct word ids (np.unique returns them
# as a sorted array).  Only existing keys are reassigned, so the dict keeps
# its size while it is being iterated.
for scheme, members in rhyme_dict.items():
    rhyme_dict[scheme] = np.unique(members)

Next, we need to modify the emission and poetry generation functions to utilize our rhyme, syllable, and punctuation data.

#### Emission:
We are now seeding each phrase with our given rhyming word, so we now select our start state as the one that has the highest probability of generating that word. 

#### Generation:

Now that we have the obs_map, we also need to generate a backwards dictionary for the syllables so we can count for emissions:

In [236]:
def obs_to_syll(obs_map, syll_dict_text):
    """Map each observation id to the syllable counts its word can take.

    Only purely numeric counts are kept; the non-numeric ending counts are
    left out because generation always seeds a line with its last word.
    Words of syll_dict_text that are missing from obs_map are ignored.

    Args:
        obs_map: dict mapping words to observation ids.
        syll_dict_text: dict mapping words to lists of syllable-count
            strings.

    Returns:
        dict mapping observation ids to lists of integer syllable counts.
    """
    o2s_dict = {}

    for word, counts in syll_dict_text.items():
        numeric_counts = [int(count) for count in counts if count.isnumeric()]
        if word in obs_map:
            o2s_dict[obs_map[word]] = numeric_counts

    return o2s_dict

In [237]:
# Observation id -> list of numeric syllable counts for that word
o2s_dict = obs_to_syll(obs_map, syll_dict_text)