## Preprocessing for Spenser Sonnets

In [1]:
import random
import numpy as np
import os
import nltk
from nltk.corpus import cmudict
from HMM_Project3 import unsupervised_HMM
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()

 The main difference between the Shakespeare and Spenser sonnets is the way they are numbered in the text file. Spenser sonnets are marked with Roman numerals, which are registered as letters, so to eliminate them from the data, we check whether each non-empty line has at most 10 characters. If this is the case, then it must be a Roman numeral, as all other lines clearly have more characters. We combined the sonnet files by reading both of them into the preprocessing function and merging them into one list of lines inside the function.

In [9]:
def preprocess_init(text1, text2):
    """Encode the lines of two sonnet files as sequences of word indices.

    Both texts are split on newlines and processed as one list of lines.
    Each verse line is tokenized with the module-level ``tknzr``,
    punctuation tokens are dropped, and the remaining tokens are
    lowercased and mapped to consecutive integer ids.

    Args:
        text1: Full contents of the first sonnet file.
        text2: Full contents of the second sonnet file.

    Returns:
        A tuple ``(obs, obs_map)`` where ``obs`` is a list with one list of
        word ids per verse line and ``obs_map`` maps each lowercased word
        to its id.
    """
    lines = text1.split('\n') + text2.split('\n')

    obs_counter = 0
    obs = []
    obs_map = {}
    # A token counts as punctuation when it is a substring of this string.
    punct = ".',':;!?()"

    for line in lines:
        # Separate into tokens using TweetTokenizer.
        sentence = tknzr.tokenize(line)

        # Skip empty lines.
        if not sentence:
            continue

        # Skip poem headings. Short lines are Roman numerals; a purely
        # numeric first token catches Arabic-number headings even when
        # indentation makes the raw line longer than 10 characters. This is
        # the same heading test that `preprocess` uses.
        if len(line) <= 10 or sentence[0].isnumeric():
            continue

        obs_elem = []
        for word in sentence:
            # Remove punctuation tokens.
            if word in punct:
                continue

            word = word.lower()
            if word not in obs_map:
                # Add unique words to the observations map.
                obs_map[word] = obs_counter
                obs_counter += 1

            # Add the encoded word.
            obs_elem.append(obs_map[word])

        # Add the encoded sequence.
        obs.append(obs_elem)

    return obs, obs_map

In [10]:
# Read both corpora with context managers so the file handles are closed
# as soon as the text has been loaded.
with open(os.path.join(os.getcwd(), 'data/spenser.txt')) as spenser_file:
    text1 = spenser_file.read()
with open(os.path.join(os.getcwd(), 'data/shakespeare.txt')) as shakespeare_file:
    text2 = shakespeare_file.read()

# Encode every verse line of both corpora as a list of word ids.
obs, obs_map = preprocess_init(text1, text2)


In [11]:
def obs_map_reverser(obs_map):
    """Return the inverse of ``obs_map``: word id -> word.

    If several words share an id, the one that appears last in
    ``obs_map`` wins.
    """
    return {index: word for word, index in obs_map.items()}

In [12]:
# Number of words in every encoded line of `obs`
line_lens = list(map(len, obs))

In [13]:
def generate_poem_init(hmm, obs_map, line_lens):
    """Generate a naive 14-line poem from the HMM.

    Each line's word count is drawn at random from ``line_lens`` and the
    words come from ``hmm.generate_emission``. Lines are capitalized; the
    first 13 end with a comma and a newline, the last with a period.

    Args:
        hmm: Object exposing ``generate_emission(n)`` that returns an
            ``(emission, states)`` pair.
        obs_map: Mapping from word to word id.
        line_lens: Non-empty sequence of candidate line lengths in words.

    Returns:
        The poem as a single string.
    """
    index_to_word = obs_map_reverser(obs_map)
    total_lines = 14
    rendered = []

    for line_idx in range(total_lines):
        # Draw the desired length of this line, then sample that many words.
        n_words = random.choice(line_lens)
        emission, _states = hmm.generate_emission(n_words)

        text = ' '.join(index_to_word[token] for token in emission).capitalize()
        text += ",\n" if line_idx < total_lines - 1 else "."
        rendered.append(text)

    return "".join(rendered)

 Preprocessing done on the Shakespeare sonnets determined that 8 states was optimal, so that is what we use here.

In [14]:
# Fit the HMM on the encoded lines, then show one naive sample poem.
hmm8 = unsupervised_HMM(obs, 8, 100)
print(
    '\nSample Poem:\n====================',
    generate_poem_init(hmm8, obs_map, line_lens),
    sep='\n',
)

Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
Iteration: 60
Iteration: 70
Iteration: 80
Iteration: 90
Iteration: 100

Sample Poem:
Your so converted in quite not planet register,
This you comfort him away lord shaken slow write in,
Away what of kindle making trespass growing thy,
Bit lovers for with my for lest thine,
Sunset to any and and battle,
Life t'adorn that fairest is she do that dear my,
I leads them and to what to find,
Light to grow doom unto thine most of sad,
She others my your thing and seek may,
Dost more with thereof of that back were for,
The sweetest how come i that souls that to,
To warrior servant and but time's dear,
Thence where and doth immortalize no tell him,
This which hounds heart leaped the me them take.


# Additional Goals

In [15]:
def parse_syll_text(syll_text):
    """Parse the syllable dictionary text into a dict.

    Every non-blank line is split on whitespace. The first field,
    lowercased, is the key and the remaining fields (kept as strings) are
    the value. A word that appears twice keeps its last entry.
    """
    rows = (raw_line.split() for raw_line in syll_text.split('\n'))
    return {fields[0].lower(): fields[1:] for fields in rows if fields}

In [16]:
# Read the syllable dictionary with a context manager so the handle is closed.
with open(os.path.join(os.getcwd(), 'data/Syllable_dictionary.txt')) as syll_file:
    syll_text = syll_file.read()

# word -> list of syllable-count strings as they appear in the file
syll_dict_text = parse_syll_text(syll_text)

In [17]:
def get_rhyme_key(rhyme, pair, excep):
    """Derive a rhyme key from two words' pronunciation lists.

    Every pronunciation in ``rhyme`` is compared with every pronunciation
    in ``pair``, counting how many trailing phonemes they share. The
    longest shared tail, joined with spaces, becomes the key.

    Args:
        rhyme: List of pronunciations (each a sequence of phonemes) for the
            first word.
        pair: List of pronunciations for the second word.
        excep: Counter used to build fallback keys.

    Returns:
        ``(key, excep)``. When no tail is shared (including when either
        list is empty) the key is ``'excep<N>'`` and the returned counter
        is incremented by one.
    """
    best_tail = ''
    best_len = 0

    for first in rhyme:
        for second in pair:
            # Walk both pronunciations from the end while they agree.
            shared = 0
            for left, right in zip(reversed(first), reversed(second)):
                if left != right:
                    break
                shared += 1

            # Keep the strictly longest tail seen so far.
            if shared > best_len:
                best_len = shared
                best_tail = ' '.join(first[-shared:])

    # No usable tail: file the pair under a unique 'excep#' key instead.
    if best_tail == '':
        return 'excep' + str(excep), excep + 1

    return best_tail, excep

In [None]:
def _add_rhyme_pair(rhyme_dict, obs_map, anchor, word, word_prons, excep):
    """Record ``anchor``'s word and ``word`` under their shared rhyme key.

    ``anchor`` is a ``(word, pronunciations)`` tuple saved from an earlier
    line. Returns the (possibly incremented) exception counter.
    """
    if not anchor:
        # No earlier line-ending word is available to pair with.
        return excep

    common, excep = get_rhyme_key(anchor[1], word_prons, excep)
    rhyme_dict.setdefault(common, []).extend([obs_map[anchor[0]], obs_map[word]])
    return excep


def preprocess(text1, text2, syll_dict_text):
    """Encode two sonnet files and collect rhyme, syllable and punctuation data.

    Args:
        text1: Full contents of the first sonnet file.
        text2: Full contents of the second sonnet file.
        syll_dict_text: Mapping from lowercased word to its list of
            syllable-count strings (as produced by ``parse_syll_text``).

    Returns:
        A tuple ``(obs, obs_map, rhyme_dict, syll_dict, punct_dict)``:
          * ``obs``: one list of word ids per verse line.
          * ``obs_map``: lowercased word -> word id.
          * ``rhyme_dict``: rhyme key -> list of word ids of line-ending
            words that were paired under that key (may contain duplicates).
          * ``syll_dict``: syllable count -> word ids having that count;
            only purely numeric count strings are used.
          * ``punct_dict``: line index within a poem -> punctuation tokens
            that ended a line at that index.
    """
    lines = text1.split('\n') + text2.split('\n')

    obs_counter = 0
    obs = []
    obs_map = {}
    rhyme_dict = {}
    syll_dict = {k: [] for k in range(6)}  # Maximum syllables in shakespeare.txt is 5
    punct_dict = {k: [] for k in range(15)}

    # A token counts as punctuation when it is a substring of this string.
    punct = ".',:;!?()"

    # Poems skipped when collecting rhymes. `poem_num` is a token (a string),
    # so the Arabic numbers must be strings too: comparing against the
    # integers 99 and 126 never matches.
    irregular_poems = ("LXXXIV", "99", "126")

    # Build the word -> pronunciations table once instead of scanning every
    # cmudict entry for each line-ending word.
    pronunciations = cmudict.dict()

    line_num = 0
    poem_num = ""

    a_rhyme = ()
    b_rhyme = ()
    excep = 0

    for line in lines:
        # Separate into tokens using TweetTokenizer.
        sentence = tknzr.tokenize(line)

        # Skip empty lines.
        if not sentence:
            continue

        # A heading starts a new poem: restart the line numbering.
        if len(line) <= 10 or sentence[0].isnumeric():
            line_num = 0
            poem_num = sentence[0]
            continue

        # Index of the last non-punctuation token, i.e. the rhyming word.
        # Computing it once ensures only that word is treated as the line
        # ending, even when the line has no trailing punctuation.
        word_positions = [idx for idx, token in enumerate(sentence) if token not in punct]
        last_word_idx = word_positions[-1] if word_positions else -1

        obs_elem = []
        for i, word in enumerate(sentence):
            if word in punct:
                # Only punctuation that closes the line is remembered.
                if i == len(sentence) - 1:
                    punct_dict.setdefault(line_num, []).append(word)
                continue

            word = word.lower()
            if word not in obs_map:
                # Add unique words to the observations map.
                obs_map[word] = obs_counter
                obs_counter += 1
                # Register the word under each numeric syllable count.
                # Non-numeric (ending-only) counts are skipped because
                # line-ending words are chosen through the rhyme dictionary.
                for syll_num in syll_dict_text.get(word, []):
                    if syll_num.isnumeric():
                        syll_dict.setdefault(int(syll_num), []).append(obs_map[word])

            if i == last_word_idx and poem_num not in irregular_poems:
                word_prons = pronunciations.get(word, [])
                # Quatrains rhyme abab, so the position is line_num modulo 4.
                # Line 13 closes the couplet and pairs with line 12.
                if line_num % 4 == 0:
                    a_rhyme = (word, word_prons)
                elif line_num % 4 == 1 and line_num != 13:
                    b_rhyme = (word, word_prons)
                elif line_num % 4 == 2 or line_num == 13:
                    excep = _add_rhyme_pair(rhyme_dict, obs_map, a_rhyme,
                                            word, word_prons, excep)
                else:
                    excep = _add_rhyme_pair(rhyme_dict, obs_map, b_rhyme,
                                            word, word_prons, excep)

            # Add the encoded word.
            obs_elem.append(obs_map[word])

        # Add the encoded sequence.
        obs.append(obs_elem)

        # Increment the line numbering.
        line_num += 1

    return obs, obs_map, rhyme_dict, syll_dict, punct_dict

In [None]:
# Read both corpora with context managers so the file handles are closed
# as soon as the text has been loaded.
with open(os.path.join(os.getcwd(), 'data/spenser.txt')) as spenser_file:
    text1 = spenser_file.read()
with open(os.path.join(os.getcwd(), 'data/shakespeare.txt')) as shakespeare_file:
    text2 = shakespeare_file.read()

# Encode the lines and collect the rhyme, syllable and punctuation tables.
obs, obs_map, rhyme_dict, syll_dict, punct_dict = preprocess(text1, text2, syll_dict_text)

In [46]:
# Collapse each rhyme group to its sorted, distinct word ids.
for scheme, word_ids in rhyme_dict.items():
    rhyme_dict[scheme] = np.unique(word_ids)

In [47]:
def obs_to_syll(obs_map, syll_dict_text):
    """Map word ids to their possible (non-ending) syllable counts.

    Args:
        obs_map: Mapping from word to word id.
        syll_dict_text: Mapping from word to a list of syllable-count
            strings.

    Returns:
        Dict from word id to a list of ints, built from the purely numeric
        count strings only. Words missing from ``obs_map`` are left out.
    """
    # Non-numeric counts are skipped: they are never needed when extending
    # a line, because generation always seeds with the line's last word.
    numeric_counts = {
        word: [int(count) for count in counts if count.isnumeric()]
        for word, counts in syll_dict_text.items()
    }
    return {
        obs_map[word]: counts
        for word, counts in numeric_counts.items()
        if word in obs_map
    }

In [48]:
# Word id -> list of numeric syllable counts, for the words present in both
# obs_map and the syllable dictionary.
o2s_dict = obs_to_syll(obs_map, syll_dict_text)

In [92]:
def _sample_index(weights, total=1.0):
    """Draw an index with probability proportional to ``weights``.

    ``total`` is the sum of ``weights``. If floating-point round-off leaves
    the draw unassigned after the last weight, the last index is returned
    instead of running past the end.
    """
    threshold = random.uniform(0, total)
    for idx, weight in enumerate(weights):
        threshold -= weight
        if threshold <= 0:
            return idx
    return len(weights) - 1


def generate_emission_improved(hmm, syll_count, seed, syll_dict, o2s_dict):
    """Generate one line of word ids, starting from ``seed``, up to 10 syllables.

    Words are appended after ``seed`` until the running syllable count
    reaches 10. While the count is below 5, words are sampled from the
    current state's full emission row. From 5 onwards, only words that have
    a syllable count fitting in the remaining budget are eligible, and they
    are sampled in proportion to their emission probabilities.

    Args:
        hmm: Object with an emission matrix ``O`` (state x observation) and
            a transition matrix ``A`` (state x state).
        syll_count: Syllables already used by ``seed``.
        seed: Word id that starts the emission.
        syll_dict: Syllable count -> list of word ids with that count.
        o2s_dict: Word id -> list of possible syllable counts. The list is
            scanned from the end for the first count that fits (presumably
            sorted ascending; verify against the syllable file). Words
            missing from the dict are assumed to have 2 syllables.

    Returns:
        ``(emission, states)``: the list of word ids (seed first) and the
        list of states that emitted each word after the seed.
    """
    emission = [seed]
    # Start in the state most likely to have emitted the seed word.
    state = np.argmax(np.array(hmm.O)[:, seed])
    states = []

    while syll_count < 10:
        remaining = 10 - syll_count

        if syll_count >= 5:
            # Restrict sampling to words that can still fit in the line.
            candidates = []
            for syll_num in syll_dict:
                if syll_num <= remaining:
                    candidates.extend(syll_dict[syll_num])
            candidates = np.unique(candidates)

            if len(candidates) == 0:
                # Nothing fits the remaining budget; end the line here.
                break

            weights = [hmm.O[state][obs] for obs in candidates]
            # Take the candidate at the sampled position itself. Using the
            # position before it would select a neighbouring word, or the
            # last candidate whenever the first one is drawn.
            next_obs = int(candidates[_sample_index(weights, np.sum(weights))])
        else:
            # Unrestricted draw from the state's full emission distribution.
            next_obs = _sample_index(hmm.O[state])

        states.append(state)
        emission.append(next_obs)

        # Increase the syllable count (using max possible, since
        # Shakespeare says brevity is the soul of wit).
        for count in reversed(o2s_dict.get(next_obs, [2])):
            if count <= remaining:
                syll_count += count
                break

        # Sample next state.
        state = _sample_index(hmm.A[state])

    return emission, states

The rhyming schemes for Spenser sonnets are similar to Shakespeare, but technically follow the scheme "ababbcbccdcdee" instead of "ababcdcdefefgg." For simplicity, we chose to follow Shakespeare's rhyme scheme as both schemes have every other line rhyme (except for the couplet at the end).

In [113]:
def generate_poem(hmm, obs_map, rhyme_dict, syll_dict, punct_dict, syll_dict_text, o2s_dict):
    """Generate a 14-line rhyming poem laid out as abab cdcd efef gg.

    Seven rhyming pairs are generated. For each pair a rhyme group is
    chosen at random, two of its words become the line endings, and each
    line is grown from its ending word with ``generate_emission_improved``
    and then reversed so the rhyme word comes last.

    Args:
        hmm: Trained HMM passed through to ``generate_emission_improved``.
        obs_map: Mapping from word to word id.
        rhyme_dict: Rhyme key -> collection of word ids that rhyme.
        syll_dict: Syllable count -> list of word ids.
        punct_dict: Line index -> list of punctuation tokens seen at the end
            of lines at that index.
        syll_dict_text: Word -> list of syllable-count strings; entries
            containing ``'E'`` give the count used at the end of a line.
        o2s_dict: Word id -> list of possible syllable counts.

    Returns:
        The poem as a newline-joined string.

    Raises:
        ValueError: If no rhyme group has at least two entries.
    """
    # Get reverse map.
    obs_map_r = obs_map_reverser(obs_map)

    # Only groups with at least two entries can supply a rhyming pair.
    # Building a list also avoids sampling from a dict view, which
    # random.sample rejects on Python 3.11+.
    usable_schemes = [key for key in rhyme_dict if len(rhyme_dict[key]) >= 2]
    if not usable_schemes:
        raise ValueError("rhyme_dict has no rhyme group with at least two words")

    def ending_syllables(word):
        # Syllables the word contributes as the last word of a line: the
        # first 'E'-marked count if there is one, otherwise the last plain
        # count. Words unknown to the syllable dictionary default to 2.
        counts = syll_dict_text.get(word)
        if counts is None:
            return 2
        syllables = 0
        for pos_count in counts:
            if 'E' in pos_count:
                return int(pos_count[1])
            syllables = int(pos_count)
        return syllables

    def pick_punct(line_idx):
        # Random end punctuation seen at this line position; fall back to a
        # comma when none was recorded.
        options = punct_dict.get(line_idx)
        return random.choice(options) if options else ","

    def render(words):
        return ' '.join(words).capitalize()

    poem = ['' for _ in range(14)]
    # First line of each rhyming pair within the three quatrains; its
    # partner sits two lines below.
    line_nums = [0, 1, 4, 5, 8, 9]

    # Each loop generates a rhyming pair, interspersed into the rhyme scheme.
    for i in range(7):
        scheme = random.choice(usable_schemes)

        # Get two ending words.
        rhymes = random.sample(list(rhyme_dict[scheme]), 2)
        syll_count = [ending_syllables(obs_map_r[word]) for word in rhymes]

        # Generate the two rhyming sentences.
        sentence = []
        for j in range(2):
            emission, states = generate_emission_improved(
                hmm, syll_count[j], rhymes[j], syll_dict, o2s_dict)
            # Reverse the order (the rhyming word should be at the end).
            sentence.append([obs_map_r[k] for k in emission[::-1]])

        if i == 6:
            # Closing couplet: lines 12 and 13, always ending with a period.
            poem[12] = render(sentence[0]) + pick_punct(12)
            poem[13] = "  " + render(sentence[1]) + "."
        else:
            first = line_nums[i]
            # Only the opening line of each quatrain is left unindented.
            indent = "" if i % 2 == 0 else "  "
            punct_first = pick_punct(first)
            punct_sec = pick_punct(first + 2)
            poem[first] = indent + render(sentence[0]) + punct_first
            poem[first + 2] = "  " + render(sentence[1]) + punct_sec

    return "\n".join(poem)

In [51]:
# Train the HMM on `obs` and rebind `hmm8`.
# NOTE(review): the arguments 8 and 100 are presumably the number of hidden
# states and the number of training iterations (the notebook text mentions
# 8 states and the log runs to iteration 100) — confirm against HMM_Project3.
hmm8 = unsupervised_HMM(obs, 8, 100)

Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
Iteration: 60
Iteration: 70
Iteration: 80
Iteration: 90
Iteration: 100


In [118]:
# Show one poem generated with rhyme, syllable and punctuation constraints.
print(
    '\nSample Poem:\n====================',
    generate_poem(hmm8, obs_map, rhyme_dict, syll_dict, punct_dict, syll_dict_text, o2s_dict),
    sep='\n',
)


Sample Poem:
Is you delight blossomed eclipse meeds,
  Coward sometimes any angel enough?
  At eyes remain life whether you more deeds,
  That's looks on live know the in what tough.
Tame bears do some forty most re-survey)
  Will happy bids tongue which a thing bath faces?
  Sorrows grant hell dilate of wherefore day:
  Inward on worthy shames thy i oaths graces.
Complain kiss longer tomb with inward grows,
  She shall tyranny of long him awhile,
  Stand you sovereign if eyes they seas but,
  You an both feeds hold thou mercy mile:
Greater devised becomes depending proud their o'er,
  Greater expire on earth these could before.
