In [11]:
import random
import re
import os
from HMM import unsupervised_HMM
from punctuation_dict import get_punctuation_dict
from punctuation_generator import get_punc
from syllable_dict import get_syllable_dict

In [12]:
# Read the whole sonnet corpus into memory. The context manager guarantees the
# file handle is closed as soon as the read finishes (the bare open().read()
# form leaves that to the garbage collector).
with open(os.path.join(os.getcwd(), 'data/shakespeare.txt')) as corpus_file:
    text = corpus_file.read()

In [15]:
# Store a list of words to keep capitalized
# (process_word compares its lowercased result against this list and
# capitalizes the word on a match)
cap_words = ["i'll", 'i', 'o']
# Lookup table used by process_word: it is indexed with the lowercased token
# after every non-word character has been stripped.
# NOTE(review): presumably maps each stripped key back to the word with its
# in-word punctuation (e.g. apostrophes) restored -- confirm in punctuation_dict.
punc_dict = get_punctuation_dict()

def process_word(word):
    '''
    Normalize a raw token from the corpus.

    The token is lowercased, stripped of every non-word character and then
    looked up in punc_dict; if the looked-up word is listed in cap_words it is
    returned capitalized. The literal token "I'll" is returned unchanged.

    Raises KeyError when the stripped token is not a key of punc_dict.
    '''
    # "I'll" would collide with "ill" once the apostrophe is stripped, so it
    # bypasses the normalization entirely.
    if word == "I'll":
        return word

    # Lowercase, drop punctuation that is not part of the word, then resolve
    # the canonical spelling through the punctuation dictionary.
    stripped = re.sub(r'[^\w]', '', word.lower())
    normalized = punc_dict[stripped]

    # Some words stay capitalized even in the middle of a line.
    return normalized.capitalize() if normalized in cap_words else normalized

In [16]:
# Create rhyme dictionary

# Tokenize every non-blank line of the corpus into a list of words.
lines = [tokens for tokens in (raw.split() for raw in text.split('\n')) if tokens]
sonnets = []
sonnet = []
for line in lines:
    if len(line) == 1:
        # A single-token line is treated as a sonnet heading (its id), which
        # closes the sonnet collected so far.
        # Only store sonnets with 14 lines
        if len(sonnet) == 14:
            sonnets.append(sonnet)
        sonnet = []
        continue
    sonnet.append(line)

# The last sonnet has no heading after it, so the loop never stores it;
# flush it here under the same 14-line rule.
if len(sonnet) == 14:
    sonnets.append(sonnet)

    
# This rhyme dictionary is a list of disjoint sets, where all the elements in each set rhyme with each other
rhyme_dict = []

def add_to_rhyme_dict(w1, w2):
    '''
    This function takes in a pair of rhyming words and adds them to the rhyme dictionary.

    If neither word is known yet, a new group {w1, w2} is appended. Otherwise
    both words are added to the first group that already contains one of them,
    and any other group containing either word is merged into it, so that
    rhyme_dict stays a list of disjoint groups (rhyming is treated as
    transitive). rhyme_dict is modified in place; nothing is returned.
    '''
    # Positions of every existing group that already holds one of the words
    hits = [idx for idx, group in enumerate(rhyme_dict)
            if w1 in group or w2 in group]

    if not hits:
        rhyme_dict.append({w1, w2})
        return

    target = rhyme_dict[hits[0]]
    target.update((w1, w2))

    # The pair links every matching group together: fold the others into the
    # first one. Delete from the back so the remaining indices stay valid.
    for idx in reversed(hits[1:]):
        target.update(rhyme_dict[idx])
        del rhyme_dict[idx]
        
        
# Line indices whose last words rhyme: alternating lines inside each of the
# first 3 stanzas, followed by the closing couplet (last two rows).
rhyme_pairs = [(0, 2), (1, 3), (4, 6), (5, 7), (8, 10), (9, 11), (12, 13)]

for sonnet in sonnets:
    for first_idx, second_idx in rhyme_pairs:
        # The rhyming word is the last token of each line.
        add_to_rhyme_dict(process_word(sonnet[first_idx][-1]),
                          process_word(sonnet[second_idx][-1]))

In [17]:
def parse_observations(text):
    '''
    Convert the raw corpus text into integer-encoded training sequences.

    Every line with more than one token is reversed, each token is normalized
    with process_word and replaced by an integer id assigned in order of first
    appearance. Blank lines and single-token lines (poem ids) are skipped.

    Returns (obs, obs_map): obs is a list of encoded, reversed lines and
    obs_map maps each processed word to its integer id.
    '''
    obs = []
    obs_map = {}

    for raw_line in text.split('\n'):
        tokens = raw_line.split()

        # Nothing to encode on blank lines; a lone token is a poem id, not an
        # actual line of the poem.
        if len(tokens) <= 1:
            continue

        encoded = []
        # Walk the line back to front so the HMM is trained on reversed sequences.
        for token in reversed(tokens):
            word = process_word(token)
            # Unseen words get the next free id, which equals the current map size.
            encoded.append(obs_map.setdefault(word, len(obs_map)))

        obs.append(encoded)

    return obs, obs_map

In [18]:
# Encode the corpus: obs holds the reversed, integer-encoded lines and obs_map
# the word -> id mapping used to build them.
obs, obs_map = parse_observations(text)
# Syllable lookup handed to the HMM when sampling lines (see sample_sentence).
# NOTE(review): exact structure is defined in syllable_dict -- not visible here.
syl_dict = get_syllable_dict()

In [22]:
# Train the HMM on the encoded, reversed lines.
# NOTE(review): 15 and 100 are presumably the number of hidden states and the
# number of training iterations (the cell output counts up to 100) -- confirm
# against HMM.unsupervised_HMM and consider naming them as constants.
hmm = unsupervised_HMM(obs, 15, 100)

1
2
3
4
5
6
7
8
9
10
Iteration: 10
11
12
13
14
15
16
17
18
19
20
Iteration: 20
21
22
23
24
25
26
27
28
29
30
Iteration: 30
31
32
33
34
35
36
37
38
39
40
Iteration: 40
41
42
43
44
45
46
47
48
49
50
Iteration: 50
51
52
53
54
55
56
57
58
59
60
Iteration: 60
61
62
63
64
65
66
67
68
69
70
Iteration: 70
71
72
73
74
75
76
77
78
79
80
Iteration: 80
81
82
83
84
85
86
87
88
89
90
Iteration: 90
91
92
93
94
95
96
97
98
99
100
Iteration: 100


In [23]:
def obs_map_reverser(obs_map):
    '''
    Invert obs_map, returning a new dict that maps each value back to its key.

    If several keys share a value, the key that comes last in iteration order
    is the one kept.
    '''
    return {index: word for word, index in obs_map.items()}


# Generate a sample sentence ending with a given word
def sample_sentence(hmm, obs_map, word, n_syl=10):
    '''
    Sample one line from the HMM that ends with `word`.

    hmm     -- model exposing find_state and generate_emission_rhyme
    obs_map -- word -> id mapping; `word` must be one of its keys
    word    -- word the returned line has to end with
    n_syl   -- value forwarded to generate_emission_rhyme
               (presumably the syllable budget of the line -- confirm in HMM)

    Returns the line as a space-joined string with its first word capitalized.
    Reads the module-level syl_dict.
    '''
    # id -> word lookup for decoding the emission
    index_to_word = obs_map_reverser(obs_map)

    # Choose a state that could have generated the word
    start_state = hmm.find_state(obs_map[word])

    # Only the first element of the generator's result is used: the emitted ids.
    generated = hmm.generate_emission_rhyme(
        n_syl, obs_map, index_to_word, syl_dict, word, start_state)
    emission = generated[0]

    # Decode, then flip the order so that `word` comes last.
    words = [index_to_word[i] for i in emission]
    words.reverse()
    words[0] = words[0].capitalize()
    return ' '.join(words)
    
    
def generate_rhyming_sentences(hmm, obs_map):
    '''
    Generate two lines whose final words rhyme.

    A rhyme group is picked at random from rhyme_dict, two distinct words are
    drawn from it, and one line ending in each word is sampled from the HMM.

    Returns a tuple (line_ending_in_w1, line_ending_in_w2).
    Raises ValueError if no rhyme group contains two distinct words.
    '''
    # A group can hold a single word when both line endings of a rhyming pair
    # normalize to the same word; such a group cannot supply a pair.
    candidates = [group for group in rhyme_dict if len(group) >= 2]
    if not candidates:
        raise ValueError('rhyme_dict has no group with at least two distinct words')

    # Pick two rhyming words. random.sample needs a sequence (sets are
    # rejected since Python 3.11); sorting gives a deterministic ordering.
    w1, w2 = random.sample(sorted(random.choice(candidates)), 2)
    return (sample_sentence(hmm, obs_map, w1), sample_sentence(hmm, obs_map, w2))

    
def generate_sonnet(hmm, obs_map):
    '''
    Generate a 14-line sonnet and print it.

    Three stanzas of four lines are built from two rhyming pairs each, laid
    out so that lines 1/3 and 2/4 rhyme, followed by an indented rhyming
    couplet. Line-ending punctuation comes from get_punc. Returns None.
    '''
    pieces = []
    for _ in range(3):
        first, third = generate_rhyming_sentences(hmm, obs_map)
        second, fourth = generate_rhyming_sentences(hmm, obs_map)
        # Punctuate in reading order; the last line of a stanza uses get_punc(1).
        stanza_lines = [
            first + get_punc(0),
            second + get_punc(0),
            third + get_punc(0),
            fourth + get_punc(1),
        ]
        pieces.append('\n'.join(stanza_lines) + '\n')

    # Last stanza
    couplet_a, couplet_b = generate_rhyming_sentences(hmm, obs_map)
    pieces.append('  ' + couplet_a + get_punc(2) + '\n')
    pieces.append('  ' + couplet_b + get_punc(3) + '\n')

    poem = ''.join(pieces)
    print(poem)
        

In [24]:
# Print one generated sonnet. No random seed is set in this notebook, so the
# text differs on every run.
generate_sonnet(hmm, obs_map)

In him whose the feeding in birds untrimmed
That few for thy judgment's pluck this crowned plot
Beauty's thee what of thy heart with the dimmed
My loves disdains thy although love a not:
Hate one are him be perpetual beguiled
Being thee I to so I compounds loves up
My loss confounded your love as and child,
What hath in thy argument most his cup.
Love not thee nor sourly in one affords,
He keep you but I to as boast will rent
My pluck my was this even of he up words,
Another for if I for thy ornament.
  Mistress far at my loves use thy do sit,
  We have me see this true proud and stay it.

