## Find Rhyme and Meter of Words

In [35]:
import nltk
import string

In [2]:
# Path to the raw text corpus that is read and tokenized line by line below.
filename = 'data/shakespeare.txt'

In [19]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import cmudict

Tokenize each line into words, keeping apostrophes and hyphens as part of the word and ignoring all other punctuation.

In [277]:
# A token is a run of word characters, apostrophes and hyphens, so contractions
# ("love's") and hyphenated compounds stay in one piece. Inside a character
# class `|` is a literal pipe, not alternation, so it is left out; the raw
# string keeps `\w` from being treated as a string escape.
tokenizer = RegexpTokenizer(r"[\w'-]+")

# One list of lower-cased tokens per kept line of the corpus.
line_tokens = []
with open(filename) as f:
    for line in f:
        line = line.strip()
        # Skip blank lines and lines consisting only of digits
        # (presumably the sonnet numbers -- confirm against the data file).
        if not line or line.isdigit():
            continue
        line_tokens.append(tokenizer.tokenize(line.lower()))

In [50]:
# Load the CMU Pronouncing Dictionary as a dict. parse_line indexes it as
# d[word][0] to obtain a word's first listed pronunciation (a phoneme sequence).
d = cmudict.dict()

In [124]:
from utils import syl_count

In [81]:
# Spot-check one tokenized line.
line_tokens[2]

['but', 'as', 'the', 'riper', 'should', 'by', 'time', 'decease']

In [168]:
# Lookup tables filled in by parse_line:
#   meter: comma-joined stress pattern (e.g. "0,1") -> set of words seen with it
#   rhyme: comma-joined final two phonemes          -> set of words ending in them
meter = dict()
rhyme = dict()

Store each word's meter (its stress pattern) and its rhyme key (its final two phonemes) for later use in improving poem generation.

In [169]:
def parse_line(line):
    """Record the stress pattern and rhyme key of every word in one line.

    For each word the syllable count is taken from its first cmudict
    pronunciation in ``d``; words missing from ``d`` fall back to
    ``syl_count(word)``. Syllables are numbered consecutively across the
    whole line and given stress 0 at even positions and 1 at odd positions,
    i.e. the line is assumed to alternate strictly starting from 0
    (presumably unstressed/stressed, as in iambic verse -- confirm).

    Side effects (nothing is returned):
      * ``meter[pattern]`` gains the word, where ``pattern`` is the
        comma-joined stress values of the word's syllables.
      * ``rhyme[key]`` gains the word, where ``key`` is the comma-joined last
        two phonemes of its pronunciation. Words not found in ``d`` are not
        added to ``rhyme``.

    Args:
        line: sequence of word tokens making up one line of verse.
    """
    def syl(pronunciation):
        # Count the phonemes whose last character is a digit; in cmudict's
        # notation that digit is the stress marker carried by vowel phonemes.
        return sum(1 for phoneme in pronunciation if phoneme[-1].isdigit())

    tot = 0  # syllables consumed so far in this line
    for word in line:
        # Only the dictionary lookup can raise KeyError, so keep the try narrow.
        try:
            pronunciation = d[word][0]
        except KeyError:
            s = syl_count(word)
        else:
            s = syl(pronunciation)
            sk = ','.join(pronunciation[-2:])
            # setdefault avoids the `in dict.keys()` test, which builds and
            # scans a list on every call under Python 2.
            rhyme.setdefault(sk, set()).add(word)

        # Stress of each syllable is decided by its absolute position in the line.
        stress = [(tot + i) % 2 for i in range(s)]

        mk = ','.join(str(i) for i in stress)
        meter.setdefault(mk, set()).add(word)

        tot += s

A helper function to test how well cmudict can be used to find rhyming words.

In [174]:
def find_rhymes(w):
    """Return every cmudict word that shares its last two phonemes with ``w``.

    All pronunciations listed for ``w`` are considered; a word is included if
    any of its pronunciations ends in the same two phonemes as any
    pronunciation of ``w``. ``w`` itself is part of the result.

    Args:
        w: the word to rhyme, spelled exactly as it appears in cmudict.

    Returns:
        A set of words; empty if ``w`` has no cmudict entry.
    """
    entries = nltk.corpus.cmudict.entries()

    # Endings of every pronunciation of w, as tuples so they can live in a set.
    endings = {tuple(pron[-2:]) for word, pron in entries if word == w}
    if not endings:
        return set()

    # A single pass over the dictionary covers all endings at once, instead of
    # one full scan per pronunciation of w.
    return {word for word, pron in entries if tuple(pron[-2:]) in endings}

## Build Word2Vec Model

In [267]:
import gensim

In [278]:
# Number of tokenized lines collected from the corpus.
len(line_tokens)

2155

In [279]:
# Load the stop-word list, one entry per line. The `with` block closes the
# file handle instead of leaving it open.
with open('models/stopwords_elizabethan.txt') as stop_file:
    stops = set(entry.strip() for entry in stop_file)

# Remove stop words from every tokenized line, updating line_tokens in place.
# Re-running this cell is harmless because filtering twice gives the same result.
for i, tokens in enumerate(line_tokens):
    line_tokens[i] = [w for w in tokens if w not in stops]

In [280]:
# Inspect the last line after stop-word removal.
line_tokens[-1]

["love's", 'fire', 'heats', 'water', 'water', 'cools', 'love']

In [339]:
# Train Word2Vec on the stop-word-filtered lines with default hyperparameters;
# min_count=1 keeps every word, including those that occur only once.
# NOTE(review): no seed is passed, so the neighbours shown below may differ
# between runs -- confirm whether reproducibility matters here.
model = gensim.models.Word2Vec(line_tokens, min_count=1)

In [340]:
# Nearest neighbours of "love" under the default-parameter model.
model.most_similar("love")

[('different', 0.3638341426849365),
 ('new', 0.34292036294937134),
 ('souls', 0.32359182834625244),
 ('sake', 0.3227883577346802),
 ('enlarged', 0.3141912519931793),
 ('conquered', 0.30677562952041626),
 ('glazed', 0.29811012744903564),
 ('raised', 0.2969893515110016),
 ('dyed', 0.29532408714294434),
 ('moods', 0.2889629602432251)]

In [290]:
# Number of distinct words in the trained model's vocabulary.
len(model.vocab)

3102

Retrain on the same lines with a more complex model (300-dimensional vectors and a context window of 8).

In [332]:
# Confirm how many lines will be fed to the larger model.
len(line_tokens)

2155

In [333]:
# Confirm the input is still the stop-word-filtered version of the lines.
line_tokens[-1]

["love's", 'fire', 'heats', 'water', 'water', 'cools', 'love']

In [337]:
# Retrain with larger vectors (size=300) and a wider context window (window=8).
# This rebinds `model`, replacing the default-parameter model trained earlier;
# every later cell that uses `model` sees this one.
# NOTE(review): no seed is passed, so results may differ between runs.
model = gensim.models.Word2Vec(line_tokens, size=300, window=8, min_count=1)

In [338]:
# Nearest neighbours of "love" under the larger model.
model.most_similar("love")

[('merits', 0.18723152577877045),
 ('heart', 0.1866316795349121),
 ('rest', 0.18459047377109528),
 ('long', 0.1828131079673767),
 ('sinful', 0.18092121183872223),
 ('beauty', 0.1793537437915802),
 ('fever', 0.17897535860538483),
 ('sin', 0.178488627076149),
 ('medicine', 0.1767292320728302),
 ('one', 0.17599430680274963)]

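The nearest neighbours look a bit more accurate with the more complex model (for example, 'heart' and 'beauty' now appear), although the similarity scores themselves are lower.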

Try to find the word most similar to "love" that still rhymes with it and is in our Shakespearean vocabulary.

In [344]:
# Every cmudict word sharing its final two phonemes with "love"
# (the set includes "love" itself, which the search below skips).
rhymes = find_rhymes("love")

In [348]:
# Among the rhymes of "love", pick the one the Word2Vec model scores as most
# similar to it. best_word stays None if no candidate scores above 0.
max_similarity = 0.
best_word = None
# The loop variable is deliberately not named `rhyme`: that name is the
# module-level dict populated by parse_line and must not be overwritten.
for candidate in rhymes:
    if candidate == "love":
        continue
    try:
        # Compute the score once and reuse it for both the test and the update.
        similarity = model.similarity("love", candidate)
    except KeyError:
        # The candidate is in cmudict but not in the model's vocabulary.
        continue
    if similarity > max_similarity:
        best_word = candidate
        max_similarity = similarity

In [350]:
# Best in-vocabulary rhyme for "love" and its similarity score.
best_word, max_similarity

(u'thereof', 0.098495416866168015)