## Find Rhyme and Meter of Words

In [1]:
import nltk
import string

In [2]:
filename = 'data/shakespeare.txt'

In [2]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import cmudict

Tokenize each line into words, keeping apostrophes and hyphens inside a word and discarding all other punctuation.

In [277]:
# A token is a run of word characters, apostrophes and hyphens. Inside [...]
# no '|' separators are needed: a '|' there would match a literal pipe and
# glue it into the token. The trailing '-' is a literal hyphen.
tokenizer = RegexpTokenizer(r"[\w'-]+")

# One list of lower-cased tokens per non-empty line of the file.
line_tokens = []
with open(filename) as f:
    for line in f:
        line = line.strip()
        # Skip blank lines and lines made only of digits
        # (presumably the sonnet numbers -- verify against the data file).
        if not line or line.isdigit():
            continue
        tokens = tokenizer.tokenize(line.lower())
        line_tokens.append(tokens)

In [50]:
# Pronunciation lookup used by parse_line: d[word] is a list of pronunciations,
# each a sequence of phoneme strings; vowel phonemes end in a stress digit.
d = cmudict.dict()

In [124]:
from utils import syl_count

In [81]:
line_tokens[2]

['but', 'as', 'the', 'riper', 'should', 'by', 'time', 'decease']

In [168]:
# Both dicts are filled in by parse_line:
#   meter: comma-joined 0/1 stress pattern (e.g. '0,1') -> set of words seen with it
#   rhyme: comma-joined last two phonemes of a word -> set of words with that ending
meter = {}
rhyme = {}

Record each word's meter (the stress pattern implied by its position in the line) and its rhyme ending (the last two phonemes of its pronunciation), for later use in improving poem generation.

In [169]:
def parse_line(line):
    """
    Record the meter and rhyme ending of every word in one line of poetry.

    Nothing is returned; the module-level dicts ``rhyme`` and ``meter`` are
    updated in place:

    * ``rhyme`` maps the comma-joined last two phonemes of the word's first
      pronunciation in ``d`` to the set of words sharing that ending. Words
      missing from ``d`` get no rhyme entry.
    * ``meter`` maps a comma-joined 0/1 pattern, one digit per syllable, to
      the set of words seen with that pattern. The pattern is not read from
      the dictionary's stress digits; it simply alternates 0, 1, 0, 1, ...
      according to the syllable's position in the line, so the same word can
      be filed under several patterns.

    Parameters
    ----------
    line : list of str
        The tokens of one line, in order.
    """
    def syl(pronunciation):
        # Phonemes ending in a digit carry a stress mark: one per syllable.
        return sum(1 for phoneme in pronunciation if phoneme[-1].isdigit())

    tot = 0  # syllables consumed so far in this line
    for word in line:
        try:
            pronunciation = d[word][0]
        except KeyError:
            # Not in the pronunciation dictionary: fall back to the
            # syl_count helper and skip the rhyme bookkeeping.
            s = syl_count(word)
        else:
            s = syl(pronunciation)
            sk = ','.join(pronunciation[-2:])
            rhyme.setdefault(sk, set()).add(word)

        # Alternate 0/1 starting from the word's offset within the line.
        stress = [(tot + i) % 2 for i in range(s)]

        mk = ','.join(str(bit) for bit in stress)
        meter.setdefault(mk, set()).add(word)

        tot += s

Just a function to test how well cmudict can be used to find rhyming words

In [44]:
def find_rhymes(w):
    """
    Return the set of cmudict words that rhyme with ``w``.

    Two words are treated as rhyming when the last two phonemes of their
    pronunciations are equal. Every pronunciation listed for ``w`` is
    considered, and ``w`` itself is part of the result whenever it has an
    entry. A word with no entry yields an empty set.
    """
    entries = nltk.corpus.cmudict.entries()

    # Last two phonemes of each pronunciation listed for w.
    endings = [pron[-2:] for word, pron in entries if word == w]

    matches = set()
    for ending in endings:
        matches.update(word for word, pron in entries if pron[-2:] == ending)
    return matches

## Build Word2Vec Model

In [51]:
import gensim

In [8]:
def split_lines(filename):
    """
    Tokenizes the file and returns a list of tokens for
    each line of poetry in the file.

    Each line is stripped and lower-cased before tokenizing. Blank lines and
    lines consisting only of digits are skipped. A token is a run of word
    characters, apostrophes and hyphens; all other punctuation separates
    tokens and is dropped.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    list of list of str
        One token list per kept line, in file order.
    """
    # Keep apostrophes and hyphens. Inside [...] no '|' separators are
    # needed: a '|' there would match a literal pipe and glue it into the
    # token. The trailing '-' is a literal hyphen.
    tokenizer = RegexpTokenizer(r"[\w'-]+")

    line_tokens = []
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if not line or line.isdigit():
                continue
            line_tokens.append(tokenizer.tokenize(line.lower()))

    return line_tokens

In [22]:
files = ['data/shakespeare.txt', 'data/shakespeare_xtra.txt']

# Rebuild line_tokens from scratch as the concatenation of both corpora.
# Note that the loop rebinds the notebook-level name `filename`.
line_tokens = []
for filename in files:
    line_tokens += split_lines(filename)

In [23]:
len(line_tokens)

5540

In [24]:
# Load the stop-word list (one word per line); the with-block closes the file.
with open('data/stopwords_elizabethan.txt') as stop_file:
    stops = set(line.strip() for line in stop_file)

# Replace each line's token list with a copy that has the stop words removed.
for i, tokens in enumerate(line_tokens):
    line_tokens[i] = [w for w in tokens if w not in stops]

In [25]:
line_tokens[0]

['fairest', 'creatures', 'desire', 'increase']

In [26]:
# Baseline model with gensim's default hyper-parameters. min_count=1 keeps
# every token in the vocabulary, including words that occur only once.
model = gensim.models.Word2Vec(line_tokens, min_count=1)

In [27]:
model.most_similar("love")

[('different', 0.3981814980506897),
 ('seized', 0.39203259348869324),
 ('death', 0.3899046778678894),
 ('new', 0.3843476176261902),
 ('true', 0.3827434182167053),
 ('relieveth', 0.3562055826187134),
 ('till', 0.355848491191864),
 ("pencill'd", 0.35459262132644653),
 ('see', 0.3362261652946472),
 ('souls', 0.3330642282962799)]

In [28]:
len(model.vocab)

6609

Retrain Word2Vec on the same lines with a larger model (1000-dimensional vectors and a context window of 8 words).

In [29]:
len(line_tokens)

5540

In [31]:
line_tokens[0]

['fairest', 'creatures', 'desire', 'increase']

In [38]:
# Larger model: 1000-dimensional vectors and a context window of 8 words.
# This rebinds `model`, replacing the baseline trained above.
# NOTE(review): `size` was renamed `vector_size` in gensim 4.0 -- confirm the installed version.
model = gensim.models.Word2Vec(line_tokens, size=1000, window=8, min_count=1)

In [39]:
model.most_similar("love")

[('like', 0.5050860643386841),
 ('heart', 0.4762144982814789),
 ('make', 0.47565433382987976),
 ('shall', 0.46683523058891296),
 ('eyes', 0.46333134174346924),
 ('would', 0.46272504329681396),
 ('yet', 0.46228304505348206),
 ('whose', 0.4458623230457306),
 ('still', 0.43788033723831177),
 ('upon', 0.4364575147628784)]

It looks a bit more accurate with a more complex model.

In [40]:
model.save('models/word2vec.bin')

Try to find the most similar word that still rhymes, and is in our Shakespearean vocabulary

In [42]:
model = gensim.models.Word2Vec.load('models/word2vec.bin')

In [45]:
rhymes = find_rhymes("love")

In [46]:
# Pick the rhyming candidate whose word vector is most similar to the target.
target_word = "love"  # must match the word `rhymes` was computed for

max_similarity = 0.
best_word = None
# The loop variable is deliberately not called `rhyme`: that name is the
# module-level dict parse_line writes to, and rebinding it would break it.
for candidate in rhymes:
    if candidate == target_word:
        continue
    try:
        similarity = model.similarity(target_word, candidate)
    except KeyError:
        # Candidate is not in the Word2Vec vocabulary; skip it.
        continue
    if similarity > max_similarity:
        best_word = candidate
        max_similarity = similarity

In [47]:
best_word, max_similarity

(u'dove', 0.053427630126736139)