## Find Rhyme and Meter of Words

In [1]:
import nltk
import string

In [2]:
filename = 'data/shakespeare.txt'

In [5]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import cmudict

Tokenize the words, but preserve apostrophes and hyphens in the same word, and ignore other punctuation

In [277]:
tokenizer = RegexpTokenizer('[\w|\'|-]+') # keep apostrophes and hyphens

line_tokens = []
with open(filename) as f:
    for line in f:
        line = line.strip()
        if (line.isdigit()):
            continue
        if (len(line) > 0):
            line = line.lower()
            tokens = tokenizer.tokenize(line)
            
            line_tokens.append(tokens)

In [50]:
d = cmudict.dict()

In [124]:
from utils import syl_count

In [81]:
line_tokens[2]

['but', 'as', 'the', 'riper', 'should', 'by', 'time', 'decease']

In [168]:
meter = {}
rhyme = {}

Store the meter of the word, as well as its rhyme scheme, for use later on in improving poem generation

In [169]:
def parse_line(line):
    def syl(pronunciation):
        return len([i[-1] for i in pronunciation if \
                i[-1].isdigit()])
    
    tot = 0
    for word in line:
        try:
            pronounciation = d[word][0]
            s = syl(pronounciation)
            
            sk = ','.join(pronounciation[-2:])
            
            if sk in rhyme.keys():
                rhyme[sk].add(word)
            else:
                rhyme[sk] = set()
                rhyme[sk].add(word)
            
        except (KeyError):
            s = syl_count(word)
        
        stress = []
        for i in xrange(s):
            if (tot + i) % 2 == 0:
                stress.append(0)
            else:
                stress.append(1)
        
        mk = ','.join(str(i) for i in stress)
        if mk in meter.keys():
            meter[mk].add(word)
        else:
            meter[mk] = set()
            meter[mk].add(word)
        
        tot += s

Just a function to test how well cmudict can be used to find rhyming words

In [30]:
def find_rhymes(w):
    entries = nltk.corpus.cmudict.entries()
    syllables = [(word, syl) for word, syl in entries if word == w]
    rhymes = []
    for (word, syllable) in syllables:
        rhymes += [word for word, pron in entries if pron[-2:] == syllable[-2:]]
    return set(rhymes)

## Build Word2Vec Model

In [2]:
import gensim

In [3]:
def split_lines(filename):
    """
    Tokenizes the file and returns a list of tokens for
    each line of poetry in the file.
    """
    # Keep apostrophes and hyphens
    tokenizer = RegexpTokenizer('\w[\w|\'|-]+\w') 

    line_tokens = []
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if (line.isdigit()):
                continue
            if (len(line) > 0):
                line = line.lower()
                tokens = tokenizer.tokenize(line)
                
                line_tokens.append(tokens)

    return line_tokens

In [8]:
files = ['data/shakespeare.txt', 'data/shakespeare_xtra.txt', 'data/spenser.txt']

line_tokens = []
for filename in files:
    line_tokens.extend(split_lines(filename))

In [9]:
len(line_tokens)

6875

In [10]:
stops = set(line.strip() for line in open('data/stopwords_elizabethan.txt'))

for i in xrange(len(line_tokens)):
    line_tokens[i] = [w for w in line_tokens[i] if not w in stops]

In [11]:
line_tokens[0]

['fairest', 'creatures', 'desire', 'increase']

In [12]:
model = gensim.models.Word2Vec(line_tokens, min_count=1)

In [13]:
model.most_similar("love")

[('death', 0.5836070775985718),
 ('may', 0.5535811185836792),
 ('like', 0.5526201725006104),
 ('fair', 0.5430938005447388),
 ('heart', 0.5343868732452393),
 ('still', 0.5268172025680542),
 ('till', 0.5252523422241211),
 ('whose', 0.5251906514167786),
 ('eyes', 0.5181272625923157),
 ('one', 0.5134645700454712)]

In [14]:
len(model.vocab)

7243

Try it on lines with a more complex neural model

In [15]:
len(line_tokens)

6875

In [16]:
line_tokens[0]

['fairest', 'creatures', 'desire', 'increase']

In [25]:
model = gensim.models.Word2Vec(line_tokens, size=300, window=8, min_count=1)

In [26]:
model.most_similar("love")

[('like', 0.7128933668136597),
 ('eyes', 0.7057667374610901),
 ('yet', 0.6990523338317871),
 ('may', 0.6865546107292175),
 ('heart', 0.6787534952163696),
 ('whose', 0.6768814325332642),
 ('make', 0.6766281127929688),
 ('one', 0.6709225177764893),
 ('shall', 0.6660017967224121),
 ('upon', 0.6576135754585266)]

It looks a bit more accurate with a more complex model.

In [27]:
model.save('models/word2vec.bin')

Try to find the most similar word that still rhymes, and is in our Shakespearean vocabulary

In [28]:
model = gensim.models.Word2Vec.load('models/word2vec.bin')

In [31]:
rhymes = find_rhymes("love")

In [32]:
max_similarity = 0.
best_word = None
for rhyme in rhymes:
    if rhyme == "love":
        continue
    try:
        if model.similarity("love", rhyme) > max_similarity:
            best_word = rhyme
            max_similarity = model.similarity("love", rhyme)
    except:
        continue

In [33]:
best_word, max_similarity

(u'dove', 0.11360605169155266)