In [1]:
import nltk
import string
import random
import json
import numpy as np

In [2]:
from hmmlearn import hmm
from sklearn.externals import joblib
import gensim

In [3]:
# Load the trained HMM from disk. joblib.load unpickles the file, which can
# execute arbitrary code, so only point this at model files you trust.
model = joblib.load('../models/hmm_10.pkl')

In [4]:
# Inspect the loaded model's n_features attribute (presumably the number of
# distinct observation symbols, i.e. the vocabulary size -- TODO confirm
# against the hmmlearn version in use).
model.n_features

6751

In [5]:
# Transition matrix of the HMM. generate_line() samples the next hidden state
# from the row A[state, :].
A = model.transmat_

In [6]:
# Emission matrix of the HMM. O[state, :] is sampled as a distribution over
# vocabulary indices, and O[:, word_index] gives the per-state weight of a word.
O = model.emissionprob_

In [7]:
# The model's startprob_ vector (presumably the initial hidden-state
# distribution -- TODO confirm). It is not referenced by the generation code
# visible in this notebook.
A_start = model.startprob_

In [8]:
# Index -> word lookup. JSON object keys are always strings, so build a new
# dict with integer keys instead of popping/inserting keys on the dict that is
# being iterated (which raises RuntimeError on Python 3 and is order-fragile
# on Python 2). The file is closed as soon as it has been parsed.
with open('../models/shakespeare_words/shakespeare_vocab.json') as vocab_file:
    vocab = {int(index): word
             for index, word in json.load(vocab_file).items()}

In [9]:
# Word -> vocabulary index lookup (used to index columns of O). Opened with a
# context manager so the file handle is closed once the JSON has been parsed.
with open('../models/shakespeare_words/shakespeare_inverted_vocab.json') as inverted_vocab_file:
    inverted_vocab = json.load(inverted_vocab_file)

In [3]:
# Lookup tables used by the generation code below:
#   meter          : stress pattern string (e.g. "0,1") -> list of words
#   inverted_meter : word -> list of stress pattern strings
#   pos            : POS tag -> list of words
#   inverted_pos   : word -> collection of POS tags
# Each file is opened with a context manager so its handle is closed as soon
# as the JSON has been parsed.
with open('../models/shakespeare_words/shakespeare_meter.json') as meter_file:
    meter = json.load(meter_file)
with open('../models/shakespeare_words/shakespeare_inverted_meter.json') as inverted_meter_file:
    inverted_meter = json.load(inverted_meter_file)
with open('../models/shakespeare_words/shakespeare_pos.json') as pos_file:
    pos = json.load(pos_file)
with open('../models/shakespeare_words/shakespeare_inverted_pos.json') as inverted_pos_file:
    inverted_pos = json.load(inverted_pos_file)

IOError: [Errno 2] No such file or directory: '../models/shakespeare_words/shakespeare_meter.json'

In [4]:
# Load the saved gensim Word2Vec model. It is queried through most_similar()
# in start_next() to choose the word that opens the following line.
word2vec = gensim.models.Word2Vec.load('../models/word2vec.bin')

In [11]:
def random_pick(l, probs):
    """
    Draw one element of ``l`` at random, weighted by ``probs``.

    The (item, probability) pairs are walked in order while the
    probabilities are accumulated; the first item whose cumulative
    probability exceeds a uniform draw from [0, 1) is returned.

    Parameters
    ----------
    l : sequence
        Candidate items.
    probs : sequence of float
        Probability of each item, paired positionally with ``l`` (pairs are
        formed with zip, so surplus entries of the longer input are ignored).

    Returns
    -------
    The selected item. When the probabilities add up to less than the draw
    (floating point rounding, or an all-zero vector) the last paired item is
    returned.

    Raises
    ------
    ValueError
        If there is no (item, probability) pair to choose from.
    """
    # random.random() is uniform on [0, 1). The previous uniform(0, 0.999)
    # draw could never select items whose share of the cumulative mass lies
    # above 0.999, and slightly over-weighted everything else.
    threshold = random.random()
    running_total = 0.0
    chosen = None
    seen_any = False

    for candidate, weight in zip(l, probs):
        seen_any = True
        chosen = candidate
        running_total += weight
        if threshold < running_total:
            break

    if not seen_any:
        # Previously this surfaced as an UnboundLocalError on the return.
        raise ValueError("random_pick() needs at least one item/probability pair")
    return chosen

In [12]:
# L = number of rows of O, D = number of columns. Below, range(L) enumerates
# hidden states and range(D) enumerates vocabulary indices.
L, D = O.shape

In [24]:
# Spot check of the word -> POS-tag lookup: is "ear" tagged "NN"
# (presumably the noun tag -- confirm against the tagger used to build it)?
"NN" in inverted_pos["ear"]

True

In [41]:
def filter_next(num_syllables, previous_word, probs):
    """
    Mask out words that may not come next in the line and renormalise the
    remaining emission probabilities.

    A vocabulary entry is masked (probability set to 0) when any of these
    holds:

    * the first stress of its meter pattern differs from
      ``num_syllables % 2``;
    * appending it would take the line past 10 syllables;
    * ``previous_word`` carries the "NN" tag and the entry is listed under
      any tag other than "VB", "RB" or "IN";
    * ``previous_word`` carries the "VB" tag and the entry is listed under
      "VB".

    Reads the notebook-level lookups ``meter`` (pattern -> words), ``pos``
    (tag -> words), ``inverted_pos`` (word -> tags) and ``inverted_vocab``
    (word -> vocabulary index).

    Parameters
    ----------
    num_syllables : int
        Number of syllables already in the line.
    previous_word : str
        The word emitted last; must be a key of ``inverted_pos``.
    probs : array-like of float
        Emission probabilities indexed by vocabulary index. Not modified.

    Returns
    -------
    numpy.ndarray
        Copy of ``probs`` with masked entries at 0, rescaled to sum to 1.
        All zeros when no entry survives the masks.

    Raises
    ------
    KeyError
        If ``previous_word`` is missing from ``inverted_pos`` or a listed
        word is missing from ``inverted_vocab``.
    """
    new_probs = np.copy(probs)

    # A set, because the meter and POS rules frequently reject the same word.
    invalid = set()

    # Meter rule: the word must start on the stress expected at this position
    # and must not push the line past 10 syllables.
    expected_stress = num_syllables % 2
    for pattern, words in meter.items():
        # List comprehension instead of map(): map() is not indexable on
        # Python 3.
        stresses = [int(s) for s in pattern.split(',')]
        if stresses[0] != expected_stress or len(stresses) + num_syllables > 10:
            invalid.update(inverted_vocab[w] for w in words)

    previous_tags = inverted_pos[previous_word]

    # After an "NN" word only "VB", "RB" or "IN" words may follow.
    if "NN" in previous_tags:
        for tag, words in pos.items():
            if tag not in ("VB", "RB", "IN"):
                invalid.update(inverted_vocab[w] for w in words)

    # Never put two "VB" words back to back.
    if "VB" in previous_tags:
        invalid.update(inverted_vocab[w] for w in pos.get("VB", ()))

    if invalid:
        new_probs[list(invalid)] = 0

    # When every entry was masked the sum is 0 and the division is 0/0, which
    # numpy reports as an "invalid" (not "divide") floating point event, so
    # both are silenced before the NaN/inf results are mapped back to 0.
    with np.errstate(divide='ignore', invalid='ignore'):
        new_probs = np.divide(new_probs, np.sum(new_probs))

        new_probs[new_probs == np.inf] = 0
        new_probs = np.nan_to_num(new_probs)

    return new_probs

In [42]:
def generate_line(start_word):
    """
    Generate one line of verse that begins with ``start_word``.

    The hidden state of the seed word is sampled in proportion to
    ``O[:, index_of(start_word)]``. Words are then emitted one at a time from
    the current state's emission row (after ``filter_next`` has masked words
    that break the meter or POS rules) until the line holds at least 10
    syllables. The chain moves to a new hidden state after every accepted
    word, including the seed word.

    Reads the notebook-level names ``O``, ``A``, ``L``, ``D``, ``vocab``,
    ``inverted_vocab`` and ``inverted_meter``.

    Parameters
    ----------
    start_word : str
        First word of the line; must be a key of ``inverted_vocab`` and
        ``inverted_meter``.

    Returns
    -------
    list of str
        The words of the line, in order.

    Raises
    ------
    KeyError
        If ``start_word`` is missing from ``inverted_vocab`` or
        ``inverted_meter``.

    Notes
    -----
    There is no retry limit: if sampling keeps returning an unusable word
    (a vocabulary index without a word or meter entry, or the stray "'"
    token) the loop does not terminate.
    """
    emission = [start_word]

    # Sample the hidden state that produced the seed word.
    start = inverted_vocab[start_word]
    start_weights = O[:, start]
    state = random_pick(range(L), np.divide(start_weights, np.sum(start_weights)))

    # The syllable count is the number of fields in the word's first stress
    # pattern.
    num_syllables = len(inverted_meter[start_word][0].split(','))

    # The seed word has been emitted by `state`, so step the chain before
    # sampling the second word. Without this the seed word's state emitted two
    # consecutive words, one more than in the standard HMM sampling order
    # (transition, then emit).
    state = random_pick(range(L), A[state, :])

    prev_word = start_word
    while num_syllables < 10:
        # Sample next observation from the current state.
        next_probs = filter_next(num_syllables, prev_word, O[state, :])
        next_obs = random_pick(range(D), next_probs)

        # Resolve everything that can fail BEFORE touching the line, so a word
        # without meter data is never appended without being counted.
        try:
            next_word = vocab[next_obs]
            stresses = inverted_meter[next_word][0].split(',')
        except KeyError:  # shouldn't occur, but just in case
            continue

        if next_word == "'":  # This somehow showed up as word, skip
            continue

        emission.append(next_word)
        num_syllables += len(stresses)
        prev_word = next_word

        # Move to the hidden state that will emit the following word.
        state = random_pick(range(L), A[state, :])

    return emission
        

In [43]:
def start_next(prev_start):
    """
    Choose the word that opens the next line.

    Looks up the 30 nearest word2vec neighbours of ``prev_start`` and keeps
    those that can start a line: the word must have an entry in
    ``inverted_meter`` whose first stress pattern begins with '0'
    (unstressed), and it must be present in ``inverted_vocab`` because
    generate_line() looks its start word up there. One of the surviving words
    is picked uniformly at random.

    Parameters
    ----------
    prev_start : str
        The word that opened the previous line.

    Returns
    -------
    str
        A suitable neighbour, or ``prev_start`` itself when no neighbour
        qualifies.
    """
    neighbours = word2vec.most_similar(prev_start, topn=30)

    # Make sure it starts out with unstressed
    starts = []
    for word, _similarity in neighbours:
        patterns = inverted_meter.get(word)
        # Neighbours without meter data or without a vocabulary index cannot
        # seed a line; skip them instead of failing with KeyError.
        if not patterns or word not in inverted_vocab:
            continue
        if patterns[0].split(',')[0] == '0':
            starts.append(word)

    if not starts:
        return prev_start
    # Same RNG as random_pick(), so a single random.seed() call controls the
    # whole generation.
    return random.choice(starts)

In [44]:
def generate_sonnet(start_word, num_lines=14):
    """
    Generate a poem of ``num_lines`` lines, one line per text row.

    Each line is produced by generate_line(); the word opening the following
    line is chosen by start_next() from the word that opened the current one.
    Every fourth line and the final line end with '.', all other lines end
    with ','.

    Parameters
    ----------
    start_word : str
        Word that opens the first line.
    num_lines : int, optional
        Number of lines to generate (default 14, a sonnet).

    Returns
    -------
    str
        The poem, each line terminated by its punctuation mark and a newline.
        Empty string when ``num_lines`` is 0.
    """
    rendered = []
    # range() instead of xrange() so the function runs on Python 2 and 3.
    for i in range(num_lines):
        is_last = (i == num_lines - 1)
        words = generate_line(start_word)

        terminator = '.\n' if ((i + 1) % 4 == 0 or is_last) else ',\n'
        rendered.append(' '.join(words) + terminator)

        # No start word is needed after the final line.
        if not is_last:
            start_word = start_next(start_word)
    return ''.join(rendered)

In [45]:
# Call form of print: with a single argument it prints the same text under
# Python 2 and is also valid Python 3 syntax.
print(generate_sonnet('years'))

years takes of nothing never stain'd than waste,
torment for my infection so his fiend,
incertainties with brief for fire of me,
directed watery exclaiming my.
betrays compared time-beguiling words,
dissolve deceased imprison'd not the face,
'yet and it long solicited for red,
vehement my besieged for thy below.
confirmed the tears of a deceits with a,
importune changing orator is more,
repine than thy if presently attend,
possessed arise gather'd or themes with thee.
authority against it but with and,
oppressed of when aside deprived account.



In [11]:
# Rhyme lookup tables. Opened with context managers so each file handle is
# closed as soon as its JSON has been parsed.
with open('../models/shakespeare_words/shakespeare_rhyme.json') as rhyme_file:
    rhyme = json.load(rhyme_file)

with open('../models/shakespeare_words/shakespeare_inverted_rhyme.json') as inverted_rhyme_file:
    inverted_rhyme = json.load(inverted_rhyme_file)

In [19]:
# Spot check of the stored stress pattern(s) for one word. Each entry is a
# comma-separated string with one field per syllable; start_next() treats a
# leading '0' as unstressed.
inverted_meter["changing"]

[u'1,0']

In [5]:
# Spot check of the embedding: nearest neighbours of "love" with their
# similarity scores (the library's default number of results is used here).
word2vec.most_similar("love")

[('like', 0.7026592493057251),
 ('eyes', 0.6991778016090393),
 ('heart', 0.6895580291748047),
 ('make', 0.6819394826889038),
 ('yet', 0.6790560483932495),
 ('one', 0.6719660758972168),
 ('sweet', 0.6665604114532471),
 ('may', 0.6648432612419128),
 ('whose', 0.6574447154998779),
 ('would', 0.6525707840919495)]