## Test HMMLearn API 

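This notebook tries out the hmmlearn package: it fits a discrete-emission hidden Markov model (HMM) to tokenized lines of Shakespeare, samples a short word sequence from the fitted model, and saves the model to disk.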

In [1]:
import json
import re
import string

import nltk
import numpy as np
from nltk.tokenize import RegexpTokenizer

In [2]:
def split_lines(filename):
    """
    Tokenize a text file line by line.

    Each non-empty line is stripped, lower-cased and split into tokens made
    of word characters, apostrophes and hyphens. Lines consisting solely of
    digits are skipped (presumably poem numbers -- confirm against the data
    files), as are blank lines.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    list of list of str
        One list of lower-cased tokens per retained line, in file order.
    """
    # Keep apostrophes and hyphens inside tokens. Within a character class
    # `|` is a literal pipe, not alternation, so it must not be listed; the
    # raw string keeps `\w` from being treated as a string escape.
    token_re = re.compile(r"[\w'-]+", re.UNICODE)

    line_tokens = []
    with open(filename) as f:
        for raw_line in f:
            line = raw_line.strip()
            # Skip blank lines and digit-only lines.
            if not line or line.isdigit():
                continue
            line_tokens.append(token_re.findall(line.lower()))

    return line_tokens

In [3]:
# Text files that make up the training corpus.
files = ['../data/shakespeare.txt', '../data/shakespeare_xtra.txt']

# One token list per line, concatenated across all files in order.
lines = [tokens for path in files for tokens in split_lines(path)]

In [4]:
# Index -> word lookup (used below to turn sampled indices back into words).
# Open the file in a `with` block so the handle is closed after loading.
with open('../models/shakespeare_words/shakespeare_vocab.json') as vocab_file:
    vocab = json.load(vocab_file)

In [5]:
# Change to integer keys: JSON object keys are always loaded as strings.
# Build a new dict instead of popping/inserting while iterating over the
# same dict, which is unsafe and raises RuntimeError on recent Python 3.
vocab = {int(k): v for k, v in vocab.items()}

In [6]:
# Word -> index lookup (used below to encode tokens as integers).
# Open the file in a `with` block so the handle is closed after loading.
with open('../models/shakespeare_words/shakespeare_inverted_vocab.json') as inverted_vocab_file:
    inverted_vocab = json.load(inverted_vocab_file)

In [7]:
from hmmlearn import hmm

In [8]:
# Encode every token as its vocabulary index and join all lines end to end.
X = np.concatenate([[inverted_vocab[word] for word in line] for line in lines])

In [9]:
# Turn the flat index array into a single-column 2-D array.
X = np.reshape(X, (-1, 1))

In [10]:
# Number of tokens in each line, in the same order the lines were joined into X.
lengths = np.array(list(map(len, lines)))

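Use 10 hidden states to keep things small. With `verbose=True`, each line of output shows the iteration number, the log-likelihood after that iteration, and the change in log-likelihood from the previous iteration (the convergence rate).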

In [30]:
# Fit a 10-state HMM to the index sequences in X; `lengths` gives the length
# of each individual sequence inside X.
# A fixed random_state makes the random initialisation, and therefore the
# fitted parameters, reproducible from run to run.
# NOTE(review): in hmmlearn >= 0.3 the categorical (one symbol per step)
# model is `hmm.CategoricalHMM` and `MultinomialHMM` has different
# semantics -- confirm against the installed hmmlearn version.
# divide='ignore' silences NumPy divide-by-zero warnings raised during
# fitting (presumably from taking logs of zero probabilities).
with np.errstate(divide='ignore'):
    model = hmm.MultinomialHMM(
        n_components=10, n_iter=1000, verbose=True, random_state=0
    ).fit(X, lengths)

         1     -394221.0212             +nan
         2     -303234.8562      +90986.1651
         3     -303209.8331         +25.0231
         4     -303157.4206         +52.4125
         5     -303042.7946        +114.6259
         6     -302799.1822        +243.6124
         7     -302324.0421        +475.1400
         8     -301538.9850        +785.0571
         9     -300527.0973       +1011.8877
        10     -299554.1760        +972.9213
        11     -298843.1555        +711.0205
        12     -298410.2500        +432.9055
        13     -298146.0928        +264.1572
        14     -297942.3994        +203.6934
        15     -297729.1624        +213.2371
        16     -297456.0047        +273.1577
        17     -297075.1012        +380.9035
        18     -296538.0334        +537.0678
        19     -295804.3004        +733.7330
        20     -294852.0854        +952.2150
        21     -293683.0018       +1169.0836
        22     -292326.4097       +1356.5921
        23

In [31]:
# Number of distinct symbols the fitted model emits; presumably the
# vocabulary size -- TODO confirm against len(vocab).
model.n_features

6751

In [32]:
# Shape of the state transition matrix; expected (n_components, n_components).
model.transmat_.shape

(10, 10)

In [33]:
# Shape of the emission matrix; expected (n_components, n_features).
model.emissionprob_.shape

(10, 6751)

In [34]:
# Fitted initial-state probabilities, one entry per hidden state.
model.startprob_

array([  8.82866075e-056,   9.02025036e-001,   7.87611643e-014,
         1.08297029e-200,   0.00000000e+000,   0.00000000e+000,
         9.79749643e-002,   5.64802796e-046,   2.02357964e-258,
         1.12030635e-043])

Check that each row of the emission probability matrix sums to 1.

In [35]:
# Row sums of the emission matrix, one per hidden state.
model.emissionprob_.sum(axis=1)

array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.])

Sample a 10-word sequence from the fitted HMM and map the sampled indices back to words.

In [36]:
# Draw 10 observations and the hidden states that generated them.
sample, hidden = model.sample(n_samples=10)

In [37]:
# The sampled symbol indices as a flat array (first column of `sample`).
sample[:, 0]

array([6522, 3823, 6147, 1847, 5312, 4283,  373, 4203, 1987, 5404])

In [38]:
# Translate each sampled index back to its word and join into one string.
" ".join(vocab[idx] for idx in sample.T[0])

u"which die his face's weak eye for fill thy ditty"

In [39]:
# `sklearn.externals.joblib` was deprecated and later removed from
# scikit-learn; prefer the standalone package and fall back to the bundled
# copy only on old installations that lack it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [40]:
# Persist the fitted model to disk so it can be reloaded later.
joblib.dump(value=model, filename="../models/hmm_10.pkl")

['../models/hmm_10.pkl']