## Test HMMLearn API 

Trying out hidden Markov models (HMMs) on tokenized lines of Shakespeare text, using the hmmlearn package.

In [40]:
import nltk
import string
import json
import numpy as np

from nltk.tokenize import RegexpTokenizer

In [41]:
def split_lines(filename):
    """
    Tokenize a poetry file into one list of lowercase tokens per line.

    Blank lines and lines made up only of digits (presumably poem
    numbers -- confirm against the data files) are skipped. Lines that
    produce no tokens at all (e.g. punctuation only) are skipped as well,
    so callers never receive a zero-length sequence.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    list of list of str
        One token list per retained line, in file order.
    """
    # Keep apostrophes and hyphens inside words.
    # Raw string so `\w` reaches the regex engine instead of being treated as
    # a string escape. Inside [...] a `|` is a literal pipe, not alternation,
    # so it is left out of the class; the trailing `-` is a literal hyphen.
    tokenizer = RegexpTokenizer(r"[\w'-]+")

    line_tokens = []
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if not line or line.isdigit():
                continue

            tokens = tokenizer.tokenize(line.lower())
            # An empty token list would become a zero-length sequence and
            # would also make a later concatenation of id lists fall back to
            # a float array.
            if tokens:
                line_tokens.append(tokens)

    return line_tokens

In [42]:
# Corpus files, tokenized in this order into one flat list of token lists.
files = [
    '../data/shakespeare.txt',
    '../data/shakespeare_xtra.txt',
]

lines = []
for filename in files:
    lines += split_lines(filename)

In [43]:
# Load the vocabulary mapping; the `with` block closes the file handle as
# soon as the JSON has been parsed instead of leaving it to the garbage
# collector.
with open('../models/shakespeare_words/shakespeare_vocab.json') as vocab_file:
    vocab = json.load(vocab_file)

In [44]:
# Change to integer keys (JSON object keys are always loaded as strings).
# A new dict is built instead of popping and inserting while looping over
# vocab.keys(): on Python 3 keys() is a live view and mutating the dict during
# iteration is an error. Re-running this cell is harmless.
vocab = {int(k): v for k, v in vocab.items()}

In [45]:
# Load the inverted vocabulary mapping; the `with` block closes the file
# handle as soon as the JSON has been parsed.
with open('../models/shakespeare_words/shakespeare_inverted_vocab.json') as inverted_vocab_file:
    inverted_vocab = json.load(inverted_vocab_file)

In [46]:
from hmmlearn import hmm

In [47]:
# Look every token up in inverted_vocab and flatten all lines into a single
# 1-D sequence of observations.
X = np.concatenate([[inverted_vocab[token] for token in line] for line in lines])

In [48]:
# Turn the flat sequence into a single column: one observation per row.
X = np.reshape(X, (-1, 1))

In [49]:
# Token count of each line, in the same order the lines were flattened into X.
lengths = np.array(list(map(len, lines)))

Use 10 hidden states to keep things small. With `verbose=True`, each line of training output shows the iteration number, the log-likelihood, and the change in log-likelihood since the previous iteration.

In [50]:
# Fit a 10-state HMM on the concatenated sequences.
# random_state is fixed so that the random initialisation -- and therefore the
# fitted parameters and anything sampled from the model -- is repeatable
# across runs.
# NOTE(review): newer hmmlearn releases provide CategoricalHMM for sequences
# of integer symbol ids like X -- confirm against the installed version.
# Divide-by-zero warnings are silenced during fitting (presumably from taking
# the log of zero probabilities -- confirm).
with np.errstate(divide='ignore'):
    model = hmm.MultinomialHMM(
        n_components=10,
        n_iter=1000,
        verbose=True,
        random_state=0,
    ).fit(X, lengths)

         1     -392762.9003             +nan
         2     -303230.2240      +89532.6762
         3     -303200.6337         +29.5904
         4     -303143.0346         +57.5991
         5     -303028.7858        +114.2487
         6     -302812.5371        +216.2487
         7     -302429.9822        +382.5550
         8     -301810.6174        +619.3648
         9     -300941.8464        +868.7710
        10     -299968.4335        +973.4129
        11     -299138.1726        +830.2609
        12     -298590.3882        +547.7844
        13     -298279.6769        +310.7113
        14     -298094.9866        +184.6903
        15     -297962.3817        +132.6049
        16     -297839.6877        +122.6940
        17     -297694.8087        +144.8790
        18     -297495.4915        +199.3172
        19     -297206.8926        +288.5989
        20     -296798.2839        +408.6087
        21     -296254.5001        +543.7838
        22     -295582.9095        +671.5905
        23

In [51]:
# Number of observation symbols in the fitted model (presumably the
# vocabulary size -- confirm).
model.n_features

6751

In [52]:
# Shape of the state-transition matrix: one row and one column per hidden state.
model.transmat_.shape

(10, 10)

In [53]:
# Shape of the emission table: hidden states x observation symbols.
model.emissionprob_.shape

(10, 6751)

In [54]:
# Fitted start-state distribution, one entry per hidden state.
model.startprob_

array([  2.08406255e-019,   8.43051129e-122,   1.16612216e-001,
         0.00000000e+000,   3.63354679e-080,   4.47886646e-033,
         8.83387784e-001,   6.90723182e-184,   4.05294815e-135,
         2.93615792e-201])

In [55]:
# Most probable symbol in the most probable start state. The state index is
# computed from startprob_ rather than hard-coded, because which state ends up
# with the largest start probability can differ from one training run to the
# next.
np.argmax(model.emissionprob_[np.argmax(model.startprob_)])

5656

In [56]:
# Word for the most probable symbol in the most probable start state. The
# symbol id is derived from the model instead of being pasted from an earlier
# output, so the cell stays correct when the model is retrained.
vocab[int(np.argmax(model.emissionprob_[np.argmax(model.startprob_)]))]

u'and'

Generate a sample sequence from the HMM and map it back to words

In [60]:
# Draw ten observations from the fitted model, along with the hidden state
# sequence returned with them.
sample, hidden = model.sample(n_samples=10)

In [61]:
# First (only) column of the sampled observations as a flat array.
sample[:, 0]

array([5797, 5218, 4865, 1404, 1993, 4096,  839,  694, 5331, 1146])

In [62]:
" ".join(map(lambda x: vocab[x], sample.T[0]))

u'without their untrue obeys the sudden rage where quickly thou'

In [63]:
# Recent scikit-learn releases no longer ship `sklearn.externals.joblib`, so
# prefer the standalone package and fall back to the bundled copy on older
# installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [64]:
# Persist the fitted model in the current working directory.
joblib.dump(model, filename="hmm_10.pkl")

['hmm_10.pkl']