## Test HMMLearn API 

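This notebook tries out hidden Markov models (HMMs) with the `hmmlearn` package: it tokenizes Shakespeare text, fits an HMM to the resulting word indices, and samples a short word sequence from the fitted model.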

In [24]:
import nltk
import string
import json
import numpy as np

from nltk.tokenize import RegexpTokenizer

In [137]:
def split_lines(filename):
    """
    Tokenizes the file and returns a list of tokens for
    each line of poetry in the file.

    Blank lines and lines made up only of digits are skipped. Every
    remaining line is lower-cased before it is tokenized.

    Args:
        filename: path of the text file to read.

    Returns:
        A list with one entry per kept line; each entry is the list of
        tokens found on that line.
    """
    # A token is a run of word characters, apostrophes and hyphens.
    # Inside a character class `|` is a literal pipe rather than alternation,
    # so it must not be listed, otherwise "a|b" survives as a single token.
    # The raw string stops `\w` from being read as a string escape.
    tokenizer = RegexpTokenizer(r"[\w'-]+")

    line_tokens = []
    with open(filename) as f:
        for line in f:
            line = line.strip()
            # Skip blank lines and all-digit lines.
            if not line or line.isdigit():
                continue
            line_tokens.append(tokenizer.tokenize(line.lower()))

    return line_tokens

In [138]:
# Poetry corpora to tokenize.
files = ['data/shakespeare.txt', 'data/shakespeare_xtra.txt']

# Collect the per-line token lists of every file into one list.
lines = []
for filename in files:
    # Call the `split_lines` helper; the earlier `tokenize(filename)` call
    # used a name that is not defined in the visible notebook, so this cell
    # failed with NameError on a fresh kernel.
    lines.extend(split_lines(filename))

In [139]:
# Load the vocabulary mapping. The `with` block closes the file as soon as
# the JSON has been parsed instead of leaving the handle open.
with open('models/shakespeare_words/shakespeare_vocab.json') as vocab_file:
    vocab = json.load(vocab_file)

In [143]:
# Change to integer keys
# JSON object keys are always strings, so rebuild the mapping with int keys.
# A fresh dict is built instead of popping/inserting while looping over
# vocab.keys(): mutating a dict during iteration is unsafe on Python 3
# (it raises RuntimeError or revisits keys). Re-running this cell is harmless.
vocab = {int(k): v for k, v in vocab.items()}

In [144]:
# Load the inverted vocabulary (looked up by token further down). The `with`
# block closes the file once parsed instead of leaving the handle open.
with open('models/shakespeare_words/shakespeare_inverted_vocab.json') as inverted_vocab_file:
    inverted_vocab = json.load(inverted_vocab_file)

In [145]:
from hmmlearn import hmm

In [146]:
# Look every token up in the inverted vocabulary, line by line, then join
# the per-line results end to end into a single array.
token_ids_per_line = [[inverted_vocab[token] for token in line] for line in lines]
X = np.concatenate(token_ids_per_line)

In [147]:
# Turn the flat array into a single-column 2-D array (one row per entry).
X = np.reshape(X, (-1, 1))

In [148]:
# Number of tokens in each line, in the same order as `lines`.
lengths = np.array(list(map(len, lines)))

Use 10 hidden states to keep things small. With `verbose=True`, each row of the training log shows the iteration number, the log-likelihood at that iteration, and the change in log-likelihood from the previous iteration.

In [149]:
# Fit a 10-state HMM on the encoded corpus, running at most 100 EM iterations.
# Training starts from a random initialisation, so the seed is fixed to make
# the fit repeatable after a kernel restart; `model.sample` also falls back to
# this random_state when it is not given one of its own.
# NOTE(review): newer hmmlearn releases provide this categorical-symbol model
# as `CategoricalHMM` - confirm against the installed version.
# np.errstate silences numpy divide-by-zero warnings raised while fitting.
with np.errstate(divide='ignore'):
    model = hmm.MultinomialHMM(
        n_components=10, verbose=True, n_iter=100, random_state=0
    ).fit(X, lengths)

         1     -393602.0402             +nan
         2     -303234.3343      +90367.7059
         3     -303209.4533         +24.8810
         4     -303158.7449         +50.7084
         5     -303051.7616        +106.9833
         6     -302832.3394        +219.4222
         7     -302408.6995        +423.6399
         8     -301676.3342        +732.3652
         9     -300646.7714       +1029.5628
        10     -299604.3372       +1042.4341
        11     -298867.7790        +736.5582
        12     -298440.9457        +426.8334
        13     -298167.4229        +273.5228
        14     -297934.8488        +232.5740
        15     -297681.7100        +253.1389
        16     -297365.6720        +316.0380
        17     -296955.0520        +410.6200
        18     -296434.4197        +520.6323
        19     -295804.6282        +629.7914
        20     -295080.1548        +724.4734
        21     -294272.1765        +807.9783
        22     -293366.5581        +905.6184
        23

In [150]:
# Size of the emission alphabet the fitted model works with (it equals the
# second dimension of `model.emissionprob_`).
model.n_features

6751

In [151]:
# Shape of the state-transition matrix: one row and one column per hidden state.
model.transmat_.shape

(10, 10)

In [152]:
# Shape of the emission matrix: one row per hidden state, one column per symbol.
model.emissionprob_.shape

(10, 6751)

In [153]:
# Fitted start probabilities, one entry per hidden state.
model.startprob_

array([  9.99999981e-01,   1.15490802e-44,   1.85644232e-08,
         8.45699884e-18,   1.35766007e-27,   4.95112382e-10,
         2.37446847e-22,   3.30822775e-16,   1.43038560e-26,
         8.21258867e-42])

In [155]:
# Column index with the largest emission probability in row 0 (hidden state 0).
model.emissionprob_[0].argmax()

5656

In [156]:
# Word with the largest emission probability in hidden state 0. The index is
# computed from the fitted model instead of being copied from an earlier
# output, so the cell stays correct when a refit changes the argmax.
vocab[int(np.argmax(model.emissionprob_[0]))]

u'and'

Draw a 10-step sample from the fitted HMM and map the sampled indices back to words.

In [157]:
# Draw 10 steps from the fitted model: `sample` receives the emitted symbols
# and `hidden` the second value returned by `model.sample`.
sample, hidden = model.sample(n_samples=10)

In [158]:
# First (and only) column of the sample, as a flat array of symbol indices.
sample[:, 0]

array([6446, 1809, 1993, 2338,  939, 5656, 2625, 3962, 2605, 5656])

In [159]:
" ".join(map(lambda x: vocab[x], sample.T[0]))

u'whilst then the cannot beheld and publish thine vomit and'