## Test HMMLearn API 

Just trying out HMMs using the hmmlearn package.

In [1]:
import nltk
import string
import json
import numpy as np

from nltk.tokenize import RegexpTokenizer

In [22]:
def split_lines(filename):
    """
    Tokenizes the file and returns a list of tokens for
    each line of poetry in the file.

    Every line is stripped and lower-cased before tokenizing. Blank lines,
    lines made up only of digits (presumably poem numbers), and lines that
    yield fewer than two tokens are skipped.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    list of list of str
        One list of lower-cased tokens per retained line, in file order.
    """
    # Keep apostrophes and hyphens inside words. The pattern is a raw string
    # so that `\w` is not parsed as a string escape, and the character class
    # has no `|`: inside [...] a pipe is a literal character, not alternation.
    tokenizer = RegexpTokenizer(r"\w[\w'-]*\w|\w")

    line_tokens = []
    with open(filename) as f:
        for line in f:
            line = line.strip()
            # Skip blank lines and number-only lines.
            if not line or line.isdigit():
                continue

            tokens = tokenizer.tokenize(line.lower())

            # Single-token lines are not kept.
            if len(tokens) > 1:
                line_tokens.append(tokens)

    return line_tokens

In [34]:
# Corpus files to tokenize, relative to the notebook's working directory.
files = [
    '../data/shakespeare.txt',
    '../data/shakespeare_xtra.txt',
    '../data/spenser.txt',
]

# Pool the token lists of all files into one flat list of lines.
lines = []
for corpus_path in files:
    lines += split_lines(corpus_path)

In [35]:
# Load the index -> word vocabulary. The `with` block closes the file handle
# instead of leaving it open until garbage collection.
with open('../models/words/vocab.json') as vocab_file:
    vocab = json.load(vocab_file)

In [36]:
# Change to integer keys (JSON object keys are always loaded as strings).
# A new dict is built rather than popping and inserting while looping over
# `vocab.keys()`: on Python 3 that is a live view, and changing the dict
# during iteration can raise RuntimeError or revisit keys.
vocab = {int(k): v for k, v in vocab.items()}

In [37]:
# Load the word -> index vocabulary. The `with` block closes the file handle
# instead of leaving it open until garbage collection.
with open('../models/words/inverted_vocab.json') as inverted_vocab_file:
    inverted_vocab = json.load(inverted_vocab_file)

In [38]:
from hmmlearn import hmm

In [39]:
# Encode every line as word indices and join all lines into one 1-D array.
X = np.concatenate([[inverted_vocab[word] for word in line] for line in lines])

In [40]:
# Turn the flat index array into a single column (one row per observed word),
# the 2-D layout that is passed to `fit` below.
X = X.reshape(-1, 1)

In [41]:
# Token count of each line, in the same order the lines were concatenated.
lengths = np.array(list(map(len, lines)))

Use 5 hidden states, to keep things small. With `verbose=True`, each output row shows the iteration number, the log-likelihood, and the change in log-likelihood from the previous iteration.

In [42]:
# Fit a 5-state HMM with at most 1000 training iterations. `lengths` tells
# hmmlearn where each line ends inside the concatenated X.
# A fixed `random_state` makes the random initialisation, and therefore the
# fitted model, reproducible across kernel restarts.
# np.errstate silences divide-by-zero warnings raised while fitting
# (presumably from taking the log of zero probabilities).
# NOTE(review): in recent hmmlearn releases this categorical model is called
# `hmm.CategoricalHMM` and `MultinomialHMM` has different semantics -- confirm
# against the installed version.
with np.errstate(divide='ignore'):
    model = hmm.MultinomialHMM(
        n_components=5, n_iter=1000, random_state=0, verbose=True
    ).fit(X, lengths)

         1     -486565.5938             +nan
         2     -368631.7344     +117933.8594
         3     -368610.5098         +21.2246
         4     -368573.8502         +36.6596
         5     -368507.3169         +66.5333
         6     -368390.0365        +117.2804
         7     -368195.1318        +194.9046
         8     -367895.6521        +299.4797
         9     -367473.4479        +422.2043
        10     -366926.1668        +547.2811
        11     -366272.5601        +653.6067
        12     -365556.3379        +716.2222
        13     -364844.1331        +712.2048
        14     -364205.4037        +638.7294
        15     -363675.2564        +530.1473
        16     -363237.7844        +437.4720
        17     -362843.1188        +394.6656
        18     -362430.7612        +412.3576
        19     -361940.1638        +490.5975
        20     -361315.4597        +624.7041
        21     -360518.5195        +796.9402
        22     -359550.6359        +967.8836
        23

In [43]:
# Number of distinct observation symbols the fitted model emits over.
model.n_features

7320

In [44]:
# Transition matrix: one row and one column per hidden state.
model.transmat_.shape

(5, 5)

In [45]:
# Emission matrix: one row per hidden state, one column per symbol.
model.emissionprob_.shape

(5, 7320)

In [46]:
# Initial-state distribution of the fitted model (one entry per hidden state).
model.startprob_

array([  6.19259949e-004,   1.52922717e-013,   1.02180569e-008,
         9.99380730e-001,   1.08651237e-301])

Check matrix rows sum to 1

In [47]:
# Each hidden state's emission probabilities should add up to 1.
model.emissionprob_.sum(axis=1)

array([ 1.,  1.,  1.,  1.,  1.])

Draw a sample sequence from the HMM

In [48]:
# Draw 10 observations from the fitted model, together with the hidden
# state sequence that generated them.
sample, hidden = model.sample(10)

In [49]:
# The sample is a single column; transpose and take row 0 to view the drawn
# word indices as a flat array.
sample.T[0]

array([ 936, 3156, 6829, 2224, 5934,  580, 4743, 1140, 2347, 2414])

In [50]:
# Decode the sampled word indices back into a space-separated string.
" ".join(vocab[idx] for idx in sample.T[0])

u"to she's piteous war when so hard self can not"

In [51]:
# joblib is a standalone package. The copy vendored as
# `sklearn.externals.joblib` was deprecated and later removed from
# scikit-learn, so prefer the real package and fall back only on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [52]:
# Persist the fitted 5-state HMM; the cell output lists the file(s) written.
joblib.dump(model, "../models/hmm_5.pkl")

['../models/hmm_5.pkl']

## Train Supervised HMM

In [3]:
# Corpus files to tokenize, relative to the notebook's working directory.
files = [
    '../data/shakespeare.txt',
    '../data/shakespeare_xtra.txt',
    '../data/spenser.txt',
]

# Pool the token lists of all files into one flat list of lines.
lines = []
for corpus_path in files:
    lines += split_lines(corpus_path)

In [4]:
pos = []       # POS-tag sequence of every line, parallel to `lines`
p_set = set()  # every distinct POS tag seen in the corpus
for tokens in lines:
    # pos_tag yields (word, tag) pairs; only the tags are kept.
    _words, line_tags = zip(*nltk.pos_tag(tokens))
    pos.append(list(line_tags))
    p_set.update(line_tags)

In [5]:
# Map each POS tag to an integer index and back. The tags are sorted first so
# the numbering is identical on every run: set iteration order is not
# guaranteed to be stable across interpreter sessions, and the matrices saved
# later in this notebook are only interpretable with a reproducible mapping.
tag_order = sorted(p_set)
pos_indices = {tag: idx for idx, tag in enumerate(tag_order)}
indices_pos = dict(enumerate(tag_order))

In [10]:
# Word-index sequence for every line, stored last word first.
X = [[inverted_vocab[word] for word in reversed(tokens)] for tokens in lines]

In [11]:
# POS-index sequence for every line, reversed the same way as X.
Y = [[pos_indices[tag] for tag in reversed(line_tags)] for line_tags in pos]

In [12]:
L = len(indices_pos)     # number of distinct POS tags
D = len(inverted_vocab)  # number of words in the vocabulary

In [13]:
# Zero-filled L x L and L x D matrices, plus a uniform length-L vector
# (every entry equal to 1 / L).
A = np.zeros(shape=(L, L))
O = np.zeros(shape=(L, D))
A_start = np.ones(L) / L

In [4]:
from sonnetHMM import BackwardsSonnetHMM

In [21]:
# Build the project's BackwardsSonnetHMM from the A, O and A_start arrays.
# NOTE(review): this rebinds `model`, which earlier in the notebook referred
# to the hmmlearn model.
model = BackwardsSonnetHMM(A, O, A_start)

In [22]:
# Train on the reversed word-index sequences X, with the reversed POS-index
# sequences Y as their labels.
# NOTE(review): presumably this updates model.A / model.O / model.A_start in
# place, since those are what get saved afterwards -- confirm in sonnetHMM.
model.supervised_learning(X, Y)

In [33]:
# Write the model's A matrix as plain text.
# NOTE(review): its rows/columns are ordered by `pos_indices`, which is not
# saved alongside it -- confirm the mapping can be reproduced when loading.
np.savetxt("../models/pos_A.txt", model.A)

In [34]:
# Write the model's O matrix and A_start vector as plain text, next to pos_A.
np.savetxt("../models/pos_O.txt", model.O)
np.savetxt("../models/pos_A_start.txt", model.A_start)