## Test HMMLearn API 

Just trying out HMMs using the hmmlearn package.

In [1]:
import nltk
import string
import json
import numpy as np

from nltk.tokenize import RegexpTokenizer

In [2]:
def split_lines(filename):
    """
    Tokenize a poetry file into one list of lowercase tokens per line.

    Blank lines and lines consisting only of digits (presumably sonnet
    numbers) are skipped, as are lines that yield fewer than two tokens.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    list of list of str
        The tokens of every retained line, in file order.
    """
    # A token is a run of word characters that may contain apostrophes and
    # hyphens internally, or a single word character.
    # The pattern is a raw string so `\w` is not treated as an (invalid)
    # string escape, and the character class no longer contains `|`: inside
    # [...] a pipe is a literal character, not alternation, so the old
    # pattern kept pipes inside tokens.
    tokenizer = RegexpTokenizer(r"\w[\w'-]*\w|\w")

    line_tokens = []
    with open(filename) as f:
        for line in f:
            line = line.strip()
            # Skip blank lines and purely numeric lines.
            if not line or line.isdigit():
                continue

            tokens = tokenizer.tokenize(line.lower())

            # Single-token lines are dropped.
            if len(tokens) > 1:
                line_tokens.append(tokens)

    return line_tokens

In [3]:
# Text corpora to tokenize, in the order they are appended to `lines`.
files = [
    '../data/shakespeare.txt',
    '../data/shakespeare_xtra.txt',
    '../data/spenser.txt',
]

# One token list per retained line, across all corpora.
lines = []
for filename in files:
    lines += split_lines(filename)

In [4]:
# Load the vocabulary (id -> word once the keys are converted to integers).
# The context manager closes the file handle instead of leaking it.
with open('../models/words/vocab.json') as vocab_file:
    vocab = json.load(vocab_file)

In [5]:
# Change to integer keys (JSON object keys are always loaded as strings).
# Build a fresh dict rather than popping and re-inserting while looping over
# vocab.keys(): on Python 3 keys() is a live view, so mutating the dict during
# iteration can raise RuntimeError or revisit entries. Rebuilding is also safe
# to re-run, because int() of an integer key is a no-op.
vocab = {int(k): v for k, v in vocab.items()}

In [6]:
# Load the inverse vocabulary (word -> integer id).
# The context manager closes the file handle instead of leaking it.
with open('../models/words/inverted_vocab.json') as inverted_vocab_file:
    inverted_vocab = json.load(inverted_vocab_file)

In [7]:
from hmmlearn import hmm

In [8]:
# Replace every token by its vocabulary id and join all lines into one flat array.
X = np.concatenate([[inverted_vocab[token] for token in line] for line in lines])

In [9]:
# Turn the flat id array into a single column: one observation per row.
X = np.reshape(X, (-1, 1))

In [10]:
# Token count of each line, so the concatenated X can be split back into sequences.
lengths = np.array(list(map(len, lines)))

Use 10 hidden states to keep things small. With `verbose=True`, each line of output shows the iteration number, the log-likelihood, and the change in log-likelihood since the previous iteration.

In [None]:
# Fit a 10-state multinomial HMM on the concatenated sequences.
# random_state pins the random initialisation so that re-running the notebook
# reproduces the same fitted model (and the same draws from model.sample()).
model = hmm.MultinomialHMM(n_components=10, n_iter=1000, verbose=True, random_state=0)

# Silence numpy's divide-by-zero warnings during fitting
# (presumably triggered by taking the log of zero probabilities).
with np.errstate(divide='ignore'):
    model.fit(X, lengths)

         1     -482695.2551             +nan
         2     -368631.1423     +114064.1128
         3     -368605.9119         +25.2304
         4     -368557.0825         +48.8293
         5     -368458.7204         +98.3621
         6     -368266.8160        +191.9044
         7     -367917.4969        +349.3191
         8     -367346.5002        +570.9967
         9     -366536.9994        +809.5008
        10     -365568.8646        +968.1348
        11     -364617.8015        +951.0631
        12     -363860.1170        +757.6845
        13     -363344.2211        +515.8959
        14     -362992.0075        +352.2136
        15     -362694.7891        +297.2184
        16     -362359.4432        +335.3458
        17     -361901.5263        +457.9170
        18     -361234.0832        +667.4431
        19     -360273.1090        +960.9742
        20     -358962.1335       +1310.9755
        21     -357310.7482       +1651.3853
        22     -355422.0741       +1888.6741
        23

In [28]:
# Number of distinct emission symbols the fitted model works with.
model.n_features

7283

In [29]:
# Transition matrix shape: one row and one column per hidden state.
model.transmat_.shape

(10, 10)

In [30]:
# Emission matrix shape: one row per hidden state, one column per symbol.
model.emissionprob_.shape

(10, 7283)

In [31]:
# Initial-state distribution: one probability per hidden state.
model.startprob_

array([  1.29665930e-181,   6.60752861e-223,   9.36255582e-176,
         6.78909443e-001,   1.91020945e-121,   3.61238423e-003,
         1.73133459e-224,   0.00000000e+000,   1.52983393e-136,
         3.17478173e-001])

Check that each row of the emission probability matrix sums to 1.

In [32]:
# Row sums of the emission matrix; each should be 1.
model.emissionprob_.sum(axis=1)

array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.])

Test the HMM by sampling a sequence of words from it

In [33]:
# Draw a 10-step sequence from the fitted model. `sample` holds the emitted
# symbol ids; `hidden` is presumably the matching hidden-state sequence
# (per hmmlearn's sample() return order — TODO confirm).
sample, hidden = model.sample(10)

In [34]:
# The single column of `sample`, i.e. the emitted symbol ids as a 1-D array.
sample[:, 0]

array([ 230, 1936, 2131, 2703, 4296, 2071,  226, 7143, 1213, 1955])

In [35]:
# Translate the sampled ids back into words and join them into one string.
" ".join(vocab[word_id] for word_id in sample[:, 0])

u'letter thee thy heaven eyes ill shall will thou forty'

In [28]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone joblib package and fall back to the vendored
# copy only on old scikit-learn installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [37]:
# Persist the fitted model to disk; dump() returns the list of file paths it wrote.
joblib.dump(model, "../models/hmm_10.pkl")

['../models/hmm_10.pkl']

## Train Supervised HMM

In [3]:
# Text corpora to tokenize, in the order they are appended to `lines`.
files = [
    '../data/shakespeare.txt',
    '../data/shakespeare_xtra.txt',
    '../data/spenser.txt',
]

# One token list per retained line, across all corpora.
lines = []
for filename in files:
    lines += split_lines(filename)

In [4]:
pos = []       # POS-tag sequence of each line, parallel to `lines`
p_set = set()  # every distinct tag seen in the corpus
for line in lines:
    # nltk.pos_tag returns (word, tag) pairs; only the tags are needed, so
    # take them directly instead of unzipping into an unused word tuple.
    tags = [tag for _, tag in nltk.pos_tag(line)]
    pos.append(tags)
    p_set.update(tags)

In [5]:
# Number the tags in sorted order. Enumerating the raw set would give an
# arbitrary tag <-> index assignment (set order of strings can change between
# runs), so matrices saved from one run could not be interpreted in another.
# NOTE: this changes which index each tag gets relative to earlier runs.
sorted_tags = sorted(p_set)
pos_indices = {tag: idx for idx, tag in enumerate(sorted_tags)}  # tag -> index
indices_pos = dict(enumerate(sorted_tags))                       # index -> tag

In [10]:
# Word-id sequence of every line, reversed so each sequence runs last word first.
X = [[inverted_vocab[word] for word in tokens][::-1] for tokens in lines]

In [11]:
# POS-tag-id sequence of every line, reversed to stay aligned with X.
Y = [[pos_indices[tag] for tag in tags][::-1] for tags in pos]

In [12]:
# L: number of distinct POS tags; D: number of entries in the word -> id vocabulary.
L = len(indices_pos)
D = len(inverted_vocab)

In [13]:
# All-zero L x L and L x D matrices, plus a uniform length-L start vector.
A = np.zeros(shape=(L, L))
O = np.zeros(shape=(L, D))
A_start = np.ones(L) / L

In [4]:
from sonnetHMM import BackwardsSonnetHMM

In [21]:
# Build the project's BackwardsSonnetHMM from the all-zero A and O matrices and
# the uniform A_start vector.
# NOTE(review): presumably A is the transition matrix and O the observation
# matrix — confirm against sonnetHMM.
model = BackwardsSonnetHMM(A, O, A_start)

In [22]:
# Train on the labelled data: X holds the reversed word-id sequences and Y the
# matching reversed POS-tag-id sequences.
# NOTE(review): presumably this estimates A and O from counts — confirm in sonnetHMM.
model.supervised_learning(X, Y)

In [33]:
# Save model.A as plain text.
# NOTE(review): the matrix is presumably indexed by the pos_indices numbering,
# which this notebook never writes to disk — consider saving that mapping too.
np.savetxt("../models/pos_A.txt", model.A)

In [34]:
# Save model.O and model.A_start as plain text next to pos_A.txt.
np.savetxt("../models/pos_O.txt", model.O)
np.savetxt("../models/pos_A_start.txt", model.A_start)