In [3]:
from nltk.corpus import brown

In [8]:
# Preview the tokenised sentences of the three Brown corpus categories
# (news, editorial, reviews) requested here.
brown.sents(categories=['news', 'editorial', 'reviews'])

[[u'The', u'Fulton', u'County', u'Grand', u'Jury', u'said', u'Friday', u'an', u'investigation', u'of', u"Atlanta's", u'recent', u'primary', u'election', u'produced', u'``', u'no', u'evidence', u"''", u'that', u'any', u'irregularities', u'took', u'place', u'.'], [u'The', u'jury', u'further', u'said', u'in', u'term-end', u'presentments', u'that', u'the', u'City', u'Executive', u'Committee', u',', u'which', u'had', u'over-all', u'charge', u'of', u'the', u'election', u',', u'``', u'deserves', u'the', u'praise', u'and', u'thanks', u'of', u'the', u'City', u'of', u'Atlanta', u"''", u'for', u'the', u'manner', u'in', u'which', u'the', u'election', u'was', u'conducted', u'.'], ...]

In [9]:
import string
# Lower-case every token so that later counts are case-insensitive.
# str.lower() is called on each token directly (rather than going through
# map(string.lower, ...)) so the result is a concrete list of lists on both
# Python 2 and Python 3: string.lower does not exist on Python 3, and map()
# there returns a one-shot iterator that could only be traversed once.
sentences = [
    [token.lower() for token in sentence]
    for sentence in brown.sents(categories=['news', 'editorial', 'reviews'])
]

In [11]:
# Number of hidden states (the unlabelled "parts of speech") in the model.
NUM_HIDDEN_STATES = 10
# Materialised as a list so it is the same concrete sequence on Python 2 and 3.
parts_of_speech = list(range(NUM_HIDDEN_STATES))

In [22]:
import random
# Fixed seed so the random initial tagging is the same on every run.
random.seed('hmm_em')

# Draw one random state per token; the result mirrors the nested
# sentence/token shape of `sentences`.
pos_assignments = []
for sentence in sentences:
    sentence_states = [random.choice(parts_of_speech) for token in sentence]
    pos_assignments.append(sentence_states)

In [23]:
from collections import Counter
# For each hidden state, count how often every token was assigned to it.
# Every state in parts_of_speech gets its own (possibly empty) Counter up
# front, then the corpus is walked a single time instead of once per state.
emissions = {pos: Counter() for pos in parts_of_speech}
for sentence, assignments in zip(sentences, pos_assignments):
    for token, assignment in zip(sentence, assignments):
        # Assignments that are not a known state are ignored, matching the
        # filtering behaviour of the per-state comprehension this replaces.
        if assignment in emissions:
            emissions[assignment][token] += 1

In [26]:
# Ten most frequent tokens counted under state 0.
emissions[0].most_common(10)

[(u'the', 1258),
 (u',', 990),
 (u'.', 825),
 (u'of', 644),
 (u'to', 435),
 (u'and', 422),
 (u'a', 400),
 (u'in', 397),
 (u'is', 206),
 (u'for', 187)]

In [35]:
# Ten most frequent tokens counted under state 9, for comparison with state 0.
emissions[9].most_common(10)

[(u'the', 1302),
 (u',', 1044),
 (u'.', 824),
 (u'of', 639),
 (u'to', 445),
 (u'and', 444),
 (u'a', 414),
 (u'in', 358),
 (u'is', 193),
 (u'that', 192)]

In [50]:
from collections import defaultdict
# Sentinel pseudo-states marking the beginning and the end of a sentence.
START, END = 'START', 'END'

# Tally, for every state (and START), which state comes immediately after it.
transmissions = defaultdict(Counter)
for pos_assignment in pos_assignments:
    # Pair each state with its successor; START precedes the first state and
    # END follows the last one.
    for prev_state, next_state in zip([START] + pos_assignment, pos_assignment + [END]):
        transmissions[prev_state][next_state] += 1
# Convert to a plain dict of Counters so unseen states are not auto-created.
transmissions = dict(transmissions)

In [51]:
# How often each state follows the START marker, i.e. opens a sentence.
transmissions[START].most_common()

[(3, 976),
 (5, 949),
 (6, 947),
 (2, 943),
 (7, 943),
 (8, 938),
 (1, 935),
 (9, 932),
 (0, 914),
 (4, 894)]

In [52]:
# Successor counts for state 3; the 'END' entry counts sentences whose last
# token was assigned state 3.
transmissions[3].most_common()

[(3, 2073),
 (6, 1994),
 (4, 1994),
 (9, 1990),
 (0, 1965),
 (5, 1943),
 (8, 1938),
 (7, 1892),
 (1, 1876),
 (2, 1867),
 ('END', 901)]

In [54]:
# Normalise each state's token counts into relative frequencies, so the
# values for one state sum to 1.
emission_probabilities = {}
for state, token_counts in emissions.items():
    # Compute the denominator once per state; evaluating sum() inside the
    # inner comprehension would re-add every count for every single token.
    # float() keeps the division a true division on Python 2 as well.
    total = float(sum(token_counts.values()))
    emission_probabilities[state] = {
        token: count / total
        for token, count in token_counts.items()
    }

In [60]:
# The 25 most probable tokens for state 3, highest probability first.
# The key indexes into the (token, probability) pair instead of using a
# tuple-unpacking lambda, which is a syntax error on Python 3.
sorted(emission_probabilities[3].items(), key=lambda item: item[1], reverse=True)[:25]

[(u'the', 0.06494396319678951),
 (u',', 0.04996818871433466),
 (u'.', 0.036950031811285665),
 (u'of', 0.02814075270395928),
 (u'and', 0.024763862379484167),
 (u'to', 0.023295649194929772),
 (u'a', 0.02060392502324671),
 (u'in', 0.018059022170019087),
 (u'is', 0.010130670973425342),
 (u'that', 0.009053981304752117),
 (u'for', 0.008319874712474917),
 (u"''", 0.008221993833504625),
 (u'``', 0.008173053394019478),
 (u'on', 0.006606959330494788),
 (u'he', 0.0064601380120393485),
 (u'with', 0.005872852738217589),
 (u'it', 0.00572603141976215),
 (u'as', 0.005628150540791856),
 (u'by', 0.005383448343366124),
 (u'be', 0.005089805706455244),
 (u'was', 0.005089805706455244),
 (u'at', 0.004991924827484951),
 (u'has', 0.004747222630059218),
 (u'his', 0.004355699114178046),
 (u'but', 0.004306758674692899)]

In [63]:
# Normalise each state's successor counts into relative frequencies, so the
# values for one state sum to 1.
transmission_probabilities = {}
for state, next_state_counts in transmissions.items():
    # Compute the denominator once per state rather than once per successor.
    # float() keeps the division a true division on Python 2 as well.
    total = float(sum(next_state_counts.values()))
    transmission_probabilities[state] = {
        next_state: count / total
        for next_state, count in next_state_counts.items()
    }

In [74]:
# Successor probabilities for state 3, highest probability first.
# The key indexes into the (next_state, probability) pair instead of using a
# tuple-unpacking lambda, which is a syntax error on Python 3.
sorted(transmission_probabilities[3].items(), key=lambda item: item[1], reverse=True)

[(3, 0.10145353105270885),
 (6, 0.09758723633338227),
 (4, 0.09758723633338227),
 (9, 0.09739147457544169),
 (0, 0.09616796358831302),
 (5, 0.0950912739196398),
 (8, 0.09484657172221407),
 (7, 0.09259531150589732),
 (1, 0.09181226447413497),
 (2, 0.09137180051876866),
 ('END', 0.04409533597611707)]