In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_context('notebook')
sns.set_style('white')

%matplotlib inline 

In [2]:
import nltk
nltk.download('brown')
import sys
from nltk.corpus import brown

[nltk_data] Downloading package brown to /Users/johnnie/nltk_data...
[nltk_data]   Package brown is already up-to-date!


### 词性概率统计（HMM问题1）

其实就是要求联合概率：

$$p(t_1 \dots t_N, w_1 \dots w_N)$$

动手之前我们先来看下词性统计应用HMM的贝叶斯公式：

$$\begin{align*}
p(t_1 \dots t_N \mid w_1 \dots w_N) &= \frac{p(t_1 \dots t_N)p(w_1 \dots w_N \mid t_1 \dots t_N)}{p(w_1 \dots w_N)} \\
&= \frac{\prod_{i=1}^Np(t_i \mid t_{i-1})\prod_{i=1}^Np(w_i \mid t_i)}{\prod_{i=1}^Np(w_i \mid w_{i-1})} \\
&=\frac{\prod_{i=1}^Np(t_i \mid t_{i-1})p(w_i \mid t_i)}{\prod_{i=1}^Np(w_i \mid w_{i-1})}
\end{align*}$$

NLTK需要对词添加开始和结束符号，用于标记开始和结束，用以下表示：

(START, START), (END, END)

In [3]:
brown_tags_words = []
for sent in brown.tagged_sents():
    brown_tags_words.append(('START', 'START'))
    brown_tags_words.extend([(tag[:2], word) for (word, tag) in sent])
    brown_tags_words.append(('END', 'END'))

In [4]:
brown_tags_words[:27]

[('START', 'START'),
 ('AT', 'The'),
 ('NP', 'Fulton'),
 ('NN', 'County'),
 ('JJ', 'Grand'),
 ('NN', 'Jury'),
 ('VB', 'said'),
 ('NR', 'Friday'),
 ('AT', 'an'),
 ('NN', 'investigation'),
 ('IN', 'of'),
 ('NP', "Atlanta's"),
 ('JJ', 'recent'),
 ('NN', 'primary'),
 ('NN', 'election'),
 ('VB', 'produced'),
 ('``', '``'),
 ('AT', 'no'),
 ('NN', 'evidence'),
 ("''", "''"),
 ('CS', 'that'),
 ('DT', 'any'),
 ('NN', 'irregularities'),
 ('VB', 'took'),
 ('NN', 'place'),
 ('.', '.'),
 ('END', 'END')]

统计词库中相应tag下的词出现的概率，即为条件概率：

$$p(w_i \mid t_i) = \frac{p(w_i, t_i)}{p(t_i)}$$

可以使用count频率统计获得各自的概率！

In [5]:
# 统计（词性，单词）出现的概率
cfg_tagwords = nltk.ConditionalFreqDist(brown_tags_words)
# 统计条件概率（单词 | 词性）的概率
cpd_tagwords = nltk.ConditionalProbDist(cfg_tagwords, nltk.MLEProbDist)

In [6]:
print("The probability of an adjetive (JJ) being 'new' is", cpd_tagwords["JJ"].prob("new"))
print("The probability of an verb (VB) being 'took' is", cpd_tagwords["VB"].prob('took'))

The probability of an adjetive (JJ) being 'new' is 0.01472344917632025
The probability of an verb (VB) being 'took' is 0.003668790248787141


再来看下词性转换的概率：

$$p(t_i \mid t_{i-1}) = \frac{p(t_i, t_{i-1})}{p(t_{i-1})}$$

In [7]:
brow_tags = [tag for (tag, word) in brown_tags_words]

In [8]:
# 统计（词性t_i，词性t_{i-1}）的概率，bigram是实现前后词性连接成一组的方法
cfd_tags = nltk.ConditionalFreqDist(nltk.bigrams(brow_tags))
# 统计条件概率（词性t_i | 词性t_{i-1}）
cpd_tags = nltk.ConditionalProbDist(cfd_tags, nltk.MLEProbDist)

In [9]:
print("The first one is 'DT', the probability of next one 'NN' is", cpd_tags["DT"].prob("NN"))
print("The first one is 'VB', the probability of next one 'JJ' is", cpd_tags["VB"].prob("JJ"))
print("The first one is 'VB', the probability of next one 'NN' is", cpd_tags["VB"].prob("NN"))

The first one is 'DT', the probability of next one 'NN' is 0.5057722522030194
The first one is 'VB', the probability of next one 'JJ' is 0.03443483365273389
The first one is 'VB', the probability of next one 'NN' is 0.10970977711020183


如果说，有这么一句话“I love you”，对应的tag，“PP VB PP”，看看它们的联合概率是多少...

In [10]:
prob_tagsequence = cpd_tags["START"].prob("PP") * cpd_tagwords["PP"].prob("I") * \
    cpd_tags["PP"].prob("VB") * cpd_tagwords["VB"].prob("love") * \
    cpd_tags["VB"].prob("PP") * cpd_tagwords["PP"].prob("you") * \
    cpd_tags["VB"].prob("END")

print( "The probability of the tag sequence 'START PP VB PP END' for 'I love you' is:", prob_tagsequence)

The probability of the tag sequence 'START PP VB PP END' for 'I love you' is: 7.038271983819719e-12


### 找最佳词性序列 （HMM问题2 - Viterbi实现）

In [11]:
distinct_tags = set(brow_tags)

In [12]:
sentence = ['I', 'love', 'you']
sentlen = len(sentence)

In [13]:
viterbi = []

In [14]:
backpointer = []

In [15]:
first_viterbi = {}
first_backpointer = {}
for tag in distinct_tags:
    if tag == 'START': continue
    first_viterbi[tag] = cpd_tags['START'].prob(tag) * cpd_tagwords[tag].prob(sentence[0])
    first_backpointer[tag] = 'START'
        
print('输出各词性概率：\n', first_viterbi)
print('\n')
print('输出词性序列：\n', first_backpointer)

输出各词性概率：
 {'AB': 0.0, ')-': 0.0, 'PN': 0.0, 'BE': 0.0, 'QL': 0.0, 'FW': 0.0, 'RN': 0.0, 'DT': 0.0, 'WP': 0.0, 'WR': 0.0, '(': 0.0, '.-': 0.0, 'AT': 0.0, 'AP': 0.0, 'MD': 0.0, 'WD': 0.0, 'HV': 0.0, '--': 0.0, 'NP': 1.7319067623793952e-06, 'EX': 0.0, 'END': 0.0, 'IN': 0.0, 'UH': 0.0, 'RB': 0.0, 'VB': 0.0, 'TO': 0.0, ':': 0.0, 'RP': 0.0, 'CS': 0.0, 'DO': 0.0, "''": 0.0, "'": 0.0, '``': 0.0, ')': 0.0, 'NN': 1.0580313619573935e-06, ',': 0.0, '*': 0.0, 'OD': 0.0, 'JJ': 0.0, ',-': 0.0, ':-': 0.0, 'WQ': 0.0, 'NI': 3.3324520848931064e-07, 'NR': 0.0, '.': 0.0, 'CD': 0.0, '(-': 0.0, 'CC': 0.0, '*-': 0.0, 'PP': 0.014930900689060006}


输出词性序列：
 {'AB': 'START', ')-': 'START', 'PN': 'START', 'BE': 'START', 'QL': 'START', 'FW': 'START', 'RN': 'START', 'DT': 'START', 'WP': 'START', 'WR': 'START', '(': 'START', '.-': 'START', 'AT': 'START', 'AP': 'START', 'MD': 'START', 'WD': 'START', 'HV': 'START', '--': 'START', 'NP': 'START', 'EX': 'START', 'END': 'START', 'IN': 'START', 'UH': 'START', 'RB': 'START',

In [16]:
viterbi.append(first_viterbi)
backpointer.append(first_backpointer)

In [17]:
currbest = max(first_viterbi.keys(), key = lambda tag: first_viterbi[tag])
print('Word', "'" + sentence[0] +"'", 'current best two-tag sequence: ', first_backpointer[currbest], currbest)

Word 'I' current best two-tag sequence:  START PP


In [18]:
for wordindex in range(1, len(sentence)):
    this_viterbi = {}
    this_backpointer = {}
    prev_viterbi = viterbi[-1]
    
    for tag in distinct_tags:
        if tag == 'START': continue
        best_previous = max(prev_viterbi.keys(),
                           key = lambda prevtag: \
            prev_viterbi[prevtag] * cpd_tags[prevtag].prob(tag) * cpd_tagwords[tag].prob(sentence[wordindex]))
        this_viterbi[tag] = prev_viterbi[best_previous] * \
            cpd_tags[best_previous].prob(tag) * cpd_tagwords[tag].prob(sentence[wordindex])
        this_backpointer[tag] = best_previous
        
    currbest = max(this_viterbi.keys(), key = lambda tag: this_viterbi[tag])
    print('Word', "'" + sentence[wordindex] +"'", 'current best two-tag sequence: ', this_backpointer[currbest], currbest)
    
    viterbi.append(this_viterbi)
    backpointer.append(this_backpointer)

Word 'love' current best two-tag sequence:  PP NN
Word 'you' current best two-tag sequence:  VB PP


In [19]:
prev_viterbi = viterbi[-1]
best_previous = max(prev_viterbi.keys(),
                  key = lambda prevtag: prev_viterbi[prevtag] * cpd_tags[prevtag].prob('END'))
prob_tagsequence = prev_viterbi[best_previous] * cpd_tags[best_previous].prob('END')
best_tagsequence = ['END', best_previous]
backpointer.reverse()

In [20]:
current_best_tag = best_previous
for bp in backpointer:
    best_tagsequence.append(bp[current_best_tag])
    current_best_tag = bp[current_best_tag]

In [21]:
best_tagsequence.reverse()
print('The sentence is: ', end=' ')
for w in sentence: print(w, end=' ')
print('\n')
print('The best tag sequence is: ', end=' ')
for t in best_tagsequence: print(t, end=' ')
print('\n')
print('The probability of the best tag sequence is: ', prob_tagsequence)

The sentence is:  I love you 

The best tag sequence is:  START PP VB PP END 

The probability of the best tag sequence is:  6.359015280071473e-13


In [1]:
from hmmlearn import base, hmm, stats, utils
from hmmlearn.hmm import GaussianHMM