# 使用贝叶斯分类器判断句子的情绪

In [1]:
import jieba
from collections import Counter

In [2]:
with open("pos_train.txt") as file:
    content = file.read()
pos_words = [i for i in jieba.cut(content)]
        
cnt_pos = Counter(pos_words)

with open("neg_train.txt") as file:
    content = file.read()
neg_words = [i for i in jieba.cut(content)]
        
cnt_neg = Counter(neg_words)

pos_total = float(sum(cnt_pos.values()))
neg_total = float(sum(cnt_neg.values()))

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.275 seconds.
Prefix dict has been built succesfully.


得到正面和负面两个条件下，各个词对应的频率。现在开始归一化。

In [3]:
def prob(cnt):
    total = float(sum(cnt.values()))
    for key, value in cnt.items():
        cnt[key] /= total

In [4]:
prob(cnt_pos)
prob(cnt_neg)

In [5]:
# Check the prob of pos words.
pos_prob = sum(cnt_pos.values())/(sum(cnt_pos.values())+sum(cnt_neg.values()))
print pos_prob

print sum(cnt_pos.values()), sum(cnt_neg.values())


0.5
1.0 1.0


In [6]:
def guess(s):
    words = [i for i in jieba.cut(s)]
    pos = 1
    neg = 1
    for word in words:
        # if the word has no value in cnt, skip it
        if cnt_pos[word]!= 0 and cnt_neg[word]!= 0:
            pos *= cnt_pos[word]
            neg *= cnt_neg[word]
    if pos > neg:
        # print "This sentence is positive."
        return 1
    elif neg > pos:
        # print "This sentence is negative."
        return 0
    else:
        print "neg = pos! float number sucks!"

In [7]:
def test():
    pos_pos = 0
    pos_neg = 0
    neg_neg = 0
    neg_pos = 0
    with open("pos_test.txt") as file:
        for line in file:
            pos_testwords = line
            if guess(pos_testwords) == 1:
                pos_pos += 1
            elif guess(pos_testwords) == 0:
                pos_neg += 1
    
    with open("neg_test.txt") as file:
        for line in file:
            neg_testwords = line
            if guess(neg_testwords) == 1:
                neg_pos += 1
            elif guess(neg_testwords) == 0:
                neg_neg += 1
    print 'Pos Right:', pos_pos, 'Pos Wrong:', pos_neg
    print 'Neg Right:', neg_neg, 'Neg Wrong:', neg_pos
    pos_pos = float(pos_pos)
    neg_pos = float(neg_pos)
    pos_neg = float(pos_neg)
    neg_neg = float(neg_neg)
    print 'Accuracy:', (pos_pos + neg_neg) / (pos_pos + neg_neg + pos_neg + neg_pos)
    print 'Misclassification Rate:', (pos_neg + neg_pos) / (pos_pos + neg_neg + pos_neg + neg_pos)
    print 'True Positive Rate:', pos_pos / (pos_pos + pos_neg)
    print 'False Positive Rate:', neg_pos / (neg_pos + neg_neg)
    print 'Specificity:', neg_neg / (neg_neg + pos_neg)
    print 'Precision:', pos_pos / (pos_pos + neg_pos)
    print 'Prevalence:', (pos_pos + pos_neg) / (pos_pos + neg_neg + pos_neg + neg_pos)

可以看到因为使用浮点数相乘，导致大量句子发生正面情绪概率等于负面情绪概率的情况。使用浮点数直接相乘、遇到未登录词直接跳过的方法得到结果如下：
```
Pos Right: 3437 Pos Wrong: 1251
Neg Right: 4465 Neg Wrong: 836
Accuracy: 0.791070177195
Misclassification Rate: 0.208929822805
True Positive Rate: 0.733148464164
False Positive Rate: 0.15770609319
Specificity: 0.781140657803
Precision: 0.804352913644
Prevalence: 0.469316247873
```

## 概率转化为对数

In [9]:
from math import log

with open("pos_train.txt") as file:
    content = file.read()
pos_words = [i for i in jieba.cut(content)]
        
cnt_pos = Counter(pos_words)

with open("neg_train.txt") as file:
    content = file.read()
neg_words = [i for i in jieba.cut(content)]
        
cnt_neg = Counter(neg_words)

In [10]:
def log_prob(cnt):
    total = float(sum(cnt.values()))
    for key, value in cnt.items():
        cnt[key] = log(cnt[key]/total,2)

log_prob(cnt_pos)
log_prob(cnt_neg)

In [11]:
def log_guess(s):
    words = [i for i in jieba.cut(s)]
    pos = 0
    neg = 0
    for word in words:
        # if the word has no value in cnt, skip it
        if cnt_pos[word] != 0 and cnt_neg[word] != 0:
            pos += cnt_pos[word]
            neg += cnt_neg[word]
    if pos > neg:
        # print "This sentence is positive."
        return 1
    elif neg > pos:
        # print "This sentence is negative."
        return 0
    else:
        print "neg = pos!"
        print s

In [12]:
def log_test():
    pos_pos = 0
    pos_neg = 0
    neg_neg = 0
    neg_pos = 0
    with open("pos_test.txt") as file:
        for line in file:
            pos_testwords = line
            if log_guess(pos_testwords) == 1:
                pos_pos += 1
            elif log_guess(pos_testwords) == 0:
                pos_neg += 1
    
    with open("neg_test.txt") as file:
        for line in file:
            neg_testwords = line
            if log_guess(neg_testwords) == 1:
                neg_pos += 1
            elif log_guess(neg_testwords) == 0:
                neg_neg += 1
    print 'Pos Right:', pos_pos, 'Pos Wrong:', pos_neg
    print 'Neg Right:', neg_neg, 'Neg Wrong:', neg_pos
    pos_pos = float(pos_pos)
    neg_pos = float(neg_pos)
    pos_neg = float(pos_neg)
    neg_neg = float(neg_neg)
    print 'Accuracy:', (pos_pos + neg_neg) / (pos_pos + neg_neg + pos_neg + neg_pos)
    print 'Misclassification Rate:', (pos_neg + neg_pos) / (pos_pos + neg_neg + pos_neg + neg_pos)
    print 'True Positive Rate:', pos_pos / (pos_pos + pos_neg)
    print 'False Positive Rate:', neg_pos / (neg_pos + neg_neg)
    print 'Specificity:', neg_neg / (neg_neg + pos_neg)
    print 'Precision:', pos_pos / (pos_pos + neg_pos)
    print 'Prevalence:', (pos_pos + pos_neg) / (pos_pos + neg_neg + pos_neg + neg_pos)

In [13]:
log_test()

Pos Right: 3656 Pos Wrong: 1309
Neg Right: 4687 Neg Wrong: 886
Accuracy: 0.791706206111
Misclassification Rate: 0.208293793889
True Positive Rate: 0.73635448137
False Positive Rate: 0.158980800287
Specificity: 0.781687791861
Precision: 0.804931748129
Prevalence: 0.471152021256


使用浮点数直接相乘、遇到未登录词直接跳过的方法得到结果如下：
```
Pos Right: 3437 Pos Wrong: 1251
Neg Right: 4465 Neg Wrong: 836
Accuracy: 0.791070177195
Misclassification Rate: 0.208929822805
True Positive Rate: 0.733148464164
False Positive Rate: 0.15770609319
Specificity: 0.781140657803
Precision: 0.804352913644
Prevalence: 0.469316247873
```

使用浮点数取对数相加、遇到未登录词直接跳过的方法得到结果如下：
```
Pos Right: 3656 Pos Wrong: 1309
Neg Right: 4687 Neg Wrong: 886
Accuracy: 0.791706206111
Misclassification Rate: 0.208293793889
True Positive Rate: 0.73635448137
False Positive Rate: 0.158980800287
Specificity: 0.781687791861
Precision: 0.804931748129
Prevalence: 0.471152021256
```

可以看出使用浮点数取对数相加使得 pos = neg 的情况减少为0。

## 拉普拉斯平滑

当遇到未登录词时，引入拉普拉斯平滑。

In [14]:
def laplace_log_guess(s):
    words = [i for i in jieba.cut(s)]
    pos = 0
    neg = 0
    for word in words:
        if cnt_pos[word] == 0:
            pos += log(1/(pos_total+1.0),2)
        if cnt_neg[word] == 0:
            neg += log(1/(neg_total+1.0),2)
        if cnt_pos[word] != 0:
            pos += cnt_pos[word]
        if cnt_neg[word] != 0:
            neg += cnt_neg[word]
    if pos > neg:
        # print "This sentence is positive."
        return 1
    elif neg > pos:
        # print "This sentence is negative."
        return 0
    else:
        print "neg = pos!"
        print s

In [15]:
def laplace_log_test():
    pos_pos = 0
    pos_neg = 0
    neg_neg = 0
    neg_pos = 0
    with open("pos_test.txt") as file:
        for line in file:
            pos_testwords = line
            if laplace_log_guess(pos_testwords) == 1:
                pos_pos += 1
            elif laplace_log_guess(pos_testwords) == 0:
                pos_neg += 1
    
    with open("neg_test.txt") as file:
        for line in file:
            neg_testwords = line
            if laplace_log_guess(neg_testwords) == 1:
                neg_pos += 1
            elif laplace_log_guess(neg_testwords) == 0:
                neg_neg += 1
    print 'Pos Right:', pos_pos, 'Pos Wrong:', pos_neg
    print 'Neg Right:', neg_neg, 'Neg Wrong:', neg_pos
    pos_pos = float(pos_pos)
    neg_pos = float(neg_pos)
    pos_neg = float(pos_neg)
    neg_neg = float(neg_neg)
    print 'Accuracy:', (pos_pos + neg_neg) / (pos_pos + neg_neg + pos_neg + neg_pos)
    print 'Misclassification Rate:', (pos_neg + neg_pos) / (pos_pos + neg_neg + pos_neg + neg_pos)
    print 'True Positive Rate:', pos_pos / (pos_pos + pos_neg)
    print 'False Positive Rate:', neg_pos / (neg_pos + neg_neg)
    print 'Specificity:', neg_neg / (neg_neg + pos_neg)
    print 'Precision:', pos_pos / (pos_pos + neg_pos)
    print 'Prevalence:', (pos_pos + pos_neg) / (pos_pos + neg_neg + pos_neg + neg_pos)

In [16]:
laplace_log_test()

Pos Right: 3772 Pos Wrong: 1193
Neg Right: 4812 Neg Wrong: 761
Accuracy: 0.814575820839
Misclassification Rate: 0.185424179161
True Positive Rate: 0.759718026183
False Positive Rate: 0.13655122914
Specificity: 0.801332223147
Precision: 0.832120008824
Prevalence: 0.471152021256


使用浮点数直接相乘、遇到未登录词直接跳过的方法得到结果如下：
```
Pos Right: 3437 Pos Wrong: 1251
Neg Right: 4465 Neg Wrong: 836
Accuracy: 0.791070177195
Misclassification Rate: 0.208929822805
True Positive Rate: 0.733148464164
False Positive Rate: 0.15770609319
Specificity: 0.781140657803
Precision: 0.804352913644
Prevalence: 0.469316247873
```

使用浮点数取对数相加、遇到未登录词直接跳过的方法得到结果如下：
```
Pos Right: 3656 Pos Wrong: 1309
Neg Right: 4687 Neg Wrong: 886
Accuracy: 0.791706206111
Misclassification Rate: 0.208293793889
True Positive Rate: 0.73635448137
False Positive Rate: 0.158980800287
Specificity: 0.781687791861
Precision: 0.804931748129
Prevalence: 0.471152021256
```

使用浮点数取对数相加、遇到未登录词用拉普拉斯平滑的方法得到结果如下：
```
Pos Right: 3772 Pos Wrong: 1193
Neg Right: 4812 Neg Wrong: 761
Accuracy: 0.814575820839
Misclassification Rate: 0.185424179161
True Positive Rate: 0.759718026183
False Positive Rate: 0.13655122914
Specificity: 0.801332223147
Precision: 0.832120008824
Prevalence: 0.471152021256
```

可以看出拉普拉斯平滑确实有帮助！但从直觉上不是很好理解它是怎么让正确率提高的。