In [7]:
# 分类
import nltk
from nltk.corpus import names
import random

names = ([(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')])
random.shuffle(names)


def gender_features(word):
    return {'last_letter': word[-1]}

# 获取特征集/训练集/测试集
featuresets = [(gender_features(n), g) for (n,g) in names]
train_set, test_set = featuresets[500:], featuresets[:500]

# 分类器，朴素贝叶斯
classifier = nltk.NaiveBayesClassifier.train(train_set)

classifier.classify(gender_features('Neo'))
classifier.classify(gender_features('Kate'))


'female'

In [9]:
print(nltk.classify.accuracy(classifier, test_set))

0.754


In [11]:
# 确定哪些特征对于区分姓名的性别最有效
classifier.show_most_informative_features(5)

# 后面的比例被称为 似然比，可以用于比较不同特征-结果关系（有点类似于特征对结果的影响程度）

Most Informative Features
             last_letter = 'k'              male : female =     45.8 : 1.0
             last_letter = 'a'            female : male   =     38.7 : 1.0
             last_letter = 'f'              male : female =     15.3 : 1.0
             last_letter = 'v'              male : female =     11.2 : 1.0
             last_letter = 'p'              male : female =     11.2 : 1.0


In [29]:
# 更改特征集
def new_gender_features(word):
    return {
        'last_letter': word[-1],
        'name_length': len(word)
    }

featuresets = [(new_gender_features(n), g) for (n,g) in names]
train_set, test_set = featuresets[500:], featuresets[:500]
print(train_set[1])

classifier = nltk.NaiveBayesClassifier.train(train_set)

classifier.classify(gender_features('Kate'))

({'last_letter': 'o', 'name_length': 8}, 'male')


'female'

In [31]:
classifier.show_most_informative_features(4)

Most Informative Features
             last_letter = 'k'              male : female =     45.8 : 1.0
             last_letter = 'a'            female : male   =     38.7 : 1.0
             last_letter = 'f'              male : female =     15.3 : 1.0
             last_letter = 'v'              male : female =     11.2 : 1.0


In [33]:
nltk.classify.accuracy(classifier, test_set)

0.758

In [None]:
# 特征的选取十分重要
# 列出所有特征，检查哪些影响比较大
# 可能会产生欠拟合和过拟合现象

# 数据集分为训练集，测试开发集和测试集
# 利用测试开发集来判断哪些信息导致错误的出现

In [49]:
# 例如下面这个例子

train_names = names[1500:]
devtest_names = names[500:1500]
test_names = names[:500]

# print(train_names[1])

train_set = [(gender_features(n), g) for (n,g) in train_names]
devtest_set = [(gender_features(n), g) for (n,g) in devtest_names]
test_set = [(gender_features(n), g) for (n,g) in test_names]

# print(train_set[1])

classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

('Dewey', 'male')
({'last_letter': 'y'}, 'male')
0.754


In [54]:
errors = []

for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append((tag, guess, name))
        
for i in errors:
    i[2]

('male', 'female', 'Eli')
('female', 'male', 'Coleen')
('female', 'male', 'Ardis')
('male', 'female', 'Timmie')
('female', 'male', 'Wilow')
('male', 'female', 'Noble')
('female', 'male', 'Vivian')
('female', 'male', 'Harriett')
('female', 'male', 'Madel')
('female', 'male', 'Estell')
('male', 'female', 'Jay')
('female', 'male', 'Shannen')
('female', 'male', 'Doralynn')
('female', 'male', 'Mellisent')
('female', 'male', 'Ethelyn')
('male', 'female', 'Guillaume')
('female', 'male', 'Murial')
('male', 'female', 'Jeth')
('female', 'male', 'Sheryl')
('female', 'male', 'Joann')
('male', 'female', 'Jae')
('female', 'male', 'Calypso')
('female', 'male', 'Robinett')
('female', 'male', 'Daveen')
('male', 'female', 'Yehudi')
('female', 'male', 'Fan')
('male', 'female', 'Hillary')
('male', 'female', 'Jorge')
('female', 'male', 'Meriel')
('male', 'female', 'Nikolai')
('male', 'female', 'Jerri')
('female', 'male', 'Shir')
('female', 'male', 'Margret')
('female', 'male', 'Chrystel')
('male', 'female'

In [None]:
# 分析后可以查看哪个字母结尾的名字对结果造成误判较多

In [73]:

# 将电影评论分正负评论的例子

from nltk.corpus import movie_reviews
# movie_reviews.categories()
# movie_reviews.fileids(category)

documents = [ (list(movie_reviews.words(fileid)), category) 
                for category in movie_reviews.categories()
                for fileid in movie_reviews.fileids(category)
            ]

random.shuffle(documents)
print(documents[1])

(['can', 'a', 'horror', 'movie', 'truly', 'be', 'called', 'a', 'horror', 'movie', 'if', 'it', 'has', 'no', 'scares', ',', 'suspense', ',', 'or', 'even', 'eerie', 'elements', '?', 'i', 'think', 'not', ',', 'but', 'that', "'", 's', 'what', 'children', 'of', 'the', 'corn', '666', ':', 'issac', "'", 's', 'return', 'wants', 'us', 'to', 'believe', '.', 'the', 'sixth', 'installment', 'in', 'the', 'horrible', ',', 'worn', 'out', 'series', 'is', 'by', 'far', 'the', 'worst', 'to', 'date', '.', 'unlike', 'the', 'other', 'five', 'chapters', ',', 'children', 'of', 'the', 'corn', '666', 'is', 'a', 'confusing', ',', 'brainless', 'thriller', 'that', 'takes', 'the', 'psychological', 'horror', 'route', 'rather', 'than', 'slasher', 'horror', ',', 'but', 'either', 'way', ',', 'none', 'of', 'these', 'movies', 'are', 'the', 'least', 'bit', 'scary', '.', 'the', 'film', 'follows', 'hannah', '(', 'natalie', 'ramsey', ')', 'a', 'teen', 'looking', 'for', 'her', 'mother', 'in', 'gatlin', ',', 'nebraska', ',', 'on

In [84]:
# 文本分类的特征提取器
# 语料库中前2000个最频繁的词
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
# print(type(all_words))
# list(all_words.keys())[:2000]


word_features = list(all_words.keys())[:2000]

def document_features(document):
    document_words = set(document)
    features = {}
    
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    
    return features

print(document_features(movie_reviews.words('pos/cv957_8737.txt')))

{'contains(plot)': True, 'contains(:)': True, 'contains(two)': True, 'contains(teen)': False, 'contains(couples)': False, 'contains(go)': False, 'contains(to)': True, 'contains(a)': True, 'contains(church)': False, 'contains(party)': False, 'contains(,)': True, 'contains(drink)': False, 'contains(and)': True, 'contains(then)': True, 'contains(drive)': False, 'contains(.)': True, 'contains(they)': True, 'contains(get)': True, 'contains(into)': True, 'contains(an)': True, 'contains(accident)': False, 'contains(one)': True, 'contains(of)': True, 'contains(the)': True, 'contains(guys)': False, 'contains(dies)': False, 'contains(but)': True, 'contains(his)': True, 'contains(girlfriend)': True, 'contains(continues)': False, 'contains(see)': False, 'contains(him)': True, 'contains(in)': True, 'contains(her)': False, 'contains(life)': False, 'contains(has)': True, 'contains(nightmares)': False, 'contains(what)': True, "contains(')": True, 'contains(s)': True, 'contains(deal)': False, 'contains

In [85]:
# 训练一个分类器

featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[:100], featuresets[:100]

classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.99


In [87]:
classifier.show_most_informative_features(3)

Most Informative Features
     contains(memorable) = True              pos : neg    =      6.9 : 1.0
         contains(voice) = True              pos : neg    =      6.9 : 1.0
            contains(mr) = True              neg : pos    =      5.8 : 1.0


In [91]:
# 决策树

from nltk.corpus import brown
suffix_fdist = nltk.FreqDist()
    
common_suffixes = list(suffix_fdist.keys())[:100]

In [93]:
def pos_features(word):
    features = {}
    for suffix in common_suffixes:
        features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
    return features

tagged_words = brown.tagged_words(categories='news')
featuresets = [(pos_features(n), g) for (n,g) in tagged_words]
size = int(len(featuresets) * 0.1)

train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.DecisionTreeClassifier.train(train_set)

classifier.classify(pos_features('cats'))

'NN'

In [95]:
print(classifier.pseudocode(depth=4))

return 'NN'

