# NLTK Bag-of-Words

## 使用Bag of words 对示例文本进行特征向量化

In [1]:
# Two toy sentences that serve as the example corpus.
sent1 = 'The cat is walking in the bedroom.'
sent2 = 'A dog was running across the kitchen.'

from sklearn.feature_extraction.text import CountVectorizer

# Default CountVectorizer settings lowercase the text and keep only tokens of
# two or more word characters, so "A" and "." never enter the vocabulary.
count_vec = CountVectorizer()

sentences = [sent1, sent2]

# Learn the vocabulary and print the document-term count matrix
# (one row per sentence, one column per vocabulary term).
print(count_vec.fit_transform(sentences).toarray())

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# get_feature_names_out() is its replacement; it returns a NumPy array, so it
# is converted to a plain list to keep the printed vocabulary format unchanged.
try:
    feature_names = count_vec.get_feature_names_out().tolist()
except AttributeError:  # scikit-learn < 1.0 only has the old accessor
    feature_names = count_vec.get_feature_names()
print(feature_names)

[[0 1 1 0 1 1 0 0 2 1 0]
 [1 0 0 1 0 0 1 1 1 0 1]]
['across', 'bedroom', 'cat', 'dog', 'in', 'is', 'kitchen', 'running', 'the', 'walking', 'was']


## 使用NLTK对文本进行语言学分析

In [11]:
import nltk
# Download the averaged-perceptron tagger data (unzipped under taggers/ in the
# NLTK data directory); the part-of-speech tagging cell relies on it.
# NOTE(review): nltk.word_tokenize also needs tokenizer data (e.g. 'punkt');
# presumably it was already installed on this machine — confirm.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/lipengyuan/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [3]:
# Tokenize and normalize the sentence. Contractions are split as well:
# e.g. "aren't" becomes "are" + "n't", and "I'm" becomes "I" + "'m".
tokens_1 = nltk.word_tokenize(sent1)

In [4]:
# Echo the token list of the first sentence (notebook displays the last expression).
tokens_1

['The', 'cat', 'is', 'walking', 'in', 'the', 'bedroom', '.']

In [5]:
# Tokenize the second sentence with NLTK's word tokenizer.
tokens_2 = nltk.word_tokenize(sent2)

In [6]:
# Echo the token list of the second sentence (notebook displays the last expression).
tokens_2

['A', 'dog', 'was', 'running', 'across', 'the', 'kitchen', '.']

In [7]:
# Build the vocabulary of the first sentence: drop duplicate tokens, then sort
# in ASCII (code point) order, which puts punctuation and capitals first.
vocab_1 = sorted({*tokens_1})
vocab_1

['.', 'The', 'bedroom', 'cat', 'in', 'is', 'the', 'walking']

In [8]:
# Build the vocabulary of the second sentence: drop duplicate tokens, then sort
# in ASCII (code point) order.
vocab_2 = sorted({*tokens_2})
vocab_2

['.', 'A', 'across', 'dog', 'kitchen', 'running', 'the', 'was']

In [9]:
# Create a Porter stemmer and reduce every token to its stem (root form).
stemmer = nltk.stem.PorterStemmer()

# Stem the tokens of the first sentence.
stem_1 = list(map(stemmer.stem, tokens_1))
print(stem_1)

# Stem the tokens of the second sentence.
stem_2 = list(map(stemmer.stem, tokens_2))
print(stem_2)

['the', 'cat', 'is', 'walk', 'in', 'the', 'bedroom', '.']
['A', 'dog', 'wa', 'run', 'across', 'the', 'kitchen', '.']


In [12]:
# Run NLTK's part-of-speech tagger over each token list; the result is a list
# of (token, tag) pairs per sentence.
pos_tag_1 = nltk.pos_tag(tokens_1)
print(pos_tag_1)

pos_tag_2 = nltk.pos_tag(tokens_2)
print(pos_tag_2)

[('The', 'DT'), ('cat', 'NN'), ('is', 'VBZ'), ('walking', 'VBG'), ('in', 'IN'), ('the', 'DT'), ('bedroom', 'NN'), ('.', '.')]
[('A', 'DT'), ('dog', 'NN'), ('was', 'VBD'), ('running', 'VBG'), ('across', 'IN'), ('the', 'DT'), ('kitchen', 'NN'), ('.', '.')]
