### 1. 使用 sklearn 对文档进行向量化

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import nltk

In [2]:
# Build a tiny three-document corpus. "apple" appears in every document,
# once as the company name (capitalised) and twice as the ordinary noun.
corpus = [
    "Jobs was the chairman of Apple Inc., and he was very famous",
    "I like to use apple computer",
    "And I also like to eat apple",
]
# Bare last expression so the notebook echoes the corpus.
corpus

['Jobs was the chairman of Apple Inc., and he was very famous',
 'I like to use apple computer',
 'And I also like to eat apple']

#### 1.1 未经停用词过滤的文档向量化

In [3]:
# Bag-of-words vectorizer with default settings (no stop-word list).
vectorizer = CountVectorizer()
# Display two things as one tuple:
#   1. the document-term count matrix, expanded from sparse to dense form;
#   2. vocabulary_, which maps each term to its column index in that matrix
#      (an index lookup, not a table of word frequencies).
(
    vectorizer.fit_transform(corpus).todense(),
    vectorizer.vocabulary_,
)

(matrix([[0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 2],
         [0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0],
         [1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0]], dtype=int64),
 {'jobs': 9,
  'was': 16,
  'the': 12,
  'chairman': 3,
  'of': 11,
  'apple': 2,
  'inc': 8,
  'and': 1,
  'he': 7,
  'very': 15,
  'famous': 6,
  'like': 10,
  'to': 13,
  'use': 14,
  'computer': 4,
  'also': 0,
  'eat': 5})

#### 1.2 使用停用词过滤后的文档向量化
- 停用词下载如果出现问题可参考 : [停用词下载](http://knightzz.cn/archives/nltk-tong-yong-ci-xia-zai)

In [10]:
# Load NLTK's English stop-word list.
# The corpus is not bundled with the nltk package, so on a fresh environment
# the lookup raises LookupError. In that case fetch only the 'stopwords'
# package (instead of opening the interactive nltk.download() browser) and
# retry, so the notebook survives Restart & Run All.
try:
    stopwords = nltk.corpus.stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stopwords = nltk.corpus.stopwords.words('english')
# Peek at the first few entries only.
stopwords[:5]

['i', 'me', 'my', 'myself', 'we']

In [11]:
# Vectorize the corpus again, this time discarding every token that appears
# in the NLTK English stop-word list loaded above.
vectorizer = CountVectorizer(stop_words=stopwords)
filtered_counts = vectorizer.fit_transform(corpus)  # sparse document-term matrix
# First line: dense count matrix; second line: term -> column index mapping.
print("after stopwords removal:  ", filtered_counts.todense())
print("after stopwords removal:  ", vectorizer.vocabulary_)

after stopwords removal:   [[0 1 1 0 0 1 1 1 0 0]
 [0 1 0 1 0 0 0 0 1 1]
 [1 1 0 0 1 0 0 0 1 0]]
after stopwords removal:   {'jobs': 7, 'chairman': 2, 'apple': 1, 'inc': 6, 'famous': 5, 'like': 8, 'use': 9, 'computer': 3, 'also': 0, 'eat': 4}


#### 1.3 采用ngram模式进行文档向量化
- N‐gram通常是指一段文本或语音中连续N个项目（item）的序列。项目（item）可以是单词、字母、碱基对等。
- N=1时称为uni‐gram，N=2称为bi‐gram，N=3称为tri‐gram，以此类推
- 举例:对于文本 'And I also like to eat apple'，则
    1. Uni-gram: And, I, also, like, to, eat, apple
    2. Bi-gram: And I, I also, also like, like to, to eat, eat apple
    3. Tri-gram: And I also, I also like, also like to, like to eat, to eat apple

In [13]:
# ngram_range=(1, 2) extracts both unigrams and bigrams as features.
# Note: the single-character word "I" does not appear in the resulting
# vocabulary, which is why the third document yields the bigram 'and also'.
vectorizer = CountVectorizer(ngram_range=(1, 2))
ngram_counts = vectorizer.fit_transform(corpus)  # sparse document-term matrix
# Expand to a full (dense) feature matrix for display.
print("N-gram mode:     ", ngram_counts.todense())
# Term (unigram or bigram) -> column index mapping.
print("N-gram mode:         ", vectorizer.vocabulary_)

N-gram mode:      [[0 0 1 0 1 1 0 1 1 1 0 0 0 1 1 1 1 1 1 1 0 0 1 1 1 1 0 0 0 0 0 1 1 2 1 1]
 [0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0]
 [1 1 1 1 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0]]
N-gram mode:          {'jobs': 18, 'was': 33, 'the': 24, 'chairman': 8, 'of': 22, 'apple': 5, 'inc': 16, 'and': 2, 'he': 14, 'very': 31, 'famous': 13, 'jobs was': 19, 'was the': 34, 'the chairman': 25, 'chairman of': 9, 'of apple': 23, 'apple inc': 7, 'inc and': 17, 'and he': 4, 'he was': 15, 'was very': 35, 'very famous': 32, 'like': 20, 'to': 26, 'use': 29, 'computer': 10, 'like to': 21, 'to use': 28, 'use apple': 30, 'apple computer': 6, 'also': 0, 'eat': 11, 'and also': 3, 'also like': 1, 'to eat': 27, 'eat apple': 12}
