# 文章のベクトル化

## gensimのcorpora, matutilsでベクトル化

In [16]:
from gensim import corpora,matutils

In [17]:
# Document list (in a real pipeline this would be generated by running
# morphological analysis over the original text).
doclist = [
    ["human", "interface", "computer"],
    ["survey", "user", "computer", "system"],
    ["eps", "user", "interface"],
    ["system", "human", "system", "eps"],
    ["user", "time"],
    ["trees", "user"],
    ["graph", "trees"],
    ["graph", "minors", "minors", "trees"],
    ["graph", "minors", "survey"],
]

In [18]:
# Build the gensim dictionary from the tokenised documents.
dct = corpora.Dictionary(documents=doclist)

In [19]:
# Show the token -> id mapping, then the per-id document frequencies.
print(dct.token2id, dct.dfs, sep="\n")

{'computer': 0, 'human': 1, 'interface': 2, 'survey': 3, 'system': 4, 'user': 5, 'eps': 6, 'time': 7, 'trees': 8, 'graph': 9, 'minors': 10}
{1: 2, 2: 2, 0: 2, 3: 2, 5: 4, 4: 2, 6: 2, 7: 1, 8: 3, 9: 3, 10: 2}


In [20]:
# Convert one document into its bag-of-words form: (token id, frequency) pairs.
dct.doc2bow(document=doclist[3])

[(1, 1), (4, 2), (6, 1)]

In [21]:
# Dense matrix for the first two documents, transposed so each row is a document.
matutils.corpus2dense(
    [dct.doc2bow(doc) for doc in doclist[:2]],
    num_terms=len(dct),
).T


array([[1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0.]], dtype=float32)

In [22]:
# Dense matrix for the first document only (a single row after transposing).
matutils.corpus2dense(
    [dct.doc2bow(doclist[0])],
    num_terms=len(dct),
).transpose()

array([[1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32)

In [23]:
# Vectorise a single document.
dense0 = list(
    matutils.corpus2dense([dct.doc2bow(doclist[0])], num_terms=len(dct))[:, 0]
)
print(dense0)

[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [24]:
# Vectorise several (here: two) documents at once.
dense0_1 = list(
    matutils.corpus2dense(
        [dct.doc2bow(doc) for doc in doclist[:2]],
        num_terms=len(dct),
    ).T
)
print(dense0_1)

[array([1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32), array([1., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0.], dtype=float32)]


In [25]:
# Vectorise every document.
def vec2dense(vec, num_terms):
    """Convert one sparse bag-of-words vector into a dense list.

    Args:
        vec: A single document in BoW form, e.g. the result of
            ``dct.doc2bow(...)``. It is wrapped in a one-element corpus
            before being handed to ``matutils.corpus2dense``.
        num_terms: Length of the dense vector; forwarded to
            ``corpus2dense`` as ``num_terms``.

    Returns:
        A list of ``num_terms`` values: the first (and only) row of the
        transposed dense matrix.
    """
    return list(matutils.corpus2dense([vec], num_terms=num_terms).T[0])


# Iterate over the documents directly instead of indexing via range(len(...)).
data_all = [vec2dense(dct.doc2bow(doc), len(dct)) for doc in doclist]


In [26]:
data_all

[[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 1.0, 0.0, 0.0, 2.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 2.0],
 [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0]]

## scikit-learnのCountVectorizerによるベクトル化

In [27]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
# CountVectorizer tokenises raw strings itself, so each pre-tokenised document
# is joined back into a single space-separated string before fitting.
# NOTE(review): with its default settings CountVectorizer lower-cases the text
# and only keeps tokens of two or more word characters, so single-character
# tokens would be dropped silently. The sample tokens here are unaffected;
# confirm this is acceptable before feeding real morphological-analysis output.
count = CountVectorizer()

# fit_transform accepts any iterable of strings, so a plain list is enough.
bag = count.fit_transform([" ".join(doc) for doc in doclist])

# Learned token -> column index mapping, followed by the dense count matrix.
print(count.vocabulary_)
print(bag.toarray())

{'human': 3, 'interface': 4, 'computer': 0, 'survey': 6, 'user': 10, 'system': 7, 'eps': 1, 'time': 8, 'trees': 9, 'graph': 2, 'minors': 5}
[[1 0 0 1 1 0 0 0 0 0 0]
 [1 0 0 0 0 0 1 1 0 0 1]
 [0 1 0 0 1 0 0 0 0 0 1]
 [0 1 0 1 0 0 0 2 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 1]
 [0 0 0 0 0 0 0 0 0 1 1]
 [0 0 1 0 0 0 0 0 0 1 0]
 [0 0 1 0 0 2 0 0 0 1 0]
 [0 0 1 0 0 1 1 0 0 0 0]]
