# gensimでLDA + pyLDAvisで可視化
* https://radimrehurek.com/gensim/tut1.html をなぞっただけ

In [1]:
import warnings
warnings.filterwarnings('ignore')

# 前処理

In [2]:
#### () => texts: データを用意する

In [3]:
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey"
]

In [4]:
stop_words = set('for a of the and to in'.split())
stop_words # この単語集合は、ストップワードとしてフィルタする

{'a', 'and', 'for', 'in', 'of', 'the', 'to'}

In [5]:
texts = [
    [word for word in document.lower().split() 
     if word not in stop_words] 
    for document in documents
]

In [6]:
texts

[['human', 'machine', 'interface', 'lab', 'abc', 'computer', 'applications'],
 ['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'management', 'system'],
 ['system', 'human', 'system', 'engineering', 'testing', 'eps'],
 ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'],
 ['generation', 'random', 'binary', 'unordered', 'trees'],
 ['intersection', 'graph', 'paths', 'trees'],
 ['graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering'],
 ['graph', 'minors', 'survey']]

In [7]:
# ここでさらに、不要な単語(1回しか出ないためモデリングに使えない単語)をフィルタする

In [8]:
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

In [9]:
frequency

defaultdict(int,
            {'abc': 1,
             'applications': 1,
             'binary': 1,
             'computer': 2,
             'engineering': 1,
             'eps': 2,
             'error': 1,
             'generation': 1,
             'graph': 3,
             'human': 2,
             'interface': 2,
             'intersection': 1,
             'iv': 1,
             'lab': 1,
             'machine': 1,
             'management': 1,
             'measurement': 1,
             'minors': 2,
             'opinion': 1,
             'ordering': 1,
             'paths': 1,
             'perceived': 1,
             'quasi': 1,
             'random': 1,
             'relation': 1,
             'response': 2,
             'survey': 2,
             'system': 4,
             'testing': 1,
             'time': 2,
             'trees': 3,
             'unordered': 1,
             'user': 3,
             'well': 1,
             'widths': 1})

In [10]:
texts = [
    [token for token in text if frequency[token] > 1]
    for text in texts
]

In [11]:
# 完成
texts

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

# モデリング

In [12]:
#### texts => Gensim: LDAを回す

In [13]:
import gensim
from gensim import corpora

Using TensorFlow backend.


In [14]:
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

In [15]:
dictionary

<gensim.corpora.dictionary.Dictionary at 0x129b2c6d8>

In [16]:
corpus

[[(0, 1), (1, 1), (2, 1)],
 [(2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(1, 1), (4, 1), (5, 1), (8, 1)],
 [(0, 1), (5, 2), (8, 1)],
 [(4, 1), (6, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(3, 1), (10, 1), (11, 1)]]

In [17]:
# 回す
lda = gensim.models.ldamodel.LdaModel(
    corpus=corpus, num_topics=5, id2word=dictionary
)

In [18]:
# 結果
lda.show_topics()

[(0,
  '0.187*"user" + 0.186*"eps" + 0.186*"system" + 0.186*"interface" + 0.032*"trees" + 0.032*"graph" + 0.032*"minors" + 0.032*"human" + 0.032*"survey" + 0.032*"time"'),
 (1,
  '0.279*"trees" + 0.193*"graph" + 0.105*"computer" + 0.105*"human" + 0.105*"interface" + 0.105*"minors" + 0.018*"user" + 0.018*"system" + 0.018*"time" + 0.018*"survey"'),
 (2,
  '0.143*"system" + 0.142*"user" + 0.142*"time" + 0.142*"computer" + 0.142*"response" + 0.142*"survey" + 0.025*"trees" + 0.024*"graph" + 0.024*"minors" + 0.024*"human"'),
 (3,
  '0.234*"system" + 0.127*"graph" + 0.127*"eps" + 0.127*"human" + 0.127*"survey" + 0.127*"minors" + 0.022*"trees" + 0.022*"user" + 0.022*"interface" + 0.022*"computer"'),
 (4,
  '0.220*"response" + 0.220*"user" + 0.220*"time" + 0.039*"trees" + 0.038*"graph" + 0.038*"interface" + 0.038*"system" + 0.038*"minors" + 0.037*"human" + 0.037*"computer"')]

# 可視化

In [19]:
#### Gensim => pyLDAvis: LDA結果を可視化する

In [20]:
import pyLDAvis.gensim as gensimvis
import pyLDAvis

In [21]:
# 回したデータを食わせる
vis_data = gensimvis.prepare(lda, corpus, dictionary)
vis_data

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  topic_term_dists = topic_term_dists.ix[topic_order]


PreparedData(topic_coordinates=            Freq  cluster  topics         x         y
topic                                                
1      26.392067        1       1  0.166317  0.078684
3      23.357645        1       2  0.067641 -0.095940
2      22.302992        1       3 -0.097687  0.001234
0      15.614804        1       4 -0.028530 -0.059971
4      12.332492        1       5 -0.107740  0.075993, topic_info=     Category      Freq       Term     Total  loglift  logprob
term                                                          
4     Default  2.000000       user  2.000000  12.0000  12.0000
6     Default  2.000000   response  2.000000  11.0000  11.0000
7     Default  2.000000       time  2.000000  10.0000  10.0000
9     Default  2.000000      trees  2.000000   9.0000   9.0000
5     Default  3.000000     system  3.000000   8.0000   8.0000
8     Default  2.000000        eps  2.000000   7.0000   7.0000
1     Default  2.000000  interface  2.000000   6.0000   6.0000
10    Defaul

In [23]:
# 可視化実行
pyLDAvis.display(vis_data)