In [10]:
from gensim import corpora

# Toy corpus: nine short document titles, one string per document.
# The following cells tokenize these strings and build a gensim dictionary
# and bag-of-words corpus from them.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

In [11]:
# Remove common words and tokenize: lowercase each document, split it on
# whitespace, and drop every token that appears in the stop list.
stoplist = set('for a of the and to in'.split())
texts = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in documents
]

In [15]:
from collections import defaultdict
from pprint import pprint  # pretty-printer

# Count how many times each token occurs across the whole corpus.
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Keep only the tokens that occur more than once in the corpus.
texts = [[token for token in text if frequency[token] > 1]
         for text in texts]

pprint(texts)

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]


In [16]:
# Build the token <-> integer id mapping from the tokenized documents.
dictionary = corpora.Dictionary(texts)
dictionary.save('/tmp/deerwester.dict')  # store the dictionary, for future reference
print(dictionary)

Dictionary(12 unique tokens: ['human', 'response', 'system', 'interface', 'graph']...)


In [18]:
# Show the token -> integer id mapping held by the dictionary.
print(dictionary.token2id)

{'human': 1, 'response': 6, 'system': 7, 'interface': 0, 'graph': 10, 'survey': 4, 'user': 5, 'minors': 11, 'eps': 8, 'trees': 9, 'computer': 2, 'time': 3}


In [19]:
# Convert a new, unseen document into a bag-of-words vector using the
# ids already assigned by `dictionary`.
new_doc = "Human computer interaction"
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec)  # the word "interaction" does not appear in the dictionary and is ignored

[(1, 1), (2, 1)]


In [21]:
# Bag-of-words vector for every document: a list of (token_id, count) pairs.
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)  # store to disk, for later use
pprint(corpus)

[[(0, 1), (1, 1), (2, 1)],
 [(2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(0, 1), (5, 1), (7, 1), (8, 1)],
 [(1, 1), (7, 2), (8, 1)],
 [(3, 1), (5, 1), (6, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(4, 1), (10, 1), (11, 1)]]


In [24]:
from gensim import corpora, models, similarities
# Build a TF-IDF model from the bag-of-words corpus, then apply it to that
# same corpus by indexing the model with it.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [45]:
# IPython help lookup for TfidfModel (interactive aid, not part of the pipeline).
models.TfidfModel?

In [43]:
# Print the TF-IDF vector of each document, one document per line.
for doc in corpus_tfidf:
    print(doc)

[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]
[(2, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.3244870206138555), (6, 0.44424552527467476), (7, 0.3244870206138555)]
[(0, 0.5710059809418182), (5, 0.4170757362022777), (7, 0.4170757362022777), (8, 0.5710059809418182)]
[(1, 0.49182558987264147), (7, 0.7184811607083769), (8, 0.49182558987264147)]
[(3, 0.6282580468670046), (5, 0.45889394536615247), (6, 0.6282580468670046)]
[(9, 1.0)]
[(9, 0.7071067811865475), (10, 0.7071067811865475)]
[(9, 0.5080429008916749), (10, 0.5080429008916749), (11, 0.695546419520037)]
[(4, 0.6282580468670046), (10, 0.45889394536615247), (11, 0.6282580468670046)]


In [44]:
# Build a second TF-IDF model on the same corpus, passing normalize=True explicitly.
# NOTE(review): `model` is not used by any other cell visible here.
model = models.TfidfModel(corpus, normalize=True)

In [20]:
# Scratch cell: trivial expression, unrelated to the analysis.
1 + 1

2

In [19]:
import numpy as np
# Dump the loaded samples to a text file: each sample is written followed by
# a newline. Plain file I/O is used because the samples are text (decoded by
# load_files), which a numeric format such as '%f' cannot render, and because
# writing them one at a time avoids building one large in-memory array first.
# NOTE(review): samples that themselves contain newlines will span several
# lines in the output file.
# NOTE(review): this cell needs `data_samples`, which is defined in the
# dataset-loading cell further down in this notebook — run that cell first.
with open('test10000.txt', 'w', encoding='utf-8') as out_file:
    for sample in data_samples:
        out_file.write(sample + '\n')

KeyboardInterrupt: 

In [None]:
from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets import load_files


# --- Run configuration ---
n_samples = 10000    # upper bound on the number of documents kept below
n_features = None    # not referenced in the cells visible here
n_topics = 10        # not referenced in the cells visible here
n_top_words = 20     # not referenced in the cells visible here

# Directory handed to sklearn's load_files.
data_path = "ldadata"

# Load the documents as decoded text and report how long the load took.
print("loading dataset")
t0 = time()
dataset = load_files(
    data_path,
    shuffle=False,
    load_content=True,
    encoding='utf-8',
    decode_error='ignore',
)
# Keep at most the first n_samples documents; shuffling is disabled above,
# so this is a prefix of the dataset in the order load_files returned it.
data_samples = dataset.data[:n_samples]
print("done in %0.3fs." % (time() - t0))

loading dataset


In [None]:
import gensim
from gensim import corpora, models, similarities
from collections import defaultdict

# Whitespace-tokenize every sample (no lowercasing or stop-word removal here).
texts = [document.split() for document in data_samples]

# Corpus-wide occurrence count for each token.
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# 2844555
# NOTE(review): the figure above is an unexplained note left in the cell — confirm its meaning.
# Frequency cut-off derived from the sample budget.
Threshold = n_samples/10000

# Drop every token whose corpus-wide count does not exceed the cut-off,
# then build the gensim dictionary from what remains.
texts = [
    [token for token in text if frequency[token] > Threshold]
    for text in texts
]
dictionary = corpora.Dictionary(texts)
print(dictionary)

In [None]:
# Spot-check the id assigned to one hostname token. `.get` prints None when
# the token is absent (e.g. removed by the frequency filter) instead of
# raising KeyError and aborting the cell before the corpus is built.
print(dictionary.token2id.get('m.taobao.com'))

# Bag-of-words corpus, a TF-IDF model built from it, and the transformed corpus.
corpus = [dictionary.doc2bow(text) for text in texts]
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# Display the TF-IDF vector of the first document.
corpus_tfidf[0]

In [15]:
# Train a 10-topic LDA model with gensim's multi-core implementation, then
# list the 10 highest-weighted words of each topic as (word, weight) pairs.
# NOTE(review): the model is trained on the TF-IDF-weighted corpus rather than
# on raw counts, and no random seed is passed, so topics may differ between
# runs — confirm both choices are intended.
lda = gensim.models.ldamulticore.LdaMulticore(corpus=corpus_tfidf, alpha= 0.01, eta = 0.01,id2word=dictionary, num_topics=10, offset = 50.0, chunksize=10000)
lda.show_topics(num_topics = 10, num_words = 10, formatted = False)

[(0,
  [('init-p01st.push.apple.com', 0.00082685831038261553),
   ('short.weixin.qq.com', 0.000531878611363985),
   ('221.131.128.129', 0.00050573981780731043),
   ('api.m.taobao.com', 0.00047956383336141424),
   ('m.baidu.com', 0.00047352830500232572),
   ('monitor.uu.qq.com', 0.00041285912745206861),
   ('mp.weixin.qq.com', 0.00040900036164177866),
   ('mmsns.qpic.cn', 0.00040867013341684184),
   ('api.weibo.cn', 0.00034883367505115587),
   ('adash.m.taobao.com', 0.00034825466691656634)]),
 (1,
  [('10.0.0.172', 0.00084414134343776843),
   ('init-p01st.push.apple.com', 0.00059159389154682233),
   ('m.baidu.com', 0.00056548019847748265),
   ('short.weixin.qq.com', 0.00048736304071144126),
   ('m.qpic.cn', 0.00045018042394656758),
   ('monitor.uu.qq.com', 0.00040922071954413637),
   ('api.weibo.cn', 0.00039332804798830037),
   ('evintl-ocsp.verisign.com', 0.0003886223151854856),
   ('conf.3g.qq.com', 0.00037515781401475235),
   ('plus.xiangji.qq.com', 0.00034067844548045869)]),
 (2,
  

In [None]:
# Number of documents kept after slicing the dataset to n_samples.
len(data_samples)

In [None]:
# Number of entries in the gensim dictionary.
len(dictionary)

In [196]:
# IPython help lookup for LdaMulticore (interactive aid, not part of the pipeline).
gensim.models.ldamulticore.LdaMulticore?

In [5]:
# Number of documents in the TF-IDF corpus.
# NOTE(review): the recorded output (100000) exceeds n_samples = 10000 set in
# the dataset-loading cell, so it appears to come from an earlier run with
# different settings — re-run the notebook top to bottom to confirm.
len(corpus_tfidf)

100000

In [7]:
# Length of the first document in the loaded dataset.
len(dataset.data[0])

3415