In [17]:
>>> import logging  
>>> # Log at INFO level as "<asctime> : <levelname> : <message>" so progress messages appear under later cells
>>> logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)  



In [18]:
>>> from gensim import corpora, models, similarities  
>>>  
>>> # Toy corpus: nine documents, each written as a sparse list of (id, value) pairs
>>> corpus = [[(0, 1.0), (1, 1.0), (2, 1.0)],  
>>>           [(2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (8, 1.0)],  
>>>           [(1, 1.0), (3, 1.0), (4, 1.0), (7, 1.0)],  
>>>           [(0, 1.0), (4, 2.0), (7, 1.0)],  
>>>           [(3, 1.0), (5, 1.0), (6, 1.0)],  
>>>           [(9, 1.0)],  
>>>           [(9, 1.0), (10, 1.0)],  
>>>           [(9, 1.0), (10, 1.0), (11, 1.0)],  
>>>           [(8, 1.0), (10, 1.0), (11, 1.0)]]  

In [19]:
>>> # Fit a TF-IDF model on `corpus`; the log below shows the document-frequency pass and the IDF computation
>>> tfidf = models.TfidfModel(corpus)  

2017-09-29 09:20:05,667 : INFO : collecting document frequencies
2017-09-29 09:20:05,669 : INFO : PROGRESS: processing document #0
2017-09-29 09:20:05,671 : INFO : calculating IDF weights for 9 documents and 11 features (28 matrix non-zeros)


In [4]:
>>> # Transform a new sparse vector by indexing the fitted model with it
>>> # NOTE(review): this cell is numbered In [4] while the model was built in In [19] -- execution order is not sequential
>>> vec = [(0, 1), (4, 1)]  
>>> print(tfidf[vec])  


[(0, 0.8075244024440723), (4, 0.5898341626740045)]


In [5]:
# Nine short document titles used as the raw text corpus by the following cells
documents = ["Human machine interface for lab abc computer applications",
>>>              "A survey of user opinion of computer system response time",
>>>              "The EPS user interface management system",
>>>              "System and human system engineering testing of EPS",
>>>              "Relation of user perceived response time to error measurement",
>>>              "The generation of random binary unordered trees",
>>>              "The intersection graph of paths in trees",
>>>              "Graph minors IV Widths of trees and well quasi ordering",
>>>              "Graph minors A survey"]

In [54]:
>>> # Remove stop words and tokenize
>>> # Translator's note: this is only an example; real applications need more stop words.
>>> #         For Chinese text, use the jieba segmenter: https://github.com/fxsjy/jieba
>>> stoplist = set('for a of the and to in'.split())
>>> # Lowercase each document, split it on whitespace, and drop tokens found in stoplist
>>> texts = [[word for word in document.lower().split() if word not in stoplist]
>>>          for document in documents]
>>>

In [73]:
# NOTE(review): this cell ran as In [73], i.e. after the rare-word filter in In [56],
# so the output below already shows the filtered `texts`, not the raw tokenization.
texts   # [][] text format: a list of token lists

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

In [56]:
>>> # [][] Remove words that appear only once across all texts
>>> from collections import defaultdict
>>> frequency = defaultdict(int)
>>> for text in texts:
>>>     for token in text:
>>>         frequency[token] += 1
>>>
>>> # Rebinds `texts`, keeping only tokens whose overall count is greater than 1
>>> texts = [[token for token in text if frequency[token] > 1]
>>>          for text in texts]
>>>
>>> from pprint import pprint   # pretty-printer
>>> pprint(texts)

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]


In [59]:
>>> dictionary = corpora.Dictionary(texts) # built from the [][] token lists
>>> dictionary.save('/tmp/deerwester.dict') # save the dictionary for later use
>>> print(dictionary)


2017-09-29 09:40:58,348 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-09-29 09:40:58,352 : INFO : built Dictionary(12 unique tokens: ['trees', 'eps', 'survey', 'interface', 'system']...) from 9 documents (total 29 corpus positions)
2017-09-29 09:40:58,355 : INFO : saving Dictionary object under /tmp/deerwester.dict, separately None
2017-09-29 09:40:58,358 : INFO : saved /tmp/deerwester.dict


Dictionary(12 unique tokens: ['trees', 'eps', 'survey', 'interface', 'system']...)


In [72]:
fff = dictionary.token2id #[][] token -> id mapping (printed in the next cell)
# NOTE(review): `fff?` is IPython help introspection, not plain Python; looks like leftover exploration
fff?

In [58]:
>>> # Show the token -> integer id mapping held by the dictionary
>>> print(dictionary.token2id)

{'trees': 9, 'eps': 8, 'survey': 7, 'interface': 2, 'system': 5, 'time': 3, 'graph': 10, 'minors': 11, 'user': 6, 'human': 0, 'response': 4, 'computer': 1}


In [15]:
>>> new_doc = "Human computer interaction"
>>> new_vec = dictionary.doc2bow(new_doc.lower().split())
>>> print(new_vec) # "interaction" does not appear in the dictionary, so it is ignored

[(0, 1), (1, 1)]


In [16]:
>>> # Convert every tokenized text with doc2bow; this rebinds `corpus` from the hand-written one in In [18]
>>> corpus = [dictionary.doc2bow(text) for text in texts]
>>> corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus) # [][] store the corpus on disk in corpus format for later use
>>> print(corpus)   

2017-09-28 17:34:50,690 : INFO : storing corpus in Matrix Market format to /tmp/deerwester.mm
2017-09-28 17:34:50,695 : INFO : saving sparse matrix to /tmp/deerwester.mm
2017-09-28 17:34:50,697 : INFO : PROGRESS: saving document #0
2017-09-28 17:34:50,700 : INFO : saved 9x12 matrix, density=25.926% (28/108)
2017-09-28 17:34:50,703 : INFO : saving MmCorpus index to /tmp/deerwester.mm.index


[[(0, 1), (1, 1), (2, 1)], [(1, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(2, 1), (5, 1), (6, 1), (8, 1)], [(0, 1), (5, 2), (8, 1)], [(3, 1), (4, 1), (6, 1)], [(9, 1)], [(9, 1), (10, 1)], [(9, 1), (10, 1), (11, 1)], [(7, 1), (10, 1), (11, 1)]]


In [20]:
# w2v: Word2Vec examples


In [74]:
# import modules & set up logging
import gensim, logging
# NOTE(review): logging was already configured in In [17] with this same format;
# presumably this repeated call has no further effect -- confirm.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
 
# Input is a list of sentences, each sentence being a list of string tokens
sentences = [['first', 'sentence'], ['second', 'sentence']] #sentence format

# train word2vec on the two sentences
# min_count=1 keeps every word, including those seen only once (log: "retains 3 unique words")
model = gensim.models.Word2Vec(sentences, min_count=1)

2017-09-29 12:31:45,041 : INFO : collecting all words and their counts
2017-09-29 12:31:45,045 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-09-29 12:31:45,047 : INFO : collected 3 word types from a corpus of 4 raw words and 2 sentences
2017-09-29 12:31:45,048 : INFO : Loading a fresh vocabulary
2017-09-29 12:31:45,050 : INFO : min_count=1 retains 3 unique words (100% of original 3, drops 0)
2017-09-29 12:31:45,051 : INFO : min_count=1 leaves 4 word corpus (100% of original 4, drops 0)
2017-09-29 12:31:45,053 : INFO : deleting the raw counts dictionary of 3 items
2017-09-29 12:31:45,054 : INFO : sample=0.001 downsamples 3 most-common words
2017-09-29 12:31:45,056 : INFO : downsampling leaves estimated 0 word corpus (5.7% of prior 4)
2017-09-29 12:31:45,057 : INFO : estimated required memory for 3 words and 100 dimensions: 3900 bytes
2017-09-29 12:31:45,058 : INFO : resetting layer weights
2017-09-29 12:31:45,060 : INFO : training model with 3 workers o

In [76]:
import os
class MySentences(object):
    """Stream whitespace-tokenized sentences from every file in a directory.

    Iterating yields one list of tokens per line. Files are read lazily, one
    line at a time, so the corpus never has to be held in memory. Each call
    to ``__iter__`` starts a fresh pass, so the object can be iterated more
    than once.

    Args:
        dirname: Path of the directory to read. Every entry returned by
            ``os.listdir`` is opened as a text file.
    """

    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        """Yield ``line.split()`` for each line of each file in ``dirname``.

        Raises:
            OSError: If the directory cannot be listed or an entry cannot be
                opened (for example ``FileNotFoundError`` when the directory
                does not exist).
        """
        for fname in os.listdir(self.dirname):
            # The context manager closes each handle as soon as the file is
            # exhausted (or the generator is closed early), instead of
            # leaving one open handle per file for the garbage collector.
            with open(os.path.join(self.dirname, fname)) as handle:
                for line in handle:
                    yield line.split()
 
# NOTE: '/some/directory' is a placeholder path; the recorded run below fails with
# FileNotFoundError because that directory does not exist on the machine used.
sentences = MySentences('/some/directory') # a memory-friendly iterator
model = gensim.models.Word2Vec(sentences)




2017-09-29 12:32:46,988 : INFO : collecting all words and their counts


FileNotFoundError: [WinError 3] 系统找不到指定的路径。: '/some/directory'