forked from piskvorky/gensim
-
Notifications
You must be signed in to change notification settings - Fork 17
/
interfaces.py
269 lines (208 loc) · 10.2 KB
/
interfaces.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""
This module contains basic interfaces used throughout the whole gensim package.
The interfaces are realized as abstract base classes (ie., some optional functionality
is provided in the interface itself, so that the interfaces can be subclassed).
"""
from __future__ import with_statement
import logging
import itertools
from gensim import utils, matutils
from gensim._six.moves import xrange
# Module-level logger, registered under the 'gensim.interfaces' name.
logger = logging.getLogger('gensim.interfaces')
class CorpusABC(utils.SaveLoad):
    """
    Interface (abstract base class) for corpora. A *corpus* is simply an iterable,
    where each iteration step yields one document:

    >>> for doc in corpus:
    >>>     # do something with the doc...

    A document is a sequence of `(fieldId, fieldValue)` 2-tuples:

    >>> for attr_id, attr_value in doc:
    >>>     # do something with the attribute

    Note that this base class does **not** provide a working :func:`len`: the
    default `__len__` raises `NotImplementedError`, because the only generic way
    to count documents is a linear scan through the whole corpus. Wherever the
    corpus size is known in advance (or at least doesn't change, so that it can
    be cached), the subclass must override `__len__`.

    See the :mod:`gensim.corpora.svmlightcorpus` module for an example of a corpus.

    Saving the corpus with the `save` method (inherited from `utils.SaveLoad`) will
    only store the *in-memory* (binary, pickled) object representation, i.e. the
    stream state, and **not** the documents themselves. See the `save_corpus`
    static method for serializing the actual stream content.
    """

    def __iter__(self):
        """
        Iterate over the corpus, yielding one document at a time.

        Abstract: subclasses must override this method.
        """
        raise NotImplementedError('cannot instantiate abstract base class')

    def save(self, *args, **kwargs):
        """
        Save the corpus *object* (not its documents) via `utils.SaveLoad.save`.

        A warning is emitted first, because callers frequently expect this
        method to store the documents; all arguments are passed through
        unchanged to the parent implementation.
        """
        import warnings
        warnings.warn("corpus.save() stores only the (tiny) iteration object; "
                      "to serialize the actual corpus content, use e.g. MmCorpus.serialize(corpus)")
        super(CorpusABC, self).save(*args, **kwargs)

    def __len__(self):
        """
        Return the number of documents in the corpus.

        The base class cannot know the size, so it raises `NotImplementedError`;
        subclasses must override this method before `len(corpus)` can be used.
        """
        raise NotImplementedError("must override __len__() before calling len(corpus)")
        # A generic (but slow, full-scan) fallback used to look like this:
        #   logger.warning("performing full corpus scan to determine its length; was this intended?")
        #   return sum(1 for doc in self)  # sum(empty generator) == 0, so this works even for an empty corpus

    @staticmethod
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """
        Save an existing `corpus` to disk.

        Some formats also support saving the dictionary (`feature_id->word` mapping),
        which can in this case be provided by the optional `id2word` parameter.

        >>> MmCorpus.save_corpus('file.mm', corpus)

        Some corpora also support an index of where each document begins, so
        that the documents on disk can be accessed in O(1) time (see the
        `corpora.IndexedCorpus` base class). In this case, `save_corpus` is automatically
        called internally by `serialize`, which does `save_corpus` plus saves the index
        at the same time, so you want to store the corpus with::

        >>> MmCorpus.serialize('file.mm', corpus) # stores index as well, allowing random access to individual documents

        Calling `serialize()` is preferred to calling `save_corpus()`.

        The `metadata` flag is not interpreted here; its meaning is left to the
        implementing subclass (TODO confirm against concrete corpora).

        Abstract: this base implementation always raises `NotImplementedError`.
        """
        # A concrete implementation would typically look like this:
        #
        #   logger.info("converting corpus to ??? format: %s" % fname)
        #   with open(fname, 'w') as fout:
        #       for doc in corpus:  # iterate over the document stream
        #           fmt = str(doc)  # format the document appropriately...
        #           fout.write("%s\n" % fmt)  # serialize the formatted document to disk
        raise NotImplementedError('cannot instantiate abstract base class')
#endclass CorpusABC
class TransformedCorpus(CorpusABC):
    """
    Corpus wrapper that applies the transformation `obj` to the documents of
    `corpus` on the fly, while the wrapper is being iterated over.
    """

    def __init__(self, obj, corpus, chunksize=None):
        """
        Remember the transformation `obj`, the wrapped `corpus` and the optional
        `chunksize` (a falsy value means documents are transformed one by one).
        """
        self.obj = obj
        self.corpus = corpus
        self.chunksize = chunksize
        self.metadata = False

    def __len__(self):
        """Return the length of the wrapped corpus."""
        return len(self.corpus)

    def __iter__(self):
        """
        Yield the transformed documents one at a time.

        With a truthy `chunksize`, the wrapped corpus is split by `utils.grouper`
        and each group is handed to the transformation in a single call;
        otherwise every document is transformed individually.
        """
        if not self.chunksize:
            # no chunking requested: one transformation call per document
            for document in self.corpus:
                yield self.obj[document]
            return

        for batch in utils.grouper(self.corpus, self.chunksize):
            # chunksize=None is passed explicitly to the transformation for the batch call
            for result in self.obj.__getitem__(batch, chunksize=None):
                yield result
#endclass TransformedCorpus
class TransformationABC(utils.SaveLoad):
    """
    Interface for transformations. A 'transformation' is any object which accepts
    a sparse document via the dictionary notation `[]` and returns another sparse
    document in its stead::

    >>> transformed_doc = transformation[doc]

    or also::

    >>> transformed_corpus = transformation[corpus]

    See the :mod:`gensim.models.tfidfmodel` module for an example of a transformation.
    """

    def __getitem__(self, vec):
        """
        Transform a single vector from one vector space into another,
        **or** transform a whole corpus into another corpus.

        Abstract: subclasses must override this method.
        """
        raise NotImplementedError('cannot instantiate abstract base class')

    def _apply(self, corpus, chunksize=None):
        """
        Apply the transformation to a whole corpus (as opposed to a single document).

        The result is a `TransformedCorpus` wrapping `corpus`, this transformation
        and the given `chunksize`.
        """
        wrapped = TransformedCorpus(self, corpus, chunksize)
        return wrapped
#endclass TransformationABC
class SimilarityABC(utils.SaveLoad):
    """
    Abstract interface for similarity searches over a corpus.

    In all instances, there is a corpus against which we want to perform the
    similarity search.

    For each similarity search, the input is a document and the output are its
    similarities to individual corpus documents.

    Similarity queries are realized by calling ``self[query_document]``.

    There is also a convenience wrapper, where iterating over `self` yields
    similarities of each document in the corpus against the whole corpus (ie.,
    the query is each corpus document in turn).

    The methods below read the attributes `self.normalize`, `self.num_best` and
    `self.index` (and optionally `self.chunksize`); none of them is set in this
    base class, so concrete subclasses are expected to provide them.
    """

    def __init__(self, corpus):
        """Abstract: subclasses must provide their own constructor."""
        raise NotImplementedError("cannot instantiate Abstract Base Class")

    def get_similarities(self, doc):
        """
        Return similarities of `doc` to the indexed documents.

        Abstract: (Sparse)MatrixSimilarity override this method so that they
        both use the same `__getitem__` method, defined below.
        """
        raise NotImplementedError("cannot instantiate Abstract Base Class")

    def __getitem__(self, query):
        """Get similarities of document `query` to all documents in the corpus.

        **or**

        If `query` is a corpus (iterable of documents), return a matrix of similarities
        of all query documents vs. all corpus document. Using this type of batch
        query is more efficient than computing the similarities one document after
        another.

        If `self.num_best` is None, the raw result of `get_similarities` is
        returned; otherwise the result is clipped to the `self.num_best` top
        entries (per query document).
        """
        is_corpus, query = utils.is_corpus(query)

        # self.normalize only works if the input is a plain gensim vector/corpus (as
        # advertised in the doc). in fact, input can be a numpy or scipy.sparse matrix
        # as well, but in that case assume tricks are happening and don't normalize
        # anything (self.normalize has no effect; non-gensim input must already
        # come normalized).
        if self.normalize and not matutils.ismatrix(query):
            if is_corpus:
                query = [matutils.unitvec(v) for v in query]
            else:
                query = matutils.unitvec(query)

        result = self.get_similarities(query)
        if self.num_best is None:
            return result

        # if the input query was a corpus (=more documents), compute the top-n
        # most similar for each document in turn
        if matutils.ismatrix(result):
            return [matutils.full2sparse_clipped(v, self.num_best) for v in result]

        # otherwise, return top-n of the single input document
        return matutils.full2sparse_clipped(result, self.num_best)

    def __iter__(self):
        """
        For each index document, compute cosine similarity against all other
        documents in the index and yield the result.

        Query normalization is switched off while the iteration is in progress
        and the previous `self.normalize` value is restored when the generator
        finishes, is closed early, or fails with an exception.
        """
        # turn off query normalization (vectors in the index are assumed to be already normalized)
        norm = self.normalize
        self.normalize = False
        try:
            # Try to compute similarities in bigger chunks of documents (not
            # one query = a single document after another). The point is, a
            # bigger query of N documents is faster than N small queries of one
            # document.
            #
            # After computing similarities of the bigger query in `self[chunk]`,
            # yield the resulting similarities one after another, so that it looks
            # exactly the same as if they had been computed with many small queries.
            #
            # A missing or None `chunksize` means chunking is not supported; fall
            # back to the (slower) mode of 1 query=1 document. (Comparing None
            # with an int would raise TypeError on Python 3.)
            chunksize = getattr(self, 'chunksize', None)
            chunking = chunksize is not None and chunksize > 1

            if chunking:
                # assumes `self.index` holds the index as a 2-d numpy array.
                # this is true for MatrixSimilarity and SparseMatrixSimilarity, but
                # may not be true for other (future) classes..?
                num_docs = self.index.shape[0]
                for chunk_start in xrange(0, num_docs, chunksize):
                    # scipy.sparse doesn't allow slicing beyond real size of the matrix
                    # (unlike numpy). so, clip the end of the chunk explicitly to make
                    # scipy.sparse happy
                    chunk_end = min(num_docs, chunk_start + chunksize)
                    chunk = self.index[chunk_start : chunk_end]
                    if chunk.shape[0] > 1:
                        for sim in self[chunk]:
                            yield sim
                    else:
                        # a one-row chunk is queried as-is and its result yielded whole
                        yield self[chunk]
            else:
                for doc in self.index:
                    yield self[doc]
        finally:
            # restore old normalization value, even on error or early generator close
            self.normalize = norm
#endclass SimilarityABC