using more iterators to reduce memory usage

inpho · Mar 30, 2018 · a0aa67f · a0aa67f
1 parent a7e324d
commit a0aa67f
Show file tree

Hide file tree

Showing 2 changed files with 27 additions and 15 deletions.
diff --git a/vsm/corpus/base.py b/vsm/corpus/base.py
@@ -2,9 +2,12 @@
 from builtins import str
 from builtins import range
 from builtins import object
+
+from itertools import tee
 import os
 
 import numpy as np
+from sortedcontainers import SortedSet
 
 from vsm.structarr import arr_add_field
 from vsm.split import split_corpus
@@ -155,27 +158,32 @@ def __init__(self,
         if to_array:
             self.corpus = np.asarray(corpus, dtype=dtype)
             self.dtype = self.corpus.dtype
+            self.words = np.unique(self.corpus)
         else:
-            self.corpus = corpus[:]
+            self.corpus, self.words = tee(corpus)
             self.dtype = dtype
+            self.words = np.asarray(SortedSet(self.words), dtype=np.object_)
 
         # Since np.unique attempts to make a whole contiguous copy of the
         # corpus array, we instead use a sorted set and cast to a np array
-        # equivalent to self.words = np.unique(self.corpus)
-        self.words = np.asarray(sorted(set(self.corpus)), dtype=np.object_)
+        # equivalent to np.unique(self.corpus).
+
+        if hasattr(self.corpus, '__len__'):
+            self._append_context_types(context_data, context_types)
+            self.original_length = len(self.corpus)
 
+        self.stopped_words = set()
+        if remove_empty:
+            self.remove_empty()
+
+    def _append_context_types(self, context_data, context_types):
         self.context_data = []
         for t in context_data:
             if self._validate_indices(t['idx']):
                 self.context_data.append(t)
 
         self._gen_context_types(context_types)
 
-        self.stopped_words = set()
-        self.original_length = len(self.corpus)
-
-        if remove_empty:
-            self.remove_empty()
 
     def __len__(self):
         """
@@ -497,9 +505,12 @@ def __init__(self,
         else:
             self.dtype = np.uint32
 
-        self.corpus = np.asarray([self.words_int[word]
-                                  for word in self.corpus],
-                                 dtype=self.dtype)
+        self.corpus = np.fromiter(
+            (self.words_int[word] for word in self.corpus),
+            dtype=self.dtype)
+            #count=len(self.corpus))
+
+        self._append_context_types(context_data, context_types)
 
         self.stopped_words = set()
         self.original_length = len(self.corpus)

diff --git a/vsm/extensions/corpusbuilders/corpusbuilders.py b/vsm/extensions/corpusbuilders/corpusbuilders.py
@@ -2,17 +2,18 @@
 from builtins import zip
 from builtins import str
 from builtins import range
+
+from codecs import open
+from itertools import chain
 import os
 
 import numpy as np
+from progressbar import ProgressBar, Percentage, Bar
 from unidecode import unidecode
-from codecs import open
 
 from vsm.corpus import Corpus
 from .util import *
 
-from progressbar import ProgressBar, Percentage, Bar
-
 __all__ = ['empty_corpus', 'random_corpus',
            'toy_corpus', 'corpus_fromlist',
            'file_corpus', 'dir_corpus', 'coll_corpus', 'json_corpus',
@@ -160,7 +161,7 @@ def corpus_fromlist(ls, context_type='context'):
     [array([(2, 'sentence_0'), (3, 'sentence_1'), (5, 'sentence_2')], 
           dtype=[('idx', '<i8'), ('sentence_label', '|S10')])]
     """
-    corpus = [w for ctx in ls for w in ctx]
+    corpus = chain.from_iterable(ls)    #[w for ctx in ls for w in ctx]
 
     indices = np.cumsum([len(sbls) for sbls in ls])
     metadata = ['{0}_{1}'.format(context_type, i)