Skip to content

Commit

Permalink
using more iterators to reduce memory usage
Browse files (browse the repository at this point in the history)
  • Loading branch information
JaimieMurdock committed Mar 30, 2018
1 parent a7e324d commit a0aa67f
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 15 deletions.
33 changes: 22 additions & 11 deletions vsm/corpus/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,12 @@
from builtins import str
from builtins import range
from builtins import object

from itertools import tee
import os

import numpy as np
from sortedcontainers import SortedSet

from vsm.structarr import arr_add_field
from vsm.split import split_corpus
Expand Down Expand Up @@ -155,27 +158,32 @@ def __init__(self,
if to_array:
self.corpus = np.asarray(corpus, dtype=dtype)
self.dtype = self.corpus.dtype
self.words = np.unique(self.corpus)
else:
self.corpus = corpus[:]
self.corpus, self.words = tee(corpus)
self.dtype = dtype
self.words = np.asarray(SortedSet(self.words), dtype=np.object_)

# Since np.unique attempts to make a whole contiguous copy of the
# corpus array, we instead use a sorted set and cast to a np array
# equivalent to self.words = np.unique(self.corpus)
self.words = np.asarray(sorted(set(self.corpus)), dtype=np.object_)
# equivalent to

if hasattr(self.corpus, '__len__'):
self._append_context_types(context_data, context_types)
self.original_length = len(self.corpus)

self.stopped_words = set()
if remove_empty:
self.remove_empty()

def _append_context_types(self, context_data, context_types):
    """
    Reset ``self.context_data`` and refill it from `context_data`.

    Each entry of `context_data` is kept only when
    ``self._validate_indices(entry['idx'])`` returns a truthy value;
    entries are stored in the order they were supplied. Afterwards
    ``self._gen_context_types(context_types)`` is called.

    NOTE(review): the exact checks performed by `_validate_indices`
    and the attributes produced by `_gen_context_types` are defined
    elsewhere in the class -- confirm there before relying on them.

    :param context_data: Iterable of records exposing an ``'idx'``
        field (presumably structured arrays of indices and labels --
        verify against callers).

    :param context_types: Passed through unchanged to
        `_gen_context_types`.
    """
    # Start from an empty list so repeated calls do not accumulate.
    self.context_data = []
    for entry in context_data:
        # Drop any tokenization whose indices fail validation.
        if not self._validate_indices(entry['idx']):
            continue
        self.context_data.append(entry)

    self._gen_context_types(context_types)

self.stopped_words = set()
self.original_length = len(self.corpus)

if remove_empty:
self.remove_empty()

def __len__(self):
"""
Expand Down Expand Up @@ -497,9 +505,12 @@ def __init__(self,
else:
self.dtype = np.uint32

self.corpus = np.asarray([self.words_int[word]
for word in self.corpus],
dtype=self.dtype)
self.corpus = np.fromiter(
(self.words_int[word] for word in self.corpus),
dtype=self.dtype)
#count=len(self.corpus))

self._append_context_types(context_data, context_types)

self.stopped_words = set()
self.original_length = len(self.corpus)
Expand Down
9 changes: 5 additions & 4 deletions vsm/extensions/corpusbuilders/corpusbuilders.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,18 @@
from builtins import zip
from builtins import str
from builtins import range

from codecs import open
from itertools import chain
import os

import numpy as np
from progressbar import ProgressBar, Percentage, Bar
from unidecode import unidecode
from codecs import open

from vsm.corpus import Corpus
from .util import *

from progressbar import ProgressBar, Percentage, Bar

__all__ = ['empty_corpus', 'random_corpus',
'toy_corpus', 'corpus_fromlist',
'file_corpus', 'dir_corpus', 'coll_corpus', 'json_corpus',
Expand Down Expand Up @@ -160,7 +161,7 @@ def corpus_fromlist(ls, context_type='context'):
[array([(2, 'sentence_0'), (3, 'sentence_1'), (5, 'sentence_2')],
dtype=[('idx', '<i8'), ('sentence_label', '|S10')])]
"""
corpus = [w for ctx in ls for w in ctx]
corpus = chain.from_iterable(ls) #[w for ctx in ls for w in ctx]

indices = np.cumsum([len(sbls) for sbls in ls])
metadata = ['{0}_{1}'.format(context_type, i)
Expand Down

0 comments on commit a0aa67f

Please sign in to comment.