In [58]:
import msgpack
import spacy
import textacy
from spacy.language import Language as SpacyLang
from spacy.tokens.doc import Doc as SpacyDoc
from textacy import cache
from textacy import compat
from textacy.io.utils import open_sesame
from pympler import asizeof
import humanfriendly
import os, sys, re, gc
import psutil

def remove_whitespace_entities(doc):
    doc.ents = [ e for e in doc.ents if not e.text.isspace() ]
    return doc

def keep_hyphen_tokenizer(nlp):
    infix_re = re.compile(r'''[.\,\?\:\;\...\‘\’\`\“\”\"\'~]''')
    prefix_re = spacy.util.compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = spacy.util.compile_suffix_regex(nlp.Defaults.suffixes)
    
def read_spacy_docs(fname, lang):
    vocab = lang.vocab
    docs = []
    with open_sesame(fname, mode='rb') as f:

        unpacker = msgpack.Unpacker(f, encoding='UTF-8')

        for msg in unpacker:

            if "user_data_keys" in msg:

                user_data_keys = msgpack.loads(msg["user_data_keys"], use_list=False, encoding='utf-8')
                for encoding in ['utf-8', 'latin1']:
                    try:
                        user_data_values = msgpack.loads(msg["user_data_values"], encoding=encoding)
                    except:
                        if encoding == 'latin1': raise

                user_data = { key: value for key, value in compat.zip_(user_data_keys, user_data_values)}
            else:
                print('error')
                user_data = None

            text = msg["text"]
            attrs = msg["array_body"]
            words = []
            spaces = []
            start = 0
            for i in compat.range_(attrs.shape[0]):
                end = start + int(attrs[i, 0])
                has_space = int(attrs[i, 1])
                words.append(text[start: end])
                spaces.append(bool(has_space))
                start = end + has_space

            spacy_doc = SpacyDoc(vocab, words=words, spaces=spaces, user_data=user_data)
            spacy_doc = spacy_doc.from_array(msg["array_head"][2:], attrs[:, 2:])
            if "sentiment" in msg:
                spacy_doc.sentiment = msg["sentiment"]
            if "tensor" in msg:
                spacy_doc.tensor = msg["tensor"]

#            print(
#                'msg: ' + humanfriendly.format_size(asizeof.asizeof(msg)),
#                'words: ' + humanfriendly.format_size(asizeof.asizeof(words)),
#                'array_head: ' + humanfriendly.format_size(asizeof.asizeof(msg["array_head"])),
#                'array_body: ' + humanfriendly.format_size(asizeof.asizeof(msg["array_body"])),
#                'spacy_doc: ', asizeof.asizeof(spacy_doc)
#            )

            docs.append(spacy_doc)
    return docs

p = psutil.Process(os.getpid())

print('MEM: ' + humanfriendly.format_size(p.memory_info().rss))

if 'corpus' in globals(): del corpus
if 'nlp' in globals(): del nlp
if 'docs' in globals(): del docs

gc.collect()
print('MEM: ' + humanfriendly.format_size(p.memory_info().rss))

fname = '../../data/benedict-xvi_curated_20190326.txt_preprocessed_en__disable(parser,ner,textcat)_.bin.bz2'

SpacyLang.factories['remove_whitespace_entities'] = lambda nlp, **cfg: remove_whitespace_entities

nlp = spacy.load('en_core_web_sm', disable=['parser','ner','textcat'])
nlp.tokenizer = keep_hyphen_tokenizer(nlp)

docs = read_spacy_docs(fname, nlp)
corpus = textacy.Corpus(docs=docs, lang=nlp)

print('MEM: ' + humanfriendly.format_size(p.memory_info().rss))

#for doc in corpus[:25]:
#    doc_size = asizeof.asizeof(doc)
#    print('MEM: ' + humanfriendly.format_size(doc_size))


MEM: 2.69 GB
MEM: 2.69 GB
MEM: 2.69 GB


In [47]:
d = doc.spacy_doc
def p(k,v):
    x = asizeof.asizeof(v)
    print(k, humanfriendly.format_size(x))
    

In [57]:

%reset -f


NameError: name 'gc' is not defined

In [24]:
for k in doc.__dict__.keys():
    try:
        x = asizeof.asizeof(doc[k])
        print(k, humanfriendly.format_size(x))
    except:
        print(k, 'failed')



corpus_index failed
_counted_ngrams failed
spacy_stringstore failed
corpus failed
_counts failed
spacy_doc failed
spacy_vocab failed


In [44]:
import inspect
members = [ (x, str(type(doc.spacy_doc.__getattribute__(x)))) for x,y in inspect.getmembers(doc.spacy_doc)]
members
#doc.spacy_doc.__sizeof__()
#[x for x in members if map(lambda x: x is in  'builtin_function_or_method' not in x[1] ]

[('_', "<class 'spacy.tokens.underscore.Underscore'>"),
 ('__bytes__', "<class 'builtin_function_or_method'>"),
 ('__class__', "<class 'type'>"),
 ('__delattr__', "<class 'method-wrapper'>"),
 ('__dir__', "<class 'builtin_function_or_method'>"),
 ('__doc__', "<class 'str'>"),
 ('__eq__', "<class 'method-wrapper'>"),
 ('__format__', "<class 'builtin_function_or_method'>"),
 ('__ge__', "<class 'method-wrapper'>"),
 ('__getattribute__', "<class 'method-wrapper'>"),
 ('__getitem__', "<class 'method-wrapper'>"),
 ('__gt__', "<class 'method-wrapper'>"),
 ('__hash__', "<class 'method-wrapper'>"),
 ('__init__', "<class 'method-wrapper'>"),
 ('__iter__', "<class 'method-wrapper'>"),
 ('__le__', "<class 'method-wrapper'>"),
 ('__len__', "<class 'method-wrapper'>"),
 ('__lt__', "<class 'method-wrapper'>"),
 ('__ne__', "<class 'method-wrapper'>"),
 ('__new__', "<class 'builtin_function_or_method'>"),
 ('__pyx_vtable__', "<class 'PyCapsule'>"),
 ('__reduce__', "<class 'builtin_function_or_method'>"