In [15]:
import os
import mmap
import pickle
import operator
from functools import lru_cache
import sys
sys.path.append('/home/rohitalyosha/Student_Job/mannheim-nel/')
from utils import *
from logging import getLogger
log = getLogger()
from sqlitedict import SqliteDict

In [16]:
# Two handles on redirects data, used by the lookup benchmark cells at the end
# of the notebook:
#   rd     - result of json_load() on the redirects JSON file. json_load is not
#            defined in this notebook; presumably it comes from the `utils`
#            star-import - TODO confirm.
#   rd_sql - SqliteDict opened on the redirects sqlite file (presumably the same
#            mapping persisted in sqlite - verify).
rd = json_load('/home/rohitalyosha/Student_Job/mannheim-nel/data/dicts/redirects.json')
rd_sql = SqliteDict('/home/rohitalyosha/Student_Job/mannheim-nel/data/db/redirects.sqlite')

In [8]:
class FileObjectStore(object):
    """Key/object store persisted on disk through an ``mmdict``.

    ``path`` is the common prefix of the two files the underlying ``mmdict``
    reads and writes (``<path>.index`` and ``<path>.data``).

    Writing is all-or-nothing: ``save_many`` (and therefore ``save``) rewrites
    the complete store from the objects it is given; nothing is appended or
    merged.
    """

    def __init__(self, path):
        """Open the store at ``path``; a missing store yields an empty one."""
        self.path = path
        self.store = mmdict(path)

    @classmethod
    def get_protocol(cls):
        """Return the protocol name handled by this store type (``'file'``)."""
        return 'file'

    def iter_ids(self):
        """Return an iterator over every stored object id.

        Iterates the store's ``index`` mapping (key -> data offset) directly
        instead of calling dict-style helper methods on the store: the
        previous ``self.store.keys()`` call raised AttributeError because the
        store defined no such method.
        """
        return iter(self.store.index)

    def exists(self, oid):
        """Return True if ``oid`` is present in the store."""
        return oid in self.store

    def fetch(self, oid):
        """Return the object stored under ``oid`` (``None`` if it is unknown)."""
        return self.store[oid]

    def fetch_many(self, oids):
        """Return a list with the stored object for each id in ``oids``, in order."""
        return [self.fetch(oid) for oid in oids]

    def fetch_all(self):
        """Return a lazy iterator of ``(oid, object)`` pairs for the whole store.

        Built from the index plus ``fetch`` for the same reason as
        ``iter_ids``: the previous ``self.store.items()`` call raised
        AttributeError.
        """
        return ((oid, self.fetch(oid)) for oid in self.store.index)

    def save_many(self, obj_iter):
        """Replace the entire store with the ``(key, value)`` pairs of ``obj_iter``.

        The current mapping is closed *before* the files are rewritten, so
        ``obj_iter`` must not lazily read from this same store.
        """
        self.store.close()
        mmdict.write(self.path, obj_iter)
        self.store = mmdict(self.path)

    def save(self, obj):
        """Store a single ``(key, value)`` pair.

        NOTE: this delegates to ``save_many`` and therefore overwrites the
        whole store, leaving ``obj`` as its only entry.
        """
        self.save_many([obj])

    @classmethod
    def GetPath(cls, store_id, uri):
        """Translate a ``store_id`` (``a:b:c``) into a filesystem path (``a/b/c``).

        When ``uri`` starts with ``file://`` the remainder of the uri is used
        as the base directory; any other (or empty/None) uri is ignored.
        """
        scheme = 'file://'
        path = store_id.replace(':', '/')
        if uri and uri.startswith(scheme):
            path = os.path.join(uri[len(scheme):], path)
        return path

    @classmethod
    def Get(cls, store_id, uri='file://', **kwargs):
        """Build a store for ``store_id`` under ``uri``; extra kwargs are ignored."""
        return cls(cls.GetPath(store_id, uri))

class mmdict(object):
    """Read-only, dict-like view over pickled values stored in a memory-mapped file.

    A store consists of two files sharing the prefix ``path``:

    * ``<path>.data``  - the values, pickled back to back;
    * ``<path>.index`` - one pickled list of ``(key, byte offset)`` pairs.

    Stores are created with the static ``write`` method and are immutable
    afterwards. Opening a path that has no ``.index`` file gives an empty
    mapping (a warning is logged).

    Looking up an unknown key returns ``None`` instead of raising KeyError.
    Lookups are memoised per instance (LRU, 20000 entries), so repeated
    lookups of a key return the same object.
    """

    def __init__(self, path):
        self.path = path
        self.index = {}
        self.data_file = None
        self.data_mmap = None
        # The cache is created per instance. Decorating __getitem__ with
        # lru_cache at class level keys the cache on `self`, which keeps every
        # instance (and its open mmap / file handle) alive until its entries
        # are evicted and makes all stores share a single entry budget.
        self._cached_read = lru_cache(maxsize=20000)(self._read)

        index_path = self.path + '.index'
        if os.path.exists(index_path):
            log.debug('Loading mmap store: %s ...', index_path)
            with open(index_path, 'rb') as f:
                # Later duplicates of a key override earlier ones.
                self.index = dict(self.deserialise(f))

            self.data_file = open(path + '.data', 'rb')
            # mmap refuses to map a zero-length file (a store written from an
            # empty iterable); leave data_mmap as None in that case.
            if os.fstat(self.data_file.fileno()).st_size > 0:
                # access=ACCESS_READ is the portable spelling of a read-only
                # mapping; prot=PROT_READ only exists on Unix.
                self.data_mmap = mmap.mmap(
                    self.data_file.fileno(), 0, access=mmap.ACCESS_READ)
        else:
            log.warning('No existing mmap store found: %s ...', index_path)

    @staticmethod
    def serialise(obj, f):
        """Pickle ``obj`` onto the binary file object ``f``."""
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

    @staticmethod
    def deserialise(f):
        """Unpickle and return the next object from the binary file object ``f``.

        SECURITY: ``pickle.load`` can execute arbitrary code; only open stores
        that come from a trusted source.
        """
        return pickle.load(f)

    @staticmethod
    def static_itervalues(path):
        """Yield every value in ``<path>.data`` in file order, without an instance."""
        with open(path + '.data', 'rb') as f:
            while True:
                try:
                    yield mmdict.deserialise(f)
                except EOFError:
                    break

    def _read_at(self, offset):
        """Unpickle the value that starts at byte ``offset`` of the data file."""
        self.data_mmap.seek(offset)
        return self.deserialise(self.data_mmap)

    def _read(self, key):
        """Uncached lookup of a key that is known to be in the index."""
        return self._read_at(self.index[key])

    def iteritems(self):
        """Yield ``(key, value)`` pairs ordered by position in the data file.

        Every value is read at its own indexed offset, so the pairing stays
        correct when the data file holds more records than the index has keys
        (duplicate keys passed to ``write``).
        """
        for key, offset in sorted(self.index.items(), key=operator.itemgetter(1)):
            yield key, self._read_at(offset)

    def iterkeys(self):
        """Return an iterator over the keys."""
        return iter(self.index)

    def itervalues(self):
        """Yield every value stored in the data file, in file order.

        This walks the raw data file, so records shadowed by a later duplicate
        key are included. The read position is tracked locally, which keeps
        the iteration correct when lookups move the shared mmap cursor between
        two ``next()`` calls.
        """
        if self.data_mmap is None:
            return
        position = 0
        while True:
            self.data_mmap.seek(position)
            try:
                value = self.deserialise(self.data_mmap)
            except EOFError:
                break
            position = self.data_mmap.tell()
            yield value

    # Python 3 dict-style spellings of the iterators above.
    def keys(self):
        """Alias of ``iterkeys``."""
        return self.iterkeys()

    def values(self):
        """Alias of ``itervalues``."""
        return self.itervalues()

    def items(self):
        """Alias of ``iteritems``."""
        return self.iteritems()

    def __len__(self):
        return len(self.index)

    def __contains__(self, key):
        return key in self.index

    def __getitem__(self, key):
        """Return the value for ``key``, or ``None`` when the key is unknown."""
        if key not in self:
            return None
        return self._cached_read(key)

    def __enter__(self):
        return self

    def close(self):
        """Release the memory map and the data file handle; safe to call repeatedly."""
        # getattr guards keep this safe when called from __del__ on an
        # instance whose __init__ did not run to completion.
        data_mmap = getattr(self, 'data_mmap', None)
        if data_mmap is not None:
            data_mmap.close()
        data_file = getattr(self, 'data_file', None)
        if data_file is not None:
            data_file.close()

    def __exit__(self, type, value, traceback):
        self.close()

    def __del__(self):
        self.close()

    @staticmethod
    def write(path, iter_kvs):
        """Create (or overwrite) the store at ``path`` from ``(key, value)`` pairs.

        Values are pickled sequentially into ``<path>.data``; the offset at
        which each value starts is recorded and the resulting list of
        ``(key, offset)`` pairs is pickled into ``<path>.index``.
        """
        index = []
        with open(path + '.data', 'wb') as f:
            for key, value in iter_kvs:
                index.append((key, f.tell()))
                mmdict.serialise(value, f)
        with open(path + '.index', 'wb') as f:
            mmdict.serialise(index, f)


In [24]:
# Registries keyed by dictionary name; both are filled by the build loop below.
file_stores = {}  # name -> FileObjectStore backed by the mmap files
dicts = {}        # name -> object returned by json_load for that dictionary
# Names of the dictionaries to convert. Each name is used both as the JSON file
# stem under data/dicts/ and as the store name under data/mmaps/.
dict_names = ['ent_dict', 'word_dict', 'redirects', 'str_prior', 'str_cond', 'disamb', 'str_necounts']

In [25]:
# Convert every JSON dictionary into an mmap-backed store.
for dict_name in dict_names:
    # Keep the loaded dict around so the next cell can verify the round trip.
    d = json_load(f'/home/rohitalyosha/Student_Job/mannheim-nel/data/dicts/{dict_name}.json')
    dicts[dict_name] = d
    # The constructor opens whatever store already exists at this path; on a
    # first-time build there is none, which is what the
    # "No existing mmap store found" log line in the output reports.
    file_store = FileObjectStore(f'/home/rohitalyosha/Student_Job/mannheim-nel/data/mmaps/{dict_name}')
    # Rewrites the .data/.index files from the dict's (key, value) pairs and
    # reopens the store.
    file_store.save_many(d.items())
    file_stores[dict_name] = file_store
    print(dict_name)  # progress marker: this dictionary is done

ent_dict
word_dict
redirects
str_prior
str_cond
disamb


No existing mmap store found: /home/rohitalyosha/Student_Job/mannheim-nel/data/mmaps/str_necounts.index ...


str_necounts


In [27]:
# Round-trip check: every entry of each in-memory dict must be present in the
# matching mmap store and must unpickle to an equal value.
for dict_name, d in dicts.items():
    store = file_stores[dict_name]  # invariant for the inner loop, so look it up once
    for k, v in d.items():
        # fetch() returns None for an unknown key, so a missing key whose
        # expected value is None would pass the equality check; test
        # membership explicitly.
        assert store.exists(k), f'{dict_name}: key {k!r} is missing from the mmap store'
        assert v == store.fetch(k), f'{dict_name}: stored value differs for key {k!r}'
    print(dict_name)

ent_dict
word_dict
redirects
str_prior
str_cond
disamb
str_necounts


In [11]:
# Peek at the first ten redirect entries (source title, target title).
for i, (k, v) in enumerate(rd.items()):
    if i < 10:
        print(k, v)
    else:
        break

AccessibleComputing Computer_accessibility
AfghanistanHistory History_of_Afghanistan
AfghanistanGeography Geography_of_Afghanistan
AfghanistanPeople Demographics_of_Afghanistan
AfghanistanCommunications Communications_in_Afghanistan
AfghanistanTransportations Transport_in_Afghanistan
AfghanistanMilitary Afghan_Armed_Forces
AfghanistanTransnationalIssues Foreign_relations_of_Afghanistan
AssistiveTechnology Assistive_technology
AmoeboidTaxa Amoeba


In [19]:
%%timeit
file.fetch('AccessibleComputing')

287 ns ± 0.0639 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [20]:
%%timeit
# Baseline: the same key looked up in `rd`, the in-memory object returned by json_load.
rd.get('AccessibleComputing')

93.8 ns ± 0.0563 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)


In [21]:
%%timeit
# Comparison: the same key looked up through the SqliteDict handle `rd_sql`.
rd_sql.get('AccessibleComputing')

269 µs ± 8.32 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
