diff --git a/IPython/parallel/apps/ipcontrollerapp.py b/IPython/parallel/apps/ipcontrollerapp.py index f9fba94d46c..74e67ae0273 100755 --- a/IPython/parallel/apps/ipcontrollerapp.py +++ b/IPython/parallel/apps/ipcontrollerapp.py @@ -106,6 +106,13 @@ 'use the MongoDB backend'), 'dictdb' : ({'HubFactory' : {'db_class' : 'IPython.parallel.controller.dictdb.DictDB'}}, 'use the in-memory DictDB backend'), + 'nodb' : ({'HubFactory' : {'db_class' : 'IPython.parallel.controller.dictdb.NoDB'}}, + """use dummy DB backend, which doesn't store any information. + + This can be used to prevent growth of the memory footprint of the Hub + in cases where its record-keeping is not required. Requesting results + of tasks submitted by other clients, db_queries, and task resubmission + will not be available."""), 'reuse' : ({'IPControllerApp' : {'reuse_files' : True}}, 'reuse existing json connection files') }) diff --git a/IPython/parallel/controller/dictdb.py b/IPython/parallel/controller/dictdb.py index 21e44c599aa..10ce73e7e67 100644 --- a/IPython/parallel/controller/dictdb.py +++ b/IPython/parallel/controller/dictdb.py @@ -183,3 +183,34 @@ def get_history(self): """get all msg_ids, ordered by time submitted.""" msg_ids = self._records.keys() return sorted(msg_ids, key=lambda m: self._records[m]['submitted']) + +class NoDB(DictDB): + """A blackhole db backend that actually stores no information. + + Provides the full DB interface, but raises KeyErrors on any + method that tries to access the records. This can be used to + minimize the memory footprint of the Hub when its record-keeping + functionality is not required. 
+ """ + + def add_record(self, msg_id, record): + pass + + def get_record(self, msg_id): + raise KeyError("NoDB does not support record access") + + def update_record(self, msg_id, record): + pass + + def drop_matching_records(self, check): + pass + + def drop_record(self, msg_id): + pass + + def find_records(self, check, keys=None): + raise KeyError("NoDB does not store information") + + def get_history(self): + raise KeyError("NoDB does not store information") + diff --git a/docs/source/parallel/parallel_db.txt b/docs/source/parallel/parallel_db.txt index f3dea618f88..648223fe39a 100644 --- a/docs/source/parallel/parallel_db.txt +++ b/docs/source/parallel/parallel_db.txt @@ -112,3 +112,26 @@ Result headers for all jobs on engine 3 or 4: In [1]: uuids = map(rc._engines.get, (3,4)) In [2]: hist34 = rc.db_query({'engine_uuid' : {'$in' : uuids }, keys='result_header') + + +Cost +==== + +The advantage of the database backends is, of course, that large amounts of +data can be stored that won't fit in memory. The default 'backend' is actually +to just store all of this information in a Python dictionary. This is very fast, +but will run out of memory quickly if you move a lot of data around, or your +cluster is to run for a long time. + +Unfortunately, the DB backends (SQLite and MongoDB) right now are rather slow, +and can still consume large amounts of resources, particularly if large tasks +or results are being created at a high frequency. + +For this reason, we have added :class:`~.NoDB`, a dummy backend that doesn't +actually store any information. When you use this database, nothing is stored, +and any request for results will result in a KeyError. This obviously prevents +later requests for results and task resubmission from functioning, but +sometimes those nice features are not as useful as keeping Hub memory under +control.
+ + diff --git a/docs/source/parallel/parallel_process.txt b/docs/source/parallel/parallel_process.txt index 33068b617a4..d8c34c6d254 100644 --- a/docs/source/parallel/parallel_process.txt +++ b/docs/source/parallel/parallel_process.txt @@ -762,6 +762,10 @@ To use one of these backends, you must set the :attr:`HubFactory.db_class` trait # and SQLite: c.HubFactory.db_class = 'IPython.parallel.controller.sqlitedb.SQLiteDB' + + # You can use NoDB to disable the database altogether, in case you don't need + # to reuse tasks or results, and want to keep memory consumption under control. + c.HubFactory.db_class = 'IPython.parallel.controller.dictdb.NoDB' When using the proper databases, you can actually allow for tasks to persist from one session to the next by specifying the MongoDB database or SQLite table in @@ -789,6 +793,22 @@ you can specify any arguments you may need to the PyMongo `Connection # keyword args to pymongo.Connection c.MongoDB.connection_kwargs = {} +But sometimes you are moving lots of data around quickly, and you don't need +that information to be stored for later access, even by other Clients to this +same session. For this case, we have a dummy database, which doesn't actually +store anything. This lets the Hub stay small in memory, at the obvious expense +of being able to access the information that would have been stored in the +database (used for task resubmission, requesting results of tasks you didn't +submit, etc.). To use this backend, simply pass ``--nodb`` to +:command:`ipcontroller` on the command-line, or specify the :class:`NoDB` class +in your :file:`ipcontroller_config.py` as described above. + + +.. seealso:: + + For more information on the database backends, see the :ref:`db backend reference <parallel_db>`. + + .. _PyMongo: http://api.mongodb.org/python/1.9/ Configuring `ipengine`