
general parallel code cleanup

commit b8dd49c8e1257d51af09c545c964031c3b5517f4 1 parent 077c1af
@minrk authored
180 IPython/zmq/parallel/controller.py
@@ -1,6 +1,7 @@
#!/usr/bin/env python
"""The IPython Controller with 0MQ
-This is the master object that handles connections from engines, clients, and
+This is the master object that handles connections from engines and clients,
+and monitors traffic through the various queues.
"""
#-----------------------------------------------------------------------------
# Copyright (C) 2010 The IPython Development Team
@@ -28,12 +29,14 @@
from streamsession import Message, wrap_exception
from entry_point import (make_base_argument_parser, select_random_ports, split_ports,
connect_logger, parse_url)
-# from messages import json # use the same import switches
#-----------------------------------------------------------------------------
# Code
#-----------------------------------------------------------------------------
+def _passer(*args, **kwargs):
+ return
+
class ReverseDict(dict):
"""simple double-keyed subset of dict methods."""
@@ -93,16 +96,18 @@ class Controller(object):
loop: zmq IOLoop instance
session: StreamSession object
- context: zmq context for creating new connections (?)
+ queue: ZMQStream for monitoring the command queue (SUB)
registrar: ZMQStream for engine registration requests (XREP)
+ heartbeat: HeartMonitor object checking the pulse of the engines
clientele: ZMQStream for client connections (XREP)
not used for jobs, only query/control commands
- queue: ZMQStream for monitoring the command queue (SUB)
- heartbeat: HeartMonitor object checking the pulse of the engines
- db_stream: connection to db for out of memory logging of commands
+ notifier: ZMQStream for broadcasting engine registration changes (PUB)
+ db: connection to db for out of memory logging of commands
NotImplemented
- queue_addr: zmq connection address of the XREP socket for the queue
- hb_addr: zmq connection address of the PUB socket for heartbeats
- task_addr: zmq connection address of the XREQ socket for task queue
+ engine_addrs: dict of zmq connection information for engines to connect
+ to the queues.
+ client_addrs: dict of zmq connection information for clients to connect
+ to the queues.
"""
# internal data structures:
ids=None # engine IDs
@@ -165,14 +170,16 @@ def __init__(self, loop, session, queue, registrar, heartbeat, clientele, notifi
self.notifier = notifier
self.db = db
+ # validate connection dicts:
self.client_addrs = client_addrs
assert isinstance(client_addrs['queue'], str)
+ assert isinstance(client_addrs['control'], str)
# self.hb_addrs = hb_addrs
self.engine_addrs = engine_addrs
assert isinstance(engine_addrs['queue'], str)
+ assert isinstance(engine_addrs['control'], str)
assert len(engine_addrs['heartbeat']) == 2
-
# register our callbacks
self.registrar.on_recv(self.dispatch_register_request)
self.clientele.on_recv(self.dispatch_client_msg)
@@ -182,19 +189,25 @@ def __init__(self, loop, session, queue, registrar, heartbeat, clientele, notifi
heartbeat.add_heart_failure_handler(self.handle_heart_failure)
heartbeat.add_new_heart_handler(self.handle_new_heart)
- if self.db is not None:
- self.db.on_recv(self.dispatch_db)
-
+ self.queue_handlers = { 'in' : self.save_queue_request,
+ 'out': self.save_queue_result,
+ 'intask': self.save_task_request,
+ 'outtask': self.save_task_result,
+ 'tracktask': self.save_task_destination,
+ 'incontrol': _passer,
+ 'outcontrol': _passer,
+ }
+
self.client_handlers = {'queue_request': self.queue_status,
'result_request': self.get_results,
'purge_request': self.purge_results,
+ 'load_request': self.check_load,
'resubmit_request': self.resubmit_task,
}
self.registrar_handlers = {'registration_request' : self.register_engine,
'unregistration_request' : self.unregister_engine,
'connection_request': self.connection_request,
-
}
#
# this is the stuff that will move to DB:
@@ -272,7 +285,7 @@ def dispatch_register_request(self, msg):
print (idents,msg, len(msg))
try:
msg = self.session.unpack_message(msg,content=True)
- except Exception, e:
+ except Exception as e:
logger.error("registration::got bad registration message: %s"%msg)
raise e
return
@@ -291,18 +304,9 @@ def dispatch_queue_traffic(self, msg):
logger.debug("queue traffic: %s"%msg[:2])
switch = msg[0]
idents, msg = self.session.feed_identities(msg[1:])
- if switch == 'in':
- self.save_queue_request(idents, msg)
- elif switch == 'out':
- self.save_queue_result(idents, msg)
- elif switch == 'intask':
- self.save_task_request(idents, msg)
- elif switch == 'outtask':
- self.save_task_result(idents, msg)
- elif switch == 'tracktask':
- self.save_task_destination(idents, msg)
- elif switch in ('incontrol', 'outcontrol'):
- pass
+ handler = self.queue_handlers.get(switch, None)
+ if handler is not None:
+ handler(idents, msg)
else:
logger.error("Invalid message topic: %s"%switch)
@@ -392,7 +396,7 @@ def save_queue_request(self, idents, msg):
received=None,
engine=(eid, queue_id))
self.pending[msg_id] = ( msg, info )
- self.queues[eid][0].append(msg_id)
+ self.queues[eid].append(msg_id)
def save_queue_result(self, idents, msg):
client_id, queue_id = idents[:2]
@@ -417,7 +421,7 @@ def save_queue_result(self, idents, msg):
self.results[msg_id] = msg
if msg_id in self.pending:
self.pending.pop(msg_id)
- self.queues[eid][0].remove(msg_id)
+ self.queues[eid].remove(msg_id)
self.completed[eid].append(msg_id)
else:
logger.debug("queue:: unknown msg finished %s"%msg_id)
@@ -425,6 +429,7 @@ def save_queue_result(self, idents, msg):
#--------------------- Task Queue Traffic ------------------------------
def save_task_request(self, idents, msg):
+ """Save the submission of a task."""
client_id = idents[0]
try:
@@ -437,13 +442,17 @@ def save_task_request(self, idents, msg):
header = msg['header']
msg_id = header['msg_id']
self.mia.add(msg_id)
- self.pending[msg_id] = msg
+ info = dict(submit=datetime.now(),
+ received=None,
+ engine=None)
+ self.pending[msg_id] = (msg, info)
if not self.tasks.has_key(client_id):
self.tasks[client_id] = []
self.tasks[client_id].append(msg_id)
def save_task_result(self, idents, msg):
- client_id = idents[0]
+ """save the result of a completed task."""
+ client_id, engine_uuid = idents[:2]
try:
msg = self.session.unpack_message(msg, content=False)
except:
@@ -452,16 +461,19 @@ def save_task_result(self, idents, msg):
return
parent = msg['parent_header']
+ eid = self.by_ident[engine_uuid]
if not parent:
# print msg
# logger.warn("")
return
msg_id = parent['msg_id']
self.results[msg_id] = msg
- if msg_id in self.pending:
+ if msg_id in self.pending and msg_id in self.tasks[eid]:
self.pending.pop(msg_id)
if msg_id in self.mia:
self.mia.remove(msg_id)
+ self.completed[eid].append(msg_id)
+ self.tasks[eid].remove(msg_id)
else:
logger.debug("task::unknown task %s finished"%msg_id)
@@ -475,16 +487,16 @@ def save_task_destination(self, idents, msg):
print (content)
msg_id = content['msg_id']
engine_uuid = content['engine_id']
- for eid,queue_id in self.keytable.iteritems():
- if queue_id == engine_uuid:
- break
+ eid = self.by_ident[engine_uuid]
logger.info("task::task %s arrived on %s"%(msg_id, eid))
if msg_id in self.mia:
self.mia.remove(msg_id)
else:
logger.debug("task::task %s not listed as MIA?!"%(msg_id))
- self.tasks[engine_uuid].append(msg_id)
+
+ self.tasks[eid].append(msg_id)
+ self.pending[msg_id][1].update(received=datetime.now(),engine=(eid,engine_uuid))
def mia_task_request(self, idents, msg):
client_id = idents[0]
@@ -493,10 +505,12 @@ def mia_task_request(self, idents, msg):
- #-------------------- Registration -----------------------------
+ #-------------------------------------------------------------------------
+ # Registration requests
+ #-------------------------------------------------------------------------
def connection_request(self, client_id, msg):
- """reply with connection addresses for clients"""
+ """Reply with connection addresses for clients."""
logger.info("client::client %s connected"%client_id)
content = dict(status='ok')
content.update(self.client_addrs)
@@ -507,7 +521,7 @@ def connection_request(self, client_id, msg):
self.session.send(self.registrar, 'connection_reply', content, parent=msg, ident=client_id)
def register_engine(self, reg, msg):
- """register an engine"""
+ """Register a new engine."""
content = msg['content']
try:
queue = content['queue']
@@ -556,6 +570,7 @@ def register_engine(self, reg, msg):
return eid
def unregister_engine(self, ident, msg):
+ """Unregister an engine that explicitly requested to leave."""
try:
eid = msg['content']['id']
except:
@@ -569,7 +584,7 @@ def unregister_engine(self, ident, msg):
self.hearts.pop(ec.heartbeat)
self.by_ident.pop(ec.queue)
self.completed.pop(eid)
- for msg_id in self.queues.pop(eid)[0]:
+ for msg_id in self.queues.pop(eid):
msg = self.pending.pop(msg_id)
############## TODO: HANDLE IT ################
@@ -577,6 +592,8 @@ def unregister_engine(self, ident, msg):
self.session.send(self.notifier, "unregistration_notification", content=content)
def finish_registration(self, heart):
+ """Second half of engine registration, called after our HeartMonitor
+ has received a beat from the Engine's Heart."""
try:
(eid,queue,reg,purge) = self.incoming_registrations.pop(heart)
except KeyError:
@@ -590,7 +607,8 @@ def finish_registration(self, heart):
self.keytable[eid] = queue
self.engines[eid] = EngineConnector(eid, queue, reg, control, heart)
self.by_ident[queue] = eid
- self.queues[eid] = ([],[])
+ self.queues[eid] = list()
+ self.tasks[eid] = list()
self.completed[eid] = list()
self.hearts[heart] = eid
content = dict(id=eid, queue=self.engines[eid].queue)
@@ -604,7 +622,9 @@ def _purge_stalled_registration(self, heart):
else:
pass
- #------------------- Client Requests -------------------------------
+ #-------------------------------------------------------------------------
+ # Client Requests
+ #-------------------------------------------------------------------------
def check_load(self, client_id, msg):
content = msg['content']
@@ -620,12 +640,17 @@ def check_load(self, client_id, msg):
content = dict(status='ok')
# loads = {}
for t in targets:
- content[str(t)] = len(self.queues[t])
+ content[bytes(t)] = len(self.queues[t])+len(self.tasks[t])
self.session.send(self.clientele, "load_reply", content=content, ident=client_id)
def queue_status(self, client_id, msg):
- """handle queue_status request"""
+ """Return the Queue status of one or more targets.
+ if verbose: return the msg_ids
+ else: return len of each type.
+ keys: queue (pending MUX jobs)
+ tasks (pending Task jobs)
+ completed (finished jobs from both queues)"""
content = msg['content']
targets = content['targets']
try:
@@ -635,19 +660,23 @@ def queue_status(self, client_id, msg):
self.session.send(self.clientele, "controller_error",
content=content, ident=client_id)
return
- verbose = msg.get('verbose', False)
- content = dict()
+ verbose = content.get('verbose', False)
+ content = dict(status='ok')
for t in targets:
queue = self.queues[t]
completed = self.completed[t]
+ tasks = self.tasks[t]
if not verbose:
queue = len(queue)
completed = len(completed)
- content[str(t)] = {'queue': queue, 'completed': completed }
+ tasks = len(tasks)
+ content[bytes(t)] = {'queue': queue, 'completed': completed, 'tasks': tasks}
# pending
self.session.send(self.clientele, "queue_reply", content=content, ident=client_id)
def purge_results(self, client_id, msg):
+ """Purge results from memory. This method is more valuable before we move
+ to a DB-based message storage mechanism."""
content = msg['content']
msg_ids = content.get('msg_ids', [])
reply = dict(status='ok')
@@ -675,37 +704,11 @@ def purge_results(self, client_id, msg):
self.session.send(self.clientele, 'purge_reply', content=reply, ident=client_id)
def resubmit_task(self, client_id, msg, buffers):
- content = msg['content']
- header = msg['header']
-
-
- msg_ids = content.get('msg_ids', [])
- reply = dict(status='ok')
- if msg_ids == 'all':
- self.results = {}
- else:
- for msg_id in msg_ids:
- if msg_id in self.results:
- self.results.pop(msg_id)
- else:
- if msg_id in self.pending:
- reply = dict(status='error', reason="msg pending: %r"%msg_id)
- else:
- reply = dict(status='error', reason="No such msg: %r"%msg_id)
- break
- eids = content.get('engine_ids', [])
- for eid in eids:
- if eid not in self.engines:
- reply = dict(status='error', reason="No such engine: %i"%eid)
- break
- msg_ids = self.completed.pop(eid)
- for msg_id in msg_ids:
- self.results.pop(msg_id)
-
- self.sesison.send(self.clientele, 'purge_reply', content=reply, ident=client_id)
+ """Resubmit a task."""
+ raise NotImplementedError
def get_results(self, client_id, msg):
- """get the result of 1 or more messages"""
+ """Get the result of 1 or more messages."""
content = msg['content']
msg_ids = set(content['msg_ids'])
statusonly = content.get('status_only', False)
@@ -727,33 +730,12 @@ def get_results(self, client_id, msg):
break
self.session.send(self.clientele, "result_reply", content=content,
parent=msg, ident=client_id)
-
-############ OLD METHODS for Python Relay Controller ###################
- def _validate_engine_msg(self, msg):
- """validates and unpacks headers of a message. Returns False if invalid,
- (ident, message)"""
- ident = msg[0]
- try:
- msg = self.session.unpack_message(msg[1:], content=False)
- except:
- logger.error("engine.%s::Invalid Message %s"%(ident, msg))
- return False
-
- try:
- eid = msg.header.username
- assert self.engines.has_key(eid)
- except:
- logger.error("engine::Invalid Engine ID %s"%(ident))
- return False
-
- return eid, msg
-
-
-#--------------------
+#-------------------------------------------------------------------------
# Entry Point
-#--------------------
+#-------------------------------------------------------------------------
+
def make_argument_parser():
"""Make an argument parser"""
parser = make_base_argument_parser()
2  IPython/zmq/parallel/heartmonitor.py
@@ -130,7 +130,7 @@ def handle_heart_failure(self, heart):
for handler in self._failure_handlers:
try:
handler(heart)
- except Exception, e:
+ except Exception as e:
print (e)
logger.error("heartbeat::Bad Handler! %s"%handler)
pass
41 IPython/zmq/parallel/scheduler.py
@@ -1,3 +1,10 @@
+"""The Python scheduler for rich scheduling.
+
+The Pure ZMQ scheduler does not allow routing schemes other than LRU,
+nor does it check msg_id DAG dependencies. For those, a slightly slower
+Python Scheduler exists.
+"""
+
#----------------------------------------------------------------------
# Imports
#----------------------------------------------------------------------
@@ -40,7 +47,7 @@ def plainrandom(loads):
def lru(loads):
"""Always pick the front of the line.
- The content of loads is ignored.
+ The content of `loads` is ignored.
Assumes LRU ordering of loads, with oldest first.
"""
@@ -151,10 +158,12 @@ def __init__(self, client_stream, engine_stream, mon_stream,
self.notifier_stream.on_recv(self.dispatch_notification)
def resume_receiving(self):
- """resume accepting jobs"""
+ """Resume accepting jobs."""
self.client_stream.on_recv(self.dispatch_submission, copy=False)
def stop_receiving(self):
+ """Stop accepting jobs while there are no engines.
+ Leave them in the ZMQ queue."""
self.client_stream.on_recv(None)
#-----------------------------------------------------------------------
@@ -176,7 +185,7 @@ def dispatch_notification(self, msg):
logger.error("task::Invalid notification msg: %s"%msg)
@logged
def _register_engine(self, uid):
- """new engine became available"""
+ """New engine with ident `uid` became available."""
# head of the line:
self.targets.insert(0,uid)
self.loads.insert(0,0)
@@ -187,10 +196,12 @@ def _register_engine(self, uid):
self.resume_receiving()
def _unregister_engine(self, uid):
- """existing engine became unavailable"""
- # handle any potentially finished tasks:
+ """Existing engine with ident `uid` became unavailable."""
if len(self.targets) == 1:
+ # this was our only engine
self.stop_receiving()
+
+ # handle any potentially finished tasks:
self.engine_stream.flush()
self.completed.pop(uid)
@@ -203,7 +214,7 @@ def _unregister_engine(self, uid):
self.handle_stranded_tasks(lost)
def handle_stranded_tasks(self, lost):
- """deal with jobs resident in an engine that died."""
+ """Deal with jobs resident in an engine that died."""
# TODO: resubmit the tasks?
for msg_id in lost:
pass
@@ -214,26 +225,29 @@ def handle_stranded_tasks(self, lost):
#-----------------------------------------------------------------------
@logged
def dispatch_submission(self, raw_msg):
- """dispatch job submission"""
+ """Dispatch job submission to appropriate handlers."""
# ensure targets up to date:
self.notifier_stream.flush()
try:
idents, msg = self.session.feed_identities(raw_msg, copy=False)
- except Exception, e:
+ except Exception as e:
logger.error("task::Invaid msg: %s"%msg)
return
msg = self.session.unpack_message(msg, content=False, copy=False)
header = msg['header']
msg_id = header['msg_id']
+
+ # time dependencies
after = Dependency(header.get('after', []))
if after.mode == 'all':
after.difference_update(self.all_done)
if after.check(self.all_done):
- # recast as empty set, if we are already met,
- # to prevent
+ # recast as empty set, if `after` already met,
+ # to prevent unnecessary set comparisons
after = Dependency([])
+ # location dependencies
follow = Dependency(header.get('follow', []))
if len(after) == 0:
# time deps already met, try to run
@@ -244,6 +258,7 @@ def dispatch_submission(self, raw_msg):
self.save_unmet(msg_id, raw_msg, after, follow)
# send to monitor
self.mon_stream.send_multipart(['intask']+raw_msg, copy=False)
+
@logged
def maybe_run(self, msg_id, raw_msg, follow=None):
"""check location dependencies, and run if they are met."""
@@ -276,7 +291,7 @@ def save_unmet(self, msg_id, msg, after, follow):
@logged
def submit_task(self, msg_id, msg, follow=None, indices=None):
- """submit a task to any of a subset of our targets"""
+ """Submit a task to any of a subset of our targets."""
if indices:
loads = [self.loads[i] for i in indices]
else:
@@ -290,6 +305,8 @@ def submit_task(self, msg_id, msg, follow=None, indices=None):
self.engine_stream.send_multipart(msg, copy=False)
self.add_job(idx)
self.pending[target][msg_id] = (msg, follow)
+ content = dict(msg_id=msg_id, engine_id=target)
+ self.session.send(self.mon_stream, 'task_destination', content=content, ident='tracktask')
#-----------------------------------------------------------------------
# Result Handling
@@ -298,7 +315,7 @@ def submit_task(self, msg_id, msg, follow=None, indices=None):
def dispatch_result(self, raw_msg):
try:
idents,msg = self.session.feed_identities(raw_msg, copy=False)
- except Exception, e:
+ except Exception as e:
logger.error("task::Invaid result: %s"%msg)
return
msg = self.session.unpack_message(msg, content=False, copy=False)
15 IPython/zmq/parallel/streamkernel.py
@@ -125,7 +125,7 @@ def __call__(self, prompt=None):
while True:
try:
reply = self.socket.recv_json(zmq.NOBLOCK)
- except zmq.ZMQError, e:
+ except zmq.ZMQError as e:
if e.errno == zmq.EAGAIN:
pass
else:
@@ -171,7 +171,7 @@ def abort_queue(self, stream):
while True:
try:
msg = self.session.recv(stream, zmq.NOBLOCK,content=True)
- except zmq.ZMQError, e:
+ except zmq.ZMQError as e:
if e.errno == zmq.EAGAIN:
break
else:
@@ -238,17 +238,6 @@ def dispatch_control(self, msg):
else:
handler(self.control_stream, idents, msg)
- # def flush_control(self):
- # while any(zmq.select([self.control_socket],[],[],1e-4)):
- # try:
- # msg = self.control_socket.recv_multipart(zmq.NOBLOCK, copy=False)
- # except zmq.ZMQError, e:
- # if e.errno != zmq.EAGAIN:
- # raise e
- # return
- # else:
- # self.dispatch_control(msg)
-
#-------------------- queue helpers ------------------------------
37 IPython/zmq/parallel/streamsession.py
@@ -8,6 +8,7 @@
import traceback
import pprint
import uuid
+from datetime import datetime
import zmq
from zmq.utils import jsonapi
@@ -111,6 +112,7 @@ def __getitem__(self, k):
def msg_header(msg_id, msg_type, username, session):
+ date=datetime.now().isoformat()
return locals()
# return {
# 'msg_id' : msg_id,
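`msg_header` builds its return value with `return locals()`, so the `date` assignment added above automatically becomes a new header field. A quick illustration of the idiom (argument values are placeholders):

    from datetime import datetime

    def msg_header(msg_id, msg_type, username, session):
        date = datetime.now().isoformat()
        # every local name, the four arguments plus `date`,
        # becomes a key in the returned dict
        return locals()

    print(msg_header('0', 'apply_request', 'user', 'session-uuid'))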
@@ -140,7 +142,7 @@ def extract_header(msg_or_header):
return h
def rekey(dikt):
- """rekey a dict that has been forced to use str keys where there should be
+ """Rekey a dict that has been forced to use str keys where there should be
ints by json. This belongs in the jsonutil added by fperez."""
for k in dikt.iterkeys():
if isinstance(k, str):
@@ -162,11 +164,22 @@ def rekey(dikt):
return dikt
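Only rekey's docstring appears in this diff. A self-contained sketch of what the docstring describes (the body is an assumption, not the file's code):

    def rekey(dikt):
        """Sketch: convert str keys that are really ints back to int keys,
        as happens to a dict after a round-trip through JSON."""
        for k in list(dikt.keys()):
            if isinstance(k, str):
                try:
                    ik = int(k)
                except ValueError:
                    continue
                dikt[ik] = dikt.pop(k)
        return dikt

    print(rekey({'0': 'engine zero', '1': 'engine one'}))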
def serialize_object(obj, threshold=64e-6):
- """serialize an object into a list of sendable buffers.
+ """Serialize an object into a list of sendable buffers.
- Returns: (pmd, bufs)
- where pmd is the pickled metadata wrapper, and bufs
- is a list of data buffers"""
+ Parameters
+ ----------
+
+ obj : object
+ The object to be serialized
+ threshold : float
+ The threshold for not double-pickling the content.
+
+
+ Returns
+ -------
+ ('pmd', [bufs]) :
+ where pmd is the pickled metadata wrapper,
+ bufs is a list of data buffers"""
# threshold is 100 B
databuffers = []
if isinstance(obj, (list, tuple)):
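A usage sketch against the documented signature, assuming the module path from this commit's tree:

    from IPython.zmq.parallel.streamsession import serialize_object

    pmd, bufs = serialize_object({'a': 1, 'b': list(range(3))}, threshold=64e-6)
    # pmd: the pickled metadata wrapper
    # bufs: a list of buffers ready to append to a multipart send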
@@ -318,6 +331,8 @@ def send(self, stream, msg_type, content=None, buffers=None, parent=None, subhea
Parameters
----------
+ stream : zmq.Socket or ZMQStream
+ the socket-like object used to send the data
msg_type : str or Message/dict
Normally, msg_type will be
@@ -347,10 +362,7 @@ def send(self, stream, msg_type, content=None, buffers=None, parent=None, subhea
to_send.append(DELIM)
to_send.append(self.pack(msg['header']))
to_send.append(self.pack(msg['parent_header']))
- # if parent is None:
- # to_send.append(self.none)
- # else:
- # to_send.append(self.pack(dict(parent)))
+
if content is None:
content = self.none
elif isinstance(content, dict):
@@ -374,11 +386,10 @@ def send(self, stream, msg_type, content=None, buffers=None, parent=None, subhea
pprint.pprint(omsg)
pprint.pprint(to_send)
pprint.pprint(buffers)
- # return both the msg object and the buffers
return omsg
def send_raw(self, stream, msg, flags=0, copy=True, idents=None):
- """send a raw message via idents.
+ """Send a raw message via idents.
Parameters
----------
@@ -399,7 +410,7 @@ def recv(self, socket, mode=zmq.NOBLOCK, content=True, copy=True):
socket = socket.socket
try:
msg = socket.recv_multipart(mode)
- except zmq.ZMQError, e:
+ except zmq.ZMQError as e:
if e.errno == zmq.EAGAIN:
# We can convert EAGAIN to None as we know in this case
# recv_json won't return None.
@@ -412,7 +423,7 @@ def recv(self, socket, mode=zmq.NOBLOCK, content=True, copy=True):
idents, msg = self.feed_identities(msg, copy)
try:
return idents, self.unpack_message(msg, content=content, copy=copy)
- except Exception, e:
+ except Exception as e:
print (idents, msg)
# TODO: handle it
raise e
76 docs/source/development/parallel_connections.txt
@@ -4,18 +4,22 @@
Connection Diagrams of The IPython ZMQ Cluster
==============================================
-This is a quick summary and illustration of the connections involved in the ZeroMQ based IPython cluster for parallel computing.
+This is a quick summary and illustration of the connections involved in the ZeroMQ based
+IPython cluster for parallel computing.
All Connections
===============
The Parallel Computing code is currently under development in Min RK's IPython fork_ on GitHub.
-.. _fork: http://github.com/minrk/ipython
+.. _fork: http://github.com/minrk/ipython/tree/newparallel
-The IPython cluster consists of a Controller and one or more clients and engines. The goal of the Controller is to manage and monitor the connections and communications between the clients and the engines.
+The IPython cluster consists of a Controller and one or more clients and engines. The goal
+of the Controller is to manage and monitor the connections and communications between the
+clients and the engines.
-It is important for security/practicality reasons that all connections be inbound to the controller process. The arrows in the figures indicate the direction of the connection.
+It is important for security/practicality reasons that all connections be inbound to the
+controller process. The arrows in the figures indicate the direction of the connection.
.. figure:: figs/allconnections.png
@@ -25,8 +29,9 @@ It is important for security/practicality reasons that all connections be inboun
All the connections involved in connecting one client to one engine.
-The Controller consists of two ZMQ Devices - both MonitoredQueues, one for Tasks (load balanced, engine agnostic), one for Multiplexing (explicit targets), a Python device for monitoring (the Heartbeat Monitor).
-
+The Controller consists of two ZMQ Devices - both MonitoredQueues, one for Tasks (load
+balanced, engine agnostic), one for Multiplexing (explicit targets), a Python device for
+monitoring (the Heartbeat Monitor).
Registration
@@ -39,7 +44,10 @@ Registration
Engines and Clients only need to know where the Registrar ``XREP`` is located to start connecting.
-Once a controller is launched, the only information needed for connecting clients and/or engines to the controller is the IP/port of the ``XREP`` socket called the Registrar. This socket handles connections from both clients and engines, and replies with the remaining information necessary to establish the remaining connections.
+Once a controller is launched, the only information needed for connecting clients and/or
+engines to the controller is the IP/port of the ``XREP`` socket called the Registrar. This
+socket handles connections from both clients and engines, and replies with the remaining
+information necessary to establish the remaining connections.
Heartbeat
---------
@@ -51,25 +59,43 @@ Heartbeat
The heartbeat sockets.
-The heartbeat process has been described elsewhere. To summarize: the controller publishes a distinct message periodically via a ``PUB`` socket. Each engine has a ``zmq.FORWARDER`` device with a ``SUB`` socket for input, and ``XREQ`` socket for output. The ``SUB`` socket is connected to the ``PUB`` socket labeled *HB(ping)*, and the ``XREQ`` is connected to the ``XREP`` labeled *HB(pong)*. This results in the same message being relayed back to the Heartbeat Monitor with the addition of the ``XREQ`` prefix. The Heartbeat Monitor receives all the replies via an ``XREP`` socket, and identifies which hearts are still beating by the ``zmq.IDENTITY`` prefix of the ``XREQ`` sockets.
+The heartbeat process has been described elsewhere. To summarize: the controller publishes
+a distinct message periodically via a ``PUB`` socket. Each engine has a ``zmq.FORWARDER``
+device with a ``SUB`` socket for input, and ``XREQ`` socket for output. The ``SUB`` socket
+is connected to the ``PUB`` socket labeled *HB(ping)*, and the ``XREQ`` is connected to
+the ``XREP`` labeled *HB(pong)*. This results in the same message being relayed back to
+the Heartbeat Monitor with the addition of the ``XREQ`` prefix. The Heartbeat Monitor
+receives all the replies via an ``XREP`` socket, and identifies which hearts are still
+beating by the ``zmq.IDENTITY`` prefix of the ``XREQ`` sockets.
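A minimal engine-side sketch of this relay, assuming pyzmq's legacy device API; the addresses and identity value are placeholders::

    import zmq

    ctx = zmq.Context()

    hb_in = ctx.socket(zmq.SUB)
    hb_in.setsockopt(zmq.SUBSCRIBE, b'')       # receive every ping
    hb_in.connect('tcp://127.0.0.1:5555')      # controller's PUB: HB(ping)

    hb_out = ctx.socket(zmq.XREQ)
    hb_out.setsockopt(zmq.IDENTITY, b'engine-uuid')  # how the monitor identifies this heart
    hb_out.connect('tcp://127.0.0.1:5556')     # controller's XREP: HB(pong)

    # relay each ping back out; the XREQ identity is prepended automatically
    zmq.device(zmq.FORWARDER, hb_in, hb_out)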
Queues
------
.. figure:: figs/queuefade.png
- :width: 432px
- :alt: IPython Queue connections
- :align: center
-
- Load balanced Task queue on the left, explicitly multiplexed queue on the right.
-
-The controller has two MonitoredQueue devices. These devices are primarily for relaying messages between clients and engines, but the controller needs to see those messages for its own purposes. Since no Python code may exist between the two sockets in a queue, all messages sent through these queues (both directions) are also sent via a ``PUB`` socket to a monitor, which allows the Controller to monitor queue traffic without interfering with it.
-
-For tasks, the engine need not be specified. Messages sent to the ``XREP`` socket from the client side are assigned to an engine via ZMQ's ``XREQ`` round-robin load balancing. Engine replies are directed to specific clients via the IDENTITY of the client, which is received as a prefix at the Engine.
-
-For Multiplexing, ``XREP`` is used for both in and output sockets in the device. Clients must specify the destination by the ``zmq.IDENTITY`` of the ``PAIR`` socket connected to the downstream end of the device.
-
-At the Kernel level, both of these PAIR sockets are treated in the same way as the ``REP`` socket in the serial version (except using ZMQStreams instead of explicit sockets).
+ :width: 432px
+ :alt: IPython Queue connections
+ :align: center
+
+ Load balanced Task queue on the left, explicitly multiplexed queue on the right.
+
+The controller has two MonitoredQueue devices. These devices are primarily for relaying
+messages between clients and engines, but the controller needs to see those messages for
+its own purposes. Since no Python code may exist between the two sockets in a queue, all
+messages sent through these queues (both directions) are also sent via a ``PUB`` socket to
+a monitor, which allows the Controller to monitor queue traffic without interfering with
+it.
+
+For tasks, the engine need not be specified. Messages sent to the ``XREP`` socket from the
+client side are assigned to an engine via ZMQ's ``XREQ`` round-robin load balancing.
+Engine replies are directed to specific clients via the IDENTITY of the client, which is
+received as a prefix at the Engine.
+
+For Multiplexing, ``XREP`` is used for both in and output sockets in the device. Clients
+must specify the destination by the ``zmq.IDENTITY`` of the ``PAIR`` socket connected to
+the downstream end of the device.
+
+At the Kernel level, both of these PAIR sockets are treated in the same way as the ``REP``
+socket in the serial version (except using ZMQStreams instead of explicit sockets).
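A sketch of wiring one such queue, assuming pyzmq's monitored_queue device is available; all addresses are placeholders::

    import zmq
    from zmq.devices import monitored_queue

    ctx = zmq.Context()

    client_side = ctx.socket(zmq.XREP)   # clients connect here
    client_side.bind('tcp://127.0.0.1:6000')

    engine_side = ctx.socket(zmq.XREP)   # engines connect here
    engine_side.bind('tcp://127.0.0.1:6001')

    monitor = ctx.socket(zmq.PUB)        # a copy of all traffic goes here
    monitor.connect('tcp://127.0.0.1:6002')

    # relays traffic both ways, republishing every message on `monitor`
    # with an 'in'/'out' topic prefix
    monitored_queue(client_side, engine_side, monitor, b'in', b'out')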
Client connections
------------------
@@ -81,7 +107,8 @@ Client connections
Clients connect to an ``XREP`` socket to query the controller
-The controller listens on an ``XREP`` socket for queries from clients as to queue status, and control instructions. Clients can connect to this via a PAIR socket or ``XREQ``.
+The controller listens on an ``XREP`` socket for queries from clients as to queue status,
+and control instructions. Clients can connect to this via a PAIR socket or ``XREQ``.
.. figure:: figs/notiffade.png
:width: 432px
@@ -90,5 +117,8 @@ The controller listens on an ``XREP`` socket for queries from clients as to queu
Engine registration events are published via a ``PUB`` socket.
-The controller publishes all registration/unregistration events via a ``PUB`` socket. This allows clients to stay up to date with what engines are available by subscribing to the feed with a ``SUB`` socket. Other processes could selectively subscribe to just registration or unregistration events.
+The controller publishes all registration/unregistration events via a ``PUB`` socket. This
+allows clients to stay up to date with what engines are available by subscribing to the
+feed with a ``SUB`` socket. Other processes could selectively subscribe to just
+registration or unregistration events.
120 docs/source/development/parallel_messages.txt
@@ -3,16 +3,20 @@
Messaging for Parallel Computing
================================
-This is an extension of the :ref:`messaging <messaging>` doc. Diagrams of the connections can be found in the :ref:`parallel connections <parallel_connections>` doc.
+This is an extension of the :ref:`messaging <messaging>` doc. Diagrams of the connections
+can be found in the :ref:`parallel connections <parallel_connections>` doc.
-
-ZMQ messaging is also used in the parallel computing IPython system. All messages to/from kernels remain the same as the single kernel model, and are forwarded through a ZMQ Queue device. The controller receives all messages and replies in these channels, and saves results for future use.
+ZMQ messaging is also used in the parallel computing IPython system. All messages to/from
+kernels remain the same as the single kernel model, and are forwarded through a ZMQ Queue
+device. The controller receives all messages and replies in these channels, and saves
+results for future use.
The Controller
--------------
-The controller is the central process of the IPython parallel computing model. It has 3 Devices:
+The controller is the central process of the IPython parallel computing model. It has 3
+Devices:
* Heartbeater
* Multiplexed Queue
@@ -29,9 +33,13 @@ and 3 sockets:
Registration (``XREP``)
***********************
-The first function of the Controller is to facilitate and monitor connections of clients and engines. Both client and engine registration are handled by the same socket, so only one ip/port pair is needed to connect any number of connections and clients.
+The first function of the Controller is to facilitate and monitor connections of clients
+and engines. Both client and engine registration are handled by the same socket, so only
+one ip/port pair is needed to connect any number of connections and clients.
-Engines register with the ``zmq.IDENTITY`` of their two ``XREQ`` sockets, one for the queue, which receives execute requests, and one for the heartbeat, which is used to monitor the survival of the Engine process.
+Engines register with the ``zmq.IDENTITY`` of their two ``XREQ`` sockets, one for the
+queue, which receives execute requests, and one for the heartbeat, which is used to
+monitor the survival of the Engine process.
Message type: ``registration_request``::
@@ -40,7 +48,10 @@ Message type: ``registration_request``::
'heartbeat' : '1234-abcd-...' # the heartbeat XREQ id
}
-The Controller replies to an Engine's registration request with the engine's integer ID, and all the remaining connection information for connecting the heartbeat process, and kernel socket(s). The message status will be an error if the Engine requests IDs that already in use.
+The Controller replies to an Engine's registration request with the engine's integer ID,
+and all the remaining connection information for connecting the heartbeat process, and
+kernel queue socket(s). The message status will be an error if the Engine requests IDs that
+are already in use.
Message type: ``registration_reply``::
@@ -49,39 +60,49 @@ Message type: ``registration_reply``::
# if ok:
'id' : 0, # int, the engine id
'queue' : 'tcp://127.0.0.1:12345', # connection for engine side of the queue
+ 'control' : 'tcp://...', # addr for control queue
'heartbeat' : (a,b), # tuple containing two interfaces needed for heartbeat
- 'task' : 'tcp...', # addr for task queue, or None if no task queue running
+ 'task' : 'tcp://...', # addr for task queue, or None if no task queue running
# if error:
'reason' : 'queue_id already registered'
}
-Clients use the same socket to start their connections. Connection requests from clients need no information:
+Clients use the same socket as engines to start their connections. Connection requests
+from clients need no information:
Message type: ``connection_request``::
content = {}
-The reply to a Client registration request contains the connection information for the multiplexer and load balanced queues, as well as the address for direct controller queries. If any of these addresses is `None`, that functionality is not available.
+The reply to a Client registration request contains the connection information for the
+multiplexer and load balanced queues, as well as the address for direct controller
+queries. If any of these addresses is `None`, that functionality is not available.
Message type: ``connection_reply``::
content = {
'status' : 'ok', # or 'error'
# if ok:
- 'queue' : 'tcp://127.0.0.1:12345', # connection for client side of the queue
+ 'queue' : 'tcp://127.0.0.1:12345', # connection for client side of the MUX queue
'task' : 'tcp...', # addr for task queue, or None if no task queue running
- 'controller' : 'tcp...' # addr for controller methods, like queue_request, etc.
+ 'query' : 'tcp...' # addr for methods to query the controller, like queue_request, etc.
+ 'control' : 'tcp...' # addr for control methods, like abort, etc.
}
Heartbeat
*********
-The controller uses a heartbeat system to monitor engines, and track when they become unresponsive. As described in :ref:`messages <messages>`, and shown in :ref:`connections <parallel_connections>`.
+The controller uses a heartbeat system to monitor engines, and track when they become
+unresponsive. As described in :ref:`messages <messages>`, and shown in :ref:`connections
+<parallel_connections>`.
Notification (``PUB``)
**********************
-The controller published all engine registration/unregistration events on a PUB socket. This allows clients to have up-to-date engine ID sets without polling. Registration notifications contain both the integer engine ID and the queue ID, which is necessary for sending messages via the Multiplexer Queue.
+The controller publishes all engine registration/unregistration events on a PUB socket.
+This allows clients to have up-to-date engine ID sets without polling. Registration
+notifications contain both the integer engine ID and the queue ID, which is necessary for
+sending messages via the Multiplexer Queue.
Message type: ``registration_notification``::
@@ -100,9 +121,14 @@ Message type : ``unregistration_notification``::
Client Queries (``XREP``)
*************************
-The controller monitors and logs all queue traffic, so that clients can retrieve past results or monitor pending tasks. Currently, this information resides in memory on the Controller, but will ultimately be offloaded to a database over an additional ZMQ connection. The interface should remain the same or at least similar.
+The controller monitors and logs all queue traffic, so that clients can retrieve past
+results or monitor pending tasks. Currently, this information resides in memory on the
+Controller, but will ultimately be offloaded to a database over an additional ZMQ
+connection. The interface should remain the same or at least similar.
-:func:`queue_request` requests can specify multiple engines to query via the `targets` element. A verbose flag can be passed, to determine whether the result should be the list of `msg_ids` in the queue or simply the length of each list.
+:func:`queue_request` requests can specify multiple engines to query via the `targets`
+element. A verbose flag can be passed, to determine whether the result should be the list
+of `msg_ids` in the queue or simply the length of each list.
Message type: ``queue_request``::
@@ -111,7 +137,9 @@ Message type: ``queue_request``::
'targets' : [0,3,1] # list of ints
}
-The content of a reply to a :func:queue_request request is a dict, keyed by the engine IDs. Note that they will be the string representation of the integer keys, since JSON cannot handle number keys.
+The content of a reply to a :func:`queue_request` request is a dict, keyed by the engine
+IDs. Note that they will be the string representation of the integer keys, since JSON
+cannot handle number keys.
Message type: ``queue_reply``::
@@ -120,15 +148,19 @@ Message type: ``queue_reply``::
'1' : {'completed' : 10, 'queue' : 1}
}
-Clients can request individual results directly from the controller. This is primarily for use gathering results of executions not submitted by the particular client, as the client will have all its own results already. Requests are made by msg_id, and can contain one or more msg_id.
+Clients can request individual results directly from the controller. This is primarily for
+use gathering results of executions not submitted by the particular client, as the client
+will have all its own results already. Requests are made by msg_id, and can contain one or
+more msg_id.
Message type: ``result_request``::
content = {
- 'msg_ids' : [uuid,'...'] # list of strs
+ 'msg_ids' : ['uuid','...'] # list of strs
}
-The :func:`result_request` reply contains the content objects of the actual execution reply messages
+The :func:`result_request` reply contains the content objects of the actual execution
+reply messages.
Message type: ``result_reply``::
@@ -139,13 +171,18 @@ Message type: ``result_reply``::
msg_id : msg, # the content dict is keyed by msg_ids,
# values are the result messages
'pending' : ['msg_id','...'], # msg_ids still pending
+ 'completed' : ['msg_id','...'], # list of completed msg_ids
# if error:
'reason' : "explanation"
}
-Clients can also instruct the controller to forget the results of messages. This can be done by message ID or engine ID. Individual messages are dropped by msg_id, and all messages completed on an engine are dropped by engine ID.
+For memory management purposes, clients can also instruct the controller to forget the
+results of messages. This can be done by message ID or engine ID. Individual messages are
+dropped by msg_id, and all messages completed on an engine are dropped by engine ID. This
+will likely no longer be necessary once we move to a DB-based message logging backend.
-If the msg_ids element is the string ``'all'`` instead of a list, then all completed results are forgotten.
+If the msg_ids element is the string ``'all'`` instead of a list, then all completed
+results are forgotten.
Message type: ``purge_request``::
@@ -154,7 +191,9 @@ Message type: ``purge_request``::
'engine_ids' : [0,2,4] # list of engine IDs
}
-The reply to a purge request is simply the status 'ok' if the request succeeded, or an explanation of why it failed, such as requesting the purge of a nonexistent or pending message.
+The reply to a purge request is simply the status 'ok' if the request succeeded, or an
+explanation of why it failed, such as requesting the purge of a nonexistent or pending
+message.
Message type: ``purge_reply``::
@@ -168,18 +207,29 @@ Message type: ``purge_reply``::
:func:`apply` and :func:`apply_bound`
*************************************
-The `Namespace <http://gist.github.com/483294>`_ model suggests that execution be able to use the model::
+The `Namespace <http://gist.github.com/483294>`_ model suggests that execution be able to
+use the model::
client.apply(f, *args, **kwargs)
-which takes `f`, a function in the user's namespace, and executes ``f(*args, **kwargs)`` on a remote engine, returning the result (or, for non-blocking, information facilitating later retrieval of the result). This model, unlike the execute message which just uses a code string, must be able to send arbitrary (pickleable) Python objects. And ideally, copy as little data as we can. The `buffers` property of a Message was introduced for this purpose.
+which takes `f`, a function in the user's namespace, and executes ``f(*args, **kwargs)``
+on a remote engine, returning the result (or, for non-blocking, information facilitating
+later retrieval of the result). This model, unlike the execute message which just uses a
+code string, must be able to send arbitrary (pickleable) Python objects, ideally copying
+as little data as possible. The `buffers` property of a Message was introduced for this
+purpose.
-Utility method :func:`build_apply_message` in :mod:`IPython.zmq.streamsession` wraps a function signature and builds the correct buffer format.
+Utility method :func:`build_apply_message` in :mod:`IPython.zmq.streamsession` wraps a
+function signature and builds the correct buffer format for minimal data copying (exactly
+zero copies of numpy array data).
Message type: ``apply_request``::
content = {
- 'bound' : True # whether to execute in the engine's namespace or unbound
+ 'bound' : True, # whether to execute in the engine's namespace or unbound
+ 'after' : [msg_ids,...], # list of msg_ids or output of Dependency.as_dict()
+ 'follow' : [msg_ids,...], # list of msg_ids or output of Dependency.as_dict()
+
}
buffers = ['...'] # at least 3 in length
# as built by build_apply_message(f,args,kwargs)
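A hedged sketch of that three-buffer minimum, one serialized part each for ``f``, ``args``, and ``kwargs`` (the real :func:`build_apply_message` additionally extracts large objects into extra raw buffers, which is omitted here)::

    import pickle

    def sketch_apply_buffers(f, args, kwargs):
        return [pickle.dumps(f, -1),
                pickle.dumps(args, -1),
                pickle.dumps(kwargs, -1)]

    bufs = sketch_apply_buffers(divmod, (7, 3), {})
    print(len(bufs))  # 3: the minimum buffer count noted above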
@@ -200,10 +250,22 @@ Message type: ``apply_reply``::
Implementation
--------------
-There are a few differences in implementation between the `StreamSession` object used in the parallel computing fork and the `Session` object, the main one being that messages are sent in parts, rather than as a single serialized object. `StreamSession` objects also take pack/unpack functions, which are to be used when serializing/deserializing objects. These can be any functions that translate to/from formats that ZMQ sockets can send (buffers,bytes, etc.).
+There are a few differences in implementation between the `StreamSession` object used in
+the parallel computing fork and the `Session` object, the main one being that messages are
+sent in parts, rather than as a single serialized object. `StreamSession` objects also
+take pack/unpack functions, which are to be used when serializing/deserializing objects.
+These can be any functions that translate to/from formats that ZMQ sockets can send
+(buffers,bytes, etc.).
Split Sends
***********
-Previously, messages were bundled as a single json object and one call to :func:`socket.send_json`. Since the controller inspects all messages, and doesn't need to see the content of the messages, which can be large, messages are serialized and sent in pieces. All messages are sent in at least 3 parts: the header, the parent header, and the content. This allows the controller to unpack and inspect the (always small) header, without spending time unpacking the content unless the message is bound for the controller. Buffers are added on to the end of the message, and can be any objects that present the buffer interface.
+Previously, messages were bundled as a single json object and one call to
+:func:`socket.send_json`. Since the controller inspects all messages, and doesn't need to
+see the content of the messages, which can be large, messages are now serialized and sent in
+pieces. All messages are sent in at least 3 parts: the header, the parent header, and the
+content. This allows the controller to unpack and inspect the (always small) header,
+without spending time unpacking the content unless the message is bound for the
+controller. Buffers are added on to the end of the message, and can be any objects that
+present the buffer interface.
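A sketch of that multipart layout, with the delimiter value and json packing as assumptions consistent with this diff::

    from zmq.utils import jsonapi

    DELIM = b'<IDS|MSG>'  # assumed separator between routing idents and the message

    def build_multipart(idents, header, parent, content, buffers=()):
        to_send = list(idents)
        to_send.append(DELIM)
        to_send.append(jsonapi.dumps(header))   # always small: the controller inspects this
        to_send.append(jsonapi.dumps(parent))
        to_send.append(jsonapi.dumps(content))  # possibly large: unpacked only when needed
        to_send.extend(buffers)                 # raw buffers ride along at the end
        return to_send

    parts = build_multipart([b'client-id'], {'msg_id': '0'}, {}, {'status': 'ok'})
    # a socket would then ship this with socket.send_multipart(parts)
    print(len(parts))  # 5: one ident, the delimiter, and the 3 message parts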
6 docs/source/whatsnew/development.txt
@@ -69,11 +69,11 @@ New features
:mod:`~IPython.external.argparse` to parse command line options for
:command:`ipython`.
-* New top level :func:`~IPython.core.embed.embed` function that can be called
+* New top level :func:`~IPython.frontend.terminal.embed.embed` function that can be called
to embed IPython at any place in user's code. On the first call it will
- create an :class:`~IPython.core.embed.InteractiveShellEmbed` instance and
+ create an :class:`~IPython.frontend.terminal.embed.InteractiveShellEmbed` instance and
call it. In later calls, it just calls the previously created
- :class:`~IPython.core.embed.InteractiveShellEmbed`.
+ :class:`~IPython.frontend.terminal.embed.InteractiveShellEmbed`.
* Created a component system (:mod:`IPython.core.component`) that is based on
:mod:`IPython.utils.traitlets`. Components are arranged into a runtime