# More efficient data movement with MPI

Just like [we did](memmap.ipynb) manually with memmap,
you can move data more efficiently with MPI by sending it to just one engine,
and using MPI to broadcast it to the rest of the engines.


In [None]:
import socket
import os, sys, re

import numpy as np

import ipyparallel as ipp

For this demo, I will connect to a cluster with engines started with MPI.

One way to do so would be:

    ipcluster start -n 32 --engines=MPI --profile mpi
    
This directory also contains a docker-compose file that can be used to simulate multiple engine sets.

In [None]:
# Connect to the cluster running under the "mpi" profile.
rc = ipp.Client(profile="mpi")
# Wait until 32 engines are available (matches `-n 32` in the ipcluster command above).
rc.wait_for_engines(32)
# View used for the all-engine calls in the cells below.
# NOTE(review): `coalescing=True` presumably merges replies on the way back — confirm in the ipyparallel docs.
eall = rc.broadcast_view(coalescing=True)
# View on engine 0 only; it is the target of the direct push in `broadcast()` below.
root = rc[0]

In [None]:
# Display the engine ids currently known to the client.
rc.ids

In [None]:
# Assign a = 5 through the `root` view (presumably a quick check that the view works).
root['a'] = 5

In [None]:
# Executed remotely through the %px magic. NOTE: the name `MPI` is bound to the
# COMM_WORLD communicator here, not to the mpi4py.MPI module.
%px from mpi4py.MPI import COMM_WORLD as MPI

In [None]:
# MPI rank reported by every engine, as returned by get_dict()
# (presumably keyed by engine id — confirm).
mpi_ranks = eall.apply_async(lambda : MPI.Get_rank()).get_dict()
# MPI rank of the engine behind `root`; it is looked up rather than assumed
# and is later passed as the root of the MPI bcast.
root_rank = root.apply_sync(lambda : MPI.Get_rank())
# Display the ranks.
mpi_ranks

In [None]:
# sz x sz array of random floats used as the payload to ship to the engines.
sz = 512
data = np.random.random((sz, sz))
# Payload size in whole MiB (integer division).
data.nbytes // (1024 * 1024)

In [None]:
%%time 
# Baseline: push the full array through the `eall` view, non-blocking.
ar = eall.push({'data': data}, block=False)
# Wait for the push to finish so the cell timing covers the whole transfer.
ar.wait_interactive()


In [None]:
@ipp.interactive
def _bcast(key, root_rank):
    """Engine-side half of the broadcast.

    Reads ``key`` from the globals this function runs in (``None`` when the
    name is absent), passes that value through ``MPI.bcast`` with
    ``root_rank`` as the root, and stores the result back under ``key``.

    ``MPI`` must already exist in that namespace; in this notebook it is the
    ``COMM_WORLD`` communicator imported on the engines.
    """
    namespace = globals()
    namespace[key] = MPI.bcast(namespace.get(key), root_rank)

def broadcast(key, obj, dv, root, root_rank):
    """Distribute ``obj`` to the engines under the name ``key``.

    The object is pushed (non-blocking) to ``root`` only. ``_bcast`` is then
    submitted on ``dv`` so that the engines hand the value on with an MPI
    broadcast rooted at ``root_rank``.

    The message count is still O(N), but only the push to ``root`` carries
    the payload; every other message is small.

    Returns the async result of the ``_bcast`` call submitted on ``dv``.
    """
    payload = {key: obj}
    # NOTE(review): the result of this push is not kept, so a failed push
    # is not reported through the value returned below.
    root.push(payload, block=False)
    bcast_result = dv.apply_async(_bcast, key, root_rank)
    return bcast_result

In [None]:
%%time
# Same transfer via push-to-root followed by MPI bcast, for comparison with the plain push above.
ar = broadcast('data', data, eall, root, root_rank)
# Wait for the engine-side bcast calls to finish so the cell timing covers them.
ar.wait_interactive()

In [None]:
%%px
# This cell body is executed remotely by %%px.
import numpy as np
# Matrix 2-norm of the `data` array in the remote namespace;
# presumably a check that every engine received the same array.
np.linalg.norm(data, 2)
