## Broadcast

Complete the program. 

1. Use the `MPI_Bcast` routine to broadcast an array of 10,000,000 numbers from the process with rank 0. 

2. Write your own broadcast function `my_Bcast` using `MPI_Send` and `MPI_Recv` routines. 

3. Measure the time of both routines using the MPI function `MPI_Wtime` when running with 2, 4, and 8 processes. What do you make of the differences at different scales?

In [None]:
?MPI::MPI_Bcast

In [None]:
?MPI::MPI_Wtime

***
#### C skeleton

In [None]:
%%file bcast.c
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

/*
 * my_Bcast - hand-written broadcast, to be implemented with MPI_Send and
 * MPI_Recv (exercise stub: the communication itself is left as a TODO).
 *
 *   data          buffer to send from on the root / receive into elsewhere
 *   count         number of elements in data
 *   datatype      MPI datatype of each element
 *   root          rank of the process whose data is distributed
 *   communicator  communicator whose processes take part
 */
void my_Bcast(void* data, int count, MPI_Datatype datatype, int root, MPI_Comm communicator) 
{
    int rank, size, i;
    MPI_Comm_rank(communicator, &rank); // rank of the calling process
    MPI_Comm_size(communicator, &size); // number of processes in communicator
    
    // TODO:
    // If we are the root process, send our data to everyone
    // If we are a receiver process, receive the data from the root
    
}

/*
 * Benchmark driver (exercise skeleton).
 *
 * Allocates the array to broadcast, leaves the two timed broadcasts as TODOs,
 * and prints the averaged timings on rank 0.
 */
int main(void)
{
    int rank, i;
    int num_elements = 10000000; // size of array
    int num_trials = 10; // number of timing experiments

    MPI_Init(NULL, NULL);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    double total_my_bcast_time = 0.0;
    double total_mpi_bcast_time = 0.0;

    // create array; stop the whole job if any rank cannot get the memory,
    // otherwise the broadcasts would write through a NULL pointer
    int* data = (int*)malloc((size_t)num_elements * sizeof *data);
    if (data == NULL) {
        fprintf(stderr, "rank %d: failed to allocate %d ints\n", rank, num_elements);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    for (i = 0; i < num_trials; i++) {
        // TODO:
        // broadcast with MPI_Bcast
        // time MPI_Bcast
        // synchronize before starting timing and before obtaining final time

        // TODO:
        // broadcast with my_Bcast
        // time my_Bcast

    }

    // Print resulting times (rank 0 only, so the output appears once)
    if (rank == 0) {
        printf("Avg my_Bcast time = %lf\n", total_my_bcast_time / num_trials);
        printf("Avg MPI_Bcast time = %lf\n", total_mpi_bcast_time / num_trials);
    }

    free(data);
    MPI_Finalize();
    return 0;
}

Now compile it and run it with 2 processes. To study the scaling, repeat with `-np 4` and `-np 8`. 

In [None]:
!mpicc bcast.c -o bcast && mpirun -np 2 --allow-run-as-root bcast

***
#### Python skeleton

In [None]:
%%file bcast.py
from mpi4py import MPI

def my_Bcast(root):
    """Hand-written broadcast of the module-level ``data`` (exercise stub).

    To be implemented with point-to-point send/receive calls; the
    communication itself is left as a TODO below.

    Args:
        root: rank of the process whose ``data`` is distributed.
    """
    global data
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()  # rank of the calling process
    size = comm.Get_size()  # number of processes in the communicator
    # TODO:
    # If we are the root process, send our data to everyone
    # If we are a receiver process, receive the data from the root
        
# Benchmark configuration: every rank executes this same script.
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
num_elements = 10000000  # length of the list to broadcast
num_trials = 10          # number of timing experiments

# Accumulated wall-clock time per broadcast variant, plus the payload.
total_my_bcast_time = 0.0
total_mpi_bcast_time = 0.0
data = [None] * num_elements

for i in range(num_trials):
    pass
    # TODO:
    # broadcast with MPI_Bcast
    # time MPI_Bcast
    # synchronize before starting timing and obtaining final time

    # TODO:
    # broadcast with my_Bcast
    # time my_Bcast

# Report the averages once, from rank 0 only.
if rank == 0:
    avg_my_bcast_time = total_my_bcast_time / num_trials
    avg_mpi_bcast_time = total_mpi_bcast_time / num_trials
    print("Avg my_Bcast time = {:f}".format(avg_my_bcast_time))
    print("Avg MPI_Bcast time = {:f}".format(avg_mpi_bcast_time))

Now run it with 2 processes (Python needs no compilation step). To study the scaling, repeat with `-np 4` and `-np 8`. 

In [None]:
!mpirun -np 2 --allow-run-as-root python bcast.py

***
#### Fortran skeleton

In [None]:
%%file bcast.f90
!> Hand-written broadcast, to be implemented with MPI_Send and MPI_Recv
!! (exercise stub: the communication itself is left as a TODO).
!!
!! data         : buffer of `count` values, sent from on the root and
!!                received into on every other rank
!! count        : number of elements in data
!! datatype     : MPI datatype handle describing one element
!! root         : rank of the process whose data is distributed
!! communicator : communicator whose processes take part
subroutine my_Bcast(data, count, datatype, root, communicator)
    use mpi   ! provides MPI_STATUS_IGNORE and the MPI interfaces used below
    implicit none

    ! count is declared before data because it sizes the dummy array
    integer, intent(in) :: count, root
    integer, intent(in) :: datatype, communicator
    double precision, dimension(count), intent(inout) :: data

    integer :: error
    integer :: rank, size
    integer :: i

    call MPI_Comm_rank(communicator, rank, error)
    call MPI_Comm_size(communicator, size, error)

    ! TODO:
    ! If we are the root process, send our data to everyone
    ! If we are a receiver process, receive the data from the root

end subroutine my_Bcast

!> Benchmark driver (exercise skeleton): allocates the array to broadcast,
!! leaves the two timed broadcasts as TODOs and prints the averaged timings
!! on rank 0.
program bcast
    use mpi
    implicit none

    integer, parameter :: num_elements = 10000000   ! size of array
    integer, parameter :: num_trials = 10           ! number of timing experiments

    integer :: error
    integer :: rank
    integer :: i
    double precision :: total_my_bcast_time, total_mpi_bcast_time
    double precision, allocatable, dimension (:) :: data

    call MPI_Init(error)
    call MPI_Comm_rank(MPI_COMM_WORLD, rank, error)

    total_my_bcast_time = 0
    total_mpi_bcast_time = 0
    allocate(data(0:num_elements-1)) ! create array

    do i = 0, num_trials-1
        ! TODO:
        ! broadcast with MPI_Bcast
        ! (data is double precision, so the matching MPI datatype handle is
        !  MPI_DOUBLE_PRECISION)
        ! time MPI_Bcast
        ! synchronize before starting timing and before obtaining final time

        ! TODO:
        ! broadcast with my_Bcast
        ! time my_Bcast
    end do

    ! print resulting times (rank 0 only, so the output appears once)
    if (rank .eq. 0) then
        print *, "Avg my_Bcast time = ", (total_my_bcast_time / num_trials)
        print *, "Avg MPI_Bcast time = ", (total_mpi_bcast_time / num_trials)
    end if

    deallocate(data)
    call MPI_Finalize(error)
end program bcast

Now compile it and run it with 2 processes. To study the scaling, repeat with `-np 4` and `-np 8`. 

In [None]:
!mpif90 bcast.f90 && mpirun -np 2 --allow-run-as-root a.out

***

### You can compare with our solution:

***
#### C solution

In [None]:
%%file bcast.c
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

/*
 * my_Bcast - naive broadcast built from point-to-point calls.
 *
 * The root sends the buffer to every other rank, one MPI_Send per peer in
 * rank order; every other rank posts a single MPI_Recv from the root.
 *
 *   data          buffer sent from on the root / received into elsewhere
 *   count         number of elements in data
 *   datatype      MPI datatype of each element
 *   root          rank of the process whose data is distributed
 *   communicator  communicator whose processes take part
 */
void my_Bcast(void* data, int count, MPI_Datatype datatype, int root, MPI_Comm communicator) 
{
    int my_rank;
    int num_procs;
    int dest;

    MPI_Comm_rank(communicator, &my_rank);
    MPI_Comm_size(communicator, &num_procs);

    // Receiver: one matching receive from the root, then we are done
    if (my_rank != root) {
        MPI_Recv(data, count, datatype, root, 0, communicator, MPI_STATUS_IGNORE);
        return;
    }

    // Root: hand the buffer to each peer in turn, skipping ourselves
    for (dest = 0; dest < num_procs; ++dest) {
        if (dest == my_rank) {
            continue;
        }
        MPI_Send(data, count, datatype, dest, 0, communicator);
    }
}

/*
 * Benchmark driver.
 *
 * For num_trials rounds, broadcasts an int array from rank 0 once with
 * MPI_Bcast and once with my_Bcast. Each timed region is bracketed by
 * MPI_Barrier calls, and rank 0 prints the average time per broadcast.
 */
int main(void)
{
    int rank, i;
    int num_elements = 10000000; // size of array
    int num_trials = 10;         // number of timing experiments

    MPI_Init(NULL, NULL);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    double total_my_bcast_time = 0.0;
    double total_mpi_bcast_time = 0.0;

    // create array; stop the whole job if any rank cannot get the memory,
    // otherwise the broadcasts would write through a NULL pointer
    int* data = (int*)malloc((size_t)num_elements * sizeof *data);
    if (data == NULL) {
        fprintf(stderr, "rank %d: failed to allocate %d ints\n", rank, num_elements);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    // Give the buffer defined contents so no uninitialized memory is sent
    for (i = 0; i < num_elements; i++) {
        data[i] = i;
    }

    for (i = 0; i < num_trials; i++) {
        // Time MPI_Bcast
        // Synchronize before starting timing
        MPI_Barrier(MPI_COMM_WORLD);
        total_mpi_bcast_time -= MPI_Wtime();
        MPI_Bcast(data, num_elements, MPI_INT, 0, MPI_COMM_WORLD);
        // Synchronize again before obtaining final time
        MPI_Barrier(MPI_COMM_WORLD);
        total_mpi_bcast_time += MPI_Wtime();

        // Time my_Bcast, bracketed by barriers in the same way
        MPI_Barrier(MPI_COMM_WORLD);
        total_my_bcast_time -= MPI_Wtime();
        my_Bcast(data, num_elements, MPI_INT, 0, MPI_COMM_WORLD);
        MPI_Barrier(MPI_COMM_WORLD);
        total_my_bcast_time += MPI_Wtime();
    }

    // Print resulting times (rank 0 only, so the output appears once)
    if (rank == 0) {
        printf("Avg my_Bcast time = %lf\n", total_my_bcast_time / num_trials);
        printf("Avg MPI_Bcast time = %lf\n", total_mpi_bcast_time / num_trials);
    }

    free(data);
    MPI_Finalize();
    return 0;
}

In [None]:
!mpicc bcast.c -o bcast && mpirun -np 2 --allow-run-as-root bcast

***
#### Python solution

In [None]:
%%file bcast.py
from mpi4py import MPI

def my_Bcast(root):
    """Naive broadcast of the module-level ``data`` via send/recv.

    The root sends ``data`` to every other rank, one ``comm.send`` per peer
    in rank order; every other rank replaces its ``data`` with the object
    received from the root.

    Args:
        root: rank of the process whose ``data`` is distributed.
    """
    global data
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()

    if rank == root:
        # If we are the root process, send our data to everyone else
        for dest in range(size):
            if dest != root:
                comm.send(data, dest=dest)
    else:
        # If we are a receiver process, receive the data from the root.
        # The source must be `root`, not a fixed rank, or a broadcast from
        # any non-zero root would wait on a message that is never sent.
        data = comm.recv(source=root)
        
# Benchmark configuration: every rank executes this same script.
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
num_elements = 10000000  # length of the list to broadcast
num_trials = 10          # number of timing experiments

# Accumulated wall-clock time per broadcast variant, plus the payload.
total_my_bcast_time = 0.0
total_mpi_bcast_time = 0.0
data = [None] * num_elements

for trial in range(num_trials):
    # Library broadcast, bracketed by barriers before each clock reading.
    comm.Barrier()
    total_mpi_bcast_time = total_mpi_bcast_time - MPI.Wtime()
    data = comm.bcast(data, root=0)
    comm.Barrier()
    total_mpi_bcast_time = total_mpi_bcast_time + MPI.Wtime()

    # Hand-written broadcast, measured the same way.
    comm.Barrier()
    total_my_bcast_time = total_my_bcast_time - MPI.Wtime()
    my_Bcast(0)
    comm.Barrier()
    total_my_bcast_time = total_my_bcast_time + MPI.Wtime()

# Report the averages once, from rank 0 only.
if rank == 0:
    avg_my_bcast_time = total_my_bcast_time / num_trials
    avg_mpi_bcast_time = total_mpi_bcast_time / num_trials
    print("Avg my_Bcast time = {:f}".format(avg_my_bcast_time))
    print("Avg MPI_Bcast time = {:f}".format(avg_mpi_bcast_time))

In [None]:
!mpirun -np 2 --allow-run-as-root python bcast.py

***
#### Fortran solution

In [None]:
%%file bcast.f90
!> Naive broadcast built from point-to-point calls.
!!
!! The root sends the buffer to every other rank, one MPI_Send per peer in
!! rank order; every other rank posts a single MPI_Recv from the root.
!!
!! data         : buffer of `count` values, sent from on the root and
!!                received into on every other rank
!! count        : number of elements in data
!! datatype     : MPI datatype handle describing one element
!! root         : rank of the process whose data is distributed
!! communicator : communicator whose processes take part
subroutine my_Bcast(data, count, datatype, root, communicator)
    use mpi   ! needed so MPI_STATUS_IGNORE is the MPI sentinel, not an implicit local
    implicit none

    ! count is declared before data because it sizes the dummy array
    integer, intent(in) :: count, root
    integer, intent(in) :: datatype, communicator
    double precision, dimension(count), intent(inout) :: data

    integer :: error
    integer :: rank, size
    integer :: i

    call MPI_Comm_rank(communicator, rank, error)
    call MPI_Comm_size(communicator, size, error)

    if (rank == root) then
        ! If we are the root process, send our data to everyone else
        do i = 0, size - 1
            if (i /= rank) then
                call MPI_Send(data, count, datatype, i, 0, communicator, error)
            end if
        end do
    else
        ! If we are a receiver process, receive the data from the root
        call MPI_Recv(data, count, datatype, root, 0, communicator, &
                      MPI_STATUS_IGNORE, error)
    end if
end subroutine my_Bcast

!> Benchmark driver.
!!
!! For num_trials rounds, broadcasts a double precision array from rank 0
!! once with MPI_Bcast and once with my_Bcast. Each timed region is bracketed
!! by MPI_Barrier calls, and rank 0 prints the average time per broadcast.
program bcast
    use mpi
    implicit none

    integer, parameter :: num_elements = 10000000   ! size of array
    integer, parameter :: num_trials = 10           ! number of timing experiments

    integer :: error
    integer :: rank
    integer :: i
    double precision :: total_my_bcast_time, total_mpi_bcast_time
    double precision, allocatable, dimension (:) :: data

    call MPI_Init(error)
    call MPI_Comm_rank(MPI_COMM_WORLD, rank, error)

    total_my_bcast_time = 0
    total_mpi_bcast_time = 0
    allocate(data(0:num_elements-1)) ! create array
    data = 0.0d0                     ! give the buffer defined contents

    do i = 0, num_trials-1
        ! Time MPI_Bcast
        ! Synchronize before starting timing
        call MPI_Barrier(MPI_COMM_WORLD, error)
        total_mpi_bcast_time = total_mpi_bcast_time - MPI_Wtime()
        ! The datatype handle must match the buffer's Fortran type:
        ! MPI_DOUBLE_PRECISION for double precision (MPI_INT is the C int handle)
        call MPI_Bcast(data, num_elements, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, error)
        ! Synchronize again before obtaining final time
        call MPI_Barrier(MPI_COMM_WORLD, error)
        total_mpi_bcast_time = total_mpi_bcast_time + MPI_Wtime()

        ! Time my_Bcast, bracketed by barriers in the same way
        call MPI_Barrier(MPI_COMM_WORLD, error)
        total_my_bcast_time = total_my_bcast_time - MPI_Wtime()
        call my_Bcast(data, num_elements, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD)
        call MPI_Barrier(MPI_COMM_WORLD, error)
        total_my_bcast_time = total_my_bcast_time + MPI_Wtime()
    end do

    ! print resulting times (rank 0 only, so the output appears once)
    if (rank .eq. 0) then
        print *, "Avg my_Bcast time = ", (total_my_bcast_time / num_trials)
        print *, "Avg MPI_Bcast time = ", (total_mpi_bcast_time / num_trials)
    end if

    deallocate(data)
    call MPI_Finalize(error)
end program bcast

In [None]:
!mpif90 bcast.f90 && mpirun -np 2 --allow-run-as-root a.out