## Scatter and gather

In this exercise you write an MPI program that computes the average of an array of elements using MPI_Scatter and MPI_Gather. 

The program takes the following steps:

1. The root process (process 0) creates an array of generated random numbers. 

2. Scatter the random numbers from the root process to all other processes, giving each process an equal amount of numbers.

3. Each process computes the average of its subset of numbers.

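4. The root process gathers the individual averages and computes the total average from this much smaller array of numbers. Because every process holds the same number of elements, the average of the partial averages equals the average of all elements.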

First, think about how you would solve this exercise without the MPI_Scatter and MPI_Gather routines. 

* TODO: Look at the provided skeleton. Use the `MPI_Scatter` and `MPI_Gather` routines and add the missing computation of the final average to solve the exercise. 

In [None]:
?MPI::MPI_Scatter

In [None]:
?MPI::MPI_Gather

***
#### C skeleton

In [None]:
%%file gather.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <mpi.h>

// Allocate a zero-filled array of `count` floats.
// Aborts the whole MPI job if the allocation fails, so callers never see NULL
// for a positive count.
static float *alloc_floats(int count)
{
    float *buf = (float *)calloc((size_t)count, sizeof(float));
    if (buf == NULL && count > 0) {
        fprintf(stderr, "Failed to allocate %d floats\n", count);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
    return buf;
}

// Exercise skeleton: the root fills an array with random numbers, every process
// averages its own share, and the root combines the partial averages.
// The scatter, the gather and the final average are left as TODOs.
int main(void)
{
    int i, rank, size;
    int num_elements_per_proc = 3;
    int num_elements = 0; // only meaningful on the root process
    // seed the random number generator
    srand((unsigned)time(NULL));

    MPI_Init(NULL, NULL);

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    // create a random array of elements on the root process
    // total size will be the number of elements per process times the number of processes
    float *rand_nums = NULL;
    if (rank == 0) {
        num_elements = num_elements_per_proc * size;
        rand_nums = alloc_floats(num_elements);
        for (i = 0; i < num_elements; i++) {
            rand_nums[i] = (rand() / (float)RAND_MAX);
        }
    }

    // for each process, create a buffer that will hold a subset of the array
    // (zero-filled, so the unfinished skeleton does not read indeterminate memory)
    float *sub_rand_nums = alloc_floats(num_elements_per_proc);

    // scatter the random numbers from the root process to all other processes
    // TODO: scatter

    // compute the average of your subset
    float sub_avg = 0.f;
    for (i = 0; i < num_elements_per_proc; i++) {
        sub_avg += sub_rand_nums[i];
    }
    sub_avg = sub_avg / num_elements_per_proc;
    printf("I am process %i out of %i, average result = %f \n", rank, size, sub_avg);

    // gather all partial averages down to the root process
    float *sub_avgs = NULL;
    if (rank == 0) {
        // memory allocation needed only on root process
        sub_avgs = alloc_floats(size);
    }
    // TODO: gather

    // compute the total average of all numbers from the partial averages
    if (rank == 0) {
        float avg = 0.f;
        // TODO: compute final average of values in sub_avgs
        printf("Avg of all elements is %f\n", avg);

        // compute average across the original data for comparison
        float original_data_avg = 0.f;
        for (i = 0; i < num_elements; i++) {
            original_data_avg += rand_nums[i];
        }
        original_data_avg = original_data_avg / num_elements;
        printf("Avg computed across original data is %f\n", original_data_avg);

    }

    // Clean up (free(NULL) is a no-op, so no rank check is needed)
    free(rand_nums);
    free(sub_avgs);
    free(sub_rand_nums);

    MPI_Finalize();
    return 0;
}

When you are done, compile it and run with 4 processes. 

In [None]:
!mpicc gather.c -o gather && mpirun -np 4 --allow-run-as-root gather

***
#### Python skeleton

In [None]:
%%file gather.py
# Exercise skeleton: the root creates random numbers, every process averages
# its own share, and the root combines the partial averages.
# The scatter, the gather and the final average are left as TODOs.
from mpi4py import MPI
import random

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
num_elements_per_proc = 3
num_elements = num_elements_per_proc * size

# create a random array of elements on the root process
# total size will be the number of elements per process times the number of processes
# in python you can only scatter as many elements as you have processors
# so arrange the array like this: [[1, 2], [3, 4], ...]
rand_nums = []
if rank == 0:
    rand_nums = [
        [random.uniform(0, 1) for _ in range(num_elements_per_proc)]
        for _ in range(size)
    ]

# scatter the random numbers from the root process to all other processes
# TODO: scatter and save to sub_rand_nums
# placeholder with the length of one chunk, so the script runs before the TODO is done
sub_rand_nums = [0] * num_elements_per_proc

# compute the average of your subset
sub_avg = sum(sub_rand_nums) / len(sub_rand_nums)
print("I am process %i out of %i, average result = %f" % (rank, size, sub_avg))

# gather all partial averages down to the root process
sub_avgs = []
if rank == 0:
    sub_avgs = [None] * size
# TODO: gather

# compute the total average of all numbers from the partial averages
if rank == 0:
    avg = 0
    # TODO: compute final average of values in sub_avgs
    print("Avg of all elements is %f" % avg)

    # compute average across the original data for comparison
    original_data_avg = 0
    for chunk in rand_nums:
        for value in chunk:
            original_data_avg += value
    original_data_avg /= num_elements
    print("Avg computed across original data is %f" % original_data_avg)

When you are done, run it with 4 processes. 

In [None]:
!mpirun -np 4 --allow-run-as-root python gather.py

***
#### Fortran skeleton

In [None]:
%%file gather.f90
! Exercise skeleton: the root fills an array with random numbers, every process
! averages its own share, and the root combines the partial averages.
! The scatter, the gather and the final average are left as TODOs.
program gather
use mpi
implicit none

! MPI status codes are default integers
integer :: error
! double precision matches the MPI_DOUBLE_PRECISION datatype used for communication
double precision :: sub_avg, avg, original_data_avg
double precision, allocatable, dimension (:) :: rand_nums, sub_rand_nums, sub_avgs
! note: the variable "size" hides the intrinsic function of the same name in this program
integer :: i, rank, size, num_elements_per_proc, num_elements
num_elements_per_proc = 3

call MPI_Init(error)
call MPI_Comm_rank(MPI_COMM_WORLD, rank, error)
call MPI_Comm_size(MPI_COMM_WORLD, size, error)

! create a random array of elements on the root process
! total size will be the number of elements per process times the number of processes
if (rank == 0) then
    num_elements = num_elements_per_proc * size
    allocate(rand_nums(0:num_elements-1))
    call random_number(rand_nums)
else
    ! the send buffer is only read on the root, but an unallocated array must not
    ! be passed to a subroutine, so the other processes get an empty array
    allocate(rand_nums(0))
endif

! for each process, create a buffer that will hold a subset of the array
allocate(sub_rand_nums(0:num_elements_per_proc-1))
! placeholder values, so the skeleton does not read undefined data before the TODO is done
sub_rand_nums = 0.0d0

! scatter the random numbers from the root process to all other processes
! TODO: scatter

! compute the average of your subset
sub_avg = 0.0d0
do i = 0, num_elements_per_proc-1
    sub_avg = sub_avg + sub_rand_nums(i)
end do
sub_avg = sub_avg / num_elements_per_proc
print *, "I am process", rank, "out of", size, ", average result = ", sub_avg

! gather all partial averages down to the root process
if (rank == 0) then
    ! real storage is needed only on the root process
    allocate(sub_avgs(0:size-1))
else
    ! empty array, for the same reason as rand_nums above
    allocate(sub_avgs(0))
endif
! TODO: gather

! compute the total average of all numbers from the partial averages
if (rank == 0) then
    avg = 0.0d0
    ! TODO: compute final average of values in sub_avgs
    print *, "Avg of all elements is", avg

    ! compute average across the original data for comparison
    original_data_avg = 0.0d0
    do i = 0, num_elements-1
        original_data_avg = original_data_avg + rand_nums(i)
    end do
    original_data_avg = original_data_avg / num_elements
    print *, "Avg computed across original data is", original_data_avg
endif

call MPI_Finalize(error)
end program gather

When you are done, compile it and run with 4 processes. 

In [None]:
!mpif90 gather.f90 && mpirun -np 4 --allow-run-as-root a.out

***

### You can compare with our solution:

***
#### C solution

In [None]:
%%file gather.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <mpi.h>

// Allocate a zero-filled array of `count` floats.
// Aborts the whole MPI job if the allocation fails, so callers never see NULL
// for a positive count.
static float *alloc_floats(int count)
{
    float *buf = (float *)calloc((size_t)count, sizeof(float));
    if (buf == NULL && count > 0) {
        fprintf(stderr, "Failed to allocate %d floats\n", count);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
    return buf;
}

// Solution: the root fills an array with random numbers and scatters equal
// chunks to all processes; every process averages its chunk; the root gathers
// the partial averages and averages them again.
int main(void)
{
    int i, rank, size;
    int num_elements_per_proc = 3;
    int num_elements = 0; // only meaningful on the root process
    // seed the random number generator
    srand((unsigned)time(NULL));

    MPI_Init(NULL, NULL);

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    // create a random array of elements on the root process
    // total size will be the number of elements per process times the number of processes
    float *rand_nums = NULL;
    if (rank == 0) {
        num_elements = num_elements_per_proc * size;
        rand_nums = alloc_floats(num_elements);
        for (i = 0; i < num_elements; i++) {
            rand_nums[i] = (rand() / (float)RAND_MAX);
        }
    }

    // for each process, create a buffer that will hold a subset of the array
    float *sub_rand_nums = alloc_floats(num_elements_per_proc);

    // scatter the random numbers from the root process to all other processes
    // (both counts are per process: the root sends num_elements_per_proc values
    // to each rank, and each rank receives that many)
    MPI_Scatter(rand_nums, num_elements_per_proc, MPI_FLOAT, sub_rand_nums,
                num_elements_per_proc, MPI_FLOAT, 0, MPI_COMM_WORLD);

    // compute the average of your subset
    float sub_avg = 0.f;
    for (i = 0; i < num_elements_per_proc; i++) {
        sub_avg += sub_rand_nums[i];
    }
    sub_avg = sub_avg / num_elements_per_proc;
    printf("I am process %i out of %i, average result = %f \n", rank, size, sub_avg);

    // gather all partial averages down to the root process
    float *sub_avgs = NULL;
    if (rank == 0) {
        // memory allocation needed only on root process
        sub_avgs = alloc_floats(size);
    }
    // every rank contributes one value; the receive count is also per rank
    MPI_Gather(&sub_avg, 1, MPI_FLOAT, sub_avgs, 1, MPI_FLOAT, 0, MPI_COMM_WORLD);

    // compute the total average of all numbers from the partial averages
    // (valid because every process averaged the same number of elements)
    if (rank == 0) {
        float avg = 0.f;
        for (i = 0; i < size; i++) {
            avg += sub_avgs[i];
        }
        avg = avg / size;
        printf("Avg of all elements is %f\n", avg);

        // compute average across the original data for comparison
        float original_data_avg = 0.f;
        for (i = 0; i < num_elements; i++) {
            original_data_avg += rand_nums[i];
        }
        original_data_avg = original_data_avg / num_elements;
        printf("Avg computed across original data is %f\n", original_data_avg);

    }

    // Clean up (free(NULL) is a no-op, so no rank check is needed)
    free(rand_nums);
    free(sub_avgs);
    free(sub_rand_nums);

    MPI_Finalize();
    return 0;
}

In [None]:
!mpicc gather.c -o gather && mpirun -np 4 --allow-run-as-root gather

***
#### Python solution

In [None]:
%%file gather.py
# Solution: the root creates one chunk of random numbers per process and
# scatters them; every process averages its chunk; the root gathers the
# partial averages and averages them again.
from mpi4py import MPI
import random

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
num_elements_per_proc = 3
num_elements = num_elements_per_proc * size

# create a random array of elements on the root process
# total size will be the number of elements per process times the number of processes
# in python you can only scatter as many elements as you have processors
# so arrange the array like this: [[1, 2], [3, 4], ...]
rand_nums = []
if rank == 0:
    rand_nums = [
        [random.uniform(0, 1) for _ in range(num_elements_per_proc)]
        for _ in range(size)
    ]

# scatter the random numbers from the root process to all other processes
# (each process receives one inner list)
sub_rand_nums = comm.scatter(rand_nums, root=0)

# compute the average of your subset
sub_avg = sum(sub_rand_nums) / len(sub_rand_nums)
print("I am process %i out of %i, average result = %f" % (rank, size, sub_avg))

# gather all partial averages down to the root process
# comm.gather returns the list of values on the root and None elsewhere,
# so no receive list has to be prepared beforehand
sub_avgs = comm.gather(sub_avg, root=0)

# compute the total average of all numbers from the partial averages
# (valid because every process averaged the same number of elements)
if rank == 0:
    avg = sum(sub_avgs) / len(sub_avgs)
    print("Avg of all elements is %f" % avg)

    # compute average across the original data for comparison
    original_data_avg = sum(value for chunk in rand_nums for value in chunk)
    original_data_avg /= num_elements
    print("Avg computed across original data is %f" % original_data_avg)

In [None]:
!mpirun -np 4 --allow-run-as-root python gather.py

***
#### Fortran solution

In [None]:
%%file gather.f90
! Solution: the root fills an array with random numbers and scatters equal
! chunks to all processes; every process averages its chunk; the root gathers
! the partial averages and averages them again.
program gather
use mpi
implicit none

! MPI status codes are default integers
integer :: error
! double precision matches the MPI_DOUBLE_PRECISION datatype used for communication
double precision :: sub_avg, avg, original_data_avg
double precision, allocatable, dimension (:) :: rand_nums, sub_rand_nums, sub_avgs
! note: the variable "size" hides the intrinsic function of the same name in this program
integer :: i, rank, size, num_elements_per_proc, num_elements
num_elements_per_proc = 3

call MPI_Init(error)
call MPI_Comm_rank(MPI_COMM_WORLD, rank, error)
call MPI_Comm_size(MPI_COMM_WORLD, size, error)

! create a random array of elements on the root process
! total size will be the number of elements per process times the number of processes
if (rank == 0) then
    num_elements = num_elements_per_proc * size
    allocate(rand_nums(0:num_elements-1))
    call random_number(rand_nums)
else
    ! the send buffer is only read on the root, but an unallocated array must not
    ! be passed to a subroutine, so the other processes get an empty array
    allocate(rand_nums(0))
endif

! for each process, create a buffer that will hold a subset of the array
allocate(sub_rand_nums(0:num_elements_per_proc-1))

! scatter the random numbers from the root process to all other processes
! (both counts are per process, not the total number of elements)
call MPI_Scatter(rand_nums, num_elements_per_proc, MPI_DOUBLE_PRECISION, sub_rand_nums, &
&                num_elements_per_proc, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, error)

! compute the average of your subset
sub_avg = 0.0d0
do i = 0, num_elements_per_proc-1
    sub_avg = sub_avg + sub_rand_nums(i)
end do
sub_avg = sub_avg / num_elements_per_proc
print *, "I am process", rank, "out of", size, ", average result = ", sub_avg

! gather all partial averages down to the root process
if (rank == 0) then
    ! real storage is needed only on the root process
    allocate(sub_avgs(0:size-1))
else
    ! empty array, for the same reason as rand_nums above
    allocate(sub_avgs(0))
endif
! every process contributes one value; the receive count is also per process
call MPI_Gather(sub_avg, 1, MPI_DOUBLE_PRECISION, sub_avgs, 1, MPI_DOUBLE_PRECISION, &
&               0, MPI_COMM_WORLD, error)

! compute the total average of all numbers from the partial averages
! (valid because every process averaged the same number of elements)
if (rank == 0) then
    avg = 0.0d0
    do i = 0, size-1
        avg = avg + sub_avgs(i)
    end do
    avg = avg / size
    print *, "Avg of all elements is", avg

    ! compute average across the original data for comparison
    original_data_avg = 0.0d0
    do i = 0, num_elements-1
        original_data_avg = original_data_avg + rand_nums(i)
    end do
    original_data_avg = original_data_avg / num_elements
    print *, "Avg computed across original data is", original_data_avg
endif

call MPI_Finalize(error)
end program gather

In [None]:
!mpif90 gather.f90 && mpirun -np 4 --allow-run-as-root a.out