## Reduce

In this exercise you write an MPI program that computes the average of an array of elements using MPI_Reduce. 

The program takes the following steps:

1. Each process creates an array of generated random numbers. 

2. Each process computes `local_sum`, the sum of its own random numbers. 

3. Reduce the `local_sum` to the root process (process 0) using `MPI_SUM`. 

4. The root process calculates the final average. 

* TODO: Go to the exercise and rewrite the program using `MPI_Reduce` to compute a global average. 

In [None]:
?MPI::MPI_Reduce

***
#### C skeleton

In [None]:
#include <mpi.h>

In [None]:
%%executable  a.x -- -lmpi

// Exercise skeleton: every rank fills a small array with random numbers,
// sums it locally, and the local sums are combined on rank 0 with MPI_Reduce.
// The parts marked TODO are left for you to complete.
int i, rank, size;
int num_elements_per_proc = 3;
int num_elements;

MPI_Init(NULL, NULL);

MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);

// total number of elements over all processes (divisor of the global average)
num_elements = num_elements_per_proc * size;

// seed the random number generator to get different results for each process;
// this has to happen after MPI_Comm_rank, otherwise rank is still uninitialized
srand((unsigned int)time(NULL) + (unsigned int)rank);

// create a random array of elements on all processes
float *rand_nums = NULL;
rand_nums = (float *)malloc(sizeof(float) * num_elements_per_proc);
if (rand_nums == NULL) {
    // without the buffer there is nothing to compute: stop every rank
    MPI_Abort(MPI_COMM_WORLD, 1);
}
for (i = 0; i < num_elements_per_proc; i++) {
    rand_nums[i] = (rand() / (float)RAND_MAX);
}

// sum the numbers locally
float local_sum = 0.f;
// TODO
printf("Local sum for process %d : %f, avg = %f\n",
         rank, local_sum, local_sum / num_elements_per_proc);


// reduce all of the local sums into the global sum on root process
float global_sum = 0.f;
// TODO: reduce

// print the result
if (rank == 0) {
    // TODO
}

// clean up
free(rand_nums);

MPI_Finalize();

Now compile it and run it with 4 processes. 

In [None]:
!mpirun -np 4 a.x

***
#### Python skeleton

In [None]:
%%file reduce.py
from mpi4py import MPI
import random

# Exercise skeleton: every rank draws a few random numbers, sums them locally,
# and the local sums are combined on rank 0 with a reduce operation.
# The parts marked TODO are left for you to complete.
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
num_elements_per_proc = 3

# create a random array of elements on all processes
rand_nums = []
for i in range(0, num_elements_per_proc):
    rand_nums.append(random.uniform(0, 1))

# sum the numbers locally
local_sum = 0.0
# TODO
print("Local sum for process %d : %f, avg = %f" %
         (rank, local_sum, local_sum / num_elements_per_proc))

# reduce all of the local sums into the global sum on root process
global_sum = 0.0
# TODO: reduce

# print the result
if rank == 0:
    # TODO: print the total sum and the global average
    pass  # placeholder so the unmodified skeleton is still valid Python

Now run it with 4 processes. 

In [None]:
!mpirun -np 4 python reduce.py

***
#### Fortran skeleton

In [None]:
%%file reduce.f90
program reduce
! Exercise skeleton: every rank fills a small array with random numbers,
! sums it locally, and the local sums are combined on rank 0 with MPI_Reduce.
! The parts marked TODO are left for you to complete.
use mpi
implicit none

integer :: error   ! status code returned by every MPI call
! kept as double precision so the matching MPI datatype is MPI_DOUBLE_PRECISION
double precision :: local_sum, global_sum
double precision, allocatable, dimension (:) :: rand_nums
integer, allocatable, dimension (:) :: seed
integer :: i, rank, size, num_elements_per_proc, num_elements
integer :: seed_size, clock

num_elements_per_proc = 3

call MPI_Init(error)
call MPI_Comm_rank(MPI_COMM_WORLD, rank, error)
call MPI_Comm_size(MPI_COMM_WORLD, size, error)
num_elements = num_elements_per_proc * size

! seed the random number generator differently on every process; the default
! seed is processor-dependent and may be identical on all ranks
call random_seed(size=seed_size)
allocate(seed(seed_size))
call system_clock(count=clock)
! mod() keeps the clock part small so the additions below cannot overflow
seed = mod(clock, 1000000) + 37 * [(i, i = 1, seed_size)] + 1009 * rank
call random_seed(put=seed)
deallocate(seed)

! create a random array of elements on all processes
allocate(rand_nums(num_elements_per_proc))
call random_number(rand_nums)

! sum the numbers locally
local_sum = 0.0d0
! TODO
print *, "Local sum for process", rank, ":", local_sum, ", avg = ", &
    (local_sum / num_elements_per_proc)

! reduce all of the local sums into the global sum on root process
global_sum = 0.0d0
! TODO: reduce

! print the result
if (rank == 0) then
    ! TODO
end if

! clean up
deallocate(rand_nums)

call MPI_Finalize(error)
end program reduce

Now compile it and run it with 4 processes. 

In [None]:
!mpif90 reduce.f90 && mpirun -np 4 a.out

***

### You can compare with our solution:

***
#### C solution

In [None]:
%%executable  a.x -- -lmpi

// Solution: every rank sums its own random numbers, MPI_Reduce adds the
// per-rank sums together on rank 0, and rank 0 prints the global average.
int i, rank, size;
int num_elements_per_proc = 3;
int num_elements;

MPI_Init(NULL, NULL);

MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);

// total number of elements over all processes (divisor of the global average)
num_elements = num_elements_per_proc * size;

// seed the random number generator to get different results for each process;
// this has to happen after MPI_Comm_rank, otherwise rank is still uninitialized
srand((unsigned int)time(NULL) + (unsigned int)rank);

// create a random array of elements on all processes
float *rand_nums = NULL;
rand_nums = (float *)malloc(sizeof(float) * num_elements_per_proc);
if (rand_nums == NULL) {
    // without the buffer there is nothing to compute: stop every rank
    MPI_Abort(MPI_COMM_WORLD, 1);
}
for (i = 0; i < num_elements_per_proc; i++) {
    rand_nums[i] = (rand() / (float)RAND_MAX);
}

// sum the numbers locally
float local_sum = 0.f;
for (i = 0; i < num_elements_per_proc; i++) {
    local_sum += rand_nums[i];
}
printf("Local sum for process %d : %f, avg = %f\n",
         rank, local_sum, local_sum / num_elements_per_proc);

// reduce all of the local sums into the global sum on root process;
// only rank 0 receives the result, the other ranks keep their initial 0
float global_sum = 0.f;
MPI_Reduce(&local_sum, &global_sum, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);

// print the result
if (rank == 0) {
    printf("Total sum = %f, avg = %f\n", global_sum,
        global_sum / num_elements);
}

// clean up
free(rand_nums);

MPI_Finalize();

In [None]:
!mpirun -np 4 a.x

***
#### Python solution

In [None]:
%%file reduce.py
from mpi4py import MPI
import random

# Solution: every rank sums its own random numbers, the per-rank sums are
# added together on rank 0, and rank 0 prints the global average.
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
num_elements_per_proc = 3

# draw this process's random samples from [0, 1]
rand_nums = [random.uniform(0, 1) for _ in range(num_elements_per_proc)]

# local sum and average of this process's samples
local_sum = sum(rand_nums)
local_avg = local_sum / num_elements_per_proc
print("Local sum for process %d : %f, avg = %f" % (rank, local_sum, local_avg))

# combine the local sums on the root process (rank 0)
global_sum = comm.reduce(local_sum, op=MPI.SUM, root=0)

# only the root reports the overall total and average
if rank == 0:
    total_elements = size * num_elements_per_proc
    print("Total sum = %f, avg = %f" % (global_sum, global_sum / total_elements))

In [None]:
!mpirun -np 4 python reduce.py

***
#### Fortran solution

In [None]:
%%file reduce.f90
program reduce
! Solution: every rank sums its own random numbers, MPI_Reduce adds the
! per-rank sums together on rank 0, and rank 0 prints the global average.
use mpi
implicit none

integer :: error   ! status code returned by every MPI call
! kept as double precision so it matches MPI_DOUBLE_PRECISION in the reduce
double precision :: local_sum, global_sum
double precision, allocatable, dimension (:) :: rand_nums
integer, allocatable, dimension (:) :: seed
integer :: i, rank, size, num_elements_per_proc, num_elements
integer :: seed_size, clock

num_elements_per_proc = 3

call MPI_Init(error)
call MPI_Comm_rank(MPI_COMM_WORLD, rank, error)
call MPI_Comm_size(MPI_COMM_WORLD, size, error)
num_elements = num_elements_per_proc * size

! seed the random number generator differently on every process; the default
! seed is processor-dependent and may be identical on all ranks
call random_seed(size=seed_size)
allocate(seed(seed_size))
call system_clock(count=clock)
! mod() keeps the clock part small so the additions below cannot overflow
seed = mod(clock, 1000000) + 37 * [(i, i = 1, seed_size)] + 1009 * rank
call random_seed(put=seed)
deallocate(seed)

! create a random array of elements on all processes
allocate(rand_nums(num_elements_per_proc))
call random_number(rand_nums)

! sum the numbers locally
local_sum = sum(rand_nums)
print *, "Local sum for process", rank, ":", local_sum, ", avg = ", &
    (local_sum / num_elements_per_proc)

! reduce all of the local sums into the global sum on root process;
! only rank 0 receives the result, the other ranks keep their initial 0
global_sum = 0.0d0
call MPI_Reduce(local_sum, global_sum, 1, MPI_DOUBLE_PRECISION, MPI_SUM, 0, &
    MPI_COMM_WORLD, error)

! print the result
if (rank == 0) then
    print *, "Total sum = ", global_sum, "avg = ", &
        (global_sum / real(num_elements, kind=kind(global_sum)))
end if

! clean up
deallocate(rand_nums)

call MPI_Finalize(error)
end program reduce

In [None]:
!mpif90 reduce.f90 && mpirun -np 4 a.out