## Standard deviation

In this exercise you will write an MPI program that computes the standard deviation of an array of numbers in parallel using `MPI_Reduce` and `MPI_Allreduce`. 

The program takes the following steps:

1. Each process creates an array of generated random numbers.

2. Each process computes its `local_sum`; the local sums are then added up into a `global_sum` on every process using `MPI_Allreduce`. 

3. After the `global_sum` is available on all processes, each process computes the `mean` so that `local_sq_diff` can be computed. 

4. Reduce the `local_sq_diff` to the root process (process 0). 

5. The root process calculates the standard deviation by taking the square root of the mean of the global squared differences. 

* TODO: Rewrite the program using `MPI_Reduce` and `MPI_Allreduce` to compute the standard deviation. 

In [None]:
?MPI::MPI_Allreduce

***
#### C skeleton

In [None]:
%%file deviation.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <math.h>
#include <mpi.h>

/*
 * Skeleton for the standard deviation exercise.
 *
 * Every rank fills an array with num_elements_per_proc random floats in
 * [0, 1] and sums them locally. The three TODO markers show where the two
 * reductions and the squared-difference loop have to be added. Until they are
 * filled in, global_sum and global_sq_diff keep their initial value of 0, so
 * the unfinished program still runs with well-defined values.
 *
 * Returns 0 on success; aborts the whole MPI job if the array cannot be
 * allocated.
 */
int main()
{
    int i, rank, size;
    int num_elements_per_proc = 3;

    MPI_Init(NULL, NULL);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    // seed the random number generator uniquely for each processor;
    // (rank + 1) keeps rank 0 from always ending up with the constant seed 0
    srand((unsigned int)time(NULL) * (unsigned int)(rank + 1));

    // create a random array of elements on all processes
    float *rand_nums = malloc(sizeof(float) * num_elements_per_proc);
    if (rand_nums == NULL) {
        fprintf(stderr, "Rank %d: could not allocate the random number array\n", rank);
        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
        return EXIT_FAILURE;
    }
    for (i = 0; i < num_elements_per_proc; i++) {
        rand_nums[i] = (rand() / (float)RAND_MAX);
    }

    // sum the numbers locally
    float local_sum = 0;
    for (i = 0; i < num_elements_per_proc; i++) {
        local_sum += rand_nums[i];
    }

    // reduce all of the local sums into the global sum
    // in order to calculate the mean
    float global_sum = 0;
    // TODO: reduce
    float mean = global_sum / (num_elements_per_proc * size);

    // compute the local sum of the squared differences from the mean
    float local_sq_diff = 0;
    // TODO: calculate the sum of squared differences

    // reduce the global sum of the squared differences to the root process
    float global_sq_diff = 0;
    // TODO: reduce

    // standard deviation is the square root of the mean of the squared differences
    // print the result
    if (rank == 0) {
        float stddev = sqrtf(global_sq_diff / (num_elements_per_proc * size));
        printf("Mean - %f, Standard deviation = %f\n", mean, stddev);
    }

    // clean up
    free(rand_nums);

    MPI_Finalize();
    return 0;
}

Now compile it and run it with 4 processes. 

In [None]:
!mpicc deviation.c -lm && mpirun -np 4 --allow-run-as-root a.out

***
#### Python skeleton

In [None]:
%%file deviation.py
from mpi4py import MPI
import random
import math

# set up MPI: the world communicator, this process' rank and the process count
comm = MPI.COMM_WORLD
rank, size = comm.Get_rank(), comm.Get_size()
num_elements_per_proc = 3

# every process draws its own random values from [0, 1]
rand_nums = [random.uniform(0, 1) for _ in range(num_elements_per_proc)]

# partial sum of the values held by this process
local_sum = sum(rand_nums)

# combine the partial sums of all processes into global_sum;
# every process needs it to compute the mean
# TODO: reduce to global_sum
mean = global_sum / (num_elements_per_proc * size)

# accumulate this process' squared differences from the mean
local_sq_diff = 0
# TODO: calculate the sum of squared differences

# collect the squared differences of all processes on the root process
# TODO: reduce to global_sq_diff

# only the root holds the reduced value, so only the root reports:
# the standard deviation is the square root of the mean squared difference
if rank == 0:
    variance = global_sq_diff / (num_elements_per_proc * size)
    stddev = math.sqrt(variance)
    print("Mean - %f, Standard deviation = %f" % (mean, stddev))

Now run it with 4 processes (Python needs no compilation step). 

In [None]:
!mpirun -np 4 --allow-run-as-root python deviation.py

***
#### Fortran skeleton

In [None]:
%%file deviation.f90
! Skeleton for the standard deviation exercise.
!
! Every rank fills an array with num_elements_per_proc random numbers from
! [0, 1) and sums them locally. The three TODO markers show where the two
! reductions and the sum of squared differences have to be added. Until they
! are filled in, global_sum and global_sq_diff keep their initial value of 0,
! so the unfinished program still runs with well-defined values.
program deviation
use mpi
implicit none

! "double precision" is kept on purpose: it is the type that
! MPI_DOUBLE_PRECISION describes in the reductions to be added.
double precision :: local_sum, global_sum, mean, local_sq_diff, global_sq_diff, stddev
double precision, allocatable, dimension(:) :: rand_nums
integer, allocatable, dimension(:) :: seed
integer :: error, i, rank, size, num_elements_per_proc
integer :: seed_size, clock

num_elements_per_proc = 3

call MPI_Init(error)
call MPI_Comm_rank(MPI_COMM_WORLD, rank, error)
call MPI_Comm_size(MPI_COMM_WORLD, size, error)

! seed the random number generator differently on each process; the initial
! generator state is processor-dependent, so without an explicit seed the
! ranks may all draw the very same numbers
call random_seed(size=seed_size)
allocate(seed(seed_size))
call system_clock(count=clock)
! ieor mixes the rank into the clock value without any risk of overflow
seed = ieor(clock, 37 * (rank + 1) * [(i, i = 1, seed_size)])
call random_seed(put=seed)
deallocate(seed)

! create a random array of elements on all processes
! (indices run from 0 to num_elements_per_proc-1)
allocate(rand_nums(0:num_elements_per_proc-1))
call random_number(rand_nums)

! sum the numbers locally
local_sum = sum(rand_nums)

! reduce all of the local sums into the global sum
! in order to calculate the mean
global_sum = 0.0d0
! TODO: reduce to global_sum
mean = global_sum / (num_elements_per_proc * size)

! compute the local sum of the squared differences from the mean
local_sq_diff = 0.0d0
! TODO: calculate the sum of squared differences

! reduce the global sum of the squared differences to the root process
global_sq_diff = 0.0d0
! TODO: reduce to global_sq_diff

! standard deviation is the square root of the mean of the squared differences
! print the result
if (rank == 0) then
    stddev = sqrt(global_sq_diff / (num_elements_per_proc * size))
    print *, "Mean = ", mean, "Standard deviation = ", stddev
endif

! clean up
deallocate(rand_nums)

call MPI_Finalize(error)
end program deviation

Now compile it and run it with 4 processes. 

In [None]:
!mpif90 deviation.f90 && mpirun -np 4 --allow-run-as-root a.out

***

### You can compare with our solution:

***
#### C solution

In [None]:
%%file deviation.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <math.h>
#include <mpi.h>

/*
 * Compute the mean and the standard deviation of random numbers that are
 * distributed over all MPI processes.
 *
 * Every rank generates num_elements_per_proc random floats in [0, 1].
 * MPI_Allreduce gives every rank the global sum (and therefore the mean),
 * each rank then sums its squared differences from the mean, and MPI_Reduce
 * collects those sums on rank 0, which prints the result.
 *
 * Returns 0 on success; aborts the whole MPI job if the array cannot be
 * allocated.
 */
int main()
{
    int i, rank, size;
    int num_elements_per_proc = 3;

    MPI_Init(NULL, NULL);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    // seed the random number generator uniquely for each processor;
    // (rank + 1) keeps rank 0 from always ending up with the constant seed 0
    srand((unsigned int)time(NULL) * (unsigned int)(rank + 1));

    // create a random array of elements on all processes
    float *rand_nums = malloc(sizeof(float) * num_elements_per_proc);
    if (rand_nums == NULL) {
        fprintf(stderr, "Rank %d: could not allocate the random number array\n", rank);
        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
        return EXIT_FAILURE;
    }
    for (i = 0; i < num_elements_per_proc; i++) {
        rand_nums[i] = (rand() / (float)RAND_MAX);
    }

    // sum the numbers locally
    float local_sum = 0;
    for (i = 0; i < num_elements_per_proc; i++) {
        local_sum += rand_nums[i];
    }

    // reduce all of the local sums into the global sum;
    // Allreduce (not Reduce) because every rank needs the mean below
    float global_sum = 0;
    MPI_Allreduce(&local_sum, &global_sum, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);
    float mean = global_sum / (num_elements_per_proc * size);

    // compute the local sum of the squared differences from the mean
    float local_sq_diff = 0;
    for (i = 0; i < num_elements_per_proc; i++) {
        float diff = rand_nums[i] - mean;
        local_sq_diff += diff * diff;
    }

    // reduce the global sum of the squared differences to the root process;
    // only rank 0 receives a meaningful global_sq_diff
    float global_sq_diff = 0;
    MPI_Reduce(&local_sq_diff, &global_sq_diff, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);

    // standard deviation is the square root of the mean of the squared differences
    // print the result
    if (rank == 0) {
        float stddev = sqrtf(global_sq_diff / (num_elements_per_proc * size));
        printf("Mean - %f, Standard deviation = %f\n", mean, stddev);
    }

    // clean up
    free(rand_nums);

    MPI_Finalize();
    return 0;
}

In [None]:
!mpicc deviation.c -lm && mpirun -np 4 --allow-run-as-root a.out

***
#### Python solution

In [None]:
%%file deviation.py
from mpi4py import MPI
import random
import math

# set up MPI: the world communicator, this process' rank and the process count
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
num_elements_per_proc = 3

# create a random array of elements on all processes
rand_nums = [random.uniform(0, 1) for _ in range(num_elements_per_proc)]

# sum the numbers locally
local_sum = sum(rand_nums)

# reduce all of the local sums into the global sum
# in order to calculate the mean; allreduce (not reduce) because
# every process needs the mean for the next step
global_sum = comm.allreduce(local_sum, op=MPI.SUM)
mean = global_sum / (num_elements_per_proc * size)

# compute the local sum of the squared differences from the mean;
# iterate over every element (a range ending at num_elements_per_proc-1
# would silently drop the last one)
local_sq_diff = sum((value - mean) ** 2 for value in rand_nums)

# reduce the global sum of the squared differences to the root process;
# on all other processes reduce() returns None
global_sq_diff = comm.reduce(local_sq_diff, op=MPI.SUM, root=0)

# standard deviation is the square root of the mean of the squared differences
# print the result
if rank == 0:
    stddev = math.sqrt(global_sq_diff / (num_elements_per_proc * size))
    print("Mean - %f, Standard deviation = %f" % (mean, stddev))

In [None]:
!mpirun -np 4 --allow-run-as-root python deviation.py

***
#### Fortran solution

In [None]:
%%file deviation.f90
! Compute the mean and the standard deviation of random numbers that are
! distributed over all MPI processes.
!
! Every rank generates num_elements_per_proc random numbers from [0, 1).
! MPI_Allreduce gives every rank the global sum (and therefore the mean),
! each rank then sums its squared differences from the mean, and MPI_Reduce
! collects those sums on rank 0, which prints the result.
program deviation
use mpi
implicit none

! "double precision" is kept on purpose: it is the type that
! MPI_DOUBLE_PRECISION describes in the reductions below.
double precision :: local_sum, global_sum, mean, local_sq_diff, global_sq_diff, stddev
double precision, allocatable, dimension(:) :: rand_nums
integer, allocatable, dimension(:) :: seed
integer :: error, i, rank, size, num_elements_per_proc
integer :: seed_size, clock

num_elements_per_proc = 3

call MPI_Init(error)
call MPI_Comm_rank(MPI_COMM_WORLD, rank, error)
call MPI_Comm_size(MPI_COMM_WORLD, size, error)

! seed the random number generator differently on each process; the initial
! generator state is processor-dependent, so without an explicit seed the
! ranks may all draw the very same numbers
call random_seed(size=seed_size)
allocate(seed(seed_size))
call system_clock(count=clock)
! ieor mixes the rank into the clock value without any risk of overflow
seed = ieor(clock, 37 * (rank + 1) * [(i, i = 1, seed_size)])
call random_seed(put=seed)
deallocate(seed)

! create a random array of elements on all processes
! (indices run from 0 to num_elements_per_proc-1)
allocate(rand_nums(0:num_elements_per_proc-1))
call random_number(rand_nums)

! sum the numbers locally
local_sum = sum(rand_nums)

! reduce all of the local sums into the global sum
! in order to calculate the mean; Allreduce (not Reduce) because
! every rank needs the mean for the next step
global_sum = 0.0d0
call MPI_Allreduce(local_sum, global_sum, 1, MPI_DOUBLE_PRECISION, MPI_SUM, MPI_COMM_WORLD, error)
mean = global_sum / (num_elements_per_proc * size)

! compute the local sum of the squared differences from the mean
local_sq_diff = sum((rand_nums - mean)**2)

! reduce the global sum of the squared differences to the root process;
! only rank 0 receives a meaningful global_sq_diff
global_sq_diff = 0.0d0
call MPI_Reduce(local_sq_diff, global_sq_diff, 1, MPI_DOUBLE_PRECISION, MPI_SUM, 0, MPI_COMM_WORLD, error)

! standard deviation is the square root of the mean of the squared differences
! print the result
if (rank == 0) then
    stddev = sqrt(global_sq_diff / (num_elements_per_proc * size))
    print *, "Mean = ", mean, "Standard deviation = ", stddev
endif

! clean up
deallocate(rand_nums)

call MPI_Finalize(error)
end program deviation

In [None]:
!mpif90 deviation.f90 && mpirun -np 4 --allow-run-as-root a.out