# Lecture 24 : Parallel Message Passing and Introduction to Collective Communication

# Part 1 : Parallel Message Passing

## Let's revisit the MPI sum code from lecture 22.

In [2]:
%%writefile mpi_sum_v1.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <mpi.h>

/*
 * Version 1 of the MPI sum: every rank adds up a strided share of 1..N, the
 * partial sums are collected on rank 0 one message at a time, and rank 0 then
 * sends the total back to every other rank one message at a time.
 *
 * Usage: mpiexec -n <ranks> ./mpi_sum_v1 N
 * Returns 0 on success, 1 if N was not given on the command line.
 */
int main(int argc, char** argv) {

    MPI_Init (&argc, &argv);

    // MPI_COMM_WORLD is the default communicator that contains all ranks
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    // get N from command line
    if (argc < 2) {
        // print the usage message once and shut MPI down before exiting
        if (rank == 0) {
            printf ("Command usage : %s %s\n",argv[0],"N");
        }
        MPI_Finalize();
        return 1;
    }
    long long N = atoll(argv[1]);

    // start the timer
    double start_time, end_time;
    start_time = MPI_Wtime();

    // calculate the sum (rank r handles r+1, r+1+size, r+1+2*size, ...)
    long long sum = 0;
    for (long long i = 1+rank; i <= N;i+=size) {
        sum += i;
    }

    // all nonzero ranks send their partial sums to rank 0
    if (rank == 0) {
        long long rank_sum;
        MPI_Status status;
        for (int source=1;source<size;source++) {
            // the receive datatype must match the MPI_LONG_LONG used by the senders
            MPI_Recv(&rank_sum,1,MPI_LONG_LONG,source,0,MPI_COMM_WORLD,&status);
            sum += rank_sum;
        }
    } else {
        int dest = 0;
        MPI_Send(&sum,1,MPI_LONG_LONG,dest,0,MPI_COMM_WORLD);
    }

    // all nonzero ranks receive the final sum from rank 0
    if (rank == 0) {
        for (int dest = 1;dest < size;dest++) {
            MPI_Send(&sum,1,MPI_LONG_LONG,dest,0,MPI_COMM_WORLD);
        }
    } else {
        int src = 0;
        MPI_Status status;
        MPI_Recv(&sum,1,MPI_LONG_LONG,src,0,MPI_COMM_WORLD,&status);
    }

    // stop the timer
    end_time = MPI_Wtime();

    // closed form N*(N+1)/2: halve whichever factor is even so that the
    // integer division is exact for both even and odd N
    long long expected = (N % 2 == 0) ? (N/2)*(N+1) : N*((N+1)/2);

    // print results
    printf ("rank %d (of %d) sum = %lld, N*(N+1)/2 = %lld, elapsed time = %.4f seconds\n",
            rank,size,sum,expected,end_time-start_time);

    MPI_Finalize();
    return 0;
}

Overwriting mpi_sum_v1.c


## Here are the results of running version 1 on matrix with N equal to 1 million.

    $ mpicc -o mpi_sum_v1 mpi_sum_v1.c
    $ mpiexec -n 4 ./mpi_sum_v1 1000000
    rank 0 (of 4) sum = 500000500000, N*(N+1)/2 = 500000500000, elapsed time = 0.0010 seconds
    rank 1 (of 4) sum = 500000500000, N*(N+1)/2 = 500000500000, elapsed time = 0.0010 seconds
    rank 2 (of 4) sum = 500000500000, N*(N+1)/2 = 500000500000, elapsed time = 0.0010 seconds
    rank 3 (of 4) sum = 500000500000, N*(N+1)/2 = 500000500000, elapsed time = 0.0010 seconds
    $

## In version 2, we use parallel communication to reduce the partial sums with result on rank 0.  
## For simplicity, we assume that the number of ranks is $2^k$ where $k \geq 0$.

In [None]:
%%writefile mpi_sum_v2.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <mpi.h>

/*
 * Version 2 of the MPI sum: every rank adds up a strided share of 1..N, the
 * partial sums are reduced onto rank 0 with parallel (tree) message passing,
 * and rank 0 then sends the total to every other rank one message at a time.
 *
 * Usage: mpiexec -n <ranks> ./mpi_sum_v2 N     (ranks must be a power of 2)
 * Returns 0 on success, 1 on a missing N or an unsupported number of ranks.
 */
int main(int argc, char** argv) {

    MPI_Init (&argc, &argv);

    // MPI_COMM_WORLD is the default communicator that contains all ranks
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    // get N from command line
    if (argc < 2) {
        // print the usage message once and shut MPI down before exiting
        if (rank == 0) {
            printf ("Command usage : %s %s\n",argv[0],"N");
        }
        MPI_Finalize();
        return 1;
    }
    long long N = atoll(argv[1]);

    // the tree reduction below pairs ranks by repeated halving, which only
    // covers every rank when size = 2^k; otherwise some partial sums are lost
    if ((size & (size-1)) != 0) {
        if (rank == 0) {
            printf ("Error : number of ranks (%d) must be a power of 2\n",size);
        }
        MPI_Finalize();
        return 1;
    }

    // start the timer
    double start_time, end_time;
    start_time = MPI_Wtime();

    // calculate the sum (rank r handles r+1, r+1+size, r+1+2*size, ...)
    long long sum = 0;
    for (long long i = 1+rank; i <= N;i+=size) {
        sum += i;
    }

    // use parallel message passing to reduce the partial sums with result on rank 0
    // we assume that size = 2^k for some integer k >= 0
    int alive = size;
    while (alive > 1) {
        if (rank < alive/2) {
            // rank is a receiver
            long long rank_sum;
            MPI_Status status;
            int src = rank + alive/2;
            MPI_Recv (&rank_sum, 1, MPI_LONG_LONG, src, 0, MPI_COMM_WORLD, &status);
            sum += rank_sum;
        } else if (rank < alive) {
            // rank is a sender
            int dest = rank - alive/2;
            MPI_Send (&sum, 1, MPI_LONG_LONG, dest, 0, MPI_COMM_WORLD);
        }
        // ranks that have already sent are >= alive from here on and sit out
        alive = alive/2;
    }

    // all nonzero ranks receive the final sum from rank 0
    if (rank == 0) {
        for (int dest = 1;dest < size;dest++) {
            MPI_Send(&sum,1,MPI_LONG_LONG,dest,0,MPI_COMM_WORLD);
        }
    } else {
        int src = 0;
        MPI_Status status;
        MPI_Recv(&sum,1,MPI_LONG_LONG,src,0,MPI_COMM_WORLD,&status);
    }

    // stop the timer
    end_time = MPI_Wtime();

    // closed form N*(N+1)/2: halve whichever factor is even so that the
    // integer division is exact for both even and odd N
    long long expected = (N % 2 == 0) ? (N/2)*(N+1) : N*((N+1)/2);

    // print results
    printf ("rank %d (of %d) sum = %lld, N*(N+1)/2 = %lld, elapsed time = %.4f seconds\n",
            rank,size,sum,expected,end_time-start_time);

    MPI_Finalize();
    return 0;
}

## Here are the results of running version 2 on matrix with N equal to 1 million.

    $ mpicc -o mpi_sum_v2 mpi_sum_v2.c
    $ mpiexec -n 4 ./mpi_sum_v2 1000000
    rank 0 (of 4) sum = 500000500000, N*(N+1)/2 = 500000500000, elapsed time = 0.0010 seconds
    rank 1 (of 4) sum = 500000500000, N*(N+1)/2 = 500000500000, elapsed time = 0.0010 seconds
    rank 2 (of 4) sum = 500000500000, N*(N+1)/2 = 500000500000, elapsed time = 0.0010 seconds
    rank 3 (of 4) sum = 500000500000, N*(N+1)/2 = 500000500000, elapsed time = 0.0010 seconds
    $

## In version 3, we use parallel communication to broadcast the sum on rank 0 to all other ranks.
## For simplicity, we assume that the number of ranks is $2^k$ where $k \geq 0$.

In [3]:
%%writefile mpi_sum_v3.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <mpi.h>

/*
 * Version 3 of the MPI sum: every rank adds up a strided share of 1..N, the
 * partial sums are reduced onto rank 0 with parallel (tree) message passing,
 * and the total is then broadcast from rank 0 with parallel message passing.
 *
 * Usage: mpiexec -n <ranks> ./mpi_sum_v3 N     (ranks must be a power of 2)
 * Returns 0 on success, 1 on a missing N or an unsupported number of ranks.
 */
int main(int argc, char** argv) {

    MPI_Init (&argc, &argv);

    // MPI_COMM_WORLD is the default communicator that contains all ranks
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    // get N from command line
    if (argc < 2) {
        // print the usage message once and shut MPI down before exiting
        if (rank == 0) {
            printf ("Command usage : %s %s\n",argv[0],"N");
        }
        MPI_Finalize();
        return 1;
    }
    long long N = atoll(argv[1]);

    // both loops below pair ranks by halving/doubling, which only works when
    // size = 2^k; otherwise partial sums are lost and the broadcast would
    // address ranks that do not exist
    if ((size & (size-1)) != 0) {
        if (rank == 0) {
            printf ("Error : number of ranks (%d) must be a power of 2\n",size);
        }
        MPI_Finalize();
        return 1;
    }

    // start the timer
    double start_time, end_time;
    start_time = MPI_Wtime();

    // calculate the sum (rank r handles r+1, r+1+size, r+1+2*size, ...)
    long long sum = 0;
    for (long long i = 1+rank; i <= N;i+=size) {
        sum += i;
    }

    // use parallel message passing to reduce the partial sums with result on rank 0
    // we assume that size = 2^k for some integer k >= 0
    int alive = size;
    while (alive > 1) {
        if (rank < alive/2) {
            // rank is a receiver
            long long rank_sum;
            MPI_Status status;
            int src = rank + alive/2;
            MPI_Recv (&rank_sum, 1, MPI_LONG_LONG, src, 0, MPI_COMM_WORLD, &status);
            sum += rank_sum;
        } else if (rank < alive) {
            // rank is a sender
            int dest = rank - alive/2;
            MPI_Send (&sum, 1, MPI_LONG_LONG, dest, 0, MPI_COMM_WORLD);
        }
        // ranks that have already sent are >= alive from here on and sit out
        alive = alive/2;
    }

    // use parallel message passing to broadcast the sum on rank 0 to all other ranks
    // we assume that size = 2^k for some integer k >= 0
    alive = 1;
    while (alive < size) {
        // each step doubles the number of ranks that hold the final sum
        alive = alive*2;
        if (rank < alive/2) {
            // rank is a sender
            int dest = rank + alive/2;
            MPI_Send (&sum, 1, MPI_LONG_LONG, dest, 0, MPI_COMM_WORLD);
        } else if (rank < alive) {
            // rank is a receiver
            MPI_Status status;
            int src = rank - alive/2;
            MPI_Recv (&sum, 1, MPI_LONG_LONG, src, 0, MPI_COMM_WORLD, &status);
        }
    }

    // stop the timer
    end_time = MPI_Wtime();

    // closed form N*(N+1)/2: halve whichever factor is even so that the
    // integer division is exact for both even and odd N
    long long expected = (N % 2 == 0) ? (N/2)*(N+1) : N*((N+1)/2);

    // print results
    printf ("rank %d (of %d) sum = %lld, N*(N+1)/2 = %lld, elapsed time = %.4f seconds\n",
            rank,size,sum,expected,end_time-start_time);

    MPI_Finalize();
    return 0;
}


Writing mpi_sum_v3.c


## Here are the results of running version 3 on matrix with N equal to 1 million.

    $ mpicc -o mpi_sum_v3 mpi_sum_v3.c
    $ mpiexec -n 4 ./mpi_sum_v3 1000000
    rank 0 (of 4) sum = 500000500000, N*(N+1)/2 = 500000500000, elapsed time = 0.0009 seconds
    rank 1 (of 4) sum = 500000500000, N*(N+1)/2 = 500000500000, elapsed time = 0.0009 seconds
    rank 2 (of 4) sum = 500000500000, N*(N+1)/2 = 500000500000, elapsed time = 0.0009 seconds
    rank 3 (of 4) sum = 500000500000, N*(N+1)/2 = 500000500000, elapsed time = 0.0009 seconds
    $

# Part 2 : Comparing the Performance of Parallel and Sequential Message Passing

## To compare the performance of parallel message passing versus sequential message passing we need an MPI code that does a lot of communication!

## In the sequential version of our MPI test code we will have a configurable number of communication rounds.  During each communication round, each rank will generate a random number and send that number to rank 0 so that it can be added to the total sum.  

## Here is the code that implements the MPI communications using sequential message passing.  Note that only rank 0 will have the correct total sum.

    int total_sum = 0;
    for (int r = 0; r < rounds; r++) {
        // every rank draws its own number for this round
        int round_sum = random() % 5;
        if (rank != 0) {
            // nonzero ranks send their number directly to rank 0
            MPI_Send(&round_sum, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);
        } else {
            // rank 0 receives the numbers one rank at a time, in rank order
            for (int src = 1; src < size; src++) {
                int number;
                MPI_Status status;
                MPI_Recv(&number, 1, MPI_INT, src, 0, MPI_COMM_WORLD, &status);
                round_sum += number;
            }
        }
        // on rank 0 this adds the full round sum; other ranks add only their own number
        total_sum += round_sum;
    }


## To thoroughly test sequential message passing performance we run our MPI code on 64 ranks where each rank is on a different compute node!  

## This ensures that point-to-point communications between ranks will need to use the network.

## Tests involving such heavy use of ARC resources need to be run during non-peak hours.  

## Here is the part of the *sbatch* file that asks for the computing resources used during our test.

    #!/bin/bash
    #SBATCH -A cmda3634_rjh
    #SBATCH -p normal_q
    #SBATCH -t 5
    #SBATCH --nodes=64
    #SBATCH --ntasks-per-node=1
    #SBATCH -o mpi_seq.out

## Here is the sequential performance (500000 rounds, 64 ranks on 64 different compute nodes).

    [jasonwil@tinkercliffs2 L24]$ sbatch mpi_seq.sh 500000 1234
    Submitted batch job 2300140
    [jasonwil@tinkercliffs2 L24]$ squeue -u jasonwil
        JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
        2300140  normal_q mpi_seq. jasonwil  R       0:04     64 tc[009-010,
        015-016,024-027,035-037,049-053,066-067,070-071,079-087,089-090,099-101,
        170-171,179-182,197-203,229-234,236-237,242-246,259-260,266-267]
    [jasonwil@tinkercliffs2 L24]$ cat mpi_seq.out
    elapsed time = 8.1717 seconds
    rounds = 500000, seed = 1234, sum = 64020814


## To compare sequential message passing to parallel message passing we modify our MPI communications to perform the sum reduction in parallel:

    int total_sum = 0;
    for (int r = 0; r < rounds; r++) {
        // every rank draws its own number for this round
        int round_sum = random() % 5;
        // use parallel message passing to reduce the partial sums with result on rank 0:
        // each step folds the upper half of the active ranks onto the lower half
        // we assume that size = 2^k for some integer k >= 0
        for (int alive = size; alive > 1; alive /= 2) {
            int half = alive/2;
            if (rank < half) {
                // rank is a receiver: add in the partner's partial sum
                int partner_sum;
                MPI_Status status;
                MPI_Recv (&partner_sum, 1, MPI_INT, rank + half, 0, MPI_COMM_WORLD, &status);
                round_sum += partner_sum;
            } else if (rank < alive) {
                // rank is a sender: pass the partial sum to the partner in the lower half
                MPI_Send (&round_sum, 1, MPI_INT, rank - half, 0, MPI_COMM_WORLD);
            }
        }
        // on rank 0 this adds the full round sum; other ranks add a partial value
        total_sum += round_sum;
    }




## Here is the parallel performance (500000 rounds, 64 ranks on 64 different compute nodes).

    [jasonwil@tinkercliffs2 L24]$ sbatch mpi_par.sh 500000 1234
    Submitted batch job 2300076
    [jasonwil@tinkercliffs2 L21]$ squeue -u jasonwil
        JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
        2300076  normal_q mpi_par. jasonwil  R       0:02     64 tc[009-010,
        015-016,035-037,049-053,066-067,070-071,074-075,079-087,089-090,099-101,
        113-114,170-171,179-182,197-201,229-234,236-237,242-246,259-260,266-267,
        272-273]
    [jasonwil@tinkercliffs2 L21]$ cat mpi_par.out
    elapsed time = 0.8618 seconds
    rounds = 500000, seed = 1234, sum = 64020814


## Note that the speedup when using parallel message passing versus sequential is:

$$\text{speedup} = 8.1717/0.8618 \approx 9.5$$

## We could have (roughly) predicted this speedup as follows.  

## Since $2^6 = 64$ we recall from lab 23 that the sequential code will perform 63 sequential communication steps per round while the parallel code will perform 6 parallel communication steps per round.

## Note that the ideal speedup of $63/6 = 10.5$ is only slightly larger than the actual speedup of $9.5$.

## Note that the performance advantage when using parallel message passing grows very rapidly as the number of ranks increases.  

## For example, if the number of ranks is 1024, then since $2^{10} = 1024$ the ideal speedup is $1023/10 \approx 102$.  

# Part 3 : Introduction to Collective Communication

## As shown in part 2, parallel message passing is much more efficient than sequential message passing.

## However, note that the code complexity of version 3 is quite high and we are assuming the special case where the number of ranks is $2^k$ for simplicity.  

## Fortunately, the type of communications we are doing in version 3 (i.e. reducing partial sums and broadcasting) are so common in parallel computing that MPI has built-in **collective communication functions** that drastically simplify the communications code part of our MPI programs.  

## In addition, these collective communication functions use parallel message passing under the hood so they are very efficient as well.  


## The problem of reducing partial values on all ranks to a final value on a given rank using some type of operation (e.g. sum, min, max) is called a **reduction**.  

## More specifically, the problem of adding partial sums on all ranks to compute the total sum on a given rank is called a **sum reduction**.

## The MPI_Reduce function in MPI performs a reduction in one line of code.

## For example, to add the partial sums stored in each rank in the variable *rank_sum* and put the result in the variable *sum* on rank 0 we use the code:

    // use collective communication to reduce the partial sums with result on rank 0
    long long rank_sum = sum;
    MPI_Reduce(&rank_sum,&sum,1,MPI_LONG_LONG,MPI_SUM,0,MPI_COMM_WORLD);

## For reference, here is the interface to MPI_Reduce

    int MPI_Reduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPI_Comm comm);

## The problem of copying the value of a variable on some rank to all other ranks is called a **broadcast**.

## The MPI_Bcast function in MPI performs a broadcast in one line of code.

## For example, to copy the total sum stored in rank 0 in the variable *sum* to the variable *sum* in all other ranks we use the code:

    // use collective communication to broadcast sum on rank 0 to all ranks
    MPI_Bcast(&sum,1,MPI_LONG_LONG,0,MPI_COMM_WORLD);

## For reference, here is the interface to MPI_Bcast

    int MPI_Bcast( void *buffer, int count, MPI_Datatype datatype, int root, MPI_Comm comm );




## Here is version 4 of our sum code which replaces the parallel message passing code with calls to MPI_Reduce and MPI_Bcast.  Note how much easier the code is to read and understand!

In [1]:
%%writefile mpi_sum_v4.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <mpi.h>

/*
 * Version 4 of the MPI sum: every rank adds up a strided share of 1..N, the
 * partial sums are combined on rank 0 with MPI_Reduce, and the total is then
 * copied to every rank with MPI_Bcast.
 *
 * Usage: mpiexec -n <ranks> ./mpi_sum_v4 N
 * Returns 0 on success, 1 if N was not given on the command line.
 */
int main(int argc, char** argv) {

    MPI_Init (&argc, &argv);

    // MPI_COMM_WORLD is the default communicator that contains all ranks
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    // get N from command line
    if (argc < 2) {
        // print the usage message once and shut MPI down before exiting
        if (rank == 0) {
            printf ("Command usage : %s %s\n",argv[0],"N");
        }
        MPI_Finalize();
        return 1;
    }
    long long N = atoll(argv[1]);

    // start the timer
    double start_time, end_time;
    start_time = MPI_Wtime();

    // calculate the sum (rank r handles r+1, r+1+size, r+1+2*size, ...)
    long long sum = 0;
    for (long long i = 1+rank; i <= N;i+=size) {
        sum += i;
    }

    // use collective communication to reduce the partial sums with result on rank 0
    // (the partial sum is copied first so send and receive buffers are distinct)
    long long rank_sum = sum;
    MPI_Reduce(&rank_sum,&sum,1,MPI_LONG_LONG,MPI_SUM,0,MPI_COMM_WORLD);

    // use collective communication to broadcast sum on rank 0 to all ranks
    MPI_Bcast(&sum,1,MPI_LONG_LONG,0,MPI_COMM_WORLD);

    // stop the timer
    end_time = MPI_Wtime();

    // closed form N*(N+1)/2: halve whichever factor is even so that the
    // integer division is exact for both even and odd N
    long long expected = (N % 2 == 0) ? (N/2)*(N+1) : N*((N+1)/2);

    // print results
    printf ("rank %d (of %d) sum = %lld, N*(N+1)/2 = %lld, elapsed time = %.4f seconds\n",
            rank,size,sum,expected,end_time-start_time);

    MPI_Finalize();
    return 0;
}

Writing mpi_sum_v4.c


## Here are the results of running version 4 on matrix with N equal to 1 million.

    $ mpicc -o mpi_sum_v4 mpi_sum_v4.c
    $ mpiexec -n 4 ./mpi_sum_v4 1000000
    rank 0 (of 4) sum = 500000500000, N*(N+1)/2 = 500000500000, elapsed time = 0.0010 seconds
    rank 1 (of 4) sum = 500000500000, N*(N+1)/2 = 500000500000, elapsed time = 0.0010 seconds
    rank 2 (of 4) sum = 500000500000, N*(N+1)/2 = 500000500000, elapsed time = 0.0010 seconds
    rank 3 (of 4) sum = 500000500000, N*(N+1)/2 = 500000500000, elapsed time = 0.0010 seconds

# Part 4 : Performance of MPI_Reduce

## To test the performance of MPI_Reduce compared to the parallel and sequential message passing codes from part 2 we use the collective communication code:

    int total_sum = 0;
    for (int r = 0; r < rounds; r++) {
        // this rank's number for the round (the send buffer)
        int number = random() % 5;
        // the receive buffer starts out as a copy of this rank's number
        int round_sum = number;
        // sum the numbers from all ranks, with the result delivered to rank 0
        MPI_Reduce(&number, &round_sum, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
        total_sum += round_sum;
    }


## Performance using MPI_Reduce (500000 rounds, 64 ranks on 64 different compute nodes).

    [jasonwil@tinkercliffs2 L24]$ sbatch mpi_reduce.sh 500000 1234
    [jasonwil@tinkercliffs2 L24]$ squeue -u jasonwil
        JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
        2300544  normal_q mpi_redu jasonwil  R       0:02     64 tc[003-004,006,
        009-010,012-013,015-016,018,020,022,024-027,032,035,037,046,049-053,056,
        059,061,066-067,070-071,075,079-087,089-090,095,099-101,104,133-134,
        179-181,196-201,239-240,253-254]
    [jasonwil@tinkercliffs2 L21]$ cat mpi_reduce.out
    elapsed time = 1.1595 seconds
    rounds = 500000, seed = 1234, sum = 64020814


## The runtime when using MPI_Reduce is comparable to that of our hand-coded parallel message passing version.  **This indicates that MPI_Reduce is using parallel message passing!**

## Also note that MPI_Reduce easily handles an arbitrary number of ranks (recall that we assumed that the number of ranks was $2^k$ to simplify our parallel message passing code).