# Lecture 14 : OpenMP Extreme

In [1]:
# clone a public repo on Github to download some data files
!git clone https://github.com/jasonrwilson/cmda3634_materials.git

Cloning into 'cmda3634_materials'...
remote: Enumerating objects: 199, done.[K
remote: Counting objects: 100% (96/96), done.[K
remote: Compressing objects: 100% (64/64), done.[K
remote: Total 199 (delta 42), reused 65 (delta 32), pack-reused 103[K
Receiving objects: 100% (199/199), 36.17 MiB | 8.75 MiB/s, done.
Resolving deltas: 100% (96/96), done.


In [2]:
# copy the lecture 14 files to our working directory
!cp cmda3634_materials/L14/* .

# Part 1 : Sequential Code for Finding the Extreme Pair

In [3]:
%%writefile extreme.c
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "vec.h"

// Sequential brute-force search for the extreme pair: the two input points
// that are farthest apart.  Reads the number of points and the dimension,
// followed by the points themselves, from stdin, then prints the number of
// pairs checked, the extreme distance and the indices of the extreme pair.
// Returns 0 on success and 1 on any input or allocation error.
int main () {

    // read the number of points and the dimension of each point
    // (the leading %*c discards one character that precedes the two integers)
    int num_points, dim;
    if (scanf("%*c %d %d",&num_points, &dim) != 2) {
        printf ("error reading the number of points and the dimension\n");
        return 1;
    }
    if (num_points < 1 || dim < 1) {
        printf ("error : the number of points and the dimension must be positive\n");
        return 1;
    }

    // Read vectors from stdin and store them in a 2d array (row i = point i).
    // All size and offset arithmetic is done in size_t because num_points*dim
    // can exceed the range of int for large datasets.
    size_t row_len = (size_t)dim;
    size_t max_rows = ((size_t)-1)/sizeof(double)/row_len;
    double* data = NULL;
    if ((size_t)num_points <= max_rows) {
        data = (double*)malloc((size_t)num_points*row_len*sizeof(double));
    }
    if (data == NULL) {
        printf ("malloc return NULL pointer!\n");
        return 1;
    }
    for (int i=0;i<num_points;i++) {
        if (vec_read_stdin(data+(size_t)i*row_len,dim) != dim) {
            printf ("error reading the next point from stdin\n");
            free(data); // do not leak the dataset on the error path
            return 1;
        }
    }

    // Find the extreme pair.  Only squared distances are compared; the square
    // root is taken once when printing.  extreme[] starts at -1 -1 so that the
    // output is well defined when no pair has a positive distance (fewer than
    // two points, or all points identical).
    double max_dist_sq = 0;
    int extreme[2] = {-1,-1};
    long long pairs_checked = 0; // n*(n-1)/2 overflows int beyond ~65k points
    for (int i=0;i<num_points-1;i++) {
        double* point_i = data+(size_t)i*row_len;
        for (int j=i+1;j<num_points;j++) {
            double dist_sq = vec_dist_sq(point_i,data+(size_t)j*row_len,dim);
            pairs_checked += 1;
            if (dist_sq > max_dist_sq) {
                max_dist_sq = dist_sq;
                extreme[0] = i;
                extreme[1] = j;
            }
        }
    }

    // output the results
    printf ("pairs checked = %lld\n",pairs_checked);
    printf ("Extreme Distance = %.2f\n",sqrt(max_dist_sq));
    printf ("Extreme Pair = %d %d\n",extreme[0],extreme[1]);

    // free memory allocated for dataset
    free(data);
    return 0;
}

Writing extreme.c


In [5]:
!gcc -o extreme extreme.c vec.c -lm

In [6]:
!time cat points10k.txt | ./extreme

pairs checked = 49995000
Extreme Distance = 25.52
Extreme Pair = 929 9395

real	0m0.786s
user	0m0.758s
sys	0m0.007s


In [7]:
!time cat mnist1000.txt | ./extreme

pairs checked = 499500
Extreme Distance = 3797.52
Extreme Pair = 121 426

real	0m2.235s
user	0m2.181s
sys	0m0.017s


In [8]:
!time cat mnist2000.txt | ./extreme

pairs checked = 1999000
Extreme Distance = 3928.75
Extreme Pair = 1618 1895

real	0m7.420s
user	0m7.280s
sys	0m0.023s


In [9]:
!time cat mnist10000.txt | ./extreme

pairs checked = 49995000
Extreme Distance = 4097.95
Extreme Pair = 5977 6412

real	2m50.579s
user	2m49.319s
sys	0m0.125s


## Note that finding the extreme pair is an $O(n^2)$ algorithm.  
## For a large dataset (such as the 10,000 MNIST points), the sequential code takes almost 3 minutes.  
## We can substantially reduce the runtime by checking for the extreme pair in parallel.

# Part 2 : OpenMP Version of Extreme

## For version 1 of the OpenMP code we will add a command line argument to read in the number of threads and add OpenMP timing code to the double for loop.  

In [11]:
%%writefile omp_extreme_v1.c
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
#include "vec.h"

// Version 1 of the OpenMP extreme-pair program.  The search itself is still
// sequential; this version only adds a num_threads command line argument and
// times the double for loop with omp_get_wtime.
// Reads the number of points and the dimension, followed by the points
// themselves, from stdin.  Returns 0 on success and 1 on a usage, input or
// allocation error.
int main (int argc, char* argv[]) {

    // get num_threads from the command line
    if (argc < 2) {
        printf ("Command usage : %s %s\n",argv[0],"num_threads");
        return 1;
    }
    // atoi yields 0 for non-numeric text and omp_set_num_threads requires a
    // positive value, so reject anything below 1 up front
    int num_threads = atoi(argv[1]);
    if (num_threads < 1) {
        printf ("error : num_threads must be a positive integer\n");
        return 1;
    }
    omp_set_num_threads(num_threads);

    // read the number of points and the dimension of each point
    // (the leading %*c discards one character that precedes the two integers)
    int num_points, dim;
    if (scanf("%*c %d %d",&num_points, &dim) != 2) {
        printf ("error reading the number of points and the dimension\n");
        return 1;
    }
    if (num_points < 1 || dim < 1) {
        printf ("error : the number of points and the dimension must be positive\n");
        return 1;
    }

    // read vectors from stdin and store them in a 2d array (row i = point i);
    // size and offset arithmetic is done in size_t because num_points*dim can
    // exceed the range of int for large datasets
    size_t row_len = (size_t)dim;
    size_t max_rows = ((size_t)-1)/sizeof(double)/row_len;
    double* data = NULL;
    if ((size_t)num_points <= max_rows) {
        data = (double*)malloc((size_t)num_points*row_len*sizeof(double));
    }
    if (data == NULL) {
        printf ("malloc return NULL pointer!\n");
        return 1;
    }
    for (int i=0;i<num_points;i++) {
        if (vec_read_stdin(data+(size_t)i*row_len,dim) != dim) {
            printf ("error reading the next point from stdin\n");
            free(data); // do not leak the dataset on the error path
            return 1;
        }
    }

    // start the timer (only the search is timed, not the input phase)
    double start_time, end_time;
    start_time = omp_get_wtime();

    // find the extreme pair; extreme[] starts at -1 -1 so that the output is
    // well defined when no pair has a positive distance
    double max_dist_sq = 0;
    int extreme[2] = {-1,-1};
    long long pairs_checked = 0; // n*(n-1)/2 overflows int beyond ~65k points
    for (int i=0;i<num_points-1;i++) {
        double* point_i = data+(size_t)i*row_len;
        for (int j=i+1;j<num_points;j++) {
            double dist_sq = vec_dist_sq(point_i,data+(size_t)j*row_len,dim);
            pairs_checked += 1;
            if (dist_sq > max_dist_sq) {
                max_dist_sq = dist_sq;
                extreme[0] = i;
                extreme[1] = j;
            }
        }
    }

    // stop the timer
    end_time = omp_get_wtime();

    // output the results
    printf ("num_threads = %d, ",num_threads);
    printf ("elapsed time = %.6f seconds\n",end_time-start_time);
    printf ("pairs checked = %lld\n",pairs_checked);
    printf ("Extreme Distance = %.2f\n",sqrt(max_dist_sq));
    printf ("Extreme Pair = %d %d\n",extreme[0],extreme[1]);

    // free memory allocated for dataset
    free(data);
    return 0;
}

Writing omp_extreme_v1.c


## In version 2, we create a parallel region that includes the double for loop.

In [29]:
%%writefile omp_extreme_v2.c
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
#include "vec.h"

// Version 2 of the OpenMP extreme-pair program: the double for loop is placed
// inside a bare "#pragma omp parallel" region.
// Reads the number of points and the dimension, followed by the points
// themselves, from stdin; takes the requested thread count from argv[1].
// NOTE: this version is intentionally incomplete.  The parallel region has no
// work-sharing, so every thread runs the whole double loop, and the shared
// result variables are updated without synchronization (a data race).
int main (int argc, char* argv[]) {

    // get num_threads from the command line
    if (argc < 2) {
        printf ("Command usage : %s %s\n",argv[0],"num_threads");
        return 1;
    }
    int num_threads = atoi(argv[1]);
    omp_set_num_threads(num_threads);

    // read the number of points and the dimension of each point
    // (the leading %*c discards one character that precedes the two integers)
    int num_points, dim;
    if (scanf("%*c %d %d",&num_points, &dim) != 2) {
        printf ("error reading the number of points and the dimension\n");
        return 1;
    }

    // read vectors from stdin and store them in a 2d array
    // (point i occupies data[i*dim] .. data[i*dim+dim-1])
    double* data = (double*)malloc(num_points*dim*sizeof(double));
    if (data == NULL) {
        printf ("malloc return NULL pointer!\n");
        return 1;
    }
    for (int i=0;i<num_points;i++) {
        if (vec_read_stdin(data+i*dim,dim) != dim) {
            printf ("error reading the next point from stdin\n");
            return 1;
        }
    }

    // start the timer
    double start_time, end_time;
    start_time = omp_get_wtime();

    // find the extreme pair
    // (these three variables are declared outside the parallel region, so
    // they are shared by all threads)
    double max_dist_sq = 0;
    int extreme[2];
    int pairs_checked = 0;

    // every thread in the team executes the complete double loop below, and
    // all of them read and write max_dist_sq, extreme and pairs_checked with
    // no synchronization, so the printed results are not reliable
#pragma omp parallel
    {
	    for (int i=0;i<num_points-1;i++) {
	        for (int j=i+1;j<num_points;j++) {
		        double dist_sq = vec_dist_sq(data+i*dim,data+j*dim,dim);
		        pairs_checked += 1;
		        if (dist_sq > max_dist_sq) {
		            max_dist_sq = dist_sq;
		            extreme[0] = i;
		            extreme[1] = j;
		        }
	        }
	    }
    }

    // stop the timer
    end_time = omp_get_wtime();

    // output the results
    printf ("num_threads = %d, ",num_threads);
    printf ("elapsed time = %.6f seconds\n",end_time-start_time);
    printf ("pairs checked = %d\n",pairs_checked);
    printf ("Extreme Distance = %.2f\n",sqrt(max_dist_sq));
    printf ("Extreme Pair = %d %d\n",extreme[0],extreme[1]);

    // free memory allocated for dataset
    free(data);
}

Overwriting omp_extreme_v2.c


## In version 3, we will assign the loop iterations of the outer loop to threads so that each thread gets approximately the same number of loop iterations.

In [33]:
%%writefile omp_extreme_v3.c
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
#include "vec.h"

// Version 3 of the OpenMP extreme-pair program: the iterations of the outer
// loop are divided among the threads, so each pair is visited by one thread.
// Reads the number of points and the dimension, followed by the points
// themselves, from stdin; takes the requested thread count from argv[1].
// NOTE: the shared result variables are still updated without any
// synchronization (a data race), so the printed results are not reliable.
int main (int argc, char* argv[]) {

    // get num_threads from the command line
    if (argc < 2) {
        printf ("Command usage : %s %s\n",argv[0],"num_threads");
        return 1;
    }
    int num_threads = atoi(argv[1]);
    omp_set_num_threads(num_threads);

    // read the number of points and the dimension of each point
    // (the leading %*c discards one character that precedes the two integers)
    int num_points, dim;
    if (scanf("%*c %d %d",&num_points, &dim) != 2) {
        printf ("error reading the number of points and the dimension\n");
        return 1;
    }

    // read vectors from stdin and store them in a 2d array
    // (point i occupies data[i*dim] .. data[i*dim+dim-1])
    double* data = (double*)malloc(num_points*dim*sizeof(double));
    if (data == NULL) {
        printf ("malloc return NULL pointer!\n");
        return 1;
    }
    for (int i=0;i<num_points;i++) {
        if (vec_read_stdin(data+i*dim,dim) != dim) {
            printf ("error reading the next point from stdin\n");
            return 1;
        }
    }

    // start the timer
    double start_time, end_time;
    start_time = omp_get_wtime();

    // find the extreme pair
    // (these three variables are declared outside the parallel region, so
    // they are shared by all threads)
    double max_dist_sq = 0;
    int extreme[2];
    int pairs_checked = 0;

#pragma omp parallel
    {
        // thread t handles outer iterations i = t, t+num_threads, ...
        // NOTE(review): the stride is the requested num_threads, which assumes
        // the runtime really created that many threads; omp_get_num_threads()
        // would report the actual team size -- confirm
        int thread_num = omp_get_thread_num();
	    for (int i=0+thread_num;i<num_points-1;i+=num_threads) {
	        for (int j=i+1;j<num_points;j++) {
		        double dist_sq = vec_dist_sq(data+i*dim,data+j*dim,dim);
		        // unsynchronized updates of shared variables (data race)
		        pairs_checked += 1;
		        if (dist_sq > max_dist_sq) {
		            max_dist_sq = dist_sq;
		            extreme[0] = i;
		            extreme[1] = j;
		        }
	        }
	    }
    }

    // stop the timer
    end_time = omp_get_wtime();

    // output the results
    printf ("num_threads = %d, ",num_threads);
    printf ("elapsed time = %.6f seconds\n",end_time-start_time);
    printf ("pairs checked = %d\n",pairs_checked);
    printf ("Extreme Distance = %.2f\n",sqrt(max_dist_sq));
    printf ("Extreme Pair = %d %d\n",extreme[0],extreme[1]);

    // free memory allocated for dataset
    free(data);
}

Writing omp_extreme_v3.c


## In version 4, we specify the shared variables

In [46]:
%%writefile omp_extreme_v4.c
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
#include "vec.h"

// Version 4 of the OpenMP extreme-pair program: same computation as version 3,
// but the parallel region uses default(none) and lists every variable it
// shares explicitly.
// Reads the number of points and the dimension, followed by the points
// themselves, from stdin; takes the requested thread count from argv[1].
// NOTE: the shared result variables are still updated without any
// synchronization (a data race), so the printed results are not reliable.
int main (int argc, char* argv[]) {

    // get num_threads from the command line
    if (argc < 2) {
        printf ("Command usage : %s %s\n",argv[0],"num_threads");
        return 1;
    }
    int num_threads = atoi(argv[1]);
    omp_set_num_threads(num_threads);

    // read the number of points and the dimension of each point
    // (the leading %*c discards one character that precedes the two integers)
    int num_points, dim;
    if (scanf("%*c %d %d",&num_points, &dim) != 2) {
        printf ("error reading the number of points and the dimension\n");
        return 1;
    }

    // read vectors from stdin and store them in a 2d array
    // (point i occupies data[i*dim] .. data[i*dim+dim-1])
    double* data = (double*)malloc(num_points*dim*sizeof(double));
    if (data == NULL) {
        printf ("malloc return NULL pointer!\n");
        return 1;
    }
    for (int i=0;i<num_points;i++) {
        if (vec_read_stdin(data+i*dim,dim) != dim) {
            printf ("error reading the next point from stdin\n");
            return 1;
        }
    }

    // start the timer
    double start_time, end_time;
    start_time = omp_get_wtime();

    // find the extreme pair
    double max_dist_sq = 0;
    int extreme[2];
    int pairs_checked = 0;

    // default(none) makes the compiler reject any outer variable used in the
    // region that is not listed in a data-sharing clause; data, num_points,
    // dim and num_threads are only read, while max_dist_sq, extreme and
    // pairs_checked are also written by every thread
#pragma omp parallel default(none) shared(data,num_points,dim,max_dist_sq,extreme,num_threads,pairs_checked)
    {
        // thread t handles outer iterations i = t, t+num_threads, ...
        // NOTE(review): the stride is the requested num_threads, which assumes
        // the runtime really created that many threads -- confirm
        int thread_num = omp_get_thread_num();
	    for (int i=0+thread_num;i<num_points-1;i+=num_threads) {
	        for (int j=i+1;j<num_points;j++) {
		        double dist_sq = vec_dist_sq(data+i*dim,data+j*dim,dim);
		        // unsynchronized updates of shared variables (data race)
		        pairs_checked += 1;
		        if (dist_sq > max_dist_sq) {
		            max_dist_sq = dist_sq;
		            extreme[0] = i;
		            extreme[1] = j;
		        }
	        }
	    }
    }

    // stop the timer
    end_time = omp_get_wtime();

    // output the results
    printf ("num_threads = %d, ",num_threads);
    printf ("elapsed time = %.6f seconds\n",end_time-start_time);
    printf ("pairs checked = %d\n",pairs_checked);
    printf ("Extreme Distance = %.2f\n",sqrt(max_dist_sq));
    printf ("Extreme Pair = %d %d\n",extreme[0],extreme[1]);

    // free memory allocated for dataset
    free(data);
}

Overwriting omp_extreme_v4.c


In [50]:
%%writefile omp_extreme_v5.c
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
#include "vec.h"

// Version 5 of the OpenMP extreme-pair program: the outer loop iterations are
// divided among the threads and every update of the shared results is wrapped
// in a critical section, which is entered once per pair checked.
// Reads the number of points and the dimension, followed by the points
// themselves, from stdin; takes the requested thread count from argv[1].
// Returns 0 on success and 1 on a usage, input or allocation error.
int main (int argc, char* argv[]) {

    // get num_threads from the command line
    if (argc < 2) {
        printf ("Command usage : %s %s\n",argv[0],"num_threads");
        return 1;
    }
    // atoi yields 0 for non-numeric text and omp_set_num_threads requires a
    // positive value, so reject anything below 1 up front
    int num_threads = atoi(argv[1]);
    if (num_threads < 1) {
        printf ("error : num_threads must be a positive integer\n");
        return 1;
    }
    omp_set_num_threads(num_threads);

    // read the number of points and the dimension of each point
    // (the leading %*c discards one character that precedes the two integers)
    int num_points, dim;
    if (scanf("%*c %d %d",&num_points, &dim) != 2) {
        printf ("error reading the number of points and the dimension\n");
        return 1;
    }
    if (num_points < 1 || dim < 1) {
        printf ("error : the number of points and the dimension must be positive\n");
        return 1;
    }

    // read vectors from stdin and store them in a 2d array (row i = point i);
    // size and offset arithmetic is done in size_t because num_points*dim can
    // exceed the range of int for large datasets
    size_t row_len = (size_t)dim;
    size_t max_rows = ((size_t)-1)/sizeof(double)/row_len;
    double* data = NULL;
    if ((size_t)num_points <= max_rows) {
        data = (double*)malloc((size_t)num_points*row_len*sizeof(double));
    }
    if (data == NULL) {
        printf ("malloc return NULL pointer!\n");
        return 1;
    }
    for (int i=0;i<num_points;i++) {
        if (vec_read_stdin(data+(size_t)i*row_len,dim) != dim) {
            printf ("error reading the next point from stdin\n");
            free(data); // do not leak the dataset on the error path
            return 1;
        }
    }

    // start the timer
    double start_time, end_time;
    start_time = omp_get_wtime();

    // find the extreme pair; extreme[] starts at -1 -1 so that the output is
    // well defined when no pair has a positive distance
    double max_dist_sq = 0;
    int extreme[2] = {-1,-1};
    long long pairs_checked = 0; // n*(n-1)/2 overflows int beyond ~65k points

#pragma omp parallel default(none) shared(data,num_points,dim,row_len,max_dist_sq,extreme,pairs_checked)
    {
        // thread t handles outer iterations i = t, t+team_size, ...  The
        // stride is the size of the team that is actually running, which may
        // be smaller than the requested num_threads; striding by the request
        // would silently skip rows in that case.
        int thread_num = omp_get_thread_num();
        int team_size = omp_get_num_threads();
        for (int i=thread_num;i<num_points-1;i+=team_size) {
            double* point_i = data+(size_t)i*row_len;
            for (int j=i+1;j<num_points;j++) {
                // the distance computation runs in parallel ...
                double dist_sq = vec_dist_sq(point_i,data+(size_t)j*row_len,dim);
                // ... but the shared results are updated by one thread at a time
#pragma omp critical
                {
                    pairs_checked += 1;
                    if (dist_sq > max_dist_sq) {
                        max_dist_sq = dist_sq;
                        extreme[0] = i;
                        extreme[1] = j;
                    }
                }
            }
        }
    }

    // stop the timer
    end_time = omp_get_wtime();

    // output the results
    printf ("num_threads = %d, ",num_threads);
    printf ("elapsed time = %.6f seconds\n",end_time-start_time);
    printf ("pairs checked = %lld\n",pairs_checked);
    printf ("Extreme Distance = %.2f\n",sqrt(max_dist_sq));
    printf ("Extreme Pair = %d %d\n",extreme[0],extreme[1]);

    // free memory allocated for dataset
    free(data);
    return 0;
}

Writing omp_extreme_v5.c


## In version 6, we use thread-local versions of the read/write shared variables so that each thread only enters the critical region one time.

In [61]:
%%writefile omp_extreme_v6.c
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
#include "vec.h"

// Version 6 of the OpenMP extreme-pair program: each thread searches its share
// of the outer loop iterations using thread-local result variables and merges
// them into the shared results inside a critical section that it enters
// exactly once.
// Reads the number of points and the dimension, followed by the points
// themselves, from stdin; takes the requested thread count from argv[1].
// Returns 0 on success and 1 on a usage, input or allocation error.
int main (int argc, char* argv[]) {

    // get num_threads from the command line
    if (argc < 2) {
        printf ("Command usage : %s %s\n",argv[0],"num_threads");
        return 1;
    }
    // atoi yields 0 for non-numeric text and omp_set_num_threads requires a
    // positive value, so reject anything below 1 up front
    int num_threads = atoi(argv[1]);
    if (num_threads < 1) {
        printf ("error : num_threads must be a positive integer\n");
        return 1;
    }
    omp_set_num_threads(num_threads);

    // read the number of points and the dimension of each point
    // (the leading %*c discards one character that precedes the two integers)
    int num_points, dim;
    if (scanf("%*c %d %d",&num_points, &dim) != 2) {
        printf ("error reading the number of points and the dimension\n");
        return 1;
    }
    if (num_points < 1 || dim < 1) {
        printf ("error : the number of points and the dimension must be positive\n");
        return 1;
    }

    // read vectors from stdin and store them in a 2d array (row i = point i);
    // size and offset arithmetic is done in size_t because num_points*dim can
    // exceed the range of int for large datasets
    size_t row_len = (size_t)dim;
    size_t max_rows = ((size_t)-1)/sizeof(double)/row_len;
    double* data = NULL;
    if ((size_t)num_points <= max_rows) {
        data = (double*)malloc((size_t)num_points*row_len*sizeof(double));
    }
    if (data == NULL) {
        printf ("malloc return NULL pointer!\n");
        return 1;
    }
    for (int i=0;i<num_points;i++) {
        if (vec_read_stdin(data+(size_t)i*row_len,dim) != dim) {
            printf ("error reading the next point from stdin\n");
            free(data); // do not leak the dataset on the error path
            return 1;
        }
    }

    // start the timer
    double start_time, end_time;
    start_time = omp_get_wtime();

    // find the extreme pair; extreme[] starts at -1 -1 so that the output is
    // well defined when no pair has a positive distance
    double max_dist_sq = 0;
    int extreme[2] = {-1,-1};
    long long pairs_checked = 0; // n*(n-1)/2 overflows int beyond ~65k points

#pragma omp parallel default(none) shared(data,num_points,dim,row_len,max_dist_sq,extreme,pairs_checked)
    {
        // thread-local results: updated without any locking in the hot loop
        double thread_max_dist_sq = 0;
        int thread_extreme[2] = {-1,-1};
        long long thread_pairs_checked = 0;

        // thread t handles outer iterations i = t, t+team_size, ...  The
        // stride is the size of the team that is actually running, which may
        // be smaller than the requested num_threads; striding by the request
        // would silently skip rows in that case.
        int thread_num = omp_get_thread_num();
        int team_size = omp_get_num_threads();
        for (int i=thread_num;i<num_points-1;i+=team_size) {
            double* point_i = data+(size_t)i*row_len;
            for (int j=i+1;j<num_points;j++) {
                double dist_sq = vec_dist_sq(point_i,data+(size_t)j*row_len,dim);
                thread_pairs_checked += 1;
                if (dist_sq > thread_max_dist_sq) {
                    thread_max_dist_sq = dist_sq;
                    thread_extreme[0] = i;
                    thread_extreme[1] = j;
                }
            }
        }

        // merge this thread's results into the shared results; each thread
        // enters the critical section only once
#pragma omp critical
        {
            pairs_checked += thread_pairs_checked;
            if (thread_max_dist_sq > max_dist_sq) {
                max_dist_sq = thread_max_dist_sq;
                extreme[0] = thread_extreme[0];
                extreme[1] = thread_extreme[1];
            }
        }
    }

    // stop the timer
    end_time = omp_get_wtime();

    // output the results
    printf ("num_threads = %d, ",num_threads);
    printf ("elapsed time = %.6f seconds\n",end_time-start_time);
    printf ("pairs checked = %lld\n",pairs_checked);
    printf ("Extreme Distance = %.2f\n",sqrt(max_dist_sq));
    printf ("Extreme Pair = %d %d\n",extreme[0],extreme[1]);

    // free memory allocated for dataset
    free(data);
    return 0;
}

Overwriting omp_extreme_v6.c
