## Exercise 2: Calculate pi

Our goal in this exercise is to practice:
* worksharing constructs: for
* synchronization constructs: critical directive

This is a code example that computes pi. In the exercise below, add a parallel region and a *for* directive to the part that computes pi.

In [10]:
#pragma cling load("libomp.so")
#include <stdio.h>
#include <time.h>
#include <sys/time.h>
#include <omp.h>

/* Integrand 4/(1+A^2) used for the pi integral below.
   The argument is parenthesized so that an expression argument such as
   f(a+b) expands correctly. Note that A is still evaluated twice, so do
   not pass an argument with side effects. */
#define f(A) (4.0/(1.0+(A)*(A)))

/* Serial template: midpoint-rule approximation of pi, timed three ways. */
int num_threads = 4;
omp_set_num_threads(num_threads);

//declarations
const int n = 10000000;   /* number of sub-intervals of [0,1] */
int i;
double w, x, sum, pi;     /* w: interval width, x: midpoint, sum: running total */
clock_t t1, t2;           /* stamps from clock() */
struct timeval tv1, tv2;  /* stamps from gettimeofday() */
double wt1, wt2;          /* stamps from omp_get_wtime() */

/* Report the team size; `single` makes exactly one thread print. */
#   pragma omp parallel
{ 
#     pragma omp single 
  printf("OpenMP-parallel with %1d threads\n", omp_get_num_threads());
} /* end omp parallel: the region ends with an implicit barrier,
     so no explicit barrier directive is needed afterwards */

/* The timezone argument of gettimeofday() is obsolete; pass NULL. */
gettimeofday(&tv1, NULL);
wt1 = omp_get_wtime();
t1 = clock();

/* calculate pi = integral [0..1] 4/(1+x**2) dx */
w = 1.0/n;
sum = 0.0;
for (i = 1; i <= n; i++)
{
    x = w*((double)i-0.5);   /* midpoint of the i-th sub-interval */
    sum = sum+f(x);
}
pi = w*sum;
 
t2 = clock();
wt2 = omp_get_wtime();
gettimeofday(&tv2, NULL);
printf( "computed pi = %24.16g\n", pi );
/* clock() ticks are converted with CLOCKS_PER_SEC rather than a hard-coded constant. */
printf( "CPU time (clock)                = %12.4g sec\n", (double)(t2-t1)/CLOCKS_PER_SEC );
printf( "wall clock time (omp_get_wtime) = %12.4g sec\n", wt2-wt1 );
printf( "wall clock time (gettimeofday)  = %12.4g sec\n", (tv2.tv_sec-tv1.tv_sec) + (tv2.tv_usec-tv1.tv_usec)*1e-6 );

OpenMP-parallel with 4 threads
computed pi =        3.141592653589731
CPU time (clock)                =        0.159 sec
wall clock time (omp_get_wtime) =       0.1995 sec
wall clock time (gettimeofday)  =       0.1995 sec


Is the calculation correct? Run it more than once and try to find the race condition.

Compare your incomplete solution with this incomplete solution:

In [6]:
#pragma cling load("libomp.so")
#include <stdio.h>
#include <time.h>
#include <sys/time.h>
#include <omp.h>

/* Integrand 4/(1+A^2) used for the pi integral below.
   The argument is parenthesized so that an expression argument such as
   f(a+b) expands correctly. Note that A is still evaluated twice, so do
   not pass an argument with side effects. */
#define f(A) (4.0/(1.0+(A)*(A)))

/* Incomplete solution: parallel region + for directive, no data-sharing
   clauses and no synchronization yet. */
int num_threads = 4;
omp_set_num_threads(num_threads);

//declarations
const int n = 10000000;   /* number of sub-intervals of [0,1] */
int i;
double w, x, sum, pi;     /* w: interval width, x: midpoint, sum: running total */
clock_t t1, t2;           /* stamps from clock() */
struct timeval tv1, tv2;  /* stamps from gettimeofday() */
double wt1, wt2;          /* stamps from omp_get_wtime() */

/* Report the team size; `single` makes exactly one thread print. */
#   pragma omp parallel
{ 
#     pragma omp single 
  printf("OpenMP-parallel with %1d threads\n", omp_get_num_threads());
} /* end omp parallel: the region ends with an implicit barrier,
     so no explicit barrier directive is needed afterwards */

/* The timezone argument of gettimeofday() is obsolete; pass NULL. */
gettimeofday(&tv1, NULL);
wt1 = omp_get_wtime();
t1 = clock();

/* calculate pi = integral [0..1] 4/(1+x**2) dx */
w = 1.0/n;
sum = 0.0;
/* NOTE: intentionally left incorrect for the exercise. x and sum are
   declared outside the parallel region and are therefore shared, so the
   threads race on both of them and the printed value of pi is wrong. */
#pragma omp parallel
{
    #pragma omp for
    for (i = 1; i <= n; i++)
    {
        x = w*((double)i-0.5);   /* race: shared x */
        sum = sum+f(x);          /* race: unsynchronized update of shared sum */
    }
} /*end omp parallel*/ 
pi = w*sum;

t2 = clock();
wt2 = omp_get_wtime();
gettimeofday(&tv2, NULL);
printf( "computed pi = %24.16g\n", pi );
/* clock() ticks are converted with CLOCKS_PER_SEC rather than a hard-coded constant. */
printf( "CPU time (clock)                = %12.4g sec\n", (double)(t2-t1)/CLOCKS_PER_SEC );
printf( "wall clock time (omp_get_wtime) = %12.4g sec\n", wt2-wt1 );
printf( "wall clock time (gettimeofday)  = %12.4g sec\n", (tv2.tv_sec-tv1.tv_sec) + (tv2.tv_usec-tv1.tv_usec)*1e-6 );

OpenMP-parallel with 4 threads
computed pi =        1.169188131363253
CPU time (clock)                =      0.03598 sec
wall clock time (omp_get_wtime) =      0.05974 sec
wall clock time (gettimeofday)  =      0.05974 sec


Add a `private(x)` clause. Is the result still incorrect?

Compare with our incomplete solution below:

In [8]:
#pragma cling load("libomp.so")
#include <stdio.h>
#include <time.h>
#include <sys/time.h>
#include <omp.h>

/* Integrand 4/(1+A^2) used for the pi integral below.
   The argument is parenthesized so that an expression argument such as
   f(a+b) expands correctly. Note that A is still evaluated twice, so do
   not pass an argument with side effects. */
#define f(A) (4.0/(1.0+(A)*(A)))

/* Incomplete solution: x is now private, but the update of the shared
   sum is still unsynchronized. */
int num_threads = 4;
omp_set_num_threads(num_threads);

//declarations
const int n = 10000000;   /* number of sub-intervals of [0,1] */
int i;
double w, x, sum, pi;     /* w: interval width, x: midpoint, sum: running total */
clock_t t1, t2;           /* stamps from clock() */
struct timeval tv1, tv2;  /* stamps from gettimeofday() */
double wt1, wt2;          /* stamps from omp_get_wtime() */

/* Report the team size; `single` makes exactly one thread print. */
#   pragma omp parallel
{ 
#     pragma omp single 
  printf("OpenMP-parallel with %1d threads\n", omp_get_num_threads());
} /* end omp parallel: the region ends with an implicit barrier,
     so no explicit barrier directive is needed afterwards */

/* The timezone argument of gettimeofday() is obsolete; pass NULL. */
gettimeofday(&tv1, NULL);
wt1 = omp_get_wtime();
t1 = clock();

/* calculate pi = integral [0..1] 4/(1+x**2) dx */
w = 1.0/n;
sum = 0.0;
/* NOTE: intentionally left incorrect for the exercise. Each thread now
   has its own x, but all threads still read-modify-write the shared sum
   without synchronization, so updates are lost and pi is wrong. */
#pragma omp parallel private(x), shared(w,sum)
{
    #pragma omp for
    for (i = 1; i <= n; i++)
    {
        x = w*((double)i-0.5);   /* per-thread midpoint */
        sum = sum+f(x);          /* race: unsynchronized update of shared sum */
    }
} /*end omp parallel*/ 
pi = w*sum;

t2 = clock();
wt2 = omp_get_wtime();
gettimeofday(&tv2, NULL);
printf( "computed pi = %24.16g\n", pi );
/* clock() ticks are converted with CLOCKS_PER_SEC rather than a hard-coded constant. */
printf( "CPU time (clock)                = %12.4g sec\n", (double)(t2-t1)/CLOCKS_PER_SEC );
printf( "wall clock time (omp_get_wtime) = %12.4g sec\n", wt2-wt1 );
printf( "wall clock time (gettimeofday)  = %12.4g sec\n", (tv2.tv_sec-tv1.tv_sec) + (tv2.tv_usec-tv1.tv_usec)*1e-6 );

OpenMP-parallel with 4 threads
computed pi =       0.9367670240268653
CPU time (clock)                =      0.03355 sec
wall clock time (omp_get_wtime) =      0.05609 sec
wall clock time (gettimeofday)  =      0.05609 sec


Now add a *critical* directive around the sum statement and compile.

After successful execution, you may compare your result with the provided solution:

In [None]:
#pragma cling load("libomp.so")
#include <stdio.h>
#include <time.h>
#include <sys/time.h>
#include <omp.h>

/* Integrand 4/(1+A^2) used for the pi integral below.
   The argument is parenthesized so that an expression argument such as
   f(a+b) expands correctly. Note that A is still evaluated twice, so do
   not pass an argument with side effects. */
#define f(A) (4.0/(1.0+(A)*(A)))

/* Solution with a critical section around every update of the shared sum. */
int num_threads = 4;
omp_set_num_threads(num_threads);

//declarations
const int n = 10000000;   /* number of sub-intervals of [0,1] */
int i;
double w, x, sum, pi;     /* w: interval width, x: midpoint, sum: running total */
clock_t t1, t2;           /* stamps from clock() */
struct timeval tv1, tv2;  /* stamps from gettimeofday() */
double wt1, wt2;          /* stamps from omp_get_wtime() */

/* Report the team size; `single` makes exactly one thread print. */
#   pragma omp parallel
{ 
#     pragma omp single 
  printf("OpenMP-parallel with %1d threads\n", omp_get_num_threads());
} /* end omp parallel: the region ends with an implicit barrier,
     so no explicit barrier directive is needed afterwards */

/* The timezone argument of gettimeofday() is obsolete; pass NULL. */
gettimeofday(&tv1, NULL);
wt1 = omp_get_wtime();
t1 = clock();

/* calculate pi = integral [0..1] 4/(1+x**2) dx */
w = 1.0/n;
sum = 0.0;
#pragma omp parallel private(x), shared(w,sum)
{
    #pragma omp for
    for (i = 1; i <= n; i++)
    {
        x = w*((double)i-0.5);   /* per-thread midpoint */
        /* the critical region inside of a loop with many
           iterations may cause a huge overhead */
        #pragma omp critical
        {
            /* only one thread at a time may update the shared sum */
            sum = sum+f(x);
        }
    }
} /*end omp parallel*/ 
pi = w*sum;

t2 = clock();
wt2 = omp_get_wtime();
gettimeofday(&tv2, NULL);
printf( "computed pi = %24.16g\n", pi );
/* clock() ticks are converted with CLOCKS_PER_SEC rather than a hard-coded constant. */
printf( "CPU time (clock)                = %12.4g sec\n", (double)(t2-t1)/CLOCKS_PER_SEC );
printf( "wall clock time (omp_get_wtime) = %12.4g sec\n", wt2-wt1 );
printf( "wall clock time (gettimeofday)  = %12.4g sec\n", (tv2.tv_sec-tv1.tv_sec) + (tv2.tv_usec-tv1.tv_usec)*1e-6 );

The value of pi is correct but the execution time is too long. How can you optimize your code?

Try moving the *critical* directive outside of the loop and compile.

Compare the CPU time for the template program and CPU time for our solution. Have we significantly optimized our code?

### After successful execution, you may compare your result with the provided solution:

In [14]:
#pragma cling load("libomp.so")
#include <stdio.h>
#include <time.h>
#include <sys/time.h>
#include <omp.h>

/* Integrand 4/(1+A^2) used for the pi integral below.
   The argument is parenthesized so that an expression argument such as
   f(a+b) expands correctly. Note that A is still evaluated twice, so do
   not pass an argument with side effects. */
#define f(A) (4.0/(1.0+(A)*(A)))

/* Optimized solution: each thread accumulates a private partial sum and
   enters the critical section only once to add it to the shared total. */
int num_threads = 4;
omp_set_num_threads(num_threads);

//declarations
const int n = 10000000;   /* number of sub-intervals of [0,1] */
int i;
double w, x, sum, pi;     /* w: interval width, x: midpoint, sum: shared total */
clock_t t1, t2;           /* stamps from clock() */
struct timeval tv1, tv2;  /* stamps from gettimeofday() */
double wt1, wt2;          /* stamps from omp_get_wtime() */

double sum0; //partial sum

/* Report the team size; `single` makes exactly one thread print. */
#   pragma omp parallel
{ 
#     pragma omp single 
  printf("OpenMP-parallel with %1d threads\n", omp_get_num_threads());
} /* end omp parallel: the region ends with an implicit barrier,
     so no explicit barrier directive is needed afterwards */

/* The timezone argument of gettimeofday() is obsolete; pass NULL. */
gettimeofday(&tv1, NULL);
wt1 = omp_get_wtime();
t1 = clock();

/* calculate pi = integral [0..1] 4/(1+x**2) dx */
w = 1.0/n;
sum = 0.0;
#pragma omp parallel private(x,sum0), shared(w,sum)
{
    /* private copies start uninitialized, so zero the partial sum per thread */
    sum0 = 0.0;
    /* nowait: skip the barrier after the loop; each thread goes straight
       on to add its partial sum */
    #pragma omp for nowait
    for (i = 1; i <= n; i++)
    {
        x = w*((double)i-0.5);   /* per-thread midpoint */
        sum0 = sum0+f(x);
    }
    /* one critical section per thread instead of one per iteration */
    # pragma omp critical
    {
        sum = sum+sum0;
    }
} /*end omp parallel*/ 
pi = w*sum;

t2 = clock();
wt2 = omp_get_wtime();
gettimeofday(&tv2, NULL);
printf( "computed pi = %24.16g\n", pi );
/* clock() ticks are converted with CLOCKS_PER_SEC rather than a hard-coded constant. */
printf( "CPU time (clock)                = %12.4g sec\n", (double)(t2-t1)/CLOCKS_PER_SEC );
printf( "wall clock time (omp_get_wtime) = %12.4g sec\n", wt2-wt1 );
printf( "wall clock time (gettimeofday)  = %12.4g sec\n", (tv2.tv_sec-tv1.tv_sec) + (tv2.tv_usec-tv1.tv_usec)*1e-6 );

OpenMP-parallel with 4 threads
computed pi =         3.14159265358967
CPU time (clock)                =      0.03397 sec
wall clock time (omp_get_wtime) =      0.06564 sec
wall clock time (gettimeofday)  =      0.06565 sec
