# Lecture 14 : OpenMP Wrapup

## Clone the materials repo to access datafiles.

In [1]:
!git clone https://code.vt.edu/jasonwil/cmda3634_materials.git

Cloning into 'cmda3634_materials'...
remote: Enumerating objects: 245, done.[K
remote: Counting objects: 100% (208/208), done.[K
remote: Compressing objects: 100% (201/201), done.[K
remote: Total 245 (delta 69), reused 9 (delta 2), pack-reused 37 (from 1)[K
Receiving objects: 100% (245/245), 37.72 MiB | 9.05 MiB/s, done.
Resolving deltas: 100% (74/74), done.


In [2]:
!cp cmda3634_materials/L14/* .

# Part 1 : OpenMP Standard Deviation

## The standard deviation of the numbers $1 ... N$ is given by
$$\sigma = \sqrt{\frac{N^2-1}{12}}$$

## A sequential code to compute the standard deviation of the numbers $1 ... N$ is given below.  

In [3]:
%%writefile std_dev.c
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

// Sequentially computes the standard deviation of the numbers 1 ... N and
// prints it next to the closed form sqrt((N^2-1)/12).
// Command line : argv[1] = N, which must satisfy 1 <= N <= 4294967295
// Returns 0 on success and 1 if the command line is not usable.
int main (int argc, char** argv) {

    // get N from the command line
    if (argc < 2) {
        printf ("Command usage : %s N\n",argv[0]);
        return 1;
    }
    long long N = atoll(argv[1]);

    // reject values of N for which the computation below is not defined :
    // N < 1 divides by zero when computing the mean and for
    // N > 4294967295 the integer sum 1 + ... + N = N(N+1)/2 no longer
    // fits in a signed 64 bit integer
    const long long max_N = 4294967295LL;
    if (N < 1 || N > max_N) {
        printf ("N must be between 1 and %lld\n",max_N);
        return 1;
    }

    // initialize sums
    double sum_diff_sq = 0;
    long long sum = 0;

    // compute the mean
    for (long long i=1;i<=N;i++) {
        sum += i;
    }
    double mean = 1.0*sum/N;

    // compute the sum of differences squared
    for (long long i=1;i<=N;i++) {
        sum_diff_sq += (i-mean)*(i-mean);
    }

    // compute the standard deviation
    double std_dev = sqrt(sum_diff_sq/N);

    // print the results
    // N*N is evaluated in double precision because the integer product
    // overflows a signed 64 bit integer once N exceeds about 3.03e9
    printf ("computed std dev is %.1lf",std_dev);
    printf (", sqrt((N^2-1)/12) is %.1lf\n",sqrt((1.0*N*N-1)/12.0));

    return 0;
}

Writing std_dev.c


In [4]:
!gcc -o std_dev std_dev.c -lm

In [5]:
!time ./std_dev 1000000000

computed std dev is 288675134.6, sqrt((N^2-1)/12) is 288675134.6

real	0m6.693s
user	0m6.639s
sys	0m0.004s


## In our first OpenMP version we add reading num_threads and timing code.

In [6]:
%%writefile omp_std_dev_v1.c
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>

// Version 1 : the sequential standard deviation computation of the numbers
// 1 ... N with num_threads read from the command line and OpenMP timing
// added.  No parallel region is used yet.
// Command line : argv[1] = N (1 <= N <= 4294967295), argv[2] = num_threads
// Returns 0 on success and 1 if the command line is not usable.
int main (int argc, char** argv) {

    // get N and num_threads from the command line
    if (argc < 3) {
        printf ("Command usage : %s N num_threads\n",argv[0]);
        return 1;
    }
    long long N = atoll(argv[1]);
    int num_threads = atoi(argv[2]);

    // reject values of N for which the computation below is not defined :
    // N < 1 divides by zero when computing the mean and for
    // N > 4294967295 the integer sum 1 + ... + N = N(N+1)/2 no longer
    // fits in a signed 64 bit integer
    const long long max_N = 4294967295LL;
    if (N < 1 || N > max_N) {
        printf ("N must be between 1 and %lld\n",max_N);
        return 1;
    }

    // omp_set_num_threads expects a positive number of threads
    // (atoi returns 0 when the argument is not a number)
    if (num_threads < 1) {
        printf ("num_threads must be at least 1\n");
        return 1;
    }
    omp_set_num_threads(num_threads);

    // start the timer
    double start_time, end_time;
    start_time = omp_get_wtime();

    // initialize sums
    double sum_diff_sq = 0;
    long long sum = 0;

    // compute the mean
    for (long long i=1;i<=N;i++) {
        sum += i;
    }
    double mean = 1.0*sum/N;

    // compute the sum of differences squared
    for (long long i=1;i<=N;i++) {
        sum_diff_sq += (i-mean)*(i-mean);
    }

    // compute the standard deviation
    double std_dev = sqrt(sum_diff_sq/N);

    // stop the timer
    end_time = omp_get_wtime();

    // print the results
    // N*N is evaluated in double precision because the integer product
    // overflows a signed 64 bit integer once N exceeds about 3.03e9
    printf ("num_threads = %d, ",num_threads);
    printf ("elapsed time = %.4f seconds\n",end_time-start_time);
    printf ("computed std dev is %.1lf",std_dev);
    printf (", sqrt((N^2-1)/12) is %.1lf\n",sqrt((1.0*N*N-1)/12.0));

    return 0;
}

Writing omp_std_dev_v1.c


In [7]:
!gcc -o omp_std_dev_v1 omp_std_dev_v1.c -lm -fopenmp

In [8]:
!./omp_std_dev_v1 1000000 1

num_threads = 1, elapsed time = 0.0071 seconds
computed std dev is 288675.1, sqrt((N^2-1)/12) is 288675.1


## For our second version, we add a parallel region to run the two for loops in parallel.  

## Note that we use thread versions of the sums for efficiency and use the atomic operations outside the for loops for thread safety.

In [9]:
%%writefile omp_std_dev_v2.c
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>

// Version 2 : runs both for loops of the standard deviation computation
// inside a single OpenMP parallel region.
// Command line : argv[1] = N, argv[2] = num_threads
// NOTE : the run with 2 threads shown after this cell prints a computed
// standard deviation that does not match sqrt((N^2-1)/12).  The text that
// follows asks the reader to find the reason, so the code is left as is.
int main (int argc, char** argv) {

    // get N and num_threads from the command line
    if (argc < 3) {
        printf ("Command usage : %s N num_threads\n",argv[0]);
        return 1;
    }
    long long N = atoll(argv[1]);
    int num_threads = atoi(argv[2]);
    omp_set_num_threads(num_threads);

    // start the timer
    double start_time, end_time;
    start_time = omp_get_wtime();

    // initialize sums
    // (declared outside the parallel region, so all threads share them)
    double sum_diff_sq = 0;
    long long sum = 0;

#pragma omp parallel
    {
	// compute the sum
	// thread_sum is declared inside the region : one copy per thread
	long long thread_sum = 0;
#pragma omp for
	for (long long i=1;i<=N;i++) {
	    thread_sum += i;
	}
	// add this thread's partial sum to the shared total
	// (one atomic update per thread instead of one per iteration)
#pragma omp atomic
	sum += thread_sum;

	// compute the mean
	// (each thread computes its own copy of mean from the shared sum)
	double mean = 1.0*sum/N;

	// compute the sum of differences squared
	double thread_sum_diff_sq = 0;
#pragma omp for
	for (long long i=1;i<=N;i++) {
	    thread_sum_diff_sq += (i-mean)*(i-mean);
	}
	// add this thread's partial sum to the shared total
#pragma omp atomic
	sum_diff_sq += thread_sum_diff_sq;
    }

    // compute the standard deviation
    double std_dev = sqrt(sum_diff_sq/N);

    // stop the timer
    end_time = omp_get_wtime();

    // print the results
    printf ("num_threads = %d, ",num_threads);
    printf ("elapsed time = %.4f seconds\n",end_time-start_time);
    printf ("computed std dev is %.1lf",std_dev);
    printf (", sqrt((N^2-1)/12) is %.1lf\n",sqrt((N*N-1)/12.0));

}

Writing omp_std_dev_v2.c


In [10]:
!gcc -o omp_std_dev_v2 omp_std_dev_v2.c -lm -fopenmp

In [11]:
!./omp_std_dev_v2 10000000 2

num_threads = 2, elapsed time = 0.0421 seconds
computed std dev is 3498511.7, sqrt((N^2-1)/12) is 2886751.3


## Note that version 2 computes the wrong answer!
## What do you think the problem is with the code?
## Answer:
## Here is version 3 which fixes the issue.

In [12]:
%%writefile omp_std_dev_v3.c
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>

// Version 3 : computes the standard deviation of the numbers 1 ... N inside
// a single OpenMP parallel region, with a barrier between accumulating the
// shared sum and reading it to compute the mean.
// Command line : argv[1] = N (1 <= N <= 4294967295), argv[2] = num_threads
// Returns 0 on success and 1 if the command line is not usable.
int main (int argc, char** argv) {

    // get N and num_threads from the command line
    if (argc < 3) {
        printf ("Command usage : %s N num_threads\n",argv[0]);
        return 1;
    }
    long long N = atoll(argv[1]);
    int num_threads = atoi(argv[2]);

    // reject values of N for which the computation below is not defined :
    // N < 1 divides by zero when computing the mean and for
    // N > 4294967295 the integer sum 1 + ... + N = N(N+1)/2 no longer
    // fits in a signed 64 bit integer
    const long long max_N = 4294967295LL;
    if (N < 1 || N > max_N) {
        printf ("N must be between 1 and %lld\n",max_N);
        return 1;
    }

    // omp_set_num_threads expects a positive number of threads
    // (atoi returns 0 when the argument is not a number)
    if (num_threads < 1) {
        printf ("num_threads must be at least 1\n");
        return 1;
    }
    omp_set_num_threads(num_threads);

    // start the timer
    double start_time, end_time;
    start_time = omp_get_wtime();

    // initialize sums (shared by all threads)
    double sum_diff_sq = 0;
    long long sum = 0;

#pragma omp parallel
    {
        // compute the sum
        // each thread accumulates into its own thread_sum
        long long thread_sum = 0;
#pragma omp for
        for (long long i=1;i<=N;i++) {
            thread_sum += i;
        }
        // add this thread's partial sum to the shared total
#pragma omp atomic
        sum += thread_sum;

        // wait until every thread has added its partial sum to sum
#pragma omp barrier
        // compute the mean
        double mean = 1.0*sum/N;

        // compute the sum of differences squared
        double thread_sum_diff_sq = 0;
#pragma omp for
        for (long long i=1;i<=N;i++) {
            thread_sum_diff_sq += (i-mean)*(i-mean);
        }
        // add this thread's partial sum to the shared total
#pragma omp atomic
        sum_diff_sq += thread_sum_diff_sq;
    }

    // compute the standard deviation
    double std_dev = sqrt(sum_diff_sq/N);

    // stop the timer
    end_time = omp_get_wtime();

    // print the results
    // N*N is evaluated in double precision because the integer product
    // overflows a signed 64 bit integer once N exceeds about 3.03e9
    printf ("num_threads = %d, ",num_threads);
    printf ("elapsed time = %.4f seconds\n",end_time-start_time);
    printf ("computed std dev is %.1lf",std_dev);
    printf (", sqrt((N^2-1)/12) is %.1lf\n",sqrt((1.0*N*N-1)/12.0));

    return 0;
}

Writing omp_std_dev_v3.c


In [13]:
!gcc -o omp_std_dev_v3 omp_std_dev_v3.c -lm -fopenmp

In [14]:
!./omp_std_dev_v3 10000000 2

num_threads = 2, elapsed time = 0.0374 seconds
computed std dev is 2886751.3, sqrt((N^2-1)/12) is 2886751.3


## The issue with version 2 is that a thread could compute the mean using
    // compute the mean
	double mean = 1.0*sum/N;
## **before** all threads have finished updating sum with their partial sums.  
## The solution is to add a **barrier** before the mean calculation.
    #pragma omp barrier    


## No thread will execute code after a **barrier** until all threads reach the barrier.

## In our final version 4 we use two separate parallel regions.  
## Note that there is an implied barrier at the end of a parallel region.
## Using multiple parallel regions incurs the additional overhead of the fork-join phases.  
## If the number of fork-join phases is small the performance penalty of using multiple parallel regions is typically not significant.
## However, be careful when putting parallel regions inside of a for loop that has a lot of iterations as the fork-join overhead can quickly add up (I have seen this happen in my research).

In [15]:
%%writefile omp_std_dev_v4.c
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>

// Version 4 : computes the standard deviation of the numbers 1 ... N using
// two separate OpenMP parallel regions, one per for loop, with the mean
// computed sequentially in between.
// Command line : argv[1] = N (1 <= N <= 4294967295), argv[2] = num_threads
// Returns 0 on success and 1 if the command line is not usable.
int main (int argc, char** argv) {

    // get N and num_threads from the command line
    if (argc < 3) {
        printf ("Command usage : %s N num_threads\n",argv[0]);
        return 1;
    }
    long long N = atoll(argv[1]);
    int num_threads = atoi(argv[2]);

    // reject values of N for which the computation below is not defined :
    // N < 1 divides by zero when computing the mean and for
    // N > 4294967295 the integer sum 1 + ... + N = N(N+1)/2 no longer
    // fits in a signed 64 bit integer
    const long long max_N = 4294967295LL;
    if (N < 1 || N > max_N) {
        printf ("N must be between 1 and %lld\n",max_N);
        return 1;
    }

    // omp_set_num_threads expects a positive number of threads
    // (atoi returns 0 when the argument is not a number)
    if (num_threads < 1) {
        printf ("num_threads must be at least 1\n");
        return 1;
    }
    omp_set_num_threads(num_threads);

    // start the timer
    double start_time, end_time;
    start_time = omp_get_wtime();

    // initialize sums (shared by all threads)
    double sum_diff_sq = 0;
    long long sum = 0;

#pragma omp parallel
    {
        // compute the sum
        // each thread accumulates into its own thread_sum
        long long thread_sum = 0;
#pragma omp for
        for (long long i=1;i<=N;i++) {
            thread_sum += i;
        }
        // add this thread's partial sum to the shared total
#pragma omp atomic
        sum += thread_sum;
    }

    // there is an implied barrier at the end of a parallel region
    // compute the mean
    double mean = 1.0*sum/N;

#pragma omp parallel
    {
        // compute the sum of differences squared
        double thread_sum_diff_sq = 0;
#pragma omp for
        for (long long i=1;i<=N;i++) {
            thread_sum_diff_sq += (i-mean)*(i-mean);
        }
        // add this thread's partial sum to the shared total
#pragma omp atomic
        sum_diff_sq += thread_sum_diff_sq;
    }

    // compute the standard deviation
    double std_dev = sqrt(sum_diff_sq/N);

    // stop the timer
    end_time = omp_get_wtime();

    // print the results
    // N*N is evaluated in double precision because the integer product
    // overflows a signed 64 bit integer once N exceeds about 3.03e9
    printf ("num_threads = %d, ",num_threads);
    printf ("elapsed time = %.4f seconds\n",end_time-start_time);
    printf ("computed std dev is %.1lf",std_dev);
    printf (", sqrt((N^2-1)/12) is %.1lf\n",sqrt((1.0*N*N-1)/12.0));

    return 0;
}

Writing omp_std_dev_v4.c


In [16]:
!gcc -o omp_std_dev_v4 omp_std_dev_v4.c -lm -fopenmp

In [17]:
!./omp_std_dev_v4 10000000 2

num_threads = 2, elapsed time = 0.0381 seconds
computed std dev is 2886751.3, sqrt((N^2-1)/12) is 2886751.3


# Part 2 : OpenMP Nearest Neighbor Classification

## Here is a pure Python script for nearest neighbor classification.

In [18]:
%%writefile nearest.py
import sys
import numpy as np
import gzip
import time # to time part of the code

# sizes of the MNIST data sets read below
NUM_TRAIN = 60000 # number of training images
NUM_TEST = 10000 # number of test images
IMAGE_DIM = 28*28 # each image is 28 x 28 pixels

def read_gz_bytes(filename,header_size,num_bytes):
    """Read num_bytes bytes from a gzip file into a uint8 numpy array.

    The first header_size bytes of the uncompressed data are skipped.
    The file is closed before the function returns.
    """
    with gzip.open(filename,'rb') as f:
        f.read(header_size) # skip file header
        buf = f.read(num_bytes)
    return np.frombuffer(buf,dtype=np.uint8)

# make sure a command line argument for the number of test images is provided
if (len(sys.argv) < 2):
    print ('command usage :',sys.argv[0],'num_test')
    sys.exit(1)
num_test = int(sys.argv[1])

# num_test = 0 would divide by zero when computing the classification rate
# and num_test > NUM_TEST would index past the end of the test images
if (num_test < 1 or num_test > NUM_TEST):
    print ('num_test must be between 1 and',NUM_TEST)
    sys.exit(1)
print ('number of digits to classify =',num_test)

# Opens MNIST training image set and stores it as a 60000 x 784 matrix
# There are 60000 images, each of which is 28 x 28 pixels
# Each image is stored as a 28x28 = 784 dimensional row vector in the matrix
train = read_gz_bytes('train-images-idx3-ubyte.gz',16,NUM_TRAIN*IMAGE_DIM)
train = train.reshape(NUM_TRAIN,IMAGE_DIM)

# Opening and saving the 60000 training labels
train_labels = read_gz_bytes('train-labels-idx1-ubyte.gz',8,NUM_TRAIN)

# Opens MNIST test image set and stores it as a 10000 x 784 matrix
# There are 10000 images, each of which is 28 x 28 pixels
# Each image is stored as a 28x28 = 784 dimensional row vector in the matrix
test = read_gz_bytes('t10k-images-idx3-ubyte.gz',16,NUM_TEST*IMAGE_DIM)
test = test.reshape(NUM_TEST,IMAGE_DIM)

# Opening and saving the 10000 test labels
test_labels = read_gz_bytes('t10k-labels-idx1-ubyte.gz',8,NUM_TEST)

# Allocate space to store the nearest neighbor indices
nearest = np.empty(num_test,dtype='int32')

# time just the nearest neighbor code
start = time.process_time()

# find the index of the training image closest to the test image with the given index
# note that we interpret the image data as 32 bit integers to avoid overflow
for test_index in range(num_test):
    min_dist_sq = np.inf
    for train_index in range(len(train)):
        diff = train[train_index].astype(np.int32)-test[test_index].astype(np.int32)
        dist_sq = np.dot(diff,diff)
        if (dist_sq < min_dist_sq):
            min_dist_sq = dist_sq
            nearest[test_index] = train_index

# record and print elapsed time
elapsed = time.process_time()-start
print ('Time to find nearest neighbors in Python =',np.round(elapsed,4),'seconds')

# count nearest neighbor classification errors
# (a test image is misclassified when its label differs from the label of
# its nearest training image)
labels_diff = test_labels[:num_test] - train_labels[nearest]
classify_errors = np.count_nonzero(labels_diff)
print ('number of classification errors =',classify_errors)
print ('classificiation rate =',(num_test-classify_errors)/num_test)

Writing nearest.py


In [19]:
!time python3 nearest.py 10

number of digits to classify = 10
Time to find nearest neighbors in Python = 5.4012 seconds
number of classification errors = 0
classificiation rate = 1.0

real	0m6.285s
user	0m5.931s
sys	0m0.101s


## Here is a Python script that calls a C function to find the nearest neighbors.

In [20]:
%%writefile nearest_c.py
import sys
import numpy as np
import gzip
import ctypes as ct # for calling C from Python
lib = ct.cdll.LoadLibrary("./nearest.so") # load C nearest function
import time # to time part of the code

# sizes of the MNIST data sets read below
NUM_TRAIN = 60000 # number of training images
NUM_TEST = 10000 # number of test images
IMAGE_DIM = 28*28 # each image is 28 x 28 pixels

def read_gz_bytes(filename,header_size,num_bytes):
    """Read num_bytes bytes from a gzip file into a uint8 numpy array.

    The first header_size bytes of the uncompressed data are skipped.
    The file is closed before the function returns.
    """
    with gzip.open(filename,'rb') as f:
        f.read(header_size) # skip file header
        buf = f.read(num_bytes)
    return np.frombuffer(buf,dtype=np.uint8)

# make sure a command line argument for the number of test images is provided
if (len(sys.argv) < 2):
    print ('command usage :',sys.argv[0],'num_test')
    sys.exit(1)
num_test = int(sys.argv[1])

# num_test = 0 would divide by zero when computing the classification rate
# and num_test > NUM_TEST would make the C code read past the test images
if (num_test < 1 or num_test > NUM_TEST):
    print ('num_test must be between 1 and',NUM_TEST)
    sys.exit(1)
print ('number of digits classified =',num_test)

# Opens MNIST training image set and stores it as a 60000 x 784 matrix
# There are 60000 images, each of which is 28 x 28 pixels
# Each image is stored as a 28x28 = 784 dimensional row vector in the matrix
train = read_gz_bytes('train-images-idx3-ubyte.gz',16,NUM_TRAIN*IMAGE_DIM)
train = train.reshape(NUM_TRAIN,IMAGE_DIM)

# Opening and saving the 60000 training labels
train_labels = read_gz_bytes('train-labels-idx1-ubyte.gz',8,NUM_TRAIN)

# Opens MNIST test image set and stores it as a 10000 x 784 matrix
# There are 10000 images, each of which is 28 x 28 pixels
# Each image is stored as a 28x28 = 784 dimensional row vector in the matrix
test = read_gz_bytes('t10k-images-idx3-ubyte.gz',16,NUM_TEST*IMAGE_DIM)
test = test.reshape(NUM_TEST,IMAGE_DIM)

# Opening and saving the 10000 test labels
test_labels = read_gz_bytes('t10k-labels-idx1-ubyte.gz',8,NUM_TEST)

# Allocate space to store the nearest neighbor indices
nearest = np.empty(num_test,dtype='int32')

# time just the nearest neighbor code
start = time.process_time()

# find the nearest neighbors using C
# the C function is declared void so tell ctypes not to expect a return value
lib.nearest.restype = None
train_cptr = train.ctypes.data_as(ct.POINTER(ct.c_uint8))
test_cptr = test.ctypes.data_as(ct.POINTER(ct.c_uint8))
nearest_cptr = nearest.ctypes.data_as(ct.POINTER(ct.c_int32))
lib.nearest(train_cptr,ct.c_int(len(train)),test_cptr,ct.c_int(num_test),
        nearest_cptr,ct.c_int(len(train[0])))

# record and print elapsed time
elapsed = time.process_time()-start
print ('Time to find nearest neighbors in C =',np.round(elapsed,4),'seconds')

# count nearest neighbor classification errors
# (a test image is misclassified when its label differs from the label of
# its nearest training image)
labels_diff = test_labels[:num_test] - train_labels[nearest]
classify_errors = np.count_nonzero(labels_diff)
print ('number of classification errors =',classify_errors)
print ('classificiation rate =',(num_test-classify_errors)/num_test)

Writing nearest_c.py


## Here is the C code for finding the nearest neighbors.

In [21]:
%%writefile nearest.c
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>

typedef unsigned char byte;

// returns the squared Euclidean distance ||u-v||^2 between two vectors of
// dim bytes each
// every difference is formed and squared as a C int and the total is
// accumulated in a C int to avoid overflow
int vec_dist_sq (byte* u, byte* v, int dim) {
    int total = 0;
    for (int k=0;k<dim;k++) {
        int delta = u[k]-v[k];
        total += delta*delta;
    }
    return total;
}

// for each test vector find the nearest training vector
// for each test vector find the nearest training vector
// train   : num_train vectors of dim bytes each, stored one after another
// test    : num_test vectors of dim bytes each, stored one after another
// nearest : output array with num_test entries; nearest[i] is set to the
//           index of the training vector closest to test vector i (the
//           smallest such index if there is a tie) or to -1 if no training
//           vector was found (for example when num_train <= 0)
void nearest(byte* train, int num_train, byte* test, int num_test, int* nearest, int dim) {
    for (int i=0;i<num_test;i++) {
        // offsets are computed using size_t so that i*dim and j*dim
        // cannot overflow a C int for large data sets
        byte* test_vec = test+(size_t)i*dim;
        int min_dist_sq = INT_MAX;
        int min_index = -1;
        for (int j=0;j<num_train;j++) {
            int dist_sq = vec_dist_sq(test_vec,train+(size_t)j*dim,dim);
            if (dist_sq < min_dist_sq) {
                min_dist_sq = dist_sq;
                min_index = j;
            }
        }
        // store the result once so that nearest[i] is always assigned
        nearest[i] = min_index;
    }
}

Writing nearest.c


## We compile the C code into a shared object library.

In [22]:
!gcc -O3 -march=native -fPIC -shared -o nearest.so nearest.c

## Let's test out the C/Python version

In [23]:
!time python3 nearest_c.py 10000

number of digits classified = 10000
Time to find nearest neighbors in C = 66.1299 seconds
number of classification errors = 309
classificiation rate = 0.9691

real	1m7.205s
user	1m6.614s
sys	0m0.129s


## While the Python/C code is certainly faster it still takes around one minute to classify all 10000 images.

## Let's modify the Python script to call a C function that will use OpenMP to accelerate finding the nearest neighbors.

In [24]:
%%writefile omp_nearest_c.py
import sys
import numpy as np
import gzip
import ctypes as ct # for calling C from Python
lib = ct.cdll.LoadLibrary("./omp_nearest.so") # load OpenMP C nearest function
import time # to time part of the code

# sizes of the MNIST data sets read below
NUM_TRAIN = 60000 # number of training images
NUM_TEST = 10000 # number of test images
IMAGE_DIM = 28*28 # each image is 28 x 28 pixels

def read_gz_bytes(filename,header_size,num_bytes):
    """Read num_bytes bytes from a gzip file into a uint8 numpy array.

    The first header_size bytes of the uncompressed data are skipped.
    The file is closed before the function returns.
    """
    with gzip.open(filename,'rb') as f:
        f.read(header_size) # skip file header
        buf = f.read(num_bytes)
    return np.frombuffer(buf,dtype=np.uint8)

# make sure command line arguments for the number of test images and the
# number of threads are provided
if (len(sys.argv) < 3):
    print ('command usage :',sys.argv[0],'num_test','num_threads')
    sys.exit(1)
num_test = int(sys.argv[1])
num_threads = int(sys.argv[2])

# num_test = 0 would divide by zero when computing the classification rate
# and num_test > NUM_TEST would make the C code read past the test images
if (num_test < 1 or num_test > NUM_TEST):
    print ('num_test must be between 1 and',NUM_TEST)
    sys.exit(1)

# the C code passes num_threads to omp_set_num_threads which expects a
# positive number of threads
if (num_threads < 1):
    print ('num_threads must be at least 1')
    sys.exit(1)
print ('number of threads =',num_threads)
print ('number of digits classified =',num_test)

# Opens MNIST training image set and stores it as a 60000 x 784 matrix
# There are 60000 images, each of which is 28 x 28 pixels
# Each image is stored as a 28x28 = 784 dimensional row vector in the matrix
train = read_gz_bytes('train-images-idx3-ubyte.gz',16,NUM_TRAIN*IMAGE_DIM)
train = train.reshape(NUM_TRAIN,IMAGE_DIM)

# Opening and saving the 60000 training labels
train_labels = read_gz_bytes('train-labels-idx1-ubyte.gz',8,NUM_TRAIN)

# Opens MNIST test image set and stores it as a 10000 x 784 matrix
# There are 10000 images, each of which is 28 x 28 pixels
# Each image is stored as a 28x28 = 784 dimensional row vector in the matrix
test = read_gz_bytes('t10k-images-idx3-ubyte.gz',16,NUM_TEST*IMAGE_DIM)
test = test.reshape(NUM_TEST,IMAGE_DIM)

# Opening and saving the 10000 test labels
test_labels = read_gz_bytes('t10k-labels-idx1-ubyte.gz',8,NUM_TEST)

# Allocate space to store the nearest neighbor indices
nearest = np.empty(num_test,dtype='int32')

# find the nearest neighbors using C and OpenMP
# the C function is declared void so tell ctypes not to expect a return value
# (the elapsed time is printed by the C function itself)
lib.omp_nearest.restype = None
train_cptr = train.ctypes.data_as(ct.POINTER(ct.c_uint8))
test_cptr = test.ctypes.data_as(ct.POINTER(ct.c_uint8))
nearest_cptr = nearest.ctypes.data_as(ct.POINTER(ct.c_int32))
lib.omp_nearest(train_cptr,ct.c_int(len(train)),test_cptr,ct.c_int(num_test),
        nearest_cptr,ct.c_int(len(train[0])),ct.c_int(num_threads))

# count nearest neighbor classification errors
# (a test image is misclassified when its label differs from the label of
# its nearest training image)
labels_diff = test_labels[:num_test] - train_labels[nearest]
classify_errors = np.count_nonzero(labels_diff)
print ('number of classification errors =',classify_errors)
print ('classificiation rate =',(num_test-classify_errors)/num_test)

Writing omp_nearest_c.py


## Here is version 1 of our OpenMP C code.  Note that we just set the number of threads and add timing code.

In [25]:
%%writefile omp_nearest_v1.c
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include <omp.h>

typedef unsigned char byte;

// squared Euclidean distance ||u-v||^2 between two vectors of dim bytes
// the differences are formed and squared as C ints and summed into a
// C int so that the byte values cannot overflow
int vec_dist_sq (byte* u, byte* v, int dim) {
    int sum_sq = 0;
    int k = 0;
    while (k < dim) {
        int diff = u[k]-v[k];
        sum_sq += diff*diff;
        k++;
    }
    return sum_sq;
}

// for each test vector find the nearest training vector
// for each test vector find the nearest training vector
// version 1 : sets the number of OpenMP threads and times the search, but
// the search loop itself is still sequential
// train   : num_train vectors of dim bytes each, stored one after another
// test    : num_test vectors of dim bytes each, stored one after another
// nearest : output array with num_test entries; nearest[i] is set to the
//           index of the training vector closest to test vector i (the
//           smallest such index if there is a tie) or to -1 if no training
//           vector was found (for example when num_train <= 0)
// the elapsed time of the search is printed to stdout
void omp_nearest(byte* train, int num_train, byte* test, int num_test, int* nearest, int dim, int num_threads) {
    omp_set_num_threads(num_threads);

    // start the timer
    double start_time, end_time;
    start_time = omp_get_wtime();

    for (int i=0;i<num_test;i++) {
        // offsets are computed using size_t so that i*dim and j*dim
        // cannot overflow a C int for large data sets
        byte* test_vec = test+(size_t)i*dim;
        int min_dist_sq = INT_MAX;
        int min_index = -1;
        for (int j=0;j<num_train;j++) {
            int dist_sq = vec_dist_sq(test_vec,train+(size_t)j*dim,dim);
            if (dist_sq < min_dist_sq) {
                min_dist_sq = dist_sq;
                min_index = j;
            }
        }
        // store the result once so that nearest[i] is always assigned
        nearest[i] = min_index;
    }

    // stop the timer
    end_time = omp_get_wtime();

    printf ("Time to find nearest neighbors in C with OpenMP = %.2f seconds\n",
        end_time-start_time);
}

Writing omp_nearest_v1.c


## For version 2 we parallelize the loop on line 27.
## In other words, we have each thread classify a subset of the test images.
## Although nearest is a read/write shared array, each thread writes to different entries in the array.  
## Thus, there are no read/write race conditions in our parallel region.
## Because of the simplicity of this example, we can use **#pragma omp parallel for** which combines **#pragma omp parallel** and **#pragma omp for**.
## Here is version 2 which is the completed OpenMP/C code.

In [26]:
%%writefile omp_nearest_v2.c
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include <omp.h>

typedef unsigned char byte;

// computes ||u-v||^2 for two vectors u and v of dim bytes each
// each term (u[i]-v[i])^2 is evaluated as a C int and the running total
// is kept in a C int to avoid overflow
int vec_dist_sq (byte* u, byte* v, int dim) {
    int acc = 0;
    for (int idx=0;idx<dim;idx++) {
        int gap = u[idx]-v[idx];
        acc = acc+gap*gap;
    }
    return acc;
}

// for each test vector find the nearest training vector
// for each test vector find the nearest training vector
// version 2 : the loop over the test vectors is divided among the OpenMP
// threads, so each thread classifies a subset of the test vectors
// train   : num_train vectors of dim bytes each, stored one after another
// test    : num_test vectors of dim bytes each, stored one after another
// nearest : output array with num_test entries; nearest[i] is set to the
//           index of the training vector closest to test vector i (the
//           smallest such index if there is a tie) or to -1 if no training
//           vector was found (for example when num_train <= 0)
// the elapsed time of the search is printed to stdout
void omp_nearest(byte* train, int num_train, byte* test, int num_test, int* nearest, int dim, int num_threads) {
    omp_set_num_threads(num_threads);

    // start the timer
    double start_time, end_time;
    start_time = omp_get_wtime();

    // every variable declared inside the loop body is private to the thread
    // running that iteration and iteration i only writes nearest[i]
#pragma omp parallel for
    for (int i=0;i<num_test;i++) {
        // offsets are computed using size_t so that i*dim and j*dim
        // cannot overflow a C int for large data sets
        byte* test_vec = test+(size_t)i*dim;
        int min_dist_sq = INT_MAX;
        int min_index = -1;
        for (int j=0;j<num_train;j++) {
            int dist_sq = vec_dist_sq(test_vec,train+(size_t)j*dim,dim);
            if (dist_sq < min_dist_sq) {
                min_dist_sq = dist_sq;
                min_index = j;
            }
        }
        // store the result once : nearest[i] is always assigned and the
        // shared array is written a single time per test vector
        nearest[i] = min_index;
    }

    // stop the timer
    end_time = omp_get_wtime();

    printf ("Time to find nearest neighbors in C with OpenMP = %.2f seconds\n",
        end_time-start_time);
}

Writing omp_nearest_v2.c


## Here are the results of running the Python/C/OpenMP version on matrix with 128 threads.
## Note that it takes around one second to classify all 10000 images!
    $ conda activate cmda3634_master
    $ gcc -O3 -march=native -fPIC -shared -o omp_nearest.so omp_nearest.c -fopenmp
    $ python3 omp_nearest_c.py 10000 128
    number of threads = 128
    number of digits classified = 10000
    Time to find nearest neighbors in C with OpenMP = 0.86 seconds
    number of classification errors = 309
    classificiation rate = 0.9691
