# Lecture 19 : GPU Extreme

## Clone the materials repo to access datafiles.

In [4]:
!git clone https://code.vt.edu/jasonwil/cmda3634_materials.git

Cloning into 'cmda3634_materials'...
remote: Enumerating objects: 230, done.[K
remote: Counting objects: 100% (191/191), done.[K
remote: Compressing objects: 100% (168/168), done.[K
remote: Total 230 (delta 83), reused 37 (delta 16), pack-reused 39 (from 1)[K
Receiving objects: 100% (230/230), 47.75 MiB | 8.85 MiB/s, done.
Resolving deltas: 100% (83/83), done.


In [5]:
# copy the lecture 19 files to our working directory
!cp cmda3634_materials/L19/* .
# uncompress the .gz files
!gzip -d *.gz

# Part 1 : Sequential Extreme

In [6]:
%%writefile extreme.c
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>

typedef unsigned char byte;

// Read num_bytes bytes of data from a binary file, skipping a leading header.
//   data        : destination buffer (caller provides room for num_bytes bytes)
//   num_bytes   : number of data bytes expected after the header
//   filename    : path of the binary file to read
//   header_size : number of bytes at the start of the file to skip
// Exits with status 1 if the file cannot be opened.  A short read of either
// the header or the data only prints a warning; data may then be incomplete.
void read_bin (byte* data, int num_bytes, char* filename, int header_size) {
    byte skip_buf[64];
    FILE* fptr;
    size_t num_read = 0;
    // a negative byte count can never be satisfied, so treat it as a read error
    int ok = (num_bytes >= 0);
    // open the binary file for reading
    fptr = fopen(filename,"rb");
    // need to check for null
    if (fptr == 0) {
        printf ("Error opening binary data file %s.\n",filename);
        exit(1);
    }
    // read and discard the header in fixed-size pieces (no variable length
    // array needed) and verify that the whole header was present
    while (ok && header_size > 0) {
        size_t want = sizeof(skip_buf);
        if ((size_t)header_size < want) {
            want = (size_t)header_size;
        }
        if (fread(skip_buf, sizeof(byte), want, fptr) != want) {
            ok = 0;
        }
        header_size -= (int)want;
    }
    // read data
    if (ok) {
        num_read = fread(data, sizeof(byte), (size_t)num_bytes, fptr);
        ok = (num_read == (size_t)num_bytes);
    }
    if (!ok) {
        printf ("Warning : binary data file read error for %s.\n",filename);
    }
    // close the binary file
    fclose(fptr);
}

// Record of the most distant pair of vectors found so far.
typedef struct {
    int max_dist_sq; // largest squared distance seen so far
    int i,j; // indices of the pair that attained max_dist_sq
} extreme_info;

// Return the squared Euclidean distance between the dim-element byte
// vectors u and v (sum over all components of the squared difference).
int vec_dist_sq(byte* u, byte* v, int dim) {
    int total = 0;
    for (int k=0;k<dim;k++) {
        // the bytes are promoted to int, so the difference may be negative
        int delta = u[k]-v[k];
        total += delta*delta;
    }
    return total;
}

// Sequential search for the extreme pair: the two images of the MNIST image
// set whose squared distance is largest.  Prints the number of images, the
// elapsed time of the search, the extreme distance and the extreme pair.
int main (int argc, char** argv) {

    // read in a MNIST image set
    int len = 60000;
    int dim = 784;
    byte* data = (byte*)malloc(len*dim*sizeof(byte));
    // a failed allocation must not be handed to read_bin
    if (data == NULL) {
        printf ("Error allocating memory for %d images.\n",len);
        exit(1);
    }
    char images_file[] = "train-images-idx3-ubyte";
    read_bin(data,len*dim,images_file,16);

    // start the timer
    clock_t start = clock();

    // find the extreme pair (each unordered pair i < j is visited once)
    extreme_info info = { 0, -1, -1 };
    for (int i=0;i<len-1;i++) {
        for (int j=i+1;j<len;j++) {
            int dist_sq = vec_dist_sq(data+i*dim,data+j*dim,dim);
            // strict comparison : on a tie the first pair found is kept
            if (dist_sq > info.max_dist_sq) {
                info.max_dist_sq = dist_sq;
                info.i = i;
                info.j = j;
            }
        }
    }

    // stop the timer
    clock_t stop = clock();
    double elapsed = (double)(stop-start)/CLOCKS_PER_SEC;

    // print results
    printf ("number of images = %d\n",len);
    printf ("elapsed time = %.4f seconds\n",elapsed);
    printf ("extreme distance = %.2f\n",sqrt(info.max_dist_sq));
    printf ("extreme pair = (%d,%d)\n",info.i,info.j);

    // free dynamically allocated memory
    free(data);
    return 0;
}


Overwriting extreme.c


In [7]:
!gcc -O3 -o extreme extreme.c -lm

In [8]:
!./extreme

number of images = 60000
elapsed time = 201.5857 seconds
extreme distance = 4303.32
extreme pair = (26785,59452)


## Note that we are reading in the binary version of the MNIST image file rather than the text version.

## Binary files take up less space than text files and they load faster as well.

# Part 2 : GPU Extreme (Max Distance Only)

## We start by writing a version that just computes the maximum distance squared over all pairs.  

## Note the use of *atomicMax*.

In [6]:
%%writefile gpu_extreme_v1.cu
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>

typedef unsigned char byte;

// Read num_bytes bytes of data from a binary file, skipping a leading header.
//   data        : destination buffer (caller provides room for num_bytes bytes)
//   num_bytes   : number of data bytes expected after the header
//   filename    : path of the binary file to read
//   header_size : number of bytes at the start of the file to skip
// Exits with status 1 if the file cannot be opened.  A short read of either
// the header or the data only prints a warning; data may then be incomplete.
void read_bin (byte* data, int num_bytes, char* filename, int header_size) {
    byte skip_buf[64];
    FILE* fptr;
    size_t num_read = 0;
    // a negative byte count can never be satisfied, so treat it as a read error
    int ok = (num_bytes >= 0);
    // open the binary file for reading
    fptr = fopen(filename,"rb");
    // need to check for null
    if (fptr == 0) {
        printf ("Error opening binary data file %s.\n",filename);
        exit(1);
    }
    // read and discard the header in fixed-size pieces (no variable length
    // array needed) and verify that the whole header was present
    while (ok && header_size > 0) {
        size_t want = sizeof(skip_buf);
        if ((size_t)header_size < want) {
            want = (size_t)header_size;
        }
        if (fread(skip_buf, sizeof(byte), want, fptr) != want) {
            ok = 0;
        }
        header_size -= (int)want;
    }
    // read data
    if (ok) {
        num_read = fread(data, sizeof(byte), (size_t)num_bytes, fptr);
        ok = (num_read == (size_t)num_bytes);
    }
    if (!ok) {
        printf ("Warning : binary data file read error for %s.\n",filename);
    }
    // close the binary file
    fclose(fptr);
}

// Kernel : compute the largest squared distance over all pairs of vectors.
//   data        : len vectors of dim bytes, vector i stored at data[i*dim] onward
//   max_dist_sq : running maximum, updated with atomicMax; the caller must
//                 initialize it before the launch
// Uses only the x dimension of the launch.  Thread t handles the pair
// (t/len, t%len); only pairs with i < j do any work.
// Note : len*len is evaluated with int arithmetic, so it must fit in an int.
__global__ void extremeKernel(byte* data, int len, int dim, int* max_dist_sq) {
    int tid = blockIdx.x*blockDim.x + threadIdx.x;
    // threads beyond the len x len grid of pairs have nothing to do
    if (tid >= len*len) {
        return;
    }
    int i = tid/len;
    int j = tid%len;
    // each unordered pair is handled once, by the thread with i < j
    if (i >= j) {
        return;
    }
    byte* u = data + i*dim;
    byte* v = data + j*dim;
    int sum = 0;
    for (int k=0;k<dim;k++) {
        int diff = u[k]-v[k];
        sum += diff*diff;
    }
    atomicMax(max_dist_sq,sum);
}

// Stop with a readable message if a CUDA runtime call did not succeed.
//   status : value returned by the CUDA runtime call
//   what   : short description of the call, used in the message
static void check_cuda (cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        printf ("CUDA error in %s : %s\n",what,cudaGetErrorString(status));
        exit(1);
    }
}

// Compute the extreme (maximum) distance over all pairs of images of the
// MNIST image set on the GPU and print it together with the elapsed time.
int main (int argc, char** argv) {

    // read in a MNIST image set
    int len = 10000;
    int dim = 784;
    byte* data = (byte*)malloc(len*dim*sizeof(byte));
    // a failed allocation must not be handed to read_bin
    if (data == NULL) {
        printf ("Error allocating memory for %d images.\n",len);
        exit(1);
    }
    char images_file[] = "t10k-images-idx3-ubyte";
    read_bin(data,len*dim,images_file,16);

    // allocate device memory
    byte* d_data;
    int* d_max_dist_sq;
    check_cuda(cudaMalloc(&d_data,len*dim*sizeof(byte)),"cudaMalloc (data)");
    check_cuda(cudaMalloc(&d_max_dist_sq,sizeof(int)),"cudaMalloc (max_dist_sq)");

    // start the timer
    clock_t start = clock();

    // copy data to device
    check_cuda(cudaMemcpy(d_data,data,len*dim*sizeof(byte),cudaMemcpyHostToDevice),
               "cudaMemcpy (host to device)");

    // initialize the device max_dist_sq to 0
    check_cuda(cudaMemset(d_max_dist_sq,0,sizeof(int)),"cudaMemset");

    // launch kernel to compute extreme distance (one thread per ordered pair)
    int B = 256;
    int G = (len*len+B-1)/B;
    extremeKernel <<< G, B >>> (d_data,len,dim,d_max_dist_sq);
    // a kernel launch returns no status, so ask for launch errors explicitly
    check_cuda(cudaGetLastError(),"extremeKernel launch");

    // copy max_dist_sq from device to host
    // (errors raised while the kernel was running are reported by this call)
    int max_dist_sq = 0;
    check_cuda(cudaMemcpy(&max_dist_sq,d_max_dist_sq,sizeof(int),cudaMemcpyDeviceToHost),
               "cudaMemcpy (device to host)");

    // stop the timer
    clock_t stop = clock();
    double elapsed = (double)(stop-start)/CLOCKS_PER_SEC;

    // print results
    printf ("number of images = %d\n",len);
    printf ("elapsed time = %.4f seconds\n",elapsed);
    printf ("extreme distance = %.2f\n",sqrt((double)max_dist_sq));

    // free dynamically allocated memory
    free(data);
    cudaFree(d_data);
    cudaFree(d_max_dist_sq);
    return 0;
}


Overwriting gpu_extreme_v1.cu


In [7]:
!nvcc -arch=sm_75 -o gpu_extreme_v1 gpu_extreme_v1.cu

In [8]:
!./gpu_extreme_v1

number of images = 10000
elapsed time = 0.6890 seconds
extreme distance = 4097.95


# Part 3 : GPU Extreme (Max Distance and Extreme Pair)

## In CUDA there is no equivalent to an OpenMP critical region.

## Thus, we frequently have to get creative to get the most out of the GPU atomics.

## In order to do an *atomic update* of the triple

$$(max\_dist\_sq, i, j)$$

## we pack the three values $max\_dist\_sq$, $i$, and $j$ into an *unsigned long long* which is 64 bits.  

## We use 32 of the 64 bits to store $max\_dist\_sq$ and 16 bits each to store $i$ and $j$ (so the indices $i$ and $j$ must be less than $2^{16} = 65536$).

## We put $max\_dist\_sq$ in the high 32 bits so that the *atomicMax* will still work as expected.  

## The extreme pair $(i,j)$ is tucked into the low 32 bits and will not affect the *atomicMax* calculation (unless there is a tie).

## Note that there is a function to *compress* a triple of extreme info into an unsigned long long and a function to *expand* an unsigned long long into a triple of extreme info.  

## Since the *compress* function runs on the device we have to declare the function as:

    __device__ uint64 extreme_info_compress(int dist_sq, int i, int j) {

In [4]:
%%writefile gpu_extreme_v2.cu
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>

typedef unsigned char byte;
typedef unsigned long long uint64;

// Read num_bytes bytes of data from a binary file, skipping a leading header.
//   data        : destination buffer (caller provides room for num_bytes bytes)
//   num_bytes   : number of data bytes expected after the header
//   filename    : path of the binary file to read
//   header_size : number of bytes at the start of the file to skip
// Exits with status 1 if the file cannot be opened.  A short read of either
// the header or the data only prints a warning; data may then be incomplete.
void read_bin (byte* data, int num_bytes, char* filename, int header_size) {
    byte skip_buf[64];
    FILE* fptr;
    size_t num_read = 0;
    // a negative byte count can never be satisfied, so treat it as a read error
    int ok = (num_bytes >= 0);
    // open the binary file for reading
    fptr = fopen(filename,"rb");
    // need to check for null
    if (fptr == 0) {
        printf ("Error opening binary data file %s.\n",filename);
        exit(1);
    }
    // read and discard the header in fixed-size pieces (no variable length
    // array needed) and verify that the whole header was present
    while (ok && header_size > 0) {
        size_t want = sizeof(skip_buf);
        if ((size_t)header_size < want) {
            want = (size_t)header_size;
        }
        if (fread(skip_buf, sizeof(byte), want, fptr) != want) {
            ok = 0;
        }
        header_size -= (int)want;
    }
    // read data
    if (ok) {
        num_read = fread(data, sizeof(byte), (size_t)num_bytes, fptr);
        ok = (num_read == (size_t)num_bytes);
    }
    if (!ok) {
        printf ("Warning : binary data file read error for %s.\n",filename);
    }
    // close the binary file
    fclose(fptr);
}

#define TWO_POW_16 65536
// Unpack a 64-bit extreme info word into its three fields.
//   info    : packed word, dist_sq in bits 32-63, i in bits 16-31, j in bits 0-15
//   dist_sq : receives the high 32 bits
//   i, j    : receive the two 16-bit index fields
void extreme_info_expand(uint64 info, int* dist_sq, int* i, int* j) {
    const uint64 low16 = TWO_POW_16 - 1;
    // stores are made in the order j, i, dist_sq
    *j = info & low16;
    *i = (info >> 16) & low16;
    *dist_sq = info >> 32;
}

// Pack (dist_sq, i, j) into one 64-bit word : dist_sq is shifted up by 32
// bits, i by 16 bits and j is added at the bottom.  With dist_sq in the high
// bits, comparing two packed words compares the distances first.
// The fields do not overlap as long as 0 <= i, j < 65536.
__device__ uint64 extreme_info_compress(int dist_sq, int i, int j) {
    uint64 high = (uint64)dist_sq << 32;
    uint64 mid = (uint64)i << 16;
    uint64 low = (uint64)j;
    return high + mid + low;
}

// Kernel : find the largest squared distance over all pairs of vectors
// together with the pair that attains it.
//   data     : len vectors of dim bytes, vector i stored at data[i*dim] onward
//   max_info : packed (dist_sq, i, j) running maximum, updated with atomicMax;
//              the caller must initialize it before the launch
// Uses only the x dimension of the launch.  Thread t handles the pair
// (t/len, t%len); only pairs with i < j do any work.
// Note : len*len is evaluated with int arithmetic, so it must fit in an int.
__global__ void extremeKernel(byte* data, int len, int dim, uint64* max_info) {
    int tid = blockIdx.x*blockDim.x + threadIdx.x;
    // threads beyond the len x len grid of pairs have nothing to do
    if (tid >= len*len) {
        return;
    }
    int i = tid/len;
    int j = tid%len;
    // each unordered pair is handled once, by the thread with i < j
    if (i >= j) {
        return;
    }
    byte* u = data + i*dim;
    byte* v = data + j*dim;
    int sum = 0;
    for (int k=0;k<dim;k++) {
        int diff = u[k]-v[k];
        sum += diff*diff;
    }
    // a single 64-bit atomicMax updates distance and pair together
    atomicMax(max_info,extreme_info_compress(sum,i,j));
}

// Stop with a readable message if a CUDA runtime call did not succeed.
//   status : value returned by the CUDA runtime call
//   what   : short description of the call, used in the message
static void check_cuda (cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        printf ("CUDA error in %s : %s\n",what,cudaGetErrorString(status));
        exit(1);
    }
}

// Compute the extreme distance and the extreme pair over all pairs of images
// of the MNIST image set on the GPU and print them with the elapsed time.
int main (int argc, char** argv) {

    // read in a MNIST image set
    int len = 10000;
    int dim = 784;
    byte* data = (byte*)malloc(len*dim*sizeof(byte));
    // a failed allocation must not be handed to read_bin
    if (data == NULL) {
        printf ("Error allocating memory for %d images.\n",len);
        exit(1);
    }
    char images_file[] = "t10k-images-idx3-ubyte";
    read_bin(data,len*dim,images_file,16);

    // allocate device memory
    byte* d_data;
    uint64* d_max_info;
    check_cuda(cudaMalloc(&d_data,len*dim*sizeof(byte)),"cudaMalloc (data)");
    check_cuda(cudaMalloc(&d_max_info,sizeof(uint64)),"cudaMalloc (max_info)");

    // start the timer
    clock_t start = clock();

    // copy data to device
    check_cuda(cudaMemcpy(d_data,data,len*dim*sizeof(byte),cudaMemcpyHostToDevice),
               "cudaMemcpy (host to device)");

    // initialize the device max_info to 0
    check_cuda(cudaMemset(d_max_info,0,sizeof(uint64)),"cudaMemset");

    // launch kernel to compute extreme distance (one thread per ordered pair)
    int B = 256;
    int G = (len*len+B-1)/B;
    printf ("G = %d\n",G);
    extremeKernel <<< G, B >>> (d_data,len,dim,d_max_info);
    // a kernel launch returns no status, so ask for launch errors explicitly
    check_cuda(cudaGetLastError(),"extremeKernel launch");

    // copy max_info from device to host
    // (errors raised while the kernel was running are reported by this call)
    uint64 max_info = 0;
    check_cuda(cudaMemcpy(&max_info,d_max_info,sizeof(uint64),cudaMemcpyDeviceToHost),
               "cudaMemcpy (device to host)");

    // expand max_info
    int max_dist_sq, i, j;
    extreme_info_expand(max_info,&max_dist_sq,&i,&j);

    // stop the timer
    clock_t stop = clock();
    double elapsed = (double)(stop-start)/CLOCKS_PER_SEC;

    // print results
    printf ("number of images = %d\n",len);
    printf ("elapsed time = %.4f seconds\n",elapsed);
    printf ("extreme distance = %.2f\n",sqrt((double)max_dist_sq));
    printf ("extreme pair = (%d,%d)\n",i,j);

    // free dynamically allocated memory
    free(data);
    cudaFree(d_data);
    cudaFree(d_max_info);
    return 0;
}


Writing gpu_extreme_v2.cu


In [5]:
!nvcc -arch=sm_75 -o gpu_extreme_v2 gpu_extreme_v2.cu

In [6]:
!./gpu_extreme_v2

G = 390625
number of images = 10000
elapsed time = 0.7950 seconds
extreme distance = 4097.95
extreme pair = (5977,6412)


# Part 4 : GPU Extreme (Column Major Order)

## It is frequently better to use matrices stored in column major order in CUDA.

## Ideally, consecutive threads in a warp will read consecutive memory locations when accessing memory.  

## Suppose each thread in a warp is reading consecutive rows of a matrix.  

## If the matrix is stored in *row major order* then consecutive threads in a warp are reading values that are stored far apart in memory.  

## However if the matrix is stored in *column major order* then consecutive threads in a warp are reading values that are stored next to each other in memory.

In [3]:
%%writefile gpu_extreme_v3.cu
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>

typedef unsigned char byte;
typedef unsigned long long uint64;

// Read num_bytes bytes of data from a binary file, skipping a leading header.
//   data        : destination buffer (caller provides room for num_bytes bytes)
//   num_bytes   : number of data bytes expected after the header
//   filename    : path of the binary file to read
//   header_size : number of bytes at the start of the file to skip
// Exits with status 1 if the file cannot be opened.  A short read of either
// the header or the data only prints a warning; data may then be incomplete.
void read_bin (byte* data, int num_bytes, char* filename, int header_size) {
    byte skip_buf[64];
    FILE* fptr;
    size_t num_read = 0;
    // a negative byte count can never be satisfied, so treat it as a read error
    int ok = (num_bytes >= 0);
    // open the binary file for reading
    fptr = fopen(filename,"rb");
    // need to check for null
    if (fptr == 0) {
        printf ("Error opening binary data file %s.\n",filename);
        exit(1);
    }
    // read and discard the header in fixed-size pieces (no variable length
    // array needed) and verify that the whole header was present
    while (ok && header_size > 0) {
        size_t want = sizeof(skip_buf);
        if ((size_t)header_size < want) {
            want = (size_t)header_size;
        }
        if (fread(skip_buf, sizeof(byte), want, fptr) != want) {
            ok = 0;
        }
        header_size -= (int)want;
    }
    // read data
    if (ok) {
        num_read = fread(data, sizeof(byte), (size_t)num_bytes, fptr);
        ok = (num_read == (size_t)num_bytes);
    }
    if (!ok) {
        printf ("Warning : binary data file read error for %s.\n",filename);
    }
    // close the binary file
    fclose(fptr);
}

#define TWO_POW_16 65536
// Unpack a 64-bit extreme info word into its three fields.
//   info    : packed word, dist_sq in bits 32-63, i in bits 16-31, j in bits 0-15
//   dist_sq : receives the high 32 bits
//   i, j    : receive the two 16-bit index fields
void extreme_info_expand(uint64 info, int* dist_sq, int* i, int* j) {
    const uint64 low16 = TWO_POW_16 - 1;
    // stores are made in the order j, i, dist_sq
    *j = info & low16;
    *i = (info >> 16) & low16;
    *dist_sq = info >> 32;
}

// Pack (dist_sq, i, j) into one 64-bit word : dist_sq is shifted up by 32
// bits, i by 16 bits and j is added at the bottom.  With dist_sq in the high
// bits, comparing two packed words compares the distances first.
// The fields do not overlap as long as 0 <= i, j < 65536.
__device__ uint64 extreme_info_compress(int dist_sq, int i, int j) {
    uint64 high = (uint64)dist_sq << 32;
    uint64 mid = (uint64)i << 16;
    uint64 low = (uint64)j;
    return high + mid + low;
}

// Kernel : find the largest squared distance over all pairs of vectors
// together with the pair that attains it.
//   data     : component k of vector i is stored at data[k*len+i], so the
//              same component of consecutive vectors is adjacent in memory
//   max_info : packed (dist_sq, i, j) running maximum, updated with atomicMax;
//              the caller must initialize it before the launch
// Uses only the x dimension of the launch.  Thread t handles the pair
// (t/len, t%len); only pairs with i < j do any work.
// Note : len*len is evaluated with int arithmetic, so it must fit in an int.
__global__ void extremeKernel(byte* data, int len, int dim, uint64* max_info) {
    int tid = blockIdx.x*blockDim.x + threadIdx.x;
    // threads beyond the len x len grid of pairs have nothing to do
    if (tid >= len*len) {
        return;
    }
    int i = tid/len;
    int j = tid%len;
    // each unordered pair is handled once, by the thread with i < j
    if (i >= j) {
        return;
    }
    int sum = 0;
    for (int k=0;k<dim;k++) {
        // start of the k-th component of all vectors
        byte* component = data + k*len;
        int diff = component[i]-component[j];
        sum += diff*diff;
    }
    // a single 64-bit atomicMax updates distance and pair together
    atomicMax(max_info,extreme_info_compress(sum,i,j));
}

// Stop with a readable message if a CUDA runtime call did not succeed.
//   status : value returned by the CUDA runtime call
//   what   : short description of the call, used in the message
static void check_cuda (cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        printf ("CUDA error in %s : %s\n",what,cudaGetErrorString(status));
        exit(1);
    }
}

// Compute the extreme distance and the extreme pair over all pairs of images
// of the MNIST image set on the GPU and print them with the elapsed time.
// The kernel indexes the image data as data[k*len+i].
int main (int argc, char** argv) {

    // read in a MNIST image set
    int len = 10000;
    int dim = 784;
    byte* data = (byte*)malloc(len*dim*sizeof(byte));
    // a failed allocation must not be handed to read_bin
    if (data == NULL) {
        printf ("Error allocating memory for %d images.\n",len);
        exit(1);
    }
    char images_file[] = "t10k-images-idx3-ubyte-c";
    read_bin(data,len*dim,images_file,16);

    // allocate device memory
    byte* d_data;
    uint64* d_max_info;
    check_cuda(cudaMalloc(&d_data,len*dim*sizeof(byte)),"cudaMalloc (data)");
    check_cuda(cudaMalloc(&d_max_info,sizeof(uint64)),"cudaMalloc (max_info)");

    // start the timer
    clock_t start = clock();

    // copy data to device
    check_cuda(cudaMemcpy(d_data,data,len*dim*sizeof(byte),cudaMemcpyHostToDevice),
               "cudaMemcpy (host to device)");

    // initialize the device max_info to 0
    check_cuda(cudaMemset(d_max_info,0,sizeof(uint64)),"cudaMemset");

    // launch kernel to compute extreme distance (one thread per ordered pair)
    int B = 256;
    int G = (len*len+B-1)/B;
    printf ("G = %d\n",G);
    extremeKernel <<< G, B >>> (d_data,len,dim,d_max_info);
    // a kernel launch returns no status, so ask for launch errors explicitly
    check_cuda(cudaGetLastError(),"extremeKernel launch");

    // copy max_info from device to host
    // (errors raised while the kernel was running are reported by this call)
    uint64 max_info = 0;
    check_cuda(cudaMemcpy(&max_info,d_max_info,sizeof(uint64),cudaMemcpyDeviceToHost),
               "cudaMemcpy (device to host)");

    // expand max_info
    int max_dist_sq, i, j;
    extreme_info_expand(max_info,&max_dist_sq,&i,&j);

    // stop the timer
    clock_t stop = clock();
    double elapsed = (double)(stop-start)/CLOCKS_PER_SEC;

    // print results
    printf ("number of images = %d\n",len);
    printf ("elapsed time = %.4f seconds\n",elapsed);
    printf ("extreme distance = %.2f\n",sqrt((double)max_dist_sq));
    printf ("extreme pair = (%d,%d)\n",i,j);

    // free dynamically allocated memory
    free(data);
    cudaFree(d_data);
    cudaFree(d_max_info);
    return 0;
}


Writing gpu_extreme_v3.cu


In [4]:
!nvcc -arch=sm_75 -o gpu_extreme_v3 gpu_extreme_v3.cu

In [5]:
!./gpu_extreme_v3

G = 390625
number of images = 10000
elapsed time = 0.2506 seconds
extreme distance = 4097.95
extreme pair = (5977,6412)


# Part 5 : GPU Extreme (60000 images)

## When running on a file with 60000 images, there are 3.6 billion threads, which is too large a number to store in a 32-bit C int.  

## Thus we have to change some parts of the code to avoid overflow.  

## Here are the necessary changes:

    long long thread_num = (long long)blockIdx.x*blockDim.x + threadIdx.x;

    if (thread_num < (long long)len*len) {

    int G = ((long long)len*len+B-1)/B;

In [3]:
%%writefile gpu_extreme_v4.cu
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>

typedef unsigned char byte;
typedef unsigned long long uint64;

// Read num_bytes bytes of data from a binary file, skipping a leading header.
//   data        : destination buffer (caller provides room for num_bytes bytes)
//   num_bytes   : number of data bytes expected after the header
//   filename    : path of the binary file to read
//   header_size : number of bytes at the start of the file to skip
// Exits with status 1 if the file cannot be opened.  A short read of either
// the header or the data only prints a warning; data may then be incomplete.
void read_bin (byte* data, int num_bytes, char* filename, int header_size) {
    byte skip_buf[64];
    FILE* fptr;
    size_t num_read = 0;
    // a negative byte count can never be satisfied, so treat it as a read error
    int ok = (num_bytes >= 0);
    // open the binary file for reading
    fptr = fopen(filename,"rb");
    // need to check for null
    if (fptr == 0) {
        printf ("Error opening binary data file %s.\n",filename);
        exit(1);
    }
    // read and discard the header in fixed-size pieces (no variable length
    // array needed) and verify that the whole header was present
    while (ok && header_size > 0) {
        size_t want = sizeof(skip_buf);
        if ((size_t)header_size < want) {
            want = (size_t)header_size;
        }
        if (fread(skip_buf, sizeof(byte), want, fptr) != want) {
            ok = 0;
        }
        header_size -= (int)want;
    }
    // read data
    if (ok) {
        num_read = fread(data, sizeof(byte), (size_t)num_bytes, fptr);
        ok = (num_read == (size_t)num_bytes);
    }
    if (!ok) {
        printf ("Warning : binary data file read error for %s.\n",filename);
    }
    // close the binary file
    fclose(fptr);
}

#define TWO_POW_16 65536
// Unpack a 64-bit extreme info word into its three fields.
//   info    : packed word, dist_sq in bits 32-63, i in bits 16-31, j in bits 0-15
//   dist_sq : receives the high 32 bits
//   i, j    : receive the two 16-bit index fields
void extreme_info_expand(uint64 info, int* dist_sq, int* i, int* j) {
    const uint64 low16 = TWO_POW_16 - 1;
    // stores are made in the order j, i, dist_sq
    *j = info & low16;
    *i = (info >> 16) & low16;
    *dist_sq = info >> 32;
}

// Pack (dist_sq, i, j) into one 64-bit word : dist_sq is shifted up by 32
// bits, i by 16 bits and j is added at the bottom.  With dist_sq in the high
// bits, comparing two packed words compares the distances first.
// The fields do not overlap as long as 0 <= i, j < 65536.
__device__ uint64 extreme_info_compress(int dist_sq, int i, int j) {
    uint64 high = (uint64)dist_sq << 32;
    uint64 mid = (uint64)i << 16;
    uint64 low = (uint64)j;
    return high + mid + low;
}

// Kernel : find the largest squared distance over all pairs of vectors
// together with the pair that attains it.
//   data     : component k of vector i is stored at data[k*len+i], so the
//              same component of consecutive vectors is adjacent in memory
//   max_info : packed (dist_sq, i, j) running maximum, updated with atomicMax;
//              the caller must initialize it before the launch
// Uses only the x dimension of the launch.  Thread t handles the pair
// (t/len, t%len); only pairs with i < j do any work.
// The global thread number and the pair count len*len are computed as
// long long so that they do not overflow when len*len exceeds the int range.
__global__ void extremeKernel(byte* data, int len, int dim, uint64* max_info) {
    long long tid = (long long)blockIdx.x*blockDim.x + threadIdx.x;
    long long num_pairs = (long long)len*len;
    // threads beyond the len x len grid of pairs have nothing to do
    if (tid >= num_pairs) {
        return;
    }
    int i = tid/len;
    int j = tid%len;
    // each unordered pair is handled once, by the thread with i < j
    if (i >= j) {
        return;
    }
    int sum = 0;
    for (int k=0;k<dim;k++) {
        // start of the k-th component of all vectors
        byte* component = data + k*len;
        int diff = component[i]-component[j];
        sum += diff*diff;
    }
    // a single 64-bit atomicMax updates distance and pair together
    atomicMax(max_info,extreme_info_compress(sum,i,j));
}

// Stop with a readable message if a CUDA runtime call did not succeed.
//   status : value returned by the CUDA runtime call
//   what   : short description of the call, used in the message
static void check_cuda (cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        printf ("CUDA error in %s : %s\n",what,cudaGetErrorString(status));
        exit(1);
    }
}

// Compute the extreme distance and the extreme pair over all pairs of images
// of the 60000 image MNIST set on the GPU and print them with the elapsed
// time.  The kernel indexes the image data as data[k*len+i].
int main (int argc, char** argv) {

    // read in a MNIST image set
    int len = 60000;
    int dim = 784;
    byte* data = (byte*)malloc(len*dim*sizeof(byte));
    // a failed allocation must not be handed to read_bin
    if (data == NULL) {
        printf ("Error allocating memory for %d images.\n",len);
        exit(1);
    }
    char images_file[] = "train-images-idx3-ubyte-c";
    read_bin(data,len*dim,images_file,16);

    // allocate device memory
    byte* d_data;
    uint64* d_max_info;
    check_cuda(cudaMalloc(&d_data,len*dim*sizeof(byte)),"cudaMalloc (data)");
    check_cuda(cudaMalloc(&d_max_info,sizeof(uint64)),"cudaMalloc (max_info)");

    // start the timer
    clock_t start = clock();

    // copy data to device
    check_cuda(cudaMemcpy(d_data,data,len*dim*sizeof(byte),cudaMemcpyHostToDevice),
               "cudaMemcpy (host to device)");

    // initialize the device max_info to 0
    check_cuda(cudaMemset(d_max_info,0,sizeof(uint64)),"cudaMemset");

    // launch kernel to compute extreme distance (one thread per ordered pair)
    // len*len does not fit in an int here, so the count is formed as long long
    int B = 256;
    int G = ((long long)len*len+B-1)/B;
    printf ("G = %d\n",G);
    extremeKernel <<< G, B >>> (d_data,len,dim,d_max_info);
    // a kernel launch returns no status, so ask for launch errors explicitly
    check_cuda(cudaGetLastError(),"extremeKernel launch");

    // copy max_info from device to host
    // (errors raised while the kernel was running are reported by this call)
    uint64 max_info = 0;
    check_cuda(cudaMemcpy(&max_info,d_max_info,sizeof(uint64),cudaMemcpyDeviceToHost),
               "cudaMemcpy (device to host)");

    // expand max_info
    int max_dist_sq, i, j;
    extreme_info_expand(max_info,&max_dist_sq,&i,&j);

    // stop the timer
    clock_t stop = clock();
    double elapsed = (double)(stop-start)/CLOCKS_PER_SEC;

    // print results
    printf ("number of images = %d\n",len);
    printf ("elapsed time = %.4f seconds\n",elapsed);
    printf ("extreme distance = %.2f\n",sqrt((double)max_dist_sq));
    printf ("extreme pair = (%d,%d)\n",i,j);

    // free dynamically allocated memory
    free(data);
    cudaFree(d_data);
    cudaFree(d_max_info);
    return 0;
}


Writing gpu_extreme_v4.cu


In [4]:
!nvcc -arch=sm_75 -o gpu_extreme_v4 gpu_extreme_v4.cu

In [5]:
!./gpu_extreme_v4

G = 14062500
number of images = 60000
elapsed time = 12.0245 seconds
extreme distance = 4303.32
extreme pair = (26785,59452)


## Although the GPU version 4 is considerably faster than the sequential code, we are not even close to realizing the maximum performance of the GPU for this problem.  

## Optimizing kernels to achieve high performance on a GPU is one of the main topics of CMDA 4634.  