
CUDA code won't run #37

Closed
tanmayb123 opened this issue Jun 18, 2017 · 5 comments

Comments

@tanmayb123

I tried compiling the following CUDA code with cocl cudatest.cu on a macOS Sierra machine with a GeForce GTX 1070:

#include <stdio.h>

#define SIZE 1000

__global__ void kernel_matrix_add(float *input1, float *input2, float *output) {
        const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
        output[idx] = input1[idx] + input2[idx];
}

__global__ void kernel_matrix_multiply(float *input1, float *input2, float *output) {
        const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
        output[idx] = input1[idx] * input2[idx];
}

__global__ void kernel_matrix_divide(float *input1, float *input2, float *output) {
        const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
        output[idx] = input1[idx] / input2[idx];
}

__global__ void kernel_matrix_subtract(float *input1, float *input2, float *output) {
        const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
        output[idx] = input1[idx] - input2[idx];
}


int main() {

        float * in1;
        float * in2;
        float * out;

        cudaHostAlloc(&in1, SIZE*sizeof(float), cudaHostAllocDefault);
        cudaHostAlloc(&in2, SIZE*sizeof(float), cudaHostAllocDefault);
        cudaHostAlloc(&out, SIZE*sizeof(float), cudaHostAllocDefault);

        for (int i = 0; i < SIZE; ++i) {
                in1[i] = i;
                in2[i] = i;
                out[i] = 0;
        }

        float * d_in1;
        float * d_in2;
        float * d_out;

        cudaMalloc(&d_in1, SIZE*sizeof(float));
        cudaMalloc(&d_in2, SIZE*sizeof(float));
        cudaMalloc(&d_out, SIZE*sizeof(float));

        cudaMemcpy(d_in1, in1, SIZE*sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(d_in2, in2, SIZE*sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(d_out, out, SIZE*sizeof(float), cudaMemcpyHostToDevice);

        kernel_matrix_multiply<<<SIZE / 1024 + 1, 1024>>>(d_in1, d_in2, d_out);

        cudaMemcpy(out, d_out, SIZE*sizeof(float), cudaMemcpyHostToDevice);

        printf("First 10 Results:\n");

        for (int i = 0; i < 10; ++i) {
                printf("%f\n", out[i]);
        }

}

But I got the following error:

cocl args: cudatest.cu
LLVM_COMPILE_FLAGS -I/usr/local/opt/clang+llvm-4.0.0-x86_64-apple-darwin/include -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/local/opt/clang+llvm-4.0.0-x86_64-apple-darwin/include -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wmissing-field-initializers -pedantic -Wno-long-long -Wcovered-switch-default -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wstring-conversion -Werror=date-time -std=c++11 -fexceptions -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS
+ /usr/local/opt/llvm-4.0/bin/clang++ -DUSE_CLEW -std=c++11 -x cuda -D__CORIANDERCC__ -D__CUDACC__ --cuda-gpu-arch=sm_30 -nocudalib -nocudainc --cuda-device-only -emit-llvm -O2 -S -stdlib=libc++ -Wno-gnu-anonymous-struct -Wno-nested-anon-types -I/usr/local/opt/clang+llvm-4.0.0-x86_64-apple-darwin/include -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/local/opt/clang+llvm-4.0.0-x86_64-apple-darwin/include -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wmissing-field-initializers -pedantic -Wno-long-long -Wcovered-switch-default -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wstring-conversion -Werror=date-time -std=c++11 -fexceptions -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/local/include/EasyCL -I/usr/local/include/cocl -include /usr/local/include/cocl/cocl.h -include /usr/local/include/cocl/fake_funcs.h -include /usr/local/include/cocl/cocl_deviceside.h -I/usr/local/include ./cudatest.cu -o ./cudatest-device-noopt.ll
./cudatest.cu:41:42: error: use of undeclared identifier 'cudaHostAllocDefault'
        cudaHostAlloc(&in1, SIZE*sizeof(float), cudaHostAllocDefault);
                                                ^
./cudatest.cu:42:42: error: use of undeclared identifier 'cudaHostAllocDefault'
        cudaHostAlloc(&in2, SIZE*sizeof(float), cudaHostAllocDefault);
                                                ^
./cudatest.cu:43:42: error: use of undeclared identifier 'cudaHostAllocDefault'
        cudaHostAlloc(&out, SIZE*sizeof(float), cudaHostAllocDefault);
                                                ^
3 errors generated.

Is there a limitation to coriander I'm not aware of? How can I make this compatible?

Thanks!

@hughperkins
Owner

hughperkins commented Jun 18, 2017 via email

@tanmayb123
Author

Got it. Thanks!

@tanmayb123
Author

I tried this code:

#include <stdio.h>

#define SIZE 1000

__global__ void kernel_matrix_add(float *input1, float *input2, float *output) {
        const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
        output[idx] = input1[idx] + input2[idx];
}

__global__ void kernel_matrix_multiply(float *input1, float *input2, float *output) {
        const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
        output[idx] = input1[idx] * input2[idx];
}

__global__ void kernel_matrix_divide(float *input1, float *input2, float *output) {
        const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
        output[idx] = input1[idx] / input2[idx];
}

__global__ void kernel_matrix_subtract(float *input1, float *input2, float *output) {
        const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
        output[idx] = input1[idx] - input2[idx];
}


int main() {

        float * in1;
        float * in2;
        float * out;

        cuMemHostAlloc((void**)&in1, SIZE*sizeof(float));
        cuMemHostAlloc((void**)&in2, SIZE*sizeof(float));
        cuMemHostAlloc((void**)&out, SIZE*sizeof(float));

        for (int i = 0; i < SIZE; ++i) {
                in1[i] = i;
                in2[i] = i;
                out[i] = 0;
        }

        float * d_in1;
        float * d_in2;
        float * d_out;

        cudaMalloc(&d_in1, SIZE*sizeof(float));
        cudaMalloc(&d_in2, SIZE*sizeof(float));
        cudaMalloc(&d_out, SIZE*sizeof(float));

        cudaMemcpy(d_in1, in1, SIZE*sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(d_in2, in2, SIZE*sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(d_out, out, SIZE*sizeof(float), cudaMemcpyHostToDevice);

        kernel_matrix_multiply<<<SIZE / 1024 + 1, 1024>>>(d_in1, d_in2, d_out);

        cudaMemcpy(out, d_out, SIZE*sizeof(float), cudaMemcpyHostToDevice);

        printf("First 10 Results:\n");

        for (int i = 0; i < 10; ++i) {
                printf("%f\n", out[i]);
        }

}

But now I get this:

OpenCL platform: Apple
OpenCL device: GeForce GTX 1070
Segmentation fault: 11

When I try to run with nvcc:

cudatest.cu(32): error: identifier "cuMemHostAlloc" is undefined

1 error detected in the compilation of "/var/folders/5k/t9j2_ms918jgylxs810fmh8c0000gn/T//tmpxft_0000396d_00000000-9_cudatest.cpp1.ii".

Is there a way I can debug this?
Sorry! I'm just really new to this type of development!

@tanmayb123
Author

I found it! This was the culprit: cudaMemcpy(out, d_out, SIZE*sizeof(float), cudaMemcpyHostToDevice);!
For some reason, this line worked on CUDA, but not cocl! The direction is supposed to be cudaMemcpyDeviceToHost, not cudaMemcpyHostToDevice.

@hughperkins
Owner

hughperkins commented Jun 18, 2017 via email
