In [2]:
%%writefile atomicAddLL.cu

Overwriting atomicAddLL.cu


In [3]:
%%writefile atomicAddLL.cu
#include <stdio.h>

/**
 * Atomically adds `val` to the signed 64-bit integer stored at `addr`.
 *
 * CUDA provides no atomicAdd overload for signed `long long`, but it does
 * provide one for `unsigned long long`. Two's-complement addition yields the
 * same bit pattern for signed and unsigned operands, so the native unsigned
 * atomic is used directly rather than emulating the add with a
 * compare-and-swap retry loop.
 *
 * @param addr  Location to update.
 * @param val   Amount to add (may be negative).
 * @return      The value held at `addr` immediately before the addition.
 */
__device__ long long atomicAddLL(long long *addr, long long val) {
    // Reinterpret both operands as unsigned: the stored bits are identical.
    unsigned long long previous =
        atomicAdd((unsigned long long *)addr, (unsigned long long)val);
    return (long long)previous;
}

/**
 * Each thread computes its flat 1D global index, prints it, and then
 * atomically adds that index to the single 64-bit value at `data`.
 */
__global__ void atomicAddKernel(long long *data) {
    // Flat index built from the x-dimension block and thread coordinates.
    const int globalIdx = blockIdx.x * blockDim.x + threadIdx.x;

    // Device-side printf: debugging aid only.
    printf("Thread %d adding value %d\n", globalIdx, globalIdx);

    atomicAddLL(data, (long long)globalIdx);
}

// Reports a failed CUDA runtime call on stderr; returns true when `status` is cudaSuccess.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

/**
 * Host driver: zero-initializes one 64-bit counter on the device, launches
 * atomicAddKernel over 4 blocks of 256 threads, copies the counter back and
 * prints it.
 *
 * Every CUDA runtime call is checked; on any failure a message is written to
 * stderr, the device buffer is still released, and the process exits with 1.
 *
 * @return 0 on success, 1 if any CUDA call failed.
 */
int main() {
    long long *d_data = nullptr;
    long long h_data = 0;

    if (!cudaOk(cudaMalloc(&d_data, sizeof(long long)), "cudaMalloc")) {
        return 1;
    }

    const int threadsPerBlock = 256;
    const int blocksPerGrid = 4;

    bool ok = cudaOk(
        cudaMemcpy(d_data, &h_data, sizeof(long long), cudaMemcpyHostToDevice),
        "cudaMemcpy (host to device)");

    if (ok) {
        atomicAddKernel<<<blocksPerGrid, threadsPerBlock>>>(d_data);
        // Launch-configuration errors are only visible through cudaGetLastError();
        // faults raised while the kernel runs surface at the synchronizing call.
        ok = cudaOk(cudaGetLastError(), "kernel launch") &&
             cudaOk(cudaDeviceSynchronize(), "kernel execution");
    }

    if (ok) {
        ok = cudaOk(
            cudaMemcpy(&h_data, d_data, sizeof(long long), cudaMemcpyDeviceToHost),
            "cudaMemcpy (device to host)");
    }

    if (ok) {
        // Expected: sum of thread indices 0..1023 = 1023 * 1024 / 2 = 523776.
        printf("Final value: %lld\n", h_data);
    }

    // Release the device buffer on both the success and the failure path.
    if (!cudaOk(cudaFree(d_data), "cudaFree")) {
        ok = false;
    }
    return ok ? 0 : 1;
}

Overwriting atomicAddLL.cu


In [5]:
!nvcc atomicAddLL.cu -o atomicAddLL -gencode arch=compute_75,code=sm_75 -lcublas

!./atomicAddLL

Thread 576 adding value 576
Thread 577 adding value 577
Thread 578 adding value 578
Thread 579 adding value 579
Thread 580 adding value 580
Thread 581 adding value 581
Thread 582 adding value 582
Thread 583 adding value 583
Thread 584 adding value 584
Thread 585 adding value 585
Thread 586 adding value 586
Thread 587 adding value 587
Thread 588 adding value 588
Thread 589 adding value 589
Thread 590 adding value 590
Thread 591 adding value 591
Thread 592 adding value 592
Thread 593 adding value 593
Thread 594 adding value 594
Thread 595 adding value 595
Thread 596 adding value 596
Thread 597 adding value 597
Thread 598 adding value 598
Thread 599 adding value 599
Thread 600 adding value 600
Thread 601 adding value 601
Thread 602 adding value 602
Thread 603 adding value 603
Thread 604 adding value 604
Thread 605 adding value 605
Thread 606 adding value 606
Thread 607 adding value 607
Thread 672 adding value 672
Thread 673 adding value 673
Thread 674 adding value 674
Thread 675 adding va