<a href="https://colab.research.google.com/github/jmtcabili/CEPARCO-Integrating-Project-ARIMA/blob/main/Differencing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Mon Mar 31 07:09:01 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   54C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
%%writefile cuda_arima.cu

#include <stdio.h>
#include <stdlib.h>

// First-order differencing kernel: out[k] = in[k+1] - in[k] for k in [0, n-1).
//
// Launch layout: 1-D grid of 1-D blocks. Any grid size is valid because every
// thread walks the array in steps of the total thread count.
//
//   n   - number of elements in `in`
//   out - receives n-1 differences; out[n-1] is never written
//   in  - input series of n elements
__global__
void differencing(size_t n, float *out, float *in){
    // 64-bit index math so blockIdx.x * blockDim.x cannot wrap for large grids.
    const size_t stride = (size_t)blockDim.x * gridDim.x;
    const size_t first  = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

    // `k + 1 < n` rather than `k < n - 1`: n is unsigned, so n - 1 would wrap
    // to SIZE_MAX when n == 0 and every thread would run out of bounds.
    for (size_t k = first; k + 1 < n; k += stride)
        out[k] = in[k + 1] - in[k];
}

// Builds the lag matrix for the autoregressive stage.
//
// `lagged` is written as a row-major n x p matrix:
//     lagged[row * p + col] = in[row - col]   when row >= col
//     lagged[row * p + col] = 0               otherwise (lag reaches before the series start)
// so column 0 is the series itself and column c is the series delayed by c.
//
// Launch layout: 2-D grid of 2-D blocks, x over lag columns and y over rows.
// Both dimensions are walked in steps of the launched grid extent, so the
// result is complete for any grid size, including grids smaller than n x p.
//
//   n      - number of rows (length of `in`)
//   lagged - output, n * p floats
//   in     - input series, n floats
//   p      - number of lag columns; nothing is written when p <= 0
__global__
void autoregressive(size_t n, float *lagged, float *in, int p)
{
    if (p <= 0)
        return;

    const size_t numLags = (size_t)p;

    // 64-bit indices: row * p exceeds the int range for long series.
    const size_t rowStart  = (size_t)blockIdx.y * blockDim.y + threadIdx.y;
    const size_t colStart  = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    const size_t rowStride = (size_t)blockDim.y * gridDim.y;
    const size_t colStride = (size_t)blockDim.x * gridDim.x;

    for (size_t row = rowStart; row < n; row += rowStride) {
        for (size_t col = colStart; col < numLags; col += colStride) {
            // Read the element `col` steps back; positions before the start
            // of the series are padded with zero.
            lagged[row * numLags + col] = (row >= col) ? in[row - col] : 0.0f;
        }
    }
}

/// ******* ma_related
// Element-wise product of two row-major nRow x nCol arrays:
//     arrResult[r * nCol + c] = arr1[r * nCol + c] * arr2[r * nCol + c]
// Despite the name this is not a matrix-matrix product.
//
// Launch layout: 2-D grid of 2-D blocks, x over columns and y over rows.
// Threads outside the nRow x nCol extent do nothing, and the loops step by
// the launched grid extent, so any grid size is valid.
__global__
void matrixmult(int nRow, int nCol, float *arrResult, float *arr1, float *arr2)
{
  if (nRow <= 0 || nCol <= 0)
    return;

  const size_t rows = (size_t)nRow;
  const size_t cols = (size_t)nCol;

  const size_t colStart  = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  const size_t rowStart  = (size_t)blockIdx.y * blockDim.y + threadIdx.y;
  const size_t colStride = (size_t)blockDim.x * gridDim.x;
  const size_t rowStride = (size_t)blockDim.y * gridDim.y;

  for (size_t row = rowStart; row < rows; row += rowStride) {
    for (size_t col = colStart; col < cols; col += colStride) {
      const size_t index = col + row * cols;
      arrResult[index] = arr1[index] * arr2[index];
    }
  }
}

// Relative change between consecutive elements:
//     out[0] = 0
//     out[k] = in[k] / in[k-1] - 1      for 1 <= k < n
// No guard is applied for in[k-1] == 0; the result then follows ordinary
// floating-point division rules.
//
// Launch layout: 1-D grid of 1-D blocks; any grid size is valid because each
// thread steps through the array by the total thread count.
__global__
void calcuate_rate(size_t n, float *out, float *in){
    // 64-bit index and stride so the comparison against the size_t bound is
    // not done through a signed int that can overflow.
    const size_t stride = (size_t)blockDim.x * gridDim.x;
    const size_t first  = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

    for (size_t k = first; k < n; k += stride){
        if (k < 1)
            out[k] = 0.0f;                        // no predecessor for element 0
        else
            out[k] = in[k] / in[k - 1] - 1.0f;
    }
}


/*
  Adds the sum of in[0..n) to *avesum. The output is a sum; averaging is done
  externally.

  The kernel only ever adds to *avesum, so the caller must set it to 0 before
  the launch.

  Launch layout: 1-D grid of 1-D blocks, any size. Uses one float of static
  shared memory per block.

  Floating-point additions are combined through atomics, so the order of
  summation (and therefore the last bits of the result) can differ between
  runs.
*/
__global__
void  getTotalSum(size_t n, float *avesum ,float *in){
    __shared__ float sum;

    const size_t stride = (size_t)blockDim.x * gridDim.x;
    const size_t first  = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

    // Exactly one thread clears the block accumulator, and the barrier makes
    // the cleared value visible before anyone adds to it.
    if (threadIdx.x == 0)
        sum = 0.0f;
    __syncthreads();

    // Each thread first sums its own elements in a register, so the loop
    // contains no barrier and threads may run different trip counts safely.
    float partial = 0.0f;
    for (size_t k = first; k < n; k += stride)
        partial += in[k];

    // One shared-memory atomic per thread, then one global atomic per block.
    atomicAdd(&sum, partial);
    __syncthreads();

    if (threadIdx.x == 0)
        atomicAdd(avesum, sum);
}

// Residuals around a precomputed mean:
//     out[0] = 0
//     out[k] = in[k] - average          for 1 <= k < n
// NOTE(review): element 0 is forced to 0 rather than in[0] - average; this is
// kept from the original code - confirm it is intended.
//
// Launch layout: 1-D grid of 1-D blocks; any grid size is valid because each
// thread steps through the array by the total thread count.
__global__
void  calculate_residuals(size_t n, float *out ,float *in, float average ){
    const size_t stride = (size_t)blockDim.x * gridDim.x;
    const size_t first  = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

    for (size_t k = first; k < n; k += stride){
        if (k < 1)
            out[k] = 0.0f;
        else
            out[k] = in[k] - average;   // index with k, the element this iteration owns
    }
}


// Aborts with file/line and the CUDA error string when a runtime call that
// the program cannot continue without has failed.
#define CUDA_CHECK(call)                                                  \
  do {                                                                    \
    cudaError_t cudaCheckErr_ = (call);                                   \
    if (cudaCheckErr_ != cudaSuccess) {                                   \
      fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,       \
              cudaGetErrorString(cudaCheckErr_));                         \
      exit(EXIT_FAILURE);                                                 \
    }                                                                     \
  } while (0)

// For performance hints (memory advice, prefetch): report a failure but keep
// going, and clear the error so a later cudaGetLastError() after a kernel
// launch does not pick it up.
#define CUDA_HINT(call)                                                   \
  do {                                                                    \
    cudaError_t cudaHintErr_ = (call);                                    \
    if (cudaHintErr_ != cudaSuccess) {                                    \
      fprintf(stderr, "CUDA warning %s:%d: %s\n", __FILE__, __LINE__,     \
              cudaGetErrorString(cudaHintErr_));                          \
      (void)cudaGetLastError();                                           \
    }                                                                     \
  } while (0)

// Driver: fills a synthetic series, runs the differencing kernel numOfLoops
// times, verifies it on the host, then builds and prints the first rows of
// the p-column lag matrix of the differenced series.
int main(){

  //dataset
  const size_t ARRAY_SIZE = 1<<28;
  const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
  const size_t numOfLoops = 30;

  // differencing produces one element fewer than its input
  const size_t NUM_DIFFS = ARRAY_SIZE - 1;

  //arima parameters and variables;
  const int p = 4;   // number of lag columns built by autoregressive()
  const int q = 1;   // not used yet
  (void)q;           // silences the unused-variable warning from nvcc

  const size_t AR_SIZE = p + 1;
  const size_t AR_BYTES = AR_SIZE * sizeof(float);
  const size_t LAGGED_BYTES = ARRAY_BYTES * p;  //same amount of rows and p cols

  // declare arrays
  float *in, *out, *lagged, *AR_Coef;
  CUDA_CHECK(cudaMallocManaged(&in, ARRAY_BYTES));
  CUDA_CHECK(cudaMallocManaged(&out, ARRAY_BYTES));
  CUDA_CHECK(cudaMallocManaged(&lagged, LAGGED_BYTES));
  CUDA_CHECK(cudaMallocManaged(&AR_Coef, AR_BYTES));

  //get GPU id
  int device = -1;
  CUDA_CHECK(cudaGetDevice(&device));

  // memory advise
  CUDA_HINT(cudaMemAdvise(in, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
  CUDA_HINT(cudaMemAdvise(in, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId));

  //"prefetch data" to create CPU page memory
  CUDA_HINT(cudaMemPrefetchAsync(in, ARRAY_BYTES, cudaCpuDeviceId, NULL));
  //"prefetch data" to create GPU page memory
  CUDA_HINT(cudaMemPrefetchAsync(out, ARRAY_BYTES, device, NULL));

  // initialize array with the repeating pattern 1, 2, 3
  for (size_t i = 0; i < ARRAY_SIZE; i++)
    in[i] = i % 3 + 1.0f;

  // prefetch from CPU to GPU
  CUDA_HINT(cudaMemPrefetchAsync(in, ARRAY_BYTES, device, NULL));

  // CUDA kernel
  size_t numThreads = 1024;
  size_t numBlocks = (ARRAY_SIZE + numThreads-1) / numThreads;

  printf("\n***** Differencing in CUDA with MemAdvise\n");
  printf("numElements = %zu\n", ARRAY_SIZE);
  printf("numBlocks = %zu, numThreads = %zu \n", numBlocks, numThreads);
  for (size_t i = 0; i < numOfLoops; i++)
    differencing<<<numBlocks, numThreads>>> (ARRAY_SIZE, out, in);
  // a bad launch configuration is only reported through cudaGetLastError
  CUDA_CHECK(cudaGetLastError());

  // synchronize GPU with CPU
  CUDA_CHECK(cudaDeviceSynchronize());

  // prefetch from GPU to CPU
  CUDA_HINT(cudaMemPrefetchAsync(out, ARRAY_BYTES, cudaCpuDeviceId, NULL));

  // error checking routine: compare every difference, including the last one
  size_t err_count = 0;
  for (size_t i = 0; i < NUM_DIFFS; i++){
    if (in[i+1]-in[i] != out[i])
      err_count++;
  }

  //Displays First 10 Elements
  printf("First 10 elements: \n");
  for (int i = 0; i < 10; i++){
    printf("%.2f\n", out[i]);
  }
  printf("...\n...\n...\n");

  //Displays Last 10 Elements
  // Only NUM_DIFFS entries of `out` are written by the kernel, so the window
  // ends at NUM_DIFFS rather than ARRAY_SIZE.
  printf("Last 10 elements: \n");
  for (size_t i = NUM_DIFFS - 10; i < NUM_DIFFS; i++){
    printf("%.2f\n", out[i]);
  }

  printf("Error count (Prefetch & MemAdvise): %zu\n", err_count);

  CUDA_HINT(cudaMemAdvise(out, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
  CUDA_HINT(cudaMemAdvise(out, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId));
  //"prefetch data" to create CPU page memory
  CUDA_HINT(cudaMemPrefetchAsync(out, ARRAY_BYTES, cudaCpuDeviceId, NULL));
  //"prefetch data" to create GPU page memory
  CUDA_HINT(cudaMemPrefetchAsync(lagged, LAGGED_BYTES, device, NULL));
  // prefetch from CPU to GPU
  CUDA_HINT(cudaMemPrefetchAsync(out, ARRAY_BYTES, device, NULL));

  // The grid y-dimension has a much smaller hardware limit than x, so the
  // number of row blocks is capped at what the device reports. Rows beyond
  // the launched grid are reached only when the kernel steps by its row
  // stride.
  int maxGridY = 0;
  CUDA_CHECK(cudaDeviceGetAttribute(&maxGridY, cudaDevAttrMaxGridDimY, device));

  dim3 blockSize(16, 16);  // 16x16 threads per block
  size_t rowBlocks = (ARRAY_SIZE + blockSize.y - 1) / blockSize.y;
  if (rowBlocks > (size_t)maxGridY)
    rowBlocks = (size_t)maxGridY;
  dim3 gridSize((p + blockSize.x - 1) / blockSize.x,   // x covers the p lag columns
                (unsigned int)rowBlocks);              // y covers the rows

  //Call autoregressive function
  autoregressive<<<gridSize, blockSize>>> (ARRAY_SIZE, lagged, out, p);
  CUDA_CHECK(cudaGetLastError());

  // synchronize GPU with CPU
  CUDA_CHECK(cudaDeviceSynchronize());

  // prefetch from GPU to CPU
  CUDA_HINT(cudaMemPrefetchAsync(lagged, LAGGED_BYTES, cudaCpuDeviceId, NULL));

  // print the first 5 rows of the lag matrix
  for (int i = 0; i < 5; i++){
    for (int j = 0; j < p; j++){
      printf("%.2f ", lagged[i*p+j]);
    }
    printf("\n");
  }

  //free memory
  CUDA_CHECK(cudaFree(in));
  CUDA_CHECK(cudaFree(out));
  CUDA_CHECK(cudaFree(lagged));
  CUDA_CHECK(cudaFree(AR_Coef));

  return 0;
}

Overwriting cuda_arima.cu


In [None]:
%%shell
# Compile cuda_arima.cu for the sm_75 architecture (the notebook's GPU is a Tesla T4).
nvcc -arch=sm_75 cuda_arima.cu -o cuda_arima

    const int q = 1;
              ^






In [None]:
%%shell
# Run the compiled binary under nvprof to collect kernel and CUDA API timings.
nvprof ./cuda_arima

==3118== NVPROF is profiling process 3118, command: ./cuda_arima

***** Differencing in CUDA with MemAdvise
numElements = 268435456
numBlocks = 262144, numThreads = 1024 
First 10 elements: 
1.00
1.00
-2.00
1.00
1.00
-2.00
1.00
1.00
-2.00
1.00
...
...
...
Last 10 elements: 
1.00
1.00
-2.00
1.00
1.00
-2.00
1.00
1.00
-2.00
0.00
Error count (Prefetch & MemAdvise): 0
0.00 0.00 0.00 0.00 
0.00 0.00 0.00 0.00 
0.00 0.00 0.00 0.00 
0.00 0.00 0.00 0.00 
0.00 0.00 0.00 0.00 
==3118== Profiling application: ./cuda_arima
==3118== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  312.71ms        30  10.424ms  8.7090ms  13.694ms  differencing(unsigned long, float*, float*)
      API calls:   52.34%  855.52ms         8  106.94ms  700.10us  380.14ms  cudaMemPrefetchAsync
                   23.46%  383.52ms         2  191.76ms  70.894ms  312.63ms  cudaDeviceSynchronize
                   12.22%  199.69ms         4  49.923ms  

