In [4]:
import pandas as pd
import time

###

# Read the file
#df = pd.read_csv('dataset.txt')  # or 'your_file.txt' if it's CSV-formatted

# Extract the 'value' column
#values = df['value']  # returns a Series

#values.head()

###

In [5]:
diff = []
timing_data = []


loop = 20
total_time = 0.0
ARRAY_SIZE = 1<<26

for i in range(0,ARRAY_SIZE):
  timing_data.append(i % 5 + 1.0)


for n in range(0,loop):


  start = time.perf_counter()
  for x in range(0,len(timing_data)-1):

    diff.append(timing_data[x+1] - timing_data[x])
  end = time.perf_counter()
  total_time += end-start

print("Total Time for",loop,"loops (Seconds): ",total_time,"\n")
print("Average Time (Seconds): ",total_time/loop,"\n")

for x in range(0,10):
  #print(values[x+1],"\n")

  print(diff[x],"\n")





KeyboardInterrupt: 

In [1]:
%%writefile cuda_arima.cu

#include <stdio.h>
#include <stdlib.h>
#define MAXCHAR 1000


void readCsv(float *dest, size_t n){
    FILE *fp;
    char row[MAXCHAR];
    char *token;
    int ARRAY_SIZE = n;
    int i = 0, col = 1;


    fp = fopen("dataset.txt","r");

    while (feof(fp) != true)
    {
        fgets(row, MAXCHAR, fp);
        token = strtok(row, ",");
        col = 1;
        while(token != NULL)
        {
            if (col == 2)
            {
                dest[i] = atof(token);
                i++;
            }
            token = strtok(NULL, ",");
            col++;
        }
    }
}

void writeCsv(float *dest, size_t n){
    FILE *fp;
    int ARRAY_SIZE = n;
    int i;

    fp = fopen("diff_dataset.txt","w");

    for(i = 0; i < ARRAY_SIZE; i++){
        fprintf(fp, "%f\n", dest[i]);
    }

    fclose(fp);
}


//CUDA convolution kernel
__global__
void differencing(size_t n, float *out, float *in){
    int k;
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    for (k = index; k < n-1; k += stride)
       out[k]= in[k+1]-in[k];
}



int main(){

  //dataset
  const size_t ARRAY_SIZE = 1<<26;
  const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
  const size_t numOfLoops = 20;






  // declare arrays
    float *in, *out;
    cudaMallocManaged(&in, ARRAY_BYTES);
    cudaMallocManaged(&out, ARRAY_BYTES);


    //readCsv(in, ARRAY_SIZE);





  //get GPU id
    int device = -1;
    cudaGetDevice(&device);

  // memory advise
   cudaMemAdvise(in, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
   cudaMemAdvise(in, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId);

  //"prefetch data" to create CPU page memory
    cudaMemPrefetchAsync(in,ARRAY_BYTES,cudaCpuDeviceId,NULL);
  //"prefetch data" to create GPU page memory
    cudaMemPrefetchAsync(out,ARRAY_BYTES,device,NULL);

  // initialize array
  for (size_t i=0; i<ARRAY_SIZE; i++)
     in[i] = i % 5 + 1.0;

  // prefetch from CPU to GPU
  cudaMemPrefetchAsync(in,ARRAY_BYTES,device,NULL);

  // CUDA kernel
    size_t numThreads = 1024;
    size_t numBlocks = (ARRAY_SIZE + numThreads-1) / numThreads;

  printf("\n***** Differencing in CUDA with MemAdvise\n");
  printf("numElements = %lu\n", ARRAY_SIZE);
  printf("numBlocks = %lu, numThreads = %lu \n",numBlocks, numThreads);
    for (size_t i=0; i<numOfLoops;i++)
      differencing<<<numBlocks, numThreads>>> (ARRAY_SIZE,out,in);

  // synchronize GPU with CPU
    cudaDeviceSynchronize();

  // prefetch from GPU to CPU
  cudaMemPrefetchAsync(out,ARRAY_BYTES,cudaCpuDeviceId,NULL);

  // error checking routine
    size_t err_count = 0;
    for (size_t i=0; i<ARRAY_SIZE-2; i++){
      if(in[i+1]-in[i] != out[i])
        err_count++;
    }



  //Displays First 10 Elements
  printf("First 10 elements: \n");
  for (int i = 0; i < 10; i++){
    printf("%.4f\n", out[i]);
  }
  printf("...\n...\n...\n");

  //Displays Last 10 Elements
  printf("Last 10 elements: \n");
  for (int i = ARRAY_SIZE-10; i < ARRAY_SIZE; i++){
    printf("%.4f\n", out[i]);
  }


  printf("Error count (Prefetch & MemAdvise): %lu\n", err_count);

  cudaMemAdvise(out, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
  cudaMemAdvise(out, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId);
  //"prefetch data" to create CPU page memory
    cudaMemPrefetchAsync(out,ARRAY_BYTES,cudaCpuDeviceId,NULL);
  // prefetch from CPU to GPU
    cudaMemPrefetchAsync(out,ARRAY_BYTES,device,NULL);




  // synchronize GPU with CPU
    cudaDeviceSynchronize();


  // write to CSV
  writeCsv(out, ARRAY_SIZE);

  //free memory
    cudaFree(in);
    cudaFree(out);

  return 0;
}

Writing cuda_arima.cu


In [2]:
%%shell
nvcc -arch=sm_75 cuda_arima.cu -o cuda_arima

      int ARRAY_SIZE = n;
          ^






In [3]:
%%shell
nvprof ./cuda_arima

==767== NVPROF is profiling process 767, command: ./cuda_arima

***** Differencing in CUDA with MemAdvise
numElements = 67108864
numBlocks = 65536, numThreads = 1024 
First 10 elements: 
1.0000
1.0000
1.0000
1.0000
-4.0000
1.0000
1.0000
1.0000
1.0000
-4.0000
...
...
...
Last 10 elements: 
-4.0000
1.0000
1.0000
1.0000
1.0000
-4.0000
1.0000
1.0000
1.0000
0.0000
Error count (Prefetch & MemAdvise): 0
==767== Profiling application: ./cuda_arima
==767== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  68.188ms        20  3.4094ms  3.4063ms  3.4140ms  differencing(unsigned long, float*, float*)
      API calls:   50.29%  250.88ms         2  125.44ms  92.668us  250.79ms  cudaMallocManaged
                   22.83%  113.91ms         6  18.986ms  1.1802ms  60.349ms  cudaMemPrefetchAsync
                   17.09%  85.252ms         2  42.626ms  17.118ms  68.134ms  cudaDeviceSynchronize
                    4.77%  23.789ms



In [4]:
%%writefile lagged.cu
#include <stdio.h>
#include <stdlib.h>

void writeCsv(float *dest, size_t n){
    FILE *fp;
    int ARRAY_SIZE = n;
    int i;

    fp = fopen("AR_COEFF.txt","w");

    for(i = 0; i < ARRAY_SIZE; i++){
        fprintf(fp, "%f\n", dest[i]);
    }

    fclose(fp);
}

__global__
void autoregressive(size_t n, float *lagged, float *in, int lagged_cols)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y; // Row index
    int col = blockIdx.x * blockDim.x + threadIdx.x; // Column index

    int rowStride = blockDim.y * gridDim.y;
    int colStride = blockDim.x * gridDim.x;

    for (int i = row; i < n; i+= rowStride){
      for (int j = col; j < lagged_cols; j+= colStride){
        if (j == 0){
          lagged[i * lagged_cols + j] = 1;
        }else if (i < n && j < lagged_cols) {
            int index = i - j;
            if (index < 0) {
                lagged[i * lagged_cols + j] = 0; // Assign zero for out-of-bounds indices
            } else {
                lagged[i * lagged_cols + j] = in[index]; // Assign lagged value
            }
        }
      }
    }

}

__global__
void transpose(float *out, float *in, int p, size_t ARRAY_SIZE){

  int col = blockIdx.x * blockDim.x + threadIdx.x;
  int row = blockIdx.y * blockDim.y + threadIdx.y;

  if (row < p && col < ARRAY_SIZE) {
      out[row * ARRAY_SIZE + col] = in[col * p + row];
  }

}

__global__
void matMulNaive(float *dest, float *in1, float *in2,
                 size_t in1_height, size_t in2_height,
                 size_t in1_width, size_t in2_width)
{

    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Each thread computes one element of the result matrix
    float cValue = 0;

    if (row < in1_height && col < in2_width) {
        // Matrix multiplication: in1 (lagged_cols x n) * in2 (n x lagged_cols)
        for (int k = 0; k < in1_width; ++k) {
            cValue += in1[row * in1_width + k] * in2[k * in2_width + col];
        }
        dest[row * in2_width + col] = cValue;
    }

    //p+1 n x n p+1 -> first mul = p+1 mat
    //p+1 p+1 x p+1 n -> second mul = p+1 x n
}


//Matrix inverse functions
__global__ void nodiag_normalize(float *A, float *I, int n, int i){
	int x = blockIdx.x * blockDim.x + threadIdx.x;
	int y = blockIdx.y * blockDim.y + threadIdx.y;

	if (x < n && y < n)
	if (x == i && x!=y){
		I[x*n + y] /= A[i*n + i];
		A[x*n + y] /= A[i*n + i];
	}

}

__global__ void diag_normalize(float *A, float *I, int n, int i){
	int x = blockIdx.x * blockDim.x + threadIdx.x;
	int y = blockIdx.y * blockDim.y + threadIdx.y;

	if (x < n && y < n)
	if (x == y && x == i){
		I[x*n + y] /= A[i*n + i];
		A[x*n + y] /= A[i*n + i];
	}

}

__global__ void gaussjordan(float *A, float *I, int n, int i)
{
	int x = blockIdx.x * blockDim.x + threadIdx.x;
	int y = blockIdx.y * blockDim.y + threadIdx.y;

	if (x < n && y < n){
		if (x != i){
			I[x*n + y] -= I[i*n + y] * A[x*n + i];
			if (y != i){
				A[x*n + y] -= A[i*n + y] * A[x*n + i];
			}
		}
	}

}

__global__ void set_zero(float *A, float *I, int n, int i){
	int x = blockIdx.x * blockDim.x + threadIdx.x;
	int y = blockIdx.y * blockDim.y + threadIdx.y;

	if (x < n && y < n){
		if (x != i){
			if (y == i){
				A[x*n + y] = 0;
			}
		}
	}
}


int main (){

  //dataset
  const size_t ARRAY_SIZE = 10;
  const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
  //const size_t numOfLoops = 30;


  //arima parameters and variables;

  const int p = 4;
  const int lagged_cols = p + 1;
  const int q = 1;

  const size_t AR_SIZE = p + 1;
  const size_t AR_BYTES = AR_SIZE * sizeof(float);
  const size_t X_BYTES = ARRAY_SIZE * lagged_cols * sizeof(float);
  const size_t PXP_BYTES = AR_SIZE * AR_SIZE * sizeof(float);

  // declare arrays
  float *in, *out, *lagged, *transposed, *transposed2,
        *prod1, *inverse, *identity, *prod2, *AR_COEFF;
  cudaMallocManaged(&in, ARRAY_BYTES);
  cudaMallocManaged(&out, ARRAY_BYTES);
  cudaMallocManaged(&lagged, X_BYTES); //same amount of rows and p cols
  cudaMallocManaged(&transposed,X_BYTES);
  cudaMallocManaged(&transposed2,X_BYTES);
  cudaMallocManaged(&prod2,X_BYTES);
  cudaMallocManaged(&prod1,PXP_BYTES);
  cudaMallocManaged(&inverse,PXP_BYTES);
  cudaMallocManaged(&identity,PXP_BYTES);
  cudaMallocManaged(&AR_COEFF, AR_BYTES);


  int device = -1;
  cudaGetDevice(&device);  //get GPU id

  // memory advise
  cudaMemAdvise(in, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
  cudaMemAdvise(in, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId);


  cudaMemPrefetchAsync(in,ARRAY_BYTES,cudaCpuDeviceId,NULL);                       //"prefetch data" to create CPU page memory
  cudaMemPrefetchAsync(lagged,ARRAY_SIZE * lagged_cols * sizeof(float),device,NULL);         //"prefetch data" to create GPU page memory

  for (size_t i=0; i<ARRAY_SIZE; i++)
      in[i] = i % 5 + 1.0;

  cudaMemPrefetchAsync(in,ARRAY_BYTES,device,NULL);                                //prefetch from CPU to GPU

  dim3 threadsPerBlock(16, 16);
  dim3 numBlocks((ARRAY_SIZE + threadsPerBlock.x-1)/threadsPerBlock.x,
                 (lagged_cols + threadsPerBlock.y-1)/threadsPerBlock.y);

  autoregressive<<<numBlocks, threadsPerBlock>>> (ARRAY_SIZE,lagged, in, lagged_cols);
  cudaGetLastError();

  // synchronize GPU with CPU
  cudaDeviceSynchronize();

  // prefetch from GPU to CPU
  cudaMemPrefetchAsync(lagged,ARRAY_SIZE * lagged_cols * sizeof(float),cudaCpuDeviceId,NULL);


  for (int i = ARRAY_SIZE-10; i < ARRAY_SIZE; i++){
    for (int j = 0; j < lagged_cols; j++){
      printf("%.2f ", lagged[i*lagged_cols+j]);
    }
    printf("\n");
  }



  //---------- Transposing lagged matrix ----------------//
  //try removing later along with prev prefetch async and print


  cudaMemAdvise(lagged, X_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
  cudaMemAdvise(lagged, X_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId);


  cudaMemPrefetchAsync(lagged,X_BYTES,cudaCpuDeviceId,NULL);                       //"prefetch data" to create CPU page memory

  cudaMemPrefetchAsync(transposed,X_BYTES, device, NULL);                                   //"prefetch data" to create GPU page memory

  cudaMemPrefetchAsync(lagged, X_BYTES, device, NULL);


  //dim3 dimGrid(ARRAY_SIZE/TILE_DIM, p/TILE_DIM, 1);
  //dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);

  transpose<<<numBlocks, threadsPerBlock>>>(transposed, lagged, lagged_cols, ARRAY_SIZE);
  cudaGetLastError();

  // synchronize GPU with CPU
  cudaDeviceSynchronize();

  // prefetch from GPU to CPU
  cudaMemPrefetchAsync(transposed,X_BYTES,cudaCpuDeviceId,NULL);


  //---printing tranposed---//
  printf("\n");

  for (int i = 0; i < lagged_cols; i++){
    for (int j = 0; j < 10; j++){
      printf("%.2f ", transposed[i*ARRAY_SIZE+j]);
    }
    printf("\n");
  }

  cudaMemPrefetchAsync(prod1, PXP_BYTES, device, NULL);
  cudaMemPrefetchAsync(lagged, X_BYTES, device, NULL);
  cudaMemPrefetchAsync(transposed, X_BYTES, device, NULL);

  matMulNaive<<<numBlocks, threadsPerBlock>>>(prod1, transposed, lagged,
                                              lagged_cols, ARRAY_SIZE, ARRAY_SIZE, lagged_cols);

  cudaDeviceSynchronize();
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
      printf("CUDA Error: %s\n", cudaGetErrorString(err));
  }

  // Prefetch result back to CPU
  cudaMemPrefetchAsync(prod1, PXP_BYTES, cudaCpuDeviceId, NULL);

  // Print results if needed
  printf("\nMatrix multiplication result:\n");
  for (int i = 0; i < lagged_cols; i++) {
      for (int j = 0; j < lagged_cols; j++) {
          printf("%.2f ", prod1[i * lagged_cols + j]);
      }
      printf("\n");
  }

  dim3 threadsPerBlockInv(lagged_cols, lagged_cols);
  dim3 numBlocksInv((lagged_cols + lagged_cols -1) / lagged_cols,
                 (lagged_cols+lagged_cols-1)/lagged_cols);

  cudaMemPrefetchAsync(inverse, PXP_BYTES, cudaCpuDeviceId, NULL);
  cudaMemPrefetchAsync(prod1, PXP_BYTES, device, NULL);

  //set identity matrix
  for (int i = 0; i < lagged_cols; i++){
    for (int j = 0; j < lagged_cols; j++){
      if (i == j)
        inverse[i * lagged_cols + j] = 1.0;
      else
        inverse[i * lagged_cols + j] = 0.0;
    }
  }

  printf("\nIdentity Matrix result:\n");
  for (int i = 0; i < lagged_cols; i++) {
      for (int j = 0; j < lagged_cols; j++) {
          printf("%.2f ", inverse[i * lagged_cols + j]);
      }
      printf("\n");
  }


  for (int i = 0; i < lagged_cols; i++){
    nodiag_normalize <<<numBlocksInv, threadsPerBlockInv >>>(prod1, inverse, lagged_cols, i);
    cudaDeviceSynchronize();
		diag_normalize <<<numBlocksInv, threadsPerBlockInv>>>(prod1, inverse, lagged_cols, i);
    cudaDeviceSynchronize();
		gaussjordan <<<numBlocksInv, threadsPerBlockInv>>>(prod1, inverse,lagged_cols, i);
    cudaDeviceSynchronize();
		set_zero <<<numBlocksInv, threadsPerBlockInv>>>(prod1, inverse, lagged_cols, i);
    cudaDeviceSynchronize();
  }

  // Prefetch result back to CPU
  cudaMemPrefetchAsync(inverse, PXP_BYTES, cudaCpuDeviceId, NULL);

  printf("\nInverse result:\n");
  for (int i = 0; i < lagged_cols; i++) {
      for (int j = 0; j < lagged_cols; j++) {
          printf("%.9f ", inverse[i * lagged_cols + j]);
      }
      printf("\n");
  }

  cudaMemPrefetchAsync(transposed, X_BYTES, device, NULL);
  cudaMemPrefetchAsync(inverse, PXP_BYTES, device, NULL);
  cudaMemPrefetchAsync(prod2, X_BYTES, device, NULL);

  matMulNaive<<<numBlocks, threadsPerBlock>>>(prod2, inverse, transposed, lagged_cols, lagged_cols, lagged_cols, ARRAY_SIZE);

  cudaDeviceSynchronize();
  err = cudaGetLastError();
  if (err != cudaSuccess) {
      printf("CUDA Error: %s\n", cudaGetErrorString(err));
  }

  // Prefetch result back to CPU
  cudaMemPrefetchAsync(prod2, X_BYTES, cudaCpuDeviceId, NULL);

  printf("\n");
  // Print results if needed
  for (int i = 0; i < lagged_cols; i++){
    for (int j = 0; j < 10; j++){
      printf("%.9f ", prod2[i*ARRAY_SIZE+j]);
    }
    printf("\n");
  }

  cudaMemPrefetchAsync(prod2, X_BYTES, device, NULL);
  cudaMemPrefetchAsync(AR_COEFF, AR_BYTES, device, NULL);

  matMulNaive<<<numBlocks, threadsPerBlock>>>(AR_COEFF, prod2, in, lagged_cols, ARRAY_SIZE, ARRAY_SIZE, 1);

  cudaDeviceSynchronize();
  err = cudaGetLastError();
  if (err != cudaSuccess) {
      printf("CUDA Error: %s\n", cudaGetErrorString(err));
  }

  // Prefetch result back to CPU
  cudaMemPrefetchAsync(AR_COEFF, X_BYTES, cudaCpuDeviceId, NULL);

  for (int i = 0; i < lagged_cols; i++){
    printf("%.5f\n", AR_COEFF[i]);
  }






  //cudaDeviceSynchronize();

  // write to CSV
  writeCsv(AR_COEFF, lagged_cols);


  //free memory
  cudaFree(in);
  cudaFree(out);
  cudaFree(lagged);
  cudaFree(transposed);
  cudaFree(prod1);
  cudaFree(prod2);
  cudaFree(inverse);


  return 0;
}


Writing lagged.cu


In [5]:
%%shell
nvcc -arch=sm_75 lagged.cu -o lagged

    const int q = 1;
              ^






In [6]:
%%shell
nvprof ./lagged

==1131== NVPROF is profiling process 1131, command: ./lagged
1.00 0.00 0.00 0.00 0.00 
1.00 1.00 0.00 0.00 0.00 
1.00 2.00 1.00 0.00 0.00 
1.00 3.00 2.00 1.00 0.00 
1.00 4.00 3.00 2.00 1.00 
1.00 5.00 4.00 3.00 2.00 
1.00 1.00 5.00 4.00 3.00 
1.00 2.00 1.00 5.00 4.00 
1.00 3.00 2.00 1.00 5.00 
1.00 4.00 3.00 2.00 1.00 

1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 
0.00 1.00 2.00 3.00 4.00 5.00 1.00 2.00 3.00 4.00 
0.00 0.00 1.00 2.00 3.00 4.00 5.00 1.00 2.00 3.00 
0.00 0.00 0.00 1.00 2.00 3.00 4.00 5.00 1.00 2.00 
0.00 0.00 0.00 0.00 1.00 2.00 3.00 4.00 5.00 1.00 

Matrix multiplication result:
10.00 25.00 21.00 18.00 16.00 
25.00 85.00 65.00 51.00 44.00 
21.00 65.00 69.00 53.00 43.00 
18.00 51.00 53.00 60.00 47.00 
16.00 44.00 43.00 47.00 56.00 

Identity Matrix result:
1.00 0.00 0.00 0.00 0.00 
0.00 1.00 0.00 0.00 0.00 
0.00 0.00 1.00 0.00 0.00 
0.00 0.00 0.00 1.00 0.00 
0.00 0.00 0.00 0.00 1.00 

Inverse result:
0.439719111 -0.092227302 -0.016625991 -0.020980148 -0.022794995 




In [25]:
%%writefile ARIMA.cu

#include <stdio.h>
#include <stdlib.h>
#define MAXCHAR 1000

void readCsv(float *dest, size_t n){
    FILE *fp;
    char row[MAXCHAR];
    char *token;
    int ARRAY_SIZE = n;
    int i = 0, col = 1;


    fp = fopen("diff_dataset.txt","r");

    while (feof(fp) != true)
    {
        fgets(row, MAXCHAR, fp);
        token = strtok(row, ",");
        col = 1;
        while(token != NULL)
        {
            if (col == 2)
            {
                dest[i] = atof(token);
                i++;
            }
            token = strtok(NULL, ",");
            col++;
        }
    }
}

void readCsvAR(float *dest, size_t n){
    FILE *fp;
    char row[MAXCHAR];
    char *token;
    int ARRAY_SIZE = n;
    int i = 0, col = 1;


    fp = fopen("AR_COEFF.txt","r");

    while (feof(fp) != true)
    {
        fgets(row, MAXCHAR, fp);
        token = strtok(row, ",");
        col = 1;
        while(token != NULL)
        {
            if (col == 2)
            {
                dest[i] = atof(token);
                i++;
            }
            token = strtok(NULL, ",");
            col++;
        }
    }
}

// FORECASTING
void ARIMA(float *out, float *dataset, float *residuals, float *AR_coeff, float *MA_coeff, size_t p, size_t q, size_t ARRAY_SIZE, size_t Prediction_Size){
    for(int i = 0; i < Prediction_Size; i++){
        float ar = 0;
        float ma = 0;

        for(int j = 0; j < p; j++){
            ar += AR_coeff[j] * dataset[i+ARRAY_SIZE-j];
        }

        for(int j = 0; j < q; j++){
            ma += MA_coeff[j] * residuals[i+ARRAY_SIZE-j];
        }

        // Get new Prediction
        out[i] = ar + ma;

        // Get new residuals
        residuals[i+ARRAY_SIZE] = dataset[i+ARRAY_SIZE] - out[i];
    }
}

int main(){
    //dataset
    const size_t ARRAY_SIZE = 1<<10;
    const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
    const size_t AR_BYTES = 5 * sizeof(float);
    const size_t MA_BYTES = 5 * sizeof(float);
    const size_t Prediction_Size = 10;
    const size_t Prediction_Bytes = Prediction_Size * sizeof(float);
    //const size_t numOfLoops = 30;

    // declare arrays
    float *dataset, *residuals, *AR_coeff, *MA_coeff, *out;
    cudaMallocManaged(&dataset, ARRAY_BYTES);
    cudaMallocManaged(&residuals, ARRAY_BYTES + Prediction_Bytes);
    cudaMallocManaged(&AR_coeff, AR_BYTES);
    cudaMallocManaged(&MA_coeff, MA_BYTES);

    cudaMallocManaged(&out, Prediction_Bytes);

    int device = -1;
    cudaGetDevice(&device);  //get GPU id

    // Initialize Dataset
    for (size_t i=0; i<ARRAY_SIZE; i++)
      dataset[i] = i % 5 + 1.0;

    // Initialize residuals
    for (size_t i=0; i<ARRAY_SIZE; i++)
      residuals[i] = 1 / 5;

    // Initialize AR_COEFF

    AR_coeff[0] = 1.982260;
    AR_coeff[1] = 0.515579;
    AR_coeff[2] = -0.101413;
    AR_coeff[3] = -0.124845;
    AR_coeff[4] = 0.104051;

    // Initialize MA_coeff

    MA_coeff[0] = 0.17950;
    MA_coeff[1] = 0.32938;
    MA_coeff[2] = -0.00733;
    MA_coeff[3] = -0.04431;
    MA_coeff[4] = -0.10458;

    ARIMA(out,dataset,residuals,AR_coeff,MA_coeff,5,5,ARRAY_SIZE,Prediction_Size);

    printf("Predicted Values: \n");
    for(int i = 0; i < Prediction_Size; i++){
      printf("%.2f ", out[i]);
    }
    printf("\n");
    return 0;
}

Overwriting ARIMA.cu


In [26]:
%%shell
nvcc -arch=sm_75 ARIMA.cu -o ARIMA

      int ARRAY_SIZE = n;
          ^


      int ARRAY_SIZE = n;
          ^





In [27]:
%%shell
nvprof ./ARIMA

==7455== NVPROF is profiling process 7455, command: ./ARIMA
Predicted Values: 
1.61 -1.10 0.19 0.42 -0.02 -0.10 0.07 0.02 -0.01 -0.00 
==7455== Profiling application: ./ARIMA
==7455== Profiling result:
No kernels were profiled.
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
      API calls:   99.82%  116.07ms         5  23.213ms  3.4580us  116.03ms  cudaMallocManaged
                    0.15%  177.38us       114  1.5550us     144ns  70.132us  cuDeviceGetAttribute
                    0.01%  14.293us         1  14.293us  14.293us  14.293us  cuDeviceGetName
                    0.01%  5.8830us         1  5.8830us  5.8830us  5.8830us  cuDeviceGetPCIBusId
                    0.00%  1.6780us         1  1.6780us  1.6780us  1.6780us  cudaGetDevice
                    0.00%  1.6740us         3     558ns     174ns  1.1660us  cuDeviceGetCount
                    0.00%  1.2690us         2     634ns     175ns  1.0940us  cuDeviceGet
                    0.00%  1.1630

