In [2]:
import pandas as pd
import time

# Read the file
df = pd.read_csv('dataset.txt')  # or 'your_file.txt' if it's CSV-formatted

# Extract the 'values' column
values = df['value']  # returns a Series

values.head()

Unnamed: 0,value
0,3.526591
1,3.180891
2,3.252221
3,3.611003
4,3.565869


In [6]:
diff = []


loop = 300
total_time = 0.0

for n in range(0,loop):


  start = time.perf_counter()
  for x in range(0,len(values)-1):

    diff.append(values[x+1] - values[x])
  end = time.perf_counter()
  total_time += end-start

print("Total Time for",loop,"loops (Seconds): ",total_time,"\n")
print("Average Time (Seconds): ",total_time/loop,"\n")

for x in range(0,10):
  #print(values[x+1],"\n")

  print(diff[x],"\n")





Total Time for 300 loops (Seconds):  0.20519100800015622 

Average Time (Seconds):  0.0006839700266671873 

-0.3456999999999999 

0.07133000000000012 

0.35878200000000016 

-0.04513400000000001 

0.7405020000000002 

0.7819639999999994 

-2.273815 

0.17129100000000008 

0.21896899999999997 

-0.07720199999999977 



In [10]:
%%writefile cuda_arima.cu

#include <stdio.h>
#include <stdlib.h>
#define MAXCHAR 1000


void readCsv(float *dest, size_t n){
    FILE *fp;
    char row[MAXCHAR];
    char *token;
    int ARRAY_SIZE = n;
    int i = 0, col = 1;


    fp = fopen("dataset.txt","r");

    while (feof(fp) != true)
    {
        fgets(row, MAXCHAR, fp);
        token = strtok(row, ",");
        col = 1;
        while(token != NULL)
        {
            if (col == 2)
            {
                dest[i] = atof(token);
                i++;
            }
            token = strtok(NULL, ",");
            col++;
        }
    }
}


//CUDA convolution kernel
__global__
void differencing(size_t n, float *out, float *in){
    int k;
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    for (k = index; k < n-1; k += stride)
       out[k]= in[k+1]-in[k];
}



int main(){

  //dataset
  const size_t ARRAY_SIZE = 205;
  const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
  const size_t numOfLoops = 300;






  // declare arrays
    float *in, *out;
    cudaMallocManaged(&in, ARRAY_BYTES);
    cudaMallocManaged(&out, ARRAY_BYTES);


    readCsv(in, ARRAY_SIZE);





  //get GPU id
    int device = -1;
    cudaGetDevice(&device);

  // memory advise
   cudaMemAdvise(in, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
   cudaMemAdvise(in, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId);

  //"prefetch data" to create CPU page memory
    cudaMemPrefetchAsync(in,ARRAY_BYTES,cudaCpuDeviceId,NULL);
  //"prefetch data" to create GPU page memory
    cudaMemPrefetchAsync(out,ARRAY_BYTES,device,NULL);

  // initialize array
 // for (size_t i=0; i<ARRAY_SIZE; i++)
   //  in[i] = i % 3 + 1.0;

  // prefetch from CPU to GPU
  cudaMemPrefetchAsync(in,ARRAY_BYTES,device,NULL);

  // CUDA kernel
    size_t numThreads = 1024;
    size_t numBlocks = (ARRAY_SIZE + numThreads-1) / numThreads;

  printf("\n***** Differencing in CUDA with MemAdvise\n");
  printf("numElements = %lu\n", ARRAY_SIZE);
  printf("numBlocks = %lu, numThreads = %lu \n",numBlocks, numThreads);
    for (size_t i=0; i<numOfLoops;i++)
      differencing<<<numBlocks, numThreads>>> (ARRAY_SIZE,out,in);

  // synchronize GPU with CPU
    cudaDeviceSynchronize();

  // prefetch from GPU to CPU
  cudaMemPrefetchAsync(out,ARRAY_BYTES,cudaCpuDeviceId,NULL);

  // error checking routine
    size_t err_count = 0;
    for (size_t i=0; i<ARRAY_SIZE-2; i++){
      if(in[i+1]-in[i] != out[i])
        err_count++;
    }


  //Displays First 10 Elements
  printf("First 10 elements: \n");
  for (int i = 0; i < 10; i++){
    printf("%.4f\n", out[i]);
  }
  printf("...\n...\n...\n");


  //Displays Last 10 Elements
  printf("Last 10 elements: \n");
  for (int i = ARRAY_SIZE-10; i < ARRAY_SIZE; i++){
    printf("%.4f\n", out[i]);
  }


  printf("Error count (Prefetch & MemAdvise): %lu\n", err_count);

  cudaMemAdvise(out, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
  cudaMemAdvise(out, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId);
  //"prefetch data" to create CPU page memory
    cudaMemPrefetchAsync(out,ARRAY_BYTES,cudaCpuDeviceId,NULL);
  // prefetch from CPU to GPU
    cudaMemPrefetchAsync(out,ARRAY_BYTES,device,NULL);




  // synchronize GPU with CPU
    cudaDeviceSynchronize();




  //free memory
    cudaFree(in);
    cudaFree(out);

  return 0;
}

Overwriting cuda_arima.cu


In [11]:
%%shell
nvcc -arch=sm_75 cuda_arima.cu -o cuda_arima

      int ARRAY_SIZE = n;
          ^






In [12]:
%%shell
nvprof ./cuda_arima

==2225== NVPROF is profiling process 2225, command: ./cuda_arima

***** Differencing in CUDA with MemAdvise
numElements = 205
numBlocks = 1, numThreads = 1024 
First 10 elements: 
-0.3457
0.0713
0.3588
-0.0451
0.7405
0.7820
-2.2738
0.1713
0.2190
-0.0772
...
...
...
Last 10 elements: 
1.9867
0.5561
3.8593
-8.0111
-3.3893
4.8427
-0.1952
-3.4808
-19.4317
0.0000
Error count (Prefetch & MemAdvise): 0
==2225== Profiling application: ./cuda_arima
==2225== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  2.4233ms       300  8.0770us  2.8470us  1.5579ms  differencing(unsigned long, float*, float*)
      API calls:   98.54%  245.97ms         2  122.98ms  23.332us  245.95ms  cudaMallocManaged
                    0.74%  1.8403ms       300  6.1340us  4.5930us  197.85us  cudaLaunchKernel
                    0.39%  964.77us         2  482.38us  2.4980us  962.27us  cudaDeviceSynchronize
                    0.15%  379.92us   

