<a href="https://colab.research.google.com/github/jyoti246/alexnet_cuda_cudnn/blob/main/alexnet_jyoti_testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive inside the Colab VM at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install the nvcc4jupyter plugin directly from its GitHub repository.
# NOTE(review): this uses the unauthenticated git:// protocol and no pinned revision --
# confirm it still resolves; if switching to git+https://, pin a commit that still
# provides the `nvcc_plugin` extension loaded in the following cell.
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-unxkd91v
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-unxkd91v
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4307 sha256=95ce12fc3b2951e1bd129f7ffa2d031d11b49a6632b6391550eefa27dfb8dd3d
  Stored in directory: /tmp/pip-ephem-wheel-cache-bgbtckck/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [None]:
# Load the nvcc_plugin IPython extension (presumably the provider of the %%cuda cell magic used below -- confirm).
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [None]:
# Copy the contents of the folder linked below into your own Google Drive:
# https://drive.google.com/drive/folders/19c_twHLSY37kucu4ece392ELkxryJLw4?usp=sharing

In [None]:
%%cuda --name alexnet.cu

#include <cudnn.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cstdlib>
#include <cassert>
#include <cstdlib>
#include <iostream>
#include <string>
#include <random>
#include <cmath>
#include <stdio.h>
#include <bits/stdc++.h>
#include <fstream>
#include <opencv2/opencv.hpp>
#include <assert.h>
#include <math.h>
#include <iomanip>


// Make std:: and cv:: names usable without qualification in the rest of this file.
using namespace std;
using namespace cv;

// Compile-time configuration constants.
// NOTE(review): BATCH_SIZE, OVERLAP_POOLING and BIAS_INIT_VAL are not referenced in the
// part of the file reviewed here -- confirm their consumers before changing them.
#define BATCH_SIZE 1
#define OVERLAP_POOLING 1
#define BIAS_INIT_VAL 0.001
// Upper bound on the block size chosen for the addBiasFC launch (see FCLayers::buildFCLayer).
#define MAX_THREADS_PER_BLOCK 1024 // according to GTX 1050 Ti

// Integer ceiling division: how many chunks of size `den` are needed to cover `num`.
// Computed purely with integer arithmetic as (num + den - 1) / den.
int roundUp(int num, int den){
  const int biased = num + den - 1;
  return biased / den;
}

// Extents of a 4-D tensor as used by the convolution layers.
// Field order (Height, Width, Channels, Batch) is kept stable for aggregate initialisation.
struct convDim_t{
  int Height, Width;  // spatial extents
  int Channels;       // number of feature maps
  int Batch;          // number of images
};

// Pooling window configuration: window size, padding and stride along each spatial axis.
// Field order is kept stable for aggregate initialisation.
struct poolDim_t{
  int Height, Width;              // pooling window
  int padHeight, padWidth;        // padding
  int strideHeight, strideWidth;  // stride
};

// Convolution filter configuration.
// kernelSize is passed to cuDNN as the number of output feature maps (see getConvLayerSpecs).
// Field order is kept stable for aggregate initialisation.
struct kernelDim_t{
  int kernelSize;                      // number of filters
  int kernelHeight, kernelWidth;       // filter extent
  int strideHeight, strideWidth;       // stride
  int padHeight, padWidth;             // padding
  int dilationHeight, dilationWidth;   // dilation
};


// Packs height, width, channel count and batch size into a convDim_t.
convDim_t setConvSpecs(int ht, int wd, int ch, int bt){
  // Aggregate order follows the struct: Height, Width, Channels, Batch.
  convDim_t dims = {ht, wd, ch, bt};
  return dims;
}

// Packs the filter count, filter extent, stride, padding and dilation into a kernelDim_t.
kernelDim_t setKernelSpecs(int size, int fheight, int fwidth, int sheight, int swidth, int pheight, int pwidth, int dheight, int dwidth){
  // Aggregate order follows the struct declaration:
  // kernelSize, kernel H/W, stride H/W, pad H/W, dilation H/W.
  kernelDim_t spec = {
    size,
    fheight, fwidth,
    sheight, swidth,
    pheight, pwidth,
    dheight, dwidth
  };
  return spec;
}

// Returns one of two fixed pooling configurations:
//   flagOverlap == true  -> 3x3 window, padding 0, stride 2
//   flagOverlap == false -> 2x2 window, padding 1, stride 2
poolDim_t setPoolSpecs(bool flagOverlap){
  poolDim_t spec;

  // The stride is the same in both configurations.
  spec.strideHeight = 2;
  spec.strideWidth = 2;

  const int window = flagOverlap ? 3 : 2;
  const int padding = flagOverlap ? 0 : 1;

  spec.Height = window;
  spec.Width = window;
  spec.padHeight = padding;
  spec.padWidth = padding;

  return spec;
}

// Evaluates a cuDNN call once and terminates the program with a diagnostic
// (source line + cuDNN error string) unless it returned CUDNN_STATUS_SUCCESS.
//
// The body is wrapped in do { ... } while (0) so that `checkCUDNN(...);` is exactly one
// statement and therefore safe inside un-braced if/else. The local uses a
// reserved-looking name so it cannot shadow a caller variable called `status` that
// appears inside `expression`.
#define checkCUDNN(expression)                                          \
do {                                                                    \
  const cudnnStatus_t checkCudnnStatus_ = (expression);                 \
  if (checkCudnnStatus_ != CUDNN_STATUS_SUCCESS) {                      \
    std::cerr << "Error on line " << __LINE__ << ": "                   \
              << cudnnGetErrorString(checkCudnnStatus_) << std::endl;   \
    std::exit(EXIT_FAILURE);                                            \
  }                                                                     \
} while (0)

// Global scaling factors (1 and 0).
// NOTE(review): presumably handed to the layer constructors as their alpha/beta blend
// factors -- the call sites are outside the reviewed part of the file; confirm.
float alpha = 1.0f;
float beta = 0.0f;

// One convolution stage: convolution + bias + activation, optionally followed by pooling.
// Owns the cuDNN descriptors and the buffers used by getConvLayerSpecs(),
// buildConvLayer() and fwdProp(). Also serves as the base class of FCLayers.
//
// Every member has a default initializer so that an object created through the default
// constructor (which FCLayers' constructors use implicitly) never holds indeterminate
// pointers, handles or flags.
class ConvLayers{

public:

  float* inputTensor{nullptr};   // layer input; supplied by the caller, not allocated here
  float* kernelTensor{nullptr};  // filter weights; allocated in buildConvLayer()
  float* biasTensor{nullptr};    // bias values; allocated in buildConvLayer()
  int layerIndex{0};
  float alph{0.0f}, bet{0.0f};   // alpha / beta scaling factors handed to the cuDNN calls
  cudnnHandle_t CUDNN{};
  cudnnTensorFormat_t TensorFormat{};
  cudnnDataType_t DataType{};
  cudnnConvolutionMode_t ConvMode{};
  cudnnActivationMode_t ActivationMode{};
  cudnnPoolingMode_t PoolingMode{};

  convDim_t outDims{};     // extents of the layer output (after pooling when POOL is set)
  convDim_t inDims{};      // extents of inputTensor
  kernelDim_t kernelDims{};
  poolDim_t poolDims{};    // only meaningful when POOL is true

  // Random number machinery; declaration order matters because gen is seeded from rd.
  random_device rd{};
  mt19937 gen{rd()};
  normal_distribution<> d{0,1};

  float* conv_output{nullptr}; // output of convolution operation
  float* poolTensor{nullptr};  // output of pooling layer, if exists
  float* outputTensor{nullptr};
  void* d_workspace{nullptr};
  size_t workspaceBytes{0};

  int convOutDimHeight{0}, convOutDimWidth{0}, convOutDimChannels{0}, convOutDimBatchSize{0};
  int poolOutBatchSize{0}, poolOutChannels{0}, poolOutHeight{0}, poolOutWidth{0};

  bool POOL{false};  // True if pooling is to be done in this layer, otherwise False

  cudnnTensorDescriptor_t input_descriptor{};
  cudnnFilterDescriptor_t kernel_descriptor{};
  cudnnConvolutionDescriptor_t convolution_descriptor{};
  cudnnTensorDescriptor_t bias_descriptor{};
  cudnnTensorDescriptor_t convOutput_descriptor{};
  cudnnConvolutionFwdAlgo_t convolution_algorithm{};
  cudnnActivationDescriptor_t activation_descriptor{};
  cudnnPoolingDescriptor_t pooling_descriptor{};
  cudnnTensorDescriptor_t poolTensor_descriptor{};

  // Leaves every member at its default; the object must be configured before use.
  ConvLayers(){}

  // Convolution layer without pooling.
  ConvLayers( int index,  float*  inT, convDim_t inDim, kernelDim_t kdims, float a, float b,
    cudnnTensorFormat_t t_format, cudnnDataType_t d_type, cudnnConvolutionMode_t c_mode, cudnnActivationMode_t ActMode, cudnnHandle_t cud){

    this->POOL = false;
    this->inputTensor = inT;
    this->inDims = inDim;
    this->kernelDims = kdims;

    this->layerIndex = index;
    this->alph = a; this->bet = b;
    this->TensorFormat = t_format;
    this->DataType = d_type;
    this->ConvMode = c_mode;
    this->ActivationMode = ActMode;
    this->CUDNN = cud;
  }

  // Convolution layer followed by pooling: same setup as above plus the pooling
  // window (pdims) and pooling mode.
  ConvLayers( int index,  float*  inT, convDim_t inDim, kernelDim_t kdims, poolDim_t pdims, float a, float b,
    cudnnTensorFormat_t t_format, cudnnDataType_t d_type, cudnnConvolutionMode_t c_mode, cudnnActivationMode_t ActMode,cudnnPoolingMode_t poolMode, cudnnHandle_t cud)
    : ConvLayers(index, inT, inDim, kdims, a, b, t_format, d_type, c_mode, ActMode, cud){

    this->POOL = true;
    this->poolDims = pdims;
    this->PoolingMode = poolMode;
  }

  // Creates the cuDNN descriptors and computes the output extents.
  void getConvLayerSpecs();

  // Allocates the buffers and uploads bias and filter weights.
  void buildConvLayer(float* inbias, vector< vector<vector< vector <float> > > >& inkernal);

  // Runs convolution, bias add, activation and (optionally) pooling.
  void fwdProp();

  void bwdProp();

};


void ConvLayers::getConvLayerSpecs(){



  checkCUDNN(cudnnCreateTensorDescriptor(&input_descriptor));
  checkCUDNN(cudnnSetTensor4dDescriptor(input_descriptor,
                                          TensorFormat,
                                          DataType,
                                          inDims.Batch,
                                          inDims.Channels, 
                                          inDims.Height, 
                                          inDims.Width));

   
  // --- Build the Kernel which is going to convolve over the input ---//
  
  
  checkCUDNN(cudnnCreateFilterDescriptor(&kernel_descriptor));
  checkCUDNN(cudnnSetFilter4dDescriptor(kernel_descriptor,
                                        DataType,
                                        TensorFormat,
                                        kernelDims.kernelSize,
                                        inDims.Channels,
                                        kernelDims.kernelHeight,
                                        kernelDims.kernelWidth));

  // --- Build the Convolution descriptor --- //

  
  checkCUDNN(cudnnCreateConvolutionDescriptor(&convolution_descriptor));
  checkCUDNN(cudnnSetConvolution2dDescriptor(convolution_descriptor,
                                            kernelDims.padHeight,
                                            kernelDims.padWidth,
                                            kernelDims.strideHeight,
                                            kernelDims.strideWidth,
                                            kernelDims.dilationHeight,
                                            kernelDims.dilationWidth,
                                            ConvMode,
                                            DataType));

  // --- This function returns the dimensions of the resulting 4D tensor of a 2D convolution,     //
  // ---given the convolution descriptor, the input tensor descriptor and the filter descriptor --- //

  checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convolution_descriptor,
                                                 input_descriptor,
                                                 kernel_descriptor,
                                                 &convOutDimBatchSize,
                                                 &convOutDimChannels,
                                                 &convOutDimHeight,
                                                 &convOutDimWidth));
  
  outDims.Height = convOutDimHeight;
  outDims.Width = convOutDimWidth;
  outDims.Channels = convOutDimChannels;
  outDims.Batch = convOutDimBatchSize;
  
  
  checkCUDNN(cudnnCreateTensorDescriptor(&bias_descriptor));
  checkCUDNN(cudnnSetTensor4dDescriptor(bias_descriptor,
                                            TensorFormat,
                                            DataType,
                                            convOutDimBatchSize,
                                           convOutDimChannels,
                                           convOutDimHeight,
                                           convOutDimWidth));

  // ---Build the output Descriptor ---//

  
  checkCUDNN(cudnnCreateTensorDescriptor(&convOutput_descriptor));
  checkCUDNN(cudnnSetTensor4dDescriptor(convOutput_descriptor,
                                        TensorFormat,
                                        DataType, 
                                        convOutDimBatchSize,
                                        convOutDimChannels,
                                        convOutDimHeight,
                                        convOutDimWidth));

  // -- Size references for next conv layer --- //

  

  // --- Determine the Convolution algorithm to be used in CNN layer ---//

  
  checkCUDNN(cudnnGetConvolutionForwardAlgorithm(CUDNN,
                                        input_descriptor,
                                        kernel_descriptor,
                                        convolution_descriptor,
                                        convOutput_descriptor,
                                        CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
                                        /*memoryLimitInBytes=*/0,
                                        &convolution_algorithm));

  
  checkCUDNN(cudnnCreateActivationDescriptor(&activation_descriptor));
  checkCUDNN(cudnnSetActivationDescriptor(activation_descriptor,
                                        ActivationMode,
                                        CUDNN_PROPAGATE_NAN,
                                        /*relu_coef=*/0));


  /*
  Do some adjustment if the output dimension of pooling layer is not an integer (which will give an error) 
  Each dimension h and w of the output images is computed as followed:
  outputDim = 1 + (inputDim + 2*padding - windowDim)/poolingStride;
  */

  // check if the user has asked to create a pooling layer for this conv layer
  if(POOL){

    /*if((outDims.Height - poolDims.Height)%2 != 0){
      poolDims.Height = (poolDims.Height == 2) ? 3 : 2;
    }

    if((outDims.Width - poolDims.Width)%2 != 0){
      poolDims.Width = (poolDims.Width == 2) ? 3 : 2;
    }*/

    
    checkCUDNN(cudnnCreatePoolingDescriptor(&pooling_descriptor));
    checkCUDNN(cudnnSetPooling2dDescriptor(pooling_descriptor,
                                            PoolingMode,
                                            CUDNN_NOT_PROPAGATE_NAN,
                                            poolDims.Height,
                                            poolDims.Width,
                                            poolDims.padHeight,
                                            poolDims.padWidth,
                                            poolDims.strideHeight,
                                            poolDims.strideWidth));

    checkCUDNN(cudnnGetPooling2dForwardOutputDim(pooling_descriptor,
                                              convOutput_descriptor,
                                                  &poolOutBatchSize,
                                                  &poolOutChannels,
                                                  &poolOutHeight,
                                                  &poolOutWidth));

    
    checkCUDNN(cudnnCreateTensorDescriptor(&poolTensor_descriptor));  
    checkCUDNN(cudnnSetTensor4dDescriptor(poolTensor_descriptor,
                                          TensorFormat,
                                          DataType,
                                          poolOutBatchSize,
                                          poolOutChannels,
                                          poolOutHeight,
                                          poolOutWidth));

    outDims.Batch = poolOutBatchSize;
    outDims.Channels = poolOutChannels;
    outDims.Height = poolOutHeight;
    outDims.Width = poolOutWidth;

  }
  cout<<"Input image size "<<inDims.Batch<<" X "<<inDims.Height<<" X "<<inDims.Width<<" X "<<inDims.Channels<<endl;
  cout<<"Output image size "<<outDims.Batch<<" X "<<outDims.Height<<" X "<<outDims.Width<<" X "<<outDims.Channels<<endl;
}

  void ConvLayers::buildConvLayer(float* inbias, vector< vector<vector< vector <float> > > >& inkernal){

  	// --- Set up the memory required for the convolution --- //
  
	  
    checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(CUDNN,
                                                     input_descriptor,
                                                     kernel_descriptor,
                                                     convolution_descriptor,
                                                     convOutput_descriptor,
                                                     convolution_algorithm,
                                                     &workspaceBytes));

    // Initialize bias and kernel tensors here //

    // Bias
    cudaMallocManaged(&biasTensor, outDims.Channels * outDims.Batch * sizeof(float));
    cudaMemcpy(biasTensor,inbias,outDims.Channels * outDims.Batch * sizeof(float),cudaMemcpyHostToDevice);
    
    float hkernel[kernelDims.kernelSize][inDims.Channels][kernelDims.kernelHeight][kernelDims.kernelWidth];

    for(int i = 0; i < kernelDims.kernelSize; i++){
      for(int j = 0; j < inDims.Channels; j++){
        for(int k = 0; k < kernelDims.kernelHeight; k++){
          for(int l = 0; l < kernelDims.kernelWidth; l++){
            hkernel[i][j][k][l] = inkernal[i][j][k][l]; 
          }
        }
      }
    }

    //float* hkernel = (float*)malloc(sizeof(float)*kernelDims.kernelSize*inDims.Channels*kernelDims.kernelHeight*kernelDims.kernelWidth);
    //float hkernel[kernelDims.kernelSize*inDims.Channels*kernelDims.kernelHeight*kernelDims.kernelWidth];

    //for(int i = 0; i < kernelDims.kernelSize; i++){
    //   for(int j = 0; j < inDims.Channels; j++){
    //     for(int k = 0; k < kernelDims.kernelHeight; k++){
    //       for(int l = 0; l < kernelDims.kernelWidth; l++){
    //        hkernel[i*inDims.Channels * kernelDims.kernelHeight * kernelDims.kernelWidth+j* kernelDims.kernelHeight * kernelDims.kernelWidth+k*kernelDims.kernelWidth+l] = inkernal[i][j][k][l]; 
    //      }
    //    }
    //  }
    //}


    cudaMallocManaged(&kernelTensor, kernelDims.kernelSize *inDims.Channels* kernelDims.kernelHeight * kernelDims.kernelWidth * sizeof(float));
    cudaMemcpy(kernelTensor,hkernel,kernelDims.kernelSize*inDims.Channels * kernelDims.kernelHeight * kernelDims.kernelWidth * sizeof(float),cudaMemcpyHostToDevice);
    
    // --- Allocate Memory in the GPU for layer operation --- //    
    cudaMallocManaged(&d_workspace, workspaceBytes);
    int convout_bytes = convOutDimBatchSize * convOutDimChannels * convOutDimHeight * convOutDimWidth * sizeof(float);    
 
    // memory required for storing output of the conv operation (after adding bias)
    cudaMallocManaged(&conv_output, convout_bytes);
    cudaMemset(conv_output, 0, convout_bytes);

    // set up memory for pool tensor if pool is true
    if(POOL){
      int poolSize =  outDims.Batch * outDims.Channels * outDims.Height * outDims.Width * sizeof(float);
      cudaMallocManaged(&poolTensor, poolSize); 
      cudaMemset(poolTensor, 0, poolSize);
    }

    /*
    cerr << "Workspace size: " << (workspaceBytes / 1048576.0) << "MB" << endl;
    */
}


void ConvLayers::fwdProp(){
  //float testprint[kernelDims.kernelHeight * kernelDims.kernelWidth * inDims.Channels * kernelDims.kernelSize];
  //  cudaMemcpy(testprint,kernelTensor,kernelDims.kernelHeight * kernelDims.kernelWidth * inDims.Channels * kernelDims.kernelSize* sizeof(float),cudaMemcpyDeviceToHost);
  //for(int i=0;i<kernelDims.kernelHeight * kernelDims.kernelWidth * inDims.Channels * kernelDims.kernelSize;i++){
    //  cout<<testprint[i]<<" ";
  //}cout<<endl;

  /*
  float testprint[inDims.Height * inDims.Width * inDims.Channels * inDims.Batch];
  cudaMemcpy(testprint,inputTensor,inDims.Height * inDims.Width * inDims.Channels * inDims.Batch* sizeof(float),cudaMemcpyDeviceToHost);
  //cout<<"Start   "<<inDims.Height << inDims.Width << inDims.Channels << inDims.Batch<<endl;
  for(int i=0;i<inDims.Height * inDims.Width * inDims.Channels * inDims.Batch;i++){
     cout<<testprint[i]<<" ";
  }cout<<endl;
  */

  // cout<<workspaceBytes<<endl;
  checkCUDNN(cudnnConvolutionForward(CUDNN,
                                      &alph,
                                      input_descriptor,
                                      inputTensor,
                                      kernel_descriptor,
                                      kernelTensor,
                                      convolution_descriptor,
                                      convolution_algorithm,
                                      d_workspace,
                                      workspaceBytes,
                                      &bet,
                                      convOutput_descriptor,
                                      conv_output));
    
        
  checkCUDNN(cudnnAddTensor(CUDNN, &alph, bias_descriptor,
                                  biasTensor ,&alph, convOutput_descriptor, conv_output));
    
  // float testprint1[convOutDimHeight * convOutDimWidth * convOutDimChannels * convOutDimBatchSize];
  // cudaMemcpy(testprint1,conv_output,convOutDimHeight * convOutDimWidth * convOutDimChannels * convOutDimBatchSize* sizeof(float),cudaMemcpyDeviceToHost);
  // for(int i=0;i<convOutDimHeight * convOutDimWidth * convOutDimChannels * convOutDimBatchSize;i++){
  //     cout<<testprint1[i]<<" ";
  // }cout<<endl;

  checkCUDNN(cudnnActivationForward(CUDNN,
                                      activation_descriptor,
                                      &alph,
                                      convOutput_descriptor,
                                      conv_output,
                                      &bet,
                                      convOutput_descriptor,
                                      conv_output));

  // cudaMemcpy(testprint1,conv_output,convOutDimHeight * convOutDimWidth * convOutDimChannels * convOutDimBatchSize* sizeof(float),cudaMemcpyDeviceToHost);
  // for(int i=0;i<convOutDimHeight * convOutDimWidth * convOutDimChannels * convOutDimBatchSize;i++){
  //     cout<<testprint1[i]<<" ";
  // }cout<<endl;

  if(POOL){
      //cout<<"AM in POOL"<<endl;
    checkCUDNN(cudnnPoolingForward(CUDNN,
                                  pooling_descriptor,
                                  &alph,
                                  convOutput_descriptor,
                                  conv_output,
                                  &bet,
                                  poolTensor_descriptor,
                                  poolTensor));
    cudaMallocManaged(&outputTensor,outDims.Height*outDims.Width*outDims.Channels*sizeof(float));
    cudaMemcpy(outputTensor,poolTensor,outDims.Height*outDims.Width*outDims.Channels*sizeof(float),cudaMemcpyDeviceToDevice);

  }
  else{
    cudaMallocManaged(&outputTensor,outDims.Height*outDims.Width*outDims.Channels*sizeof(float));
    cudaMemcpy(outputTensor,conv_output,outDims.Height*outDims.Width*outDims.Channels*sizeof(float),cudaMemcpyDeviceToDevice);  	
  }


}


// Backward pass: currently a no-op (no gradients are computed).
void ConvLayers::bwdProp(){}





// Element-wise in-place add: res[i] += bias[i] for every i in [0, dim1*dim2).
// One thread handles one element; threads whose global index falls outside the range do nothing.
__global__
void addBiasFC(int dim1,int dim2, float* bias, float* res){

	const int total = dim1*dim2;
	const int gid = threadIdx.x + blockDim.x * blockIdx.x;

	if(gid >= total){
		return;
	}
	res[gid] += bias[gid];
}


// Fully connected layer: outputTensor = weights^T * inputTensor (cuBLAS GEMM) + bias,
// optionally followed by a cuDNN activation. Re-uses the buffers, handle and activation
// descriptor members inherited from ConvLayers.
// Note: inDims / outDims declared here are plain element counts and hide the
// convDim_t members of the same name in the base class.
class FCLayers : public ConvLayers{

  public:

    cublasHandle_t CUBLAS{};
    int inDims{0};    // number of input features per sample
    int outDims{0};   // number of output features per sample
    int batch{0};     // number of samples
    float *weights{nullptr};
    float *dcost{nullptr};
    float *labels{nullptr};
    // float *p_act{nullptr};
    float *nabla_w{nullptr};
    float *nabla_b{nullptr};
    float* ones{nullptr};
    float* d_intermediate{nullptr};
    float* dReLU_tensor{nullptr};

    bool last{false};  // set by the constructor overload that receives labels
    // float* outputTensor{nullptr};
    cudnnTensorDescriptor_t outputTensor_descriptor{};

    // Layer without labels (`last` stays false).
    FCLayers( float* inputTensor_, int inDims_, int batch,int outDims_, float alpha, float beta, cudnnActivationMode_t ActivationMode_,
    	cudnnTensorFormat_t t_format, cudnnDataType_t d_type,cublasHandle_t CUBLAS_,cudnnHandle_t CUDNN){

      this->last = false;
      this->inputTensor = inputTensor_;
      this->inDims = inDims_;
      this->batch = batch;
      this->outDims = outDims_;
      this->CUBLAS = CUBLAS_;
      this->ActivationMode = ActivationMode_;
      this->alph = alpha;
      this->bet = beta;
      this->CUDNN = CUDNN;
      this->DataType = d_type;
      this->TensorFormat = t_format;
    }

    // Layer with labels: same setup, marks the layer as `last` and keeps the labels
    // pointer (it used to be accepted but silently dropped).
    FCLayers( float* inputTensor_, int inDims_, int batch,int outDims_, float alpha, float beta, cudnnActivationMode_t ActivationMode_,
    	cudnnTensorFormat_t t_format, cudnnDataType_t d_type,cublasHandle_t CUBLAS_,cudnnHandle_t CUDNN, float* labels)
      : FCLayers(inputTensor_, inDims_, batch, outDims_, alpha, beta, ActivationMode_, t_format, d_type, CUBLAS_, CUDNN){

      this->last = true;
      this->labels = labels;
    }

    // Creates the activation and output tensor descriptors.
    void getFCLayerSpecs();
    // Uploads weights and bias, allocates the output and picks the addBiasFC launch shape.
    void buildFCLayer(float* inbias, vector< vector <float> >& inkernal);
    // GEMM + bias add, followed by the activation when isAct is true.
    void fwdProp(bool isAct);
    void bwdProp();

	private:
		// Launch configuration of addBiasFC; set by buildFCLayer().
		int numBlocks{0};
		int numThreads{0};


};


void FCLayers::getFCLayerSpecs(){

	checkCUDNN(cudnnCreateActivationDescriptor(&activation_descriptor));
	checkCUDNN(cudnnSetActivationDescriptor(activation_descriptor,
		                                ActivationMode,
		                                CUDNN_PROPAGATE_NAN,
	    	                            /*relu_coef=*/0));

	checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor_descriptor));
  checkCUDNN(cudnnSetTensor4dDescriptor(outputTensor_descriptor,
                                      TensorFormat,
                                      DataType,
                                      batch, outDims, 1, 1));


}

void FCLayers::buildFCLayer(float* inbias, vector< vector <float> >& inkernal){

	// Initialization of weight matrix

	
	float *hweights;
	hweights = (float*)malloc(sizeof(float)*inDims*outDims);
	for(int i = 0; i < outDims; i++){
          for(int j=0;j<inDims;j++)
	 hweights[i*outDims + j] = inkernal[i][j];
	}

	cudaMallocManaged(&weights, inDims*outDims*sizeof(float));
	cudaMemcpy(weights, hweights, inDims * outDims * sizeof(float),cudaMemcpyHostToDevice);

	free(hweights);
	// initialization of bias vector
	cudaMallocManaged(&biasTensor,outDims*batch*sizeof(float));
    cudaMemcpy(biasTensor,inbias,outDims*batch*sizeof(float),cudaMemcpyHostToDevice);
    

	cudaMallocManaged(&outputTensor,outDims*batch*sizeof(float));
	// cudaMemset(outputTensor,0,outDims*batch*sizeof(float)); ----- If not using cudaMemset(),
	// ensure that while performing any operation on it, its multiplicatio coeff is 0, like in cublasSgemm() below, bet is 0 for the same reason

	// Decide the number of threads and blocks based on the size of the output of the FC layer (before adding the bias) for addBiasFC kernel
	if(batch*outDims <= MAX_THREADS_PER_BLOCK){
		numThreads = batch*outDims;
		numBlocks = 1;
	}
	else{
		numBlocks = roundUp(batch*outDims,MAX_THREADS_PER_BLOCK);
		numThreads = MAX_THREADS_PER_BLOCK;
	}

	// for back prop 
	
}


void FCLayers::fwdProp(bool isAct){

	

	cublasSgemm(CUBLAS,
				CUBLAS_OP_T,CUBLAS_OP_N,
				outDims, batch, inDims,
				&alph,
				weights,inDims,
				inputTensor,inDims,
				&bet,
				outputTensor,outDims);
	
	addBiasFC<<<numBlocks,numThreads>>>(outDims,batch,biasTensor,outputTensor);
	if(isAct)
  checkCUDNN(cudnnActivationForward(CUDNN,
									activation_descriptor,
									&alph,
									outputTensor_descriptor,
									outputTensor,
									&bet,
									outputTensor_descriptor,
									outputTensor));
	

}








// Reads whitespace-separated numbers from a text file into a 4-D weight container and a
// bias array. The file is interpreted as x records, each made of one bias value followed
// by y*z*w weights stored in [y][z][w] order.
//   ker      : destination, indexed [x][y][z][w]; must already have that shape.
//   bia      : destination for the bias values; must hold at least x floats.
//   file_loc : path of the text file.
// If the file cannot be opened a message is written to stderr and nothing is modified.
// Tokens beyond the x expected records are ignored (with a warning) instead of being
// written past the end of `bia` / `ker`.
// std::stof may throw std::invalid_argument / std::out_of_range on a malformed token.
void fill_vec(std::vector<std::vector<std::vector<std::vector<float> > > > &ker, float bia[], std::string file_loc, int x, int y, int z, int w){

  std::ifstream File(file_loc);
  if(!File.is_open()){
    std::cerr << "fill_vec: cannot open weight file '" << file_loc << "'" << std::endl;
    return;
  }

  // One record = one bias value followed by y*z*w weights.
  const long length = static_cast<long>(y) * z * w + 1;
  long count = 0;
  int b_c = 0, i = 0, j = 0, k = 0, l = 0;

  std::string word;
  while(File >> word){
      if(count % length == 0){
          if(b_c >= x){
              std::cerr << "fill_vec: '" << file_loc << "' holds more values than expected; ignoring the rest" << std::endl;
              break;
          }
          bia[b_c++] = std::stof(word);
      }
      else{
          if(i >= x){
              std::cerr << "fill_vec: '" << file_loc << "' holds more values than expected; ignoring the rest" << std::endl;
              break;
          }
          ker[i][j][k][l++] = std::stof(word);
          // Carry into the next-outer index whenever an index reaches its extent.
          if(l == w){ k++; l = 0; }
          if(k == z){ j++; k = 0; }
          if(j == y){ i++; j = 0; }
      }
      count++;
  }
  // File is closed by its destructor.
}

// Reads whitespace-separated numbers from a text file into a 2-D weight container and a
// bias array. The file is interpreted as x records, each made of one bias value followed
// by y weights.
//   ker      : destination, indexed [x][y]; must already have that shape.
//   bia      : destination for the bias values; must hold at least x floats.
//   file_loc : path of the text file.
// If the file cannot be opened a message is written to stderr and nothing is modified.
// Tokens beyond the x expected records are ignored (with a warning) instead of being
// written past the end of `bia` / `ker`.
// std::stof may throw std::invalid_argument / std::out_of_range on a malformed token.
void fill_vec(std::vector<std::vector<float> > &ker, float bia[], std::string file_loc, int x, int y){

  std::ifstream File(file_loc);
  if(!File.is_open()){
    std::cerr << "fill_vec: cannot open weight file '" << file_loc << "'" << std::endl;
    return;
  }

  // One record = one bias value followed by y weights.
  const long length = static_cast<long>(y) + 1;
  long count = 0;
  int b_c = 0, i = 0, j = 0;

  std::string word;
  while(File >> word){
      if(count % length == 0){
          if(b_c >= x){
              std::cerr << "fill_vec: '" << file_loc << "' holds more values than expected; ignoring the rest" << std::endl;
              break;
          }
          bia[b_c++] = std::stof(word);
      }
      else{
          if(i >= x){
              std::cerr << "fill_vec: '" << file_loc << "' holds more values than expected; ignoring the rest" << std::endl;
              break;
          }
          ker[i][j++] = std::stof(word);
          if(j == y){ i++; j = 0; }
      }
      count++;
  }
  // File is closed by its destructor.
}

// Variant of the 2-D fill_vec that widens every row while loading it: the file stores
// old_y weights per record (after the bias value); in the destination row every run of
// 36 consecutive weights is followed by 13 positions that are skipped (left untouched),
// so each run occupies 49 destination slots.
//   ker      : destination, indexed [x][y]; must already have that shape.
//   bia      : destination for the bias values; must hold at least x floats.
//   file_loc : path of the text file.
//   y        : length of a destination row (after widening).
//   old_y    : number of weights per record in the file.
// If the file cannot be opened a message is written to stderr and nothing is modified.
// Reading stops (with a warning) rather than writing outside `bia` / `ker` when the file
// holds more data than x records or when a row would run past y.
// std::stof may throw std::invalid_argument / std::out_of_range on a malformed token.
void fill_vec_special(std::vector<std::vector<float> > &ker, float bia[], std::string file_loc, int x, int y, int old_y){

  std::ifstream File(file_loc);
  if(!File.is_open()){
    std::cerr << "fill_vec_special: cannot open weight file '" << file_loc << "'" << std::endl;
    return;
  }

  const int kRunLength = 36;  // weights copied back to back
  const int kSkipLength = 13; // destination slots skipped after each run

  // One record = one bias value followed by old_y weights.
  const long length = static_cast<long>(old_y) + 1;
  long count = 0;
  int b_c = 0, i = 0, j = 0, j_fill = 0;

  std::string word;
  while(File >> word){
      if(count % length == 0){
          if(b_c >= x){
              std::cerr << "fill_vec_special: '" << file_loc << "' holds more values than expected; ignoring the rest" << std::endl;
              break;
          }
          bia[b_c++] = std::stof(word);
      }
      else{
          if(i >= x || j_fill >= y){
              std::cerr << "fill_vec_special: '" << file_loc << "' does not fit the destination shape; ignoring the rest" << std::endl;
              break;
          }
          ker[i][j_fill++] = std::stof(word);
          j++;
          if(j % kRunLength == 0){
              j_fill += kSkipLength;
          }
          if(j_fill == y){ i++; j_fill = 0; j = 0; }
      }
      count++;
  }
  // File is closed by its destructor.
}

// Writes the first l values of bia to stdout, each followed by a space,
// then two newline characters.
void print(float bia[],int l){
    int idx = 0;
    while(idx < l){
      std::cout << bia[idx] << ' ';
      ++idx;
    }
    std::cout << "\n\n";
}
// Writes the top-left x-by-y part of ker to stdout: one line per row, every value
// followed by a space, and one extra newline after the last row.
void print(std::vector<std::vector<float> > &ker, int x, int y){
    for(int row = 0; row < x; ++row){
        int col = 0;
        while(col < y){
          std::cout << ker[row][col] << ' ';
          ++col;
        }
        std::cout << '\n';
    }
    std::cout << '\n';
}
// Dump an x * y * z * w slice of a 4-D kernel on a single line of stdout.
// Separators mark where each dimension ends: "}}}" after every run of w
// values, "___" after every z group, "..." after every y group, and one
// newline at the very end.
void print(std::vector<std::vector<std::vector<std::vector<float> > > > &ker, int x, int y, int z, int w){
    int a = 0;
    while (a < x) {
        int b = 0;
        while (b < y) {
            int c = 0;
            while (c < z) {
                int d = 0;
                while (d < w) {
                    std::cout << ker[a][b][c][d] << ' ';
                    ++d;
                }
                std::cout << "}}}";
                ++c;
            }
            std::cout << "___";
            ++b;
        }
        std::cout << "...";
        ++a;
    }
    std::cout << '\n';
}


// Run one forward pass of the AlexNet pipeline on a single preprocessed image.
//
// Steps visible in this function:
//   1. allocate host containers for the five convolution kernels, the three
//      fully connected weight matrices and the eight bias arrays, and fill
//      them from the Version_7be5be79 text files with fill_vec;
//   2. create a cuDNN and a cuBLAS handle and copy img into a managed buffer;
//   3. build and run conv layers 1-5 (ConvLayers) and FC layers 1-3 (FCLayers),
//      each one consuming the previous layer's outputTensor;
//   4. copy 1000 floats from the last layer's outputTensor into a malloc'ed
//      host buffer.
//
// img:     host pointer from which 224*224*3*BATCH_SIZE floats are copied;
//          presumably channel-major, since every layer is built with
//          CUDNN_TENSOR_NCHW -- TODO confirm against ConvLayers.
// returns: malloc'ed array of 1000 floats; nothing here frees it, so the
//          caller has to call free().
//
// NOTE(review): this function never destroys the cuDNN/cuBLAS handles and
// never frees input_layer1 or labels; confirm whether the layer classes take
// ownership before adding cleanup.
float* processImgAlexnet(float *img){

    // cout<<"Inside Alexnet\n\nInput Image"<<endl;
    // for(int i=0;i<224*224*3;i++){
    //     cout<<img[i]<<" ";
    //     //img[i]=1.0;
    // }cout<<"\n\n";


    // Kernels are indexed [output channel][input channel][row][column].
    vector< vector<vector< vector <float> > > > 
        conv1(64,vector<vector< vector <float> > >(3,vector< vector <float> >(11,vector <float>(11,0.0)))),
        conv2(192,vector<vector< vector <float> > >(64,vector< vector <float> >(5,vector <float>(5,0.0)))),
        conv3(384,vector<vector< vector <float> > >(192,vector< vector <float> >(3,vector <float>(3,0.0)))),
        conv4(256,vector<vector< vector <float> > >(384,vector< vector <float> >(3,vector <float>(3,0.0)))),
        conv5(256,vector<vector< vector <float> > >(256,vector< vector <float> >(3,vector <float>(3,0.0))))
    ;

    // Fully connected weights are indexed [output unit][input unit].
    vector< vector <float> > 
            lin6(4096,  vector <float> (9216, 0.0)),
            lin7(4096,  vector <float> (4096, 0.0)),
            lin8(1000,  vector <float> (4096, 0.0));
        //9216 appended with zeroes from 36 to 49
    // One bias per output channel/unit of the matching layer; all live on the stack.
    float bias1[64]={0.0}, bias2[192]={0.0}, bias3[384]={0.0}, bias4[256]={0.0}, bias5[256]={0.0},bias6[4096]={0.0}, bias7[4096]={0.0}, bias8[1000]={0.0};
  
    //Weights Version_4df8aa71
    /*
    fill_vec(conv1, bias1, "/content/drive/MyDrive/HP3/AlexNet/Weights/Version_4df8aa71/feature1.txt", 64, 3, 11, 11);
    fill_vec(conv2, bias2, "/content/drive/MyDrive/HP3/AlexNet/Weights/Version_4df8aa71/feature2.txt", 192, 64, 5, 5);
    fill_vec(conv3, bias3, "/content/drive/MyDrive/HP3/AlexNet/Weights/Version_4df8aa71/feature3.txt", 384, 192, 3, 3);
    fill_vec(conv4, bias4, "/content/drive/MyDrive/HP3/AlexNet/Weights/Version_4df8aa71/feature4.txt", 256, 384, 3, 3);
    fill_vec(conv5, bias5, "/content/drive/MyDrive/HP3/AlexNet/Weights/Version_4df8aa71/feature5.txt", 256, 256, 3, 3);
    fill_vec(lin6, bias6, "/content/drive/MyDrive/HP3/AlexNet/Weights/Version_4df8aa71/feature6.txt", 4096, 9216);
    fill_vec(lin7, bias7, "/content/drive/MyDrive/HP3/AlexNet/Weights/Version_4df8aa71/feature7.txt", 4096, 4096);
    fill_vec(lin8, bias8, "/content/drive/MyDrive/HP3/AlexNet/Weights/Version_4df8aa71/feature8.txt", 1000, 4096);
    */
    
    //Weights Version_7be5be79
    
    // The trailing integers repeat the container dimensions declared above.
    fill_vec(conv1, bias1, "/content/drive/MyDrive/HP3/AlexNet/Weights/Version_7be5be79/feature1.txt", 64, 3, 11, 11);
    fill_vec(conv2, bias2, "/content/drive/MyDrive/HP3/AlexNet/Weights/Version_7be5be79/feature2.txt", 192, 64, 5, 5);
    fill_vec(conv3, bias3, "/content/drive/MyDrive/HP3/AlexNet/Weights/Version_7be5be79/feature3.txt", 384, 192, 3, 3);
    fill_vec(conv4, bias4, "/content/drive/MyDrive/HP3/AlexNet/Weights/Version_7be5be79/feature4.txt", 256, 384, 3, 3);
    fill_vec(conv5, bias5, "/content/drive/MyDrive/HP3/AlexNet/Weights/Version_7be5be79/feature5.txt", 256, 256, 3, 3);
    fill_vec(lin6, bias6, "/content/drive/MyDrive/HP3/AlexNet/Weights/Version_7be5be79/feature6.txt", 4096, 9216);
    fill_vec(lin7, bias7, "/content/drive/MyDrive/HP3/AlexNet/Weights/Version_7be5be79/feature7.txt", 4096, 4096);
    fill_vec(lin8, bias8, "/content/drive/MyDrive/HP3/AlexNet/Weights/Version_7be5be79/feature8.txt", 1000, 4096);
    
    /*for(int i=0;i<64;i++){
        for(int j=0;j<3;j++){
            for(int k=0;k<11;k++){
                for(int l=0;l<11;l++)conv1[i][j][k][l]=1.0;
            }
        }
        bias1[i]=3.0;
    }*/

    // Library handles shared by every layer below. Only cudnnCreate is
    // wrapped in an error check; the cublasCreate result is ignored.
    cudnnHandle_t cudnn;
    checkCUDNN(cudnnCreate(&cudnn));
  	cublasHandle_t cublas;
  	cublasCreate(&cublas);

    convDim_t firstLayerInputDims = setConvSpecs(224, 224, 3, BATCH_SIZE);

    // The buffer is sized from firstLayerInputDims while the copy length is
    // spelled out as 224*224*3*BATCH_SIZE; both come from the same literals.
    float *input_layer1;
    cudaMallocManaged(&input_layer1,firstLayerInputDims.Height * firstLayerInputDims.Width * firstLayerInputDims.Channels * firstLayerInputDims.Batch * sizeof(float));
    cudaMemcpy(input_layer1,img,224*224*3*BATCH_SIZE* sizeof(float),cudaMemcpyHostToDevice);
    
    // Set up clock for timing comparisons
    // (the timer starts after the weights were loaded and the input copied,
    // so neither is part of the reported duration; no rand() call is visible
    // in this function, so the srand seed appears unused here)
	  srand(time(NULL));
	  std::clock_t start;
	  double duration;
    start = std::clock();

    // Conv layer 1. The setKernelSpecs arguments are presumably (filters,
    // kernel h, kernel w, stride h, stride w, pad h, pad w, dilation h,
    // dilation w) -- TODO confirm against its definition. alpha and beta are
    // not declared in this function; they come from elsewhere in the file.
    poolDim_t poolDim1 = setPoolSpecs((bool)OVERLAP_POOLING); //setting a overlapping pool layer
    kernelDim_t layerKernel1 = setKernelSpecs(64,11,11,4,4,2,2,1,1);
    ConvLayers convlayer1(1, input_layer1, firstLayerInputDims, layerKernel1, poolDim1, alpha, beta, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, CUDNN_CROSS_CORRELATION,
                        CUDNN_ACTIVATION_RELU, CUDNN_POOLING_MAX, cudnn);
    convlayer1.getConvLayerSpecs();
    convlayer1.buildConvLayer(bias1, conv1);
    convlayer1.fwdProp();

    // cout<<"\nAfter Conv layer 1\n";
    // cout<<"Output image size "<<convlayer1.outDims.Batch<<" X "<<convlayer1.outDims.Height<<" X "<<convlayer1.outDims.Width<<" X "<<convlayer1.outDims.Channels<<endl;
        

    // Conv layer 2, again constructed with pooling parameters.
    poolDim_t poolDim2 = setPoolSpecs((bool)OVERLAP_POOLING);
    kernelDim_t layerKernel2 = setKernelSpecs(192,5,5,1,1,2,2,1,1);
    ConvLayers convlayer2(2, convlayer1.outputTensor, convlayer1.outDims, layerKernel2, poolDim2, alpha, beta, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, CUDNN_CROSS_CORRELATION,
                        CUDNN_ACTIVATION_RELU, CUDNN_POOLING_MAX, cudnn);
    convlayer2.getConvLayerSpecs();
    convlayer2.buildConvLayer(bias2, conv2);
    convlayer2.fwdProp();

    // cout<<"\nAfter Conv layer 2\n";
    // cout<<"Output image size "<<convlayer2.outDims.Batch<<" X "<<convlayer2.outDims.Height<<" X "<<convlayer2.outDims.Width<<" X "<<convlayer2.outDims.Channels<<endl;
  
  

    // Conv layers 3 and 4 use the constructor overload without a poolDim_t
    // or pooling mode argument.
    // poolDim_t poolDim2 = setPoolSpecs((bool)OVERLAP_POOLING);
    kernelDim_t layerKernel3 = setKernelSpecs(384,3,3,1,1,1,1,1,1);
    ConvLayers convlayer3(3, convlayer2.outputTensor, convlayer2.outDims, layerKernel3, alpha, beta, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, CUDNN_CROSS_CORRELATION,
                        CUDNN_ACTIVATION_RELU, cudnn);
    convlayer3.getConvLayerSpecs();
    convlayer3.buildConvLayer(bias3, conv3);
    convlayer3.fwdProp();

    // cout<<"\nAfter Conv layer 3\n";
    // cout<<"Output image size "<<convlayer3.outDims.Batch<<" X "<<convlayer3.outDims.Height<<" X "<<convlayer3.outDims.Width<<" X "<<convlayer3.outDims.Channels<<endl;
  

    // poolDim_t poolDim2 = setPoolSpecs((bool)OVERLAP_POOLING);
    kernelDim_t layerKernel4 = setKernelSpecs(256,3,3,1,1,1,1,1,1);
    ConvLayers convlayer4(4, convlayer3.outputTensor, convlayer3.outDims, layerKernel4, alpha, beta, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, CUDNN_CROSS_CORRELATION,
                        CUDNN_ACTIVATION_RELU, cudnn);
    convlayer4.getConvLayerSpecs();
    convlayer4.buildConvLayer(bias4, conv4);
    convlayer4.fwdProp();

    // cout<<"\nAfter Conv layer 4\n";
    // cout<<"Output image size "<<convlayer4.outDims.Batch<<" X "<<convlayer4.outDims.Height<<" X "<<convlayer4.outDims.Width<<" X "<<convlayer4.outDims.Channels<<endl;
  

    // Conv layer 5 is constructed with pooling parameters again.
    poolDim_t poolDim5 = setPoolSpecs((bool)OVERLAP_POOLING);
    kernelDim_t layerKernel5 = setKernelSpecs(256,3,3,1,1,1,1,1,1);
    ConvLayers convlayer5(5, convlayer4.outputTensor, convlayer4.outDims, layerKernel5, poolDim5, alpha, beta, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, CUDNN_CROSS_CORRELATION,
                        CUDNN_ACTIVATION_RELU, CUDNN_POOLING_MAX, cudnn);
    convlayer5.getConvLayerSpecs();
    convlayer5.buildConvLayer(bias5, conv5);
    convlayer5.fwdProp();

    // cout<<"\nAfter Conv layer 5\n";
    // cout<<"Output image size "<<convlayer5.outDims.Batch<<" X "<<convlayer5.outDims.Height<<" X "<<convlayer5.outDims.Width<<" X "<<convlayer5.outDims.Channels<<endl;
  
    // cout<<convlayer5.outDims.Height<<" "<<convlayer5.outDims.Width<<endl;
    //adaptive pool to (6*6)
    //int h_len = (convlayer5.outDims.Height+5)/6,  w_len =  (convlayer5.outDims.Width+5)/6;
    //int h_stride = (convlayer5.outDims.Height-h_len)/5,, w_stride = (convlayer5.outDims.Width-w_len)/5;
    //h_len = (convlayer5.outDims.Height)/6,  w_len =  (convlayer5.outDims.Width)/6;
  
    

    // The flattened conv5 output feeds the first fully connected layer.
    // lin6 has 9216 input columns, so this product is presumably expected to
    // be 9216 -- TODO confirm, no adaptive pooling is applied here.
    int fclayer1_input_dims = convlayer5.outDims.Height * convlayer5.outDims.Width * convlayer5.outDims.Channels; 
  	FCLayers fclayer1( convlayer5.outputTensor, fclayer1_input_dims, BATCH_SIZE, 4096 ,1.0, 0.0, CUDNN_ACTIVATION_RELU,CUDNN_TENSOR_NCHW,
  						CUDNN_DATA_FLOAT, cublas,cudnn);
  	fclayer1.getFCLayerSpecs();
  	fclayer1.buildFCLayer(bias6, lin6);
  	// NOTE(review): the fwdProp argument is 1 for the two hidden FC layers and
  	// 0 for the last one; presumably it switches the activation -- confirm.
  	fclayer1.fwdProp(1);


    FCLayers fclayer2(fclayer1.outputTensor, fclayer1.outDims, BATCH_SIZE, 4096 ,1.0, 0.0, CUDNN_ACTIVATION_RELU,CUDNN_TENSOR_NCHW,
						CUDNN_DATA_FLOAT, cublas,cudnn);
  	fclayer2.getFCLayerSpecs();
  	fclayer2.buildFCLayer(bias7, lin7);
  	fclayer2.fwdProp(1);


  	// dummy labels
  	// NOTE(review): cudaMemset fills bytes with an int value, so passing 1.0
  	// sets each of the four bytes to 0x01 rather than storing the float 1.0.
  	float* labels;
  	cudaMallocManaged(&labels, sizeof(float));
  	cudaMemset(labels,1.0,sizeof(float));

  	FCLayers fclayer3(fclayer2.outputTensor, fclayer2.outDims, BATCH_SIZE, 1000 ,1.0, 0.0, CUDNN_ACTIVATION_RELU, CUDNN_TENSOR_NCHW,
  						CUDNN_DATA_FLOAT, cublas,cudnn, labels);
  	fclayer3.getFCLayerSpecs();
  	fclayer3.buildFCLayer(bias8, lin8);
  	fclayer3.fwdProp(0);

    // std::clock reports processor time of this process, hence the label.
    // NOTE(review): no explicit device synchronisation happens before this
    // reading; confirm whether fwdProp blocks until the GPU work is done.
    duration = (std::clock() - start) / (double)CLOCKS_PER_SEC;
		std::cout << "CPU time: " << duration << std::endl;

  	// Host copy of the 1000 values produced by the last FC layer; the
  	// malloc result is not checked.
  	float *output_image{nullptr};
  	output_image = (float*)malloc(1000*sizeof(float));
  	cudaMemcpy(output_image, fclayer3.outputTensor, 1000*sizeof(float),cudaMemcpyDeviceToHost);
    
    //cudaDeviceSynchronize();
    // for(int i=0;i<100;i++){
    //     cout<<output_image[i]<<" ";
    // }cout<<endl;
  
    return output_image;
}

// Replace the first `size` entries of `input` by their softmax, in place.
//
// The computation runs in long double and uses the log-sum-exp form
//   softmax(x_i) = exp(x_i - (m + log(sum_j exp(x_j - m)))),  m = max_j x_j
// so large logits do not overflow.
//
//   input  array holding at least `size` floats; overwritten with the result
//   size   number of leading entries to normalise
//
// A null pointer or a non-positive size is a no-op. (The previous assert,
// `0 <= size <= sizeof(input) / sizeof(float)`, compared a bool against the
// size of a pointer and could never fail, and size == 0 then dereferenced the
// end iterator returned by max_element.)
void softmax(float* input, int size) {
	if (input == nullptr || size <= 0) {
		return;
	}

	std::vector<long double> inp(input, input + size);

	const long double m = *std::max_element(inp.begin(), inp.end());

	long double sum = 0.0L;
	for (int i = 0; i < size; i++) {
		sum += std::exp(inp[i] - m);
	}

	// sum >= 1 because the maximum element contributes exp(0).
	const long double constant = m + std::log(sum);
	for (int i = 0; i < size; ++i) {
		input[i] = (float)std::exp(inp[i] - constant);
	}
}

// Reinterpret the pixel buffer of `image` as a float array and return it.
// No data is copied: the pointer is the Mat's own `data` member, so it is
// only meaningful while a Mat sharing that buffer is alive.
// NOTE(review): presumably the Mat must hold 32-bit float samples for the
// values to make sense -- confirm at the call sites.
float * image2array(Mat image){
  return reinterpret_cast<float *>(image.data);
}

// Read an image from disk as a 3-channel float matrix scaled to [0, 1].
//
//   image_path  path of the image file
//   returns     CV_32FC3 matrix min-max normalised to the range 0..1, or an
//               empty Mat when the path is null or the file cannot be decoded
//               (a message is written to stderr in that case)
//
// On success the dimensions are printed to stdout.
Mat load_image(const char* image_path) {
  if (image_path == nullptr) {
    std::cerr << "load_image: null image path" << std::endl;
    return cv::Mat();
  }

  // cv::IMREAD_COLOR is the C++ spelling of the legacy CV_LOAD_IMAGE_COLOR flag.
  cv::Mat image = cv::imread(image_path, cv::IMREAD_COLOR);
  if (image.empty()) {
    // imread signals failure with an empty matrix instead of throwing.
    std::cerr << "load_image: could not read " << image_path << std::endl;
    return image;
  }

  image.convertTo(image, CV_32FC3);
  // NORM_MINMAX stretches the values so the smallest becomes 0 and the largest 1.
  cv::normalize(image, image, 0, 1, cv::NORM_MINMAX);
  cout << "Input Image: " << image.rows << " x " << image.cols << " x " << image.channels() << endl;
  return image;
}

// Standardise every pixel channel-wise: value = (value - mean[k]) / std[k]
// for k in [0, channels).
// The pixels are accessed as Vec3f and updated through the Mat header passed
// in, and that same header is returned.
Mat normalize_mean_std(Mat image, vector<float> mean, vector<float> std, int channels){
    const int rowCount = image.rows;
    for (int r = 0; r < image.rows && r < rowCount; ++r) {
        for (int c = 0; c < image.cols; ++c) {
            Vec3f &pixel = image.at<Vec3f>(r, c);
            int ch = 0;
            while (ch < channels) {
                pixel[ch] = (pixel[ch] - mean[ch]) / std[ch];
                ++ch;
            }
        }
    }
    return image;
}

// Copy the image into a newly malloc'ed float array in planar order:
// all values of channel 0 (row by row), then channel 1, and so on.
// Pixels are read as Vec3f. The caller owns the returned buffer.
float* ret_image(Mat image){
    const int row = image.rows;
    const int col = image.cols;
    const int chn = image.channels();
    float *fin = (float *)malloc(row*col*chn*sizeof(float));
    float *cursor = fin;
    for (int plane = 0; plane < chn; ++plane) {
        for (int r = 0; r < row; ++r) {
            for (int c = 0; c < col; ++c) {
                *cursor = image.at<Vec3f>(r, c)[plane];
                ++cursor;
            }
        }
    }
    return fin;
} 

// Copy the image into a newly malloc'ed float array in interleaved order:
// for each pixel (row by row) its channel values follow one another.
// Pixels are read as Vec3f. The caller owns the returned buffer.
float* ret_image_normal(Mat image){
    const int row = image.rows;
    const int col = image.cols;
    const int chn = image.channels();
    float *fin = (float *)malloc(row*col*chn*sizeof(float));
    float *cursor = fin;
    for (int r = 0; r < row; ++r) {
        for (int c = 0; c < col; ++c) {
            int plane = 0;
            while (plane < chn) {
                *cursor = image.at<Vec3f>(r, c)[plane];
                ++cursor;
                ++plane;
            }
        }
    }
    return fin;
} 

// Apply the preprocessing of the referenced PyTorch AlexNet example:
// resize to 256x256, centre-crop to 224x224, then standardise each channel
// with the mean/std constants listed below.
//
//   image    input picture; it is passed on to normalize_mean_std, which
//            accesses pixels as Vec3f
//   returns  a new 224x224 matrix; the input is not modified
//
// The crop is taken from the resized picture. (Previously the ROI was applied
// to the original `image`, which threw the resize away and cut a 224x224
// window at offset (16, 16) out of the full-resolution input.)
//
// NOTE(review): cv::imread returns channels in B, G, R order, while the
// constants below are listed in the R, G, B order of the PyTorch reference --
// confirm whether a colour conversion is needed before normalising.
Mat transformImage(Mat image){
    //Reference - https://colab.research.google.com/github/pytorch/pytorch.github.io/blob/master/assets/hub/pytorch_vision_alexnet.ipynb

    //Resize to 256
    Mat resized;
    cv::resize(image, resized, Size(256, 256), 0, 0, cv::INTER_AREA);

    //CenterCrop to 224
    const int cropSize = 224;
    const int offsetW = (resized.cols - cropSize) / 2;
    const int offsetH = (resized.rows - cropSize) / 2;
    const Rect roi(offsetW, offsetH, cropSize, cropSize);
    // clone() detaches the crop from `resized` so the in-place normalisation
    // below works on its own contiguous buffer.
    Mat red_image = resized(roi).clone();

    //Normalize using mean and std
    vector<float> mean{0.485, 0.456, 0.406}, std{0.229, 0.224, 0.225};
    red_image = normalize_mean_std(red_image, mean, std, 3);

    return red_image;
}

// Entry point of alexnet.cu: classify one picture and print the best classes.
//
// 1. Loads and transforms dog.jpg with OpenCV; only its dimensions are
//    printed, the network input itself is NOT taken from this matrix.
// 2. Reads the network input (3*224*224 floats, one per line) from
//    inp_dog_tr.txt, the file written by the PyTorch cell of this notebook.
// 3. Runs processImgAlexnet, applies softmax to all 1000 scores and prints
//    the ten most probable classes.
//
// Returns 0 on success and 1 when the input file is missing/too short or the
// network returned no output.
//
// Softmax now covers all 1000 scores. Before, only the first 100 were
// normalised while the ranking sorted all 1000, so probabilities were
// compared against raw scores.
int main(){
    const int kInputCount = 3 * 224 * 224; // floats copied by processImgAlexnet
    const int kNumClasses = 1000;          // floats returned by processImgAlexnet
    const int kPreview    = 100;           // values shown in the debug dumps
    const int kTopK       = 10;            // classes listed at the end

    Mat image = load_image("/content/drive/MyDrive/HP3/AlexNet/Images/dog.jpg");
    cout<<"\nInput Image Details"<<endl;
    cout<<"H = "<<image.rows<<" ";
    cout<<"W = "<<image.cols<<" ";
    cout<<"C = "<<image.channels()<<'\n';

    image = transformImage(image);
    cout<<"\nInput Image Details after transformation"<<endl;
    cout<<"H = "<<image.rows<<" ";
    cout<<"W = "<<image.cols<<" ";
    cout<<"C = "<<image.channels()<<'\n';

    // The OpenCV path (ret_image / ret_image_normal on `image`) is not used
    // as network input; the tensor exported from PyTorch is read instead so
    // both implementations see identical values.
    const string inputPath = "/content/drive/MyDrive/HP3/AlexNet/Images/inp_dog_tr.txt";
    vector<float> inputImage(kInputCount, 0.0f);
    ifstream myfile(inputPath);
    if(!myfile.is_open()){
        cerr<<"Cannot open input tensor file "<<inputPath<<endl;
        return 1;
    }
    string line;
    int itt=0;
    // Stop at the buffer size: a longer file must not write past the end.
    while (itt < kInputCount && getline(myfile, line)){
        if(line.empty())
            continue;
        inputImage[itt++] = stof(line);
    }
    myfile.close();
    if(itt != kInputCount){
        cerr<<"Expected "<<kInputCount<<" input values in "<<inputPath<<" but read "<<itt<<endl;
        return 1;
    }

    cout<<"\nInput Image, from file (100):"<<endl;
     for(int i=0;i<kPreview;i++){
        cout<<inputImage[i]<<" ";
    }cout<<"\n\n";

    float *output_image = processImgAlexnet(inputImage.data());
    if(output_image == nullptr){
        cerr<<"processImgAlexnet returned no output"<<endl;
        return 1;
    }

    cout<<"\nFinal Output (100):"<<endl;
    for(int i=0;i<kPreview;i++){
        cout<<output_image[i]<<" ";
    }cout<<endl;

    // Normalise over every class so the values below are real probabilities.
    softmax(output_image,kNumClasses);

    cout<<"\nFinal Output after applying softmax (100):"<<endl;
    for(int i=0;i<kPreview;i++){
        cout<<output_image[i]<<" ";
    }cout<<endl;

    // Sorting (-probability, index) ascending yields decreasing probability,
    // with ties resolved by the smaller class index.
    vector<pair<float,int>> fin_v(kNumClasses);
    for(int i=0;i<kNumClasses;i++)
      fin_v[i]={-output_image[i],i};
    sort(fin_v.begin(),fin_v.end());

    vector<string> classes;
    fstream newfile;
    newfile.open("/content/drive/MyDrive/HP3/AlexNet/imagenet_classes.txt",ios::in);
    string tp;
    while(getline(newfile, tp))
        classes.push_back(tp);
    newfile.close();

    cout<<"\nPrinting classes in decreasing order of probabilities (10)\n";
    for(int i=0;i<kTopK;i++){
      const int id = fin_v[i].second;
      // The label file may be missing or shorter than the class count.
      const string name = (static_cast<size_t>(id) < classes.size()) ? classes[id] : string("<unknown>");
      cout<<"Class = "<<name<<", ID = "<<id<<", Probability = "<<-fin_v[i].first<<'\n';
    }

    // processImgAlexnet allocates its result with malloc.
    free(output_image);

    cout<<"\nFinni"<<endl;
    return 0;
}


'File written in /content/src/alexnet.cu'

In [None]:
!nvcc /content/src/alexnet.cu `pkg-config --cflags --libs opencv` -lcudnn -lcublas -lopencv_imgcodecs -lopencv_imgproc -lopencv_core -pg -std=c++11 -o /content/src/alexnet

In [None]:
!/content/src/alexnet

Input Image: 1213 x 1546 x 3

Input Image Details
H = 1213 W = 1546 C = 3

Input Image Details after transformation
H = 224 W = 224 C = 3

Input Image, from file (100):
-1.98091 -1.91241 -1.86103 -1.77541 -1.77541 -1.86103 -2.03228 -2.0494 -1.92953 -1.89528 -1.96378 -1.99803 -2.0494 -2.01516 -1.98091 -2.0494 -2.0494 -2.06653 -1.99803 -1.94666 -1.89528 -1.99803 -1.98091 -1.94666 -1.92953 -1.91241 -1.91241 -1.96378 -1.98091 -1.98091 -1.99803 -1.96378 -2.01516 -2.01516 -1.98091 -1.96378 -1.96378 -1.92953 -1.86103 -1.82678 -1.94666 -1.94666 -1.94666 -1.51854 -0.525302 0.536433 1.37555 1.22142 0.998801 1.05018 1.05018 1.17005 1.56392 1.94066 -0.165682 -1.67266 -1.84391 -1.87816 -1.86103 -1.86103 -1.80966 -1.84391 -1.96378 -1.99803 -2.03228 -1.98091 -1.98091 -1.87816 -1.67266 -1.62129 -1.60416 -1.74116 -1.77541 -1.84391 -1.86103 -1.79253 -1.77541 -1.70691 -1.65554 -1.67266 -1.68979 -1.62129 -1.72403 -1.65554 -1.60416 -1.74116 -1.80966 -1.92953 -1.96378 -1.77541 -1.60416 -1.65554 -1.56991 -1.

In [None]:
import torch
import torch.nn as nn
# from .utils import load_state_dict_from_url
try:
    from torch.hub import load_state_dict_from_url
except ImportError:
    from torch.utils.model_zoo import load_url as load_state_dict_from_url

from typing import Any


# Names exported when this code is pulled in with `from ... import *`.
__all__ = ['AlexNet', 'alexnet']


# Download location of the pretrained weights; alexnet() looks it up under
# the key 'alexnet' when called with pretrained=True.
model_urls = {
    'alexnet': 'https://download.pytorch.org/models/alexnet-owt-7be5be79.pth',
}


class AlexNet(nn.Module):
    """AlexNet classifier instrumented to dump intermediate activations.

    The network consists of a convolutional feature extractor
    (``features``), an adaptive average pool to 6x6 (``avgpool``) and a
    three-layer fully connected head (``classifier``).

    Args:
        num_classes: number of outputs of the last linear layer.
    """

    def __init__(self, num_classes: int = 1000) -> None:
        super().__init__()
        # Layers are created in execution order; their positions inside each
        # Sequential determine the state-dict keys (features.0, features.3, ...).
        feature_layers = [
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        ]
        self.features = nn.Sequential(*feature_layers)
        self.avgpool = nn.AdaptiveAvgPool2d((6, 6))
        head_layers = [
            nn.Dropout(),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the network, printing each feature layer and its input.

        Before every feature layer the shape and contents of its input and
        the layer itself are printed; the output of the last feature layer is
        printed as well. Returns the classifier output for ``x``.
        """
        print("idhar aao bhaiya")
        # modules() also yields the Sequential container itself; skip it so
        # that only the individual layers are applied.
        stages = (
            stage
            for stage in self.features.modules()
            if not isinstance(stage, nn.Sequential)
        )
        for stage in stages:
            print(x.shape)
            print(x)
            print(stage)
            x = stage(x)

        print(x)
        pooled = self.avgpool(x)
        # Keep the batch dimension, flatten everything else.
        flat = torch.flatten(pooled, 1)
        return self.classifier(flat)


def alexnet(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> AlexNet:
    r"""Build an AlexNet, optionally initialised with pretrained weights.

    Architecture from the
    `"One weird trick..." <https://arxiv.org/abs/1404.5997>`_ paper.

    Args:
        pretrained (bool): If True, load the weights referenced by
            ``model_urls['alexnet']`` into the model
        progress (bool): forwarded to ``load_state_dict_from_url``; If True,
            displays a progress bar of the download to stderr
        **kwargs: forwarded unchanged to the ``AlexNet`` constructor
    """
    net = AlexNet(**kwargs)
    if not pretrained:
        return net

    weights = load_state_dict_from_url(model_urls['alexnet'], progress=progress)
    net.load_state_dict(weights)
    return net

# Build the network and load the pretrained weights referenced in model_urls.
model = alexnet(pretrained=True)

# Inference mode, so the Dropout layers of the classifier act as identity.
model.eval()
print(model)


In [None]:
# sample execution (requires torchvision)
# Preprocess dog.jpg exactly like the PyTorch reference, export the resulting
# tensor for the CUDA program (one value per line) and run the model on it.
from PIL import Image
from torchvision import transforms
filename = "/content/drive/MyDrive/HP3/AlexNet/Images/dog.jpg"
# Force three RGB channels: Normalize below and the CUDA reader both assume a
# 3 x 224 x 224 tensor, which a grayscale or RGBA file would not produce.
input_image = Image.open(filename).convert("RGB")
preprocess = transforms.Compose([
    transforms.Resize([256,256]),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
input_tensor = preprocess(input_image)
print(input_tensor.shape)
# print(input_tensor.data)
new_t = torch.flatten(input_tensor)
print(new_t.shape)
# tolist() converts the whole tensor in one call instead of one .item() call
# per element (150528 of them); the resulting Python floats are the same.
new_list = new_t.tolist()
print(new_list[:5])
print(new_t)
# Flattening a (C, H, W) tensor gives channel-major order; this file is what
# alexnet.cu reads back as its network input.
with open('/content/drive/MyDrive/HP3/AlexNet/Images/inp_dog_tr.txt', 'w') as f:
    f.writelines("%s\n" % item for item in new_list)

input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model

# move the input and model to GPU for speed if available
if torch.cuda.is_available():
    input_batch = input_batch.to('cuda')
    model.to('cuda')

print(input_batch.shape)
print(input_batch[0])
with torch.no_grad():
    output = model(input_batch)
# Tensor of shape 1000, with confidence scores over Imagenet's 1000 classes
# print(output[0])
# The output has unnormalized scores. To get probabilities, you can run a softmax on it.
probabilities = torch.nn.functional.softmax(output[0], dim=0)
# print(probabilities)

In [None]:
# Below is the testing section for a single convolution layer

In [None]:
%%cuda --name test.cu
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>

#include <cudnn.h>
#include <cudnn.h>
#include <cublas_v2.h>

// #include <helper_cuda.h>
#include <curand.h>

namespace cudl
{
#define BLOCK_DIM_1D    512
#define BLOCK_DIM       16

/* DEBUG FLAGS */
#define DEBUG_FORWARD   0
#define DEBUG_BACKWARD  0

#define DEBUG_CONV      0
#define DEBUG_DENSE     0
#define DEBUG_SOFTMAX   0
#define DEBUG_UPDATE    0

#define DEBUG_LOSS      0
#define DEBUG_ACCURACY  0

/* CUDA API error return checker */
#ifndef checkCudaErrors
#define checkCudaErrors(err)                                                                        \
    {                                                                                               \
        if (err != cudaSuccess)                                                                     \
        {                                                                                           \
            fprintf(stderr, "checkCudaErrors() API error = %04d \"%s\" from file <%s>, line %i.\n", \
                    err, cudaGetErrorString(err), __FILE__, __LINE__);                              \
			fprintf(stderr, "%d\n", cudaSuccess);													\
            exit(-1);                                                                               \
        }                                                                                           \
    }
#endif

static const char *_cublasGetErrorEnum(cublasStatus_t error) {
  switch (error) {
    case CUBLAS_STATUS_SUCCESS:
      return "CUBLAS_STATUS_SUCCESS";

    case CUBLAS_STATUS_NOT_INITIALIZED:
      return "CUBLAS_STATUS_NOT_INITIALIZED";

    case CUBLAS_STATUS_ALLOC_FAILED:
      return "CUBLAS_STATUS_ALLOC_FAILED";

    case CUBLAS_STATUS_INVALID_VALUE:
      return "CUBLAS_STATUS_INVALID_VALUE";

    case CUBLAS_STATUS_ARCH_MISMATCH:
      return "CUBLAS_STATUS_ARCH_MISMATCH";

    case CUBLAS_STATUS_MAPPING_ERROR:
      return "CUBLAS_STATUS_MAPPING_ERROR";

    case CUBLAS_STATUS_EXECUTION_FAILED:
      return "CUBLAS_STATUS_EXECUTION_FAILED";

    case CUBLAS_STATUS_INTERNAL_ERROR:
      return "CUBLAS_STATUS_INTERNAL_ERROR";

    case CUBLAS_STATUS_NOT_SUPPORTED:
      return "CUBLAS_STATUS_NOT_SUPPORTED";

    case CUBLAS_STATUS_LICENSE_ERROR:
      return "CUBLAS_STATUS_LICENSE_ERROR";
  }

  return "<unknown>";
}

#define checkCublasErrors(err)                                                                        \
    {                                                                                                 \
        if (err != CUBLAS_STATUS_SUCCESS)                                                             \
        {                                                                                             \
            fprintf(stderr, "checkCublasErrors() API error = %04d \"%s\" from file <%s>, line %i.\n", \
                    err, _cublasGetErrorEnum(err), __FILE__, __LINE__);                                 \
            exit(-1);                                                                                 \
        }                                                                                             \
    }

#define checkCudnnErrors(err)                                                                        \
    {                                                                                                \
        if (err != CUDNN_STATUS_SUCCESS)                                                             \
        {                                                                                            \
            fprintf(stderr, "checkCudnnErrors() API error = %04d \"%s\" from file <%s>, line %i.\n", \
                    err, cudnnGetErrorString(err), __FILE__, __LINE__);                              \
            exit(-1);                                                                                \
        }                                                                                            \
    }

// cuRAND API errors
static const char *_curandGetErrorEnum(curandStatus_t error) {
    switch (error) {
    case CURAND_STATUS_SUCCESS:
        return "CURAND_STATUS_SUCCESS";

    case CURAND_STATUS_VERSION_MISMATCH:
        return "CURAND_STATUS_VERSION_MISMATCH";

    case CURAND_STATUS_NOT_INITIALIZED:
        return "CURAND_STATUS_NOT_INITIALIZED";

    case CURAND_STATUS_ALLOCATION_FAILED:
        return "CURAND_STATUS_ALLOCATION_FAILED";

    case CURAND_STATUS_TYPE_ERROR:
        return "CURAND_STATUS_TYPE_ERROR";

    case CURAND_STATUS_OUT_OF_RANGE:
        return "CURAND_STATUS_OUT_OF_RANGE";

    case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
        return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";

    case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
        return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";

    case CURAND_STATUS_LAUNCH_FAILURE:
        return "CURAND_STATUS_LAUNCH_FAILURE";

    case CURAND_STATUS_PREEXISTING_FAILURE:
        return "CURAND_STATUS_PREEXISTING_FAILURE";

    case CURAND_STATUS_INITIALIZATION_FAILED:
        return "CURAND_STATUS_INITIALIZATION_FAILED";

    case CURAND_STATUS_ARCH_MISMATCH:
        return "CURAND_STATUS_ARCH_MISMATCH";

    case CURAND_STATUS_INTERNAL_ERROR:
        return "CURAND_STATUS_INTERNAL_ERROR";
    }

    return "<unknown>";
}


#define checkCurandErrors(err)                                                                        \
    {                                                                                                \
        if (err != CURAND_STATUS_SUCCESS)                                                             \
        {                                                                                            \
            fprintf(stderr, "checkCurandErrors() API error = %04d \"%s\" from file <%s>, line %i.\n", \
                    err, _curandGetErrorEnum(err), __FILE__, __LINE__);                              \
            exit(-1);                                                                                \
        }                                                                                            \
    }

// container for cuda resources
class CudaContext
{
    public:
    CudaContext()
    {
        cublasCreate(&_cublas_handle);
        checkCudaErrors(cudaGetLastError());
        checkCudnnErrors(cudnnCreate(&_cudnn_handle));
    }
    ~CudaContext()
    {
        cublasDestroy(_cublas_handle);
        checkCudnnErrors(cudnnDestroy(_cudnn_handle));
    }

    cublasHandle_t cublas() { 
        //std::cout << "Get cublas request" << std::endl; getchar();
        return _cublas_handle; };
    cudnnHandle_t cudnn() { return _cudnn_handle; };

    const float one       =  1.f;
    const float zero      =  0.f;
    const float minus_one = -1.f;

    private:
    cublasHandle_t _cublas_handle;
    cudnnHandle_t  _cudnn_handle;
};

} // namespace cudl





int main()
{
    cudnnHandle_t cudnn;
    cudnnTensorDescriptor_t input_desc;
    cudnnTensorDescriptor_t output_desc;
    cudnnFilterDescriptor_t filter_desc;
    cudnnConvolutionDescriptor_t conv_desc;
    cudnnTensorDescriptor_t bias_desc;

    cudnnConvolutionFwdAlgo_t falgo;
    cudnnConvolutionBwdFilterAlgo_t b_falgo;
    cudnnConvolutionBwdDataAlgo_t b_dalgo;

    float *d_input = nullptr;
    float *d_output = nullptr;
    float *d_filter = nullptr;
    float *d_bias = nullptr;

    int input_n = 1;
    int input_c = 3;
    int input_h = 224;
    int input_w = 224;

    // output size
    int output_n = input_n;
    int output_c = 64;
    int output_h = 1;
    int output_w = 1;

    // kernel size
    int filter_h = 11;
    int filter_w = 11;

    // alpha, beta
    float one = 1.f;
    float zero = 0.f;

    std::cout << "[" <<  __LINE__ << "]" << std::endl;

    cudnnCreate(&cudnn);

    std::cout << "[" <<  __LINE__ << "]" << std::endl;

    /* Create Resources */
    cudnnCreateTensorDescriptor(&input_desc);
    cudnnCreateTensorDescriptor(&output_desc);
    cudnnCreateFilterDescriptor(&filter_desc);
    cudnnCreateConvolutionDescriptor(&conv_desc);
    cudnnCreateTensorDescriptor(&bias_desc);

    std::cout << "[" <<  __LINE__ << "]" << std::endl;

    // Initilziae resources
    cudnnSetTensor4dDescriptor(input_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, input_n, input_c, input_h, input_w);
    cudnnSetFilter4dDescriptor(filter_desc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, output_c, input_c, filter_h, filter_w);
    cudnnSetConvolution2dDescriptor(conv_desc,
                                    2, 2,
                                    4, 4,
                                    1, 1,
                                    CUDNN_CROSS_CORRELATION,
                                    CUDNN_DATA_FLOAT);
    cudnnGetConvolution2dForwardOutputDim(conv_desc, input_desc, filter_desc, &output_n, &output_c, &output_h, &output_w);
    cudnnSetTensor4dDescriptor(output_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, output_n, output_c, output_h, output_w);
    cudnnSetTensor4dDescriptor(bias_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, output_c, 1, 1);

    int weight_size = output_c * input_c * filter_h * filter_w;
    int bias_size = output_c;

    std::cout << "input  size: " << input_n << " " << input_c << " " << input_h << " " << input_w << std::endl;
    std::cout << "output size: " << output_n << " " << output_c << " " << output_h << " " << output_w << std::endl;

    std::cout << "[" <<  __LINE__ << "]" << std::endl;

    // convolution
    size_t workspace_size = 0;
    size_t temp_size = 0;
    float *d_workspace = nullptr;
    cudnnGetConvolutionForwardAlgorithm(cudnn, input_desc, filter_desc, conv_desc, output_desc, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &falgo);
    cudnnGetConvolutionForwardWorkspaceSize(cudnn, input_desc, filter_desc, conv_desc, output_desc, falgo, &temp_size);
    workspace_size = max(workspace_size, temp_size);
    std::cout << "workspace size: " << workspace_size << std::endl;
 
    // convolution (bwd - filter)
    cudnnGetConvolutionBackwardFilterAlgorithm(cudnn, input_desc, output_desc, conv_desc, filter_desc, CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, 0, &b_falgo);
    cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnn, input_desc, output_desc, conv_desc, filter_desc, b_falgo, &temp_size);
    workspace_size = max(workspace_size, temp_size);

    // convolution (bwd - data)
    cudnnGetConvolutionBackwardDataAlgorithm(cudnn, filter_desc, output_desc, conv_desc, input_desc, CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST, 0, &b_dalgo);
    cudnnGetConvolutionBackwardDataWorkspaceSize(cudnn, filter_desc, output_desc, conv_desc, input_desc, b_dalgo, &temp_size);
    workspace_size = max(workspace_size, temp_size);

    std::cout << "workspace size: " << workspace_size << std::endl;
    std::cout << "[" <<  __LINE__ << "]" << std::endl;

    // allocate memory space
    float arr1[input_n * input_c * input_h * input_w], arr2[weight_size], arr3[bias_size], arr4[output_n * output_c * output_h * output_w];
    for(int i=0;i<input_n * input_c * input_h * input_w;i++)arr1[i]=1.0;
    for(int i=0;i<weight_size;i++)arr2[i]=1.0;
    for(int i=0;i<bias_size;i++)arr3[i]=1.0;
    
    cudaMalloc((void**)&d_input,        sizeof(float) * input_n * input_c * input_h * input_w);
    cudaMalloc((void**)&d_filter,       sizeof(float) * weight_size);
    cudaMalloc((void**)&d_output,       sizeof(float) * output_n * output_c * output_h * output_w);
    cudaMalloc((void**)&d_workspace,    sizeof(float) * workspace_size);
    cudaMalloc((void**)&d_bias,         sizeof(float) * bias_size);
    cudaMemcpy(d_input, arr1, sizeof(float) * input_n * input_c * input_h * input_w, cudaMemcpyHostToDevice);
    cudaMemcpy(d_filter, arr2, sizeof(float) * weight_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_bias, arr3, sizeof(float) * bias_size, cudaMemcpyHostToDevice);
 
    std::cout << "[" <<  __LINE__ << "]" << std::endl;

    // Forward
    checkCudnnErrors(cudnnConvolutionForward(cudnn, &one, input_desc, d_input, filter_desc, d_filter, conv_desc, falgo, d_workspace, workspace_size, &zero, output_desc, d_output));
    //cudaMemcpy(arr4, d_output, sizeof(float) * output_n * output_c * output_h * output_w, cudaMemcpyDeviceToHost);
    
    checkCudnnErrors(cudnnAddTensor(cudnn, &one, bias_desc, d_bias, &one, output_desc, d_output));
    checkCudaErrors(cudaGetLastError());
    cudaMemcpy(arr4, d_output, sizeof(float) * output_n * output_c * output_h * output_w, cudaMemcpyDeviceToHost);
    for(int i=0;i<output_n * output_c * output_h * output_w;i++){
        std::cout << arr4[i]<< " ";
    }

    
    std::cout << "[" <<  __LINE__ << "]" << std::endl;

    // backward
    //checkCudnnErrors(cudnnConvolutionBackwardBias(cudnn, &one, output_desc, d_output, &zero, bias_desc, d_bias));
    //checkCudnnErrors(cudnnConvolutionBackwardFilter(cudnn, &one, input_desc, d_input, output_desc, d_output, conv_desc, b_falgo, d_workspace, workspace_size, &zero, filter_desc, d_filter));
    //checkCudnnErrors(cudnnConvolutionBackwardData(cudnn, &one, filter_desc, d_filter, output_desc, d_output, conv_desc, b_dalgo, d_workspace, workspace_size, &zero, input_desc, d_input));
    //checkCudaErrors(cudaGetLastError());
    
    std::cout << "[" <<  __LINE__ << "]" << std::endl;

    cudnnDestroyTensorDescriptor(input_desc);
    cudnnDestroyTensorDescriptor(output_desc);
    cudnnDestroyFilterDescriptor(filter_desc);
    cudnnDestroyConvolutionDescriptor(conv_desc);
    cudnnDestroyTensorDescriptor(bias_desc);

    std::cout << "[" <<  __LINE__ << "]" << std::endl;

    cudaFree(d_input);    
    cudaFree(d_filter);
    cudaFree(d_output);
    cudaFree(d_workspace);
    cudaFree(d_bias);

    cudnnDestroy(cudnn);

    std::cout << "[" <<  __LINE__ << "]" << std::endl;
}


'File written in /content/src/test.cu'

In [None]:
!nvcc /content/src/test.cu `pkg-config --cflags --libs opencv` -lcudnn -lcublas -lopencv_imgcodecs -lopencv_imgproc -lopencv_core -pg -std=c++11 -o /content/src/test





In [None]:
!/content/src/test

[227]
[231]
[240]
input  size: 1 3 224 224
output size: 1 64 55 55
[261]
workspace size: 0
workspace size: 0
[283]
[300]
244 298 298 298 298 298 298 298 298 298 298 298 298 298 298 298 298 298 298 298 298 298 298 298 298 298 298 298 298 298 298 298 298 298 298 298 298 298 298 298 298 298 298 298 298 298 298 298 298 298 298 298 298 298 271 298 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 331 298 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 331 298 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 364 331

In [None]:
# Reference for the cuDNN test above: the same first-layer convolution with
# the input, all weights and all biases set to 1, so both outputs can be
# compared value by value.
import torch
import torch.nn as nn

model = nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2)
# nn.init.ones_ fills a parameter with 1.0 in place without recording the
# operation for autograd.
for parameter in (model.weight, model.bias):
    nn.init.ones_(parameter)

x = torch.ones(1, 3, 224, 224)
print(model(x))

tensor([[[[244., 298., 298.,  ..., 298., 298., 271.],
          [298., 364., 364.,  ..., 364., 364., 331.],
          [298., 364., 364.,  ..., 364., 364., 331.],
          ...,
          [298., 364., 364.,  ..., 364., 364., 331.],
          [298., 364., 364.,  ..., 364., 364., 331.],
          [271., 331., 331.,  ..., 331., 331., 301.]],

         [[244., 298., 298.,  ..., 298., 298., 271.],
          [298., 364., 364.,  ..., 364., 364., 331.],
          [298., 364., 364.,  ..., 364., 364., 331.],
          ...,
          [298., 364., 364.,  ..., 364., 364., 331.],
          [298., 364., 364.,  ..., 364., 364., 331.],
          [271., 331., 331.,  ..., 331., 331., 301.]],

         [[244., 298., 298.,  ..., 298., 298., 271.],
          [298., 364., 364.,  ..., 364., 364., 331.],
          [298., 364., 364.,  ..., 364., 364., 331.],
          ...,
          [298., 364., 364.,  ..., 364., 364., 331.],
          [298., 364., 364.,  ..., 364., 364., 331.],
          [271., 331., 331.,  ...