diff --git a/src/seq_mv/csr_matop_device.c b/src/seq_mv/csr_matop_device.c index a3ea4f15f..03c0b02d7 100644 --- a/src/seq_mv/csr_matop_device.c +++ b/src/seq_mv/csr_matop_device.c @@ -2001,166 +2001,6 @@ hypre_CSRMatrixPermuteDevice( hypre_CSRMatrix *A, #endif /* HYPRE_USING_CUDA || defined(HYPRE_USING_HIP) */ -#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) - -__global__ void -hypreGPUKernel_CSRMatrixComputePermutedRowPtrs( hypre_DeviceItem &item, - HYPRE_Int n, - HYPRE_Int *perm, - HYPRE_Int *row_ptrs_in, - HYPRE_Int *row_ptrs_out) -{ - HYPRE_Int i = hypre_gpu_get_grid_thread_id<1, 1>(item); - if (i < n) - { - HYPRE_Int p = perm[i]; - HYPRE_Int r0 = row_ptrs_in[p]; - HYPRE_Int r1 = row_ptrs_in[p+1]; - row_ptrs_out[i] = r1 - r0; - } -} - -HYPRE_Int -hypreDevice_CSRMatrixComputePermutedRowPtrs(HYPRE_Int n, - HYPRE_Int *perm, - HYPRE_Int *row_ptrs_in, - HYPRE_Int *row_ptrs_out) -{ - /* trivial case */ - if (n <= 0) - { - return hypre_error_flag; - } - - dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); - dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim); - - HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRMatrixComputePermutedRowPtrs, gDim, bDim, n, - perm, row_ptrs_in, row_ptrs_out); - - return hypre_error_flag; -} - - - -__global__ void -hypreGPUKernel_CSRMatrixPermuteColsVals( hypre_DeviceItem &item, - HYPRE_Int num_rows, HYPRE_Int nnz, HYPRE_Int threads_per_row, - HYPRE_Int * perm, HYPRE_Int * rqperm, - HYPRE_Int * src_i, HYPRE_Int * src_j, HYPRE_Complex * src_data, - HYPRE_Int * dst_i, HYPRE_Int * dst_j, HYPRE_Complex * dst_data) -{ - HYPRE_Int tidx = hypre_gpu_get_thread_id<1>(item); - HYPRE_Int tid = tidx%threads_per_row; - HYPRE_Int rowSmall = tidx/threads_per_row; - HYPRE_Int row = (blockDim.x/threads_per_row)*blockIdx.x + rowSmall; - - HYPRE_Int src_r0=0, src_r1=0, dst_r0=0, dst_r1=0; - if (row> 1; - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - v++; - return v; -} - -HYPRE_Int -hypreDevice_CSRMatrixPermuteColsVals(HYPRE_Int nrows, HYPRE_Int nnz, - HYPRE_Int * perm, HYPRE_Int * rqperm, - HYPRE_Int * src_i, HYPRE_Int * src_j, HYPRE_Complex * src_data, - HYPRE_Int * dst_i, HYPRE_Int * dst_j, HYPRE_Complex * dst_data) -{ - HYPRE_Int num_threads=128; - HYPRE_Int threads_per_row = (nnz + nrows - 1)/nrows; - threads_per_row = std::min(nextPowerOfTwo(threads_per_row), HYPRE_WARP_SIZE); - HYPRE_Int num_rows_per_block = num_threads/threads_per_row; - HYPRE_Int num_blocks = (nrows + num_rows_per_block - 1)/num_rows_per_block; - - /* compute the location of the diagonal in each row */ - const dim3 bDim(num_threads,1,1); - const dim3 gDim(num_blocks,1,1); - - HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRMatrixPermuteColsVals, gDim, bDim, nrows, nnz, threads_per_row, - perm, rqperm, src_i, src_j, src_data, dst_i, dst_j, dst_data); - return hypre_error_flag; -} - -HYPRE_Int -hypre_CSRMatrixApplyRowColPermutation( hypre_CSRMatrix *A, - HYPRE_Int *perm, - HYPRE_Int *rqperm, - hypre_CSRMatrix **B ) -{ - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int n = hypre_CSRMatrixNumRows(A); - HYPRE_Int nnz = hypre_CSRMatrixNumNonzeros(A); - -#if defined(HYPRE_USING_CUDA) - cudaEvent_t start, stop; - float time; - cudaEventCreate(&start); - cudaEventCreate(&stop); - cudaEventRecord(start, 0); -#endif - - /* No schur complement makes everything easy :) */ - *B = hypre_CSRMatrixCreate(n, n, nnz); - hypre_CSRMatrixInitialize(*B); - HYPRE_Int *B_i = hypre_CSRMatrixI(*B); - HYPRE_Int *B_j = hypre_CSRMatrixJ(*B); - HYPRE_Complex *B_data = hypre_CSRMatrixData(*B); - - /* compute permuted row ptrs */ - hypreDevice_CSRMatrixComputePermutedRowPtrs(n, perm, A_i, B_i); - hypreDevice_IntegerExclusiveScan(n+1, B_i); - - /* permute the cols/values */ - hypreDevice_CSRMatrixPermuteColsVals(n, nnz, perm, rqperm, A_i, A_j, A_data, B_i, B_j, B_data); - -#if defined(HYPRE_USING_CUDA) - cudaEventRecord( stop, 0 ); - cudaEventSynchronize( stop ); - cudaEventElapsedTime( &time, start, stop ); - //printf("%s %s %d : time=%1.5g\n",__FILE__,__FUNCTION__,__LINE__,time/1000.); - cudaEventDestroy( start ); - cudaEventDestroy( stop ); -#endif - return hypre_error_flag; -} - -#endif /* HYPRE_USING_CUDA || defined(HYPRE_USING_HIP) */ - #if defined(HYPRE_USING_GPU) HYPRE_Int @@ -2168,8 +2008,6 @@ hypre_CSRMatrixTransposeDevice(hypre_CSRMatrix *A, hypre_CSRMatrix **AT_ptr, HYPRE_Int data) { - hypre_GpuProfilingPushRange("CSRMatrixTranspose"); - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); HYPRE_Int *A_i = hypre_CSRMatrixI(A); HYPRE_Int *A_j = hypre_CSRMatrixJ(A); @@ -2181,6 +2019,8 @@ hypre_CSRMatrixTransposeDevice(hypre_CSRMatrix *A, HYPRE_Int *C_j; hypre_CSRMatrix *C; + HYPRE_ANNOTATE_FUNC_BEGIN; + hypre_GpuProfilingPushRange("CSRMatrixTranspose"); /* trivial case */ if (nnz_A == 0) @@ -2222,6 +2062,7 @@ hypre_CSRMatrixTransposeDevice(hypre_CSRMatrix *A, hypre_SyncComputeStream(hypre_handle()); hypre_GpuProfilingPopRange(); + HYPRE_ANNOTATE_FUNC_END; return hypre_error_flag; } @@ -2248,14 +2089,21 @@ hypre_CSRMatrixSortRow(hypre_CSRMatrix *A) } #if defined(HYPRE_USING_CUSPARSE) -/* @brief This functions sorts values and column indices in each row in ascending order INPLACE - * @param[in] n Number of rows - * @param[in] m Number of columns - * @param[in] nnzA Number of nonzeroes - * @param[in] *d_ia (Unsorted) Row indices - * @param[in,out] *d_ja_sorted On Start: Unsorted column indices. On return: Sorted column indices - * @param[in,out] *d_a_sorted On Start: Unsorted values. On Return: Sorted values corresponding with column indices - */ + +/*-------------------------------------------------------------------------- + * hypre_SortCSRCusparse + * + * Sorts values and column indices in each row in ascending order INPLACE + * + * Parameters: + * n: Number of rows [in] + * m: Number of columns [in] + * nnzA: Number of nonzeros [in] + * d_ia: row pointers [in/out] + * d_ja_sorted: column indices [in/out] + * d_a_sorted: coefficients [in/out] + *--------------------------------------------------------------------------*/ + void hypre_SortCSRCusparse( HYPRE_Int n, HYPRE_Int m, diff --git a/src/seq_mv/protos.h b/src/seq_mv/protos.h index 0573f4d40..fad325d8b 100644 --- a/src/seq_mv/protos.h +++ b/src/seq_mv/protos.h @@ -344,7 +344,6 @@ HYPRE_Int hypre_CSRMatrixIntSpMVDevice( HYPRE_Int num_rows, HYPRE_Int num_nonzer HYPRE_Int alpha, HYPRE_Int *d_ia, HYPRE_Int *d_ja, HYPRE_Int *d_a, HYPRE_Int *d_x, HYPRE_Int beta, HYPRE_Int *d_y ); -HYPRE_Int hypre_CSRMatrixGetMainDiag(hypre_CSRMatrix *A, hypre_Vector *diag); #if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) || defined(HYPRE_USING_ONEMKLSPARSE) hypre_CsrsvData* hypre_CsrsvDataCreate(); @@ -359,8 +358,3 @@ hypre_GpuMatData* hypre_CSRMatrixGetGPUMatData(hypre_CSRMatrix *matrix); #define hypre_CSRMatrixGPUMatSpMVBuffer(matrix) ( hypre_GpuMatDataSpMVBuffer (hypre_CSRMatrixGetGPUMatData(matrix)) ) #endif void hypre_CSRMatrixGpuSpMVAnalysis(hypre_CSRMatrix *matrix); - -#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) -HYPRE_Int -hypre_CSRMatrixApplyRowColPermutation( hypre_CSRMatrix *A, HYPRE_Int *perm, HYPRE_Int *rqperm, hypre_CSRMatrix **B ); -#endif diff --git a/src/seq_mv/seq_mv.h b/src/seq_mv/seq_mv.h index 4c22e668c..2e37169be 100644 --- a/src/seq_mv/seq_mv.h +++ b/src/seq_mv/seq_mv.h @@ -616,7 +616,6 @@ HYPRE_Int hypre_CSRMatrixIntSpMVDevice( HYPRE_Int num_rows, HYPRE_Int num_nonzer HYPRE_Int alpha, HYPRE_Int *d_ia, HYPRE_Int *d_ja, HYPRE_Int *d_a, HYPRE_Int *d_x, HYPRE_Int beta, HYPRE_Int *d_y ); -HYPRE_Int hypre_CSRMatrixGetMainDiag(hypre_CSRMatrix *A, hypre_Vector *diag); #if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) || defined(HYPRE_USING_ONEMKLSPARSE) hypre_CsrsvData* hypre_CsrsvDataCreate(); @@ -632,11 +631,6 @@ hypre_GpuMatData* hypre_CSRMatrixGetGPUMatData(hypre_CSRMatrix *matrix); #endif void hypre_CSRMatrixGpuSpMVAnalysis(hypre_CSRMatrix *matrix); -#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) -HYPRE_Int -hypre_CSRMatrixApplyRowColPermutation( hypre_CSRMatrix *A, HYPRE_Int *perm, HYPRE_Int *rqperm, hypre_CSRMatrix **B ); -#endif - #ifdef __cplusplus } #endif