Add optimized TTM and MTTKRP kernels

fruitfly1026 committed Apr 24, 2017
1 parent 895fe09 commit 997bab70a745dc98112937f09f564fa8775fe700
README.md
@@ -13,7 +13,6 @@ A Parallel Tensor Infrastructure (ParTI!), is to support fast essential sparse t
 * Sparse tensor-times-dense matrix (SpTTM)
 * Sparse matricized tensor times Khatri-Rao product (SpMTTKRP)
 * Sparse tensor matricization
-* Sparse CANDECOMP/PARAFAC decomposition
 ## Build requirements:
@@ -74,5 +73,5 @@ A Parallel Tensor Infrastructure (ParTI!), is to support fast essential sparse t
 ## Contributors
-* Yuchen Ma (Contact: m13253@hotmail.com)
 * Jiajia Li (Contact: jiajiali@gatech.edu)
+* Yuchen Ma (Contact: m13253@hotmail.com)
tests/mttkrp.c
@@ -16,42 +16,46 @@
   If not, see <http://www.gnu.org/licenses/>.
 */
-#include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <omp.h>
 #include <ParTI.h>
 #include "../src/sptensor/sptensor.h"
 int main(int argc, char const *argv[]) {
     FILE *fX, *fo;
     sptSparseTensor X;
     sptMatrix ** U;
-    sptSizeVector mats_order;
     sptVector scratch;
     size_t mode = 0;
     size_t R = 16;
     int cuda_dev_id = -2;
     int niters = 5;
     int nthreads;
+    int impl_num = 0;
-    if(argc < 3) {
-        printf("Usage: %s X mode [cuda_dev_id, R, Y]\n\n", argv[0]);
+    if(argc < 4) {
+        printf("Usage: %s X mode impl_num [cuda_dev_id, R, Y]\n\n", argv[0]);
         return 1;
     }
     fX = fopen(argv[1], "r");
-    assert(fX != NULL);
+    sptAssert(fX != NULL);
+    printf("input file: %s\n", argv[1]); fflush(stdout);
-    sptLoadSparseTensor(&X, 1, fX);
+    // assert(sptLoadSparseTensor(&X, 1, fX) == 0);
+    sptAssert(sptLoadSparseTensor(&X, 1, fX) == 0);
     fclose(fX);
+    printf("Tensor ndims:\n");
+    spt_DumpArray(X.ndims, X.nmodes, 0, stdout);
+    printf("Tensor NNZ: %zu\n", X.nnz);
     sscanf(argv[2], "%zu", &mode);
-    if(argc >= 4) {
-        sscanf(argv[3], "%d", &cuda_dev_id);
-    }
+    sscanf(argv[3], "%d", &impl_num);
     if(argc >= 5) {
-        sscanf(argv[4], "%zu", &R);
+        sscanf(argv[4], "%d", &cuda_dev_id);
     }
+    if(argc >= 6) {
+        sscanf(argv[5], "%zu", &R);
+    }
     size_t nmodes = X.nmodes;
@@ -61,30 +65,31 @@ int main(int argc, char const *argv[]) {
     }
     size_t max_ndims = 0;
     for(size_t m=0; m<nmodes; ++m) {
-        assert(sptRandomizeMatrix(U[m], X.ndims[m], R) == 0);
+        // sptAssert(sptRandomizeMatrix(U[m], X.ndims[m], R) == 0);
+        sptAssert(sptNewMatrix(U[m], X.ndims[m], R) == 0);
+        sptAssert(sptConstantMatrix(U[m], 1) == 0);
         if(X.ndims[m] > max_ndims)
             max_ndims = X.ndims[m];
     }
-    assert(sptNewMatrix(U[nmodes], max_ndims, R) == 0);
-    assert(sptConstantMatrix(U[nmodes], 0) == 0);
+    sptAssert(sptNewMatrix(U[nmodes], max_ndims, R) == 0);
+    sptAssert(sptConstantMatrix(U[nmodes], 0) == 0);
     size_t stride = U[0]->stride;
-    sptNewSizeVector(&mats_order, nmodes-1, nmodes-1);
-    size_t j = 0;
-    for(int m=nmodes-1; m>=0; --m) {
-        if(m != (int)mode) {
-            mats_order.data[j] = m;
-            ++ j;
-        }
-    }
+    size_t * mats_order = (size_t*)malloc(nmodes * sizeof(size_t));
+    mats_order[0] = mode;
+    for(size_t i=1; i<nmodes; ++i)
+        mats_order[i] = (mode+i) % nmodes;
+    printf("mats_order:\n");
+    spt_DumpArray(mats_order, nmodes, 0, stdout);
     /* For warm-up caches, timing not included */
     if(cuda_dev_id == -2) {
         nthreads = 1;
         sptNewVector(&scratch, R, R);
         sptConstantVector(&scratch, 0);
-        assert(sptMTTKRP(&X, U, &mats_order, mode, &scratch) == 0);
+        sptAssert(sptMTTKRP(&X, U, mats_order, mode, &scratch) == 0);
         sptFreeVector(&scratch);
     } else if(cuda_dev_id == -1) {
 #pragma omp parallel
@@ -94,20 +99,21 @@ int main(int argc, char const *argv[]) {
         printf("nthreads: %d\n", nthreads);
         sptNewVector(&scratch, X.nnz * stride, X.nnz * stride);
         sptConstantVector(&scratch, 0);
-        assert(sptOmpMTTKRP(&X, U, &mats_order, mode, &scratch) == 0);
+        sptAssert(sptOmpMTTKRP(&X, U, mats_order, mode, &scratch) == 0);
         sptFreeVector(&scratch);
     } else {
-        sptCudaSetDevice(cuda_dev_id);
-        assert(sptCudaMTTKRP(&X, U, &mats_order, mode) == 0);
+        sptCudaSetDevice(cuda_dev_id);
+        // sptAssert(sptCudaMTTKRP(&X, U, mats_order, mode, impl_num) == 0);
+        sptAssert(sptCudaMTTKRPOneKernel(&X, U, mats_order, mode, impl_num) == 0);
     }
     for(int it=0; it<niters; ++it) {
+        sptAssert(sptConstantMatrix(U[nmodes], 0) == 0);
         if(cuda_dev_id == -2) {
             nthreads = 1;
             sptNewVector(&scratch, R, R);
             sptConstantVector(&scratch, 0);
-            assert(sptMTTKRP(&X, U, &mats_order, mode, &scratch) == 0);
+            sptAssert(sptMTTKRP(&X, U, mats_order, mode, &scratch) == 0);
             sptFreeVector(&scratch);
         } else if(cuda_dev_id == -1) {
 #pragma omp parallel
@@ -117,25 +123,27 @@ int main(int argc, char const *argv[]) {
             printf("nthreads: %d\n", nthreads);
             sptNewVector(&scratch, X.nnz * stride, X.nnz * stride);
             sptConstantVector(&scratch, 0);
-            assert(sptOmpMTTKRP(&X, U, &mats_order, mode, &scratch) == 0);
+            sptAssert(sptOmpMTTKRP(&X, U, mats_order, mode, &scratch) == 0);
             sptFreeVector(&scratch);
         } else {
-            sptCudaSetDevice(cuda_dev_id);
-            assert(sptCudaMTTKRP(&X, U, &mats_order, mode) == 0);
+            sptCudaSetDevice(cuda_dev_id);
+            // sptAssert(sptCudaMTTKRP(&X, U, mats_order, mode, impl_num) == 0);
+            sptAssert(sptCudaMTTKRPOneKernel(&X, U, mats_order, mode, impl_num) == 0);
         }
     }
     for(size_t m=0; m<nmodes; ++m) {
         sptFreeMatrix(U[m]);
     }
     sptFreeSparseTensor(&X);
-    sptFreeSizeVector(&mats_order);
+    free(mats_order);
-    if(argc >= 6) {
-        fo = fopen(argv[5], "w");
-        assert(fo != NULL);
-        assert(sptDumpMatrix(U[nmodes], fo) == 0);
+    if(argc >= 7) {
+        fo = fopen(argv[6], "w");
+        sptAssert(fo != NULL);
+        sptAssert(sptDumpMatrix(U[nmodes], fo) == 0);
         fclose(fo);
     }
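
A note on the mats_order change in this file: the old driver built an sptSizeVector holding the non-target modes in descending order, while the new one fills a plain size_t array that starts at the target mode and wraps around cyclically, passing the raw array to the kernels. A standalone sketch of the new ordering (illustration only, not code from this commit):

#include <stdio.h>
#include <stddef.h>

int main(void) {
    /* Mirror the driver's construction for nmodes = 4, mode = 2. */
    size_t nmodes = 4, mode = 2;
    size_t mats_order[4];
    mats_order[0] = mode;                      /* target mode comes first */
    for(size_t i = 1; i < nmodes; ++i)
        mats_order[i] = (mode + i) % nmodes;   /* then wrap cyclically */
    for(size_t i = 0; i < nmodes; ++i)
        printf("%zu ", mats_order[i]);         /* prints: 2 3 0 1 */
    printf("\n");
    return 0;
}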
tests/ttm.c
@@ -16,7 +16,6 @@
   If not, see <http://www.gnu.org/licenses/>.
 */
-#include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <ParTI.h>
@@ -31,61 +30,73 @@ int main(int argc, char const *argv[]) {
     int cuda_dev_id = -2;
     int niters = 5;
-    if(argc < 3) {
-        printf("Usage: %s X mode [cuda_dev_id, R, Y]\n\n", argv[0]);
+    if(argc < 5) {
+        printf("Usage: %s X mode impl_num smem_size [cuda_dev_id, R, Y]\n\n", argv[0]);
         return 1;
     }
     fX = fopen(argv[1], "r");
-    assert(fX != NULL);
-    assert(sptLoadSparseTensor(&X, 1, fX) == 0);
+    sptAssert(fX != NULL);
+    printf("input file: %s\n", argv[1]); fflush(stdout);
+    sptAssert(sptLoadSparseTensor(&X, 1, fX) == 0);
     fclose(fX);
     sscanf(argv[2], "%zu", &mode);
-    if(argc >= 4) {
-        sscanf(argv[3], "%d", &cuda_dev_id);
+    size_t impl_num = 0;
+    sscanf(argv[3], "%zu", &impl_num);
+    size_t smem_size = 0;
+    sscanf(argv[4], "%zu", &smem_size);
+    if(argc > 5) {
+        sscanf(argv[5], "%d", &cuda_dev_id);
     }
-    if(argc >= 5) {
-        sscanf(argv[4], "%zu", &R);
+    if(argc > 6) {
+        sscanf(argv[6], "%zu", &R);
     }
-    assert(sptRandomizeMatrix(&U, X.ndims[mode], R) == 0);
+    sptAssert(sptRandomizeMatrix(&U, X.ndims[mode], R) == 0);
+    sptAssert(sptNewMatrix(&U, X.ndims[mode], R) == 0);
+    sptAssert(sptConstantMatrix(&U, 1) == 0);
     /* For warm-up caches, timing not included */
     if(cuda_dev_id == -2) {
-        assert(sptSparseTensorMulMatrix(&Y, &X, &U, mode) == 0);
+        sptAssert(sptSparseTensorMulMatrix(&Y, &X, &U, mode) == 0);
     } else if(cuda_dev_id == -1) {
-        assert(sptOmpSparseTensorMulMatrix(&Y, &X, &U, mode) == 0);
+        sptAssert(sptOmpSparseTensorMulMatrix(&Y, &X, &U, mode) == 0);
     } else {
         sptCudaSetDevice(cuda_dev_id);
-        assert(sptCudaSparseTensorMulMatrix(&Y, &X, &U, mode) == 0);
+        // sptAssert(sptCudaSparseTensorMulMatrix(&Y, &X, &U, mode) == 0);
+        sptAssert(sptCudaSparseTensorMulMatrixOneKernel(&Y, &X, &U, mode, impl_num, smem_size) == 0);
     }
     for(int it=0; it<niters; ++it) {
-        sptFreeSemiSparseTensor(&Y);
+        // sptFreeSemiSparseTensor(&Y);
        if(cuda_dev_id == -2) {
-            assert(sptSparseTensorMulMatrix(&Y, &X, &U, mode) == 0);
+            sptAssert(sptSparseTensorMulMatrix(&Y, &X, &U, mode) == 0);
        } else if(cuda_dev_id == -1) {
-            assert(sptOmpSparseTensorMulMatrix(&Y, &X, &U, mode) == 0);
+            sptAssert(sptOmpSparseTensorMulMatrix(&Y, &X, &U, mode) == 0);
        } else {
            sptCudaSetDevice(cuda_dev_id);
-            assert(sptCudaSparseTensorMulMatrix(&Y, &X, &U, mode) == 0);
+            // sptAssert(sptCudaSparseTensorMulMatrix(&Y, &X, &U, mode) == 0);
+            sptAssert(sptCudaSparseTensorMulMatrixOneKernel(&Y, &X, &U, mode, impl_num, smem_size) == 0);
        }
    }
-    assert(sptSemiSparseTensorToSparseTensor(&spY, &Y, 1e-9) == 0);
-    sptFreeSemiSparseTensor(&Y);
-    sptFreeMatrix(&U);
-    sptFreeSparseTensor(&X);
+    if(argc > 7) {
+        sptAssert(sptSemiSparseTensorToSparseTensor(&spY, &Y, 1e-9) == 0);
-    if(argc >= 6) {
-        fY = fopen(argv[5], "w");
-        assert(fY != NULL);
-        assert(sptDumpSparseTensor(&spY, 1, fY) == 0);
+        fY = fopen(argv[7], "w");
+        sptAssert(fY != NULL);
+        sptAssert(sptDumpSparseTensor(&spY, 0, fY) == 0);
         fclose(fY);
+        sptFreeSparseTensor(&spY);
     }
-    sptFreeSparseTensor(&spY);
+    sptFreeSemiSparseTensor(&Y);
+    sptFreeMatrix(&U);
+    sptFreeSparseTensor(&X);
     return 0;
 }
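
For context on what these drivers exercise: SpTTM multiplies a sparse tensor by a dense matrix along one mode; for mode 0 of a 3-way tensor, Y(r, i2, i3) = Σ_{i1} X(i1, i2, i3) · U(i1, r). A dense-output reference sketch over a COO tensor (illustrative names and layout, not the ParTI API):

#include <stddef.h>

/* What a mode-0 TTM computes, written as a scatter over COO nonzeros.
 * U is a dense ndims[0] x R matrix, Y a zero-initialized dense
 * R x dim1 x dim2 buffer, both row-major. */
void ttm_mode0_reference(size_t nnz,
                         const size_t *i0, const size_t *i1, const size_t *i2,
                         const float *vals,
                         const float *U, size_t R,
                         float *Y, size_t dim1, size_t dim2)
{
    for(size_t z = 0; z < nnz; ++z)       /* one update per nonzero */
        for(size_t r = 0; r < R; ++r)     /* expand mode 0 to length R */
            Y[(r * dim1 + i1[z]) * dim2 + i2[z]] += vals[z] * U[i0[z] * R + r];
}

Judging by the parameter names, the new OneKernel entry point takes impl_num to select among kernel variants and smem_size as a per-block shared-memory budget; the driver forwards both straight from the command line.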
include/ParTI.h
@@ -78,12 +78,12 @@ typedef struct {
  * Sparse tensor type
  */
 typedef struct {
-    size_t nmodes;       /// # modes
-    size_t sortkey;      /// the least significant mode during sort
-    size_t *ndims;       /// size of each mode, length nmodes
-    size_t nnz;          /// # non-zeros
-    sptSizeVector *inds; /// indices of each element, length [nmodes][nnz]
-    sptVector values;    /// non-zero values, length nnz
+    size_t nmodes;       /// # modes
+    size_t *sortorder;   /// the order in which the indices are sorted
+    size_t *ndims;       /// size of each mode, length nmodes
+    size_t nnz;          /// # non-zeros
+    sptSizeVector *inds; /// indices of each element, length [nmodes][nnz]
+    sptVector values;    /// non-zero values, length nnz
 } sptSparseTensor;
 /**
@@ -122,6 +122,13 @@ typedef enum {
 int sptGetLastError(const char **module, const char **file, unsigned *line, const char **reason);
 void sptClearLastError(void);
 void spt_Panic(const char *file, unsigned line, const char *expr);
+/**
+ * The assert macro that always executes, even when `NDEBUG` is set
+ *
+ * Quick & dirty error checking. Useful when writing small programs.
+ */
+#define sptAssert(expr) ((expr) ? (void) 0 : spt_Panic(__FILE__, __LINE__, #expr))
 /* Helper function for pure C module */
 int sptCudaSetDevice(int device);
@@ -136,6 +143,10 @@ double sptPrintElapsedTime(const sptTimer timer, const char *name);
 int sptFreeTimer(sptTimer timer);
+/* Dense array */
+size_t sptMaxSizeArray(size_t const * const indices, size_t const size);
+void spt_DumpArray(const size_t array[], size_t length, size_t start_index, FILE *fp);
 /* Dense vector, aka variable length array */
 int sptNewVector(sptVector *vec, size_t len, size_t cap);
 int sptConstantVector(sptVector * const vec, sptScalar const val);
@@ -196,8 +207,8 @@ int sptCopySparseTensor(sptSparseTensor *dest, const sptSparseTensor *src);
 void sptFreeSparseTensor(sptSparseTensor *tsr);
 int sptLoadSparseTensor(sptSparseTensor *tsr, size_t start_index, FILE *fp);
 int sptDumpSparseTensor(const sptSparseTensor *tsr, size_t start_index, FILE *fp);
-void sptSparseTensorSortIndex(sptSparseTensor *tsr);
-void sptSparseTensorSortIndexAtMode(sptSparseTensor *tsr, size_t mode);
+void sptSparseTensorSortIndex(sptSparseTensor *tsr, int force);
+void sptSparseTensorSortIndexAtMode(sptSparseTensor *tsr, size_t mode, int force);
 /**
  * epsilon is a small positive value, every -epsilon < x < epsilon would be considered as zero
@@ -231,6 +242,7 @@ int sptSparseTensorDotDiv(sptSparseTensor *Z, const sptSparseTensor *X, const sp
 int sptSparseTensorMulMatrix(sptSemiSparseTensor *Y, sptSparseTensor *X, const sptMatrix *U, size_t mode);
 int sptOmpSparseTensorMulMatrix(sptSemiSparseTensor *Y, sptSparseTensor *X, const sptMatrix *U, size_t mode);
 int sptCudaSparseTensorMulMatrix(sptSemiSparseTensor *Y, sptSparseTensor *X, const sptMatrix *U, size_t mode);
+int sptCudaSparseTensorMulMatrixOneKernel(sptSemiSparseTensor *Y, sptSparseTensor *X, const sptMatrix *U, size_t mode, size_t const impl_num, size_t const smem_size);
 /**
  * Kronecker product
@@ -246,32 +258,25 @@ int sptSparseTensorKhatriRaoMul(sptSparseTensor *Y, const sptSparseTensor *A, co
  * Matricized tensor times Khatri-Rao product.
  */
 int sptMTTKRP(sptSparseTensor const * const X,
-    sptMatrix ** const mats,                // mats[nmodes] as temporary space.
-    sptSizeVector const * const mats_order, // Corresponds to the mode order of X.
-    size_t const mode,
-    sptVector * scratch);
+    sptMatrix * mats[],          // mats[nmodes] as temporary space.
+    size_t const mats_order[],   // Corresponds to the mode order of X.
+    size_t const mode,
+    sptVector * scratch);
 int sptOmpMTTKRP(sptSparseTensor const * const X,
-    sptMatrix ** const mats,                // mats[nmodes] as temporary space.
-    sptSizeVector const * const mats_order, // Corresponds to the mode order of X.
-    size_t const mode,
-    sptVector * scratch);
+    sptMatrix * mats[],          // mats[nmodes] as temporary space.
+    size_t const mats_order[],   // Corresponds to the mode order of X.
+    size_t const mode,
+    sptVector * scratch);
 int sptCudaMTTKRP(sptSparseTensor const * const X,
     sptMatrix ** const mats,                // mats[nmodes] as temporary space.
     sptSizeVector const * const mats_order, // Corresponds to the mode order of X.
     size_t const mode);
-int sptCudaMTTKRPDevice(
-    const size_t mode,
-    const size_t nmodes,
-    const size_t nnz,
-    const size_t rank,
-    const size_t stride,
-    const size_t * Xndims,
-    size_t ** const Xinds,
-    const sptScalar * Xvals,
-    const size_t * dev_mats_order,
-    sptScalar ** dev_mats,
-    sptScalar * dev_scratch);
+int sptCudaMTTKRPOneKernel(
+    sptSparseTensor const * const X,
+    sptMatrix ** const mats,     // mats[nmodes] as temporary space.
+    size_t * const mats_order,   // Corresponds to the mode order of X.
+    size_t const mode,
+    size_t const impl_num);
 #ifdef __cplusplus
 }
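
The sptAssert macro added above is the reason both test drivers drop <assert.h>: assert() compiles to nothing when NDEBUG is defined, and since the drivers wrap real work such as sptLoadSparseTensor inside the assertion, a release build with plain assert would silently skip those calls. A minimal illustration of the difference (hypothetical names, not ParTI code):

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

static void panic(const char *file, unsigned line, const char *expr) {
    fprintf(stderr, "%s:%u: %s failed\n", file, line, expr);
    abort();
}

/* Same shape as the sptAssert macro: the expression is always
 * evaluated, regardless of NDEBUG. */
#define MY_ASSERT(expr) ((expr) ? (void) 0 : panic(__FILE__, __LINE__, #expr))

static int load(void) { return 1; /* pretend the load failed */ }

int main(void) {
    /* Compile with -DNDEBUG: the next line disappears entirely,
     * including the call to load(). */
    assert(load() == 0);
    /* This one still runs and reports the failure under -DNDEBUG. */
    MY_ASSERT(load() == 0);
    return 0;
}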