Cleaned up some comments and labeled things better.
jaycedowell committed Oct 10, 2017
1 parent 4506420 commit 9541a30
Showing 5 changed files with 34 additions and 25 deletions.
33 changes: 17 additions & 16 deletions src/guantize.cu
@@ -35,7 +35,10 @@
#include <limits>
#include <cmath>

// HACK TESTING
#include <iostream>
using std::cout;
using std::endl;

using std::max;
using std::min;
@@ -63,14 +66,6 @@ inline __device__ F clip_1bit(F x) {
template<typename IType, typename SType, typename OType>
__device__
void guantize(IType ival, SType scale, OType& oval) {
//std::cout << (int)minval<OType>() << ", " << (int)maxval<OType>() << std::endl;
//std::cout << scale << std::endl;
//std::cout << ival
// << " --> " << ival*scale
// << " --> " << clip<OType>(ival*scale)
// << " --> " << rint(clip<OType>(ival*scale))
// << " --> " << (int)OType(rint(clip<OType>(ival*scale)))
// << std::endl;
oval = OType(rint(clip<OType>(ival*scale)));
}

@@ -104,7 +99,7 @@ void foreach_simple_gpu(T const* in,

if( v0 < nelement ) {
func(in[v0], out[v0]);
//std::cout << std::hex << (int)in[i] << " --> " << (int)out[i] << std::endl;

}
}

@@ -114,7 +109,6 @@ inline void launch_foreach_simple_gpu(T const* in,
Size nelement,
Func func,
cudaStream_t stream=0) {
//cout << "LAUNCH for " << nelement << endl;
dim3 block(512, 1); // TODO: Tune this
Size first = std::min((nelement-1)/block.x+1, 65535ul);
Size secnd = std::min((nelement - first*block.x) / first + 1, 65535ul);
@@ -160,7 +154,6 @@ void foreach_simple_gpu_4bit(T const* in,
byteswap_gpu(tempR, &tempR);
byteswap_gpu(tempI, &tempI);
}
//std::cout << tempR << ", " << tempI << " --> " << rint(clip_4bit(tempR)) << ", " << rint(clip_4bit(tempI)) << '\n';
tempO = (((int8_t(rint(clip_4bit(tempR*func.scale)))*16) ) & 0xF0) | \
(((int8_t(rint(clip_4bit(tempI*func.scale)))*16) >> 4) & 0x0F);
if(func.byteswap_out) {
@@ -177,7 +170,6 @@ inline void launch_foreach_simple_gpu_4bit(T const* in,
Func func,
cudaStream_t stream=0) {
nelement /= 2;
//cout << "LAUNCH for " << nelement << endl;
dim3 block(512, 1); // TODO: Tune this
Size first = std::min((nelement-1)/block.x+1, 65535ul);
Size secnd = std::min((nelement - first*block.x) / first + 1, 65535ul);
@@ -229,7 +221,6 @@ void foreach_simple_gpu_2bit(T const* in,
byteswap_gpu(tempC, &tempC);
byteswap_gpu(tempD, &tempD);
}
//std::cout << tempR << ", " << tempI << " --> " << rint(clip_4bit(tempR)) << ", " << rint(clip_4bit(tempI)) << '\n';
tempO = (((int8_t(rint(clip_2bit(tempA*func.scale)))*64) ) & 0xC0) | \
(((int8_t(rint(clip_2bit(tempB*func.scale)))*64) >> 2) & 0x30) | \
(((int8_t(rint(clip_2bit(tempC*func.scale)))*64) >> 4) & 0x0C) | \
@@ -248,7 +239,6 @@ inline void launch_foreach_simple_gpu_2bit(T const* in,
Func func,
cudaStream_t stream=0) {
nelement /= 4;
//cout << "LAUNCH for " << nelement << endl;
dim3 block(512, 1); // TODO: Tune this
Size first = std::min((nelement-1)/block.x+1, 65535ul);
Size secnd = std::min((nelement - first*block.x) / first + 1, 65535ul);
@@ -312,7 +302,6 @@ void foreach_simple_gpu_1bit(T const* in,
byteswap_gpu(tempG, &tempG);
byteswap_gpu(tempH, &tempH);
}
//std::cout << tempR << ", " << tempI << " --> " << rint(clip_4bit(tempR)) << ", " << rint(clip_4bit(tempI)) << '\n';
tempO = (((int8_t(rint(clip_1bit(tempA*func.scale)))*128) ) & 0x08) | \
(((int8_t(rint(clip_1bit(tempB*func.scale)))*128) >> 1) & 0x04) | \
(((int8_t(rint(clip_1bit(tempC*func.scale)))*128) >> 2) & 0x02) | \
@@ -335,7 +324,6 @@ inline void launch_foreach_simple_gpu_1bit(T const* in,
Func func,
cudaStream_t stream=0) {
nelement /= 8;
//cout << "LAUNCH for " << nelement << endl;
dim3 block(512, 1); // TODO: Tune this
Size first = std::min((nelement-1)/block.x+1, 65535ul);
Size secnd = std::min((nelement - first*block.x) / first + 1, 65535ul);
@@ -363,34 +351,47 @@ inline void launch_foreach_simple_gpu_1bit(T const* in,
BF_STATUS_INTERNAL_ERROR);
}

// Instantiation - Guantize functors used in quantize.cpp
//// unsigned
template class GuantizeFunctor<float,float,uint8_t>;
template class GuantizeFunctor<float,double,uint8_t>;
template class GuantizeFunctor<float,float,uint16_t>;
template class GuantizeFunctor<float,double,uint16_t>;
template class GuantizeFunctor<float,float,uint32_t>;
template class GuantizeFunctor<float,double,uint32_t>;
//// signed
template class GuantizeFunctor<float,float,int8_t>;
template class GuantizeFunctor<float,double,int8_t>;
template class GuantizeFunctor<float,float,int16_t>;
template class GuantizeFunctor<float,double,int16_t>;
template class GuantizeFunctor<float,float,int32_t>;
template class GuantizeFunctor<float,double,int32_t>;

// Instantiation - launch_foreach_simple_gpu_1bit calls used in quantize.cpp
template void launch_foreach_simple_gpu_1bit<float,GuantizeFunctor<float,float,uint8_t>,size_t>(float const* in, int8_t* out, size_t nelement, GuantizeFunctor<float,float,uint8_t> func, cudaStream_t stream);
template void launch_foreach_simple_gpu_1bit<float,GuantizeFunctor<float,double,uint8_t>,size_t>(float const* in, int8_t* out, size_t nelement, GuantizeFunctor<float,double,uint8_t> func, cudaStream_t stream);

// Instantiation - launch_foreach_simple_gpu_2bit calls used in quantize.cpp
template void launch_foreach_simple_gpu_2bit<float,GuantizeFunctor<float,float,uint8_t>,size_t>(float const* in, int8_t* out, size_t nelement, GuantizeFunctor<float,float,uint8_t> func, cudaStream_t stream);
template void launch_foreach_simple_gpu_2bit<float,GuantizeFunctor<float,double,uint8_t>,size_t>(float const* in, int8_t* out, size_t nelement, GuantizeFunctor<float,double,uint8_t> func, cudaStream_t stream);

// Instantiation - launch_foreach_simple_gpu_4bit calls used in quantize.cpp
template void launch_foreach_simple_gpu_4bit<float,GuantizeFunctor<float,float,uint8_t>,size_t>(float const* in, int8_t* out, size_t nelement, GuantizeFunctor<float,float,uint8_t> func, cudaStream_t stream);
template void launch_foreach_simple_gpu_4bit<float,GuantizeFunctor<float,double,uint8_t>,size_t>(float const* in, int8_t* out, size_t nelement, GuantizeFunctor<float,double,uint8_t> func, cudaStream_t stream);

// Instantiation - launch_foreach_simple_gpu calls used in quantize.cpp
//// unsigned
template void launch_foreach_simple_gpu<float,uint8_t,GuantizeFunctor<float,float,uint8_t>,size_t>(float const* in, uint8_t* out, size_t nelement, GuantizeFunctor<float,float,uint8_t> func, cudaStream_t stream);
template void launch_foreach_simple_gpu<float,uint8_t,GuantizeFunctor<float,double,uint8_t>,size_t>(float const* in, uint8_t* out, size_t nelement, GuantizeFunctor<float,double,uint8_t> func, cudaStream_t stream);
template void launch_foreach_simple_gpu<float,uint16_t,GuantizeFunctor<float,float,uint16_t>,size_t>(float const* in, uint16_t* out, size_t nelement, GuantizeFunctor<float,float,uint16_t> func, cudaStream_t stream);
template void launch_foreach_simple_gpu<float,uint16_t,GuantizeFunctor<float,double,uint16_t>,size_t>(float const* in, uint16_t* out, size_t nelement, GuantizeFunctor<float,double,uint16_t> func, cudaStream_t stream);
template void launch_foreach_simple_gpu<float,uint32_t,GuantizeFunctor<float,float,uint32_t>,size_t>(float const* in, uint32_t* out, size_t nelement, GuantizeFunctor<float,float,uint32_t> func, cudaStream_t stream);
template void launch_foreach_simple_gpu<float,uint32_t,GuantizeFunctor<float,double,uint32_t>,size_t>(float const* in, uint32_t* out, size_t nelement, GuantizeFunctor<float,double,uint32_t> func, cudaStream_t stream);
//// signed
template void launch_foreach_simple_gpu<float,int8_t,GuantizeFunctor<float,float,int8_t>,size_t>(float const* in, int8_t* out, size_t nelement, GuantizeFunctor<float,float,int8_t> func, cudaStream_t stream);
template void launch_foreach_simple_gpu<float,int8_t,GuantizeFunctor<float,double,int8_t>,size_t>(float const* in, int8_t* out, size_t nelement, GuantizeFunctor<float,double,int8_t> func, cudaStream_t stream);
template void launch_foreach_simple_gpu<float,int16_t,GuantizeFunctor<float,float,int16_t>,size_t>(float const* in, int16_t* out, size_t nelement, GuantizeFunctor<float,float,int16_t> func, cudaStream_t stream);
template void launch_foreach_simple_gpu<float,int16_t,GuantizeFunctor<float,double,int16_t>,size_t>(float const* in, int16_t* out, size_t nelement, GuantizeFunctor<float,double,int16_t> func, cudaStream_t stream);
template void launch_foreach_simple_gpu<float,int32_t,GuantizeFunctor<float,float,int32_t>,size_t>(float const* in, int32_t* out, size_t nelement, GuantizeFunctor<float,float,int32_t> func, cudaStream_t stream);
template void launch_foreach_simple_gpu<float,int32_t,GuantizeFunctor<float,double,int32_t>,size_t>(float const* in, int32_t* out, size_t nelement, GuantizeFunctor<float,double,int32_t> func, cudaStream_t stream);
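
For orientation, the nibble packing that foreach_simple_gpu_4bit performs above can be reproduced on the host. The following is a minimal standalone sketch, not code from this commit: clip_4bit and pack_4bit here are hypothetical stand-ins that assume a signed 4-bit range of [-8, 7] and mirror the kernel's shift-and-mask expression (real sample in the high nibble, imaginary sample in the low nibble).

// Standalone host-side sketch of the assumed 4-bit packing scheme.
#include <cstdint>
#include <cstdio>
#include <cmath>
#include <algorithm>

static inline float clip_4bit(float x) {
    // Assumed clipping range for a signed 4-bit sample.
    return std::min(std::max(x, -8.0f), 7.0f);
}

static inline uint8_t pack_4bit(float re, float im, float scale) {
    int r = int(std::rint(clip_4bit(re * scale)));
    int i = int(std::rint(clip_4bit(im * scale)));
    // Real part into the high nibble, imaginary part into the low nibble.
    return uint8_t(((r * 16) & 0xF0) | (((i * 16) >> 4) & 0x0F));
}

int main() {
    // (+3, -2) with unit scale packs to 0x3E.
    std::printf("0x%02X\n", unsigned(pack_4bit(3.0f, -2.0f, 1.0f)));
    return 0;
}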

1 change: 1 addition & 0 deletions src/guantize.hu
@@ -69,3 +69,4 @@ void launch_foreach_simple_gpu_1bit(T const* in,
cudaStream_t stream=0);

#endif // BF_GUANTIZE_HU_INCLUDE_GUARD_

23 changes: 14 additions & 9 deletions src/gunpack.cu
@@ -31,7 +31,6 @@
#include "assert.hpp"
#include "cuda.hpp"
#include "utils.hu"
//#include "gunpack.hu"

// HACK TESTING
#include <iostream>
@@ -208,7 +207,6 @@ __global__ void foreach_simple_gpu(T const* in,

if( v0 < nelement ) {
func(in[v0], out[v0]);
//std::cout << std::hex << (int)in[i] << " --> " << (int)out[i] << std::endl;
}
}

@@ -218,7 +216,6 @@ inline void launch_foreach_simple_gpu(T const* in,
Size nelement,
Func func,
cudaStream_t stream) {
//cout << "LAUNCH for " << nelement << endl;
dim3 block(512, 1); // TODO: Tune this
Size first = std::min((nelement-1)/block.x+1, 65535ul);
Size secnd = std::min((nelement - first*block.x) / first + 1, 65535ul);
@@ -259,7 +256,6 @@ __global__ void foreach_promote_gpu(T const* in,
for( Size j=0; j<sizeof(U)/sizeof(T); j++ ) {
out[v0*sizeof(U)/sizeof(T) + j] = int8_t((tmp2 >> j*8) & 0xFF);
}
//std::cout << std::hex << (int)in[i] << " --> " << (int)out[i] << std::endl;
}
}

@@ -270,7 +266,6 @@ inline void launch_foreach_promote_gpu(T const* in,
Size nelement,
Func func,
cudaStream_t stream) {
//cout << "LAUNCH for " << nelement << endl;
dim3 block(512, 1); // TODO: Tune this
Size first = std::min((nelement-1)/block.x+1, 65535ul);
Size secnd = std::min((nelement - first*block.x) / first + 1, 65535ul);
@@ -298,23 +293,33 @@ inline void launch_foreach_promote_gpu(T const* in,
BF_STATUS_INTERNAL_ERROR);
}

// Instantiation - Gunpack functors used in unpack.cpp
//// unsigned
template class GunpackFunctor<uint8_t,uint16_t>;
template class GunpackFunctor<uint8_t,uint32_t>;
template class GunpackFunctor<uint8_t,uint64_t>;
//// signed
template class GunpackFunctor<uint8_t,int16_t>;
template class GunpackFunctor<uint8_t,int32_t>;
template class GunpackFunctor<uint8_t,int64_t>;

// Instantiation - launch_foreach_simple_gpu calls used in unpack.cpp
//// unsigned
template void launch_foreach_simple_gpu<uint8_t,uint16_t,GunpackFunctor<uint8_t,uint16_t>,size_t>(uint8_t const *in, uint16_t* out,size_t nelement,GunpackFunctor<uint8_t,uint16_t> func,cudaStream_t stream);
template void launch_foreach_simple_gpu<uint8_t,uint32_t,GunpackFunctor<uint8_t,uint32_t>,size_t>(uint8_t const *in, uint32_t* out,size_t nelement,GunpackFunctor<uint8_t,uint32_t> func,cudaStream_t stream);
template void launch_foreach_simple_gpu<uint8_t,uint64_t,GunpackFunctor<uint8_t,uint64_t>,size_t>(uint8_t const *in, uint64_t* out,size_t nelement,GunpackFunctor<uint8_t,uint64_t> func,cudaStream_t stream);
//// signed
template void launch_foreach_simple_gpu<uint8_t,int16_t,GunpackFunctor<uint8_t,int16_t>,size_t>(uint8_t const *in, int16_t* out,size_t nelement,GunpackFunctor<uint8_t,int16_t> func,cudaStream_t stream);
template void launch_foreach_simple_gpu<uint8_t,int32_t,GunpackFunctor<uint8_t,int32_t>,size_t>(uint8_t const *in, int32_t* out,size_t nelement,GunpackFunctor<uint8_t,int32_t> func,cudaStream_t stream);
template void launch_foreach_simple_gpu<uint8_t,int64_t,GunpackFunctor<uint8_t,int64_t>,size_t>(uint8_t const *in, int64_t* out,size_t nelement,GunpackFunctor<uint8_t,int64_t> func,cudaStream_t stream);

template void launch_foreach_promote_gpu<uint8_t,int64_t,float,GunpackFunctor<uint8_t,int64_t>,size_t>(uint8_t const *in, int64_t* tmp, float* out,size_t nelement,GunpackFunctor<uint8_t,int64_t> func,cudaStream_t stream);
template void launch_foreach_promote_gpu<uint8_t,int32_t,float,GunpackFunctor<uint8_t,int32_t>,size_t>(uint8_t const *in, int32_t* tmp, float* out,size_t nelement,GunpackFunctor<uint8_t,int32_t> func,cudaStream_t stream);
// Instantiation - launch_foreach_promote_gpu calls used in unpack.cpp
//// promote to float
template void launch_foreach_promote_gpu<uint8_t,int16_t,float,GunpackFunctor<uint8_t,int16_t>,size_t>(uint8_t const *in, int16_t* tmp, float* out,size_t nelement,GunpackFunctor<uint8_t,int16_t> func,cudaStream_t stream);
template void launch_foreach_promote_gpu<uint8_t,int64_t,double,GunpackFunctor<uint8_t,int64_t>,size_t>(uint8_t const *in, int64_t* tmp, double* out,size_t nelement,GunpackFunctor<uint8_t,int64_t> func,cudaStream_t stream);
template void launch_foreach_promote_gpu<uint8_t,int32_t,float,GunpackFunctor<uint8_t,int32_t>,size_t>(uint8_t const *in, int32_t* tmp, float* out,size_t nelement,GunpackFunctor<uint8_t,int32_t> func,cudaStream_t stream);
template void launch_foreach_promote_gpu<uint8_t,int64_t,float,GunpackFunctor<uint8_t,int64_t>,size_t>(uint8_t const *in, int64_t* tmp, float* out,size_t nelement,GunpackFunctor<uint8_t,int64_t> func,cudaStream_t stream);
//// promote to double
template void launch_foreach_promote_gpu<uint8_t,int16_t,double,GunpackFunctor<uint8_t,int16_t>,size_t>(uint8_t const *in, int16_t* tmp, double* out,size_t nelement,GunpackFunctor<uint8_t,int16_t> func,cudaStream_t stream);
template void launch_foreach_promote_gpu<uint8_t,int32_t,double,GunpackFunctor<uint8_t,int32_t>,size_t>(uint8_t const *in, int32_t* tmp, double* out,size_t nelement,GunpackFunctor<uint8_t,int32_t> func,cudaStream_t stream);
template void launch_foreach_promote_gpu<uint8_t,int16_t,double,GunpackFunctor<uint8_t,int16_t>,size_t>(uint8_t const *in, int16_t* tmp, double* out,size_t nelement,GunpackFunctor<uint8_t,int16_t> func,cudaStream_t stream);
template void launch_foreach_promote_gpu<uint8_t,int64_t,double,GunpackFunctor<uint8_t,int64_t>,size_t>(uint8_t const *in, int64_t* tmp, double* out,size_t nelement,GunpackFunctor<uint8_t,int64_t> func,cudaStream_t stream);
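
The launch_* helpers in both files share the same launch pattern: a 512-thread block, a block count split across grid.x and grid.y so that neither dimension exceeds 65535, and a v0 < nelement guard in the kernel to absorb any overshoot. Below is a simplified standalone sketch of that pattern under those assumptions; it does not reproduce the exact grid arithmetic from the diff, and touch_each is an invented kernel used only for illustration.

// Simplified sketch of the 2-D grid-sizing pattern assumed above.
#include <algorithm>
#include <cstdio>
#include <cuda_runtime.h>

__global__ void touch_each(float* data, size_t nelement) {
    // Flatten the 2-D grid back into a 1-D element index, then bounds-check.
    size_t v0 = (size_t(blockIdx.y) * gridDim.x + blockIdx.x) * blockDim.x
              + threadIdx.x;
    if( v0 < nelement ) {
        data[v0] *= 2.0f;
    }
}

int main() {
    size_t nelement = 100 * 1000 * 1000;  // more than 65535 * 512 elements
    float* d_data = 0;
    cudaMalloc((void**)&d_data, nelement * sizeof(float));

    dim3 block(512, 1);                                   // same block shape as the launchers
    size_t nblock = (nelement + block.x - 1) / block.x;   // ceiling division into blocks
    dim3 grid((unsigned)std::min<size_t>(nblock, 65535),  // cap grid.x at 65535
              (unsigned)((nblock + 65534) / 65535));      // spill the rest into grid.y
    touch_each<<<grid, block>>>(d_data, nelement);
    cudaDeviceSynchronize();
    cudaFree(d_data);
    return 0;
}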

1 change: 1 addition & 0 deletions src/gunpack.hu
@@ -56,3 +56,4 @@ void launch_foreach_promote_gpu(T const* in,
cudaStream_t stream=0);

#endif // BF_GUNPACK_HU_INCLUDE_GUARD_

1 change: 1 addition & 0 deletions src/unpack.cpp
@@ -520,3 +520,4 @@ BFstatus bfUnpack(BFarray const* in,
#endif
return BF_STATUS_SUCCESS;
}
