Cleaned up some comments and labeled things better.
jaycedowell committed Oct 10, 2017
1 parent 4506420 commit 9541a30
Showing 5 changed files with 34 additions and 25 deletions.
33 changes: 17 additions & 16 deletions src/guantize.cu
@@ -35,7 +35,10 @@
#include <limits>
#include <cmath>

// HACK TESTING
#include <iostream>
using std::cout;
using std::endl;

using std::max;
using std::min;
@@ -63,14 +66,6 @@ inline __device__ F clip_1bit(F x) {
template<typename IType, typename SType, typename OType>
__device__
void guantize(IType ival, SType scale, OType& oval) {
//std::cout << (int)minval<OType>() << ", " << (int)maxval<OType>() << std::endl;
//std::cout << scale << std::endl;
//std::cout << ival
// << " --> " << ival*scale
// << " --> " << clip<OType>(ival*scale)
// << " --> " << rint(clip<OType>(ival*scale))
// << " --> " << (int)OType(rint(clip<OType>(ival*scale)))
// << std::endl;
oval = OType(rint(clip<OType>(ival*scale)));
}

@@ -104,7 +99,7 @@ void foreach_simple_gpu(T const* in,

if( v0 < nelement ) {
func(in[v0], out[v0]);
//std::cout << std::hex << (int)in[i] << " --> " << (int)out[i] << std::endl;

}
}

@@ -114,7 +109,6 @@ inline void launch_foreach_simple_gpu(T const* in,
Size nelement,
Func func,
cudaStream_t stream=0) {
//cout << "LAUNCH for " << nelement << endl;
dim3 block(512, 1); // TODO: Tune this
Size first = std::min((nelement-1)/block.x+1, 65535ul);
Size secnd = std::min((nelement - first*block.x) / first + 1, 65535ul);
@@ -160,7 +154,6 @@ void foreach_simple_gpu_4bit(T const* in,
byteswap_gpu(tempR, &tempR);
byteswap_gpu(tempI, &tempI);
}
//std::cout << tempR << ", " << tempI << " --> " << rint(clip_4bit(tempR)) << ", " << rint(clip_4bit(tempI)) << '\n';
tempO = (((int8_t(rint(clip_4bit(tempR*func.scale)))*16) ) & 0xF0) | \
(((int8_t(rint(clip_4bit(tempI*func.scale)))*16) >> 4) & 0x0F);
if(func.byteswap_out) {
@@ -177,7 +170,6 @@ inline void launch_foreach_simple_gpu_4bit(T const* in,
Func func,
cudaStream_t stream=0) {
nelement /= 2;
//cout << "LAUNCH for " << nelement << endl;
dim3 block(512, 1); // TODO: Tune this
Size first = std::min((nelement-1)/block.x+1, 65535ul);
Size secnd = std::min((nelement - first*block.x) / first + 1, 65535ul);
@@ -229,7 +221,6 @@ void foreach_simple_gpu_2bit(T const* in,
byteswap_gpu(tempC, &tempC);
byteswap_gpu(tempD, &tempD);
}
//std::cout << tempR << ", " << tempI << " --> " << rint(clip_4bit(tempR)) << ", " << rint(clip_4bit(tempI)) << '\n';
tempO = (((int8_t(rint(clip_2bit(tempA*func.scale)))*64) ) & 0xC0) | \
(((int8_t(rint(clip_2bit(tempB*func.scale)))*64) >> 2) & 0x30) | \
(((int8_t(rint(clip_2bit(tempC*func.scale)))*64) >> 4) & 0x0C) | \
@@ -248,7 +239,6 @@ inline void launch_foreach_simple_gpu_2bit(T const* in,
Func func,
cudaStream_t stream=0) {
nelement /= 4;
//cout << "LAUNCH for " << nelement << endl;
dim3 block(512, 1); // TODO: Tune this
Size first = std::min((nelement-1)/block.x+1, 65535ul);
Size secnd = std::min((nelement - first*block.x) / first + 1, 65535ul);
@@ -312,7 +302,6 @@ void foreach_simple_gpu_1bit(T const* in,
byteswap_gpu(tempG, &tempG);
byteswap_gpu(tempH, &tempH);
}
//std::cout << tempR << ", " << tempI << " --> " << rint(clip_4bit(tempR)) << ", " << rint(clip_4bit(tempI)) << '\n';
tempO = (((int8_t(rint(clip_1bit(tempA*func.scale)))*128) ) & 0x08) | \
(((int8_t(rint(clip_1bit(tempB*func.scale)))*128) >> 1) & 0x04) | \
(((int8_t(rint(clip_1bit(tempC*func.scale)))*128) >> 2) & 0x02) | \
@@ -335,7 +324,6 @@ inline void launch_foreach_simple_gpu_1bit(T const* in,
Func func,
cudaStream_t stream=0) {
nelement /= 8;
//cout << "LAUNCH for " << nelement << endl;
dim3 block(512, 1); // TODO: Tune this
Size first = std::min((nelement-1)/block.x+1, 65535ul);
Size secnd = std::min((nelement - first*block.x) / first + 1, 65535ul);
@@ -363,34 +351,47 @@ inline void launch_foreach_simple_gpu_1bit(T const* in,
BF_STATUS_INTERNAL_ERROR);
}

// Instantiation - Guantize functors used in quantize.cpp
//// unsigned
template class GuantizeFunctor<float,float,uint8_t>;
template class GuantizeFunctor<float,double,uint8_t>;
template class GuantizeFunctor<float,float,uint16_t>;
template class GuantizeFunctor<float,double,uint16_t>;
template class GuantizeFunctor<float,float,uint32_t>;
template class GuantizeFunctor<float,double,uint32_t>;
//// signed
template class GuantizeFunctor<float,float,int8_t>;
template class GuantizeFunctor<float,double,int8_t>;
template class GuantizeFunctor<float,float,int16_t>;
template class GuantizeFunctor<float,double,int16_t>;
template class GuantizeFunctor<float,float,int32_t>;
template class GuantizeFunctor<float,double,int32_t>;

// Instantiation - launch_foreach_simple_gpu_1bit calls used in quantize.cpp
template void launch_foreach_simple_gpu_1bit<float,GuantizeFunctor<float,float,uint8_t>,size_t>(float const* in, int8_t* out, size_t nelement, GuantizeFunctor<float,float,uint8_t> func, cudaStream_t stream);
template void launch_foreach_simple_gpu_1bit<float,GuantizeFunctor<float,double,uint8_t>,size_t>(float const* in, int8_t* out, size_t nelement, GuantizeFunctor<float,double,uint8_t> func, cudaStream_t stream);

// Instantiation - launch_foreach_simple_gpu_2bit calls used in quantize.cpp
template void launch_foreach_simple_gpu_2bit<float,GuantizeFunctor<float,float,uint8_t>,size_t>(float const* in, int8_t* out, size_t nelement, GuantizeFunctor<float,float,uint8_t> func, cudaStream_t stream);
template void launch_foreach_simple_gpu_2bit<float,GuantizeFunctor<float,double,uint8_t>,size_t>(float const* in, int8_t* out, size_t nelement, GuantizeFunctor<float,double,uint8_t> func, cudaStream_t stream);

// Instantiation - launch_foreach_simple_gpu_4bit calls used in quantize.cpp
template void launch_foreach_simple_gpu_4bit<float,GuantizeFunctor<float,float,uint8_t>,size_t>(float const* in, int8_t* out, size_t nelement, GuantizeFunctor<float,float,uint8_t> func, cudaStream_t stream);
template void launch_foreach_simple_gpu_4bit<float,GuantizeFunctor<float,double,uint8_t>,size_t>(float const* in, int8_t* out, size_t nelement, GuantizeFunctor<float,double,uint8_t> func, cudaStream_t stream);

// Instantiation - launch_foreach_simple_gpu calls used in quantize.cpp
//// unsigned
template void launch_foreach_simple_gpu<float,uint8_t,GuantizeFunctor<float,float,uint8_t>,size_t>(float const* in, uint8_t* out, size_t nelement, GuantizeFunctor<float,float,uint8_t> func, cudaStream_t stream);
template void launch_foreach_simple_gpu<float,uint8_t,GuantizeFunctor<float,double,uint8_t>,size_t>(float const* in, uint8_t* out, size_t nelement, GuantizeFunctor<float,double,uint8_t> func, cudaStream_t stream);
template void launch_foreach_simple_gpu<float,uint16_t,GuantizeFunctor<float,float,uint16_t>,size_t>(float const* in, uint16_t* out, size_t nelement, GuantizeFunctor<float,float,uint16_t> func, cudaStream_t stream);
template void launch_foreach_simple_gpu<float,uint16_t,GuantizeFunctor<float,double,uint16_t>,size_t>(float const* in, uint16_t* out, size_t nelement, GuantizeFunctor<float,double,uint16_t> func, cudaStream_t stream);
template void launch_foreach_simple_gpu<float,uint32_t,GuantizeFunctor<float,float,uint32_t>,size_t>(float const* in, uint32_t* out, size_t nelement, GuantizeFunctor<float,float,uint32_t> func, cudaStream_t stream);
template void launch_foreach_simple_gpu<float,uint32_t,GuantizeFunctor<float,double,uint32_t>,size_t>(float const* in, uint32_t* out, size_t nelement, GuantizeFunctor<float,double,uint32_t> func, cudaStream_t stream);
//// signed
template void launch_foreach_simple_gpu<float,int8_t,GuantizeFunctor<float,float,int8_t>,size_t>(float const* in, int8_t* out, size_t nelement, GuantizeFunctor<float,float,int8_t> func, cudaStream_t stream);
template void launch_foreach_simple_gpu<float,int8_t,GuantizeFunctor<float,double,int8_t>,size_t>(float const* in, int8_t* out, size_t nelement, GuantizeFunctor<float,double,int8_t> func, cudaStream_t stream);
template void launch_foreach_simple_gpu<float,int16_t,GuantizeFunctor<float,float,int16_t>,size_t>(float const* in, int16_t* out, size_t nelement, GuantizeFunctor<float,float,int16_t> func, cudaStream_t stream);
template void launch_foreach_simple_gpu<float,int16_t,GuantizeFunctor<float,double,int16_t>,size_t>(float const* in, int16_t* out, size_t nelement, GuantizeFunctor<float,double,int16_t> func, cudaStream_t stream);
template void launch_foreach_simple_gpu<float,int32_t,GuantizeFunctor<float,float,int32_t>,size_t>(float const* in, int32_t* out, size_t nelement, GuantizeFunctor<float,float,int32_t> func, cudaStream_t stream);
template void launch_foreach_simple_gpu<float,int32_t,GuantizeFunctor<float,double,int32_t>,size_t>(float const* in, int32_t* out, size_t nelement, GuantizeFunctor<float,double,int32_t> func, cudaStream_t stream);
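
For orientation, the nibble packing that foreach_simple_gpu_4bit performs above can be reproduced on the host. The following is a minimal standalone sketch, not code from this commit: clip_4bit and pack_4bit here are hypothetical stand-ins that assume a signed 4-bit range of [-8, 7] and mirror the kernel's shift-and-mask expression (real sample in the high nibble, imaginary sample in the low nibble).

// Standalone host-side sketch of the assumed 4-bit packing scheme.
#include <cstdint>
#include <cstdio>
#include <cmath>
#include <algorithm>

static inline float clip_4bit(float x) {
    // Assumed clipping range for a signed 4-bit sample.
    return std::min(std::max(x, -8.0f), 7.0f);
}

static inline uint8_t pack_4bit(float re, float im, float scale) {
    int r = int(std::rint(clip_4bit(re * scale)));
    int i = int(std::rint(clip_4bit(im * scale)));
    // Real part into the high nibble, imaginary part into the low nibble.
    return uint8_t(((r * 16) & 0xF0) | (((i * 16) >> 4) & 0x0F));
}

int main() {
    // (+3, -2) with unit scale packs to 0x3E.
    std::printf("0x%02X\n", unsigned(pack_4bit(3.0f, -2.0f, 1.0f)));
    return 0;
}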

1 change: 1 addition & 0 deletions src/guantize.hu
@@ -69,3 +69,4 @@ void launch_foreach_simple_gpu_1bit(T const* in,
cudaStream_t stream=0);

#endif // BF_GUANTIZE_HU_INCLUDE_GUARD_

23 changes: 14 additions & 9 deletions src/gunpack.cu
@@ -31,7 +31,6 @@
#include "assert.hpp"
#include "cuda.hpp"
#include "utils.hu"
//#include "gunpack.hu"

// HACK TESTING
#include <iostream>
@@ -208,7 +207,6 @@ __global__ void foreach_simple_gpu(T const* in,

if( v0 < nelement ) {
func(in[v0], out[v0]);
//std::cout << std::hex << (int)in[i] << " --> " << (int)out[i] << std::endl;
}
}

@@ -218,7 +216,6 @@ inline void launch_foreach_simple_gpu(T const* in,
Size nelement,
Func func,
cudaStream_t stream) {
//cout << "LAUNCH for " << nelement << endl;
dim3 block(512, 1); // TODO: Tune this
Size first = std::min((nelement-1)/block.x+1, 65535ul);
Size secnd = std::min((nelement - first*block.x) / first + 1, 65535ul);
@@ -259,7 +256,6 @@ __global__ void foreach_promote_gpu(T const* in,
for( Size j=0; j<sizeof(U)/sizeof(T); j++ ) {
out[v0*sizeof(U)/sizeof(T) + j] = int8_t((tmp2 >> j*8) & 0xFF);
}
//std::cout << std::hex << (int)in[i] << " --> " << (int)out[i] << std::endl;
}
}

@@ -270,7 +266,6 @@ inline void launch_foreach_promote_gpu(T const* in,
Size nelement,
Func func,
cudaStream_t stream) {
//cout << "LAUNCH for " << nelement << endl;
dim3 block(512, 1); // TODO: Tune this
Size first = std::min((nelement-1)/block.x+1, 65535ul);
Size secnd = std::min((nelement - first*block.x) / first + 1, 65535ul);
@@ -298,23 +293,33 @@ inline void launch_foreach_promote_gpu(T const* in,
BF_STATUS_INTERNAL_ERROR);
}

// Instantiation - Gunpack functors used in unpack.cpp
//// unsigned
template class GunpackFunctor<uint8_t,uint16_t>;
template class GunpackFunctor<uint8_t,uint32_t>;
template class GunpackFunctor<uint8_t,uint64_t>;
//// signed
template class GunpackFunctor<uint8_t,int16_t>;
template class GunpackFunctor<uint8_t,int32_t>;
template class GunpackFunctor<uint8_t,int64_t>;

// Instantiation - launch_foreach_simple_gpu calls used in unpack.cpp
//// unsigned
template void launch_foreach_simple_gpu<uint8_t,uint16_t,GunpackFunctor<uint8_t,uint16_t>,size_t>(uint8_t const *in, uint16_t* out,size_t nelement,GunpackFunctor<uint8_t,uint16_t> func,cudaStream_t stream);
template void launch_foreach_simple_gpu<uint8_t,uint32_t,GunpackFunctor<uint8_t,uint32_t>,size_t>(uint8_t const *in, uint32_t* out,size_t nelement,GunpackFunctor<uint8_t,uint32_t> func,cudaStream_t stream);
template void launch_foreach_simple_gpu<uint8_t,uint64_t,GunpackFunctor<uint8_t,uint64_t>,size_t>(uint8_t const *in, uint64_t* out,size_t nelement,GunpackFunctor<uint8_t,uint64_t> func,cudaStream_t stream);
//// signed
template void launch_foreach_simple_gpu<uint8_t,int16_t,GunpackFunctor<uint8_t,int16_t>,size_t>(uint8_t const *in, int16_t* out,size_t nelement,GunpackFunctor<uint8_t,int16_t> func,cudaStream_t stream);
template void launch_foreach_simple_gpu<uint8_t,int32_t,GunpackFunctor<uint8_t,int32_t>,size_t>(uint8_t const *in, int32_t* out,size_t nelement,GunpackFunctor<uint8_t,int32_t> func,cudaStream_t stream);
template void launch_foreach_simple_gpu<uint8_t,int64_t,GunpackFunctor<uint8_t,int64_t>,size_t>(uint8_t const *in, int64_t* out,size_t nelement,GunpackFunctor<uint8_t,int64_t> func,cudaStream_t stream);

template void launch_foreach_promote_gpu<uint8_t,int64_t,float,GunpackFunctor<uint8_t,int64_t>,size_t>(uint8_t const *in, int64_t* tmp, float* out,size_t nelement,GunpackFunctor<uint8_t,int64_t> func,cudaStream_t stream);
template void launch_foreach_promote_gpu<uint8_t,int32_t,float,GunpackFunctor<uint8_t,int32_t>,size_t>(uint8_t const *in, int32_t* tmp, float* out,size_t nelement,GunpackFunctor<uint8_t,int32_t> func,cudaStream_t stream);
// Instantiation - launch_foreach_promote_gpu calls used in unpack.cpp
//// promote to float
template void launch_foreach_promote_gpu<uint8_t,int16_t,float,GunpackFunctor<uint8_t,int16_t>,size_t>(uint8_t const *in, int16_t* tmp, float* out,size_t nelement,GunpackFunctor<uint8_t,int16_t> func,cudaStream_t stream);
template void launch_foreach_promote_gpu<uint8_t,int64_t,double,GunpackFunctor<uint8_t,int64_t>,size_t>(uint8_t const *in, int64_t* tmp, double* out,size_t nelement,GunpackFunctor<uint8_t,int64_t> func,cudaStream_t stream);
template void launch_foreach_promote_gpu<uint8_t,int32_t,float,GunpackFunctor<uint8_t,int32_t>,size_t>(uint8_t const *in, int32_t* tmp, float* out,size_t nelement,GunpackFunctor<uint8_t,int32_t> func,cudaStream_t stream);
template void launch_foreach_promote_gpu<uint8_t,int64_t,float,GunpackFunctor<uint8_t,int64_t>,size_t>(uint8_t const *in, int64_t* tmp, float* out,size_t nelement,GunpackFunctor<uint8_t,int64_t> func,cudaStream_t stream);
//// promote to double
template void launch_foreach_promote_gpu<uint8_t,int16_t,double,GunpackFunctor<uint8_t,int16_t>,size_t>(uint8_t const *in, int16_t* tmp, double* out,size_t nelement,GunpackFunctor<uint8_t,int16_t> func,cudaStream_t stream);
template void launch_foreach_promote_gpu<uint8_t,int32_t,double,GunpackFunctor<uint8_t,int32_t>,size_t>(uint8_t const *in, int32_t* tmp, double* out,size_t nelement,GunpackFunctor<uint8_t,int32_t> func,cudaStream_t stream);
template void launch_foreach_promote_gpu<uint8_t,int16_t,double,GunpackFunctor<uint8_t,int16_t>,size_t>(uint8_t const *in, int16_t* tmp, double* out,size_t nelement,GunpackFunctor<uint8_t,int16_t> func,cudaStream_t stream);
template void launch_foreach_promote_gpu<uint8_t,int64_t,double,GunpackFunctor<uint8_t,int64_t>,size_t>(uint8_t const *in, int64_t* tmp, double* out,size_t nelement,GunpackFunctor<uint8_t,int64_t> func,cudaStream_t stream);
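
The launch_* helpers in both files share the same launch pattern: a 512-thread block, a block count split across grid.x and grid.y so that neither dimension exceeds 65535, and a v0 < nelement guard in the kernel to absorb any overshoot. Below is a simplified standalone sketch of that pattern under those assumptions; it does not reproduce the exact grid arithmetic from the diff, and touch_each is an invented kernel used only for illustration.

// Simplified sketch of the 2-D grid-sizing pattern assumed above.
#include <algorithm>
#include <cstdio>
#include <cuda_runtime.h>

__global__ void touch_each(float* data, size_t nelement) {
    // Flatten the 2-D grid back into a 1-D element index, then bounds-check.
    size_t v0 = (size_t(blockIdx.y) * gridDim.x + blockIdx.x) * blockDim.x
              + threadIdx.x;
    if( v0 < nelement ) {
        data[v0] *= 2.0f;
    }
}

int main() {
    size_t nelement = 100 * 1000 * 1000;  // more than 65535 * 512 elements
    float* d_data = 0;
    cudaMalloc((void**)&d_data, nelement * sizeof(float));

    dim3 block(512, 1);                                   // same block shape as the launchers
    size_t nblock = (nelement + block.x - 1) / block.x;   // ceiling division into blocks
    dim3 grid((unsigned)std::min<size_t>(nblock, 65535),  // cap grid.x at 65535
              (unsigned)((nblock + 65534) / 65535));      // spill the rest into grid.y
    touch_each<<<grid, block>>>(d_data, nelement);
    cudaDeviceSynchronize();
    cudaFree(d_data);
    return 0;
}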

1 change: 1 addition & 0 deletions src/gunpack.hu
@@ -56,3 +56,4 @@ void launch_foreach_promote_gpu(T const* in,
cudaStream_t stream=0);

#endif // BF_GUNPACK_HU_INCLUDE_GUARD_

1 change: 1 addition & 0 deletions src/unpack.cpp
@@ -520,3 +520,4 @@ BFstatus bfUnpack(BFarray const* in,
#endif
return BF_STATUS_SUCCESS;
}
