Find file
Fetching contributors…
Cannot retrieve contributors at this time
246 lines (188 sloc) 8.25 KB
#include <thrust/scan.h>
#include <thrust/fill.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/random.h>
#include <cassert>
#include <iostream>
#include "time_invocation_cuda.hpp"
#include <thrust/detail/temporary_array.h>
#include <thrust/detail/type_traits/function_traits.h>
#include <bulk/bulk.hpp>
#include "decomposition.hpp"
struct inclusive_scan_n
{
template<typename ConcurrentGroup, typename InputIterator, typename Size, typename OutputIterator, typename T, typename BinaryFunction>
__device__ void operator()(ConcurrentGroup &this_group, InputIterator first, Size n, OutputIterator result, T init, BinaryFunction binary_op)
{
bulk::inclusive_scan(this_group, first, first + n, result, init, binary_op);
}
};
struct exclusive_scan_n
{
template<typename ConcurrentGroup, typename InputIterator, typename Size, typename OutputIterator, typename T, typename BinaryFunction>
__device__ void operator()(ConcurrentGroup &this_group, InputIterator first, Size n, OutputIterator result, T init, BinaryFunction binary_op)
{
bulk::exclusive_scan(this_group, first, first + n, result, init, binary_op);
}
};
struct inclusive_downsweep
{
template<typename ConcurrentGroup, typename RandomAccessIterator1, typename Decomposition, typename RandomAccessIterator2, typename RandomAccessIterator3, typename BinaryFunction>
__device__ void operator()(ConcurrentGroup &this_group,
RandomAccessIterator1 first,
Decomposition decomp,
RandomAccessIterator2 carries_first,
RandomAccessIterator3 result,
BinaryFunction binary_op)
{
typename Decomposition::range range = decomp[this_group.index()];
RandomAccessIterator1 last = first + range.second;
first += range.first;
result += range.first;
typename thrust::iterator_value<RandomAccessIterator2>::type carry = carries_first[this_group.index()];
bulk::inclusive_scan(this_group, first, last, result, carry, binary_op);
}
};
struct accumulate_tiles
{
template<typename ConcurrentGroup, typename RandomAccessIterator1, typename Decomposition, typename RandomAccessIterator2, typename BinaryFunction>
__device__ void operator()(ConcurrentGroup &this_group,
RandomAccessIterator1 first,
Decomposition decomp,
RandomAccessIterator2 result,
BinaryFunction binary_op)
{
typedef typename thrust::iterator_value<RandomAccessIterator1>::type value_type;
typename Decomposition::range range = decomp[this_group.index()];
const bool commutative = thrust::detail::is_commutative<BinaryFunction>::value;
// for a commutative accumulate, it's much faster to pass the last value as the init for some reason
value_type init = commutative ? first[range.second-1] : *first;
value_type sum = commutative ?
bulk::accumulate(this_group, first + range.first, first + range.second - 1, init, binary_op) :
bulk::accumulate(this_group, first + range.first + 1, first + range.second, init, binary_op);
if(this_group.this_exec.index() == 0)
{
result[this_group.index()] = sum;
} // end if
} // end operator()
}; // end accumulate_tiles
template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename T, typename BinaryFunction>
RandomAccessIterator2 inclusive_scan(RandomAccessIterator1 first, RandomAccessIterator1 last, RandomAccessIterator2 result, T init, BinaryFunction binary_op)
{
typedef typename bulk::detail::scan_detail::scan_intermediate<
RandomAccessIterator1,
RandomAccessIterator2,
BinaryFunction
>::type intermediate_type;
typedef typename thrust::iterator_difference<RandomAccessIterator1>::type Size;
Size n = last - first;
const Size threshold_of_parallelism = 20000;
if(n < threshold_of_parallelism)
{
typedef bulk::detail::scan_detail::scan_buffer<512,3,RandomAccessIterator1,RandomAccessIterator2,BinaryFunction> heap_type;
Size heap_size = sizeof(heap_type);
bulk::async(bulk::con<512,3>(heap_size), inclusive_scan_n(), bulk::root, first, n, result, init, binary_op);
} // end if
else
{
// determined from empirical testing on k20c
const int groupsize = sizeof(intermediate_type) <= sizeof(int) ? 128 : 256;
const int grainsize = sizeof(intermediate_type) <= sizeof(int) ? 9 : 5;
const Size tile_size = groupsize * grainsize;
int num_tiles = (n + tile_size - 1) / tile_size;
// 20 determined from empirical testing on k20c & GTX 480
int subscription = 20;
Size num_groups = thrust::min<Size>(subscription * bulk::concurrent_group<>::hardware_concurrency(), num_tiles);
aligned_decomposition<Size> decomp(n, num_groups, tile_size);
thrust::cuda::tag t;
thrust::detail::temporary_array<intermediate_type,thrust::cuda::tag> carries(t, num_groups);
// Run the parallel raking reduce as an upsweep.
// n loads + num_groups stores
Size heap_size = groupsize * sizeof(intermediate_type);
bulk::async(bulk::grid<groupsize,grainsize>(num_groups,heap_size), accumulate_tiles(), bulk::root.this_exec, first, decomp, carries.begin(), binary_op);
// scan the sums to get the carries
// num_groups loads + num_groups stores
typedef bulk::detail::scan_detail::scan_buffer<256,3,RandomAccessIterator1,RandomAccessIterator2,BinaryFunction> heap_type2;
heap_size = sizeof(heap_type2);
bulk::async(bulk::con<256,3>(heap_size), exclusive_scan_n(), bulk::root, carries.begin(), num_groups, carries.begin(), init, binary_op);
// do the downsweep - n loads, n stores
typedef bulk::detail::scan_detail::scan_buffer<
groupsize,
grainsize,
RandomAccessIterator1,RandomAccessIterator2,BinaryFunction
> heap_type3;
heap_size = sizeof(heap_type3);
bulk::async(bulk::grid<groupsize,grainsize>(num_groups,heap_size), inclusive_downsweep(), bulk::root.this_exec, first, decomp, carries.begin(), result, binary_op);
} // end else
return result + n;
} // end inclusive_scan()
template<typename T>
void my_scan(thrust::device_vector<T> *data, T init)
{
::inclusive_scan(data->begin(), data->end(), data->begin(), init, thrust::plus<T>());
}
template<typename T>
void validate(size_t n)
{
thrust::host_vector<T> h_input(n);
thrust::fill(h_input.begin(), h_input.end(), 1);
thrust::host_vector<T> h_result(n);
T init = 13;
thrust::inclusive_scan(h_input.begin(), h_input.end(), h_result.begin());
thrust::for_each(h_result.begin(), h_result.end(), thrust::placeholders::_1 += init);
thrust::device_vector<T> d_input = h_input;
thrust::device_vector<T> d_result(d_input.size());
::inclusive_scan(d_input.begin(), d_input.end(), d_result.begin(), init, thrust::plus<T>());
cudaError_t error = cudaDeviceSynchronize();
if(error)
{
std::cerr << "CUDA error: " << cudaGetErrorString(error) << std::endl;
}
assert(h_result == d_result);
}
template<typename T>
void thrust_scan(thrust::device_vector<T> *data)
{
thrust::inclusive_scan(data->begin(), data->end(), data->begin());
}
template<typename T>
void compare(size_t n = 1 << 28)
{
thrust::device_vector<T> vec(n);
thrust_scan(&vec);
double thrust_msecs = time_invocation_cuda(50, thrust_scan<T>, &vec);
my_scan(&vec, T(13));
double my_msecs = time_invocation_cuda(50, my_scan<T>, &vec, 13);
std::cout << "N: " << n << std::endl;
std::cout << " Thrust's time: " << thrust_msecs << " ms" << std::endl;
std::cout << " My time: " << my_msecs << " ms" << std::endl;
std::cout << " Performance relative to Thrust: " << thrust_msecs / my_msecs << std::endl;
std::cout << std::endl;
}
int main()
{
for(size_t n = 1; n <= 1 << 20; n <<= 1)
{
std::cout << "Testing n = " << n << std::endl;
validate<int>(n);
}
thrust::default_random_engine rng;
for(int i = 0; i < 20; ++i)
{
size_t n = rng() % (1 << 20);
std::cout << "Testing n = " << n << std::endl;
validate<int>(n);
}
std::cout << "32b int:" << std::endl;
for(int i = 0; i < 28; ++i)
{
compare<int>(1 << i);
}
std::cout << "64b float:" << std::endl;
for(int i = 0; i < 28; ++i)
{
compare<double>(1 << i);
}
return 0;
}