Skip to content

Commit

Permalink
Network initialization restructuring.
Browse files Browse the repository at this point in the history
* Network initialization restructuring

- Create one net at a time when doing fp16/fp32 autodetect. Saves some GPU memory.
- Create an internal lambda which initializes the nets
- Use std::copy to copy vectors to reduce runtime

* zeropad_U: loop reordering for performance optimization

Plus other optimizations for zero-copying initialization

Pull request #1750.
  • Loading branch information
ihavnoid authored and gcp committed Aug 20, 2018
1 parent f85a685 commit 19f250f
Show file tree
Hide file tree
Showing 4 changed files with 136 additions and 130 deletions.
167 changes: 81 additions & 86 deletions src/Network.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,22 @@ std::pair<int, int> Network::load_network_file(const std::string& filename) {
}

void Network::initialize(int playouts, const std::string & weightsfile) {
#ifdef USE_BLAS
#ifndef __APPLE__
#ifdef USE_OPENBLAS
openblas_set_num_threads(1);
myprintf("BLAS Core: %s\n", openblas_get_corename());
#endif
#ifdef USE_MKL
//mkl_set_threading_layer(MKL_THREADING_SEQUENTIAL);
mkl_set_num_threads(1);
MKLVersion Version;
mkl_get_version(&Version);
myprintf("BLAS core: MKL %s\n", Version.Processor);
#endif
#endif
#endif

m_nncache.set_size_from_playouts(playouts);
// Prepare symmetry table
for (auto s = 0; s < NUM_SYMMETRIES; ++s) {
Expand Down Expand Up @@ -372,132 +388,111 @@ void Network::initialize(int playouts, const std::string & weightsfile) {
m_conv_pol_b[i] = 0.0f;
}

#ifdef USE_HALF
std::unique_ptr<ForwardPipe> fp16net;
#endif
std::vector<ForwardPipe*> to_init;
auto init_net = [this, channels, residual_blocks](auto&& p) {
p->initialize(channels);
auto weight_index = size_t{0};

// Winograd filter transformation changes filter size to 4x4
p->push_input_convolution(WINOGRAD_ALPHA, INPUT_CHANNELS,
channels, m_conv_weights[weight_index],
m_batchnorm_means[weight_index], m_batchnorm_stddevs[weight_index]);
weight_index++;

// residual blocks
for (auto i = size_t{0}; i < residual_blocks; i++) {
p->push_residual(WINOGRAD_ALPHA, channels, channels,
m_conv_weights[weight_index],
m_batchnorm_means[weight_index],
m_batchnorm_stddevs[weight_index],
m_conv_weights[weight_index + 1],
m_batchnorm_means[weight_index + 1],
m_batchnorm_stddevs[weight_index + 1]);
weight_index += 2;
}

// Output head convolutions
p->push_convolve(1, channels, OUTPUTS_POLICY, m_conv_pol_w);
p->push_convolve(1, channels, OUTPUTS_VALUE, m_conv_val_w);

return std::move(p);
};


bool use_selfcheck = true;
#ifdef USE_OPENCL
if (cfg_cpu_only) {
myprintf("Initializing CPU-only evaluation.\n");
m_forward = std::make_unique<CPUPipe>();
m_forward = init_net(std::make_unique<CPUPipe>());

use_selfcheck = false;
} else {
#ifdef USE_HALF
switch (cfg_precision) {
case precision_t::AUTO: {
// create fp16 and fp32 both here. will select one of them later.
auto score_fp16 = float{-1.0};
auto score_fp32 = float{-1.0};

myprintf("Initializing OpenCL (autodetect precision).\n");
try {
fp16net = std::make_unique<OpenCLScheduler<half_float::half>>();
fp16net->initialize(channels);
to_init.emplace_back(fp16net.get());
} catch (std::runtime_error) {
myprintf("Failed to initialize half precision net. Resorting to single precision.\n");
fp16net.reset();
m_forward = init_net(std::make_unique<OpenCLScheduler<float>>());
score_fp32 = benchmark_time(100);
} catch (...) {
// empty - if exception thrown just throw away fp16 net
}

try {
m_forward = init_net(std::make_unique<OpenCLScheduler<half_float::half>>());
score_fp16 = benchmark_time(100);
} catch (...) {
// empty - if exception thrown just throw away fp16 net
}


if (score_fp16 < 0.0 && score_fp32 < 0.0) {
myprintf("Both single precision and half precision failed to run\n");
throw std::runtime_error("Failed to initialize net");
} else if (score_fp16 < 0.0) {
myprintf("Using OpenCL single precision (half precision failed to run)\n");
m_forward = init_net(std::make_unique<OpenCLScheduler<float>>());
} else if (score_fp32 < 0.0) {
myprintf("Using OpenCL half precision (single precision failed to run)\n");
} else if (score_fp32 * 1.05f > score_fp16) {
myprintf("Using OpenCL single precision (less than 5%% slower than half)\n");
m_forward = init_net(std::make_unique<OpenCLScheduler<float>>());
} else {
myprintf("Using OpenCL half precision (at least 5%% faster than single)\n");
}
m_forward = std::make_unique<OpenCLScheduler<float>>();
}
break;
case precision_t::SINGLE: {
myprintf("Initializing OpenCL (single precision).\n");
m_forward = std::make_unique<OpenCLScheduler<float>>();
m_forward = init_net(std::make_unique<OpenCLScheduler<float>>());
}
break;
case precision_t::HALF: {
myprintf("Initializing OpenCL (half precision).\n");
m_forward = std::make_unique<OpenCLScheduler<half_float::half>>();
m_forward = init_net(std::make_unique<OpenCLScheduler<half_float::half>>());
}
}
#else
myprintf("Initializing OpenCL (single precision).\n");
m_forward = std::make_unique<OpenCLScheduler<float>>();
m_forward = init_net(std::make_unique<OpenCLScheduler<float>>());
#endif
}

#else //!USE_OPENCL
myprintf("Initializing CPU-only evaluation.\n");
m_forward = std::make_unique<CPUPipe>();
m_forward = init_net(std::make_unique<CPUPipe>());
use_selfcheck = false;
#endif

m_forward->initialize(channels);
to_init.emplace_back(m_forward.get());
#ifdef USE_OPENCL_SELFCHECK
if (use_selfcheck) {
m_forward_cpu = std::make_unique<CPUPipe>();
m_forward_cpu->initialize(channels);
to_init.emplace_back(m_forward_cpu.get());
m_forward_cpu = init_net(std::make_unique<CPUPipe>());
}
#else
(void)use_selfcheck;
#endif

for (const auto& p : to_init) {
weight_index = 0;

// Winograd filter transformation changes filter size to 4x4
p->push_input_convolution(WINOGRAD_ALPHA, INPUT_CHANNELS,
channels, m_conv_weights[weight_index],
m_batchnorm_means[weight_index], m_batchnorm_stddevs[weight_index]);
weight_index++;

// residual blocks
for (auto i = size_t{0}; i < residual_blocks; i++) {
p->push_residual(WINOGRAD_ALPHA, channels, channels,
m_conv_weights[weight_index],
m_batchnorm_means[weight_index],
m_batchnorm_stddevs[weight_index],
m_conv_weights[weight_index + 1],
m_batchnorm_means[weight_index + 1],
m_batchnorm_stddevs[weight_index + 1]);
weight_index += 2;
}

// Output head convolutions
p->push_convolve(1, channels, OUTPUTS_POLICY, m_conv_pol_w);
p->push_convolve(1, channels, OUTPUTS_VALUE, m_conv_val_w);
}
#ifdef USE_BLAS
#ifndef __APPLE__
#ifdef USE_OPENBLAS
openblas_set_num_threads(1);
myprintf("BLAS Core: %s\n", openblas_get_corename());
#endif
#ifdef USE_MKL
//mkl_set_threading_layer(MKL_THREADING_SEQUENTIAL);
mkl_set_num_threads(1);
MKLVersion Version;
mkl_get_version(&Version);
myprintf("BLAS core: MKL %s\n", Version.Processor);
#endif
#endif
#endif

#ifdef USE_HALF
if (fp16net != nullptr) {
auto score_fp32 = benchmark_time(100);
std::swap(fp16net, m_forward);
auto score_fp16 = float{-1.0};
try {
score_fp16 = benchmark_time(100);
} catch (...) {
// empty - if exception thrown just throw away fp16 net
}

if (score_fp16 < 0.0) {
std::swap(fp16net, m_forward);
myprintf("Using OpenCL single precision (half precision failed to run)\n");
} else if (score_fp32 * 1.05f > score_fp16) {
std::swap(fp16net, m_forward);
myprintf("Using OpenCL single precision (less than 5%% slower than half)\n");
} else {
myprintf("Using OpenCL half precision (at least 5%% faster than single)\n");
}
}
#endif
}

#ifdef USE_BLAS
Expand Down
12 changes: 4 additions & 8 deletions src/OpenCL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,22 +106,18 @@ void OpenCL<net_t>::ensure_context_initialized(OpenCLContext &opencl_context) {
// Append one weight tensor to the given layer, uploading it into a
// read-only OpenCL device buffer.
//
// The scraped diff interleaved the removed float-based path (which built a
// temporary std::vector<net_t> and converted element by element) with the
// added net_t path; this is the post-commit version: weights arrive already
// stored as net_t, so they are uploaded directly with no conversion copy
// (the "zero-copying initialization" named in the commit message).
//
// layer   - index of the layer this tensor belongs to; a new Layer record
//           is created when this is the first tensor for that index.
// size    - number of net_t elements pointed to by `weights`.
// weights - host pointer to the raw weights; read once during buffer
//           creation (CL_MEM_COPY_HOST_PTR) and not retained.
template <typename net_t>
void OpenCL_Network<net_t>::add_weights(size_t layer,
                                        size_t size,
                                        const net_t * weights) {
    if (layer >= m_layers.size()) {
        m_layers.push_back(Layer());
    }

    auto weightSize = size * sizeof(net_t);
    m_layers.back().weights.emplace_back(
        m_opencl.m_context,
        CL_MEM_COPY_HOST_PTR | CL_MEM_READ_ONLY,
        weightSize,
        // cl::Buffer's host_ptr parameter is non-const even though
        // CL_MEM_COPY_HOST_PTR only reads from it, hence the const_cast.
        const_cast<net_t*>(weights)
    );
}

template <typename net_t>
Expand Down
24 changes: 12 additions & 12 deletions src/OpenCL.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,9 @@ class OpenCL_Network {
void push_input_convolution(unsigned int filter_size,
unsigned int channels,
unsigned int outputs,
const std::vector<float>& weights,
const std::vector<float>& means,
const std::vector<float>& variances) {
const std::vector<net_t>& weights,
const std::vector<net_t>& means,
const std::vector<net_t>& variances) {
size_t layer = get_layer_count();
push_weights(layer, weights);
push_weights(layer, means);
Expand All @@ -97,12 +97,12 @@ class OpenCL_Network {
void push_residual(unsigned int filter_size,
unsigned int channels,
unsigned int outputs,
const std::vector<float>& weights_1,
const std::vector<float>& means_1,
const std::vector<float>& variances_1,
const std::vector<float>& weights_2,
const std::vector<float>& means_2,
const std::vector<float>& variances_2) {
const std::vector<net_t>& weights_1,
const std::vector<net_t>& means_1,
const std::vector<net_t>& variances_1,
const std::vector<net_t>& weights_2,
const std::vector<net_t>& means_2,
const std::vector<net_t>& variances_2) {
size_t layer = get_layer_count();
push_weights(layer, weights_1);
push_weights(layer, means_1);
Expand All @@ -119,7 +119,7 @@ class OpenCL_Network {
void push_convolve(unsigned int filter_size,
unsigned int channels,
unsigned int outputs,
const std::vector<float>& weights) {
const std::vector<net_t>& weights) {
(void)filter_size;
assert(filter_size == 1);

Expand All @@ -143,10 +143,10 @@ class OpenCL_Network {
private:
using weight_slice_t = std::vector<cl::Buffer>::const_iterator;

void push_weights(size_t layer, const std::vector<float>& weights) {
void push_weights(size_t layer, const std::vector<net_t>& weights) {
add_weights(layer, weights.size(), weights.data());
}
void add_weights(size_t layer, size_t size, const float* weights);
void add_weights(size_t layer, size_t size, const net_t* weights);

void convolve3(OpenCLContext & opencl_context,
int channels, int outputs,
Expand Down

0 comments on commit 19f250f

Please sign in to comment.