Skip to content

Commit

Permalink
Network initialization restructuring.
Browse files Browse the repository at this point in the history
* Network initialization restructuring

- Create one net at a time when doing fp16/fp32 autodetect. Saves some GPU memory.
- Create an internal lambda which initializes the nets
- Use std::copy to copy vectors to reduce runtime

* zeropad_U: loop reordering for performance optimization

Plus other optimizations for zero-copying initialization

Pull request #1750.
  • Loading branch information
ihavnoid authored and gcp committed Aug 20, 2018
1 parent f85a685 commit 19f250f
Show file tree
Hide file tree
Showing 4 changed files with 136 additions and 130 deletions.
167 changes: 81 additions & 86 deletions src/Network.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,22 @@ std::pair<int, int> Network::load_network_file(const std::string& filename) {
}

void Network::initialize(int playouts, const std::string & weightsfile) {
#ifdef USE_BLAS
#ifndef __APPLE__
#ifdef USE_OPENBLAS
openblas_set_num_threads(1);
myprintf("BLAS Core: %s\n", openblas_get_corename());
#endif
#ifdef USE_MKL
//mkl_set_threading_layer(MKL_THREADING_SEQUENTIAL);
mkl_set_num_threads(1);
MKLVersion Version;
mkl_get_version(&Version);
myprintf("BLAS core: MKL %s\n", Version.Processor);
#endif
#endif
#endif

m_nncache.set_size_from_playouts(playouts);
// Prepare symmetry table
for (auto s = 0; s < NUM_SYMMETRIES; ++s) {
Expand Down Expand Up @@ -372,132 +388,111 @@ void Network::initialize(int playouts, const std::string & weightsfile) {
m_conv_pol_b[i] = 0.0f;
}

#ifdef USE_HALF
std::unique_ptr<ForwardPipe> fp16net;
#endif
std::vector<ForwardPipe*> to_init;
auto init_net = [this, channels, residual_blocks](auto&& p) {
p->initialize(channels);
auto weight_index = size_t{0};

// Winograd filter transformation changes filter size to 4x4
p->push_input_convolution(WINOGRAD_ALPHA, INPUT_CHANNELS,
channels, m_conv_weights[weight_index],
m_batchnorm_means[weight_index], m_batchnorm_stddevs[weight_index]);
weight_index++;

// residual blocks
for (auto i = size_t{0}; i < residual_blocks; i++) {
p->push_residual(WINOGRAD_ALPHA, channels, channels,
m_conv_weights[weight_index],
m_batchnorm_means[weight_index],
m_batchnorm_stddevs[weight_index],
m_conv_weights[weight_index + 1],
m_batchnorm_means[weight_index + 1],
m_batchnorm_stddevs[weight_index + 1]);
weight_index += 2;
}

// Output head convolutions
p->push_convolve(1, channels, OUTPUTS_POLICY, m_conv_pol_w);
p->push_convolve(1, channels, OUTPUTS_VALUE, m_conv_val_w);

return std::move(p);
};


bool use_selfcheck = true;
#ifdef USE_OPENCL
if (cfg_cpu_only) {
myprintf("Initializing CPU-only evaluation.\n");
m_forward = std::make_unique<CPUPipe>();
m_forward = init_net(std::make_unique<CPUPipe>());

use_selfcheck = false;
} else {
#ifdef USE_HALF
switch (cfg_precision) {
case precision_t::AUTO: {
// create fp16 and fp32 both here. will select one of them later.
auto score_fp16 = float{-1.0};
auto score_fp32 = float{-1.0};

myprintf("Initializing OpenCL (autodetect precision).\n");
try {
fp16net = std::make_unique<OpenCLScheduler<half_float::half>>();
fp16net->initialize(channels);
to_init.emplace_back(fp16net.get());
} catch (std::runtime_error) {
myprintf("Failed to initialize half precision net. Resorting to single precision.\n");
fp16net.reset();
m_forward = init_net(std::make_unique<OpenCLScheduler<float>>());
score_fp32 = benchmark_time(100);
} catch (...) {
// empty - if exception thrown just throw away fp16 net
}

try {
m_forward = init_net(std::make_unique<OpenCLScheduler<half_float::half>>());
score_fp16 = benchmark_time(100);
} catch (...) {
// empty - if exception thrown just throw away fp16 net
}


if (score_fp16 < 0.0 && score_fp32 < 0.0) {
myprintf("Both single precision and half precision failed to run\n");
throw std::runtime_error("Failed to initialize net");
} else if (score_fp16 < 0.0) {
myprintf("Using OpenCL single precision (half precision failed to run)\n");
m_forward = init_net(std::make_unique<OpenCLScheduler<float>>());
} else if (score_fp32 < 0.0) {
myprintf("Using OpenCL half precision (single precision failed to run)\n");
} else if (score_fp32 * 1.05f > score_fp16) {
myprintf("Using OpenCL single precision (less than 5%% slower than half)\n");
m_forward = init_net(std::make_unique<OpenCLScheduler<float>>());
} else {
myprintf("Using OpenCL half precision (at least 5%% faster than single)\n");
}
m_forward = std::make_unique<OpenCLScheduler<float>>();
}
break;
case precision_t::SINGLE: {
myprintf("Initializing OpenCL (single precision).\n");
m_forward = std::make_unique<OpenCLScheduler<float>>();
m_forward = init_net(std::make_unique<OpenCLScheduler<float>>());
}
break;
case precision_t::HALF: {
myprintf("Initializing OpenCL (half precision).\n");
m_forward = std::make_unique<OpenCLScheduler<half_float::half>>();
m_forward = init_net(std::make_unique<OpenCLScheduler<half_float::half>>());
}
}
#else
myprintf("Initializing OpenCL (single precision).\n");
m_forward = std::make_unique<OpenCLScheduler<float>>();
m_forward = init_net(std::make_unique<OpenCLScheduler<float>>());
#endif
}

#else //!USE_OPENCL
myprintf("Initializing CPU-only evaluation.\n");
m_forward = std::make_unique<CPUPipe>();
m_forward = init_net(std::make_unique<CPUPipe>());
use_selfcheck = false;
#endif

m_forward->initialize(channels);
to_init.emplace_back(m_forward.get());
#ifdef USE_OPENCL_SELFCHECK
if (use_selfcheck) {
m_forward_cpu = std::make_unique<CPUPipe>();
m_forward_cpu->initialize(channels);
to_init.emplace_back(m_forward_cpu.get());
m_forward_cpu = init_net(std::make_unique<CPUPipe>());
}
#else
(void)use_selfcheck;
#endif

for (const auto& p : to_init) {
weight_index = 0;

// Winograd filter transformation changes filter size to 4x4
p->push_input_convolution(WINOGRAD_ALPHA, INPUT_CHANNELS,
channels, m_conv_weights[weight_index],
m_batchnorm_means[weight_index], m_batchnorm_stddevs[weight_index]);
weight_index++;

// residual blocks
for (auto i = size_t{0}; i < residual_blocks; i++) {
p->push_residual(WINOGRAD_ALPHA, channels, channels,
m_conv_weights[weight_index],
m_batchnorm_means[weight_index],
m_batchnorm_stddevs[weight_index],
m_conv_weights[weight_index + 1],
m_batchnorm_means[weight_index + 1],
m_batchnorm_stddevs[weight_index + 1]);
weight_index += 2;
}

// Output head convolutions
p->push_convolve(1, channels, OUTPUTS_POLICY, m_conv_pol_w);
p->push_convolve(1, channels, OUTPUTS_VALUE, m_conv_val_w);
}
#ifdef USE_BLAS
#ifndef __APPLE__
#ifdef USE_OPENBLAS
openblas_set_num_threads(1);
myprintf("BLAS Core: %s\n", openblas_get_corename());
#endif
#ifdef USE_MKL
//mkl_set_threading_layer(MKL_THREADING_SEQUENTIAL);
mkl_set_num_threads(1);
MKLVersion Version;
mkl_get_version(&Version);
myprintf("BLAS core: MKL %s\n", Version.Processor);
#endif
#endif
#endif

#ifdef USE_HALF
if (fp16net != nullptr) {
auto score_fp32 = benchmark_time(100);
std::swap(fp16net, m_forward);
auto score_fp16 = float{-1.0};
try {
score_fp16 = benchmark_time(100);
} catch (...) {
// empty - if exception thrown just throw away fp16 net
}

if (score_fp16 < 0.0) {
std::swap(fp16net, m_forward);
myprintf("Using OpenCL single precision (half precision failed to run)\n");
} else if (score_fp32 * 1.05f > score_fp16) {
std::swap(fp16net, m_forward);
myprintf("Using OpenCL single precision (less than 5%% slower than half)\n");
} else {
myprintf("Using OpenCL half precision (at least 5%% faster than single)\n");
}
}
#endif
}

#ifdef USE_BLAS
Expand Down
12 changes: 4 additions & 8 deletions src/OpenCL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,22 +106,18 @@ void OpenCL<net_t>::ensure_context_initialized(OpenCLContext &opencl_context) {
// Append one weight tensor to the given layer, uploading it into a
// read-only OpenCL device buffer.
//
// The scraped diff interleaved the removed float-based path (which built a
// temporary std::vector<net_t> and converted element by element) with the
// added net_t path; this is the post-commit version: weights arrive already
// stored as net_t, so they are uploaded directly with no conversion copy
// (the "zero-copying initialization" named in the commit message).
//
// layer   - index of the layer this tensor belongs to; a new Layer record
//           is created when this is the first tensor for that index.
// size    - number of net_t elements pointed to by `weights`.
// weights - host pointer to the raw weights; read once during buffer
//           creation (CL_MEM_COPY_HOST_PTR) and not retained.
template <typename net_t>
void OpenCL_Network<net_t>::add_weights(size_t layer,
                                        size_t size,
                                        const net_t * weights) {
    if (layer >= m_layers.size()) {
        m_layers.push_back(Layer());
    }

    auto weightSize = size * sizeof(net_t);
    m_layers.back().weights.emplace_back(
        m_opencl.m_context,
        CL_MEM_COPY_HOST_PTR | CL_MEM_READ_ONLY,
        weightSize,
        // cl::Buffer's host_ptr parameter is non-const even though
        // CL_MEM_COPY_HOST_PTR only reads from it, hence the const_cast.
        const_cast<net_t*>(weights)
    );
}

template <typename net_t>
Expand Down
24 changes: 12 additions & 12 deletions src/OpenCL.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,9 @@ class OpenCL_Network {
void push_input_convolution(unsigned int filter_size,
unsigned int channels,
unsigned int outputs,
const std::vector<float>& weights,
const std::vector<float>& means,
const std::vector<float>& variances) {
const std::vector<net_t>& weights,
const std::vector<net_t>& means,
const std::vector<net_t>& variances) {
size_t layer = get_layer_count();
push_weights(layer, weights);
push_weights(layer, means);
Expand All @@ -97,12 +97,12 @@ class OpenCL_Network {
void push_residual(unsigned int filter_size,
unsigned int channels,
unsigned int outputs,
const std::vector<float>& weights_1,
const std::vector<float>& means_1,
const std::vector<float>& variances_1,
const std::vector<float>& weights_2,
const std::vector<float>& means_2,
const std::vector<float>& variances_2) {
const std::vector<net_t>& weights_1,
const std::vector<net_t>& means_1,
const std::vector<net_t>& variances_1,
const std::vector<net_t>& weights_2,
const std::vector<net_t>& means_2,
const std::vector<net_t>& variances_2) {
size_t layer = get_layer_count();
push_weights(layer, weights_1);
push_weights(layer, means_1);
Expand All @@ -119,7 +119,7 @@ class OpenCL_Network {
void push_convolve(unsigned int filter_size,
unsigned int channels,
unsigned int outputs,
const std::vector<float>& weights) {
const std::vector<net_t>& weights) {
(void)filter_size;
assert(filter_size == 1);

Expand All @@ -143,10 +143,10 @@ class OpenCL_Network {
private:
using weight_slice_t = std::vector<cl::Buffer>::const_iterator;

void push_weights(size_t layer, const std::vector<float>& weights) {
void push_weights(size_t layer, const std::vector<net_t>& weights) {
add_weights(layer, weights.size(), weights.data());
}
void add_weights(size_t layer, size_t size, const float* weights);
void add_weights(size_t layer, size_t size, const net_t* weights);

void convolve3(OpenCLContext & opencl_context,
int channels, int outputs,
Expand Down

0 comments on commit 19f250f

Please sign in to comment.