Runtime selection of fp16/fp32.

* OpenCL half precision is now a command-line option, with support compiled in by default. This converts the OpenCL code into a gigantic template library. * Update Network self-check. - The final output is used for the self-check. - The criterion is 20% error, while ignoring values smaller than 1/361. - Throws an exception when three out of the last ten checks fail. Pull request #1649.
leela-zero · Jul 25, 2018 · b323d40 · b323d40
1 parent 90d4ff2
commit b323d40
Show file tree

Hide file tree

Showing 12 changed files with 268 additions and 110 deletions.
diff --git a/src/GTP.cpp b/src/GTP.cpp
@@ -64,6 +64,9 @@ bool cfg_dumbpass;
 std::vector<int> cfg_gpus;
 bool cfg_sgemm_exhaustive;
 bool cfg_tune_only;
+#ifdef USE_HALF
+bool cfg_use_half;
+#endif
 #endif
 float cfg_puct;
 float cfg_softmax_temp;
@@ -101,6 +104,9 @@ void GTP::setup_default_parameters() {
     cfg_gpus = { };
     cfg_sgemm_exhaustive = false;
     cfg_tune_only = false;
+#ifdef USE_HALF
+    cfg_use_half = false;
+#endif
 #endif
     cfg_puct = 0.8f;
     cfg_softmax_temp = 1.0f;

diff --git a/src/GTP.h b/src/GTP.h
@@ -48,6 +48,9 @@ extern bool cfg_dumbpass;
 extern std::vector<int> cfg_gpus;
 extern bool cfg_sgemm_exhaustive;
 extern bool cfg_tune_only;
+#ifdef USE_HALF
+extern bool cfg_use_half;
+#endif
 #endif
 extern float cfg_puct;
 extern float cfg_softmax_temp;

diff --git a/src/Leela.cpp b/src/Leela.cpp
@@ -90,6 +90,9 @@ static void parse_commandline(int argc, char *argv[]) {
                 "ID of the OpenCL device(s) to use (disables autodetection).")
         ("full-tuner", "Try harder to find an optimal OpenCL tuning.")
         ("tune-only", "Tune OpenCL only and then exit.")
+#ifdef USE_HALF
+        ("use-half", "Use half-precision OpenCL code.  Trades off some accuracy for higher performance")
+#endif
         ;
 #endif
     po::options_description selfplay_desc("Self-play options");
@@ -318,6 +321,12 @@ static void parse_commandline(int argc, char *argv[]) {
     if (vm.count("tune-only")) {
         cfg_tune_only = true;
     }
+
+#ifdef USE_HALF
+    if (vm.count("use-half")) {
+        cfg_use_half = true;
+    }
+#endif
 #endif
 
     if (vm.count("benchmark")) {

diff --git a/src/Network.cpp b/src/Network.cpp
@@ -345,26 +345,41 @@ void Network::initialize(int playouts, const std::string & weightsfile) {
 
     std::vector<ForwardPipe*> to_init;
 
+    bool use_selfcheck = true;
 #ifdef USE_OPENCL
     if (cfg_cpu_only) {
         myprintf("Initializing CPU-only evaluation.\n");
         m_forward = std::make_unique<CPUPipe>();
+        use_selfcheck = false;
     } else {
-        myprintf("Initializing OpenCL.\n");
-        m_forward = std::make_unique<OpenCLScheduler>();
-    }
+#ifdef USE_HALF
+        if (cfg_use_half) {
+            myprintf("Initializing OpenCL (half precision).\n");
+            m_forward = std::make_unique<OpenCLScheduler<half_float::half>>();
+        } else {
+            myprintf("Initializing OpenCL (single precision).\n");
+            m_forward = std::make_unique<OpenCLScheduler<float>>();
+        }
 #else
+        myprintf("Initializing OpenCL (single precision).\n");
+        m_forward = std::make_unique<OpenCLScheduler<float>>();
+#endif
+    }
+
+#else //!USE_OPENCL
     myprintf("Initializing CPU-only evaluation.\n");
     m_forward = std::make_unique<CPUPipe>();
+    use_selfcheck = false;
 #endif
 
     to_init.emplace_back(m_forward.get());
-
 #ifdef USE_OPENCL_SELFCHECK
-    if (!cfg_cpu_only) {
+    if (use_selfcheck) {
         m_forward_cpu = std::make_unique<CPUPipe>();
         to_init.emplace_back(m_forward_cpu.get());
     }
+#else
+    (void)use_selfcheck;
 #endif
 
     for (const auto& p : to_init) {
@@ -477,7 +492,7 @@ T relative_difference(const T a, const T b) {
         return std::numeric_limits<T>::max();
     }
 
-    constexpr auto small_number = 1e-3f;
+    constexpr auto small_number = 1.0f/361.0f;
     auto fa = std::fabs(a);
     auto fb = std::fabs(b);
 
@@ -495,20 +510,50 @@ T relative_difference(const T a, const T b) {
     return fabs(fa - fb) / std::min(fa, fb);
 }
 
-void compare_net_outputs(std::vector<float>& data,
-                         std::vector<float>& ref) {
-    // We accept an error up to 5%, but output values
-    // smaller than 1/1000th are "rounded up" for the comparison.
-    constexpr auto relative_error = 5e-2f;
-    for (auto idx = size_t{0}; idx < data.size(); ++idx) {
-        const auto err = relative_difference(data[idx], ref[idx]);
+#endif
+
+#ifdef USE_OPENCL_SELFCHECK
+void Network::compare_net_outputs(Netresult& data,
+                                  Netresult& ref) {
+    // We accept an error up to 20%, but output values
+    // smaller than 1/361th are "rounded up" for the comparison.
+    constexpr auto relative_error = 2e-1f;
+
+    // assert-fail when we hit 3 failures out of last 10 checks
+    constexpr auto max_failures = 3;
+    constexpr auto last_failure_window = 10;
+
+    auto selfcheck_fail = false;
+    for (auto idx = size_t{0}; idx < data.policy.size(); ++idx) {
+        const auto err = relative_difference(data.policy[idx], ref.policy[idx]);
         if (err > relative_error) {
-            printf("Error in OpenCL calculation: expected %f got %f "
-                   "(error=%f%%)\n", ref[idx], data[idx], err * 100.0);
-            printf("Update your GPU drivers or reduce the amount of games "
+            selfcheck_fail = true;
+            break;
+        }
+    }
+    const auto err_pass = relative_difference(data.policy_pass, ref.policy_pass);
+    const auto err_winrate = relative_difference(data.winrate, ref.winrate);
+    if (err_pass > relative_error) {
+        selfcheck_fail = true;
+    }
+    if (err_winrate > relative_error) {
+        selfcheck_fail = true;
+    }
+
+    LOCK(m_selfcheck_mutex, selfcheck_lock);
+    if (selfcheck_fail) {
+        m_selfcheck_fails.push_back(true);
+        if (std::count(begin(m_selfcheck_fails), end(m_selfcheck_fails), true) >= max_failures) {
+            printf("Error in OpenCL calculation: Update your GPU drivers or reduce the amount of games "
                    "played simultaneously.\n");
             throw std::runtime_error("OpenCL self-check mismatch.");
         }
+    } else {
+        m_selfcheck_fails.push_back(false);
+    }
+
+    while (m_selfcheck_fails.size() >= last_failure_window) {
+        m_selfcheck_fails.pop_front();
     }
 }
 #endif
@@ -598,6 +643,16 @@ Network::Netresult Network::get_output(
         assert(symmetry == -1);
         const auto rand_sym = Random::get_Rng().randfix<NUM_SYMMETRIES>();
         result = get_output_internal(state, rand_sym);
+#ifdef USE_OPENCL_SELFCHECK
+        // Both implementations are available, self-check the OpenCL driver by
+        // running both with a probability of 1/2000.
+        // selfcheck is done here because this is the only place NN evaluation is done
+        // on actual gameplay.
+        if (m_forward_cpu != nullptr && Random::get_Rng().randfix<SELFCHECK_PROBABILITY>() == 0) {
+            auto result_ref = get_output_internal(state, rand_sym, true);
+            compare_net_outputs(result, result_ref);
+        }
+#endif
     }
 
     // v2 format (ELF Open Go) returns black value, not stm
@@ -614,25 +669,23 @@ Network::Netresult Network::get_output(
 }
 
 Network::Netresult Network::get_output_internal(
-    const GameState* const state, const int symmetry) {
+    const GameState* const state, const int symmetry, bool selfcheck) {
     assert(symmetry >= 0 && symmetry < NUM_SYMMETRIES);
     constexpr auto width = BOARD_SIZE;
     constexpr auto height = BOARD_SIZE;
 
     const auto input_data = gather_features(state, symmetry);
     std::vector<float> policy_data(OUTPUTS_POLICY * width * height);
     std::vector<float> value_data(OUTPUTS_VALUE * width * height);
-    m_forward->forward(input_data, policy_data, value_data);
 #ifdef USE_OPENCL_SELFCHECK
-    // Both implementations are available, self-check the OpenCL driver by
-    // running both with a probability of 1/2000.
-    if (m_forward_cpu != nullptr && Random::get_Rng().randfix<SELFCHECK_PROBABILITY>() == 0) {
-        auto cpu_policy_data = std::vector<float>(policy_data.size());
-        auto cpu_value_data = std::vector<float>(value_data.size());
-        m_forward_cpu->forward(input_data, cpu_policy_data, cpu_value_data);
-        compare_net_outputs(policy_data, cpu_policy_data);
-        compare_net_outputs(value_data, cpu_value_data);
+    if (selfcheck) {
+        m_forward_cpu->forward(input_data, policy_data, value_data);
+    } else {
+        m_forward->forward(input_data, policy_data, value_data);
     }
+#else
+    m_forward->forward(input_data, policy_data, value_data);
+    (void) selfcheck;
 #endif
 
     // Get the moves

diff --git a/src/Network.h b/src/Network.h
@@ -21,6 +21,7 @@
 
 #include "config.h"
 
+#include <deque>
 #include <array>
 #include <memory>
 #include <string>
@@ -39,6 +40,10 @@
 #include "OpenCLScheduler.h"
 #endif
 
+#ifdef USE_OPENCL_SELFCHECK
+#include "SMP.h"
+#endif
+
 
 // Winograd filter transformation changes 3x3 filters to M + 3 - 1
 constexpr auto WINOGRAD_M = 4;
@@ -104,15 +109,22 @@ class Network {
                                const std::vector<float>& V,
                                std::vector<float>& M, const int C, const int K);
     Netresult get_output_internal(const GameState* const state,
-                                  const int symmetry);
+                                  const int symmetry, bool selfcheck = false);
     static void fill_input_plane_pair(const FullBoard& board,
                                       std::vector<float>::iterator black,
                                       std::vector<float>::iterator white,
                                       const int symmetry);
     bool probe_cache(const GameState* const state, Network::Netresult& result);
     std::unique_ptr<ForwardPipe> m_forward;
 #ifdef USE_OPENCL_SELFCHECK
+    void compare_net_outputs(Netresult& data, Netresult& ref);
     std::unique_ptr<ForwardPipe> m_forward_cpu;
+
+    // records the result of most recent selfchecks
+    std::deque<bool> m_selfcheck_fails;
+
+    // mutex that protects m_selfcheck_fails
+    SMP::Mutex m_selfcheck_mutex;
 #endif
 
     NNCache m_nncache;