Skip to content

Commit

Permalink
Runtime selection of fp16/fp32.
Browse files Browse the repository at this point in the history
* OpenCL half precision is now a command-line option,
  with support compiled in by default.
  This converts the OpenCL code into a gigantic template library.
* Update Network self-check.
 - Final output is used for self-check.
 - Criterion is a 20% relative error; values smaller than 1/361 are rounded up for the comparison.
 - Throws an exception when three out of the last ten checks fail.

Pull request #1649.
  • Loading branch information
ihavnoid authored and gcp committed Jul 25, 2018
1 parent 90d4ff2 commit b323d40
Show file tree
Hide file tree
Showing 12 changed files with 268 additions and 110 deletions.
6 changes: 6 additions & 0 deletions src/GTP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,9 @@ bool cfg_dumbpass;
std::vector<int> cfg_gpus;
bool cfg_sgemm_exhaustive;
bool cfg_tune_only;
#ifdef USE_HALF
bool cfg_use_half;
#endif
#endif
float cfg_puct;
float cfg_softmax_temp;
Expand Down Expand Up @@ -101,6 +104,9 @@ void GTP::setup_default_parameters() {
cfg_gpus = { };
cfg_sgemm_exhaustive = false;
cfg_tune_only = false;
#ifdef USE_HALF
cfg_use_half = false;
#endif
#endif
cfg_puct = 0.8f;
cfg_softmax_temp = 1.0f;
Expand Down
3 changes: 3 additions & 0 deletions src/GTP.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ extern bool cfg_dumbpass;
extern std::vector<int> cfg_gpus;
extern bool cfg_sgemm_exhaustive;
extern bool cfg_tune_only;
#ifdef USE_HALF
extern bool cfg_use_half;
#endif
#endif
extern float cfg_puct;
extern float cfg_softmax_temp;
Expand Down
9 changes: 9 additions & 0 deletions src/Leela.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,9 @@ static void parse_commandline(int argc, char *argv[]) {
"ID of the OpenCL device(s) to use (disables autodetection).")
("full-tuner", "Try harder to find an optimal OpenCL tuning.")
("tune-only", "Tune OpenCL only and then exit.")
#ifdef USE_HALF
("use-half", "Use half-precision OpenCL code. Trades off some accuracy for higher performance")
#endif
;
#endif
po::options_description selfplay_desc("Self-play options");
Expand Down Expand Up @@ -318,6 +321,12 @@ static void parse_commandline(int argc, char *argv[]) {
if (vm.count("tune-only")) {
cfg_tune_only = true;
}

#ifdef USE_HALF
if (vm.count("use-half")) {
cfg_use_half = true;
}
#endif
#endif

if (vm.count("benchmark")) {
Expand Down
105 changes: 79 additions & 26 deletions src/Network.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -345,26 +345,41 @@ void Network::initialize(int playouts, const std::string & weightsfile) {

std::vector<ForwardPipe*> to_init;

bool use_selfcheck = true;
#ifdef USE_OPENCL
if (cfg_cpu_only) {
myprintf("Initializing CPU-only evaluation.\n");
m_forward = std::make_unique<CPUPipe>();
use_selfcheck = false;
} else {
myprintf("Initializing OpenCL.\n");
m_forward = std::make_unique<OpenCLScheduler>();
}
#ifdef USE_HALF
if (cfg_use_half) {
myprintf("Initializing OpenCL (half precision).\n");
m_forward = std::make_unique<OpenCLScheduler<half_float::half>>();
} else {
myprintf("Initializing OpenCL (single precision).\n");
m_forward = std::make_unique<OpenCLScheduler<float>>();
}
#else
myprintf("Initializing OpenCL (single precision).\n");
m_forward = std::make_unique<OpenCLScheduler<float>>();
#endif
}

#else //!USE_OPENCL
myprintf("Initializing CPU-only evaluation.\n");
m_forward = std::make_unique<CPUPipe>();
use_selfcheck = false;
#endif

to_init.emplace_back(m_forward.get());

#ifdef USE_OPENCL_SELFCHECK
if (!cfg_cpu_only) {
if (use_selfcheck) {
m_forward_cpu = std::make_unique<CPUPipe>();
to_init.emplace_back(m_forward_cpu.get());
}
#else
(void)use_selfcheck;
#endif

for (const auto& p : to_init) {
Expand Down Expand Up @@ -477,7 +492,7 @@ T relative_difference(const T a, const T b) {
return std::numeric_limits<T>::max();
}

constexpr auto small_number = 1e-3f;
constexpr auto small_number = 1.0f/361.0f;
auto fa = std::fabs(a);
auto fb = std::fabs(b);

Expand All @@ -495,20 +510,50 @@ T relative_difference(const T a, const T b) {
return fabs(fa - fb) / std::min(fa, fb);
}

void compare_net_outputs(std::vector<float>& data,
std::vector<float>& ref) {
// We accept an error up to 5%, but output values
// smaller than 1/1000th are "rounded up" for the comparison.
constexpr auto relative_error = 5e-2f;
for (auto idx = size_t{0}; idx < data.size(); ++idx) {
const auto err = relative_difference(data[idx], ref[idx]);
#endif

#ifdef USE_OPENCL_SELFCHECK
void Network::compare_net_outputs(Netresult& data,
Netresult& ref) {
// We accept an error up to 20%, but output values
// smaller than 1/361th are "rounded up" for the comparison.
constexpr auto relative_error = 2e-1f;

// Throw an exception when we hit 3 failures out of the last 10 checks
constexpr auto max_failures = 3;
constexpr auto last_failure_window = 10;

auto selfcheck_fail = false;
for (auto idx = size_t{0}; idx < data.policy.size(); ++idx) {
const auto err = relative_difference(data.policy[idx], ref.policy[idx]);
if (err > relative_error) {
printf("Error in OpenCL calculation: expected %f got %f "
"(error=%f%%)\n", ref[idx], data[idx], err * 100.0);
printf("Update your GPU drivers or reduce the amount of games "
selfcheck_fail = true;
break;
}
}
const auto err_pass = relative_difference(data.policy_pass, ref.policy_pass);
const auto err_winrate = relative_difference(data.winrate, ref.winrate);
if (err_pass > relative_error) {
selfcheck_fail = true;
}
if (err_winrate > relative_error) {
selfcheck_fail = true;
}

LOCK(m_selfcheck_mutex, selfcheck_lock);
if (selfcheck_fail) {
m_selfcheck_fails.push_back(true);
if (std::count(begin(m_selfcheck_fails), end(m_selfcheck_fails), true) >= max_failures) {
printf("Error in OpenCL calculation: Update your GPU drivers or reduce the amount of games "
"played simultaneously.\n");
throw std::runtime_error("OpenCL self-check mismatch.");
}
} else {
m_selfcheck_fails.push_back(false);
}

while (m_selfcheck_fails.size() >= last_failure_window) {
m_selfcheck_fails.pop_front();
}
}
#endif
Expand Down Expand Up @@ -598,6 +643,16 @@ Network::Netresult Network::get_output(
assert(symmetry == -1);
const auto rand_sym = Random::get_Rng().randfix<NUM_SYMMETRIES>();
result = get_output_internal(state, rand_sym);
#ifdef USE_OPENCL_SELFCHECK
// Both implementations are available, self-check the OpenCL driver by
// running both with a probability of 1/2000.
// The self-check is done here because this is the only place where NN
// evaluation is done during actual gameplay.
if (m_forward_cpu != nullptr && Random::get_Rng().randfix<SELFCHECK_PROBABILITY>() == 0) {
auto result_ref = get_output_internal(state, rand_sym, true);
compare_net_outputs(result, result_ref);
}
#endif
}

// v2 format (ELF Open Go) returns black value, not stm
Expand All @@ -614,25 +669,23 @@ Network::Netresult Network::get_output(
}

Network::Netresult Network::get_output_internal(
const GameState* const state, const int symmetry) {
const GameState* const state, const int symmetry, bool selfcheck) {
assert(symmetry >= 0 && symmetry < NUM_SYMMETRIES);
constexpr auto width = BOARD_SIZE;
constexpr auto height = BOARD_SIZE;

const auto input_data = gather_features(state, symmetry);
std::vector<float> policy_data(OUTPUTS_POLICY * width * height);
std::vector<float> value_data(OUTPUTS_VALUE * width * height);
m_forward->forward(input_data, policy_data, value_data);
#ifdef USE_OPENCL_SELFCHECK
// Both implementations are available, self-check the OpenCL driver by
// running both with a probability of 1/2000.
if (m_forward_cpu != nullptr && Random::get_Rng().randfix<SELFCHECK_PROBABILITY>() == 0) {
auto cpu_policy_data = std::vector<float>(policy_data.size());
auto cpu_value_data = std::vector<float>(value_data.size());
m_forward_cpu->forward(input_data, cpu_policy_data, cpu_value_data);
compare_net_outputs(policy_data, cpu_policy_data);
compare_net_outputs(value_data, cpu_value_data);
if (selfcheck) {
m_forward_cpu->forward(input_data, policy_data, value_data);
} else {
m_forward->forward(input_data, policy_data, value_data);
}
#else
m_forward->forward(input_data, policy_data, value_data);
(void) selfcheck;
#endif

// Get the moves
Expand Down
14 changes: 13 additions & 1 deletion src/Network.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

#include "config.h"

#include <deque>
#include <array>
#include <memory>
#include <string>
Expand All @@ -39,6 +40,10 @@
#include "OpenCLScheduler.h"
#endif

#ifdef USE_OPENCL_SELFCHECK
#include "SMP.h"
#endif


// Winograd filter transformation changes 3x3 filters to M + 3 - 1
constexpr auto WINOGRAD_M = 4;
Expand Down Expand Up @@ -104,15 +109,22 @@ class Network {
const std::vector<float>& V,
std::vector<float>& M, const int C, const int K);
Netresult get_output_internal(const GameState* const state,
const int symmetry);
const int symmetry, bool selfcheck = false);
static void fill_input_plane_pair(const FullBoard& board,
std::vector<float>::iterator black,
std::vector<float>::iterator white,
const int symmetry);
bool probe_cache(const GameState* const state, Network::Netresult& result);
std::unique_ptr<ForwardPipe> m_forward;
#ifdef USE_OPENCL_SELFCHECK
void compare_net_outputs(Netresult& data, Netresult& ref);
std::unique_ptr<ForwardPipe> m_forward_cpu;

// records the result of most recent selfchecks
std::deque<bool> m_selfcheck_fails;

// mutex that protects m_selfcheck_fails
SMP::Mutex m_selfcheck_mutex;
#endif

NNCache m_nncache;
Expand Down

0 comments on commit b323d40

Please sign in to comment.