Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Runtime selection of fp16/fp32 #1649

Merged
merged 6 commits into from
Jul 25, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
6 changes: 6 additions & 0 deletions src/GTP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,9 @@ bool cfg_dumbpass;
std::vector<int> cfg_gpus;
bool cfg_sgemm_exhaustive;
bool cfg_tune_only;
#ifdef USE_HALF
bool cfg_use_half;
#endif
#endif
float cfg_puct;
float cfg_softmax_temp;
Expand Down Expand Up @@ -101,6 +104,9 @@ void GTP::setup_default_parameters() {
cfg_gpus = { };
cfg_sgemm_exhaustive = false;
cfg_tune_only = false;
#ifdef USE_HALF
cfg_use_half = false;
#endif
#endif
cfg_puct = 0.8f;
cfg_softmax_temp = 1.0f;
Expand Down
3 changes: 3 additions & 0 deletions src/GTP.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ extern bool cfg_dumbpass;
extern std::vector<int> cfg_gpus;
extern bool cfg_sgemm_exhaustive;
extern bool cfg_tune_only;
#ifdef USE_HALF
extern bool cfg_use_half;
#endif
#endif
extern float cfg_puct;
extern float cfg_softmax_temp;
Expand Down
9 changes: 9 additions & 0 deletions src/Leela.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,9 @@ static void parse_commandline(int argc, char *argv[]) {
"ID of the OpenCL device(s) to use (disables autodetection).")
("full-tuner", "Try harder to find an optimal OpenCL tuning.")
("tune-only", "Tune OpenCL only and then exit.")
#ifdef USE_HALF
("use-half", "Use half-precision OpenCL code. Trades off some accuracy for higher performance")
#endif
;
#endif
po::options_description selfplay_desc("Self-play options");
Expand Down Expand Up @@ -318,6 +321,12 @@ static void parse_commandline(int argc, char *argv[]) {
if (vm.count("tune-only")) {
cfg_tune_only = true;
}

#ifdef USE_HALF
if (vm.count("use-half")) {
cfg_use_half = true;
}
#endif
#endif

if (vm.count("benchmark")) {
Expand Down
105 changes: 79 additions & 26 deletions src/Network.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -345,26 +345,41 @@ void Network::initialize(int playouts, const std::string & weightsfile) {

std::vector<ForwardPipe*> to_init;

bool use_selfcheck = true;
#ifdef USE_OPENCL
if (cfg_cpu_only) {
myprintf("Initializing CPU-only evaluation.\n");
m_forward = std::make_unique<CPUPipe>();
use_selfcheck = false;
} else {
myprintf("Initializing OpenCL.\n");
m_forward = std::make_unique<OpenCLScheduler>();
}
#ifdef USE_HALF
if (cfg_use_half) {
myprintf("Initializing OpenCL (half precision).\n");
m_forward = std::make_unique<OpenCLScheduler<half_float::half>>();
} else {
myprintf("Initializing OpenCL (single precision).\n");
m_forward = std::make_unique<OpenCLScheduler<float>>();
}
#else
myprintf("Initializing OpenCL (single precision).\n");
m_forward = std::make_unique<OpenCLScheduler<float>>();
#endif
}

#else //!USE_OPENCL
myprintf("Initializing CPU-only evaluation.\n");
m_forward = std::make_unique<CPUPipe>();
use_selfcheck = false;
#endif

to_init.emplace_back(m_forward.get());

#ifdef USE_OPENCL_SELFCHECK
if (!cfg_cpu_only) {
if (use_selfcheck) {
m_forward_cpu = std::make_unique<CPUPipe>();
to_init.emplace_back(m_forward_cpu.get());
}
#else
(void)use_selfcheck;
#endif

for (const auto& p : to_init) {
Expand Down Expand Up @@ -477,7 +492,7 @@ T relative_difference(const T a, const T b) {
return std::numeric_limits<T>::max();
}

constexpr auto small_number = 1e-3f;
constexpr auto small_number = 1.0f/361.0f;
auto fa = std::fabs(a);
auto fb = std::fabs(b);

Expand All @@ -495,20 +510,50 @@ T relative_difference(const T a, const T b) {
return fabs(fa - fb) / std::min(fa, fb);
}

void compare_net_outputs(std::vector<float>& data,
std::vector<float>& ref) {
// We accept an error up to 5%, but output values
// smaller than 1/1000th are "rounded up" for the comparison.
constexpr auto relative_error = 5e-2f;
for (auto idx = size_t{0}; idx < data.size(); ++idx) {
const auto err = relative_difference(data[idx], ref[idx]);
#endif

#ifdef USE_OPENCL_SELFCHECK
void Network::compare_net_outputs(Netresult& data,
Netresult& ref) {
// We accept an error up to 20%, but output values
// smaller than 1/361th are "rounded up" for the comparison.
constexpr auto relative_error = 2e-1f;

// assert-fail when we hit 3 failures out of last 10 checks
constexpr auto max_failures = 3;
constexpr auto last_failure_window = 10;

auto selfcheck_fail = false;
for (auto idx = size_t{0}; idx < data.policy.size(); ++idx) {
const auto err = relative_difference(data.policy[idx], ref.policy[idx]);
if (err > relative_error) {
printf("Error in OpenCL calculation: expected %f got %f "
"(error=%f%%)\n", ref[idx], data[idx], err * 100.0);
printf("Update your GPU drivers or reduce the amount of games "
selfcheck_fail = true;
break;
}
}
const auto err_pass = relative_difference(data.policy_pass, ref.policy_pass);
const auto err_winrate = relative_difference(data.winrate, ref.winrate);
if (err_pass > relative_error) {
selfcheck_fail = true;
}
if (err_winrate > relative_error) {
selfcheck_fail = true;
}

LOCK(m_selfcheck_mutex, selfcheck_lock);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you might be able to get rid of this lock by making m_selfcheck_fails a bitfield and CAS-ing it. (Optional)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought about it briefly, but it seems to be more trouble than necessary - we would need to implement an array of bitfields and then a circular buffer to track the last N passes/fails. This lock is unlikely to be performance-critical anyway (it is only taken once in every 2000 evals).

Any better idea is welcome but I can't figure out how to make it simple :)

Copy link
Member

@gcp gcp Jul 25, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

only happens one out of 2000 evals

That's a good point and makes my idea moot.

For reference:
You don't need an array or circular buffer. You just add the latest result like this: bitfield = ((bitfield << 1) | result) & 0x3FF; i.e. just relying on the shift and a mask that keeps only the last 10 results. Checking for failure is then popcount(bitfield) > 2. Because the bitfield can be an int32, it can be atomically CASed and no lock is needed.

if (selfcheck_fail) {
m_selfcheck_fails.push_back(true);
if (std::count(begin(m_selfcheck_fails), end(m_selfcheck_fails), true) >= max_failures) {
printf("Error in OpenCL calculation: Update your GPU drivers or reduce the amount of games "
"played simultaneously.\n");
throw std::runtime_error("OpenCL self-check mismatch.");
}
} else {
m_selfcheck_fails.push_back(false);
}

while (m_selfcheck_fails.size() >= last_failure_window) {
m_selfcheck_fails.pop_front();
}
}
#endif
Expand Down Expand Up @@ -598,6 +643,16 @@ Network::Netresult Network::get_output(
assert(symmetry == -1);
const auto rand_sym = Random::get_Rng().randfix<NUM_SYMMETRIES>();
result = get_output_internal(state, rand_sym);
#ifdef USE_OPENCL_SELFCHECK
// Both implementations are available, self-check the OpenCL driver by
// running both with a probability of 1/2000.
// selfcheck is done here because this is the only place NN evaluation is done
// on actual gameplay.
if (m_forward_cpu != nullptr && Random::get_Rng().randfix<SELFCHECK_PROBABILITY>() == 0) {
auto result_ref = get_output_internal(state, rand_sym, true);
compare_net_outputs(result, result_ref);
}
#endif
}

// v2 format (ELF Open Go) returns black value, not stm
Expand All @@ -614,25 +669,23 @@ Network::Netresult Network::get_output(
}

Network::Netresult Network::get_output_internal(
const GameState* const state, const int symmetry) {
const GameState* const state, const int symmetry, bool selfcheck) {
assert(symmetry >= 0 && symmetry < NUM_SYMMETRIES);
constexpr auto width = BOARD_SIZE;
constexpr auto height = BOARD_SIZE;

const auto input_data = gather_features(state, symmetry);
std::vector<float> policy_data(OUTPUTS_POLICY * width * height);
std::vector<float> value_data(OUTPUTS_VALUE * width * height);
m_forward->forward(input_data, policy_data, value_data);
#ifdef USE_OPENCL_SELFCHECK
// Both implementations are available, self-check the OpenCL driver by
// running both with a probability of 1/2000.
if (m_forward_cpu != nullptr && Random::get_Rng().randfix<SELFCHECK_PROBABILITY>() == 0) {
auto cpu_policy_data = std::vector<float>(policy_data.size());
auto cpu_value_data = std::vector<float>(value_data.size());
m_forward_cpu->forward(input_data, cpu_policy_data, cpu_value_data);
compare_net_outputs(policy_data, cpu_policy_data);
compare_net_outputs(value_data, cpu_value_data);
if (selfcheck) {
m_forward_cpu->forward(input_data, policy_data, value_data);
} else {
m_forward->forward(input_data, policy_data, value_data);
}
#else
m_forward->forward(input_data, policy_data, value_data);
(void) selfcheck;
#endif

// Get the moves
Expand Down
14 changes: 13 additions & 1 deletion src/Network.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

#include "config.h"

#include <deque>
#include <array>
#include <memory>
#include <string>
Expand All @@ -39,6 +40,10 @@
#include "OpenCLScheduler.h"
#endif

#ifdef USE_OPENCL_SELFCHECK
#include "SMP.h"
#endif


// Winograd filter transformation changes 3x3 filters to M + 3 - 1
constexpr auto WINOGRAD_M = 4;
Expand Down Expand Up @@ -104,15 +109,22 @@ class Network {
const std::vector<float>& V,
std::vector<float>& M, const int C, const int K);
Netresult get_output_internal(const GameState* const state,
const int symmetry);
const int symmetry, bool selfcheck = false);
static void fill_input_plane_pair(const FullBoard& board,
std::vector<float>::iterator black,
std::vector<float>::iterator white,
const int symmetry);
bool probe_cache(const GameState* const state, Network::Netresult& result);
std::unique_ptr<ForwardPipe> m_forward;
#ifdef USE_OPENCL_SELFCHECK
void compare_net_outputs(Netresult& data, Netresult& ref);
std::unique_ptr<ForwardPipe> m_forward_cpu;

// records the result of most recent selfchecks
std::deque<bool> m_selfcheck_fails;

// mutex that protects m_selfcheck_fails
SMP::Mutex m_selfcheck_mutex;
#endif

NNCache m_nncache;
Expand Down