Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Winograd F(4x4, 3x3) #1643

Merged
merged 2 commits into from
Jul 24, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
204 changes: 96 additions & 108 deletions src/CPUPipe.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,19 +44,16 @@ void CPUPipe::winograd_transform_in(const std::vector<float>& in,
const int C) {
constexpr auto W = BOARD_SIZE;
constexpr auto H = BOARD_SIZE;
constexpr auto WTILES = (W + 1) / 2;
constexpr auto P = WTILES * WTILES;

std::array<std::array<float, WTILES * 2 + 2>, WTILES * 2 + 2> in_pad;
for (auto xin = size_t{0}; xin < in_pad.size(); xin++) {
in_pad[0][xin] = 0.0f;
in_pad[H + 1][xin] = 0.0f;
in_pad[H + 2][xin] = 0.0f;
}
for (auto yin = size_t{1}; yin < in_pad[0].size() - 2; yin++) {
in_pad[yin][0] = 0.0f;
in_pad[yin][W + 1] = 0.0f;
in_pad[yin][W + 2] = 0.0f;
constexpr auto WTILES = WINOGRAD_WTILES;
constexpr auto P = WINOGRAD_P;

constexpr auto Wpad = 2 + WINOGRAD_M * WTILES;

std::array<std::array<float, Wpad>, Wpad> in_pad;
for (auto xin = size_t{0}; xin < Wpad; xin++) {
for (auto yin = size_t{0}; yin < Wpad; yin++) {
in_pad[yin][xin] = 0.0f;
}
}

for (auto ch = 0; ch < C; ch++) {
Expand All @@ -67,59 +64,48 @@ void CPUPipe::winograd_transform_in(const std::vector<float>& in,
}
for (auto block_y = 0; block_y < WTILES; block_y++) {
// Tiles overlap by 2
const auto yin = 2 * block_y;
const auto yin = WINOGRAD_M * block_y;
for (auto block_x = 0; block_x < WTILES; block_x++) {
const auto xin = 2 * block_x;

// Calculates transpose(B).x.B
// B = [[ 1.0, 0.0, 0.0, 0.0],
// [ 0.0, 1.0, -1.0, 1.0],
// [-1.0, 1.0, 1.0, 0.0],
// [ 0.0, 0.0, 0.0, -1.0]]
const auto xin = WINOGRAD_M * block_x;

using WinogradTile =
std::array<std::array<float, Network::WINOGRAD_ALPHA>, Network::WINOGRAD_ALPHA>;
std::array<std::array<float, WINOGRAD_ALPHA>, WINOGRAD_ALPHA>;
WinogradTile T1, T2;

T1[0][0] = in_pad[yin + 0][xin + 0] - in_pad[yin + 2][xin + 0];
T1[0][1] = in_pad[yin + 0][xin + 1] - in_pad[yin + 2][xin + 1];
T1[0][2] = in_pad[yin + 0][xin + 2] - in_pad[yin + 2][xin + 2];
T1[0][3] = in_pad[yin + 0][xin + 3] - in_pad[yin + 2][xin + 3];
T1[1][0] = in_pad[yin + 1][xin + 0] + in_pad[yin + 2][xin + 0];
T1[1][1] = in_pad[yin + 1][xin + 1] + in_pad[yin + 2][xin + 1];
T1[1][2] = in_pad[yin + 1][xin + 2] + in_pad[yin + 2][xin + 2];
T1[1][3] = in_pad[yin + 1][xin + 3] + in_pad[yin + 2][xin + 3];
T1[2][0] = in_pad[yin + 2][xin + 0] - in_pad[yin + 1][xin + 0];
T1[2][1] = in_pad[yin + 2][xin + 1] - in_pad[yin + 1][xin + 1];
T1[2][2] = in_pad[yin + 2][xin + 2] - in_pad[yin + 1][xin + 2];
T1[2][3] = in_pad[yin + 2][xin + 3] - in_pad[yin + 1][xin + 3];
T1[3][0] = in_pad[yin + 1][xin + 0] - in_pad[yin + 3][xin + 0];
T1[3][1] = in_pad[yin + 1][xin + 1] - in_pad[yin + 3][xin + 1];
T1[3][2] = in_pad[yin + 1][xin + 2] - in_pad[yin + 3][xin + 2];
T1[3][3] = in_pad[yin + 1][xin + 3] - in_pad[yin + 3][xin + 3];

T2[0][0] = T1[0][0] - T1[0][2];
T2[0][1] = T1[0][1] + T1[0][2];
T2[0][2] = T1[0][2] - T1[0][1];
T2[0][3] = T1[0][1] - T1[0][3];
T2[1][0] = T1[1][0] - T1[1][2];
T2[1][1] = T1[1][1] + T1[1][2];
T2[1][2] = T1[1][2] - T1[1][1];
T2[1][3] = T1[1][1] - T1[1][3];
T2[2][0] = T1[2][0] - T1[2][2];
T2[2][1] = T1[2][1] + T1[2][2];
T2[2][2] = T1[2][2] - T1[2][1];
T2[2][3] = T1[2][1] - T1[2][3];
T2[3][0] = T1[3][0] - T1[3][2];
T2[3][1] = T1[3][1] + T1[3][2];
T2[3][2] = T1[3][2] - T1[3][1];
T2[3][3] = T1[3][1] - T1[3][3];
const auto Bt = std::array<float, WINOGRAD_TILE>
{1.0f, 0.0f, -5.0f/2.0f, 0.0f, 1.0f, 0.0f,
0.0f, -SQ2, -2.0f, SQ2/2.0f, 1.0f, 0.0f,
0.0f, SQ2, -2.0f, -SQ2/2.0f, 1.0f, 0.0f,
0.0f, -SQ2/2.0f, -1.0f/2.0f, SQ2, 1.0f, 0.0f,
0.0f, SQ2/2.0f, -1.0f/2.0f, -SQ2, 1.0f, 0.0f,
0.0f, 1.0f, 0.0f, -5.0f/2.0f, 0.0f, 1.0f};

// Calculates transpose(B).x.B
for (auto i = 0; i < WINOGRAD_ALPHA; i++){
for (auto j = 0; j < WINOGRAD_ALPHA; j++) {
auto acc = 0.0f;
for (auto k = 0; k < WINOGRAD_ALPHA; k++) {
acc += Bt[i * WINOGRAD_ALPHA + k] * \
in_pad[yin + k][xin + j];
}
T1[i][j] = acc;
}
}

for (auto i = 0; i < WINOGRAD_ALPHA; i++){
for (auto j = 0; j < WINOGRAD_ALPHA; j++) {
auto acc = 0.0f;
for (auto k = 0; k < WINOGRAD_ALPHA; k++) {
acc += T1[i][k] * Bt[j * WINOGRAD_ALPHA + k];
}
T2[i][j] = acc;
}
}

const auto offset = ch * P + block_y * WTILES + block_x;
for (auto i = 0; i < Network::WINOGRAD_ALPHA; i++) {
for (auto j = 0; j < Network::WINOGRAD_ALPHA; j++) {
V[(i*Network::WINOGRAD_ALPHA + j)*C*P + offset] =
T2[i][j];
for (auto i = 0; i < WINOGRAD_ALPHA; i++) {
for (auto j = 0; j < WINOGRAD_ALPHA; j++) {
V[(i * WINOGRAD_ALPHA + j)*C*P + offset] = T2[i][j];
}
}
}
Expand All @@ -131,10 +117,9 @@ void CPUPipe::winograd_sgemm(const std::vector<float>& U,
const std::vector<float>& V,
std::vector<float>& M,
const int C, const int K) {
constexpr auto P =
(BOARD_SIZE + 1) * (BOARD_SIZE + 1) / Network::WINOGRAD_ALPHA;
constexpr auto P = WINOGRAD_P;

for (auto b = 0; b < Network::WINOGRAD_TILE; b++) {
for (auto b = 0; b < WINOGRAD_TILE; b++) {
const auto offset_u = b * K * C;
const auto offset_v = b * C * P;
const auto offset_m = b * K * P;
Expand All @@ -154,57 +139,62 @@ void CPUPipe::winograd_transform_out(const std::vector<float>& M,
const int K) {
constexpr auto W = BOARD_SIZE;
constexpr auto H = BOARD_SIZE;
constexpr auto WTILES = (W + 1) / 2;
constexpr auto P = WTILES * WTILES;
constexpr auto WTILES = WINOGRAD_WTILES;
constexpr auto P = WINOGRAD_P;

for (auto k = 0; k < K; k++) {
const auto kHW = k * W * H;
for (auto block_x = 0; block_x < WTILES; block_x++) {
const auto x = 2 * block_x;
const auto x = WINOGRAD_M * block_x;
for (auto block_y = 0; block_y < WTILES; block_y++) {
const auto y = 2 * block_y;
const auto y = WINOGRAD_M * block_y;

const auto b = block_y * WTILES + block_x;
using WinogradTile =
std::array<std::array<float, Network::WINOGRAD_ALPHA>, Network::WINOGRAD_ALPHA>;
std::array<std::array<float, WINOGRAD_ALPHA>, WINOGRAD_ALPHA>;
WinogradTile temp_m;
for (auto xi = 0; xi < Network::WINOGRAD_ALPHA; xi++) {
for (auto nu = 0; nu < Network::WINOGRAD_ALPHA; nu++) {
for (auto xi = 0; xi < WINOGRAD_ALPHA; xi++) {
for (auto nu = 0; nu < WINOGRAD_ALPHA; nu++) {
temp_m[xi][nu] =
M[xi*(Network::WINOGRAD_ALPHA*K*P) + nu*(K*P)+ k*P + b];
M[(xi*WINOGRAD_ALPHA + nu)*K*P + k*P + b];
}
}

const auto At = std::array<float, WINOGRAD_ALPHA * WINOGRAD_M>
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f,
0.0f, SQ2/2.0f, -SQ2/2.0f, SQ2, -SQ2, 0.0f,
0.0f, 1.0f/2.0f, 1.0f/2.0f, 2.0f, 2.0f, 0.0f,
0.0f, SQ2/4.0f, -SQ2/4.0f, 2.0f*SQ2, -2.0f*SQ2, 1.0f};

std::array<std::array<float, WINOGRAD_ALPHA>, WINOGRAD_M> temp;
std::array<std::array<float, WINOGRAD_M>, WINOGRAD_M> o;

// Calculates transpose(A).temp_m.A
// A = [1.0, 0.0],
// [1.0, 1.0],
// [1.0, -1.0],
// [0.0, -1.0]]

const std::array<std::array<float, 2>, 2> o = {
temp_m[0][0] + temp_m[0][1] + temp_m[0][2] +
temp_m[1][0] + temp_m[1][1] + temp_m[1][2] +
temp_m[2][0] + temp_m[2][1] + temp_m[2][2],
temp_m[0][1] - temp_m[0][2] - temp_m[0][3] +
temp_m[1][1] - temp_m[1][2] - temp_m[1][3] +
temp_m[2][1] - temp_m[2][2] - temp_m[2][3],
temp_m[1][0] + temp_m[1][1] + temp_m[1][2] -
temp_m[2][0] - temp_m[2][1] - temp_m[2][2] -
temp_m[3][0] - temp_m[3][1] - temp_m[3][2],
temp_m[1][1] - temp_m[1][2] - temp_m[1][3] -
temp_m[2][1] + temp_m[2][2] + temp_m[2][3] -
temp_m[3][1] + temp_m[3][2] + temp_m[3][3]
};

const auto y_ind = kHW + (y)*W + (x);
Y[y_ind] = o[0][0];
if (x + 1 < W) {
Y[y_ind + 1] = o[0][1];
for (auto i = 0; i < WINOGRAD_M; i++){
for (auto j = 0; j < WINOGRAD_ALPHA; j++) {
auto acc = 0.0f;
for (auto q = 0; q < WINOGRAD_ALPHA; q++) {
acc += At[i * WINOGRAD_ALPHA + q] * temp_m[q][j];
}
temp[i][j] = acc;
}
}
if (y + 1 < H) {
Y[y_ind + W] = o[1][0];
if (x + 1 < W) {
Y[y_ind + W + 1] = o[1][1];

for (auto i = 0; i < WINOGRAD_M; i++){
for (auto j = 0; j < WINOGRAD_M; j++) {
auto acc = 0.0f;
for (auto q = 0; q < WINOGRAD_ALPHA; q++) {
acc += temp[i][q] * At[j * WINOGRAD_ALPHA + q];
}
o[i][j] = acc;
}
}

const auto y_ind = k * H * W + y * W + x;
for (auto i = 0; i < WINOGRAD_M; i++) {
for (auto j = 0; j < WINOGRAD_M; j++) {
if (y + i < H && x + j < W) {
Y[y_ind + i * W + j] = o[i][j];
}
}
}
}
Expand All @@ -219,7 +209,7 @@ void CPUPipe::winograd_convolve3(const int outputs,
std::vector<float>& M,
std::vector<float>& output) {

constexpr unsigned int filter_len = Network::WINOGRAD_ALPHA * Network::WINOGRAD_ALPHA;
constexpr unsigned int filter_len = WINOGRAD_ALPHA * WINOGRAD_ALPHA;
const auto input_channels = U.size() / (outputs * filter_len);

winograd_transform_in(input, V, input_channels);
Expand Down Expand Up @@ -303,29 +293,27 @@ void CPUPipe::forward(const std::vector<float>& input,
std::vector<float>& output_pol,
std::vector<float>& output_val) {
// Input convolution
constexpr auto width = BOARD_SIZE;
constexpr auto height = BOARD_SIZE;
constexpr auto tiles = (width + 1) * (height + 1) / 4;
constexpr auto P = WINOGRAD_P;
// Calculate output channels
const auto output_channels = m_input_channels;
// input_channels is the maximum number of input channels of any
// convolution. Residual blocks are identical, but the first convolution
// might be bigger when the network has very few filters
const auto input_channels = std::max(static_cast<size_t>(output_channels),
static_cast<size_t>(Network::INPUT_CHANNELS));
auto conv_out = std::vector<float>(output_channels * width * height);
auto conv_out = std::vector<float>(output_channels * BOARD_SQUARES);

auto V = std::vector<float>(Network::WINOGRAD_TILE * input_channels * tiles);
auto M = std::vector<float>(Network::WINOGRAD_TILE * output_channels * tiles);
auto V = std::vector<float>(WINOGRAD_TILE * input_channels * P);
auto M = std::vector<float>(WINOGRAD_TILE * output_channels * P);

winograd_convolve3(output_channels, input, m_conv_weights[0], V, M, conv_out);
batchnorm<BOARD_SQUARES>(output_channels, conv_out,
m_batchnorm_means[0].data(),
m_batchnorm_stddivs[0].data());

// Residual tower
auto conv_in = std::vector<float>(output_channels * width * height);
auto res = std::vector<float>(output_channels * width * height);
auto conv_in = std::vector<float>(output_channels * BOARD_SQUARES);
auto res = std::vector<float>(output_channels * BOARD_SQUARES);
for (auto i = size_t{1}; i < m_conv_weights.size(); i += 2) {
auto output_channels = m_input_channels;
std::swap(conv_out, conv_in);
Expand Down
24 changes: 14 additions & 10 deletions src/Network.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,19 +97,23 @@ void Network::process_bn_var(std::vector<float>& weights, const float epsilon) {
std::vector<float> Network::winograd_transform_f(const std::vector<float>& f,
const int outputs,
const int channels) {
// F(2x2, 3x3) Winograd filter transformation
// F(4x4, 3x3) Winograd filter transformation
// transpose(G.dot(f).dot(G.transpose()))
// U matrix is transposed for better memory layout in SGEMM
auto U = std::vector<float>(WINOGRAD_TILE * outputs * channels);
const auto G = std::array<float, WINOGRAD_TILE>{ 1.0, 0.0, 0.0,
0.5, 0.5, 0.5,
0.5, -0.5, 0.5,
0.0, 0.0, 1.0};
auto temp = std::array<float, 12>{};
const auto G = std::array<float, 3 * WINOGRAD_ALPHA>
{ 1.0f, 0.0f, 0.0f,
-2.0f/3.0f, -SQ2/3.0f, -1.0f/3.0f,
-2.0f/3.0f, SQ2/3.0f, -1.0f/3.0f,
1.0f/6.0f, SQ2/6.0f, 1.0f/3.0f,
1.0f/6.0f, -SQ2/6.0f, 1.0f/3.0f,
0.0f, 0.0f, 1.0f};

auto temp = std::array<float, 3 * WINOGRAD_ALPHA>{};

for (auto o = 0; o < outputs; o++) {
for (auto c = 0; c < channels; c++) {
for (auto i = 0; i < 4; i++){
for (auto i = 0; i < WINOGRAD_ALPHA; i++){
for (auto j = 0; j < 3; j++) {
auto acc = 0.0f;
for (auto k = 0; k < 3; k++) {
Expand All @@ -119,13 +123,13 @@ std::vector<float> Network::winograd_transform_f(const std::vector<float>& f,
}
}

for (auto xi = 0; xi < 4; xi++) {
for (auto nu = 0; nu < 4; nu++) {
for (auto xi = 0; xi < WINOGRAD_ALPHA; xi++) {
for (auto nu = 0; nu < WINOGRAD_ALPHA; nu++) {
auto acc = 0.0f;
for (auto k = 0; k < 3; k++) {
acc += temp[xi*3 + k] * G[nu*3 + k];
}
U[xi * (4 * outputs * channels)
U[xi * (WINOGRAD_ALPHA * outputs * channels)
+ nu * (outputs * channels)
+ c * outputs
+ o] = acc;
Expand Down
13 changes: 9 additions & 4 deletions src/Network.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,15 @@
#include "OpenCLScheduler.h"
#endif


// Winograd F(4x4, 3x3) convolution parameters.
// The filter transformation changes 3x3 filters into tiles of size
// (M + 3 - 1) x (M + 3 - 1), i.e. ALPHA x ALPHA.
constexpr auto WINOGRAD_M = 4;                       // Output tile size: F(4x4, 3x3)
constexpr auto WINOGRAD_ALPHA = WINOGRAD_M + 3 - 1;  // Input/transform tile size (6)
// Tiles per board row/column, rounded up so the tiles cover the whole board.
constexpr auto WINOGRAD_WTILES = BOARD_SIZE / WINOGRAD_M + (BOARD_SIZE % WINOGRAD_M != 0);
constexpr auto WINOGRAD_TILE = WINOGRAD_ALPHA * WINOGRAD_ALPHA;  // Elements per transformed tile
constexpr auto WINOGRAD_P = WINOGRAD_WTILES * WINOGRAD_WTILES;   // Tiles per board plane
constexpr auto SQ2 = 1.4142135623730951f; // Square root of 2

class Network {
public:
static constexpr auto NUM_SYMMETRIES = 8;
Expand All @@ -59,10 +68,6 @@ class Network {
static constexpr auto OUTPUTS_POLICY = 2;
static constexpr auto OUTPUTS_VALUE = 1;

// Winograd filter transformation changes 3x3 filters to 4x4
static constexpr auto WINOGRAD_ALPHA = 4;
static constexpr auto WINOGRAD_TILE = WINOGRAD_ALPHA * WINOGRAD_ALPHA;

void initialize(int playouts, const std::string & weightsfile);
void benchmark(const GameState * const state,
const int iterations = 1600);
Expand Down