Skip to content
This repository was archived by the owner on Aug 30, 2024. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions .editorconfig
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# EditorConfig is awesome: https://EditorConfig.org

# top-most EditorConfig file
root = true

# Unix-style newlines with a newline ending every file
[*]
end_of_line = lf
insert_final_newline = true
trim_trailing_whitespace = true

# C/C++ follows clang-format
[*.{c,cpp,h,hpp}]
indent_style = space
indent_size = 4
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,14 @@ if (NOT CMAKE_BUILD_TYPE)
endif()
if(UNIX)
else() # Windows
# Force CMake to use icx-cl rather than the default C++ compiler/linker
# Force CMake to use icx-cl rather than the default C++ compiler/linker
# (needed on Windows only)
# include (CMakeForceCompiler)
# CMAKE_FORCE_CXX_COMPILER (icx-cl IntelDPCPP)
set(CMAKE_CXX_COMPILER icx-cl)
include (Platform/Windows-Clang)
include(cmake/GTestExternal.cmake)
endif()
endif()

project(XeTLA)

Expand Down
79 changes: 43 additions & 36 deletions examples/01_gemm_universal/gemm_universal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,13 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include <tests/utils/utils.hpp>
#include "xetla.hpp"
#include <tests/utils/utils.hpp>

enum class kslicing_impl_t : uint8_t { none = 0, global = 1, local = 2 };

template <kslicing_impl_t kslicing_type = kslicing_impl_t::none>
template <gpu_arch arch_tag,
kslicing_impl_t kslicing_type = kslicing_impl_t::none>
void gemm_universal_run(uint32_t iter) {
// Tips, the example demonstrates programming kernel with XeTLA, it works as expected with current configurations.
// Please make sure you fully understand these configurations before you do any modifications, incomplete changes may lead to unexpected behaviors.
Expand Down Expand Up @@ -82,7 +83,7 @@ void gemm_universal_run(uint32_t iter) {
constexpr uint32_t num_local_splitk
= (kslicing_type == kslicing_impl_t::local) ? 2 : 1;

// Mirco-kernel configuration
// Micro-kernel configuration
using tune_option = dict_t<
elem_v_t<tune_key::param_optimizer_type,
tune_key_value::param_optimizer_decision_tree>,
Expand All @@ -102,8 +103,8 @@ void gemm_universal_run(uint32_t iter) {
data_type_c, // output datatype for C
mem_layout::row_major, // memory layout for C
8, // leading dimension alignment for C, in unit of element
data_type_acc, // accumulator data type for intermediate resutls
gpu_arch::Xe, // GPU arch
data_type_acc, // accumulator data type for intermediate results
arch_tag, // GPU arch
tune_option>;

// allocate temp buffers for global split
Expand Down Expand Up @@ -184,36 +185,42 @@ void gemm_universal_run(uint32_t iter) {
free(Cnt, context);
}

// Architecture-dispatch wrapper: dispatch_arch instantiates this struct for
// the GPU architecture detected at runtime and invokes its exec functor.
template <gpu_arch arch_tag>
struct main_wrapper {
static constexpr auto exec = []() {
// An example code for calculating matrix multiplication using
// GEMM_UNIVERSAL API:
// C = A x B
// The resulting matrix C is partitioned by the group range
// into multiple blocks. The block matrix
// C<i_w, j_w>
// is computed by the workgroup with id: (0, i_w, j_w).
// (i_w, j_w) is an element in range specified by group range.
// Each thread with index (0, i_s, j_s) inside the same workgroup
// is responsible for a sub block of matrix multiplication, which is
// C<i_w, j_w>[i_s*sg_m:(i_s+1)*sg_m, j_s*sg_n:(j_s+1)*sg_n]

// Alternatively, some threads can cooperate on the same sub block
// matrix given the same (i_s, j_s), i.e. the index space is extended
// from (0, i_s, j_s) to (k_s, i_s, j_s).

// Another method to achieve the same effect is to extend the index space
// in group range, i.e. from (0, i_w, j_w) to (k_w, i_w, j_w)

// A more detailed description of the cooperation (kslicing) can
// be found in the example 01_gemm_universal with custom implementation

// basic gemm_universal
gemm_universal_run<arch_tag, kslicing_impl_t::none>(10);

// basic gemm_universal with workgroup cooperation
// gemm_universal_run<arch_tag, kslicing_impl_t::global>(10);

// basic gemm_universal with thread cooperation
// gemm_universal_run<arch_tag, kslicing_impl_t::local>(10);
};
};
int main() {
// An example code for calculating matrix multiplication using
// GEMM_UNIVERSAL API:
// C = A x B
// The resulted matrix C is partitioned by the group range
// in to multiple blocks. The block matrix
// C<i_w, j_w>
// is computed by the workgroup with id: (0, i_w, j_w).
// (i_w, j_w) is an element in range specified by group range.
// Each thread with index (0, i_s, j_s) inside the same workgroup
// is responsible for a sub block of matrix multiplication, which is
// C<i_w, j_w>[i_s*sg_m:(i_s+1):sg_m,j_s*sg_n:(j_s+1)*sg_n]

// Alternatively, some threads can cooperate on the same sub block
// matrix given the same (i_s, j_s), i.e. the index space is extended
// from (0, i_s, j_s) to (k_s, i_s, j_s).

// Another method to achieve the same effect is to extend the index space
// in group range, i.e. from (0, i_w, j_w) to (k_w, i_w, j_w)

// More detailed description referring to the cooperation (kslicing) could
// be found in the example 01_gemm_universal with custom implementation

// basic gemm_universal
gemm_universal_run<kslicing_impl_t::none>(10);

// basic gemm_universal with workgroup cooperation
// gemm_universal_run<kslicing_impl_t::global>(10);

// basic gemm_universal with thread cooperation
// gemm_universal_run<kslicing_impl_t::local>(10);
return (0);
dispatch_arch<main_wrapper>::exec();
return 0;
}
61 changes: 30 additions & 31 deletions examples/02_basic_gemm/basic_gemm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include <tests/utils/utils.hpp>
#include "xetla.hpp"
#include <tests/utils/utils.hpp>

template <gpu_arch arch_tag_>
template <gpu_arch arch_tag>
void basic_gemm_run(sycl::queue queue, uint32_t iter) {
// Tips, the example demonstrates programming kernel with XeTLA, it works as expected with current configurations.
// Please make sure you fully understand these configurations before you do any modifications, incomplete changes may lead to unexpected behaviors.
Expand Down Expand Up @@ -110,11 +110,11 @@ void basic_gemm_run(sycl::queue queue, uint32_t iter) {
// should be larger than 8
static constexpr uint32_t k_stride = 32;

// Step 1: define mirco-kernel's configuration
// Step 1: define Micro-kernel's configuration
using wg_shape = shape<wg_tile_n, wg_tile_m>;
using sg_shape = shape<sg_tile_n, sg_tile_m>;

// Mirco-kernel configuration
// Micro-kernel configuration
using gemm_tune_option
= dict_t<elem_t_t<tune_key::sg_tile_shape, sg_shape>,
elem_v_t<tune_key::prefetch_distance,
Expand All @@ -132,10 +132,10 @@ void basic_gemm_run(sycl::queue queue, uint32_t iter) {
8, // leading dimension for B, in unit of element
mem_space::
global, // memory reading from global mem for B
data_type_acc, // accumulator data type for intermediate resutls
data_type_acc, // accumulator data type for intermediate results
wg_shape, // computation tile shape
k_stride, // elements in each iteration
gpu_arch::Xe, // GPU arch
arch_tag, // GPU arch
gemm_tune_option>;
gemm_t gemm;

Expand All @@ -149,24 +149,26 @@ void basic_gemm_run(sycl::queue queue, uint32_t iter) {
mem_space::global, // memory writing to global mem for C
wg_shape, // computation tile shape
k_stride, // elements in each iteration
gpu_arch::Xe, // GPU arch
arch_tag, // GPU arch
epilogue_tune_option>;

// Step 3: define the shared local memory usages
// developers have the responsibility to set
// shared loacal memory through XeTLA API
// shared local memory through XeTLA API
static constexpr uint32_t barrier_count = gemm_t::barrier_count;
static constexpr uint32_t slm_size = gemm_t::slm_size;
static_assert(slm_size <= arch_attr_t<arch_tag>::local_mem_size,
"The local memory size excess!");
xetla_nbarrier_init<barrier_count>();
xetla_local_init<slm_size>();

// Step 4: ecah workgroup gets it individual index to start computation
// Step 4: each workgroup gets its individual index to start computation
int start_n = item.get_group(2) * wg_tile_n;
int start_m = item.get_group(1) * wg_tile_m;
// no slicing in K direction so start from zero for all WG
int start_k = 0;

// Each workgroup will compute all data in K based on no k_sliciing
// Each workgroup will compute all data in K based on no k_slicing
// The developer can set how much data a subgroup compute by k_stride
uint32_t wg_tile_k = matrix_k;
uint32_t inner_loop_count
Expand All @@ -183,7 +185,7 @@ void basic_gemm_run(sycl::queue queue, uint32_t iter) {
mem_desc_output_c md_c(
{C}, {matrix_n, matrix_m, ldc}, {start_n, start_m});

// Step 6: real calculation with accumulator variables which are
// expected to reside in registers.
// will be in register.
typename gemm_t::matAcc_t matAcc;
matAcc.init(0);
Expand All @@ -194,8 +196,7 @@ void basic_gemm_run(sycl::queue queue, uint32_t iter) {
// the results is in the matAcc rather than real output C
typename gemm_t::work_group_t g(item.get_local_linear_id());
gemm(g, matAcc, gemm_args);

// Step 7: write the results from matACC to real output C
// Step 7: write the results from matAcc to real output C
epilogue_t epilogue;
epilogue(g, matAcc, md_c);
});
Expand All @@ -220,23 +221,21 @@ void basic_gemm_run(sycl::queue queue, uint32_t iter) {
free(C, context);
}

// Architecture-dispatch wrapper: dispatch_arch instantiates this struct for
// the GPU architecture detected at runtime and invokes its exec functor.
template <gpu_arch arch_tag>
struct main_wrapper {
static constexpr auto exec = []() {
// This case shows how to use batch-reduce (br) GEMM microkernel to
// solve a standard GEMM
// Turn on the profiling property to facilitate subsequent profiling
sycl::property_list properties {
sycl::property::queue::enable_profiling()};

// Define SYCL queue, context and device
auto queue = sycl::queue(properties);
basic_gemm_run<arch_tag>(queue, 10);
};
};
int main() {
// This case shows how to use batch-reduce (br) GEMM microkernel to
// solve a standard GEMM
// Turn on the profiling property to facilitate subsequent profiling
sycl::property_list properties {sycl::property::queue::enable_profiling()};

// Define SYCL queue, context and device
auto queue = sycl::queue(properties);
auto device = queue.get_device();

// Detect the execution size, 8 for Arc, 16 for PVC.
int ExecSize
= device.get_info<ext::intel::info::device::gpu_eu_simd_width>();
if (ExecSize == 8) {
basic_gemm_run<gpu_arch::Dg2>(queue, 10);
} else {
basic_gemm_run<gpu_arch::Xe>(queue, 10);
}
return (0);
dispatch_arch<main_wrapper>::exec();
return 0;
}
10 changes: 5 additions & 5 deletions examples/03_gemm_relu_bias/gemm_relu_bias.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
* limitations under the License.
*******************************************************************************/
#include <algorithm>
#include <tests/utils/utils.hpp>
#include "xetla.hpp"
#include <tests/utils/utils.hpp>

using namespace cl::sycl;
using namespace gpu::xetla;
Expand Down Expand Up @@ -140,7 +140,7 @@ void gemm_relu_bias_run(uint32_t iter) {
using epilogue_policy
= xetla::group::epilogue_policy_tile_op<tile_op_t, gpu_arch::Xe>;

// Mirco-kernel configuration
// Micro-kernel configuration
using tune_option = dict_t<
elem_v_t<tune_key::param_optimizer_type,
tune_key_value::param_optimizer_decision_tree>,
Expand All @@ -156,7 +156,7 @@ void gemm_relu_bias_run(uint32_t iter) {
data_type_c, // output datatype for C
mem_layout::row_major, // memory layout for C
8, // leading dimension alignment for C, in unit of element
data_type_acc, // accumulator data type for intermediate resutls
data_type_acc, // accumulator data type for intermediate results
gpu_arch::Xe, // GPU arch
tune_option>;
using gemm_op_t = typename default_config_t::type;
Expand Down Expand Up @@ -223,15 +223,15 @@ int main() {
// The purpose of this example is to illustrate the epilogue_t API in XeTLA.

// It allows user to implement multiple Ops inside a kernel call to avoid
// overheads in invokation, memory transfer, etc.
// overheads in invocation, memory transfer, etc.
// Take the following python code as an example:

// Original:
// > import torch as to
// > x = to.matmul(A, B)
// > y = to.nn.functional.relu(x)

// It takes two kernel invocations and the ReLU Op is an elementwise operation
// It takes two kernel invocations and the ReLU Op is a elementwise operation
// that could be fused into MatMul Op, which is basically calling GEMM kernel.

// Fusion:
Expand Down
6 changes: 3 additions & 3 deletions examples/04_gemm_polynomial/gemm_polynomial.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
* limitations under the License.
*******************************************************************************/
#include <algorithm>
#include <tests/utils/utils.hpp>
#include "xetla.hpp"
#include <tests/utils/utils.hpp>

#include "gemm_polynomial.hpp"

Expand Down Expand Up @@ -137,7 +137,7 @@ void gemm_polynomial_run(int iter) {
using epilogue_policy
= xetla::group::epilogue_policy_tile_op<tile_op_t, gpu_arch::Xe>;

// Mirco-kernel configuration
// Micro-kernel configuration
using tune_option = dict_t<
elem_v_t<tune_key::param_optimizer_type,
tune_key_value::param_optimizer_decision_tree>,
Expand All @@ -154,7 +154,7 @@ void gemm_polynomial_run(int iter) {
data_type_c, // output datatype for C
mem_layout::row_major, // memory layout for C
8, // leading dimension alignment for C, in unit of element
data_type_acc, // accumulator data type for intermediate resutls
data_type_acc, // accumulator data type for intermediate results
gpu_arch::Xe, // GPU arch
tune_option>;

Expand Down
4 changes: 2 additions & 2 deletions examples/05_batch_gemm/batch_gemm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ void batch_gemm_run(uint32_t iter) {
using wg_shape = shape<wg_tile_n, wg_tile_m>;
using sg_shape = shape<sg_tile_n, sg_tile_m>;

// Mirco-kernel configuration
// Micro-kernel configuration
using tune_option
= dict_t<elem_v_t<tune_key::param_optimizer_type,
tune_key_value::param_optimizer_decision_tree>,
Expand All @@ -106,7 +106,7 @@ void batch_gemm_run(uint32_t iter) {
mem_layout::row_major, // memory layout for B
8, // leading dimension for B, in unit of element
mem_space::global, // memory reading from global mem for B
data_type_acc, // accumulator data type for intermediate resutls
data_type_acc, // accumulator data type for intermediate results
wg_shape, // computation tile shape
wg_tile_k, // elements in each iteration
gpu_arch::Xe, // GPU arch
Expand Down
4 changes: 2 additions & 2 deletions examples/05_batch_gemm/batch_gemm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -173,8 +173,8 @@ class batch_gemm_t {
/// @return The size of local memory required.
__XETLA_API static constexpr uint32_t get_slm_size() {
constexpr uint32_t size = gemm_t::slm_size + epilogue_t::slm_size;
static_assert(size <= (128 * 1024),
"The local memory size should be less than 128KB!");
static_assert(size <= arch_attr_t<arch_tag>::local_mem_size,
"The local memory size excess!");
return size;
};

Expand Down
Loading