From f2fcf953bd1b3b6bc8ef19abc28747fc7f3e67b4 Mon Sep 17 00:00:00 2001
From: Petr Kurapov <petr.a.kurapov@intel.com>
Date: Fri, 25 Oct 2024 07:54:29 -0700
Subject: [PATCH 1/2] [GPU] Add 2d load/store ops validation

---
 lib/gc/Dialect/LLVMIR/IR/XeVMDialect.cpp      | 258 +++++++++++++++++-
 .../test/gc/Dialect/XeVM/xevm-invalid.mlir    | 171 ++++++++++++
 2 files changed, 426 insertions(+), 3 deletions(-)
 create mode 100644 test/mlir/test/gc/Dialect/XeVM/xevm-invalid.mlir
diff --git a/lib/gc/Dialect/LLVMIR/IR/XeVMDialect.cpp b/lib/gc/Dialect/LLVMIR/IR/XeVMDialect.cpp
index 9e0c25069..fc34fb029 100644
--- a/lib/gc/Dialect/LLVMIR/IR/XeVMDialect.cpp
+++ b/lib/gc/Dialect/LLVMIR/IR/XeVMDialect.cpp
@@ -9,6 +9,7 @@
 
 #include "mlir/Dialect/GPU/IR/CompilationInterfaces.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/Utils/StaticValueUtils.h"
 #include "mlir/IR/DialectImplementation.h"
 #include "llvm/ADT/TypeSwitch.h"
 
@@ -18,9 +19,260 @@ using namespace xevm;
 #include "gc/Dialect/LLVMIR/XeVMOpsDialect.cpp.inc"
 #include "gc/Dialect/LLVMIR/XeVMOpsEnums.cpp.inc"
 
-// TODO
-LogicalResult BlockLoad2dOp::verify() { return success(); }
-LogicalResult BlockStore2dOp::verify() { return success(); }
+namespace {
+constexpr uint32_t subgroupSize = 16;
+
+template <typename Op> LogicalResult verifyMatrixInput(Op op) {
+  static_assert(llvm::is_one_of<Op, BlockLoad2dOp, BlockStore2dOp>::value,
+                "Unexpected template parameter");
+
+  std::optional<int64_t> width = getConstantIntValue(op.getBaseWidth());
+  std::optional<int64_t> pitch = getConstantIntValue(op.getBasePitch());
+  if (pitch && width && *pitch < *width)
+    return op->emitOpError(
+        "4th operand (base pitch) should be >= 2nd operand (base width)");
+
+  if (op.getElemSizeInBits() != 8 && op.getElemSizeInBits() != 16 &&
+      op.getElemSizeInBits() != 32)
+    return op->emitOpError("expecting 'elem_size_in_bits' to be 8, 16, or 32");
+
+  uint32_t tileHeight = op.getTileHeight();
+  if (tileHeight != 1 && tileHeight != 2 && tileHeight != 4 &&
+      tileHeight != 8 && tileHeight != 16 && tileHeight != 32)
+    return op->emitOpError("expecting tile_height to be 1, 2, 4, 8, 16, or 32");
+
+  uint32_t vBlocks = op.getVBlocks();
+  if (vBlocks != 1 && vBlocks != 2 && vBlocks != 4 && vBlocks != 8)
+    return op->emitOpError("expecting v_blocks to be 1, 2, 4, or 8");
+
+  return success();
+}
+
+LogicalResult verify2DBlockLoadHWRestriction(BlockLoad2dOp op) {
+  VectorType resTy = op.getRes().getType();
+  unsigned resElemTySize = resTy.getElementType().getIntOrFloatBitWidth();
+  unsigned resSize = resTy.getNumElements() * resElemTySize;
+  unsigned expectedSize = op.getElemSizeInBits() * op.getTileHeight() *
+                          op.getTileWidth() * op.getVBlocks() / subgroupSize;
+  if (resSize != expectedSize)
+    return op.emitOpError() << "result size of " << resSize
+                            << " bits does not match the expected size of "
+                            << expectedSize << " bits";
+
+  if (op.getTranspose() && op.getVnniTransform())
+    return op.emitOpError(
+        "transpose and vnni_transform are mutually exclusive");
+
+  if (!op.getTranspose() && !op.getVnniTransform()) {
+    uint32_t tileHeight = op.getTileHeight();
+    if (tileHeight < 1 || tileHeight > 32)
+      return op.emitOpError("expecting tile_height to be between 1 and 32");
+
+    uint32_t tileWidth = op.getTileWidth();
+    uint32_t vBlocks = op.getVBlocks();
+    switch (op.getElemSizeInBits()) {
+    case 8:
+      if (tileWidth < 4 || tileWidth > 64)
+        return op.emitOpError("expecting tile_width to be between 4 and 64");
+      if (vBlocks != 1 && vBlocks != 2 && vBlocks != 4)
+        return op.emitOpError("expecting v_blocks to be 1, 2, or 4");
+      if (tileWidth * vBlocks > 64)
+        return op.emitOpError(
+            "tile_width * v_blocks should be less than or equal "
+            "to 64 for 8 bit elements");
+      break;
+    case 16:
+      if (tileWidth < 2 || tileWidth > 32)
+        return op.emitOpError("expecting tile_width to be between 2 and 32");
+      if (vBlocks != 1 && vBlocks != 2 && vBlocks != 4)
+        return op.emitOpError("expecting v_blocks to be 1, 2, or 4");
+      if (tileWidth * vBlocks > 32)
+        return op.emitOpError(
+            "tile_width * v_blocks should be less than or equal "
+            "to 32 for 16 bit elements");
+      break;
+    case 32:
+      if (tileWidth < 1 || tileWidth > 16)
+        return op.emitOpError("expecting tile_width to be between 1 and 16");
+      if (vBlocks != 1 && vBlocks != 2)
+        return op.emitOpError("expecting v_blocks to be 1 or 2");
+      if (tileWidth * vBlocks > 16)
+        return op.emitOpError(
+            "tile_width * v_blocks should be less than or equal "
+            "to 16 for 32 bit elements");
+      break;
+    case 64:
+      if (tileWidth < 1 || tileWidth > 8)
+        return op.emitOpError("expecting tile_width to be between 1 and 8");
+      if (vBlocks != 1)
+        return op.emitOpError("expecting v_blocks to be 1");
+      break;
+    default:
+      return op.emitOpError(
+          "expecting elem_size_in_bits to be 8, 16, 32, or 64");
+    }
+
+    return success();
+  }
+
+  if (op.getTranspose()) {
+    assert(!op.getVnniTransform() &&
+           "Expecting vnni_transform should be false");
+
+    uint32_t vBlocks = op.getVBlocks();
+    if (vBlocks != 1)
+      return op.emitOpError("expecting v_blocks to be 1");
+
+    uint32_t tileHeight = op.getTileHeight();
+    uint32_t tileWidth = op.getTileWidth();
+    switch (op.getElemSizeInBits()) {
+    case 32:
+      if (tileHeight < 1 || tileHeight > 32)
+        return op.emitOpError("expecting tile_height to be between 1 and 32");
+      if (tileWidth < 1 || tileWidth > 8)
+        return op.emitOpError("expecting tile_width to be between 1 and 8");
+      break;
+    case 64:
+      if (tileHeight != 8)
+        return op.emitOpError(
+            "expecting tile_height to be 8 for 64 bit elements");
+      if (tileWidth != 1 && tileWidth != 2 && tileWidth != 4)
+        return op.emitOpError("expecting tile_width to be 1, 2, or 4");
+      break;
+    default:
+      return op.emitOpError("transpose is only supported for 32 and 64 bit "
+                            "elements");
+    }
+
+    return success();
+  }
+
+  assert(op.getVnniTransform() && !op.getTranspose() &&
+         "Expecting vnni_transform should be true and transpose should be "
+         "false");
+
+  uint32_t vBlocks = op.getVBlocks();
+  if (vBlocks != 1 && vBlocks != 2 && vBlocks != 4)
+    return op.emitOpError("expecting v_blocks to be 1, 2, or 4");
+
+  uint32_t tileHeight = op.getTileHeight();
+  uint32_t tileWidth = op.getTileWidth();
+  switch (op.getElemSizeInBits()) {
+  case 8:
+    if (tileHeight < 4 || tileHeight > 32)
+      return op.emitOpError("expecting tile_height to be between 4 and 32");
+    if (tileWidth < 4 || tileWidth > 16)
+      return op.emitOpError("expecting tile_width to be between 4 and 16");
+    break;
+  case 16:
+    if (tileHeight < 2 || tileHeight > 32)
+      return op.emitOpError("expecting tile_height to be between 2 and 32");
+    if (tileWidth < 2 || tileWidth > 16)
+      return op.emitOpError("expecting tile_width to be between 2 and 16");
+    if (tileWidth * vBlocks > 32)
+      return op.emitOpError(
+          "tile_width * v_blocks should be less than or equal "
+          "to 32 for 16 bit elements");
+    break;
+  default:
+    return op.emitOpError("vnni_transform is only supported for 8 and 16 bit "
+                          "elements");
+  }
+
+  return success();
+}
+
+static LogicalResult verify2DBlockStoreHWRestriction(BlockStore2dOp op) {
+  uint32_t tileHeight = op.getTileHeight();
+  if (tileHeight < 1 || tileHeight > 8)
+    return op.emitOpError("expecting tile_height to be between 1 and 8");
+
+  uint32_t tileWidth = op.getTileWidth();
+  switch (op.getElemSizeInBits()) {
+  case 8:
+    if (tileWidth < 4 || tileWidth > 64)
+      return op.emitOpError("expecting tile_width to be between 4 and 64");
+    break;
+  case 16:
+    if (tileWidth < 2 || tileWidth > 32)
+      return op.emitOpError("expecting tile_width to be between 2 and 32");
+    break;
+  case 32:
+    if (tileWidth < 1 || tileWidth > 16)
+      return op.emitOpError("expecting tile_width to be between 1 and 16");
+    break;
+  case 64:
+    if (tileWidth < 1 || tileWidth > 8)
+      return op.emitOpError("expecting tile_width to be between 1 and 8");
+    break;
+  default:
+    return op.emitOpError("expecting elem_size_in_bits to be 8, 16, 32, or 64");
+  }
+
+  uint32_t vBlocks = op.getVBlocks();
+  if (vBlocks != 1)
+    return op.emitOpError("expecting v_blocks to be 1");
+  return success();
+}
+
+} // namespace
+
+LogicalResult BlockLoad2dOp::verify() {
+  if (verify2DBlockLoadHWRestriction(*this).failed())
+    return failure();
+
+  if (verifyMatrixInput(*this).failed())
+    return failure();
+
+  VectorType resTy = getRes().getType();
+  unsigned resElemTySize = resTy.getElementType().getIntOrFloatBitWidth();
+  if (getElemSizeInBits() == 32 || getVnniTransform()) {
+    if (resElemTySize != 32)
+      return emitOpError() << "expecting result element type to be 32 bits";
+  }
+
+  uint32_t tileWidth = getTileWidth();
+  if (getVnniTransform()) {
+    if (tileWidth != 16)
+      return emitOpError(
+          "tile_width when vnni_transform is true should be equal "
+          "to subgroup size (16 elements)");
+    return success();
+  }
+
+  return success();
+}
+
+LogicalResult BlockStore2dOp::verify() {
+  if (verify2DBlockStoreHWRestriction(*this).failed())
+    return failure();
+
+  if (verifyMatrixInput(*this).failed())
+    return failure();
+
+  uint32_t tileWidth = getTileWidth();
+  switch (getElemSizeInBits()) {
+  case 8:
+    if (tileWidth != 16 && tileWidth != 32)
+      return emitOpError("tile_width for 8 bit elements should be equal to "
+                         "16 or 32");
+    break;
+  case 16:
+    if (tileWidth != 16)
+      return emitOpError("tile_width for 16 bit elements should be equal "
+                         "to 16");
+    break;
+  case 32:
+    if (tileWidth != 16)
+      return emitOpError("tile_width for 32 bit elements should be equal "
+                         "to 16");
+    break;
+  default:
+    llvm_unreachable("unexpected element size");
+  }
+
+  return success();
+}
 
 void XeVMDialect::initialize() {
   // NOLINTBEGIN
diff --git a/test/mlir/test/gc/Dialect/XeVM/xevm-invalid.mlir b/test/mlir/test/gc/Dialect/XeVM/xevm-invalid.mlir
new file mode 100644
index 000000000..86d00167f
--- /dev/null
+++ b/test/mlir/test/gc/Dialect/XeVM/xevm-invalid.mlir
@@ -0,0 +1,171 @@
+// RUN: gc-opt -split-input-file -verify-diagnostics %s
+
+llvm.func @blockload2d(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) {
+  // expected-error @+1 {{'xevm.blockload2d' op transpose and vnni_transform are mutually exclusive}}
+  %0 = xevm.blockload2d %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=16, tile_height=16, v_blocks=1, transpose=true, vnni_transform=true, l1_cache_control=Default, l3_cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32) -> vector<4xi32>
+  llvm.return
+}
+
+// -----
+
+llvm.func @blockload2d(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) {
+  // expected-error @+1 {{'xevm.blockload2d' op expecting tile_height to be between 1 and 32}}
+  %0 = xevm.blockload2d %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=64, v_blocks=1, transpose=false, vnni_transform=false, l1_cache_control=Default, l3_cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32) -> vector<64xi16>
+  llvm.return
+}
+
+// -----
+
+llvm.func @blockload2d(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) {
+  // expected-error @+1 {{'xevm.blockload2d' op expecting tile_width to be between 4 and 64}}
+  %0 = xevm.blockload2d %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=128, tile_height=8, v_blocks=1, transpose=false, vnni_transform=false, l1_cache_control=Default, l3_cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32) -> vector<16xi32>
+  llvm.return
+}
+
+// -----
+
+llvm.func @blockload2d(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) {
+  // expected-error @+1 {{'xevm.blockload2d' op expecting v_blocks to be 1, 2, or 4}}
+  %0 = xevm.blockload2d %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=6, transpose=false, vnni_transform=false, l1_cache_control=Default, l3_cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32) -> vector<48xi16>
+  llvm.return
+}
+
+// -----
+
+llvm.func @blockload2d(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) {
+  // expected-error @+1 {{'xevm.blockload2d' op tile_width * v_blocks should be less than or equal to 64 for 8 bit elements}}
+  %0 = xevm.blockload2d %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=4, transpose=false, vnni_transform=false, l1_cache_control=Default, l3_cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32) -> vector<32xi16>
+  llvm.return
+}
+
+// -----
+
+llvm.func @blockload2d(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) {
+  // expected-error @+1 {{'xevm.blockload2d' op transpose is only supported for 32 and 64 bit elements}}
+  %0 = xevm.blockload2d %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=1, transpose=true, vnni_transform=false, l1_cache_control=Default, l3_cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32) -> vector<8xi16>
+  llvm.return
+}
+
+// -----
+
+llvm.func @blockload2d(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) {
+  // expected-error @+1 {{'xevm.blockload2d' op vnni_transform is only supported for 8 and 16 bit elements}}
+  %0 = xevm.blockload2d %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=32, tile_width=8, tile_height=8, v_blocks=1, transpose=false, vnni_transform=true, l1_cache_control=Default, l3_cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32) -> vector<4xi32>
+  llvm.return
+}
+
+// -----
+
+llvm.func @blockload2d(%ptr : !llvm.ptr, %base_height : i32, %x : i32, %y : i32) {
+  %base_width = llvm.mlir.constant(4 : i32) : i32
+  %base_pitch = llvm.mlir.constant(2 : i32) : i32
+  // expected-error @+1 {{'xevm.blockload2d' op 4th operand (base pitch) should be >= 2nd operand (base width)}}
+  %0 = xevm.blockload2d %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=1, transpose=false, vnni_transform=false, l1_cache_control=Default, l3_cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32) -> vector<8xi16>
+  llvm.return
+}
+
+// -----
+
+llvm.func @blockload2d(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) {
+  // expected-error @+1 {{'xevm.blockload2d' op expecting 'elem_size_in_bits' to be 8, 16, or 32}}
+  %0 = xevm.blockload2d %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=64, tile_width=4, tile_height=8, v_blocks=1, transpose=false, vnni_transform=false, l1_cache_control=Default, l3_cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32) -> vector<8xi16>
+  llvm.return
+}
+
+// -----
+
+llvm.func @blockload2d(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) {
+  // expected-error @+1 {{'xevm.blockload2d' op expecting tile_height to be 1, 2, 4, 8, 16, or 32}}
+  %0 = xevm.blockload2d %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=24, v_blocks=1, transpose=false, vnni_transform=false, l1_cache_control=Default, l3_cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32) -> vector<24xi16>
+  llvm.return
+}
+
+// -----
+
+llvm.func @blockload2d(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) {
+  // expected-error @+1 {{'xevm.blockload2d' op tile_width when vnni_transform is true should be equal to subgroup size (16 elements)}}
+  %0 = xevm.blockload2d %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=8, tile_height=8, v_blocks=1, transpose=false, vnni_transform=true, l1_cache_control=Default, l3_cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32) -> vector<1xi32>
+  llvm.return
+}
+
+// -----
+
+llvm.func @blockload2d(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) {
+  // expected-error @+1 {{'xevm.blockload2d' op expecting result element type to be 32 bits}}
+  %0 = xevm.blockload2d %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=32, tile_width=8, tile_height=8, v_blocks=1, transpose=false, vnni_transform=false, l1_cache_control=Default, l3_cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32) -> vector<8xi16>
+  llvm.return
+}
+
+// -----
+
+llvm.func @blockstore2d(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32, %stored_val : vector<64xi8>) {
+  // expected-error @+1 {{'xevm.blockstore2d' op expecting tile_height to be between 1 and 8}}
+  xevm.blockstore2d %ptr, %base_width, %base_height, %base_pitch, %x, %y, %stored_val {elem_size_in_bits=8, tile_width=32, tile_height=64, v_blocks=1, l1_cache_control=Default, l3_cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32, vector<64xi8>)
+  llvm.return
+}
+
+// -----
+
+llvm.func @blockstore2d(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32, %stored_val : vector<4xi8>) {
+  // expected-error @+1 {{'xevm.blockstore2d' op expecting tile_width to be between 4 and 64}}
+  xevm.blockstore2d %ptr, %base_width, %base_height, %base_pitch, %x, %y, %stored_val {elem_size_in_bits=8, tile_width=2, tile_height=8, v_blocks=1, l1_cache_control=Default, l3_cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32, vector<4xi8>)
+  llvm.return
+}
+
+// -----
+
+llvm.func @blockstore2d(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32, %stored_val : vector<8xi8>) {
+  // expected-error @+1 {{'xevm.blockstore2d' op expecting v_blocks to be 1}}
+  xevm.blockstore2d %ptr, %base_width, %base_height, %base_pitch, %x, %y, %stored_val {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, l1_cache_control=Default, l3_cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32, vector<8xi8>)
+  llvm.return
+}
+
+// -----
+
+llvm.func @blockstore2d(%ptr : !llvm.ptr, %base_height : i32, %x : i32, %y : i32, %stored_val : vector<8xi8>) {
+  %base_width = llvm.mlir.constant(4 : i32) : i32
+  %base_pitch = llvm.mlir.constant(2 : i32) : i32
+  // expected-error @+1 {{'xevm.blockstore2d' op 4th operand (base pitch) should be >= 2nd operand (base width)}}
+  xevm.blockstore2d %ptr, %base_width, %base_height, %base_pitch, %x, %y, %stored_val {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=1, l1_cache_control=Default, l3_cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32, vector<8xi8>)
+  llvm.return
+}
+
+// -----
+
+llvm.func @blockstore2d(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32, %stored_val : vector<32xi16>) {
+  // expected-error @+1 {{'xevm.blockstore2d' op expecting 'elem_size_in_bits' to be 8, 16, or 32}}
+  xevm.blockstore2d %ptr, %base_width, %base_height, %base_pitch, %x, %y, %stored_val {elem_size_in_bits=64, tile_width=4, tile_height=8, v_blocks=1, l1_cache_control=Default, l3_cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32, vector<32xi16>)
+  llvm.return
+}
+
+// -----
+
+llvm.func @blockstore2d(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32, %stored_val : vector<64xi8>) {
+  // expected-error @+1 {{'xevm.blockstore2d' op expecting tile_height to be 1, 2, 4, 8, 16, or 32}}
+  xevm.blockstore2d %ptr, %base_width, %base_height, %base_pitch, %x, %y, %stored_val {elem_size_in_bits=8, tile_width=32, tile_height=6, v_blocks=1, l1_cache_control=Default, l3_cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32, vector<64xi8>)
+  llvm.return
+}
+
+// -----
+
+llvm.func @blockstore2d(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32, %stored_val : vector<4xi8>) {
+  // expected-error @+1 {{'xevm.blockstore2d' op tile_width for 8 bit elements should be equal to 16 or 32}}
+  xevm.blockstore2d %ptr, %base_width, %base_height, %base_pitch, %x, %y, %stored_val {elem_size_in_bits=8, tile_width=8, tile_height=8, v_blocks=1, l1_cache_control=Default, l3_cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32, vector<4xi8>)
+  llvm.return
+}
+
+// -----
+
+llvm.func @blockstore2d(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32, %stored_val : vector<8xi16>) {
+  // expected-error @+1 {{'xevm.blockstore2d' op tile_width for 16 bit elements should be equal to 16}}
+  xevm.blockstore2d %ptr, %base_width, %base_height, %base_pitch, %x, %y, %stored_val {elem_size_in_bits=16, tile_width=32, tile_height=8, v_blocks=1, l1_cache_control=Default, l3_cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32, vector<8xi16>)
+  llvm.return
+}
+
+// -----
+
+llvm.func @blockstore2d(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32, %stored_val : vector<4xi32>) {
+  // expected-error @+1 {{'xevm.blockstore2d' op tile_width for 32 bit elements should be equal to 16}}
+  xevm.blockstore2d %ptr, %base_width, %base_height, %base_pitch, %x, %y, %stored_val {elem_size_in_bits=32, tile_width=8, tile_height=8, v_blocks=1, l1_cache_control=Default, l3_cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32, vector<4xi32>)
+  llvm.return
+}

From df152b7581624e4b4efa9dcfd9a2a3ecbb265647 Mon Sep 17 00:00:00 2001
From: Petr Kurapov <petr.a.kurapov@intel.com>
Date: Mon, 28 Oct 2024 01:47:32 -0700
Subject: [PATCH 2/2] address review comments

---
 lib/gc/Dialect/LLVMIR/IR/XeVMDialect.cpp | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/lib/gc/Dialect/LLVMIR/IR/XeVMDialect.cpp b/lib/gc/Dialect/LLVMIR/IR/XeVMDialect.cpp
index fc34fb029..025b7d95c 100644
--- a/lib/gc/Dialect/LLVMIR/IR/XeVMDialect.cpp
+++ b/lib/gc/Dialect/LLVMIR/IR/XeVMDialect.cpp
@@ -12,6 +12,7 @@
 #include "mlir/Dialect/Utils/StaticValueUtils.h"
 #include "mlir/IR/DialectImplementation.h"
 #include "llvm/ADT/TypeSwitch.h"
+#include "llvm/Support/MathExtras.h"
 
 using namespace mlir;
 using namespace xevm;
@@ -32,17 +33,16 @@ template <typename Op> LogicalResult verifyMatrixInput(Op op) {
     return op->emitOpError(
         "4th operand (base pitch) should be >= 2nd operand (base width)");
 
-  if (op.getElemSizeInBits() != 8 && op.getElemSizeInBits() != 16 &&
-      op.getElemSizeInBits() != 32)
+  uint32_t elemSize = op.getElemSizeInBits();
+  if (elemSize < 8 || !llvm::isPowerOf2_32(elemSize) || elemSize > 32)
     return op->emitOpError("expecting 'elem_size_in_bits' to be 8, 16, or 32");
 
   uint32_t tileHeight = op.getTileHeight();
-  if (tileHeight != 1 && tileHeight != 2 && tileHeight != 4 &&
-      tileHeight != 8 && tileHeight != 16 && tileHeight != 32)
+  if (tileHeight > 32 || !llvm::isPowerOf2_32(tileHeight))
     return op->emitOpError("expecting tile_height to be 1, 2, 4, 8, 16, or 32");
 
   uint32_t vBlocks = op.getVBlocks();
-  if (vBlocks != 1 && vBlocks != 2 && vBlocks != 4 && vBlocks != 8)
+  if (vBlocks > 8 || !llvm::isPowerOf2_32(vBlocks))
     return op->emitOpError("expecting v_blocks to be 1, 2, 4, or 8");
 
   return success();
@@ -50,6 +50,9 @@ template <typename Op> LogicalResult verifyMatrixInput(Op op) {
 
 LogicalResult verify2DBlockLoadHWRestriction(BlockLoad2dOp op) {
   VectorType resTy = op.getRes().getType();
+  if (!resTy.getElementType().isIntOrFloat())
+    return op.emitOpError()
+           << "expecting result element type to be int or float";
   unsigned resElemTySize = resTy.getElementType().getIntOrFloatBitWidth();
   unsigned resSize = resTy.getNumElements() * resElemTySize;
   unsigned expectedSize = op.getElemSizeInBits() * op.getTileHeight() *
@@ -225,6 +228,8 @@ LogicalResult BlockLoad2dOp::verify() {
     return failure();
 
   VectorType resTy = getRes().getType();
+  if (!resTy.getElementType().isIntOrFloat())
+    return emitOpError() << "expecting result element type to be int of float";
   unsigned resElemTySize = resTy.getElementType().getIntOrFloatBitWidth();
   if (getElemSizeInBits() == 32 || getVnniTransform()) {
     if (resElemTySize != 32)