From 13faa3333b395e8bea8ae45cee32005ea90b2392 Mon Sep 17 00:00:00 2001
From: "Mei, Yijie" <yijie.mei@intel.com>
Date: Tue, 14 May 2024 15:07:57 +0800
Subject: [PATCH 01/19] add cpuruntime dialect

---
 include/gc/Dialect/CMakeLists.txt             |   1 +
 include/gc/Dialect/CPURuntime/CMakeLists.txt  |   2 +
 .../gc/Dialect/CPURuntime/IR/CMakeLists.txt   |   1 +
 .../Dialect/CPURuntime/IR/CPURuntimeDialect.h |  18 ++
 .../CPURuntime/IR/CPURuntimeDialect.td        |  34 ++++
 .../gc/Dialect/CPURuntime/IR/CPURuntimeOps.h  |  26 +++
 .../gc/Dialect/CPURuntime/IR/CPURuntimeOps.td |  72 ++++++++
 .../CPURuntime/Transforms/CMakeLists.txt      |   5 +
 .../CPURuntime/Transforms/CPURuntimePasses.h  |  29 ++++
 .../CPURuntime/Transforms/CPURuntimePasses.td |  57 +++++++
 lib/gc/Dialect/CMakeLists.txt                 |   1 +
 lib/gc/Dialect/CPURuntime/CMakeLists.txt      |   2 +
 lib/gc/Dialect/CPURuntime/IR/CMakeLists.txt   |  16 ++
 .../CPURuntime/IR/CPURuntimeDialect.cpp       |  26 +++
 .../Dialect/CPURuntime/IR/CPURuntimeOps.cpp   |  56 ++++++
 .../CPURuntime/Transforms/CMakeLists.txt      |  16 ++
 .../Transforms/CPURuntimePasses.cpp           |  77 +++++++++
 .../Transforms/CPURuntimeToLLVM.cpp           | 159 ++++++++++++++++++
 lib/gc/Transforms/CMakeLists.txt              |   2 +
 src/CMakeLists.txt                            |   1 +
 src/gc-opt/CMakeLists.txt                     |   2 +-
 src/gc-opt/gc-opt.cpp                         |   5 +-
 .../Dialect/CPURuntime/cpu-runner/printf.mlir |  17 ++
 .../CPURuntime/cpuruntime-atexit-to-omp.mlir  |  41 +++++
 .../CPURuntime/cpuruntime-to-llvm.mlir        |  19 +++
 25 files changed, 683 insertions(+), 2 deletions(-)
 create mode 100644 include/gc/Dialect/CPURuntime/CMakeLists.txt
 create mode 100644 include/gc/Dialect/CPURuntime/IR/CMakeLists.txt
 create mode 100644 include/gc/Dialect/CPURuntime/IR/CPURuntimeDialect.h
 create mode 100644 include/gc/Dialect/CPURuntime/IR/CPURuntimeDialect.td
 create mode 100644 include/gc/Dialect/CPURuntime/IR/CPURuntimeOps.h
 create mode 100644 include/gc/Dialect/CPURuntime/IR/CPURuntimeOps.td
 create mode 100644 include/gc/Dialect/CPURuntime/Transforms/CMakeLists.txt
 create mode 100644 include/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.h
 create mode 100644 include/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.td
 create mode 100644 lib/gc/Dialect/CPURuntime/CMakeLists.txt
 create mode 100644 lib/gc/Dialect/CPURuntime/IR/CMakeLists.txt
 create mode 100644 lib/gc/Dialect/CPURuntime/IR/CPURuntimeDialect.cpp
 create mode 100644 lib/gc/Dialect/CPURuntime/IR/CPURuntimeOps.cpp
 create mode 100644 lib/gc/Dialect/CPURuntime/Transforms/CMakeLists.txt
 create mode 100644 lib/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.cpp
 create mode 100644 lib/gc/Dialect/CPURuntime/Transforms/CPURuntimeToLLVM.cpp
 create mode 100644 test/gc/Dialect/CPURuntime/cpu-runner/printf.mlir
 create mode 100644 test/gc/Dialect/CPURuntime/cpuruntime-atexit-to-omp.mlir
 create mode 100644 test/gc/Dialect/CPURuntime/cpuruntime-to-llvm.mlir

diff --git a/include/gc/Dialect/CMakeLists.txt b/include/gc/Dialect/CMakeLists.txt
index ffeda0aa7..a23f3f9f1 100644
--- a/include/gc/Dialect/CMakeLists.txt
+++ b/include/gc/Dialect/CMakeLists.txt
@@ -1,3 +1,4 @@
+add_subdirectory(CPURuntime)
 add_subdirectory(OnednnGraph)
 add_subdirectory(Microkernel)
 add_subdirectory(Linalgx)
\ No newline at end of file
diff --git a/include/gc/Dialect/CPURuntime/CMakeLists.txt b/include/gc/Dialect/CPURuntime/CMakeLists.txt
new file mode 100644
index 000000000..9f57627c3
--- /dev/null
+++ b/include/gc/Dialect/CPURuntime/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_subdirectory(IR)
+add_subdirectory(Transforms)
diff --git a/include/gc/Dialect/CPURuntime/IR/CMakeLists.txt b/include/gc/Dialect/CPURuntime/IR/CMakeLists.txt
new file mode 100644
index 000000000..fb73ae02b
--- /dev/null
+++ b/include/gc/Dialect/CPURuntime/IR/CMakeLists.txt
@@ -0,0 +1 @@
+add_mlir_dialect(CPURuntimeOps cpuruntime)
diff --git a/include/gc/Dialect/CPURuntime/IR/CPURuntimeDialect.h b/include/gc/Dialect/CPURuntime/IR/CPURuntimeDialect.h
new file mode 100644
index 000000000..757182964
--- /dev/null
+++ b/include/gc/Dialect/CPURuntime/IR/CPURuntimeDialect.h
@@ -0,0 +1,18 @@
+//===- CPURuntimeDialect.h - CPU Runtime dialect ----------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CPURUNTIME_CPURUNTIMEDIALECT_H
+#define CPURUNTIME_CPURUNTIMEDIALECT_H
+
+#include "mlir/Bytecode/BytecodeOpInterface.h"
+#include "mlir/IR/Dialect.h"
+#include "mlir/Interfaces/DestinationStyleOpInterface.h"
+
+#include "gc/Dialect/CPURuntime/IR/CPURuntimeOpsDialect.h.inc"
+
+#endif // CPURUNTIME_CPURUNTIMEDIALECT_H
diff --git a/include/gc/Dialect/CPURuntime/IR/CPURuntimeDialect.td b/include/gc/Dialect/CPURuntime/IR/CPURuntimeDialect.td
new file mode 100644
index 000000000..06f3af526
--- /dev/null
+++ b/include/gc/Dialect/CPURuntime/IR/CPURuntimeDialect.td
@@ -0,0 +1,34 @@
+//===- CPURuntimeDialect.td - CPU Runtime Dialect ---------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CPUPARALLEL_DIALECT
+#define CPUPARALLEL_DIALECT
+
+include "mlir/IR/OpBase.td"
+
+//===----------------------------------------------------------------------===//
+// CPURuntime dialect definition.
+//===----------------------------------------------------------------------===//
+
+def CPURuntime_Dialect : Dialect {
+    let name = "cpuruntime";
+    let summary = "A dialect for CPU parallel primitives.";
+    let description = [{
+        This dialect contains primitives for CPU runtime.
+    }];
+    let cppNamespace = "::mlir::cpuruntime";
+}
+
+//===----------------------------------------------------------------------===//
+// Base cpuruntime operation definition.
+//===----------------------------------------------------------------------===//
+
+class CPURuntime_Op<string mnemonic, list<Trait> traits = []> :
+        Op<CPURuntime_Dialect, mnemonic, traits>;
+
+#endif // CPUPARALLEL_DIALECT
diff --git a/include/gc/Dialect/CPURuntime/IR/CPURuntimeOps.h b/include/gc/Dialect/CPURuntime/IR/CPURuntimeOps.h
new file mode 100644
index 000000000..5ce667a91
--- /dev/null
+++ b/include/gc/Dialect/CPURuntime/IR/CPURuntimeOps.h
@@ -0,0 +1,26 @@
+//===- CPURuntimeOps.h - CPU Runtime Ops ====--------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CPURUNTIME_CPURUNTIMEOPS_H
+#define CPURUNTIME_CPURUNTIMEOPS_H
+
+#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Dialect.h"
+#include "mlir/IR/OpDefinition.h"
+#include "mlir/Interfaces/ControlFlowInterfaces.h"
+#include "mlir/Interfaces/InferTypeOpInterface.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
+
+#define GET_OP_CLASSES
+#include "gc/Dialect/CPURuntime/IR/CPURuntimeOps.h.inc"
+
+#endif // CPURUNTIME_CPURUNTIMEOPS_H
diff --git a/include/gc/Dialect/CPURuntime/IR/CPURuntimeOps.td b/include/gc/Dialect/CPURuntime/IR/CPURuntimeOps.td
new file mode 100644
index 000000000..cc1b7c555
--- /dev/null
+++ b/include/gc/Dialect/CPURuntime/IR/CPURuntimeOps.td
@@ -0,0 +1,72 @@
+//===- CPURuntimeOps.td - CPU Runtime Ops -----------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CPURUNTIME_OPS
+#define CPURUNTIME_OPS
+
+include "gc/Dialect/CPURuntime/IR/CPURuntimeDialect.td"
+include "mlir/Interfaces/InferTypeOpInterface.td"
+include "mlir/Interfaces/SideEffectInterfaces.td"
+include "mlir/Interfaces/DestinationStyleOpInterface.td"
+include "mlir/Interfaces/ControlFlowInterfaces.td"
+include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.td"
+
+
+def CPURuntime_AtParallelExitOp : CPURuntime_Op<"at_parallel_exit", [
+       ParentOneOf<["scf::ForallOp", "scf::ParallelOp", "omp::WsloopOp", "memref::AllocaScopeOp"]>,
+       SingleBlockImplicitTerminator<"ParallelExitReturnOp">
+      ]> {
+  let summary = "Runs the block once in all threads at the exit of the parallel section";
+  let description = [{
+    It executes the block for each thread working in the parallel section for
+    once, at the exit of parallel section.
+  }];
+
+  let regions = (region SizedRegion<1>:$region);
+
+  let hasCustomAssemblyFormat = 1;
+
+  // The default builder does not add a region with an empty body, add our own.
+  let skipDefaultBuilders = 1;
+  let builders = [
+    OpBuilder<(ins)>,
+  ];
+}
+
+def CPURuntime_ParallelExitReturnOp : CPURuntime_Op<"parallel_exit.return", [
+       Pure,
+       HasParent<"AtParallelExitOp">,
+       Terminator, ReturnLike
+      ]> {
+  let summary = "Terminates at_parallel_exit block";
+  let description = [{
+    at_parallel_exit should ends with parallel_exit.return
+  }];
+  let assemblyFormat =
+      [{  attr-dict }];
+}
+
+
+def CPURuntime_PrintfOp : CPURuntime_Op<"printf", [MemoryEffects<[MemWrite]>]>,
+  Arguments<(ins StrAttr:$format,
+                Variadic<AnyTypeOf<[AnyInteger, Index, AnyFloat]>>:$args)> {
+  let summary = "C-style printf";
+  let description = [{
+    `cpuruntime.printf` takes a literal format string `format` and an arbitrary number of
+    scalar arguments that should be printed.
+
+    The format string is a C-style printf string, subject to any restrictions
+    imposed by one's target platform.
+  }];
+  let assemblyFormat = [{
+    $format attr-dict ($args^ `:` type($args))?
+  }];
+}
+
+
+#endif // CPURUNTIME_OPS
diff --git a/include/gc/Dialect/CPURuntime/Transforms/CMakeLists.txt b/include/gc/Dialect/CPURuntime/Transforms/CMakeLists.txt
new file mode 100644
index 000000000..763ffab86
--- /dev/null
+++ b/include/gc/Dialect/CPURuntime/Transforms/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(LLVM_TARGET_DEFINITIONS CPURuntimePasses.td)
+mlir_tablegen(CPURuntimePasses.h.inc --gen-pass-decls -name CPURuntime)
+mlir_tablegen(CPURuntimePasses.capi.h.inc -gen-pass-capi-header --prefix CPURuntime)
+mlir_tablegen(CPURuntimePasses.capi.cpp.inc -gen-pass-capi-impl --prefix CPURuntime)
+add_public_tablegen_target(MLIRCPURuntimePassesIncGen)
diff --git a/include/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.h b/include/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.h
new file mode 100644
index 000000000..8fde8f4fd
--- /dev/null
+++ b/include/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.h
@@ -0,0 +1,29 @@
+//===- CPURuntimePasses.h - CPU Runtime Passes ------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CPURUNTIME_CPURUNTIMEPASSES_H
+#define CPURUNTIME_CPURUNTIMEPASSES_H
+
+#include "gc/Dialect/CPURuntime/IR/CPURuntimeDialect.h"
+#include "gc/Dialect/CPURuntime/IR/CPURuntimeOps.h"
+#include "mlir/Pass/Pass.h"
+#include <memory>
+
+namespace mlir {
+namespace cpuruntime {
+void registerConvertCPURuntimeToLLVMInterface(DialectRegistry &registry);
+
+#define GEN_PASS_DECL
+#include "gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.h.inc"
+
+#define GEN_PASS_REGISTRATION
+#include "gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.h.inc"
+} // namespace cpuruntime
+} // namespace mlir
+
+#endif
diff --git a/include/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.td b/include/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.td
new file mode 100644
index 000000000..0685ce498
--- /dev/null
+++ b/include/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.td
@@ -0,0 +1,57 @@
+//===- CPURuntimePasses.td - CPU Runtime Passes -----------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CPURUNTIME_PASS
+#define CPURUNTIME_PASS
+
+include "mlir/Pass/PassBase.td"
+
+
+def CPURuntimeAtExitToOmp: Pass<"cpuruntime-atexit-to-omp", "::mlir::func::FuncOp"> {
+  let summary = "Lower at_parallel_exit to code in omp.parallel section";
+  let description = [{
+    Switches the name of a FuncOp named `bar` to `foo` and folds.
+    ```
+    omp.parallel {
+      omp.wsloop for (%arg1, %arg2) : index = (%c0, %c0) to (%c1, %c512) step (%c1, %c1) {
+        memref.alloca_scope  {
+          cpuruntime.at_parallel_exit {
+            "your.op"()
+            cpuruntime.parallel_exit.return
+          }
+        }
+        omp.yield
+      }
+      omp.terminator
+    }
+    ```
+    Will be changed into
+    ```
+    omp.parallel {
+      omp.wsloop for  (%arg1, %arg2) : index = (%c0, %c0) to (%c1, %c512) step (%c1, %c1) {
+        memref.alloca_scope  {
+        }
+        omp.yield
+      }
+      "your.op"()
+      omp.terminator
+    }
+    ```
+  }];
+}
+
+
+def CPURuntimeToLLVM: Pass<"convert-cpuruntime-to-llvm"> {
+  let summary = "Convert cpuruntime to LLVM dialect";
+  let description = [{
+    This pass converts supported cpuruntime ops to LLVM dialect instructions.
+  }];
+  let dependentDialects = ["LLVM::LLVMDialect"];
+}
+
+#endif // CPURUNTIME_PASS
diff --git a/lib/gc/Dialect/CMakeLists.txt b/lib/gc/Dialect/CMakeLists.txt
index a880ff2ed..8720bd8e6 100644
--- a/lib/gc/Dialect/CMakeLists.txt
+++ b/lib/gc/Dialect/CMakeLists.txt
@@ -1,3 +1,4 @@
+add_subdirectory(CPURuntime)
 add_subdirectory(Linalgx)
 add_subdirectory(Microkernel)
 add_subdirectory(OnednnGraph)
diff --git a/lib/gc/Dialect/CPURuntime/CMakeLists.txt b/lib/gc/Dialect/CPURuntime/CMakeLists.txt
new file mode 100644
index 000000000..9f57627c3
--- /dev/null
+++ b/lib/gc/Dialect/CPURuntime/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_subdirectory(IR)
+add_subdirectory(Transforms)
diff --git a/lib/gc/Dialect/CPURuntime/IR/CMakeLists.txt b/lib/gc/Dialect/CPURuntime/IR/CMakeLists.txt
new file mode 100644
index 000000000..e349da72c
--- /dev/null
+++ b/lib/gc/Dialect/CPURuntime/IR/CMakeLists.txt
@@ -0,0 +1,16 @@
+add_mlir_dialect_library(MLIRCPURuntimeDialect
+        CPURuntimeDialect.cpp
+        CPURuntimeOps.cpp
+
+        ADDITIONAL_HEADER_DIRS
+        ${PROJECT_SOURCE_DIR}/include/
+
+        DEPENDS
+        MLIRCPURuntimeOpsIncGen
+        MLIRCPURuntimePassesIncGen
+
+        LINK_LIBS PUBLIC
+        MLIR
+        # MLIRInferTypeOpInterface
+        # MLIRFuncDialect
+        )
diff --git a/lib/gc/Dialect/CPURuntime/IR/CPURuntimeDialect.cpp b/lib/gc/Dialect/CPURuntime/IR/CPURuntimeDialect.cpp
new file mode 100644
index 000000000..9f3e97b57
--- /dev/null
+++ b/lib/gc/Dialect/CPURuntime/IR/CPURuntimeDialect.cpp
@@ -0,0 +1,26 @@
+//===- CPURuntimeDialect.cpp - CPU Runtime Dialect --------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "gc/Dialect/CPURuntime/IR/CPURuntimeDialect.h"
+#include "gc/Dialect/CPURuntime/IR/CPURuntimeOps.h"
+
+using namespace mlir;
+using namespace mlir::cpuruntime;
+
+#include "gc/Dialect/CPURuntime/IR/CPURuntimeOpsDialect.cpp.inc"
+
+//===----------------------------------------------------------------------===//
+// CPURuntime dialect.
+//===----------------------------------------------------------------------===//
+
+void CPURuntimeDialect::initialize() {
+  addOperations<
+#define GET_OP_LIST
+#include "gc/Dialect/CPURuntime/IR/CPURuntimeOps.cpp.inc"
+      >();
+}
diff --git a/lib/gc/Dialect/CPURuntime/IR/CPURuntimeOps.cpp b/lib/gc/Dialect/CPURuntime/IR/CPURuntimeOps.cpp
new file mode 100644
index 000000000..ca632e9db
--- /dev/null
+++ b/lib/gc/Dialect/CPURuntime/IR/CPURuntimeOps.cpp
@@ -0,0 +1,56 @@
+//===- CPURuntimeOps.cpp - CPU Runtime Ops ----------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "gc/Dialect/CPURuntime/IR/CPURuntimeOps.h"
+#include "gc/Dialect/CPURuntime/IR/CPURuntimeDialect.h"
+
+#define GET_OP_CLASSES
+#include "gc/Dialect/CPURuntime/IR/CPURuntimeOps.cpp.inc"
+
+#include <llvm/Support/Debug.h>
+
+namespace mlir {
+using namespace bufferization;
+
+namespace cpuruntime {
+
+void AtParallelExitOp::build(OpBuilder &b, OperationState &result) {
+  OpBuilder::InsertionGuard g(b);
+  Region *bodyRegion = result.addRegion();
+  b.createBlock(bodyRegion);
+}
+
+void AtParallelExitOp::print(OpAsmPrinter &p) {
+  p << " ";
+  p.printRegion(getRegion(),
+                /*printEntryBlockArgs=*/false,
+                /*printBlockTerminators=*/true);
+  p.printOptionalAttrDict(getOperation()->getAttrs());
+}
+
+ParseResult AtParallelExitOp::parse(OpAsmParser &parser,
+                                    OperationState &result) {
+  auto &builder = parser.getBuilder();
+
+  SmallVector<OpAsmParser::Argument, 8> regionOperands;
+  std::unique_ptr<Region> region = std::make_unique<Region>();
+  if (parser.parseRegion(*region, regionOperands))
+    return failure();
+
+  if (region->empty())
+    OpBuilder(builder.getContext()).createBlock(region.get());
+  result.addRegion(std::move(region));
+
+  // Parse the optional attribute list.
+  if (parser.parseOptionalAttrDict(result.attributes))
+    return failure();
+  return success();
+}
+
+} // namespace cpuruntime
+} // namespace mlir
\ No newline at end of file
diff --git a/lib/gc/Dialect/CPURuntime/Transforms/CMakeLists.txt b/lib/gc/Dialect/CPURuntime/Transforms/CMakeLists.txt
new file mode 100644
index 000000000..ee6148aa4
--- /dev/null
+++ b/lib/gc/Dialect/CPURuntime/Transforms/CMakeLists.txt
@@ -0,0 +1,16 @@
+add_mlir_dialect_library(MLIRCPURuntimeTransforms
+        CPURuntimePasses.cpp
+        CPURuntimeToLLVM.cpp
+
+        ADDITIONAL_HEADER_DIRS
+        ${PROJECT_SOURCE_DIR}/include/
+
+        DEPENDS
+        MLIRCPURuntimePassesIncGen
+
+        LINK_LIBS PUBLIC
+        MLIRFuncDialect
+        MLIRCPURuntimeDialect
+        )
+
+set_property(GLOBAL APPEND PROPERTY GC_PASS_LIBS MLIRCPURuntimeTransforms)
\ No newline at end of file
diff --git a/lib/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.cpp b/lib/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.cpp
new file mode 100644
index 000000000..f2a098fcf
--- /dev/null
+++ b/lib/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.cpp
@@ -0,0 +1,77 @@
+//===- CPURuntimePasses.cpp - CPU Runtime Passes ----------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Rewrite/FrozenRewritePatternSet.h"
+#include "mlir/Support/LogicalResult.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+#include "gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.h"
+
+namespace mlir::cpuruntime {
+#define GEN_PASS_DEF_CPURUNTIMEATEXITTOOMP
+#include "gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.h.inc"
+
+namespace {
+
+class CPURuntimeAtExitToOmpRewriter
+    : public OpRewritePattern<AtParallelExitOp> {
+public:
+  using OpRewritePattern<AtParallelExitOp>::OpRewritePattern;
+  LogicalResult matchAndRewrite(AtParallelExitOp op,
+                                PatternRewriter &rewriter) const final {
+    auto parent = op->getParentOp();
+    Operation *secondLast = nullptr;
+    while (parent && (llvm::isa<memref::AllocaScopeOp>(parent) ||
+                      llvm::isa<omp::WsloopOp>(parent))) {
+      secondLast = parent;
+      parent = parent->getParentOp();
+    }
+    auto parallel = llvm::dyn_cast<omp::ParallelOp>(parent);
+    if (!parallel) {
+      return failure();
+    }
+    assert(secondLast->getBlock());
+    auto itr = secondLast->getBlock()->end();
+    --itr;
+    rewriter.inlineBlockBefore(&op->getRegion(0).getBlocks().front(),
+                               secondLast->getBlock(), itr);
+    rewriter.eraseOp(op);
+    return success();
+  }
+};
+
+class CPURuntimeExitReturnRewriter
+    : public OpRewritePattern<ParallelExitReturnOp> {
+public:
+  using OpRewritePattern<ParallelExitReturnOp>::OpRewritePattern;
+  LogicalResult matchAndRewrite(ParallelExitReturnOp op,
+                                PatternRewriter &rewriter) const final {
+    rewriter.eraseOp(op);
+    return success();
+  }
+};
+
+class CPURuntimeAtExitToOmp
+    : public impl::CPURuntimeAtExitToOmpBase<CPURuntimeAtExitToOmp> {
+public:
+  using impl::CPURuntimeAtExitToOmpBase<
+      CPURuntimeAtExitToOmp>::CPURuntimeAtExitToOmpBase;
+  void runOnOperation() final {
+    RewritePatternSet patterns(&getContext());
+    patterns.add<CPURuntimeExitReturnRewriter>(&getContext());
+    patterns.add<CPURuntimeAtExitToOmpRewriter>(&getContext());
+    FrozenRewritePatternSet patternSet(std::move(patterns));
+    if (failed(applyPatternsAndFoldGreedily(getOperation(), patternSet)))
+      signalPassFailure();
+  }
+};
+
+} // namespace
+} // namespace mlir::cpuruntime
diff --git a/lib/gc/Dialect/CPURuntime/Transforms/CPURuntimeToLLVM.cpp b/lib/gc/Dialect/CPURuntime/Transforms/CPURuntimeToLLVM.cpp
new file mode 100644
index 000000000..73cf14a84
--- /dev/null
+++ b/lib/gc/Dialect/CPURuntime/Transforms/CPURuntimeToLLVM.cpp
@@ -0,0 +1,159 @@
+//===- CPURuntimeToLLVM.cpp - CPU Runtime To LLVM ---------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Conversion/ArithCommon/AttrToLLVMConverter.h"
+#include "mlir/Conversion/ConvertToLLVM/ToLLVMInterface.h"
+#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
+#include "mlir/Conversion/LLVMCommon/VectorPattern.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/TypeUtilities.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Rewrite/FrozenRewritePatternSet.h"
+#include "mlir/Support/LogicalResult.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+#include "gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.h"
+
+namespace mlir::cpuruntime {
+
+void populateCPURuntimeToLLVMConversionPatterns(LLVMTypeConverter &converter,
+                                                 RewritePatternSet &patterns);
+
+#define GEN_PASS_DEF_CPURUNTIMETOLLVM
+#include "gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.h.inc"
+
+namespace {
+static const char formatStringPrefix[] = "cpuprintfFormat_";
+
+static LLVM::LLVMFuncOp getOrDefineFunction(ModuleOp &moduleOp,
+                                            const Location loc,
+                                            ConversionPatternRewriter &rewriter,
+                                            StringRef name,
+                                            LLVM::LLVMFunctionType type) {
+  LLVM::LLVMFuncOp ret;
+  if (!(ret = moduleOp.template lookupSymbol<LLVM::LLVMFuncOp>(name))) {
+    ConversionPatternRewriter::InsertionGuard guard(rewriter);
+    rewriter.setInsertionPointToStart(moduleOp.getBody());
+    ret = rewriter.create<LLVM::LLVMFuncOp>(loc, name, type,
+                                            LLVM::Linkage::External);
+  }
+  return ret;
+}
+
+
+class PrintfRewriter : public ConvertOpToLLVMPattern<PrintfOp> {
+public:
+  using ConvertOpToLLVMPattern<PrintfOp>::ConvertOpToLLVMPattern;
+  LogicalResult
+  matchAndRewrite(PrintfOp op, PrintfOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const final {
+    auto moduleOp = op->getParentOfType<ModuleOp>();
+    auto loc = op->getLoc();
+    mlir::Type llvmI32 = typeConverter->convertType(rewriter.getI32Type());
+    mlir::Type llvmI64 = typeConverter->convertType(rewriter.getI64Type());
+    mlir::Type llvmI8 = typeConverter->convertType(rewriter.getI8Type());
+    mlir::Type i8Ptr = LLVM::LLVMPointerType::get(op.getContext());
+    auto printfFunc = getOrDefineFunction(
+        moduleOp, loc, rewriter, "printf",
+        LLVM::LLVMFunctionType::get(llvmI32, {i8Ptr}, /*isVarArg*/ true));
+
+    unsigned stringNumber = 0;
+    SmallString<16> stringConstName;
+    do {
+      stringConstName.clear();
+      (formatStringPrefix + Twine(stringNumber++)).toStringRef(stringConstName);
+    } while (moduleOp.lookupSymbol(stringConstName));
+
+    llvm::SmallString<20> formatString(adaptor.getFormat());
+    formatString.push_back('\0'); // Null terminate for C
+    size_t formatStringSize = formatString.size_in_bytes();
+
+    auto globalType = LLVM::LLVMArrayType::get(llvmI8, formatStringSize);
+    LLVM::GlobalOp global;
+    {
+      ConversionPatternRewriter::InsertionGuard guard(rewriter);
+      rewriter.setInsertionPointToStart(moduleOp.getBody());
+      global = rewriter.create<LLVM::GlobalOp>(
+          loc, globalType,
+          /*isConstant=*/true, LLVM::Linkage::Internal, stringConstName,
+          rewriter.getStringAttr(formatString));
+    }
+    Value globalPtr = rewriter.create<LLVM::AddressOfOp>(
+        loc,
+        LLVM::LLVMPointerType::get(rewriter.getContext(),
+                                   global.getAddrSpace()),
+        global.getSymNameAttr());
+    Value stringStart = rewriter.create<LLVM::GEPOp>(
+        loc, i8Ptr, globalType, globalPtr, ArrayRef<LLVM::GEPArg>{0, 0});
+    SmallVector<Value, 5> appendFormatArgs = {stringStart};
+    for (auto arg : adaptor.getArgs()) {
+      if (auto floatType = dyn_cast<FloatType>(arg.getType())) {
+        if (!floatType.isF64())
+          arg = rewriter.create<LLVM::FPExtOp>(
+              loc, typeConverter->convertType(rewriter.getF64Type()), arg);
+      }
+      if (arg.getType().getIntOrFloatBitWidth() != 64)
+        arg = rewriter.create<LLVM::ZExtOp>(loc, llvmI64, arg);
+      appendFormatArgs.push_back(arg);
+    }
+    rewriter.create<LLVM::CallOp>(loc, printfFunc, appendFormatArgs);
+    rewriter.eraseOp(op);
+    return success();
+  }
+};
+
+class CPURuntimeToLLVM
+    : public impl::CPURuntimeToLLVMBase<CPURuntimeToLLVM> {
+public:
+  using Base::Base;
+  void runOnOperation() final {
+    LLVMConversionTarget target(getContext());
+    RewritePatternSet patterns(&getContext());
+    LowerToLLVMOptions options(&getContext());
+    LLVMTypeConverter converter(&getContext(), options);
+    populateCPURuntimeToLLVMConversionPatterns(converter, patterns);
+
+    if (failed(applyPartialConversion(getOperation(), target,
+                                      std::move(patterns))))
+      signalPassFailure();
+  }
+};
+
+/// Implement the interface to convert MemRef to LLVM.
+struct CPURuntimeToDialectInterface : public ConvertToLLVMPatternInterface {
+  using ConvertToLLVMPatternInterface::ConvertToLLVMPatternInterface;
+  void loadDependentDialects(MLIRContext *context) const final {
+    context->loadDialect<LLVM::LLVMDialect>();
+  }
+
+  /// Hook for derived dialect interface to provide conversion patterns
+  /// and mark dialect legal for the conversion target.
+  void populateConvertToLLVMConversionPatterns(
+      ConversionTarget &target, LLVMTypeConverter &typeConverter,
+      RewritePatternSet &patterns) const final {
+    populateCPURuntimeToLLVMConversionPatterns(typeConverter, patterns);
+  }
+};
+
+} // namespace
+
+void populateCPURuntimeToLLVMConversionPatterns(LLVMTypeConverter &converter,
+                                                 RewritePatternSet &patterns) {
+  patterns.add<PrintfRewriter>(converter);
+}
+
+void registerConvertCPURuntimeToLLVMInterface(DialectRegistry &registry) {
+  registry.addExtension(+[](MLIRContext *ctx, arith::ArithDialect *dialect) {
+    dialect->addInterfaces<CPURuntimeToDialectInterface>();
+  });
+}
+
+} // namespace mlir::cpuruntime
diff --git a/lib/gc/Transforms/CMakeLists.txt b/lib/gc/Transforms/CMakeLists.txt
index df8a14d01..e87106946 100644
--- a/lib/gc/Transforms/CMakeLists.txt
+++ b/lib/gc/Transforms/CMakeLists.txt
@@ -14,3 +14,5 @@ add_mlir_library(GCPasses
     MLIRBufferizationToMemRef
     MLIRBufferizationPipelines
   )
+
+set_property(GLOBAL APPEND PROPERTY GC_PASS_LIBS GCPasses)
\ No newline at end of file
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index f6298c270..e71fe30a0 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,5 +1,6 @@
 get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
 get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS)
+get_property(gc_pass_libs GLOBAL PROPERTY GC_PASS_LIBS)
 
 add_subdirectory(dnnl)
 add_subdirectory(gc-cpu-runner)
diff --git a/src/gc-opt/CMakeLists.txt b/src/gc-opt/CMakeLists.txt
index ff33375de..0deb242a0 100644
--- a/src/gc-opt/CMakeLists.txt
+++ b/src/gc-opt/CMakeLists.txt
@@ -2,7 +2,7 @@ set(gc_opt_libs
         ${dialect_libs}
         ${conversion_libs} 
         MLIROptLib 
-        GCPasses)
+        ${gc_pass_libs})
 if(GC_MLIR_CXX_FLAGS)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GC_MLIR_CXX_FLAGS}")
 endif()
diff --git a/src/gc-opt/gc-opt.cpp b/src/gc-opt/gc-opt.cpp
index 72a25abf5..e1996c050 100644
--- a/src/gc-opt/gc-opt.cpp
+++ b/src/gc-opt/gc-opt.cpp
@@ -18,6 +18,7 @@
  */
 
 #include "gc/Transforms/Passes.h"
+#include "gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.h"
 #include "mlir/InitAllDialects.h"
 #include "mlir/InitAllPasses.h"
 #include "mlir/Tools/mlir-opt/MlirOptMain.h"
@@ -25,9 +26,11 @@
 int main(int argc, char *argv[]) {
   mlir::registerAllPasses();
   mlir::gc::registerGraphCompilerPasses();
-
+  mlir::cpuruntime::registerCPURuntimePasses();
   mlir::DialectRegistry registry;
+  registry.insert<mlir::cpuruntime::CPURuntimeDialect>();
   mlir::registerAllDialects(registry);
+  mlir::cpuruntime::registerConvertCPURuntimeToLLVMInterface(registry);
   return mlir::asMainReturnCode(mlir::MlirOptMain(
       argc, argv, "Graph Compiler modular optimizer driver\n", registry));
 }
diff --git a/test/gc/Dialect/CPURuntime/cpu-runner/printf.mlir b/test/gc/Dialect/CPURuntime/cpu-runner/printf.mlir
new file mode 100644
index 000000000..e95471d50
--- /dev/null
+++ b/test/gc/Dialect/CPURuntime/cpu-runner/printf.mlir
@@ -0,0 +1,17 @@
+// RUN: gc-opt %s --convert-cpuruntime-to-llvm --convert-func-to-llvm --convert-arith-to-llvm --convert-cf-to-llvm --convert-complex-to-llvm | gc-cpu-runner -e main -entry-point-result=void -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils | FileCheck %s
+
+module {
+  func.func @doprint(%t: f32, %t2: i32, %t3: i64) {
+    cpuruntime.printf "Hello world %f %d %lld\n" %t, %t2, %t3 : f32, i32, i64
+    return
+  }
+
+  func.func @main() {
+    %c2 = arith.constant 2.0 : f32
+    %c32i = arith.constant 2000000 : i32
+    %c64i = arith.constant 2000000 : i64
+    call @doprint(%c2, %c32i, %c64i) : (f32, i32, i64) -> ()
+    return
+  }
+  // CHECK: Hello world 2.000000 2000000 2000000
+}
\ No newline at end of file
diff --git a/test/gc/Dialect/CPURuntime/cpuruntime-atexit-to-omp.mlir b/test/gc/Dialect/CPURuntime/cpuruntime-atexit-to-omp.mlir
new file mode 100644
index 000000000..401de95cc
--- /dev/null
+++ b/test/gc/Dialect/CPURuntime/cpuruntime-atexit-to-omp.mlir
@@ -0,0 +1,41 @@
+// RUN: gc-opt %s --cpuruntime-atexit-to-omp | FileCheck %s
+
+module {
+  func.func @parallel_insert_slice(%arg0: memref<512x512xf32>) -> memref<512x512xf32> {
+    %cst = arith.constant 0.000000e+00 : f32
+    %alloc = memref.alloc() {alignment = 64 : i64} : memref<512x512xf32>
+    %c512 = arith.constant 512 : index
+    %c1 = arith.constant 1 : index
+    %c0 = arith.constant 0 : index
+    memref.copy %arg0, %alloc : memref<512x512xf32> to memref<512x512xf32>
+    %0 = llvm.mlir.constant(1 : i64) : i64
+    omp.parallel {
+      omp.wsloop for  (%arg1, %arg2) : index = (%c0, %c0) to (%c1, %c512) step (%c1, %c1) {
+        memref.alloca_scope  {
+          %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<512xf32>
+          %subview = memref.subview %alloc[%arg1, 0] [1, 512] [1, 1] : memref<512x512xf32> to memref<512xf32, strided<[1], offset: ?>>
+          memref.copy %alloc_0, %subview : memref<512xf32> to memref<512xf32, strided<[1], offset: ?>>
+          memref.dealloc %alloc_0 : memref<512xf32>
+          cpuruntime.at_parallel_exit {
+            memref.prefetch %alloc[%c1,%c0], read, locality<3>, data  : memref<512x512xf32>
+            cpuruntime.parallel_exit.return
+          }
+        }
+        omp.yield
+      }
+      memref.prefetch %alloc[%c0,%c0], read, locality<3>, data  : memref<512x512xf32>
+      omp.terminator
+    }
+    // CHECK-DAG:    %[[C1:.*]] = arith.constant 1
+    // CHECK-DAG:    %[[C0:.*]] = arith.constant 0
+    // CHECK:        omp.parallel
+    // CHECK-NEXT:      omp.wsloop
+    // CHECK-NEXT:         memref.alloca_scope
+    // CHECK-NOT:              cpuruntime.at_parallel_exit
+    // CHECK:              omp.yield
+    // CHECK:           memref.prefetch {{%alloc}}[%[[C0]], %[[C0]]]
+    // CHECK-NEXT:      memref.prefetch {{%alloc}}[%[[C1]], %[[C0]]]
+    // CHECK-NEXT:      omp.terminator
+    return %alloc : memref<512x512xf32>
+  }
+}
diff --git a/test/gc/Dialect/CPURuntime/cpuruntime-to-llvm.mlir b/test/gc/Dialect/CPURuntime/cpuruntime-to-llvm.mlir
new file mode 100644
index 000000000..fb8d748f1
--- /dev/null
+++ b/test/gc/Dialect/CPURuntime/cpuruntime-to-llvm.mlir
@@ -0,0 +1,19 @@
+// RUN: gc-opt %s --convert-cpuruntime-to-llvm | FileCheck %s
+
+module {
+  // CHECK: llvm.mlir.global internal constant @cpuprintfFormat_0("Hello world %f %d %lld\0A\00") {addr_space = 0 : i32}
+  // CHECK: llvm.func @printf(!llvm.ptr,
+  // CHECK-NEXT: func.func @doprint(%[[ARG0:.*]]: f32, %[[ARG1:.*]]: i32, %[[ARG2:.*]]: i64)
+  func.func @doprint(%t: f32, %t2: i32, %t3: i64) {
+    // CHECK-NEXT: llvm.mlir.addressof
+    // CHECK-DAG: %[[C1:.*]] = llvm.getelementptr
+    // CHECK-SAME: !llvm.ptr, !llvm.array<24 x i8>
+    // CHECK: %[[C2:.*]] = llvm.fpext %[[ARG0]] 
+    // CHECK: %[[C3:.*]] = llvm.zext %[[ARG1]] 
+    // CHECK-NOT: cpuruntime.printf
+    // CHECK-NEXT: llvm.call @printf(%[[C1]], %[[C2]], %[[C3]], %[[ARG2]])
+    cpuruntime.printf "Hello world %f %d %lld\n" %t, %t2, %t3 : f32, i32, i64
+    return
+  }
+
+}
\ No newline at end of file

From 161848e3e442ac1f6007de5c664e3b70f5a20b02 Mon Sep 17 00:00:00 2001
From: "Mei, Yijie" <yijie.mei@intel.com>
Date: Tue, 14 May 2024 15:11:57 +0800
Subject: [PATCH 02/19] format

---
 lib/gc/Dialect/CPURuntime/Transforms/CPURuntimeToLLVM.cpp | 8 +++-----
 src/gc-opt/gc-opt.cpp                                     | 2 +-
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/lib/gc/Dialect/CPURuntime/Transforms/CPURuntimeToLLVM.cpp b/lib/gc/Dialect/CPURuntime/Transforms/CPURuntimeToLLVM.cpp
index 73cf14a84..c56621d45 100644
--- a/lib/gc/Dialect/CPURuntime/Transforms/CPURuntimeToLLVM.cpp
+++ b/lib/gc/Dialect/CPURuntime/Transforms/CPURuntimeToLLVM.cpp
@@ -25,7 +25,7 @@
 namespace mlir::cpuruntime {
 
 void populateCPURuntimeToLLVMConversionPatterns(LLVMTypeConverter &converter,
-                                                 RewritePatternSet &patterns);
+                                                RewritePatternSet &patterns);
 
 #define GEN_PASS_DEF_CPURUNTIMETOLLVM
 #include "gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.h.inc"
@@ -48,7 +48,6 @@ static LLVM::LLVMFuncOp getOrDefineFunction(ModuleOp &moduleOp,
   return ret;
 }
 
-
 class PrintfRewriter : public ConvertOpToLLVMPattern<PrintfOp> {
 public:
   using ConvertOpToLLVMPattern<PrintfOp>::ConvertOpToLLVMPattern;
@@ -110,8 +109,7 @@ class PrintfRewriter : public ConvertOpToLLVMPattern<PrintfOp> {
   }
 };
 
-class CPURuntimeToLLVM
-    : public impl::CPURuntimeToLLVMBase<CPURuntimeToLLVM> {
+class CPURuntimeToLLVM : public impl::CPURuntimeToLLVMBase<CPURuntimeToLLVM> {
 public:
   using Base::Base;
   void runOnOperation() final {
@@ -146,7 +144,7 @@ struct CPURuntimeToDialectInterface : public ConvertToLLVMPatternInterface {
 } // namespace
 
 void populateCPURuntimeToLLVMConversionPatterns(LLVMTypeConverter &converter,
-                                                 RewritePatternSet &patterns) {
+                                                RewritePatternSet &patterns) {
   patterns.add<PrintfRewriter>(converter);
 }
 
diff --git a/src/gc-opt/gc-opt.cpp b/src/gc-opt/gc-opt.cpp
index e1996c050..9b06ecf1f 100644
--- a/src/gc-opt/gc-opt.cpp
+++ b/src/gc-opt/gc-opt.cpp
@@ -17,8 +17,8 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
-#include "gc/Transforms/Passes.h"
 #include "gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.h"
+#include "gc/Transforms/Passes.h"
 #include "mlir/InitAllDialects.h"
 #include "mlir/InitAllPasses.h"
 #include "mlir/Tools/mlir-opt/MlirOptMain.h"

From 447ef129fdbd73e15533c1bbc9a8f1e6f1273413 Mon Sep 17 00:00:00 2001
From: "Mei, Yijie" <yijie.mei@intel.com>
Date: Tue, 14 May 2024 15:18:43 +0800
Subject: [PATCH 03/19] add dependency

---
 lib/gc/Dialect/CPURuntime/IR/CMakeLists.txt | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/lib/gc/Dialect/CPURuntime/IR/CMakeLists.txt b/lib/gc/Dialect/CPURuntime/IR/CMakeLists.txt
index e349da72c..3a1d63d3d 100644
--- a/lib/gc/Dialect/CPURuntime/IR/CMakeLists.txt
+++ b/lib/gc/Dialect/CPURuntime/IR/CMakeLists.txt
@@ -10,7 +10,5 @@ add_mlir_dialect_library(MLIRCPURuntimeDialect
         MLIRCPURuntimePassesIncGen
 
         LINK_LIBS PUBLIC
-        MLIR
-        # MLIRInferTypeOpInterface
-        # MLIRFuncDialect
+        MLIRFuncDialect
         )

From a73dcc12e1023de4b59d4303835c2489b9f955b7 Mon Sep 17 00:00:00 2001
From: "Mei, Yijie" <yijie.mei@intel.com>
Date: Tue, 14 May 2024 16:03:52 +0800
Subject: [PATCH 04/19] fix new MLIR

---
 .../Transforms/CPURuntimePasses.cpp           | 19 +++++++-------
 .../CPURuntime/cpuruntime-atexit-to-omp.mlir  | 25 +++++++++++--------
 2 files changed, 24 insertions(+), 20 deletions(-)

diff --git a/lib/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.cpp b/lib/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.cpp
index f2a098fcf..a8f74c079 100644
--- a/lib/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.cpp
+++ b/lib/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.cpp
@@ -27,21 +27,22 @@ class CPURuntimeAtExitToOmpRewriter
   LogicalResult matchAndRewrite(AtParallelExitOp op,
                                 PatternRewriter &rewriter) const final {
     auto parent = op->getParentOp();
-    Operation *secondLast = nullptr;
-    while (parent && (llvm::isa<memref::AllocaScopeOp>(parent) ||
-                      llvm::isa<omp::WsloopOp>(parent))) {
-      secondLast = parent;
+    omp::ParallelOp parallel;
+    while (parent) {
+      parallel = llvm::dyn_cast<omp::ParallelOp>(parent);
+      if (parallel) {
+        break;
+      }
       parent = parent->getParentOp();
     }
-    auto parallel = llvm::dyn_cast<omp::ParallelOp>(parent);
     if (!parallel) {
       return failure();
     }
-    assert(secondLast->getBlock());
-    auto itr = secondLast->getBlock()->end();
+    auto &block = parallel.getRegion().front();
+    auto itr = block.end();
     --itr;
-    rewriter.inlineBlockBefore(&op->getRegion(0).getBlocks().front(),
-                               secondLast->getBlock(), itr);
+    rewriter.inlineBlockBefore(&op->getRegion(0).getBlocks().front(), &block,
+                               itr);
     rewriter.eraseOp(op);
     return success();
   }
diff --git a/test/gc/Dialect/CPURuntime/cpuruntime-atexit-to-omp.mlir b/test/gc/Dialect/CPURuntime/cpuruntime-atexit-to-omp.mlir
index 401de95cc..172777690 100644
--- a/test/gc/Dialect/CPURuntime/cpuruntime-atexit-to-omp.mlir
+++ b/test/gc/Dialect/CPURuntime/cpuruntime-atexit-to-omp.mlir
@@ -10,18 +10,21 @@ module {
     memref.copy %arg0, %alloc : memref<512x512xf32> to memref<512x512xf32>
     %0 = llvm.mlir.constant(1 : i64) : i64
     omp.parallel {
-      omp.wsloop for  (%arg1, %arg2) : index = (%c0, %c0) to (%c1, %c512) step (%c1, %c1) {
-        memref.alloca_scope  {
-          %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<512xf32>
-          %subview = memref.subview %alloc[%arg1, 0] [1, 512] [1, 1] : memref<512x512xf32> to memref<512xf32, strided<[1], offset: ?>>
-          memref.copy %alloc_0, %subview : memref<512xf32> to memref<512xf32, strided<[1], offset: ?>>
-          memref.dealloc %alloc_0 : memref<512xf32>
-          cpuruntime.at_parallel_exit {
-            memref.prefetch %alloc[%c1,%c0], read, locality<3>, data  : memref<512x512xf32>
-            cpuruntime.parallel_exit.return
+      omp.wsloop {
+        omp.loop_nest (%arg1, %arg2) : index = (%c0, %c0) to (%c1, %c512) step (%c1, %c1) {
+          memref.alloca_scope  {
+            %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<512xf32>
+            %subview = memref.subview %alloc[%arg1, 0] [1, 512] [1, 1] : memref<512x512xf32> to memref<512xf32, strided<[1], offset: ?>>
+            memref.copy %alloc_0, %subview : memref<512xf32> to memref<512xf32, strided<[1], offset: ?>>
+            memref.dealloc %alloc_0 : memref<512xf32>
+            cpuruntime.at_parallel_exit {
+              memref.prefetch %alloc[%c1,%c0], read, locality<3>, data  : memref<512x512xf32>
+              cpuruntime.parallel_exit.return
+            }
           }
+          omp.yield
         }
-        omp.yield
+        omp.terminator
       }
       memref.prefetch %alloc[%c0,%c0], read, locality<3>, data  : memref<512x512xf32>
       omp.terminator
@@ -30,7 +33,7 @@ module {
     // CHECK-DAG:    %[[C0:.*]] = arith.constant 0
     // CHECK:        omp.parallel
     // CHECK-NEXT:      omp.wsloop
-    // CHECK-NEXT:         memref.alloca_scope
+    // CHECK:         memref.alloca_scope
     // CHECK-NOT:              cpuruntime.at_parallel_exit
     // CHECK:              omp.yield
     // CHECK:           memref.prefetch {{%alloc}}[%[[C0]], %[[C0]]]

From 1cfede8e24231618e69ef692c3b1a0120d5a8857 Mon Sep 17 00:00:00 2001
From: "Mei, Yijie" <yijie.mei@intel.com>
Date: Wed, 15 May 2024 15:24:41 +0800
Subject: [PATCH 05/19] add

---
 include/gc/Transforms/Passes.h               |  26 +++
 include/gc/Transforms/Passes.td              |  13 ++
 lib/gc/Transforms/CMakeLists.txt             |   1 +
 lib/gc/Transforms/Pipeline.cpp               | 164 +++++++++++++++++++
 test/gc/Transforms/Pipeline/run.mlir         |  23 +++
 test/gc/Transforms/Pipeline/tensor_args.mlir |  13 ++
 6 files changed, 240 insertions(+)
 create mode 100644 lib/gc/Transforms/Pipeline.cpp
 create mode 100644 test/gc/Transforms/Pipeline/run.mlir
 create mode 100644 test/gc/Transforms/Pipeline/tensor_args.mlir

diff --git a/include/gc/Transforms/Passes.h b/include/gc/Transforms/Passes.h
index 243a6f4f6..e5e4aee33 100644
--- a/include/gc/Transforms/Passes.h
+++ b/include/gc/Transforms/Passes.h
@@ -12,8 +12,34 @@
 #include "mlir/Pass/Pass.h"
 
 namespace mlir {
+
+namespace LLVM {
+class LLVMDialect;
+}
+
+namespace scf {
+class SCFDialect;
+}
+
+namespace openmp {
+class OpenMPDialect;
+}
+
+namespace linalg {
+class LinalgDialect;
+}
+
+namespace MemRef {
+class MemRefDialect;
+}
+
+class PassManager;
+
 namespace gc {
 
+void populateFrontendPasses(mlir::PassManager &);
+void populateCPUPipeline(mlir::PassManager &);
+
 #define GEN_PASS_DECL
 #include "gc/Transforms/Passes.h.inc"
 
diff --git a/include/gc/Transforms/Passes.td b/include/gc/Transforms/Passes.td
index d31baa5a7..ff0cd8a90 100644
--- a/include/gc/Transforms/Passes.td
+++ b/include/gc/Transforms/Passes.td
@@ -31,4 +31,17 @@ def ConvertOneDNNGraphToLinalg : Pass<"convert-onednn-graph-to-linalg"> {
   ];
 }
 
+def GCCPUPipeline: Pass<"gc-cpu-pipeline"> {
+  let summary = "All-in-one pipeline for GC for CPU";
+  let dependentDialects = ["onednn_graph::OneDNNGraphDialect",
+      "tensor::TensorDialect",
+      "memref::MemRefDialect",
+      "linalg::LinalgDialect",
+      "LLVM::LLVMDialect",
+      "scf::SCFDialect",
+      "bufferization::BufferizationDialect",
+      "omp::OpenMPDialect",
+      "vector::VectorDialect"];
+}
+
 #endif // GC_DIALECT_GC_PASSES
diff --git a/lib/gc/Transforms/CMakeLists.txt b/lib/gc/Transforms/CMakeLists.txt
index e7e97ea26..f3fa43e04 100644
--- a/lib/gc/Transforms/CMakeLists.txt
+++ b/lib/gc/Transforms/CMakeLists.txt
@@ -6,6 +6,7 @@ gc_set_mlir_link_components(MLIR_LINK_COMPONENTS
 
 add_mlir_library(GCPasses
   OneDNNGraphToLinalg.cpp
+  Pipeline.cpp
   TileNamed.cpp
 
   ADDITIONAL_HEADER_DIRS
diff --git a/lib/gc/Transforms/Pipeline.cpp b/lib/gc/Transforms/Pipeline.cpp
new file mode 100644
index 000000000..ed50925f6
--- /dev/null
+++ b/lib/gc/Transforms/Pipeline.cpp
@@ -0,0 +1,164 @@
+//===- Pipeline.cpp - Graph Compiler all-in-one pipeline --------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Conversion/Passes.h"
+#include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h"
+#include "mlir/Dialect/Bufferization/Transforms/Passes.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/Linalg/Passes.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/MemRef/Transforms/Passes.h"
+#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/InitAllPasses.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Support/LogicalResult.h"
+#include "mlir/Transforms/Passes.h"
+
+#include "gc/Dialect/OneDNNGraph/OneDNNGraphDialect.h"
+#include "gc/Transforms/Passes.h"
+
+namespace mlir::gc {
+
+void populateFrontendPasses(mlir::PassManager &pm) {
+  // pm.addPass(onednn_graph::createConvertOneDNNGraphToLinalg());
+}
+// linalg + linalgX + tensor ==> GC V1 GIR
+
+void populateTensorPasses(mlir::PassManager &pm) {
+  // + padding propagation pass, upstream-able 127x127 -> tilling size:32
+  // ->padding to 128x128
+  // + layout propagation pass, upstream-able 4x32x4x32 ->
+  // tensor.pack/tensor.unpack
+  // + tensor constant propagation pass, down-stream pass, designed to support
+  // oneDNN graph spec
+  // + linalg.matmul lowering to (scf.loop + linalg.brgemm) pass, upstream-able
+  // + fine-grain fusion pass, upstream-able -> scf.for + linalgx.mask
+  // + lower linalg to arith/math on virtual vector pass, up-streamable
+
+  // REMOVE this pass after the above passes are added. Currently we add this
+  // pass to make the pipeline work properly
+  pm.addNestedPass<func::FuncOp>(createLinalgGeneralizeNamedOpsPass());
+}
+// scf + arith + math + vector + tensor + linalg.brgemm + tensor.pack/unpack ==>
+// GC V1 TIR
+
+void populateVectorPasses(mlir::PassManager &pm) {
+  // + bf16 promotion pass, down-stream pass, device dependent pass, maybe can
+  // upstream
+  // + bf16 cast elimilation pass, down-stream pass, fast-math kind pass,
+  // designed to support oneDNN graph spec
+  pm.addNestedPass<func::FuncOp>(arith::createArithExpandOpsPass());
+  // + lower to physical vector pass, down-stream pass, device dependent pass,
+  // maybe can upstream
+}
+// scf + arith + math + vector + tensor + linalg.brgemm
+
+void populateBufferizationPasses(mlir::PassManager &pm) {
+  bufferization::OneShotBufferizationOptions options;
+  pm.addPass(bufferization::createOneShotBufferizePass(options));
+  pm.addPass(createCSEPass());
+  pm.addPass(mlir::func::createFuncBufferizePass());
+  pm.addPass(bufferization::createBufferResultsToOutParamsPass());
+  pm.addNestedPass<func::FuncOp>(
+      bufferization::createBufferizationBufferizePass());
+  pm.addNestedPass<func::FuncOp>(
+      bufferization::createFinalizingBufferizePass());
+  // + buffer schedule pass, down-stream pass, to migrate buffer reschedule pass
+  // from GC V1.
+  pm.addNestedPass<func::FuncOp>(
+      bufferization::createBufferHoistingPass()); // Need to improve this pass
+                                                  // to support thread-local
+                                                  // allocator.
+  pm.addNestedPass<func::FuncOp>(bufferization::createBufferLoopHoistingPass());
+  pm.addNestedPass<func::FuncOp>(bufferization::createBufferDeallocationPass());
+  pm.addPass(createBufferizationToMemRefPass());
+}
+// scf + arith + math + vector + memref + linalg.brgemm
+
+void populateMicroKernelPasses(mlir::PassManager &pm) {
+  // + ConvertLinalgToMicrokernel pass, upstream-able,
+  // + CleanupInvalidMicrokernel pass, upstream-able
+  // + InvariantMicrokernelMotion pass, upstream-able
+  // + ConvertMicrokernelToDnnlFunc, down-stream pass, to lower brgemm to dnnl
+  // call
+  // + ConvertMicrokernelToXsmm, down-stream pass, to lower brgemm to libxsmm
+  // call
+  // + LowerMicrokernel pass, upstream-able
+  // + DispatchMicrokernel, down-stream pass
+}
+// scf + arith + math + vector + memref + func/microkernel
+
+void populateCPURuntimePasses(mlir::PassManager &pm) {
+  // + flatten nested parallel pass, down-stream pass, to support coarse-grain
+  // fusion
+  // pm.addNestedPass<func::FuncOp>(parallelcpu::createParallelCPUAtExitToOmp());
+  // remove this pass after we add FlattenNestedParallel
+  pm.addPass(createConvertSCFToOpenMPPass());
+}
+
+void populateLoweringToLLVMPasses(mlir::PassManager &pm) {
+  pm.addPass(createConvertSCFToCFPass());
+  // pm.addPass(parallelcpu::createParallelCPUToLLVM());
+  pm.addPass(createConvertOpenMPToLLVMPass());
+  pm.addNestedPass<func::FuncOp>(createConvertMathToLLVMPass());
+  pm.addPass(createConvertMathToLibmPass());
+  pm.addPass(createFinalizeMemRefToLLVMConversionPass());
+  pm.addNestedPass<func::FuncOp>(createArithToLLVMConversionPass());
+  pm.addPass(createConvertFuncToLLVMPass());
+  pm.addPass(createConvertControlFlowToLLVMPass());
+  pm.addPass(createCSEPass());
+  pm.addPass(createCanonicalizerPass());
+  pm.addPass(createReconcileUnrealizedCastsPass());
+  pm.addPass(createSymbolDCEPass());
+}
+
+void populateLLVMPasses(mlir::PassManager &pm) {
+  pm.addPass(memref::createExpandOpsPass());
+  pm.addPass(memref::createExpandStridedMetadataPass());
+  populateLoweringToLLVMPasses(pm);
+}
+
+void populateCPUPipeline(mlir::PassManager &pm) {
+  // front-end, oneDNN graph dialect
+  populateFrontendPasses(pm);
+  // middle-end, LinalgX/Linalg/tensor dialects
+  populateTensorPasses(pm);
+  // middle-end, arith/math/vector dialects
+  populateVectorPasses(pm);
+  // back-end, arith/math/vector/memref dialects
+  populateBufferizationPasses(pm);
+  // REMOVE this pass after the TensorPasses are added. Currently we add this
+  // pass to make the pipeline work properly
+  pm.addNestedPass<func::FuncOp>(createConvertLinalgToParallelLoopsPass());
+  populateMicroKernelPasses(pm);
+  populateCPURuntimePasses(pm);
+  // // back-end, llvm dialect
+  populateLLVMPasses(pm);
+}
+
+#define GEN_PASS_DEF_GCCPUPIPELINE
+#include "gc/Transforms/Passes.h.inc"
+namespace {
+
+class GCCPUPipeline : public impl::GCCPUPipelineBase<GCCPUPipeline> {
+public:
+  friend struct PassHelper;
+  using impl::GCCPUPipelineBase<GCCPUPipeline>::GCCPUPipelineBase;
+  void runOnOperation() final {
+    auto op = getOperation();
+    PassManager pm{op->getContext()};
+    populateCPUPipeline(pm);
+    if (failed(pm.run(op)))
+      signalPassFailure();
+  }
+};
+
+} // namespace
+} // namespace mlir::gc
diff --git a/test/gc/Transforms/Pipeline/run.mlir b/test/gc/Transforms/Pipeline/run.mlir
new file mode 100644
index 000000000..799935006
--- /dev/null
+++ b/test/gc/Transforms/Pipeline/run.mlir
@@ -0,0 +1,23 @@
+// RUN: gc-opt %s --gc-cpu-pipeline | gc-cpu-runner -e main -entry-point-result=void | FileCheck %s
+
+module {
+func.func @aaa() -> tensor<128xf32> {
+    %c2 = arith.constant 2.0 : f32
+    %a = tensor.empty() : tensor<128xf32>
+    %2 = linalg.fill ins(%c2 : f32) outs(%a : tensor<128xf32>) -> tensor<128xf32>
+    return %2 : tensor<128xf32>
+}
+
+func.func @main() {
+    %result = call @aaa() : ()-> tensor<128xf32>
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c1 = arith.constant 1 : index
+    scf.for %iv = %c0 to %c128 step %c1 {
+        %4 = tensor.extract %result[%iv] : tensor<128xf32>
+        parallelcpu.printf "%f\n" %4 : f32
+    }
+    return
+}
+// CHECK-COUNT-128: 2.000000
+}
\ No newline at end of file
diff --git a/test/gc/Transforms/Pipeline/tensor_args.mlir b/test/gc/Transforms/Pipeline/tensor_args.mlir
new file mode 100644
index 000000000..73d916d04
--- /dev/null
+++ b/test/gc/Transforms/Pipeline/tensor_args.mlir
@@ -0,0 +1,13 @@
+// RUN: gc-opt %s --gc-cpu-pipeline | FileCheck %s
+
+module {
+// CHECK: aaa
+// check that the func returns void
+// CHECK-NOT: ) -> !llvm.struct<
+func.func @aaa(%a: tensor<128xf32>, %b: tensor<128xf32>) -> tensor<128xf32> {
+    %out = tensor.empty() : tensor<128xf32>
+    %2 = linalg.add ins(%a, %b : tensor<128xf32>,tensor<128xf32>) outs(%out : tensor<128xf32>) -> tensor<128xf32>
+    // CHECK: memcpy
+    return %out : tensor<128xf32>
+}
+}
\ No newline at end of file

From 475faf8052309cfd9e170f61e2622d5c4cd7a5ad Mon Sep 17 00:00:00 2001
From: "Mei, Yijie" <yijie.mei@intel.com>
Date: Wed, 15 May 2024 15:46:46 +0800
Subject: [PATCH 06/19] update

---
 lib/gc/Transforms/Pipeline.cpp       | 9 ++++++---
 test/gc/Transforms/Pipeline/run.mlir | 2 +-
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/lib/gc/Transforms/Pipeline.cpp b/lib/gc/Transforms/Pipeline.cpp
index ed50925f6..8b74df1b9 100644
--- a/lib/gc/Transforms/Pipeline.cpp
+++ b/lib/gc/Transforms/Pipeline.cpp
@@ -21,6 +21,7 @@
 #include "mlir/Support/LogicalResult.h"
 #include "mlir/Transforms/Passes.h"
 
+#include "gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.h"
 #include "gc/Dialect/OneDNNGraph/OneDNNGraphDialect.h"
 #include "gc/Transforms/Passes.h"
 
@@ -65,7 +66,9 @@ void populateBufferizationPasses(mlir::PassManager &pm) {
   pm.addPass(bufferization::createOneShotBufferizePass(options));
   pm.addPass(createCSEPass());
   pm.addPass(mlir::func::createFuncBufferizePass());
-  pm.addPass(bufferization::createBufferResultsToOutParamsPass());
+  bufferization::BufferResultsToOutParamsOpts opt{};
+  // opt.hoistStaticAllocs = true;
+  pm.addPass(bufferization::createBufferResultsToOutParamsPass(opt));
   pm.addNestedPass<func::FuncOp>(
       bufferization::createBufferizationBufferizePass());
   pm.addNestedPass<func::FuncOp>(
@@ -98,14 +101,14 @@ void populateMicroKernelPasses(mlir::PassManager &pm) {
 void populateCPURuntimePasses(mlir::PassManager &pm) {
   // + flatten nested parallel pass, down-stream pass, to support coarse-grain
   // fusion
-  // pm.addNestedPass<func::FuncOp>(parallelcpu::createParallelCPUAtExitToOmp());
+  pm.addNestedPass<func::FuncOp>(cpuruntime::createCPURuntimeAtExitToOmp());
   // remove this pass after we add FlattenNestedParallel
   pm.addPass(createConvertSCFToOpenMPPass());
 }
 
 void populateLoweringToLLVMPasses(mlir::PassManager &pm) {
   pm.addPass(createConvertSCFToCFPass());
-  // pm.addPass(parallelcpu::createParallelCPUToLLVM());
+  pm.addPass(cpuruntime::createCPURuntimeToLLVM());
   pm.addPass(createConvertOpenMPToLLVMPass());
   pm.addNestedPass<func::FuncOp>(createConvertMathToLLVMPass());
   pm.addPass(createConvertMathToLibmPass());
diff --git a/test/gc/Transforms/Pipeline/run.mlir b/test/gc/Transforms/Pipeline/run.mlir
index 799935006..71feb0843 100644
--- a/test/gc/Transforms/Pipeline/run.mlir
+++ b/test/gc/Transforms/Pipeline/run.mlir
@@ -15,7 +15,7 @@ func.func @main() {
     %c1 = arith.constant 1 : index
     scf.for %iv = %c0 to %c128 step %c1 {
         %4 = tensor.extract %result[%iv] : tensor<128xf32>
-        parallelcpu.printf "%f\n" %4 : f32
+        cpuruntime.printf "%f\n" %4 : f32
     }
     return
 }

From 0ac087deb28d4ec1efcf53519d95e1700690467f Mon Sep 17 00:00:00 2001
From: "Mei, Yijie" <yijie.mei@intel.com>
Date: Wed, 15 May 2024 17:02:27 +0800
Subject: [PATCH 07/19] fix

---
 src/gc-opt/CMakeLists.txt | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/gc-opt/CMakeLists.txt b/src/gc-opt/CMakeLists.txt
index 74eb4b28e..36ace6847 100644
--- a/src/gc-opt/CMakeLists.txt
+++ b/src/gc-opt/CMakeLists.txt
@@ -15,7 +15,9 @@ endif()
 set(gc_opt_libs 
         ${dialect_libs}
         ${conversion_libs} 
-        MLIROptLib)
+        ${MLIR_LINK_COMPONENTS}
+        GCPasses)
+
 if(GC_MLIR_CXX_FLAGS)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GC_MLIR_CXX_FLAGS}")
 endif()

From 74b0d342fba8e5896a9ca07bf57b449f34911155 Mon Sep 17 00:00:00 2001
From: "Mei, Yijie" <yijie.mei@intel.com>
Date: Thu, 16 May 2024 11:00:23 +0800
Subject: [PATCH 08/19] remove at exit

---
 .../gc/Dialect/CPURuntime/IR/CPURuntimeOps.td | 41 +---------
 .../CPURuntime/Transforms/CPURuntimePasses.td | 35 ---------
 .../Dialect/CPURuntime/IR/CPURuntimeOps.cpp   | 34 --------
 .../CPURuntime/Transforms/CMakeLists.txt      |  1 -
 .../Transforms/CPURuntimePasses.cpp           | 78 -------------------
 .../CPURuntime/cpuruntime-atexit-to-omp.mlir  | 44 -----------
 6 files changed, 1 insertion(+), 232 deletions(-)
 delete mode 100644 lib/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.cpp
 delete mode 100644 test/gc/Dialect/CPURuntime/cpuruntime-atexit-to-omp.mlir

diff --git a/include/gc/Dialect/CPURuntime/IR/CPURuntimeOps.td b/include/gc/Dialect/CPURuntime/IR/CPURuntimeOps.td
index cc1b7c555..bd77ad997 100644
--- a/include/gc/Dialect/CPURuntime/IR/CPURuntimeOps.td
+++ b/include/gc/Dialect/CPURuntime/IR/CPURuntimeOps.td
@@ -10,46 +10,7 @@
 #define CPURUNTIME_OPS
 
 include "gc/Dialect/CPURuntime/IR/CPURuntimeDialect.td"
-include "mlir/Interfaces/InferTypeOpInterface.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
-include "mlir/Interfaces/DestinationStyleOpInterface.td"
-include "mlir/Interfaces/ControlFlowInterfaces.td"
-include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.td"
-
-
-def CPURuntime_AtParallelExitOp : CPURuntime_Op<"at_parallel_exit", [
-       ParentOneOf<["scf::ForallOp", "scf::ParallelOp", "omp::WsloopOp", "memref::AllocaScopeOp"]>,
-       SingleBlockImplicitTerminator<"ParallelExitReturnOp">
-      ]> {
-  let summary = "Runs the block once in all threads at the exit of the parallel section";
-  let description = [{
-    It executes the block for each thread working in the parallel section for
-    once, at the exit of parallel section.
-  }];
-
-  let regions = (region SizedRegion<1>:$region);
-
-  let hasCustomAssemblyFormat = 1;
-
-  // The default builder does not add a region with an empty body, add our own.
-  let skipDefaultBuilders = 1;
-  let builders = [
-    OpBuilder<(ins)>,
-  ];
-}
-
-def CPURuntime_ParallelExitReturnOp : CPURuntime_Op<"parallel_exit.return", [
-       Pure,
-       HasParent<"AtParallelExitOp">,
-       Terminator, ReturnLike
-      ]> {
-  let summary = "Terminates at_parallel_exit block";
-  let description = [{
-    at_parallel_exit should ends with parallel_exit.return
-  }];
-  let assemblyFormat =
-      [{  attr-dict }];
-}
 
 
 def CPURuntime_PrintfOp : CPURuntime_Op<"printf", [MemoryEffects<[MemWrite]>]>,
@@ -61,7 +22,7 @@ def CPURuntime_PrintfOp : CPURuntime_Op<"printf", [MemoryEffects<[MemWrite]>]>,
     scalar arguments that should be printed.
 
     The format string is a C-style printf string, subject to any restrictions
-    imposed by one's target platform.
+    imposed by the target platform.
   }];
   let assemblyFormat = [{
     $format attr-dict ($args^ `:` type($args))?
diff --git a/include/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.td b/include/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.td
index 0685ce498..20c81e10a 100644
--- a/include/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.td
+++ b/include/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.td
@@ -11,41 +11,6 @@
 
 include "mlir/Pass/PassBase.td"
 
-
-def CPURuntimeAtExitToOmp: Pass<"cpuruntime-atexit-to-omp", "::mlir::func::FuncOp"> {
-  let summary = "Lower at_parallel_exit to code in omp.parallel section";
-  let description = [{
-    Switches the name of a FuncOp named `bar` to `foo` and folds.
-    ```
-    omp.parallel {
-      omp.wsloop for (%arg1, %arg2) : index = (%c0, %c0) to (%c1, %c512) step (%c1, %c1) {
-        memref.alloca_scope  {
-          cpuruntime.at_parallel_exit {
-            "your.op"()
-            cpuruntime.parallel_exit.return
-          }
-        }
-        omp.yield
-      }
-      omp.terminator
-    }
-    ```
-    Will be changed into
-    ```
-    omp.parallel {
-      omp.wsloop for  (%arg1, %arg2) : index = (%c0, %c0) to (%c1, %c512) step (%c1, %c1) {
-        memref.alloca_scope  {
-        }
-        omp.yield
-      }
-      "your.op"()
-      omp.terminator
-    }
-    ```
-  }];
-}
-
-
 def CPURuntimeToLLVM: Pass<"convert-cpuruntime-to-llvm"> {
   let summary = "Convert cpuruntime to LLVM dialect";
   let description = [{
diff --git a/lib/gc/Dialect/CPURuntime/IR/CPURuntimeOps.cpp b/lib/gc/Dialect/CPURuntime/IR/CPURuntimeOps.cpp
index ca632e9db..ed7bc6581 100644
--- a/lib/gc/Dialect/CPURuntime/IR/CPURuntimeOps.cpp
+++ b/lib/gc/Dialect/CPURuntime/IR/CPURuntimeOps.cpp
@@ -15,42 +15,8 @@
 #include <llvm/Support/Debug.h>
 
 namespace mlir {
-using namespace bufferization;
-
 namespace cpuruntime {
 
-void AtParallelExitOp::build(OpBuilder &b, OperationState &result) {
-  OpBuilder::InsertionGuard g(b);
-  Region *bodyRegion = result.addRegion();
-  b.createBlock(bodyRegion);
-}
-
-void AtParallelExitOp::print(OpAsmPrinter &p) {
-  p << " ";
-  p.printRegion(getRegion(),
-                /*printEntryBlockArgs=*/false,
-                /*printBlockTerminators=*/true);
-  p.printOptionalAttrDict(getOperation()->getAttrs());
-}
-
-ParseResult AtParallelExitOp::parse(OpAsmParser &parser,
-                                    OperationState &result) {
-  auto &builder = parser.getBuilder();
-
-  SmallVector<OpAsmParser::Argument, 8> regionOperands;
-  std::unique_ptr<Region> region = std::make_unique<Region>();
-  if (parser.parseRegion(*region, regionOperands))
-    return failure();
-
-  if (region->empty())
-    OpBuilder(builder.getContext()).createBlock(region.get());
-  result.addRegion(std::move(region));
-
-  // Parse the optional attribute list.
-  if (parser.parseOptionalAttrDict(result.attributes))
-    return failure();
-  return success();
-}
 
 } // namespace cpuruntime
 } // namespace mlir
\ No newline at end of file
diff --git a/lib/gc/Dialect/CPURuntime/Transforms/CMakeLists.txt b/lib/gc/Dialect/CPURuntime/Transforms/CMakeLists.txt
index ee6148aa4..3bc84f6c8 100644
--- a/lib/gc/Dialect/CPURuntime/Transforms/CMakeLists.txt
+++ b/lib/gc/Dialect/CPURuntime/Transforms/CMakeLists.txt
@@ -1,5 +1,4 @@
 add_mlir_dialect_library(MLIRCPURuntimeTransforms
-        CPURuntimePasses.cpp
         CPURuntimeToLLVM.cpp
 
         ADDITIONAL_HEADER_DIRS
diff --git a/lib/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.cpp b/lib/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.cpp
deleted file mode 100644
index a8f74c079..000000000
--- a/lib/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.cpp
+++ /dev/null
@@ -1,78 +0,0 @@
-//===- CPURuntimePasses.cpp - CPU Runtime Passes ----------------*- C++ -*-===//
-//
-// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/IR/PatternMatch.h"
-#include "mlir/Rewrite/FrozenRewritePatternSet.h"
-#include "mlir/Support/LogicalResult.h"
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-
-#include "gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.h"
-
-namespace mlir::cpuruntime {
-#define GEN_PASS_DEF_CPURUNTIMEATEXITTOOMP
-#include "gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.h.inc"
-
-namespace {
-
-class CPURuntimeAtExitToOmpRewriter
-    : public OpRewritePattern<AtParallelExitOp> {
-public:
-  using OpRewritePattern<AtParallelExitOp>::OpRewritePattern;
-  LogicalResult matchAndRewrite(AtParallelExitOp op,
-                                PatternRewriter &rewriter) const final {
-    auto parent = op->getParentOp();
-    omp::ParallelOp parallel;
-    while (parent) {
-      parallel = llvm::dyn_cast<omp::ParallelOp>(parent);
-      if (parallel) {
-        break;
-      }
-      parent = parent->getParentOp();
-    }
-    if (!parallel) {
-      return failure();
-    }
-    auto &block = parallel.getRegion().front();
-    auto itr = block.end();
-    --itr;
-    rewriter.inlineBlockBefore(&op->getRegion(0).getBlocks().front(), &block,
-                               itr);
-    rewriter.eraseOp(op);
-    return success();
-  }
-};
-
-class CPURuntimeExitReturnRewriter
-    : public OpRewritePattern<ParallelExitReturnOp> {
-public:
-  using OpRewritePattern<ParallelExitReturnOp>::OpRewritePattern;
-  LogicalResult matchAndRewrite(ParallelExitReturnOp op,
-                                PatternRewriter &rewriter) const final {
-    rewriter.eraseOp(op);
-    return success();
-  }
-};
-
-class CPURuntimeAtExitToOmp
-    : public impl::CPURuntimeAtExitToOmpBase<CPURuntimeAtExitToOmp> {
-public:
-  using impl::CPURuntimeAtExitToOmpBase<
-      CPURuntimeAtExitToOmp>::CPURuntimeAtExitToOmpBase;
-  void runOnOperation() final {
-    RewritePatternSet patterns(&getContext());
-    patterns.add<CPURuntimeExitReturnRewriter>(&getContext());
-    patterns.add<CPURuntimeAtExitToOmpRewriter>(&getContext());
-    FrozenRewritePatternSet patternSet(std::move(patterns));
-    if (failed(applyPatternsAndFoldGreedily(getOperation(), patternSet)))
-      signalPassFailure();
-  }
-};
-
-} // namespace
-} // namespace mlir::cpuruntime
diff --git a/test/gc/Dialect/CPURuntime/cpuruntime-atexit-to-omp.mlir b/test/gc/Dialect/CPURuntime/cpuruntime-atexit-to-omp.mlir
deleted file mode 100644
index 172777690..000000000
--- a/test/gc/Dialect/CPURuntime/cpuruntime-atexit-to-omp.mlir
+++ /dev/null
@@ -1,44 +0,0 @@
-// RUN: gc-opt %s --cpuruntime-atexit-to-omp | FileCheck %s
-
-module {
-  func.func @parallel_insert_slice(%arg0: memref<512x512xf32>) -> memref<512x512xf32> {
-    %cst = arith.constant 0.000000e+00 : f32
-    %alloc = memref.alloc() {alignment = 64 : i64} : memref<512x512xf32>
-    %c512 = arith.constant 512 : index
-    %c1 = arith.constant 1 : index
-    %c0 = arith.constant 0 : index
-    memref.copy %arg0, %alloc : memref<512x512xf32> to memref<512x512xf32>
-    %0 = llvm.mlir.constant(1 : i64) : i64
-    omp.parallel {
-      omp.wsloop {
-        omp.loop_nest (%arg1, %arg2) : index = (%c0, %c0) to (%c1, %c512) step (%c1, %c1) {
-          memref.alloca_scope  {
-            %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<512xf32>
-            %subview = memref.subview %alloc[%arg1, 0] [1, 512] [1, 1] : memref<512x512xf32> to memref<512xf32, strided<[1], offset: ?>>
-            memref.copy %alloc_0, %subview : memref<512xf32> to memref<512xf32, strided<[1], offset: ?>>
-            memref.dealloc %alloc_0 : memref<512xf32>
-            cpuruntime.at_parallel_exit {
-              memref.prefetch %alloc[%c1,%c0], read, locality<3>, data  : memref<512x512xf32>
-              cpuruntime.parallel_exit.return
-            }
-          }
-          omp.yield
-        }
-        omp.terminator
-      }
-      memref.prefetch %alloc[%c0,%c0], read, locality<3>, data  : memref<512x512xf32>
-      omp.terminator
-    }
-    // CHECK-DAG:    %[[C1:.*]] = arith.constant 1
-    // CHECK-DAG:    %[[C0:.*]] = arith.constant 0
-    // CHECK:        omp.parallel
-    // CHECK-NEXT:      omp.wsloop
-    // CHECK:         memref.alloca_scope
-    // CHECK-NOT:              cpuruntime.at_parallel_exit
-    // CHECK:              omp.yield
-    // CHECK:           memref.prefetch {{%alloc}}[%[[C0]], %[[C0]]]
-    // CHECK-NEXT:      memref.prefetch {{%alloc}}[%[[C1]], %[[C0]]]
-    // CHECK-NEXT:      omp.terminator
-    return %alloc : memref<512x512xf32>
-  }
-}

From 2cebba99452bd68876269b77293838b3a263f112 Mon Sep 17 00:00:00 2001
From: "Mei, Yijie" <yijie.mei@intel.com>
Date: Thu, 16 May 2024 11:01:58 +0800
Subject: [PATCH 09/19] fix lint

---
 lib/gc/Dialect/CPURuntime/IR/CPURuntimeOps.cpp | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/lib/gc/Dialect/CPURuntime/IR/CPURuntimeOps.cpp b/lib/gc/Dialect/CPURuntime/IR/CPURuntimeOps.cpp
index ed7bc6581..460c421cc 100644
--- a/lib/gc/Dialect/CPURuntime/IR/CPURuntimeOps.cpp
+++ b/lib/gc/Dialect/CPURuntime/IR/CPURuntimeOps.cpp
@@ -15,8 +15,5 @@
 #include <llvm/Support/Debug.h>
 
 namespace mlir {
-namespace cpuruntime {
-
-
-} // namespace cpuruntime
+namespace cpuruntime {} // namespace cpuruntime
 } // namespace mlir
\ No newline at end of file

From 34d10ea127052b423b3384c2ada95b3c30a1eb36 Mon Sep 17 00:00:00 2001
From: "Mei, Yijie" <yijie.mei@intel.com>
Date: Thu, 16 May 2024 13:29:48 +0800
Subject: [PATCH 10/19] Add kmp_* wrapper for gomp environment

---
 lib/gc/CMakeLists.txt                         |   3 +-
 lib/gc/ExecutionEngine/CMakeLists.txt         |   1 +
 .../ExecutionEngine/CPURuntime/CMakeLists.txt |  15 ++
 .../ExecutionEngine/CPURuntime/Parallel.cpp   | 189 ++++++++++++++++++
 src/gc-cpu-runner/CMakeLists.txt              |   3 +-
 src/gc-cpu-runner/gc-cpu-runner.cpp           |   4 +
 test/gc/cpu-runner/tid.mlir                   |  37 ++++
 7 files changed, 250 insertions(+), 2 deletions(-)
 create mode 100644 lib/gc/ExecutionEngine/CMakeLists.txt
 create mode 100644 lib/gc/ExecutionEngine/CPURuntime/CMakeLists.txt
 create mode 100644 lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp
 create mode 100644 test/gc/cpu-runner/tid.mlir

diff --git a/lib/gc/CMakeLists.txt b/lib/gc/CMakeLists.txt
index fd78d6cab..921853a08 100644
--- a/lib/gc/CMakeLists.txt
+++ b/lib/gc/CMakeLists.txt
@@ -5,4 +5,5 @@ endif()
 include(functions)
 
 add_subdirectory(Dialect)
-add_subdirectory(Transforms)
\ No newline at end of file
+add_subdirectory(Transforms)
+add_subdirectory(ExecutionEngine)
\ No newline at end of file
diff --git a/lib/gc/ExecutionEngine/CMakeLists.txt b/lib/gc/ExecutionEngine/CMakeLists.txt
new file mode 100644
index 000000000..8aa223412
--- /dev/null
+++ b/lib/gc/ExecutionEngine/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(CPURuntime)
diff --git a/lib/gc/ExecutionEngine/CPURuntime/CMakeLists.txt b/lib/gc/ExecutionEngine/CPURuntime/CMakeLists.txt
new file mode 100644
index 000000000..6be58e28f
--- /dev/null
+++ b/lib/gc/ExecutionEngine/CPURuntime/CMakeLists.txt
@@ -0,0 +1,15 @@
+find_package(OpenMP REQUIRED)
+
+if ("iomp" IN_LIST OpenMP_C_LIB_NAMES OR "omp" IN_LIST OpenMP_C_LIB_NAMES OR "omp5" IN_LIST OpenMP_C_LIB_NAMES)
+else()
+  add_definitions("-DGC_NEEDS_OMP_WRAPPER=1")
+endif()
+
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fopenmp")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
+add_mlir_library(GCCpuRuntime
+    SHARED
+    Parallel.cpp
+
+    EXCLUDE_FROM_LIBMLIR
+  )
\ No newline at end of file
diff --git a/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp b/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp
new file mode 100644
index 000000000..a71b8522c
--- /dev/null
+++ b/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp
@@ -0,0 +1,189 @@
+//===- Parallel.cpp - Definitions for parallel runtime  -----------*- C++ -*-=//
+//-*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <assert.h>
+#include <atomic>
+#include <chrono>
+#include <immintrin.h>
+#include <omp.h>
+#include <stdarg.h>
+
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+
+#define WEAK_SYMBOL __attribute__((weak))
+
+namespace {
+struct barrier_t {
+  alignas(64) std::atomic<int32_t> pending_;
+  std::atomic<int32_t> rounds_;
+  uint64_t total_;
+  // pad barrier to size of cacheline to avoid false sharing
+  char padding_[64 - 4 * sizeof(int32_t)];
+};
+
+typedef uint64_t (*barrier_idle_func)(std::atomic<int32_t> *remaining,
+                                      int32_t expected_remain, int32_t tid,
+                                      void *args);
+} // namespace
+
+extern "C" {
+int gc_runtime_keep_alive = 0;
+void gc_arrive_at_barrier(barrier_t *b, barrier_idle_func idle_func,
+                          void *idle_args) {
+  auto cur_round = b->rounds_.load(std::memory_order_acquire);
+  auto cnt = --b->pending_;
+  assert(cnt >= 0);
+  //   int count = 0;
+  if (cnt == 0) {
+    b->pending_.store(b->total_);
+    b->rounds_.store(cur_round + 1);
+  } else {
+    if (idle_func) {
+      if (cur_round != b->rounds_.load()) {
+        return;
+      }
+      idle_func(&b->rounds_, cur_round + 1, -1, idle_args);
+      //   count = ret & 0xffffffff;
+    }
+    while (cur_round == b->rounds_.load()) {
+      _mm_pause();
+    }
+  }
+}
+
+static_assert(sizeof(barrier_t) == 64, "size of barrier_t should be 64-byte");
+
+void gc_init_barrier(barrier_t *b, int num_barriers, uint64_t thread_count) {
+  for (int i = 0; i < num_barriers; i++) {
+    b[i].total_ = thread_count;
+    b[i].pending_.store(thread_count);
+    b[i].rounds_.store(0);
+  }
+}
+
+#if GC_NEEDS_OMP_WRAPPER
+void WEAK_SYMBOL __kmpc_barrier(void *loc, int32_t global_tid) {
+#pragma omp barrier
+}
+
+int WEAK_SYMBOL __kmpc_global_thread_num(void *loc) {
+  return omp_get_thread_num();
+}
+
+void WEAK_SYMBOL __kmpc_for_static_init_8u(void *loc, int32_t gtid,
+                                           int32_t schedtype,
+                                           int32_t *plastiter, uint64_t *plower,
+                                           uint64_t *pupper, int64_t *pstride,
+                                           int64_t incr, int64_t chunk) {
+  if (unlikely(schedtype != 34)) {
+    std::abort();
+  }
+  const int32_t FALSE = 0;
+  const int32_t TRUE = 0;
+  using UT = uint64_t;
+  //   using ST = int64_t;
+  /*  this all has to be changed back to TID and such.. */
+  uint32_t tid = gtid;
+  uint32_t nth = omp_get_num_threads();
+  UT trip_count;
+
+  /* special handling for zero-trip loops */
+  if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
+    if (plastiter != nullptr)
+      *plastiter = FALSE;
+    /* leave pupper and plower set to entire iteration space */
+    *pstride = incr; /* value should never be used */
+    return;
+  }
+
+  if (nth == 1) {
+    if (plastiter != nullptr)
+      *plastiter = TRUE;
+    *pstride =
+        (incr > 0) ? (*pupper - *plower + 1) : (-(*plower - *pupper + 1));
+    return;
+  }
+
+  /* compute trip count */
+  if (incr == 1) {
+    trip_count = *pupper - *plower + 1;
+  } else if (incr == -1) {
+    trip_count = *plower - *pupper + 1;
+  } else if (incr > 0) {
+    // upper-lower can exceed the limit of signed type
+    trip_count = (UT)(*pupper - *plower) / incr + 1;
+  } else {
+    trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
+  }
+  if (trip_count < nth) {
+    if (tid < trip_count) {
+      *pupper = *plower = *plower + tid * incr;
+    } else {
+      // set bounds so non-active threads execute no iterations
+      *plower = *pupper + (incr > 0 ? 1 : -1);
+    }
+    if (plastiter != nullptr)
+      *plastiter = (tid == trip_count - 1);
+  } else {
+    UT small_chunk = trip_count / nth;
+    UT extras = trip_count % nth;
+    *plower += incr * (tid * small_chunk + (tid < extras ? tid : extras));
+    *pupper = *plower + small_chunk * incr - (tid < extras ? 0 : incr);
+    if (plastiter != nullptr)
+      *plastiter = (tid == nth - 1);
+  }
+  *pstride = trip_count;
+}
+
+void WEAK_SYMBOL __kmpc_for_static_fini(void *ptr, int32_t v) {}
+
+static thread_local int next_num_threads = 0;
+
+/*!
+@ingroup PARALLEL
+The type for a microtask which gets passed to @ref __kmpc_fork_call().
+The arguments to the outlined function are
+@param global_tid the global thread identity of the thread executing the
+function.
+@param bound_tid  the local identity of the thread executing the function
+@param ... pointers to shared variables accessed by the function.
+*/
+using kmpc_micro = void (*)(int32_t *global_tid, int32_t *bound_tid, ...);
+void WEAK_SYMBOL __kmpc_fork_call(void *loc, int32_t argc, void *pfunc, ...) {
+  if (unlikely(argc != 1 && argc != 0)) {
+    std::abort();
+  }
+  va_list ap;
+  va_start(ap, pfunc);
+  void *c = va_arg(ap, void *);
+  int32_t global_tid = 0;
+  if (unlikely(next_num_threads)) {
+#pragma omp parallel num_threads(next_num_threads)
+    {
+      kmpc_micro func = (kmpc_micro)(pfunc);
+      func(&global_tid, nullptr, c);
+    }
+    next_num_threads = 0;
+  } else {
+#pragma omp parallel
+    {
+      kmpc_micro func = (kmpc_micro)(pfunc);
+      func(&global_tid, nullptr, c);
+    }
+  }
+  va_end(ap);
+}
+
+void WEAK_SYMBOL __kmpc_push_num_threads(void *loc, int32_t global_tid,
+                                         int32_t num_threads) {
+  next_num_threads = num_threads;
+}
+#endif
+}
diff --git a/src/gc-cpu-runner/CMakeLists.txt b/src/gc-cpu-runner/CMakeLists.txt
index f3f768612..85dbb6995 100644
--- a/src/gc-cpu-runner/CMakeLists.txt
+++ b/src/gc-cpu-runner/CMakeLists.txt
@@ -36,7 +36,8 @@ endif()
 
 #LLVM_LINK_COMPONENTS is processed by LLVM cmake in add_llvm_executable
 set(gc_cpu_runner_libs
-        ${MLIR_LINK_COMPONENTS})
+        ${MLIR_LINK_COMPONENTS}
+        GCCpuRuntime)
 add_mlir_tool(gc-cpu-runner
         gc-cpu-runner.cpp
 )
diff --git a/src/gc-cpu-runner/gc-cpu-runner.cpp b/src/gc-cpu-runner/gc-cpu-runner.cpp
index 3ece8f2ff..353abffe9 100644
--- a/src/gc-cpu-runner/gc-cpu-runner.cpp
+++ b/src/gc-cpu-runner/gc-cpu-runner.cpp
@@ -27,7 +27,11 @@
 #include "llvm/Support/TargetSelect.h"
 #include <stdio.h>
 
+extern int gc_runtime_keep_alive;
+
 int main(int argc, char **argv) {
+  // keeps GCCPURuntime linked
+  gc_runtime_keep_alive = 0;
   llvm::InitLLVM y(argc, argv);
   llvm::InitializeNativeTarget();
   llvm::InitializeNativeTargetAsmPrinter();
diff --git a/test/gc/cpu-runner/tid.mlir b/test/gc/cpu-runner/tid.mlir
new file mode 100644
index 000000000..aedcc0a20
--- /dev/null
+++ b/test/gc/cpu-runner/tid.mlir
@@ -0,0 +1,37 @@
+// RUN: gc-opt %s --convert-cpuruntime-to-llvm --convert-openmp-to-llvm --convert-func-to-llvm --convert-arith-to-llvm --convert-cf-to-llvm --reconcile-unrealized-casts | gc-cpu-runner -e main -entry-point-result=void | FileCheck %s
+module {
+  func.func private @omp_get_thread_num() -> i32
+
+  func.func @check_parallel() {
+    %c64 = arith.constant 64 : index
+    %c1 = arith.constant 1 : index
+    %c0 = arith.constant 0 : index
+    %c8 = arith.constant 8 : index
+    %0 = llvm.mlir.constant(1 : i64) : i64
+    omp.parallel num_threads(%c8: index) {
+      omp.wsloop {
+        omp.loop_nest (%arg1, %arg2) : index = (%c0, %c0) to (%c1, %c64) step (%c1, %c1) {
+          cpuruntime.printf "ITR %zu\n" %arg2 : index
+          omp.yield
+        }
+        omp.terminator
+      }
+      %tid = func.call @omp_get_thread_num() : () -> i32
+      cpuruntime.printf "EXIT %d\n" %tid : i32
+      omp.terminator
+    }
+    return
+  }
+
+  func.func @main() {
+    %0 = func.call @omp_get_thread_num() : () -> i32
+    cpuruntime.printf "TID %d\n" %0 : i32
+    call @check_parallel() : ()->()
+    return
+  }
+  // CHECK: TID 0
+  // CHECK-COUNT-64: ITR {{[0-9]+}}
+  // CHECK-NOT: ITR
+  // CHECK-COUNT-8: EXIT {{[0-9]+}}
+  // CHECK-NOT: EXIT
+}
\ No newline at end of file

From 80a597f0887889f1721c83e502de97e0c05dac97 Mon Sep 17 00:00:00 2001
From: "Mei, Yijie" <yijie.mei@intel.com>
Date: Thu, 16 May 2024 14:18:55 +0800
Subject: [PATCH 11/19] fix

---
 lib/gc/Transforms/Pipeline.cpp               | 7 +++----
 test/gc/Transforms/Pipeline/tensor_args.mlir | 4 ++--
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/lib/gc/Transforms/Pipeline.cpp b/lib/gc/Transforms/Pipeline.cpp
index 8b74df1b9..b336684a1 100644
--- a/lib/gc/Transforms/Pipeline.cpp
+++ b/lib/gc/Transforms/Pipeline.cpp
@@ -66,13 +66,13 @@ void populateBufferizationPasses(mlir::PassManager &pm) {
   pm.addPass(bufferization::createOneShotBufferizePass(options));
   pm.addPass(createCSEPass());
   pm.addPass(mlir::func::createFuncBufferizePass());
-  bufferization::BufferResultsToOutParamsOpts opt{};
-  // opt.hoistStaticAllocs = true;
-  pm.addPass(bufferization::createBufferResultsToOutParamsPass(opt));
   pm.addNestedPass<func::FuncOp>(
       bufferization::createBufferizationBufferizePass());
   pm.addNestedPass<func::FuncOp>(
       bufferization::createFinalizingBufferizePass());
+  bufferization::BufferResultsToOutParamsOpts opt{};
+  opt.hoistStaticAllocs = true;
+  pm.addPass(bufferization::createBufferResultsToOutParamsPass(opt));
   // + buffer schedule pass, down-stream pass, to migrate buffer reschedule pass
   // from GC V1.
   pm.addNestedPass<func::FuncOp>(
@@ -101,7 +101,6 @@ void populateMicroKernelPasses(mlir::PassManager &pm) {
 void populateCPURuntimePasses(mlir::PassManager &pm) {
   // + flatten nested parallel pass, down-stream pass, to support coarse-grain
   // fusion
-  pm.addNestedPass<func::FuncOp>(cpuruntime::createCPURuntimeAtExitToOmp());
   // remove this pass after we add FlattenNestedParallel
   pm.addPass(createConvertSCFToOpenMPPass());
 }
diff --git a/test/gc/Transforms/Pipeline/tensor_args.mlir b/test/gc/Transforms/Pipeline/tensor_args.mlir
index 73d916d04..adcfb3bd8 100644
--- a/test/gc/Transforms/Pipeline/tensor_args.mlir
+++ b/test/gc/Transforms/Pipeline/tensor_args.mlir
@@ -7,7 +7,7 @@ module {
 func.func @aaa(%a: tensor<128xf32>, %b: tensor<128xf32>) -> tensor<128xf32> {
     %out = tensor.empty() : tensor<128xf32>
     %2 = linalg.add ins(%a, %b : tensor<128xf32>,tensor<128xf32>) outs(%out : tensor<128xf32>) -> tensor<128xf32>
-    // CHECK: memcpy
-    return %out : tensor<128xf32>
+    // CHECK-NOT: memcpy
+    return %2 : tensor<128xf32>
 }
 }
\ No newline at end of file

From 0b4332b2fe3a1e0c481fa1bcfb67a86130241ac1 Mon Sep 17 00:00:00 2001
From: "Mei, Yijie" <yijie.mei@intel.com>
Date: Thu, 16 May 2024 17:06:44 +0800
Subject: [PATCH 12/19] fix

---
 lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp b/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp
index a71b8522c..4a25a5ee0 100644
--- a/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp
+++ b/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp
@@ -40,7 +40,6 @@ void gc_arrive_at_barrier(barrier_t *b, barrier_idle_func idle_func,
   auto cur_round = b->rounds_.load(std::memory_order_acquire);
   auto cnt = --b->pending_;
   assert(cnt >= 0);
-  //   int count = 0;
   if (cnt == 0) {
     b->pending_.store(b->total_);
     b->rounds_.store(cur_round + 1);
@@ -50,7 +49,6 @@ void gc_arrive_at_barrier(barrier_t *b, barrier_idle_func idle_func,
         return;
       }
       idle_func(&b->rounds_, cur_round + 1, -1, idle_args);
-      //   count = ret & 0xffffffff;
     }
     while (cur_round == b->rounds_.load()) {
       _mm_pause();
@@ -86,7 +84,7 @@ void WEAK_SYMBOL __kmpc_for_static_init_8u(void *loc, int32_t gtid,
     std::abort();
   }
   const int32_t FALSE = 0;
-  const int32_t TRUE = 0;
+  const int32_t TRUE = 1;
   using UT = uint64_t;
   //   using ST = int64_t;
   /*  this all has to be changed back to TID and such.. */

From 382171bf38ec16e7b9f3edeb0d07834800aba98d Mon Sep 17 00:00:00 2001
From: "Mei, Yijie" <yijie.mei@intel.com>
Date: Thu, 23 May 2024 16:49:08 +0800
Subject: [PATCH 13/19] fix lint

---
 lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp |  7 +++----
 src/gc-cpu-runner/CMakeLists.txt               | 17 +++++++++++++++++
 2 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp b/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp
index 4a25a5ee0..6efb38142 100644
--- a/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp
+++ b/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp
@@ -1,5 +1,4 @@
 //===- Parallel.cpp - Definitions for parallel runtime  -----------*- C++ -*-=//
-//-*-===//
 //
 // This file is licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -28,9 +27,9 @@ struct barrier_t {
   char padding_[64 - 4 * sizeof(int32_t)];
 };
 
-typedef uint64_t (*barrier_idle_func)(std::atomic<int32_t> *remaining,
-                                      int32_t expected_remain, int32_t tid,
-                                      void *args);
+using barrier_idle_func = uint64_t (*)(std::atomic<int32_t> *remaining,
+                                       int32_t expected_remain, int32_t tid,
+                                       void *args);
 } // namespace
 
 extern "C" {
diff --git a/src/gc-cpu-runner/CMakeLists.txt b/src/gc-cpu-runner/CMakeLists.txt
index 85dbb6995..2599eef84 100644
--- a/src/gc-cpu-runner/CMakeLists.txt
+++ b/src/gc-cpu-runner/CMakeLists.txt
@@ -1,3 +1,20 @@
+################################################################################
+# Copyright (C) 2024 Intel Corporation
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+# SPDX-License-Identifier: Apache-2.0
+################################################################################
+
 if(GC_DEV_LINK_LLVM_DYLIB)
   set(LLVM_LINK_COMPONENTS
     LLVM

From f1fd0ae69fff9b17c8d7bf354e3e160a5d3be0aa Mon Sep 17 00:00:00 2001
From: "Mei, Yijie" <yijie.mei@intel.com>
Date: Thu, 23 May 2024 16:51:25 +0800
Subject: [PATCH 14/19] fix

---
 lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp b/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp
index 6efb38142..d81e8d3a9 100644
--- a/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp
+++ b/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp
@@ -1,4 +1,4 @@
-//===- Parallel.cpp - Definitions for parallel runtime  -----------*- C++ -*-=//
+//===-- Parallel.cpp - Definitions for parallel runtime -----------*- C++ -*-=//
 //
 // This file is licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.

From a773ea6bcbcfebc1370aa78552a703278ee502da Mon Sep 17 00:00:00 2001
From: "Mei, Yijie" <yijie.mei@intel.com>
Date: Thu, 23 May 2024 16:52:29 +0800
Subject: [PATCH 15/19] f

---
 lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp b/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp
index d81e8d3a9..ea7641417 100644
--- a/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp
+++ b/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp
@@ -1,9 +1,9 @@
-//===-- Parallel.cpp - Definitions for parallel runtime -----------*- C++ -*-=//
-//
+//===-- Parallel.cpp - parallel ---------------------------------*- C++ -*-===//
+// 
 // This file is licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
+// 
 //===----------------------------------------------------------------------===//
 
 #include <assert.h>

From 84933c21b7f2c73361953166add0e1143238906d Mon Sep 17 00:00:00 2001
From: "Mei, Yijie" <yijie.mei@intel.com>
Date: Thu, 23 May 2024 16:57:14 +0800
Subject: [PATCH 16/19] fix

---
 lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp b/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp
index ea7641417..5591dc3af 100644
--- a/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp
+++ b/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp
@@ -1,9 +1,9 @@
 //===-- Parallel.cpp - parallel ---------------------------------*- C++ -*-===//
-// 
+//
 // This file is licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// 
+//
 //===----------------------------------------------------------------------===//
 
 #include <assert.h>

From 4cca4dfe7bbcf9dbe4ddab9ff66c0c4ff59ffa86 Mon Sep 17 00:00:00 2001
From: "Mei, Yijie" <yijie.mei@intel.com>
Date: Thu, 23 May 2024 17:11:32 +0800
Subject: [PATCH 17/19] add reference

---
 lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp b/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp
index 5591dc3af..3a5b4c2c1 100644
--- a/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp
+++ b/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp
@@ -74,6 +74,8 @@ int WEAK_SYMBOL __kmpc_global_thread_num(void *loc) {
   return omp_get_thread_num();
 }
 
+// The implementation was extracted and simplified from LLVM libomp
+// at openmp/runtime/src/kmp_sched.cpp
 void WEAK_SYMBOL __kmpc_for_static_init_8u(void *loc, int32_t gtid,
                                            int32_t schedtype,
                                            int32_t *plastiter, uint64_t *plower,

From 1e06c9844c7753400c6efc254898475c52b2c38b Mon Sep 17 00:00:00 2001
From: "Mei, Yijie" <yijie.mei@intel.com>
Date: Tue, 28 May 2024 10:42:03 +0800
Subject: [PATCH 18/19] fix license.py

---
 scripts/license.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/scripts/license.py b/scripts/license.py
index 49f28eaa8..3ed2ce521 100644
--- a/scripts/license.py
+++ b/scripts/license.py
@@ -15,10 +15,10 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import datetime, sys, re, argparse
-from typing import Dict, Set
+from typing import Dict, Set, List
 
 WIDTH: int = 80
-intel_license: list[str] = [
+intel_license: List[str] = [
     'Copyright \\(C\\) (\\d\\d\\d\\d-)?$YEAR Intel Corporation',
     '',
     'Licensed under the Apache License, Version 2.0 (the "License");',
@@ -35,7 +35,7 @@
     'SPDX-License-Identifier: Apache-2.0',
 ]
 
-llvm_license: list[str] = [
+llvm_license: List[str] = [
     "===-{1,2} $FILE - .* -*\\*- $LANG -\\*-===",
     '',
     'This file is licensed under the Apache License v2.0 with LLVM Exceptions.',
@@ -45,7 +45,7 @@
     "===-*===",
 ]
 
-def check_license(filepath: str, license: list[str], var: Dict[str, str], re_line: Set[int]):
+def check_license(filepath: str, license: List[str], var: Dict[str, str], re_line: Set[int]):
     with open(filepath, 'r') as f:
         idx: int = 0
         for line in f.readlines():
@@ -117,7 +117,7 @@ def use_llvm_license(path: str) -> bool:
     var: Dict[str, str] = {}
     re_line: Set[int] = set()
 
-    lic = list[str]
+    lic = List[str]
 
     if filepath.startswith("test/") or filepath.startswith("./test/"):
         continue

From 381677a27f0c796dd94e208a4c2f1378994b1f59 Mon Sep 17 00:00:00 2001
From: "Mei, Yijie" <yijie.mei@intel.com>
Date: Tue, 28 May 2024 14:32:18 +0800
Subject: [PATCH 19/19] fix comments

---
 lib/gc/Transforms/Pipeline.cpp | 64 ++++++++++++++--------------------
 1 file changed, 26 insertions(+), 38 deletions(-)

diff --git a/lib/gc/Transforms/Pipeline.cpp b/lib/gc/Transforms/Pipeline.cpp
index b336684a1..81fae6877 100644
--- a/lib/gc/Transforms/Pipeline.cpp
+++ b/lib/gc/Transforms/Pipeline.cpp
@@ -27,40 +27,35 @@
 
 namespace mlir::gc {
 
+// linalg + linalgX + tensor
 void populateFrontendPasses(mlir::PassManager &pm) {
   // pm.addPass(onednn_graph::createConvertOneDNNGraphToLinalg());
 }
-// linalg + linalgX + tensor ==> GC V1 GIR
 
+// scf + arith + math + vector + tensor + linalg.brgemm + tensor.pack/unpack
 void populateTensorPasses(mlir::PassManager &pm) {
-  // + padding propagation pass, upstream-able 127x127 -> tilling size:32
-  // ->padding to 128x128
-  // + layout propagation pass, upstream-able 4x32x4x32 ->
-  // tensor.pack/tensor.unpack
-  // + tensor constant propagation pass, down-stream pass, designed to support
-  // oneDNN graph spec
-  // + linalg.matmul lowering to (scf.loop + linalg.brgemm) pass, upstream-able
-  // + fine-grain fusion pass, upstream-able -> scf.for + linalgx.mask
-  // + lower linalg to arith/math on virtual vector pass, up-streamable
+  // todo: padding propagation pass
+  // todo: layout propagation pass
+  // todo: tensor constant propagation pass
+  // todo: linalg.matmul lowering to (scf.loop + linalg.brgemm) pass
+  // todo: fine-grain fusion pass
+  // todo: lower linalg to arith/math on virtual vector pass
 
   // REMOVE this pass after the above passes are added. Currently we add this
   // pass to make the pipeline work properly
   pm.addNestedPass<func::FuncOp>(createLinalgGeneralizeNamedOpsPass());
 }
-// scf + arith + math + vector + tensor + linalg.brgemm + tensor.pack/unpack ==>
-// GC V1 TIR
 
+// scf + arith + math + vector + tensor + linalg.brgemm
 void populateVectorPasses(mlir::PassManager &pm) {
-  // + bf16 promotion pass, down-stream pass, device dependent pass, maybe can
-  // upstream
-  // + bf16 cast elimilation pass, down-stream pass, fast-math kind pass,
-  // designed to support oneDNN graph spec
+  // todo: bf16 promotion pass, device dependent pass
+  // todo: bf16 cast elimilation pass, fast-math kind pass, designed to support
+  // oneDNN graph spec
   pm.addNestedPass<func::FuncOp>(arith::createArithExpandOpsPass());
-  // + lower to physical vector pass, down-stream pass, device dependent pass,
-  // maybe can upstream
+  // todo: lower to physical vector pass, device dependent pass
 }
-// scf + arith + math + vector + tensor + linalg.brgemm
 
+// scf + arith + math + vector + memref + linalg.brgemm
 void populateBufferizationPasses(mlir::PassManager &pm) {
   bufferization::OneShotBufferizationOptions options;
   pm.addPass(bufferization::createOneShotBufferizePass(options));
@@ -73,34 +68,27 @@ void populateBufferizationPasses(mlir::PassManager &pm) {
   bufferization::BufferResultsToOutParamsOpts opt{};
   opt.hoistStaticAllocs = true;
   pm.addPass(bufferization::createBufferResultsToOutParamsPass(opt));
-  // + buffer schedule pass, down-stream pass, to migrate buffer reschedule pass
-  // from GC V1.
-  pm.addNestedPass<func::FuncOp>(
-      bufferization::createBufferHoistingPass()); // Need to improve this pass
-                                                  // to support thread-local
-                                                  // allocator.
+  // todo: buffer schedule pass
+  // todo: Need to improve this pass to support nested parallel.
+  pm.addNestedPass<func::FuncOp>(bufferization::createBufferHoistingPass());
   pm.addNestedPass<func::FuncOp>(bufferization::createBufferLoopHoistingPass());
   pm.addNestedPass<func::FuncOp>(bufferization::createBufferDeallocationPass());
   pm.addPass(createBufferizationToMemRefPass());
 }
-// scf + arith + math + vector + memref + linalg.brgemm
 
+// scf + arith + math + vector + memref + func/microkernel
 void populateMicroKernelPasses(mlir::PassManager &pm) {
-  // + ConvertLinalgToMicrokernel pass, upstream-able,
-  // + CleanupInvalidMicrokernel pass, upstream-able
-  // + InvariantMicrokernelMotion pass, upstream-able
-  // + ConvertMicrokernelToDnnlFunc, down-stream pass, to lower brgemm to dnnl
-  // call
-  // + ConvertMicrokernelToXsmm, down-stream pass, to lower brgemm to libxsmm
-  // call
-  // + LowerMicrokernel pass, upstream-able
-  // + DispatchMicrokernel, down-stream pass
+  // todo: ConvertLinalgToMicrokernel pass
+  // todo: CleanupInvalidMicrokernel pass
+  // todo: InvariantMicrokernelMotion pass
+  // todo: ConvertMicrokernelToDnnlFunc to lower brgemm to dnnl call
+  // todo: ConvertMicrokernelToXsmm, to lower brgemm to libxsmm call
+  // todo: LowerMicrokernel pass
+  // todo: DispatchMicrokernel
 }
-// scf + arith + math + vector + memref + func/microkernel
 
 void populateCPURuntimePasses(mlir::PassManager &pm) {
-  // + flatten nested parallel pass, down-stream pass, to support coarse-grain
-  // fusion
+  // todo: flatten nested parallel pass to support coarse-grain usion
   // remove this pass after we add FlattenNestedParallel
   pm.addPass(createConvertSCFToOpenMPPass());
 }