From 13faa3333b395e8bea8ae45cee32005ea90b2392 Mon Sep 17 00:00:00 2001 From: "Mei, Yijie" Date: Tue, 14 May 2024 15:07:57 +0800 Subject: [PATCH 01/19] add cpuruntime dialect --- include/gc/Dialect/CMakeLists.txt | 1 + include/gc/Dialect/CPURuntime/CMakeLists.txt | 2 + .../gc/Dialect/CPURuntime/IR/CMakeLists.txt | 1 + .../Dialect/CPURuntime/IR/CPURuntimeDialect.h | 18 ++ .../CPURuntime/IR/CPURuntimeDialect.td | 34 ++++ .../gc/Dialect/CPURuntime/IR/CPURuntimeOps.h | 26 +++ .../gc/Dialect/CPURuntime/IR/CPURuntimeOps.td | 72 ++++++++ .../CPURuntime/Transforms/CMakeLists.txt | 5 + .../CPURuntime/Transforms/CPURuntimePasses.h | 29 ++++ .../CPURuntime/Transforms/CPURuntimePasses.td | 57 +++++++ lib/gc/Dialect/CMakeLists.txt | 1 + lib/gc/Dialect/CPURuntime/CMakeLists.txt | 2 + lib/gc/Dialect/CPURuntime/IR/CMakeLists.txt | 16 ++ .../CPURuntime/IR/CPURuntimeDialect.cpp | 26 +++ .../Dialect/CPURuntime/IR/CPURuntimeOps.cpp | 56 ++++++ .../CPURuntime/Transforms/CMakeLists.txt | 16 ++ .../Transforms/CPURuntimePasses.cpp | 77 +++++++++ .../Transforms/CPURuntimeToLLVM.cpp | 159 ++++++++++++++++++ lib/gc/Transforms/CMakeLists.txt | 2 + src/CMakeLists.txt | 1 + src/gc-opt/CMakeLists.txt | 2 +- src/gc-opt/gc-opt.cpp | 5 +- .../Dialect/CPURuntime/cpu-runner/printf.mlir | 17 ++ .../CPURuntime/cpuruntime-atexit-to-omp.mlir | 41 +++++ .../CPURuntime/cpuruntime-to-llvm.mlir | 19 +++ 25 files changed, 683 insertions(+), 2 deletions(-) create mode 100644 include/gc/Dialect/CPURuntime/CMakeLists.txt create mode 100644 include/gc/Dialect/CPURuntime/IR/CMakeLists.txt create mode 100644 include/gc/Dialect/CPURuntime/IR/CPURuntimeDialect.h create mode 100644 include/gc/Dialect/CPURuntime/IR/CPURuntimeDialect.td create mode 100644 include/gc/Dialect/CPURuntime/IR/CPURuntimeOps.h create mode 100644 include/gc/Dialect/CPURuntime/IR/CPURuntimeOps.td create mode 100644 include/gc/Dialect/CPURuntime/Transforms/CMakeLists.txt create mode 100644 include/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.h create mode 100644 include/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.td create mode 100644 lib/gc/Dialect/CPURuntime/CMakeLists.txt create mode 100644 lib/gc/Dialect/CPURuntime/IR/CMakeLists.txt create mode 100644 lib/gc/Dialect/CPURuntime/IR/CPURuntimeDialect.cpp create mode 100644 lib/gc/Dialect/CPURuntime/IR/CPURuntimeOps.cpp create mode 100644 lib/gc/Dialect/CPURuntime/Transforms/CMakeLists.txt create mode 100644 lib/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.cpp create mode 100644 lib/gc/Dialect/CPURuntime/Transforms/CPURuntimeToLLVM.cpp create mode 100644 test/gc/Dialect/CPURuntime/cpu-runner/printf.mlir create mode 100644 test/gc/Dialect/CPURuntime/cpuruntime-atexit-to-omp.mlir create mode 100644 test/gc/Dialect/CPURuntime/cpuruntime-to-llvm.mlir diff --git a/include/gc/Dialect/CMakeLists.txt b/include/gc/Dialect/CMakeLists.txt index ffeda0aa7..a23f3f9f1 100644 --- a/include/gc/Dialect/CMakeLists.txt +++ b/include/gc/Dialect/CMakeLists.txt @@ -1,3 +1,4 @@ +add_subdirectory(CPURuntime) add_subdirectory(OnednnGraph) add_subdirectory(Microkernel) add_subdirectory(Linalgx) \ No newline at end of file diff --git a/include/gc/Dialect/CPURuntime/CMakeLists.txt b/include/gc/Dialect/CPURuntime/CMakeLists.txt new file mode 100644 index 000000000..9f57627c3 --- /dev/null +++ b/include/gc/Dialect/CPURuntime/CMakeLists.txt @@ -0,0 +1,2 @@ +add_subdirectory(IR) +add_subdirectory(Transforms) diff --git a/include/gc/Dialect/CPURuntime/IR/CMakeLists.txt b/include/gc/Dialect/CPURuntime/IR/CMakeLists.txt new file mode 100644 index 000000000..fb73ae02b --- /dev/null +++ b/include/gc/Dialect/CPURuntime/IR/CMakeLists.txt @@ -0,0 +1 @@ +add_mlir_dialect(CPURuntimeOps cpuruntime) diff --git a/include/gc/Dialect/CPURuntime/IR/CPURuntimeDialect.h b/include/gc/Dialect/CPURuntime/IR/CPURuntimeDialect.h new file mode 100644 index 000000000..757182964 --- /dev/null +++ b/include/gc/Dialect/CPURuntime/IR/CPURuntimeDialect.h @@ -0,0 +1,18 @@ +//===- CPURuntimeDialect.h - CPU Runtime dialect ----------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef CPURUNTIME_CPURUNTIMEDIALECT_H +#define CPURUNTIME_CPURUNTIMEDIALECT_H + +#include "mlir/Bytecode/BytecodeOpInterface.h" +#include "mlir/IR/Dialect.h" +#include "mlir/Interfaces/DestinationStyleOpInterface.h" + +#include "gc/Dialect/CPURuntime/IR/CPURuntimeOpsDialect.h.inc" + +#endif // CPURUNTIME_CPURUNTIMEDIALECT_H diff --git a/include/gc/Dialect/CPURuntime/IR/CPURuntimeDialect.td b/include/gc/Dialect/CPURuntime/IR/CPURuntimeDialect.td new file mode 100644 index 000000000..06f3af526 --- /dev/null +++ b/include/gc/Dialect/CPURuntime/IR/CPURuntimeDialect.td @@ -0,0 +1,34 @@ +//===- CPURuntimeDialect.td - CPU Runtime Dialect ---------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef CPUPARALLEL_DIALECT +#define CPUPARALLEL_DIALECT + +include "mlir/IR/OpBase.td" + +//===----------------------------------------------------------------------===// +// CPURuntime dialect definition. +//===----------------------------------------------------------------------===// + +def CPURuntime_Dialect : Dialect { + let name = "cpuruntime"; + let summary = "A dialect for CPU parallel primitives."; + let description = [{ + This dialect contains primitives for CPU runtime. + }]; + let cppNamespace = "::mlir::cpuruntime"; +} + +//===----------------------------------------------------------------------===// +// Base cpuruntime operation definition. +//===----------------------------------------------------------------------===// + +class CPURuntime_Op traits = []> : + Op; + +#endif // CPUPARALLEL_DIALECT diff --git a/include/gc/Dialect/CPURuntime/IR/CPURuntimeOps.h b/include/gc/Dialect/CPURuntime/IR/CPURuntimeOps.h new file mode 100644 index 000000000..5ce667a91 --- /dev/null +++ b/include/gc/Dialect/CPURuntime/IR/CPURuntimeOps.h @@ -0,0 +1,26 @@ +//===- CPURuntimeOps.h - CPU Runtime Ops ====--------------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef CPURUNTIME_CPURUNTIMEOPS_H +#define CPURUNTIME_CPURUNTIMEOPS_H + +#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/OpenMP/OpenMPDialect.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Dialect.h" +#include "mlir/IR/OpDefinition.h" +#include "mlir/Interfaces/ControlFlowInterfaces.h" +#include "mlir/Interfaces/InferTypeOpInterface.h" +#include "mlir/Interfaces/SideEffectInterfaces.h" + +#define GET_OP_CLASSES +#include "gc/Dialect/CPURuntime/IR/CPURuntimeOps.h.inc" + +#endif // CPURUNTIME_CPURUNTIMEOPS_H diff --git a/include/gc/Dialect/CPURuntime/IR/CPURuntimeOps.td b/include/gc/Dialect/CPURuntime/IR/CPURuntimeOps.td new file mode 100644 index 000000000..cc1b7c555 --- /dev/null +++ b/include/gc/Dialect/CPURuntime/IR/CPURuntimeOps.td @@ -0,0 +1,72 @@ +//===- CPURuntimeOps.td - CPU Runtime Ops -----------------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef CPURUNTIME_OPS +#define CPURUNTIME_OPS + +include "gc/Dialect/CPURuntime/IR/CPURuntimeDialect.td" +include "mlir/Interfaces/InferTypeOpInterface.td" +include "mlir/Interfaces/SideEffectInterfaces.td" +include "mlir/Interfaces/DestinationStyleOpInterface.td" +include "mlir/Interfaces/ControlFlowInterfaces.td" +include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.td" + + +def CPURuntime_AtParallelExitOp : CPURuntime_Op<"at_parallel_exit", [ + ParentOneOf<["scf::ForallOp", "scf::ParallelOp", "omp::WsloopOp", "memref::AllocaScopeOp"]>, + SingleBlockImplicitTerminator<"ParallelExitReturnOp"> + ]> { + let summary = "Runs the block once in all threads at the exit of the parallel section"; + let description = [{ + It executes the block for each thread working in the parallel section for + once, at the exit of parallel section. + }]; + + let regions = (region SizedRegion<1>:$region); + + let hasCustomAssemblyFormat = 1; + + // The default builder does not add a region with an empty body, add our own. + let skipDefaultBuilders = 1; + let builders = [ + OpBuilder<(ins)>, + ]; +} + +def CPURuntime_ParallelExitReturnOp : CPURuntime_Op<"parallel_exit.return", [ + Pure, + HasParent<"AtParallelExitOp">, + Terminator, ReturnLike + ]> { + let summary = "Terminates at_parallel_exit block"; + let description = [{ + at_parallel_exit should ends with parallel_exit.return + }]; + let assemblyFormat = + [{ attr-dict }]; +} + + +def CPURuntime_PrintfOp : CPURuntime_Op<"printf", [MemoryEffects<[MemWrite]>]>, + Arguments<(ins StrAttr:$format, + Variadic>:$args)> { + let summary = "C-style printf"; + let description = [{ + `cpuruntime.printf` takes a literal format string `format` and an arbitrary number of + scalar arguments that should be printed. + + The format string is a C-style printf string, subject to any restrictions + imposed by one's target platform. + }]; + let assemblyFormat = [{ + $format attr-dict ($args^ `:` type($args))? + }]; +} + + +#endif // CPURUNTIME_OPS diff --git a/include/gc/Dialect/CPURuntime/Transforms/CMakeLists.txt b/include/gc/Dialect/CPURuntime/Transforms/CMakeLists.txt new file mode 100644 index 000000000..763ffab86 --- /dev/null +++ b/include/gc/Dialect/CPURuntime/Transforms/CMakeLists.txt @@ -0,0 +1,5 @@ +set(LLVM_TARGET_DEFINITIONS CPURuntimePasses.td) +mlir_tablegen(CPURuntimePasses.h.inc --gen-pass-decls -name CPURuntime) +mlir_tablegen(CPURuntimePasses.capi.h.inc -gen-pass-capi-header --prefix CPURuntime) +mlir_tablegen(CPURuntimePasses.capi.cpp.inc -gen-pass-capi-impl --prefix CPURuntime) +add_public_tablegen_target(MLIRCPURuntimePassesIncGen) diff --git a/include/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.h b/include/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.h new file mode 100644 index 000000000..8fde8f4fd --- /dev/null +++ b/include/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.h @@ -0,0 +1,29 @@ +//===- CPURuntimePasses.h - CPU Runtime Passes ------------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef CPURUNTIME_CPURUNTIMEPASSES_H +#define CPURUNTIME_CPURUNTIMEPASSES_H + +#include "gc/Dialect/CPURuntime/IR/CPURuntimeDialect.h" +#include "gc/Dialect/CPURuntime/IR/CPURuntimeOps.h" +#include "mlir/Pass/Pass.h" +#include + +namespace mlir { +namespace cpuruntime { +void registerConvertCPURuntimeToLLVMInterface(DialectRegistry ®istry); + +#define GEN_PASS_DECL +#include "gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.h.inc" + +#define GEN_PASS_REGISTRATION +#include "gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.h.inc" +} // namespace cpuruntime +} // namespace mlir + +#endif diff --git a/include/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.td b/include/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.td new file mode 100644 index 000000000..0685ce498 --- /dev/null +++ b/include/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.td @@ -0,0 +1,57 @@ +//===- CPURuntimePasses.td - CPU Runtime Passes -----------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef CPURUNTIME_PASS +#define CPURUNTIME_PASS + +include "mlir/Pass/PassBase.td" + + +def CPURuntimeAtExitToOmp: Pass<"cpuruntime-atexit-to-omp", "::mlir::func::FuncOp"> { + let summary = "Lower at_parallel_exit to code in omp.parallel section"; + let description = [{ + Switches the name of a FuncOp named `bar` to `foo` and folds. + ``` + omp.parallel { + omp.wsloop for (%arg1, %arg2) : index = (%c0, %c0) to (%c1, %c512) step (%c1, %c1) { + memref.alloca_scope { + cpuruntime.at_parallel_exit { + "your.op"() + cpuruntime.parallel_exit.return + } + } + omp.yield + } + omp.terminator + } + ``` + Will be changed into + ``` + omp.parallel { + omp.wsloop for (%arg1, %arg2) : index = (%c0, %c0) to (%c1, %c512) step (%c1, %c1) { + memref.alloca_scope { + } + omp.yield + } + "your.op"() + omp.terminator + } + ``` + }]; +} + + +def CPURuntimeToLLVM: Pass<"convert-cpuruntime-to-llvm"> { + let summary = "Convert cpuruntime to LLVM dialect"; + let description = [{ + This pass converts supported cpuruntime ops to LLVM dialect instructions. + }]; + let dependentDialects = ["LLVM::LLVMDialect"]; +} + +#endif // CPURUNTIME_PASS diff --git a/lib/gc/Dialect/CMakeLists.txt b/lib/gc/Dialect/CMakeLists.txt index a880ff2ed..8720bd8e6 100644 --- a/lib/gc/Dialect/CMakeLists.txt +++ b/lib/gc/Dialect/CMakeLists.txt @@ -1,3 +1,4 @@ +add_subdirectory(CPURuntime) add_subdirectory(Linalgx) add_subdirectory(Microkernel) add_subdirectory(OnednnGraph) diff --git a/lib/gc/Dialect/CPURuntime/CMakeLists.txt b/lib/gc/Dialect/CPURuntime/CMakeLists.txt new file mode 100644 index 000000000..9f57627c3 --- /dev/null +++ b/lib/gc/Dialect/CPURuntime/CMakeLists.txt @@ -0,0 +1,2 @@ +add_subdirectory(IR) +add_subdirectory(Transforms) diff --git a/lib/gc/Dialect/CPURuntime/IR/CMakeLists.txt b/lib/gc/Dialect/CPURuntime/IR/CMakeLists.txt new file mode 100644 index 000000000..e349da72c --- /dev/null +++ b/lib/gc/Dialect/CPURuntime/IR/CMakeLists.txt @@ -0,0 +1,16 @@ +add_mlir_dialect_library(MLIRCPURuntimeDialect + CPURuntimeDialect.cpp + CPURuntimeOps.cpp + + ADDITIONAL_HEADER_DIRS + ${PROJECT_SOURCE_DIR}/include/ + + DEPENDS + MLIRCPURuntimeOpsIncGen + MLIRCPURuntimePassesIncGen + + LINK_LIBS PUBLIC + MLIR + # MLIRInferTypeOpInterface + # MLIRFuncDialect + ) diff --git a/lib/gc/Dialect/CPURuntime/IR/CPURuntimeDialect.cpp b/lib/gc/Dialect/CPURuntime/IR/CPURuntimeDialect.cpp new file mode 100644 index 000000000..9f3e97b57 --- /dev/null +++ b/lib/gc/Dialect/CPURuntime/IR/CPURuntimeDialect.cpp @@ -0,0 +1,26 @@ +//===- CPURuntimeDialect.cpp - CPU Runtime Dialect --------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "gc/Dialect/CPURuntime/IR/CPURuntimeDialect.h" +#include "gc/Dialect/CPURuntime/IR/CPURuntimeOps.h" + +using namespace mlir; +using namespace mlir::cpuruntime; + +#include "gc/Dialect/CPURuntime/IR/CPURuntimeOpsDialect.cpp.inc" + +//===----------------------------------------------------------------------===// +// CPURuntime dialect. +//===----------------------------------------------------------------------===// + +void CPURuntimeDialect::initialize() { + addOperations< +#define GET_OP_LIST +#include "gc/Dialect/CPURuntime/IR/CPURuntimeOps.cpp.inc" + >(); +} diff --git a/lib/gc/Dialect/CPURuntime/IR/CPURuntimeOps.cpp b/lib/gc/Dialect/CPURuntime/IR/CPURuntimeOps.cpp new file mode 100644 index 000000000..ca632e9db --- /dev/null +++ b/lib/gc/Dialect/CPURuntime/IR/CPURuntimeOps.cpp @@ -0,0 +1,56 @@ +//===- CPURuntimeOps.cpp - CPU Runtime Ops ----------------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "gc/Dialect/CPURuntime/IR/CPURuntimeOps.h" +#include "gc/Dialect/CPURuntime/IR/CPURuntimeDialect.h" + +#define GET_OP_CLASSES +#include "gc/Dialect/CPURuntime/IR/CPURuntimeOps.cpp.inc" + +#include + +namespace mlir { +using namespace bufferization; + +namespace cpuruntime { + +void AtParallelExitOp::build(OpBuilder &b, OperationState &result) { + OpBuilder::InsertionGuard g(b); + Region *bodyRegion = result.addRegion(); + b.createBlock(bodyRegion); +} + +void AtParallelExitOp::print(OpAsmPrinter &p) { + p << " "; + p.printRegion(getRegion(), + /*printEntryBlockArgs=*/false, + /*printBlockTerminators=*/true); + p.printOptionalAttrDict(getOperation()->getAttrs()); +} + +ParseResult AtParallelExitOp::parse(OpAsmParser &parser, + OperationState &result) { + auto &builder = parser.getBuilder(); + + SmallVector regionOperands; + std::unique_ptr region = std::make_unique(); + if (parser.parseRegion(*region, regionOperands)) + return failure(); + + if (region->empty()) + OpBuilder(builder.getContext()).createBlock(region.get()); + result.addRegion(std::move(region)); + + // Parse the optional attribute list. + if (parser.parseOptionalAttrDict(result.attributes)) + return failure(); + return success(); +} + +} // namespace cpuruntime +} // namespace mlir \ No newline at end of file diff --git a/lib/gc/Dialect/CPURuntime/Transforms/CMakeLists.txt b/lib/gc/Dialect/CPURuntime/Transforms/CMakeLists.txt new file mode 100644 index 000000000..ee6148aa4 --- /dev/null +++ b/lib/gc/Dialect/CPURuntime/Transforms/CMakeLists.txt @@ -0,0 +1,16 @@ +add_mlir_dialect_library(MLIRCPURuntimeTransforms + CPURuntimePasses.cpp + CPURuntimeToLLVM.cpp + + ADDITIONAL_HEADER_DIRS + ${PROJECT_SOURCE_DIR}/include/ + + DEPENDS + MLIRCPURuntimePassesIncGen + + LINK_LIBS PUBLIC + MLIRFuncDialect + MLIRCPURuntimeDialect + ) + +set_property(GLOBAL APPEND PROPERTY GC_PASS_LIBS MLIRCPURuntimeTransforms) \ No newline at end of file diff --git a/lib/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.cpp b/lib/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.cpp new file mode 100644 index 000000000..f2a098fcf --- /dev/null +++ b/lib/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.cpp @@ -0,0 +1,77 @@ +//===- CPURuntimePasses.cpp - CPU Runtime Passes ----------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Rewrite/FrozenRewritePatternSet.h" +#include "mlir/Support/LogicalResult.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +#include "gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.h" + +namespace mlir::cpuruntime { +#define GEN_PASS_DEF_CPURUNTIMEATEXITTOOMP +#include "gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.h.inc" + +namespace { + +class CPURuntimeAtExitToOmpRewriter + : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(AtParallelExitOp op, + PatternRewriter &rewriter) const final { + auto parent = op->getParentOp(); + Operation *secondLast = nullptr; + while (parent && (llvm::isa(parent) || + llvm::isa(parent))) { + secondLast = parent; + parent = parent->getParentOp(); + } + auto parallel = llvm::dyn_cast(parent); + if (!parallel) { + return failure(); + } + assert(secondLast->getBlock()); + auto itr = secondLast->getBlock()->end(); + --itr; + rewriter.inlineBlockBefore(&op->getRegion(0).getBlocks().front(), + secondLast->getBlock(), itr); + rewriter.eraseOp(op); + return success(); + } +}; + +class CPURuntimeExitReturnRewriter + : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(ParallelExitReturnOp op, + PatternRewriter &rewriter) const final { + rewriter.eraseOp(op); + return success(); + } +}; + +class CPURuntimeAtExitToOmp + : public impl::CPURuntimeAtExitToOmpBase { +public: + using impl::CPURuntimeAtExitToOmpBase< + CPURuntimeAtExitToOmp>::CPURuntimeAtExitToOmpBase; + void runOnOperation() final { + RewritePatternSet patterns(&getContext()); + patterns.add(&getContext()); + patterns.add(&getContext()); + FrozenRewritePatternSet patternSet(std::move(patterns)); + if (failed(applyPatternsAndFoldGreedily(getOperation(), patternSet))) + signalPassFailure(); + } +}; + +} // namespace +} // namespace mlir::cpuruntime diff --git a/lib/gc/Dialect/CPURuntime/Transforms/CPURuntimeToLLVM.cpp b/lib/gc/Dialect/CPURuntime/Transforms/CPURuntimeToLLVM.cpp new file mode 100644 index 000000000..73cf14a84 --- /dev/null +++ b/lib/gc/Dialect/CPURuntime/Transforms/CPURuntimeToLLVM.cpp @@ -0,0 +1,159 @@ +//===- CPURuntimeToLLVM.cpp - CPU Runtime To LLVM ---------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Conversion/ArithCommon/AttrToLLVMConverter.h" +#include "mlir/Conversion/ConvertToLLVM/ToLLVMInterface.h" +#include "mlir/Conversion/LLVMCommon/ConversionTarget.h" +#include "mlir/Conversion/LLVMCommon/VectorPattern.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/TypeUtilities.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Rewrite/FrozenRewritePatternSet.h" +#include "mlir/Support/LogicalResult.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +#include "gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.h" + +namespace mlir::cpuruntime { + +void populateCPURuntimeToLLVMConversionPatterns(LLVMTypeConverter &converter, + RewritePatternSet &patterns); + +#define GEN_PASS_DEF_CPURUNTIMETOLLVM +#include "gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.h.inc" + +namespace { +static const char formatStringPrefix[] = "cpuprintfFormat_"; + +static LLVM::LLVMFuncOp getOrDefineFunction(ModuleOp &moduleOp, + const Location loc, + ConversionPatternRewriter &rewriter, + StringRef name, + LLVM::LLVMFunctionType type) { + LLVM::LLVMFuncOp ret; + if (!(ret = moduleOp.template lookupSymbol(name))) { + ConversionPatternRewriter::InsertionGuard guard(rewriter); + rewriter.setInsertionPointToStart(moduleOp.getBody()); + ret = rewriter.create(loc, name, type, + LLVM::Linkage::External); + } + return ret; +} + + +class PrintfRewriter : public ConvertOpToLLVMPattern { +public: + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + LogicalResult + matchAndRewrite(PrintfOp op, PrintfOpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const final { + auto moduleOp = op->getParentOfType(); + auto loc = op->getLoc(); + mlir::Type llvmI32 = typeConverter->convertType(rewriter.getI32Type()); + mlir::Type llvmI64 = typeConverter->convertType(rewriter.getI64Type()); + mlir::Type llvmI8 = typeConverter->convertType(rewriter.getI8Type()); + mlir::Type i8Ptr = LLVM::LLVMPointerType::get(op.getContext()); + auto printfFunc = getOrDefineFunction( + moduleOp, loc, rewriter, "printf", + LLVM::LLVMFunctionType::get(llvmI32, {i8Ptr}, /*isVarArg*/ true)); + + unsigned stringNumber = 0; + SmallString<16> stringConstName; + do { + stringConstName.clear(); + (formatStringPrefix + Twine(stringNumber++)).toStringRef(stringConstName); + } while (moduleOp.lookupSymbol(stringConstName)); + + llvm::SmallString<20> formatString(adaptor.getFormat()); + formatString.push_back('\0'); // Null terminate for C + size_t formatStringSize = formatString.size_in_bytes(); + + auto globalType = LLVM::LLVMArrayType::get(llvmI8, formatStringSize); + LLVM::GlobalOp global; + { + ConversionPatternRewriter::InsertionGuard guard(rewriter); + rewriter.setInsertionPointToStart(moduleOp.getBody()); + global = rewriter.create( + loc, globalType, + /*isConstant=*/true, LLVM::Linkage::Internal, stringConstName, + rewriter.getStringAttr(formatString)); + } + Value globalPtr = rewriter.create( + loc, + LLVM::LLVMPointerType::get(rewriter.getContext(), + global.getAddrSpace()), + global.getSymNameAttr()); + Value stringStart = rewriter.create( + loc, i8Ptr, globalType, globalPtr, ArrayRef{0, 0}); + SmallVector appendFormatArgs = {stringStart}; + for (auto arg : adaptor.getArgs()) { + if (auto floatType = dyn_cast(arg.getType())) { + if (!floatType.isF64()) + arg = rewriter.create( + loc, typeConverter->convertType(rewriter.getF64Type()), arg); + } + if (arg.getType().getIntOrFloatBitWidth() != 64) + arg = rewriter.create(loc, llvmI64, arg); + appendFormatArgs.push_back(arg); + } + rewriter.create(loc, printfFunc, appendFormatArgs); + rewriter.eraseOp(op); + return success(); + } +}; + +class CPURuntimeToLLVM + : public impl::CPURuntimeToLLVMBase { +public: + using Base::Base; + void runOnOperation() final { + LLVMConversionTarget target(getContext()); + RewritePatternSet patterns(&getContext()); + LowerToLLVMOptions options(&getContext()); + LLVMTypeConverter converter(&getContext(), options); + populateCPURuntimeToLLVMConversionPatterns(converter, patterns); + + if (failed(applyPartialConversion(getOperation(), target, + std::move(patterns)))) + signalPassFailure(); + } +}; + +/// Implement the interface to convert MemRef to LLVM. +struct CPURuntimeToDialectInterface : public ConvertToLLVMPatternInterface { + using ConvertToLLVMPatternInterface::ConvertToLLVMPatternInterface; + void loadDependentDialects(MLIRContext *context) const final { + context->loadDialect(); + } + + /// Hook for derived dialect interface to provide conversion patterns + /// and mark dialect legal for the conversion target. + void populateConvertToLLVMConversionPatterns( + ConversionTarget &target, LLVMTypeConverter &typeConverter, + RewritePatternSet &patterns) const final { + populateCPURuntimeToLLVMConversionPatterns(typeConverter, patterns); + } +}; + +} // namespace + +void populateCPURuntimeToLLVMConversionPatterns(LLVMTypeConverter &converter, + RewritePatternSet &patterns) { + patterns.add(converter); +} + +void registerConvertCPURuntimeToLLVMInterface(DialectRegistry ®istry) { + registry.addExtension(+[](MLIRContext *ctx, arith::ArithDialect *dialect) { + dialect->addInterfaces(); + }); +} + +} // namespace mlir::cpuruntime diff --git a/lib/gc/Transforms/CMakeLists.txt b/lib/gc/Transforms/CMakeLists.txt index df8a14d01..e87106946 100644 --- a/lib/gc/Transforms/CMakeLists.txt +++ b/lib/gc/Transforms/CMakeLists.txt @@ -14,3 +14,5 @@ add_mlir_library(GCPasses MLIRBufferizationToMemRef MLIRBufferizationPipelines ) + +set_property(GLOBAL APPEND PROPERTY GC_PASS_LIBS GCPasses) \ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f6298c270..e71fe30a0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,5 +1,6 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS) +get_property(gc_pass_libs GLOBAL PROPERTY GC_PASS_LIBS) add_subdirectory(dnnl) add_subdirectory(gc-cpu-runner) diff --git a/src/gc-opt/CMakeLists.txt b/src/gc-opt/CMakeLists.txt index ff33375de..0deb242a0 100644 --- a/src/gc-opt/CMakeLists.txt +++ b/src/gc-opt/CMakeLists.txt @@ -2,7 +2,7 @@ set(gc_opt_libs ${dialect_libs} ${conversion_libs} MLIROptLib - GCPasses) + ${gc_pass_libs}) if(GC_MLIR_CXX_FLAGS) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GC_MLIR_CXX_FLAGS}") endif() diff --git a/src/gc-opt/gc-opt.cpp b/src/gc-opt/gc-opt.cpp index 72a25abf5..e1996c050 100644 --- a/src/gc-opt/gc-opt.cpp +++ b/src/gc-opt/gc-opt.cpp @@ -18,6 +18,7 @@ */ #include "gc/Transforms/Passes.h" +#include "gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.h" #include "mlir/InitAllDialects.h" #include "mlir/InitAllPasses.h" #include "mlir/Tools/mlir-opt/MlirOptMain.h" @@ -25,9 +26,11 @@ int main(int argc, char *argv[]) { mlir::registerAllPasses(); mlir::gc::registerGraphCompilerPasses(); - + mlir::cpuruntime::registerCPURuntimePasses(); mlir::DialectRegistry registry; + registry.insert(); mlir::registerAllDialects(registry); + mlir::cpuruntime::registerConvertCPURuntimeToLLVMInterface(registry); return mlir::asMainReturnCode(mlir::MlirOptMain( argc, argv, "Graph Compiler modular optimizer driver\n", registry)); } diff --git a/test/gc/Dialect/CPURuntime/cpu-runner/printf.mlir b/test/gc/Dialect/CPURuntime/cpu-runner/printf.mlir new file mode 100644 index 000000000..e95471d50 --- /dev/null +++ b/test/gc/Dialect/CPURuntime/cpu-runner/printf.mlir @@ -0,0 +1,17 @@ +// RUN: gc-opt %s --convert-cpuruntime-to-llvm --convert-func-to-llvm --convert-arith-to-llvm --convert-cf-to-llvm --convert-complex-to-llvm | gc-cpu-runner -e main -entry-point-result=void -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils | FileCheck %s + +module { + func.func @doprint(%t: f32, %t2: i32, %t3: i64) { + cpuruntime.printf "Hello world %f %d %lld\n" %t, %t2, %t3 : f32, i32, i64 + return + } + + func.func @main() { + %c2 = arith.constant 2.0 : f32 + %c32i = arith.constant 2000000 : i32 + %c64i = arith.constant 2000000 : i64 + call @doprint(%c2, %c32i, %c64i) : (f32, i32, i64) -> () + return + } + // CHECK: Hello world 2.000000 2000000 2000000 +} \ No newline at end of file diff --git a/test/gc/Dialect/CPURuntime/cpuruntime-atexit-to-omp.mlir b/test/gc/Dialect/CPURuntime/cpuruntime-atexit-to-omp.mlir new file mode 100644 index 000000000..401de95cc --- /dev/null +++ b/test/gc/Dialect/CPURuntime/cpuruntime-atexit-to-omp.mlir @@ -0,0 +1,41 @@ +// RUN: gc-opt %s --cpuruntime-atexit-to-omp | FileCheck %s + +module { + func.func @parallel_insert_slice(%arg0: memref<512x512xf32>) -> memref<512x512xf32> { + %cst = arith.constant 0.000000e+00 : f32 + %alloc = memref.alloc() {alignment = 64 : i64} : memref<512x512xf32> + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + memref.copy %arg0, %alloc : memref<512x512xf32> to memref<512x512xf32> + %0 = llvm.mlir.constant(1 : i64) : i64 + omp.parallel { + omp.wsloop for (%arg1, %arg2) : index = (%c0, %c0) to (%c1, %c512) step (%c1, %c1) { + memref.alloca_scope { + %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<512xf32> + %subview = memref.subview %alloc[%arg1, 0] [1, 512] [1, 1] : memref<512x512xf32> to memref<512xf32, strided<[1], offset: ?>> + memref.copy %alloc_0, %subview : memref<512xf32> to memref<512xf32, strided<[1], offset: ?>> + memref.dealloc %alloc_0 : memref<512xf32> + cpuruntime.at_parallel_exit { + memref.prefetch %alloc[%c1,%c0], read, locality<3>, data : memref<512x512xf32> + cpuruntime.parallel_exit.return + } + } + omp.yield + } + memref.prefetch %alloc[%c0,%c0], read, locality<3>, data : memref<512x512xf32> + omp.terminator + } + // CHECK-DAG: %[[C1:.*]] = arith.constant 1 + // CHECK-DAG: %[[C0:.*]] = arith.constant 0 + // CHECK: omp.parallel + // CHECK-NEXT: omp.wsloop + // CHECK-NEXT: memref.alloca_scope + // CHECK-NOT: cpuruntime.at_parallel_exit + // CHECK: omp.yield + // CHECK: memref.prefetch {{%alloc}}[%[[C0]], %[[C0]]] + // CHECK-NEXT: memref.prefetch {{%alloc}}[%[[C1]], %[[C0]]] + // CHECK-NEXT: omp.terminator + return %alloc : memref<512x512xf32> + } +} diff --git a/test/gc/Dialect/CPURuntime/cpuruntime-to-llvm.mlir b/test/gc/Dialect/CPURuntime/cpuruntime-to-llvm.mlir new file mode 100644 index 000000000..fb8d748f1 --- /dev/null +++ b/test/gc/Dialect/CPURuntime/cpuruntime-to-llvm.mlir @@ -0,0 +1,19 @@ +// RUN: gc-opt %s --convert-cpuruntime-to-llvm | FileCheck %s + +module { + // CHECK: llvm.mlir.global internal constant @cpuprintfFormat_0("Hello world %f %d %lld\0A\00") {addr_space = 0 : i32} + // CHECK: llvm.func @printf(!llvm.ptr, + // CHECK-NEXT: func.func @doprint(%[[ARG0:.*]]: f32, %[[ARG1:.*]]: i32, %[[ARG2:.*]]: i64) + func.func @doprint(%t: f32, %t2: i32, %t3: i64) { + // CHECK-NEXT: llvm.mlir.addressof + // CHECK-DAG: %[[C1:.*]] = llvm.getelementptr + // CHECK-SAME: !llvm.ptr, !llvm.array<24 x i8> + // CHECK: %[[C2:.*]] = llvm.fpext %[[ARG0]] + // CHECK: %[[C3:.*]] = llvm.zext %[[ARG1]] + // CHECK-NOT: cpuruntime.printf + // CHECK-NEXT: llvm.call @printf(%[[C1]], %[[C2]], %[[C3]], %[[ARG2]]) + cpuruntime.printf "Hello world %f %d %lld\n" %t, %t2, %t3 : f32, i32, i64 + return + } + +} \ No newline at end of file From 161848e3e442ac1f6007de5c664e3b70f5a20b02 Mon Sep 17 00:00:00 2001 From: "Mei, Yijie" Date: Tue, 14 May 2024 15:11:57 +0800 Subject: [PATCH 02/19] format --- lib/gc/Dialect/CPURuntime/Transforms/CPURuntimeToLLVM.cpp | 8 +++----- src/gc-opt/gc-opt.cpp | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/lib/gc/Dialect/CPURuntime/Transforms/CPURuntimeToLLVM.cpp b/lib/gc/Dialect/CPURuntime/Transforms/CPURuntimeToLLVM.cpp index 73cf14a84..c56621d45 100644 --- a/lib/gc/Dialect/CPURuntime/Transforms/CPURuntimeToLLVM.cpp +++ b/lib/gc/Dialect/CPURuntime/Transforms/CPURuntimeToLLVM.cpp @@ -25,7 +25,7 @@ namespace mlir::cpuruntime { void populateCPURuntimeToLLVMConversionPatterns(LLVMTypeConverter &converter, - RewritePatternSet &patterns); + RewritePatternSet &patterns); #define GEN_PASS_DEF_CPURUNTIMETOLLVM #include "gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.h.inc" @@ -48,7 +48,6 @@ static LLVM::LLVMFuncOp getOrDefineFunction(ModuleOp &moduleOp, return ret; } - class PrintfRewriter : public ConvertOpToLLVMPattern { public: using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; @@ -110,8 +109,7 @@ class PrintfRewriter : public ConvertOpToLLVMPattern { } }; -class CPURuntimeToLLVM - : public impl::CPURuntimeToLLVMBase { +class CPURuntimeToLLVM : public impl::CPURuntimeToLLVMBase { public: using Base::Base; void runOnOperation() final { @@ -146,7 +144,7 @@ struct CPURuntimeToDialectInterface : public ConvertToLLVMPatternInterface { } // namespace void populateCPURuntimeToLLVMConversionPatterns(LLVMTypeConverter &converter, - RewritePatternSet &patterns) { + RewritePatternSet &patterns) { patterns.add(converter); } diff --git a/src/gc-opt/gc-opt.cpp b/src/gc-opt/gc-opt.cpp index e1996c050..9b06ecf1f 100644 --- a/src/gc-opt/gc-opt.cpp +++ b/src/gc-opt/gc-opt.cpp @@ -17,8 +17,8 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include "gc/Transforms/Passes.h" #include "gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.h" +#include "gc/Transforms/Passes.h" #include "mlir/InitAllDialects.h" #include "mlir/InitAllPasses.h" #include "mlir/Tools/mlir-opt/MlirOptMain.h" From 447ef129fdbd73e15533c1bbc9a8f1e6f1273413 Mon Sep 17 00:00:00 2001 From: "Mei, Yijie" Date: Tue, 14 May 2024 15:18:43 +0800 Subject: [PATCH 03/19] add dependency --- lib/gc/Dialect/CPURuntime/IR/CMakeLists.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lib/gc/Dialect/CPURuntime/IR/CMakeLists.txt b/lib/gc/Dialect/CPURuntime/IR/CMakeLists.txt index e349da72c..3a1d63d3d 100644 --- a/lib/gc/Dialect/CPURuntime/IR/CMakeLists.txt +++ b/lib/gc/Dialect/CPURuntime/IR/CMakeLists.txt @@ -10,7 +10,5 @@ add_mlir_dialect_library(MLIRCPURuntimeDialect MLIRCPURuntimePassesIncGen LINK_LIBS PUBLIC - MLIR - # MLIRInferTypeOpInterface - # MLIRFuncDialect + MLIRFuncDialect ) From a73dcc12e1023de4b59d4303835c2489b9f955b7 Mon Sep 17 00:00:00 2001 From: "Mei, Yijie" Date: Tue, 14 May 2024 16:03:52 +0800 Subject: [PATCH 04/19] fix new MLIR --- .../Transforms/CPURuntimePasses.cpp | 19 +++++++------- .../CPURuntime/cpuruntime-atexit-to-omp.mlir | 25 +++++++++++-------- 2 files changed, 24 insertions(+), 20 deletions(-) diff --git a/lib/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.cpp b/lib/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.cpp index f2a098fcf..a8f74c079 100644 --- a/lib/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.cpp +++ b/lib/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.cpp @@ -27,21 +27,22 @@ class CPURuntimeAtExitToOmpRewriter LogicalResult matchAndRewrite(AtParallelExitOp op, PatternRewriter &rewriter) const final { auto parent = op->getParentOp(); - Operation *secondLast = nullptr; - while (parent && (llvm::isa(parent) || - llvm::isa(parent))) { - secondLast = parent; + omp::ParallelOp parallel; + while (parent) { + parallel = llvm::dyn_cast(parent); + if (parallel) { + break; + } parent = parent->getParentOp(); } - auto parallel = llvm::dyn_cast(parent); if (!parallel) { return failure(); } - assert(secondLast->getBlock()); - auto itr = secondLast->getBlock()->end(); + auto &block = parallel.getRegion().front(); + auto itr = block.end(); --itr; - rewriter.inlineBlockBefore(&op->getRegion(0).getBlocks().front(), - secondLast->getBlock(), itr); + rewriter.inlineBlockBefore(&op->getRegion(0).getBlocks().front(), &block, + itr); rewriter.eraseOp(op); return success(); } diff --git a/test/gc/Dialect/CPURuntime/cpuruntime-atexit-to-omp.mlir b/test/gc/Dialect/CPURuntime/cpuruntime-atexit-to-omp.mlir index 401de95cc..172777690 100644 --- a/test/gc/Dialect/CPURuntime/cpuruntime-atexit-to-omp.mlir +++ b/test/gc/Dialect/CPURuntime/cpuruntime-atexit-to-omp.mlir @@ -10,18 +10,21 @@ module { memref.copy %arg0, %alloc : memref<512x512xf32> to memref<512x512xf32> %0 = llvm.mlir.constant(1 : i64) : i64 omp.parallel { - omp.wsloop for (%arg1, %arg2) : index = (%c0, %c0) to (%c1, %c512) step (%c1, %c1) { - memref.alloca_scope { - %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<512xf32> - %subview = memref.subview %alloc[%arg1, 0] [1, 512] [1, 1] : memref<512x512xf32> to memref<512xf32, strided<[1], offset: ?>> - memref.copy %alloc_0, %subview : memref<512xf32> to memref<512xf32, strided<[1], offset: ?>> - memref.dealloc %alloc_0 : memref<512xf32> - cpuruntime.at_parallel_exit { - memref.prefetch %alloc[%c1,%c0], read, locality<3>, data : memref<512x512xf32> - cpuruntime.parallel_exit.return + omp.wsloop { + omp.loop_nest (%arg1, %arg2) : index = (%c0, %c0) to (%c1, %c512) step (%c1, %c1) { + memref.alloca_scope { + %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<512xf32> + %subview = memref.subview %alloc[%arg1, 0] [1, 512] [1, 1] : memref<512x512xf32> to memref<512xf32, strided<[1], offset: ?>> + memref.copy %alloc_0, %subview : memref<512xf32> to memref<512xf32, strided<[1], offset: ?>> + memref.dealloc %alloc_0 : memref<512xf32> + cpuruntime.at_parallel_exit { + memref.prefetch %alloc[%c1,%c0], read, locality<3>, data : memref<512x512xf32> + cpuruntime.parallel_exit.return + } } + omp.yield } - omp.yield + omp.terminator } memref.prefetch %alloc[%c0,%c0], read, locality<3>, data : memref<512x512xf32> omp.terminator @@ -30,7 +33,7 @@ module { // CHECK-DAG: %[[C0:.*]] = arith.constant 0 // CHECK: omp.parallel // CHECK-NEXT: omp.wsloop - // CHECK-NEXT: memref.alloca_scope + // CHECK: memref.alloca_scope // CHECK-NOT: cpuruntime.at_parallel_exit // CHECK: omp.yield // CHECK: memref.prefetch {{%alloc}}[%[[C0]], %[[C0]]] From 1cfede8e24231618e69ef692c3b1a0120d5a8857 Mon Sep 17 00:00:00 2001 From: "Mei, Yijie" Date: Wed, 15 May 2024 15:24:41 +0800 Subject: [PATCH 05/19] add --- include/gc/Transforms/Passes.h | 26 +++ include/gc/Transforms/Passes.td | 13 ++ lib/gc/Transforms/CMakeLists.txt | 1 + lib/gc/Transforms/Pipeline.cpp | 164 +++++++++++++++++++ test/gc/Transforms/Pipeline/run.mlir | 23 +++ test/gc/Transforms/Pipeline/tensor_args.mlir | 13 ++ 6 files changed, 240 insertions(+) create mode 100644 lib/gc/Transforms/Pipeline.cpp create mode 100644 test/gc/Transforms/Pipeline/run.mlir create mode 100644 test/gc/Transforms/Pipeline/tensor_args.mlir diff --git a/include/gc/Transforms/Passes.h b/include/gc/Transforms/Passes.h index 243a6f4f6..e5e4aee33 100644 --- a/include/gc/Transforms/Passes.h +++ b/include/gc/Transforms/Passes.h @@ -12,8 +12,34 @@ #include "mlir/Pass/Pass.h" namespace mlir { + +namespace LLVM { +class LLVMDialect; +} + +namespace scf { +class SCFDialect; +} + +namespace openmp { +class OpenMPDialect; +} + +namespace linalg { +class LinalgDialect; +} + +namespace MemRef { +class MemRefDialect; +} + +class PassManager; + namespace gc { +void populateFrontendPasses(mlir::PassManager &); +void populateCPUPipeline(mlir::PassManager &); + #define GEN_PASS_DECL #include "gc/Transforms/Passes.h.inc" diff --git a/include/gc/Transforms/Passes.td b/include/gc/Transforms/Passes.td index d31baa5a7..ff0cd8a90 100644 --- a/include/gc/Transforms/Passes.td +++ b/include/gc/Transforms/Passes.td @@ -31,4 +31,17 @@ def ConvertOneDNNGraphToLinalg : Pass<"convert-onednn-graph-to-linalg"> { ]; } +def GCCPUPipeline: Pass<"gc-cpu-pipeline"> { + let summary = "All-in-one pipeline for GC for CPU"; + let dependentDialects = ["onednn_graph::OneDNNGraphDialect", + "tensor::TensorDialect", + "memref::MemRefDialect", + "linalg::LinalgDialect", + "LLVM::LLVMDialect", + "scf::SCFDialect", + "bufferization::BufferizationDialect", + "omp::OpenMPDialect", + "vector::VectorDialect"]; +} + #endif // GC_DIALECT_GC_PASSES diff --git a/lib/gc/Transforms/CMakeLists.txt b/lib/gc/Transforms/CMakeLists.txt index e7e97ea26..f3fa43e04 100644 --- a/lib/gc/Transforms/CMakeLists.txt +++ b/lib/gc/Transforms/CMakeLists.txt @@ -6,6 +6,7 @@ gc_set_mlir_link_components(MLIR_LINK_COMPONENTS add_mlir_library(GCPasses OneDNNGraphToLinalg.cpp + Pipeline.cpp TileNamed.cpp ADDITIONAL_HEADER_DIRS diff --git a/lib/gc/Transforms/Pipeline.cpp b/lib/gc/Transforms/Pipeline.cpp new file mode 100644 index 000000000..ed50925f6 --- /dev/null +++ b/lib/gc/Transforms/Pipeline.cpp @@ -0,0 +1,164 @@ +//===- Pipeline.cpp - Graph Compiler all-in-one pipeline --------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Conversion/Passes.h" +#include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h" +#include "mlir/Dialect/Bufferization/Transforms/Passes.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/Linalg/Passes.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/MemRef/Transforms/Passes.h" +#include "mlir/Dialect/OpenMP/OpenMPDialect.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/InitAllPasses.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Support/LogicalResult.h" +#include "mlir/Transforms/Passes.h" + +#include "gc/Dialect/OneDNNGraph/OneDNNGraphDialect.h" +#include "gc/Transforms/Passes.h" + +namespace mlir::gc { + +void populateFrontendPasses(mlir::PassManager &pm) { + // pm.addPass(onednn_graph::createConvertOneDNNGraphToLinalg()); +} +// linalg + linalgX + tensor ==> GC V1 GIR + +void populateTensorPasses(mlir::PassManager &pm) { + // + padding propagation pass, upstream-able 127x127 -> tilling size:32 + // ->padding to 128x128 + // + layout propagation pass, upstream-able 4x32x4x32 -> + // tensor.pack/tensor.unpack + // + tensor constant propagation pass, down-stream pass, designed to support + // oneDNN graph spec + // + linalg.matmul lowering to (scf.loop + linalg.brgemm) pass, upstream-able + // + fine-grain fusion pass, upstream-able -> scf.for + linalgx.mask + // + lower linalg to arith/math on virtual vector pass, up-streamable + + // REMOVE this pass after the above passes are added. Currently we add this + // pass to make the pipeline work properly + pm.addNestedPass(createLinalgGeneralizeNamedOpsPass()); +} +// scf + arith + math + vector + tensor + linalg.brgemm + tensor.pack/unpack ==> +// GC V1 TIR + +void populateVectorPasses(mlir::PassManager &pm) { + // + bf16 promotion pass, down-stream pass, device dependent pass, maybe can + // upstream + // + bf16 cast elimilation pass, down-stream pass, fast-math kind pass, + // designed to support oneDNN graph spec + pm.addNestedPass(arith::createArithExpandOpsPass()); + // + lower to physical vector pass, down-stream pass, device dependent pass, + // maybe can upstream +} +// scf + arith + math + vector + tensor + linalg.brgemm + +void populateBufferizationPasses(mlir::PassManager &pm) { + bufferization::OneShotBufferizationOptions options; + pm.addPass(bufferization::createOneShotBufferizePass(options)); + pm.addPass(createCSEPass()); + pm.addPass(mlir::func::createFuncBufferizePass()); + pm.addPass(bufferization::createBufferResultsToOutParamsPass()); + pm.addNestedPass( + bufferization::createBufferizationBufferizePass()); + pm.addNestedPass( + bufferization::createFinalizingBufferizePass()); + // + buffer schedule pass, down-stream pass, to migrate buffer reschedule pass + // from GC V1. + pm.addNestedPass( + bufferization::createBufferHoistingPass()); // Need to improve this pass + // to support thread-local + // allocator. + pm.addNestedPass(bufferization::createBufferLoopHoistingPass()); + pm.addNestedPass(bufferization::createBufferDeallocationPass()); + pm.addPass(createBufferizationToMemRefPass()); +} +// scf + arith + math + vector + memref + linalg.brgemm + +void populateMicroKernelPasses(mlir::PassManager &pm) { + // + ConvertLinalgToMicrokernel pass, upstream-able, + // + CleanupInvalidMicrokernel pass, upstream-able + // + InvariantMicrokernelMotion pass, upstream-able + // + ConvertMicrokernelToDnnlFunc, down-stream pass, to lower brgemm to dnnl + // call + // + ConvertMicrokernelToXsmm, down-stream pass, to lower brgemm to libxsmm + // call + // + LowerMicrokernel pass, upstream-able + // + DispatchMicrokernel, down-stream pass +} +// scf + arith + math + vector + memref + func/microkernel + +void populateCPURuntimePasses(mlir::PassManager &pm) { + // + flatten nested parallel pass, down-stream pass, to support coarse-grain + // fusion + // pm.addNestedPass(parallelcpu::createParallelCPUAtExitToOmp()); + // remove this pass after we add FlattenNestedParallel + pm.addPass(createConvertSCFToOpenMPPass()); +} + +void populateLoweringToLLVMPasses(mlir::PassManager &pm) { + pm.addPass(createConvertSCFToCFPass()); + // pm.addPass(parallelcpu::createParallelCPUToLLVM()); + pm.addPass(createConvertOpenMPToLLVMPass()); + pm.addNestedPass(createConvertMathToLLVMPass()); + pm.addPass(createConvertMathToLibmPass()); + pm.addPass(createFinalizeMemRefToLLVMConversionPass()); + pm.addNestedPass(createArithToLLVMConversionPass()); + pm.addPass(createConvertFuncToLLVMPass()); + pm.addPass(createConvertControlFlowToLLVMPass()); + pm.addPass(createCSEPass()); + pm.addPass(createCanonicalizerPass()); + pm.addPass(createReconcileUnrealizedCastsPass()); + pm.addPass(createSymbolDCEPass()); +} + +void populateLLVMPasses(mlir::PassManager &pm) { + pm.addPass(memref::createExpandOpsPass()); + pm.addPass(memref::createExpandStridedMetadataPass()); + populateLoweringToLLVMPasses(pm); +} + +void populateCPUPipeline(mlir::PassManager &pm) { + // front-end, oneDNN graph dialect + populateFrontendPasses(pm); + // middle-end, LinalgX/Linalg/tensor dialects + populateTensorPasses(pm); + // middle-end, arith/math/vector dialects + populateVectorPasses(pm); + // back-end, arith/math/vector/memref dialects + populateBufferizationPasses(pm); + // REMOVE this pass after the TensorPasses are added. Currently we add this + // pass to make the pipeline work properly + pm.addNestedPass(createConvertLinalgToParallelLoopsPass()); + populateMicroKernelPasses(pm); + populateCPURuntimePasses(pm); + // // back-end, llvm dialect + populateLLVMPasses(pm); +} + +#define GEN_PASS_DEF_GCCPUPIPELINE +#include "gc/Transforms/Passes.h.inc" +namespace { + +class GCCPUPipeline : public impl::GCCPUPipelineBase { +public: + friend struct PassHelper; + using impl::GCCPUPipelineBase::GCCPUPipelineBase; + void runOnOperation() final { + auto op = getOperation(); + PassManager pm{op->getContext()}; + populateCPUPipeline(pm); + if (failed(pm.run(op))) + signalPassFailure(); + } +}; + +} // namespace +} // namespace mlir::gc diff --git a/test/gc/Transforms/Pipeline/run.mlir b/test/gc/Transforms/Pipeline/run.mlir new file mode 100644 index 000000000..799935006 --- /dev/null +++ b/test/gc/Transforms/Pipeline/run.mlir @@ -0,0 +1,23 @@ +// RUN: gc-opt %s --gc-cpu-pipeline | gc-cpu-runner -e main -entry-point-result=void | FileCheck %s + +module { +func.func @aaa() -> tensor<128xf32> { + %c2 = arith.constant 2.0 : f32 + %a = tensor.empty() : tensor<128xf32> + %2 = linalg.fill ins(%c2 : f32) outs(%a : tensor<128xf32>) -> tensor<128xf32> + return %2 : tensor<128xf32> +} + +func.func @main() { + %result = call @aaa() : ()-> tensor<128xf32> + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + scf.for %iv = %c0 to %c128 step %c1 { + %4 = tensor.extract %result[%iv] : tensor<128xf32> + parallelcpu.printf "%f\n" %4 : f32 + } + return +} +// CHECK-COUNT-128: 2.000000 +} \ No newline at end of file diff --git a/test/gc/Transforms/Pipeline/tensor_args.mlir b/test/gc/Transforms/Pipeline/tensor_args.mlir new file mode 100644 index 000000000..73d916d04 --- /dev/null +++ b/test/gc/Transforms/Pipeline/tensor_args.mlir @@ -0,0 +1,13 @@ +// RUN: gc-opt %s --gc-cpu-pipeline | FileCheck %s + +module { +// CHECK: aaa +// check that the func returns void +// CHECK-NOT: ) -> !llvm.struct< +func.func @aaa(%a: tensor<128xf32>, %b: tensor<128xf32>) -> tensor<128xf32> { + %out = tensor.empty() : tensor<128xf32> + %2 = linalg.add ins(%a, %b : tensor<128xf32>,tensor<128xf32>) outs(%out : tensor<128xf32>) -> tensor<128xf32> + // CHECK: memcpy + return %out : tensor<128xf32> +} +} \ No newline at end of file From 475faf8052309cfd9e170f61e2622d5c4cd7a5ad Mon Sep 17 00:00:00 2001 From: "Mei, Yijie" Date: Wed, 15 May 2024 15:46:46 +0800 Subject: [PATCH 06/19] update --- lib/gc/Transforms/Pipeline.cpp | 9 ++++++--- test/gc/Transforms/Pipeline/run.mlir | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/lib/gc/Transforms/Pipeline.cpp b/lib/gc/Transforms/Pipeline.cpp index ed50925f6..8b74df1b9 100644 --- a/lib/gc/Transforms/Pipeline.cpp +++ b/lib/gc/Transforms/Pipeline.cpp @@ -21,6 +21,7 @@ #include "mlir/Support/LogicalResult.h" #include "mlir/Transforms/Passes.h" +#include "gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.h" #include "gc/Dialect/OneDNNGraph/OneDNNGraphDialect.h" #include "gc/Transforms/Passes.h" @@ -65,7 +66,9 @@ void populateBufferizationPasses(mlir::PassManager &pm) { pm.addPass(bufferization::createOneShotBufferizePass(options)); pm.addPass(createCSEPass()); pm.addPass(mlir::func::createFuncBufferizePass()); - pm.addPass(bufferization::createBufferResultsToOutParamsPass()); + bufferization::BufferResultsToOutParamsOpts opt{}; + // opt.hoistStaticAllocs = true; + pm.addPass(bufferization::createBufferResultsToOutParamsPass(opt)); pm.addNestedPass( bufferization::createBufferizationBufferizePass()); pm.addNestedPass( @@ -98,14 +101,14 @@ void populateMicroKernelPasses(mlir::PassManager &pm) { void populateCPURuntimePasses(mlir::PassManager &pm) { // + flatten nested parallel pass, down-stream pass, to support coarse-grain // fusion - // pm.addNestedPass(parallelcpu::createParallelCPUAtExitToOmp()); + pm.addNestedPass(cpuruntime::createCPURuntimeAtExitToOmp()); // remove this pass after we add FlattenNestedParallel pm.addPass(createConvertSCFToOpenMPPass()); } void populateLoweringToLLVMPasses(mlir::PassManager &pm) { pm.addPass(createConvertSCFToCFPass()); - // pm.addPass(parallelcpu::createParallelCPUToLLVM()); + pm.addPass(cpuruntime::createCPURuntimeToLLVM()); pm.addPass(createConvertOpenMPToLLVMPass()); pm.addNestedPass(createConvertMathToLLVMPass()); pm.addPass(createConvertMathToLibmPass()); diff --git a/test/gc/Transforms/Pipeline/run.mlir b/test/gc/Transforms/Pipeline/run.mlir index 799935006..71feb0843 100644 --- a/test/gc/Transforms/Pipeline/run.mlir +++ b/test/gc/Transforms/Pipeline/run.mlir @@ -15,7 +15,7 @@ func.func @main() { %c1 = arith.constant 1 : index scf.for %iv = %c0 to %c128 step %c1 { %4 = tensor.extract %result[%iv] : tensor<128xf32> - parallelcpu.printf "%f\n" %4 : f32 + cpuruntime.printf "%f\n" %4 : f32 } return } From 0ac087deb28d4ec1efcf53519d95e1700690467f Mon Sep 17 00:00:00 2001 From: "Mei, Yijie" Date: Wed, 15 May 2024 17:02:27 +0800 Subject: [PATCH 07/19] fix --- src/gc-opt/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/gc-opt/CMakeLists.txt b/src/gc-opt/CMakeLists.txt index 74eb4b28e..36ace6847 100644 --- a/src/gc-opt/CMakeLists.txt +++ b/src/gc-opt/CMakeLists.txt @@ -15,7 +15,9 @@ endif() set(gc_opt_libs ${dialect_libs} ${conversion_libs} - MLIROptLib) + ${MLIR_LINK_COMPONENTS} + GCPasses) + if(GC_MLIR_CXX_FLAGS) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GC_MLIR_CXX_FLAGS}") endif() From 74b0d342fba8e5896a9ca07bf57b449f34911155 Mon Sep 17 00:00:00 2001 From: "Mei, Yijie" Date: Thu, 16 May 2024 11:00:23 +0800 Subject: [PATCH 08/19] remove at exit --- .../gc/Dialect/CPURuntime/IR/CPURuntimeOps.td | 41 +--------- .../CPURuntime/Transforms/CPURuntimePasses.td | 35 --------- .../Dialect/CPURuntime/IR/CPURuntimeOps.cpp | 34 -------- .../CPURuntime/Transforms/CMakeLists.txt | 1 - .../Transforms/CPURuntimePasses.cpp | 78 ------------------- .../CPURuntime/cpuruntime-atexit-to-omp.mlir | 44 ----------- 6 files changed, 1 insertion(+), 232 deletions(-) delete mode 100644 lib/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.cpp delete mode 100644 test/gc/Dialect/CPURuntime/cpuruntime-atexit-to-omp.mlir diff --git a/include/gc/Dialect/CPURuntime/IR/CPURuntimeOps.td b/include/gc/Dialect/CPURuntime/IR/CPURuntimeOps.td index cc1b7c555..bd77ad997 100644 --- a/include/gc/Dialect/CPURuntime/IR/CPURuntimeOps.td +++ b/include/gc/Dialect/CPURuntime/IR/CPURuntimeOps.td @@ -10,46 +10,7 @@ #define CPURUNTIME_OPS include "gc/Dialect/CPURuntime/IR/CPURuntimeDialect.td" -include "mlir/Interfaces/InferTypeOpInterface.td" include "mlir/Interfaces/SideEffectInterfaces.td" -include "mlir/Interfaces/DestinationStyleOpInterface.td" -include "mlir/Interfaces/ControlFlowInterfaces.td" -include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.td" - - -def CPURuntime_AtParallelExitOp : CPURuntime_Op<"at_parallel_exit", [ - ParentOneOf<["scf::ForallOp", "scf::ParallelOp", "omp::WsloopOp", "memref::AllocaScopeOp"]>, - SingleBlockImplicitTerminator<"ParallelExitReturnOp"> - ]> { - let summary = "Runs the block once in all threads at the exit of the parallel section"; - let description = [{ - It executes the block for each thread working in the parallel section for - once, at the exit of parallel section. - }]; - - let regions = (region SizedRegion<1>:$region); - - let hasCustomAssemblyFormat = 1; - - // The default builder does not add a region with an empty body, add our own. - let skipDefaultBuilders = 1; - let builders = [ - OpBuilder<(ins)>, - ]; -} - -def CPURuntime_ParallelExitReturnOp : CPURuntime_Op<"parallel_exit.return", [ - Pure, - HasParent<"AtParallelExitOp">, - Terminator, ReturnLike - ]> { - let summary = "Terminates at_parallel_exit block"; - let description = [{ - at_parallel_exit should ends with parallel_exit.return - }]; - let assemblyFormat = - [{ attr-dict }]; -} def CPURuntime_PrintfOp : CPURuntime_Op<"printf", [MemoryEffects<[MemWrite]>]>, @@ -61,7 +22,7 @@ def CPURuntime_PrintfOp : CPURuntime_Op<"printf", [MemoryEffects<[MemWrite]>]>, scalar arguments that should be printed. The format string is a C-style printf string, subject to any restrictions - imposed by one's target platform. + imposed by the target platform. }]; let assemblyFormat = [{ $format attr-dict ($args^ `:` type($args))? diff --git a/include/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.td b/include/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.td index 0685ce498..20c81e10a 100644 --- a/include/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.td +++ b/include/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.td @@ -11,41 +11,6 @@ include "mlir/Pass/PassBase.td" - -def CPURuntimeAtExitToOmp: Pass<"cpuruntime-atexit-to-omp", "::mlir::func::FuncOp"> { - let summary = "Lower at_parallel_exit to code in omp.parallel section"; - let description = [{ - Switches the name of a FuncOp named `bar` to `foo` and folds. - ``` - omp.parallel { - omp.wsloop for (%arg1, %arg2) : index = (%c0, %c0) to (%c1, %c512) step (%c1, %c1) { - memref.alloca_scope { - cpuruntime.at_parallel_exit { - "your.op"() - cpuruntime.parallel_exit.return - } - } - omp.yield - } - omp.terminator - } - ``` - Will be changed into - ``` - omp.parallel { - omp.wsloop for (%arg1, %arg2) : index = (%c0, %c0) to (%c1, %c512) step (%c1, %c1) { - memref.alloca_scope { - } - omp.yield - } - "your.op"() - omp.terminator - } - ``` - }]; -} - - def CPURuntimeToLLVM: Pass<"convert-cpuruntime-to-llvm"> { let summary = "Convert cpuruntime to LLVM dialect"; let description = [{ diff --git a/lib/gc/Dialect/CPURuntime/IR/CPURuntimeOps.cpp b/lib/gc/Dialect/CPURuntime/IR/CPURuntimeOps.cpp index ca632e9db..ed7bc6581 100644 --- a/lib/gc/Dialect/CPURuntime/IR/CPURuntimeOps.cpp +++ b/lib/gc/Dialect/CPURuntime/IR/CPURuntimeOps.cpp @@ -15,42 +15,8 @@ #include namespace mlir { -using namespace bufferization; - namespace cpuruntime { -void AtParallelExitOp::build(OpBuilder &b, OperationState &result) { - OpBuilder::InsertionGuard g(b); - Region *bodyRegion = result.addRegion(); - b.createBlock(bodyRegion); -} - -void AtParallelExitOp::print(OpAsmPrinter &p) { - p << " "; - p.printRegion(getRegion(), - /*printEntryBlockArgs=*/false, - /*printBlockTerminators=*/true); - p.printOptionalAttrDict(getOperation()->getAttrs()); -} - -ParseResult AtParallelExitOp::parse(OpAsmParser &parser, - OperationState &result) { - auto &builder = parser.getBuilder(); - - SmallVector regionOperands; - std::unique_ptr region = std::make_unique(); - if (parser.parseRegion(*region, regionOperands)) - return failure(); - - if (region->empty()) - OpBuilder(builder.getContext()).createBlock(region.get()); - result.addRegion(std::move(region)); - - // Parse the optional attribute list. - if (parser.parseOptionalAttrDict(result.attributes)) - return failure(); - return success(); -} } // namespace cpuruntime } // namespace mlir \ No newline at end of file diff --git a/lib/gc/Dialect/CPURuntime/Transforms/CMakeLists.txt b/lib/gc/Dialect/CPURuntime/Transforms/CMakeLists.txt index ee6148aa4..3bc84f6c8 100644 --- a/lib/gc/Dialect/CPURuntime/Transforms/CMakeLists.txt +++ b/lib/gc/Dialect/CPURuntime/Transforms/CMakeLists.txt @@ -1,5 +1,4 @@ add_mlir_dialect_library(MLIRCPURuntimeTransforms - CPURuntimePasses.cpp CPURuntimeToLLVM.cpp ADDITIONAL_HEADER_DIRS diff --git a/lib/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.cpp b/lib/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.cpp deleted file mode 100644 index a8f74c079..000000000 --- a/lib/gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.cpp +++ /dev/null @@ -1,78 +0,0 @@ -//===- CPURuntimePasses.cpp - CPU Runtime Passes ----------------*- C++ -*-===// -// -// This file is licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/IR/PatternMatch.h" -#include "mlir/Rewrite/FrozenRewritePatternSet.h" -#include "mlir/Support/LogicalResult.h" -#include "mlir/Transforms/GreedyPatternRewriteDriver.h" - -#include "gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.h" - -namespace mlir::cpuruntime { -#define GEN_PASS_DEF_CPURUNTIMEATEXITTOOMP -#include "gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.h.inc" - -namespace { - -class CPURuntimeAtExitToOmpRewriter - : public OpRewritePattern { -public: - using OpRewritePattern::OpRewritePattern; - LogicalResult matchAndRewrite(AtParallelExitOp op, - PatternRewriter &rewriter) const final { - auto parent = op->getParentOp(); - omp::ParallelOp parallel; - while (parent) { - parallel = llvm::dyn_cast(parent); - if (parallel) { - break; - } - parent = parent->getParentOp(); - } - if (!parallel) { - return failure(); - } - auto &block = parallel.getRegion().front(); - auto itr = block.end(); - --itr; - rewriter.inlineBlockBefore(&op->getRegion(0).getBlocks().front(), &block, - itr); - rewriter.eraseOp(op); - return success(); - } -}; - -class CPURuntimeExitReturnRewriter - : public OpRewritePattern { -public: - using OpRewritePattern::OpRewritePattern; - LogicalResult matchAndRewrite(ParallelExitReturnOp op, - PatternRewriter &rewriter) const final { - rewriter.eraseOp(op); - return success(); - } -}; - -class CPURuntimeAtExitToOmp - : public impl::CPURuntimeAtExitToOmpBase { -public: - using impl::CPURuntimeAtExitToOmpBase< - CPURuntimeAtExitToOmp>::CPURuntimeAtExitToOmpBase; - void runOnOperation() final { - RewritePatternSet patterns(&getContext()); - patterns.add(&getContext()); - patterns.add(&getContext()); - FrozenRewritePatternSet patternSet(std::move(patterns)); - if (failed(applyPatternsAndFoldGreedily(getOperation(), patternSet))) - signalPassFailure(); - } -}; - -} // namespace -} // namespace mlir::cpuruntime diff --git a/test/gc/Dialect/CPURuntime/cpuruntime-atexit-to-omp.mlir b/test/gc/Dialect/CPURuntime/cpuruntime-atexit-to-omp.mlir deleted file mode 100644 index 172777690..000000000 --- a/test/gc/Dialect/CPURuntime/cpuruntime-atexit-to-omp.mlir +++ /dev/null @@ -1,44 +0,0 @@ -// RUN: gc-opt %s --cpuruntime-atexit-to-omp | FileCheck %s - -module { - func.func @parallel_insert_slice(%arg0: memref<512x512xf32>) -> memref<512x512xf32> { - %cst = arith.constant 0.000000e+00 : f32 - %alloc = memref.alloc() {alignment = 64 : i64} : memref<512x512xf32> - %c512 = arith.constant 512 : index - %c1 = arith.constant 1 : index - %c0 = arith.constant 0 : index - memref.copy %arg0, %alloc : memref<512x512xf32> to memref<512x512xf32> - %0 = llvm.mlir.constant(1 : i64) : i64 - omp.parallel { - omp.wsloop { - omp.loop_nest (%arg1, %arg2) : index = (%c0, %c0) to (%c1, %c512) step (%c1, %c1) { - memref.alloca_scope { - %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<512xf32> - %subview = memref.subview %alloc[%arg1, 0] [1, 512] [1, 1] : memref<512x512xf32> to memref<512xf32, strided<[1], offset: ?>> - memref.copy %alloc_0, %subview : memref<512xf32> to memref<512xf32, strided<[1], offset: ?>> - memref.dealloc %alloc_0 : memref<512xf32> - cpuruntime.at_parallel_exit { - memref.prefetch %alloc[%c1,%c0], read, locality<3>, data : memref<512x512xf32> - cpuruntime.parallel_exit.return - } - } - omp.yield - } - omp.terminator - } - memref.prefetch %alloc[%c0,%c0], read, locality<3>, data : memref<512x512xf32> - omp.terminator - } - // CHECK-DAG: %[[C1:.*]] = arith.constant 1 - // CHECK-DAG: %[[C0:.*]] = arith.constant 0 - // CHECK: omp.parallel - // CHECK-NEXT: omp.wsloop - // CHECK: memref.alloca_scope - // CHECK-NOT: cpuruntime.at_parallel_exit - // CHECK: omp.yield - // CHECK: memref.prefetch {{%alloc}}[%[[C0]], %[[C0]]] - // CHECK-NEXT: memref.prefetch {{%alloc}}[%[[C1]], %[[C0]]] - // CHECK-NEXT: omp.terminator - return %alloc : memref<512x512xf32> - } -} From 2cebba99452bd68876269b77293838b3a263f112 Mon Sep 17 00:00:00 2001 From: "Mei, Yijie" Date: Thu, 16 May 2024 11:01:58 +0800 Subject: [PATCH 09/19] fix lint --- lib/gc/Dialect/CPURuntime/IR/CPURuntimeOps.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/lib/gc/Dialect/CPURuntime/IR/CPURuntimeOps.cpp b/lib/gc/Dialect/CPURuntime/IR/CPURuntimeOps.cpp index ed7bc6581..460c421cc 100644 --- a/lib/gc/Dialect/CPURuntime/IR/CPURuntimeOps.cpp +++ b/lib/gc/Dialect/CPURuntime/IR/CPURuntimeOps.cpp @@ -15,8 +15,5 @@ #include namespace mlir { -namespace cpuruntime { - - -} // namespace cpuruntime +namespace cpuruntime {} // namespace cpuruntime } // namespace mlir \ No newline at end of file From 34d10ea127052b423b3384c2ada95b3c30a1eb36 Mon Sep 17 00:00:00 2001 From: "Mei, Yijie" Date: Thu, 16 May 2024 13:29:48 +0800 Subject: [PATCH 10/19] Add kmp_* wrapper for gomp environment --- lib/gc/CMakeLists.txt | 3 +- lib/gc/ExecutionEngine/CMakeLists.txt | 1 + .../ExecutionEngine/CPURuntime/CMakeLists.txt | 15 ++ .../ExecutionEngine/CPURuntime/Parallel.cpp | 189 ++++++++++++++++++ src/gc-cpu-runner/CMakeLists.txt | 3 +- src/gc-cpu-runner/gc-cpu-runner.cpp | 4 + test/gc/cpu-runner/tid.mlir | 37 ++++ 7 files changed, 250 insertions(+), 2 deletions(-) create mode 100644 lib/gc/ExecutionEngine/CMakeLists.txt create mode 100644 lib/gc/ExecutionEngine/CPURuntime/CMakeLists.txt create mode 100644 lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp create mode 100644 test/gc/cpu-runner/tid.mlir diff --git a/lib/gc/CMakeLists.txt b/lib/gc/CMakeLists.txt index fd78d6cab..921853a08 100644 --- a/lib/gc/CMakeLists.txt +++ b/lib/gc/CMakeLists.txt @@ -5,4 +5,5 @@ endif() include(functions) add_subdirectory(Dialect) -add_subdirectory(Transforms) \ No newline at end of file +add_subdirectory(Transforms) +add_subdirectory(ExecutionEngine) \ No newline at end of file diff --git a/lib/gc/ExecutionEngine/CMakeLists.txt b/lib/gc/ExecutionEngine/CMakeLists.txt new file mode 100644 index 000000000..8aa223412 --- /dev/null +++ b/lib/gc/ExecutionEngine/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(CPURuntime) diff --git a/lib/gc/ExecutionEngine/CPURuntime/CMakeLists.txt b/lib/gc/ExecutionEngine/CPURuntime/CMakeLists.txt new file mode 100644 index 000000000..6be58e28f --- /dev/null +++ b/lib/gc/ExecutionEngine/CPURuntime/CMakeLists.txt @@ -0,0 +1,15 @@ +find_package(OpenMP REQUIRED) + +if ("iomp" IN_LIST OpenMP_C_LIB_NAMES OR "omp" IN_LIST OpenMP_C_LIB_NAMES OR "omp5" IN_LIST OpenMP_C_LIB_NAMES) +else() + add_definitions("-DGC_NEEDS_OMP_WRAPPER=1") +endif() + +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fopenmp") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") +add_mlir_library(GCCpuRuntime + SHARED + Parallel.cpp + + EXCLUDE_FROM_LIBMLIR + ) \ No newline at end of file diff --git a/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp b/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp new file mode 100644 index 000000000..a71b8522c --- /dev/null +++ b/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp @@ -0,0 +1,189 @@ +//===- Parallel.cpp - Definitions for parallel runtime -----------*- C++ -*-=// +//-*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include + +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +#define WEAK_SYMBOL __attribute__((weak)) + +namespace { +struct barrier_t { + alignas(64) std::atomic pending_; + std::atomic rounds_; + uint64_t total_; + // pad barrier to size of cacheline to avoid false sharing + char padding_[64 - 4 * sizeof(int32_t)]; +}; + +typedef uint64_t (*barrier_idle_func)(std::atomic *remaining, + int32_t expected_remain, int32_t tid, + void *args); +} // namespace + +extern "C" { +int gc_runtime_keep_alive = 0; +void gc_arrive_at_barrier(barrier_t *b, barrier_idle_func idle_func, + void *idle_args) { + auto cur_round = b->rounds_.load(std::memory_order_acquire); + auto cnt = --b->pending_; + assert(cnt >= 0); + // int count = 0; + if (cnt == 0) { + b->pending_.store(b->total_); + b->rounds_.store(cur_round + 1); + } else { + if (idle_func) { + if (cur_round != b->rounds_.load()) { + return; + } + idle_func(&b->rounds_, cur_round + 1, -1, idle_args); + // count = ret & 0xffffffff; + } + while (cur_round == b->rounds_.load()) { + _mm_pause(); + } + } +} + +static_assert(sizeof(barrier_t) == 64, "size of barrier_t should be 64-byte"); + +void gc_init_barrier(barrier_t *b, int num_barriers, uint64_t thread_count) { + for (int i = 0; i < num_barriers; i++) { + b[i].total_ = thread_count; + b[i].pending_.store(thread_count); + b[i].rounds_.store(0); + } +} + +#if GC_NEEDS_OMP_WRAPPER +void WEAK_SYMBOL __kmpc_barrier(void *loc, int32_t global_tid) { +#pragma omp barrier +} + +int WEAK_SYMBOL __kmpc_global_thread_num(void *loc) { + return omp_get_thread_num(); +} + +void WEAK_SYMBOL __kmpc_for_static_init_8u(void *loc, int32_t gtid, + int32_t schedtype, + int32_t *plastiter, uint64_t *plower, + uint64_t *pupper, int64_t *pstride, + int64_t incr, int64_t chunk) { + if (unlikely(schedtype != 34)) { + std::abort(); + } + const int32_t FALSE = 0; + const int32_t TRUE = 0; + using UT = uint64_t; + // using ST = int64_t; + /* this all has to be changed back to TID and such.. */ + uint32_t tid = gtid; + uint32_t nth = omp_get_num_threads(); + UT trip_count; + + /* special handling for zero-trip loops */ + if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) { + if (plastiter != nullptr) + *plastiter = FALSE; + /* leave pupper and plower set to entire iteration space */ + *pstride = incr; /* value should never be used */ + return; + } + + if (nth == 1) { + if (plastiter != nullptr) + *plastiter = TRUE; + *pstride = + (incr > 0) ? (*pupper - *plower + 1) : (-(*plower - *pupper + 1)); + return; + } + + /* compute trip count */ + if (incr == 1) { + trip_count = *pupper - *plower + 1; + } else if (incr == -1) { + trip_count = *plower - *pupper + 1; + } else if (incr > 0) { + // upper-lower can exceed the limit of signed type + trip_count = (UT)(*pupper - *plower) / incr + 1; + } else { + trip_count = (UT)(*plower - *pupper) / (-incr) + 1; + } + if (trip_count < nth) { + if (tid < trip_count) { + *pupper = *plower = *plower + tid * incr; + } else { + // set bounds so non-active threads execute no iterations + *plower = *pupper + (incr > 0 ? 1 : -1); + } + if (plastiter != nullptr) + *plastiter = (tid == trip_count - 1); + } else { + UT small_chunk = trip_count / nth; + UT extras = trip_count % nth; + *plower += incr * (tid * small_chunk + (tid < extras ? tid : extras)); + *pupper = *plower + small_chunk * incr - (tid < extras ? 0 : incr); + if (plastiter != nullptr) + *plastiter = (tid == nth - 1); + } + *pstride = trip_count; +} + +void WEAK_SYMBOL __kmpc_for_static_fini(void *ptr, int32_t v) {} + +static thread_local int next_num_threads = 0; + +/*! +@ingroup PARALLEL +The type for a microtask which gets passed to @ref __kmpc_fork_call(). +The arguments to the outlined function are +@param global_tid the global thread identity of the thread executing the +function. +@param bound_tid the local identity of the thread executing the function +@param ... pointers to shared variables accessed by the function. +*/ +using kmpc_micro = void (*)(int32_t *global_tid, int32_t *bound_tid, ...); +void WEAK_SYMBOL __kmpc_fork_call(void *loc, int32_t argc, void *pfunc, ...) { + if (unlikely(argc != 1 && argc != 0)) { + std::abort(); + } + va_list ap; + va_start(ap, pfunc); + void *c = va_arg(ap, void *); + int32_t global_tid = 0; + if (unlikely(next_num_threads)) { +#pragma omp parallel num_threads(next_num_threads) + { + kmpc_micro func = (kmpc_micro)(pfunc); + func(&global_tid, nullptr, c); + } + next_num_threads = 0; + } else { +#pragma omp parallel + { + kmpc_micro func = (kmpc_micro)(pfunc); + func(&global_tid, nullptr, c); + } + } + va_end(ap); +} + +void WEAK_SYMBOL __kmpc_push_num_threads(void *loc, int32_t global_tid, + int32_t num_threads) { + next_num_threads = num_threads; +} +#endif +} diff --git a/src/gc-cpu-runner/CMakeLists.txt b/src/gc-cpu-runner/CMakeLists.txt index f3f768612..85dbb6995 100644 --- a/src/gc-cpu-runner/CMakeLists.txt +++ b/src/gc-cpu-runner/CMakeLists.txt @@ -36,7 +36,8 @@ endif() #LLVM_LINK_COMPONENTS is processed by LLVM cmake in add_llvm_executable set(gc_cpu_runner_libs - ${MLIR_LINK_COMPONENTS}) + ${MLIR_LINK_COMPONENTS} + GCCpuRuntime) add_mlir_tool(gc-cpu-runner gc-cpu-runner.cpp ) diff --git a/src/gc-cpu-runner/gc-cpu-runner.cpp b/src/gc-cpu-runner/gc-cpu-runner.cpp index 3ece8f2ff..353abffe9 100644 --- a/src/gc-cpu-runner/gc-cpu-runner.cpp +++ b/src/gc-cpu-runner/gc-cpu-runner.cpp @@ -27,7 +27,11 @@ #include "llvm/Support/TargetSelect.h" #include +extern int gc_runtime_keep_alive; + int main(int argc, char **argv) { + // keeps GCCPURuntime linked + gc_runtime_keep_alive = 0; llvm::InitLLVM y(argc, argv); llvm::InitializeNativeTarget(); llvm::InitializeNativeTargetAsmPrinter(); diff --git a/test/gc/cpu-runner/tid.mlir b/test/gc/cpu-runner/tid.mlir new file mode 100644 index 000000000..aedcc0a20 --- /dev/null +++ b/test/gc/cpu-runner/tid.mlir @@ -0,0 +1,37 @@ +// RUN: gc-opt %s --convert-cpuruntime-to-llvm --convert-openmp-to-llvm --convert-func-to-llvm --convert-arith-to-llvm --convert-cf-to-llvm --reconcile-unrealized-casts | gc-cpu-runner -e main -entry-point-result=void | FileCheck %s +module { + func.func private @omp_get_thread_num() -> i32 + + func.func @check_parallel() { + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %0 = llvm.mlir.constant(1 : i64) : i64 + omp.parallel num_threads(%c8: index) { + omp.wsloop { + omp.loop_nest (%arg1, %arg2) : index = (%c0, %c0) to (%c1, %c64) step (%c1, %c1) { + cpuruntime.printf "ITR %zu\n" %arg2 : index + omp.yield + } + omp.terminator + } + %tid = func.call @omp_get_thread_num() : () -> i32 + cpuruntime.printf "EXIT %d\n" %tid : i32 + omp.terminator + } + return + } + + func.func @main() { + %0 = func.call @omp_get_thread_num() : () -> i32 + cpuruntime.printf "TID %d\n" %0 : i32 + call @check_parallel() : ()->() + return + } + // CHECK: TID 0 + // CHECK-COUNT-64: ITR {{[0-9]+}} + // CHECK-NOT: ITR + // CHECK-COUNT-8: EXIT {{[0-9]+}} + // CHECK-NOT: EXIT +} \ No newline at end of file From 80a597f0887889f1721c83e502de97e0c05dac97 Mon Sep 17 00:00:00 2001 From: "Mei, Yijie" Date: Thu, 16 May 2024 14:18:55 +0800 Subject: [PATCH 11/19] fix --- lib/gc/Transforms/Pipeline.cpp | 7 +++---- test/gc/Transforms/Pipeline/tensor_args.mlir | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/lib/gc/Transforms/Pipeline.cpp b/lib/gc/Transforms/Pipeline.cpp index 8b74df1b9..b336684a1 100644 --- a/lib/gc/Transforms/Pipeline.cpp +++ b/lib/gc/Transforms/Pipeline.cpp @@ -66,13 +66,13 @@ void populateBufferizationPasses(mlir::PassManager &pm) { pm.addPass(bufferization::createOneShotBufferizePass(options)); pm.addPass(createCSEPass()); pm.addPass(mlir::func::createFuncBufferizePass()); - bufferization::BufferResultsToOutParamsOpts opt{}; - // opt.hoistStaticAllocs = true; - pm.addPass(bufferization::createBufferResultsToOutParamsPass(opt)); pm.addNestedPass( bufferization::createBufferizationBufferizePass()); pm.addNestedPass( bufferization::createFinalizingBufferizePass()); + bufferization::BufferResultsToOutParamsOpts opt{}; + opt.hoistStaticAllocs = true; + pm.addPass(bufferization::createBufferResultsToOutParamsPass(opt)); // + buffer schedule pass, down-stream pass, to migrate buffer reschedule pass // from GC V1. pm.addNestedPass( @@ -101,7 +101,6 @@ void populateMicroKernelPasses(mlir::PassManager &pm) { void populateCPURuntimePasses(mlir::PassManager &pm) { // + flatten nested parallel pass, down-stream pass, to support coarse-grain // fusion - pm.addNestedPass(cpuruntime::createCPURuntimeAtExitToOmp()); // remove this pass after we add FlattenNestedParallel pm.addPass(createConvertSCFToOpenMPPass()); } diff --git a/test/gc/Transforms/Pipeline/tensor_args.mlir b/test/gc/Transforms/Pipeline/tensor_args.mlir index 73d916d04..adcfb3bd8 100644 --- a/test/gc/Transforms/Pipeline/tensor_args.mlir +++ b/test/gc/Transforms/Pipeline/tensor_args.mlir @@ -7,7 +7,7 @@ module { func.func @aaa(%a: tensor<128xf32>, %b: tensor<128xf32>) -> tensor<128xf32> { %out = tensor.empty() : tensor<128xf32> %2 = linalg.add ins(%a, %b : tensor<128xf32>,tensor<128xf32>) outs(%out : tensor<128xf32>) -> tensor<128xf32> - // CHECK: memcpy - return %out : tensor<128xf32> + // CHECK-NOT: memcpy + return %2 : tensor<128xf32> } } \ No newline at end of file From 0b4332b2fe3a1e0c481fa1bcfb67a86130241ac1 Mon Sep 17 00:00:00 2001 From: "Mei, Yijie" Date: Thu, 16 May 2024 17:06:44 +0800 Subject: [PATCH 12/19] fix --- lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp b/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp index a71b8522c..4a25a5ee0 100644 --- a/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp +++ b/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp @@ -40,7 +40,6 @@ void gc_arrive_at_barrier(barrier_t *b, barrier_idle_func idle_func, auto cur_round = b->rounds_.load(std::memory_order_acquire); auto cnt = --b->pending_; assert(cnt >= 0); - // int count = 0; if (cnt == 0) { b->pending_.store(b->total_); b->rounds_.store(cur_round + 1); @@ -50,7 +49,6 @@ void gc_arrive_at_barrier(barrier_t *b, barrier_idle_func idle_func, return; } idle_func(&b->rounds_, cur_round + 1, -1, idle_args); - // count = ret & 0xffffffff; } while (cur_round == b->rounds_.load()) { _mm_pause(); @@ -86,7 +84,7 @@ void WEAK_SYMBOL __kmpc_for_static_init_8u(void *loc, int32_t gtid, std::abort(); } const int32_t FALSE = 0; - const int32_t TRUE = 0; + const int32_t TRUE = 1; using UT = uint64_t; // using ST = int64_t; /* this all has to be changed back to TID and such.. */ From 382171bf38ec16e7b9f3edeb0d07834800aba98d Mon Sep 17 00:00:00 2001 From: "Mei, Yijie" Date: Thu, 23 May 2024 16:49:08 +0800 Subject: [PATCH 13/19] fix lint --- lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp | 7 +++---- src/gc-cpu-runner/CMakeLists.txt | 17 +++++++++++++++++ 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp b/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp index 4a25a5ee0..6efb38142 100644 --- a/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp +++ b/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp @@ -1,5 +1,4 @@ //===- Parallel.cpp - Definitions for parallel runtime -----------*- C++ -*-=// -//-*-===// // // This file is licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -28,9 +27,9 @@ struct barrier_t { char padding_[64 - 4 * sizeof(int32_t)]; }; -typedef uint64_t (*barrier_idle_func)(std::atomic *remaining, - int32_t expected_remain, int32_t tid, - void *args); +using barrier_idle_func = uint64_t (*)(std::atomic *remaining, + int32_t expected_remain, int32_t tid, + void *args); } // namespace extern "C" { diff --git a/src/gc-cpu-runner/CMakeLists.txt b/src/gc-cpu-runner/CMakeLists.txt index 85dbb6995..2599eef84 100644 --- a/src/gc-cpu-runner/CMakeLists.txt +++ b/src/gc-cpu-runner/CMakeLists.txt @@ -1,3 +1,20 @@ +################################################################################ +# Copyright (C) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions +# and limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +################################################################################ + if(GC_DEV_LINK_LLVM_DYLIB) set(LLVM_LINK_COMPONENTS LLVM From f1fd0ae69fff9b17c8d7bf354e3e160a5d3be0aa Mon Sep 17 00:00:00 2001 From: "Mei, Yijie" Date: Thu, 23 May 2024 16:51:25 +0800 Subject: [PATCH 14/19] fix --- lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp b/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp index 6efb38142..d81e8d3a9 100644 --- a/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp +++ b/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp @@ -1,4 +1,4 @@ -//===- Parallel.cpp - Definitions for parallel runtime -----------*- C++ -*-=// +//===-- Parallel.cpp - Definitions for parallel runtime -----------*- C++ -*-=// // // This file is licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. From a773ea6bcbcfebc1370aa78552a703278ee502da Mon Sep 17 00:00:00 2001 From: "Mei, Yijie" Date: Thu, 23 May 2024 16:52:29 +0800 Subject: [PATCH 15/19] f --- lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp b/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp index d81e8d3a9..ea7641417 100644 --- a/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp +++ b/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp @@ -1,9 +1,9 @@ -//===-- Parallel.cpp - Definitions for parallel runtime -----------*- C++ -*-=// -// +//===-- Parallel.cpp - parallel ---------------------------------*- C++ -*-===// +// // This file is licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// +// //===----------------------------------------------------------------------===// #include From 84933c21b7f2c73361953166add0e1143238906d Mon Sep 17 00:00:00 2001 From: "Mei, Yijie" Date: Thu, 23 May 2024 16:57:14 +0800 Subject: [PATCH 16/19] fix --- lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp b/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp index ea7641417..5591dc3af 100644 --- a/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp +++ b/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp @@ -1,9 +1,9 @@ //===-- Parallel.cpp - parallel ---------------------------------*- C++ -*-===// -// +// // This file is licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// +// //===----------------------------------------------------------------------===// #include From 4cca4dfe7bbcf9dbe4ddab9ff66c0c4ff59ffa86 Mon Sep 17 00:00:00 2001 From: "Mei, Yijie" Date: Thu, 23 May 2024 17:11:32 +0800 Subject: [PATCH 17/19] add reference --- lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp b/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp index 5591dc3af..3a5b4c2c1 100644 --- a/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp +++ b/lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp @@ -74,6 +74,8 @@ int WEAK_SYMBOL __kmpc_global_thread_num(void *loc) { return omp_get_thread_num(); } +// The implementation was extracted and simplified from LLVM libomp +// at openmp/runtime/src/kmp_sched.cpp void WEAK_SYMBOL __kmpc_for_static_init_8u(void *loc, int32_t gtid, int32_t schedtype, int32_t *plastiter, uint64_t *plower, From 1e06c9844c7753400c6efc254898475c52b2c38b Mon Sep 17 00:00:00 2001 From: "Mei, Yijie" Date: Tue, 28 May 2024 10:42:03 +0800 Subject: [PATCH 18/19] fix license.py --- scripts/license.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/license.py b/scripts/license.py index 49f28eaa8..3ed2ce521 100644 --- a/scripts/license.py +++ b/scripts/license.py @@ -15,10 +15,10 @@ # SPDX-License-Identifier: Apache-2.0 import datetime, sys, re, argparse -from typing import Dict, Set +from typing import Dict, Set, List WIDTH: int = 80 -intel_license: list[str] = [ +intel_license: List[str] = [ 'Copyright \\(C\\) (\\d\\d\\d\\d-)?$YEAR Intel Corporation', '', 'Licensed under the Apache License, Version 2.0 (the "License");', @@ -35,7 +35,7 @@ 'SPDX-License-Identifier: Apache-2.0', ] -llvm_license: list[str] = [ +llvm_license: List[str] = [ "===-{1,2} $FILE - .* -*\\*- $LANG -\\*-===", '', 'This file is licensed under the Apache License v2.0 with LLVM Exceptions.', @@ -45,7 +45,7 @@ "===-*===", ] -def check_license(filepath: str, license: list[str], var: Dict[str, str], re_line: Set[int]): +def check_license(filepath: str, license: List[str], var: Dict[str, str], re_line: Set[int]): with open(filepath, 'r') as f: idx: int = 0 for line in f.readlines(): @@ -117,7 +117,7 @@ def use_llvm_license(path: str) -> bool: var: Dict[str, str] = {} re_line: Set[int] = set() - lic = list[str] + lic = List[str] if filepath.startswith("test/") or filepath.startswith("./test/"): continue From 381677a27f0c796dd94e208a4c2f1378994b1f59 Mon Sep 17 00:00:00 2001 From: "Mei, Yijie" Date: Tue, 28 May 2024 14:32:18 +0800 Subject: [PATCH 19/19] fix comments --- lib/gc/Transforms/Pipeline.cpp | 64 ++++++++++++++-------------------- 1 file changed, 26 insertions(+), 38 deletions(-) diff --git a/lib/gc/Transforms/Pipeline.cpp b/lib/gc/Transforms/Pipeline.cpp index b336684a1..81fae6877 100644 --- a/lib/gc/Transforms/Pipeline.cpp +++ b/lib/gc/Transforms/Pipeline.cpp @@ -27,40 +27,35 @@ namespace mlir::gc { +// linalg + linalgX + tensor void populateFrontendPasses(mlir::PassManager &pm) { // pm.addPass(onednn_graph::createConvertOneDNNGraphToLinalg()); } -// linalg + linalgX + tensor ==> GC V1 GIR +// scf + arith + math + vector + tensor + linalg.brgemm + tensor.pack/unpack void populateTensorPasses(mlir::PassManager &pm) { - // + padding propagation pass, upstream-able 127x127 -> tilling size:32 - // ->padding to 128x128 - // + layout propagation pass, upstream-able 4x32x4x32 -> - // tensor.pack/tensor.unpack - // + tensor constant propagation pass, down-stream pass, designed to support - // oneDNN graph spec - // + linalg.matmul lowering to (scf.loop + linalg.brgemm) pass, upstream-able - // + fine-grain fusion pass, upstream-able -> scf.for + linalgx.mask - // + lower linalg to arith/math on virtual vector pass, up-streamable + // todo: padding propagation pass + // todo: layout propagation pass + // todo: tensor constant propagation pass + // todo: linalg.matmul lowering to (scf.loop + linalg.brgemm) pass + // todo: fine-grain fusion pass + // todo: lower linalg to arith/math on virtual vector pass // REMOVE this pass after the above passes are added. Currently we add this // pass to make the pipeline work properly pm.addNestedPass(createLinalgGeneralizeNamedOpsPass()); } -// scf + arith + math + vector + tensor + linalg.brgemm + tensor.pack/unpack ==> -// GC V1 TIR +// scf + arith + math + vector + tensor + linalg.brgemm void populateVectorPasses(mlir::PassManager &pm) { - // + bf16 promotion pass, down-stream pass, device dependent pass, maybe can - // upstream - // + bf16 cast elimilation pass, down-stream pass, fast-math kind pass, - // designed to support oneDNN graph spec + // todo: bf16 promotion pass, device dependent pass + // todo: bf16 cast elimilation pass, fast-math kind pass, designed to support + // oneDNN graph spec pm.addNestedPass(arith::createArithExpandOpsPass()); - // + lower to physical vector pass, down-stream pass, device dependent pass, - // maybe can upstream + // todo: lower to physical vector pass, device dependent pass } -// scf + arith + math + vector + tensor + linalg.brgemm +// scf + arith + math + vector + memref + linalg.brgemm void populateBufferizationPasses(mlir::PassManager &pm) { bufferization::OneShotBufferizationOptions options; pm.addPass(bufferization::createOneShotBufferizePass(options)); @@ -73,34 +68,27 @@ void populateBufferizationPasses(mlir::PassManager &pm) { bufferization::BufferResultsToOutParamsOpts opt{}; opt.hoistStaticAllocs = true; pm.addPass(bufferization::createBufferResultsToOutParamsPass(opt)); - // + buffer schedule pass, down-stream pass, to migrate buffer reschedule pass - // from GC V1. - pm.addNestedPass( - bufferization::createBufferHoistingPass()); // Need to improve this pass - // to support thread-local - // allocator. + // todo: buffer schedule pass + // todo: Need to improve this pass to support nested parallel. + pm.addNestedPass(bufferization::createBufferHoistingPass()); pm.addNestedPass(bufferization::createBufferLoopHoistingPass()); pm.addNestedPass(bufferization::createBufferDeallocationPass()); pm.addPass(createBufferizationToMemRefPass()); } -// scf + arith + math + vector + memref + linalg.brgemm +// scf + arith + math + vector + memref + func/microkernel void populateMicroKernelPasses(mlir::PassManager &pm) { - // + ConvertLinalgToMicrokernel pass, upstream-able, - // + CleanupInvalidMicrokernel pass, upstream-able - // + InvariantMicrokernelMotion pass, upstream-able - // + ConvertMicrokernelToDnnlFunc, down-stream pass, to lower brgemm to dnnl - // call - // + ConvertMicrokernelToXsmm, down-stream pass, to lower brgemm to libxsmm - // call - // + LowerMicrokernel pass, upstream-able - // + DispatchMicrokernel, down-stream pass + // todo: ConvertLinalgToMicrokernel pass + // todo: CleanupInvalidMicrokernel pass + // todo: InvariantMicrokernelMotion pass + // todo: ConvertMicrokernelToDnnlFunc to lower brgemm to dnnl call + // todo: ConvertMicrokernelToXsmm, to lower brgemm to libxsmm call + // todo: LowerMicrokernel pass + // todo: DispatchMicrokernel } -// scf + arith + math + vector + memref + func/microkernel void populateCPURuntimePasses(mlir::PassManager &pm) { - // + flatten nested parallel pass, down-stream pass, to support coarse-grain - // fusion + // todo: flatten nested parallel pass to support coarse-grain usion // remove this pass after we add FlattenNestedParallel pm.addPass(createConvertSCFToOpenMPPass()); }