87 changes: 57 additions & 30 deletions include/gc/Analysis/MatmulConfigAnalysis.h
@@ -47,15 +47,67 @@ inline SmallVector<unsigned> extractDimTypeIdx(ArrayRef<DimType> tyList,
return idxList;
}

// Classify every dimension of the op's iteration space: a parallel dim that
// appears in all three operands is Batch, a parallel dim that appears in the
// LHS is M, any other parallel dim is N, and reduction dims are K.
inline void getDimTypeFromIterators(linalg::LinalgOp linalgOp,
SmallVectorImpl<DimType> &dimTypes) {
SmallVector<mlir::utils::IteratorType> iteratorTypes =
linalgOp.getIteratorTypesArray();

for (const auto &&[idx, iterType] : llvm::enumerate(iteratorTypes)) {
if (iterType == mlir::utils::IteratorType::parallel) {
SmallVector<std::pair<Value, unsigned>> operandDimPairs;
linalgOp.mapIterationSpaceDimToAllOperandDims(idx, operandDimPairs);
if (operandDimPairs.size() == 3) {
dimTypes.push_back(DimType::Batch);
} else if (llvm::any_of(operandDimPairs,
[&](std::pair<Value, unsigned> pair) {
return pair.first ==
dyn_cast<linalg::ContractionOpInterface>(
linalgOp.getOperation())
.lhs();
})) {
dimTypes.push_back(DimType::M);
} else {
dimTypes.push_back(DimType::N);
}
} else if (iterType == mlir::utils::IteratorType::reduction) {
dimTypes.push_back(DimType::K);
}
}
}
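As an illustration (an editor-added sketch, not part of the patch), this is how the classification above would play out for a plain linalg.batch_matmul with indexing maps (b, m, k), (b, k, n) -> (b, m, n):

// Hypothetical trace for linalg.batch_matmul:
//   d0 = b: parallel, appears in all three operands -> DimType::Batch
//   d1 = m: parallel, appears in the LHS            -> DimType::M
//   d2 = n: parallel, not in the LHS                -> DimType::N
//   d3 = k: reduction                               -> DimType::K
// dimTypes ends up as {Batch, M, N, K}, indexed by iteration dim.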

// Map each result of `operand`'s indexing map back to the iteration-space dim
// types computed above. Assumes a projected-permutation map, i.e. every
// result is an AffineDimExpr.
inline SmallVector<DimType>
matchOperandToDimTypes(linalg::LinalgOp linalgOp, OpOperand *operand,
ArrayRef<DimType> allDimTypes) {
ArrayRef<AffineExpr> map =
linalgOp.getMatchingIndexingMap(operand).getResults();
SmallVector<DimType> res;
for (const AffineExpr &dim : map) {
AffineDimExpr dimExpr = dyn_cast<AffineDimExpr>(dim);
res.push_back(allDimTypes[dimExpr.getPosition()]);
}
return res;
}

// Compute the per-operand dim types (A, B, C) of a contraction op from its
// iterator types and indexing maps.
inline SmallVector<SmallVector<DimType>>
getContractionOpOperandDimType(linalg::LinalgOp linalgOp) {
SmallVector<DimType> dimTypes;
getDimTypeFromIterators(linalgOp, dimTypes);
SmallVector<DimType> ADimTypes = matchOperandToDimTypes(
linalgOp, linalgOp.getDpsInputOperand(0), dimTypes);
SmallVector<DimType> BDimTypes = matchOperandToDimTypes(
linalgOp, linalgOp.getDpsInputOperand(1), dimTypes);
SmallVector<DimType> CDimTypes =
matchOperandToDimTypes(linalgOp, linalgOp.getDpsInitOperand(0), dimTypes);

return SmallVector<SmallVector<DimType>>{ADimTypes, BDimTypes, CDimTypes};
}
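For example (again an illustrative sketch, not part of the diff), on linalg.batch_matmul the iteration dims classify as {Batch, M, N, K}, and projecting them through the three indexing maps reproduces the per-operand tables that the removed branches below used to hard-code:

//   A with map (b, m, k) -> {Batch, M, K}
//   B with map (b, k, n) -> {Batch, K, N}
//   C with map (b, m, n) -> {Batch, M, N}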

// Get the dim type of each dimension of every operand for the given linalg op
inline FailureOr<SmallVector<SmallVector<DimType>>>
getOprandDimType(linalg::LinalgOp &linalgOp) {
// TODO: replace the linalgx op with generic op
if (llvm::isa<linalg::MatmulOp>(linalgOp)) {
return SmallVector<SmallVector<DimType>>{
SmallVector<DimType>{DimType::M, DimType::K},
SmallVector<DimType>{DimType::K, DimType::N},
SmallVector<DimType>{DimType::M, DimType::N}};
if (llvm::isa<linalg::ContractionOpInterface>(linalgOp.getOperation())) {
return getContractionOpOperandDimType(linalgOp);
} else if (linalgx::isGenericPackedMatmulOp(
linalgOp.getOperation(), linalgx::PackingType::VNNI_MM2D) ||
llvm::isa<linalgx::Mm2DVnniOp>(linalgOp)) {
@@ -72,31 +72,6 @@ getOprandDimType(linalg::LinalgOp &linalgOp) {
SmallVector<DimType>{DimType::N, DimType::K, DimType::K, DimType::N,
DimType::K},
SmallVector<DimType>{DimType::M, DimType::N, DimType::M, DimType::N}};
} else if (llvm::isa<linalg::BatchMatmulOp>(linalgOp)) {
return SmallVector<SmallVector<DimType>>{
SmallVector<DimType>{DimType::Batch, DimType::M, DimType::K},
SmallVector<DimType>{DimType::Batch, DimType::K, DimType::N},
SmallVector<DimType>{DimType::Batch, DimType::M, DimType::N}};
} else if (llvm::isa<linalg::MatmulTransposeAOp>(linalgOp)) {
return SmallVector<SmallVector<DimType>>{
SmallVector<DimType>{DimType::K, DimType::M},
SmallVector<DimType>{DimType::K, DimType::N},
SmallVector<DimType>{DimType::M, DimType::N}};
} else if (llvm::isa<linalg::MatmulTransposeBOp>(linalgOp)) {
return SmallVector<SmallVector<DimType>>{
SmallVector<DimType>{DimType::M, DimType::K},
SmallVector<DimType>{DimType::N, DimType::K},
SmallVector<DimType>{DimType::M, DimType::N}};
} else if (llvm::isa<linalg::BatchMatmulTransposeAOp>(linalgOp)) {
return SmallVector<SmallVector<DimType>>{
SmallVector<DimType>{DimType::Batch, DimType::K, DimType::M},
SmallVector<DimType>{DimType::Batch, DimType::K, DimType::N},
SmallVector<DimType>{DimType::Batch, DimType::M, DimType::N}};
} else if (llvm::isa<linalg::BatchMatmulTransposeBOp>(linalgOp)) {
return SmallVector<SmallVector<DimType>>{
SmallVector<DimType>{DimType::Batch, DimType::M, DimType::K},
SmallVector<DimType>{DimType::Batch, DimType::N, DimType::K},
SmallVector<DimType>{DimType::Batch, DimType::M, DimType::N}};
} else if (linalgx::isGenericPackedMatmulOp(linalgOp.getOperation(),
linalgx::PackingType::MM4D)) {
return SmallVector<SmallVector<DimType>>{
43 changes: 40 additions & 3 deletions lib/gc/Analysis/MatmulConfigAnalysis.cpp
@@ -55,9 +55,11 @@ bool validateConfig(const MatmulConfig &cfg) {
std::vector<uint32_t>
getCandidate(uint32_t num, uint32_t floor,
uint32_t ceil = std::numeric_limits<uint32_t>::max()) {
int defaultBlock = 32;
// factor
std::vector<uint32_t> candidates;
uint32_t upperbound = std::min(num, ceil);
uint32_t upperbound =
std::min(llvm::divideCeil(num, defaultBlock) * defaultBlock, ceil);
for (uint32_t i = floor; i <= upperbound; i++)
if (num % i == 0)
candidates.push_back(i);
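A quick illustration of the new bound (illustrative numbers, not from the patch): with num = 1000 and no explicit ceil, the upper bound is rounded up to the next multiple of defaultBlock before clamping:

//   old: upperbound = min(1000, UINT32_MAX)                      = 1000
//   new: upperbound = min(divideCeil(1000, 32) * 32, UINT32_MAX)
//                   = min(32 * 32, UINT32_MAX)                   = 1024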
@@ -199,6 +201,29 @@ double dynamicBufferizationCost(linalg::LinalgOp &linalgOp,
return cost;
}

// Estimate a relative padding cost: whenever a matmul dim is not evenly
// divisible by its innermost block, every operand touching that dim is
// charged its total number of innermost blocks, approximating the extra
// pad/copy traffic.
double paddingCost(linalg::LinalgOp &linalgOp, ArrayRef<uint32_t> shape,
const MatmulConfig &config,
CPUTargetDescriptionAnalysis &sysDesc) {
double cost = 0;
uint32_t M = shape[0], N = shape[1], K = shape[2];
bool isPadOnM = M % config.innerMostMBlock != 0,
isPadOnK = K % config.innerMostKBlock != 0,
isPadOnN = N % config.innerMostNBlock != 0;
if (isPadOnM || isPadOnK) {
cost += llvm::divideCeil(M, config.innerMostMBlock) *
llvm::divideCeil(K, config.innerMostKBlock);
}
if (isPadOnK || isPadOnN) {
cost += llvm::divideCeil(N, config.innerMostNBlock) *
llvm::divideCeil(K, config.innerMostKBlock);
}
if (isPadOnM || isPadOnN) {
cost += llvm::divideCeil(N, config.innerMostNBlock) *
llvm::divideCeil(M, config.innerMostMBlock);
}
return cost;
}
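A small worked example (numbers chosen for illustration only): with M = 64, N = 64, K = 100 and innermost blocks of 32 for all three dims, only K is indivisible, so the A and B terms are charged while the C term is not:

//   isPadOnM = false, isPadOnN = false, isPadOnK = true (100 % 32 != 0)
//   A term: divideCeil(64, 32) * divideCeil(100, 32) = 2 * 4 = 8
//   B term: divideCeil(64, 32) * divideCeil(100, 32) = 2 * 4 = 8
//   C term: skipped (isPadOnM || isPadOnN is false)
//   paddingCost = 16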

using CostModelFn = std::function<double(
linalg::LinalgOp &linalgOp, ArrayRef<uint32_t> shape, MatmulConfig cfg,
CPUTargetDescriptionAnalysis &sysDesc)>;
@@ -243,6 +268,8 @@ prepareConfigCandidates(Operation *root, CPUTargetDescriptionAnalysis &sysDesc,
ArrayRef<uint32_t> shape,
ArrayRef<uint32_t> givenInnermostBlock,
bool allowIndivisibleInnerblock = false) {
LLVM_DEBUG(llvm::dbgs() << "allowIndivisibleInnerblock: "
<< allowIndivisibleInnerblock << "\n");
assert(shape.size() >= 3 && "shape.size() should >= 3");
std::vector<MatmulConfig> configs;
uint32_t threads = sysDesc.getNumThreads();
Expand Down Expand Up @@ -278,6 +305,13 @@ prepareConfigCandidates(Operation *root, CPUTargetDescriptionAnalysis &sysDesc,
: getCandidate((uint32_t)shape[2],
shape[2] >= noSmallBlockNeedThreshold ? 8U : 1U, 256U);

if (allowIndivisibleInnerblock) {
innerMostKBlockCandidates = {16, 32, 64};
innerMostNBlockCandidates = {16, 32, 64};
NBlockCandidates = innerMostNBlockCandidates;
KBlockCandidates = innerMostKBlockCandidates;
[Inline review thread on the candidate override above]
Contributor: So after the change here, the innermost KBlock will only be one of 16/32/64 if allowIndivisibleInnerblock is true?
Contributor (Author): Yes, your understanding is correct.
}

// TODO: improve via multi threading or add more constraints to restrict the
// candidate size
for (uint32_t MThreads : MThreadsCandidates) {
Expand Down Expand Up @@ -464,14 +498,17 @@ MatmulConfig MatmulConfigAnalysis::getConfig() {
{computationIntensityOnL2Cache, "computationIntensityOnL2Cache",
-1},
{memoryConsumptionOnThreadCost, "memoryConsumptionOnThreadCost",
-1}};
-1},
{paddingCost, "paddingCost", -1}};
SmallVector<uint32_t> shape = {M, N, K};
std::vector<MatmulConfig> configCandidates =
prepareConfigCandidates(root, sysDesc, shape, givenInnermostBlock,
allowIndivisibleInnerBlock);
for (auto &&[fn, name, threshold] : costModelList)
for (auto &&[fn, name, threshold] : costModelList) {
LLVM_DEBUG(llvm::dbgs() << name << "\n");
configCandidates = filterConfigByCostModel(
configCandidates, linalgOp, shape, sysDesc, fn, 0.5, threshold);
}
if (!configCandidates.empty())
config = configCandidates[0];
}