@@ -58,7 +58,8 @@ getCandidate(uint32_t num, uint32_t floor,
5858 int defaultBlock = 32 ;
5959 // factor
6060 std::vector<uint32_t > candidates;
61- uint32_t upperbound = std::min (llvm::divideCeil (num, defaultBlock), ceil);
61+ uint32_t upperbound =
62+ std::min (llvm::divideCeil (num, defaultBlock) * defaultBlock, ceil);
6263 for (uint32_t i = floor; i <= upperbound; i++)
6364 if (num % i == 0 )
6465 candidates.push_back (i);
@@ -200,6 +201,29 @@ double dynamicBufferizationCost(linalg::LinalgOp &linalgOp,
200201 return cost;
201202}
202203
204+ double paddingCost (linalg::LinalgOp &linalgOp, ArrayRef<uint32_t > shape,
205+ const MatmulConfig &config,
206+ CPUTargetDescriptionAnalysis &sysDesc) {
207+ double cost = 0 ;
208+ uint32_t M = shape[0 ], N = shape[1 ], K = shape[2 ];
209+ bool isPadOnM = M % config.innerMostMBlock != 0 ,
210+ isPadOnK = K % config.innerMostKBlock != 0 ,
211+ isPadOnN = N % config.innerMostNBlock != 0 ;
212+ if (isPadOnM || isPadOnK) {
213+ cost += llvm::divideCeil (M, config.innerMostMBlock ) *
214+ llvm::divideCeil (K, config.innerMostKBlock );
215+ }
216+ if (isPadOnK || isPadOnN) {
217+ cost += llvm::divideCeil (N, config.innerMostNBlock ) *
218+ llvm::divideCeil (K, config.innerMostKBlock );
219+ }
220+ if (isPadOnM || isPadOnN) {
221+ cost += llvm::divideCeil (N, config.innerMostNBlock ) *
222+ llvm::divideCeil (M, config.innerMostMBlock );
223+ }
224+ return cost;
225+ }
226+
203227using CostModelFn = std::function<double (
204228 linalg::LinalgOp &linalgOp, ArrayRef<uint32_t > shape, MatmulConfig cfg,
205229 CPUTargetDescriptionAnalysis &sysDesc)>;
@@ -474,14 +498,17 @@ MatmulConfig MatmulConfigAnalysis::getConfig() {
474498 {computationIntensityOnL2Cache, " computationIntensityOnL2Cache" ,
475499 -1 },
476500 {memoryConsumptionOnThreadCost, " memoryConsumptionOnThreadCost" ,
477- -1 }};
501+ -1 },
502+ {paddingCost, " paddingCost" , -1 }};
478503 SmallVector<uint32_t > shape = {M, N, K};
479504 std::vector<MatmulConfig> configCandidates =
480505 prepareConfigCandidates (root, sysDesc, shape, givenInnermostBlock,
481506 allowIndivisibleInnerBlock);
482- for (auto &&[fn, name, threshold] : costModelList)
507+ for (auto &&[fn, name, threshold] : costModelList) {
508+ LLVM_DEBUG (llvm::dbgs () << name << " \n " );
483509 configCandidates = filterConfigByCostModel (
484510 configCandidates, linalgOp, shape, sysDesc, fn, 0.5 , threshold);
511+ }
485512 if (!configCandidates.empty ())
486513 config = configCandidates[0 ];
487514 }
0 commit comments