-
Notifications
You must be signed in to change notification settings - Fork 16
Closed
Description
Currently, deep tiled matmul reads DLTI info to determine the matmul config and performs tiling accordingly. The current MLP driver only generates the MLIR MLP graph and neglects the DLTI information, which leads deep tiled matmul to produce a sub-optimal result.
Please help detect and add DLTI info according to the runtime environment, e.g.:
// Example: module-level DLTI attributes describing the target CPU, which
// deep tiled matmul reads to pick its tiling configuration (see issue text).
module attributes {
dlti.target_system_spec = #dlti.target_system_spec<
// Device spec keyed by "CPU": per-level cache sizes, thread count, and
// vector width used to derive matmul tile sizes.
"CPU": #dlti.target_device_spec<
#dlti.dl_entry<"L1_cache_size_in_bytes", 49152 : ui32>,
#dlti.dl_entry<"L2_cache_size_in_bytes", 2097152 : ui64>,
// NOTE(review): L3 size is a quoted string while L1/L2 are typed
// integers — confirm whether consumers accept both forms.
#dlti.dl_entry<"L3_cache_size_in_bytes", "110100480">,
#dlti.dl_entry<"num_threads", 56 : i32>,
// presumably the vector width is in bits (512 = AVX-512) — TODO confirm
#dlti.dl_entry<"max_vector_width", 512 : i64>>
>} {
// MLP entry point: matmul -> broadcast bias add -> ReLU (max with 0).
func.func @main_entry(%arg0: tensor<128x256xf32>, %arg1: tensor<256x128xf32>, %arg2: tensor<128xf32>) -> tensor<128x128xf32> attributes {llvm.emit_c_interface} {
%0 = tensor.empty() : tensor<128x128xf32>
// 128x256 * 256x128 matmul producing the 128x128 activation input.
%1 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%arg0, %arg1 : tensor<128x256xf32>, tensor<256x128xf32>) outs(%0 : tensor<128x128xf32>) -> tensor<128x128xf32>
%2 = tensor.empty() : tensor<128x128xf32>
// Broadcast the 1-D bias %arg2 along dimension 0 to a 128x128 tensor.
%broadcasted = linalg.broadcast ins(%arg2 : tensor<128xf32>) outs(%2 : tensor<128x128xf32>) dimensions = [0]
%3 = tensor.empty() : tensor<128x128xf32>
// Elementwise bias add.
%4 = linalg.add ins(%1, %broadcasted : tensor<128x128xf32>, tensor<128x128xf32>) outs(%3 : tensor<128x128xf32>) -> tensor<128x128xf32>
%cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32>
%5 = tensor.empty() : tensor<128x128xf32>
// ReLU implemented as elementwise max with the zero constant.
%6 = linalg.max ins(%4, %cst : tensor<128x128xf32>, tensor<128x128xf32>) outs(%5 : tensor<128x128xf32>) -> tensor<128x128xf32>
return %6 : tensor<128x128xf32>
}
}