diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 505062ecd..355aba91f 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -49,6 +49,8 @@ declare_mlir_python_sources(GcPythonSources.Common
     __init__.py
     graph_compiler.py
     dialects/__init__.py
+    tools/__init__.py
+    tools/cpuinfo.py
     # init hooks
     _mlir_libs/_site_initialize_0.py
 )
@@ -86,6 +88,13 @@ declare_mlir_python_extension(GcPythonSources.Extension
     GcCAPI
 )
 
+declare_mlir_python_extension(GcPythonSources.CpuInfoExtension
+  MODULE_NAME _cpuinfo
+  ADD_TO_PARENT GcPythonSources
+  SOURCES
+    CPUInfo.cpp
+)
+
 ################################################################################
 # Common CAPI
 ################################################################################
diff --git a/python/CPUInfo.cpp b/python/CPUInfo.cpp
new file mode 100644
index 000000000..98cecc9e1
--- /dev/null
+++ b/python/CPUInfo.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "mlir/Bindings/Python/PybindAdaptors.h"
+
+#include <vector>
+
+#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) ||            \
+    defined(_M_IX86)
+// x86 or x86_64 specific code
+void cpuid(int info[4], int leaf, int subleaf) {
+  __asm__ __volatile__("cpuid"
+                       : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]),
+                         "=d"(info[3])
+                       : "a"(leaf), "c"(subleaf));
+}
+
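+// CPUID leaf 4 reports one cache per subleaf: EAX[4:0] is the cache type
+// (0 = no more caches, 2 = instruction cache), EAX[7:5] is the cache level,
+// and the size in bytes is ways * partitions * line size * sets, where the
+// first three fields are stored minus one in EBX bit fields and the set
+// count minus one in ECX. Illustrative example: a 48 KiB L1 data cache
+// typically reports ways = 12, partitions = 1, line size = 64, sets = 64,
+// giving 12 * 1 * 64 * 64 = 49152 bytes.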
+std::vector<int> getCacheSizes() {
+  int info[4];
+  cpuid(info, 0, 0);
+  int nIds = info[0]; // max supported leaf; safe upper bound for subleaves
+  int caches[3] = {};
+  for (int i = 0; i <= nIds; ++i) {
+    cpuid(info, 4, i);
+    int cacheType = info[0] & 0x1F;
+    if (cacheType == 0) {
+      break;
+    }
+    if (cacheType == 2) {
+      // skip instruction cache
+      continue;
+    }
+    int cacheLevel = (info[0] >> 5) & 0x7;
+    int cacheLineSize = ((info[1] >> 0) & 0xFFF) + 1;
+    int cachePartitions = ((info[1] >> 12) & 0x3FF) + 1;
+    int cacheWays = ((info[1] >> 22) & 0x3FF) + 1;
+    int cacheSets = info[2] + 1;
+    int cacheSize = cacheLineSize * cachePartitions * cacheWays * cacheSets;
+    if (cacheLevel >= 1 && cacheLevel <= 3) {
+      caches[cacheLevel - 1] = cacheSize;
+    }
+  }
+  return std::vector<int>(std::begin(caches), std::end(caches));
+}
+
+bool isFeatureSupported(int function_id, int register_idx, int bit) {
+  int info[4];
+  cpuid(info, function_id, 0);
+  return (info[register_idx] & (1 << bit)) != 0;
+}
+
+int getMaxVectorWidth() {
+  if (isFeatureSupported(7, 1, 16)) { // Check for AVX-512F support
+    return 512;
+  } else if (isFeatureSupported(1, 2, 28)) { // Check for AVX support
+    return 256;
+  } else if (isFeatureSupported(1, 3, 25)) { // Check for SSE support
+    return 128;
+  }
+  return 64; // Default to 64 if none of the above features are supported
+}
+#else
+std::vector<int> getCacheSizes() { return {}; }
+
+int getMaxVectorWidth() { return 0; }
+#endif
+
+PYBIND11_MODULE(_cpuinfo, m) {
+  m.doc() = "Graph-compiler CPU info Python binding";
+  m.def("get_cache_sizes", &getCacheSizes, "Get CPU L1, L2, L3 cache sizes");
+  m.def("get_max_vector_width", &getMaxVectorWidth,
+        "Get CPU supported max vector width");
+}
\ No newline at end of file
diff --git a/python/gc_mlir/tools/__init__.py b/python/gc_mlir/tools/__init__.py
new file mode 100644
index 000000000..172887970
--- /dev/null
+++ b/python/gc_mlir/tools/__init__.py
@@ -0,0 +1,7 @@
+# ===-- __init__.py - init ------------------------------------*- Python -*-===#
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# ===-----------------------------------------------------------------------===#
diff --git a/python/gc_mlir/tools/cpuinfo.py b/python/gc_mlir/tools/cpuinfo.py
new file mode 100644
index 000000000..7833ece68
--- /dev/null
+++ b/python/gc_mlir/tools/cpuinfo.py
@@ -0,0 +1,26 @@
+# ===-- cpuinfo.py - Getting the CPU info ---------------------*- Python -*-===#
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# ===-----------------------------------------------------------------------===#
+
+from .._mlir_libs import _cpuinfo
+
+# cache the results so the native extension is only queried once
+_cache_sizes = []
+_max_vector_width = None
+
+
+def get_cache_sizes():
+    global _cache_sizes
+    if not _cache_sizes:
+        _cache_sizes = _cpuinfo.get_cache_sizes()
+    return _cache_sizes
+
+
+def get_max_vector_width():
+    global _max_vector_width
+    if _max_vector_width is None:
+        _max_vector_width = _cpuinfo.get_max_vector_width()
+    return _max_vector_width
diff --git a/test/benchgc/README.md b/test/benchgc/README.md
index 9f18cc398..239105c82 100644
--- a/test/benchgc/README.md
+++ b/test/benchgc/README.md
@@ -8,6 +8,8 @@ Benchgc is a tool used to verify the correctness and performance of graph compil
 * python >= 3.10
 * torch >= 2.2
 * Enable mlir python binding, Refer to [`python/README.md`](../../python/README.md) for detail
+* Set the environment variables
+  * OMP_NUM_THREADS [int]: the `num_threads` for the dlti attr, default = 1
 
 ## Build
 There are two ways for using benchgc
@@ -107,6 +109,12 @@
 | Pytorch tensor dump | F | dump filename |
 | Benchdnn driver | D | driver_name[:driver filling parameter]* |
 
+### --cpu_cache_sizes, --max_vector_width
+* BenchGC automatically obtains the target info and adds the DLTI attr to the IR.
+* If the system info obtained by BenchGC is not accurate, you can override the relevant attributes through these options, as in the example after this list.
+* --cpu_cache_sizes: CPU cache sizes in bytes, format: L1:L2:L3, example: `--cpu_cache_sizes 49152:2097152:110100480`
+* --max_vector_width: the maximum width of the vector registers available on the CPU, example: `--max_vector_width 512`
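+
+A typical invocation overriding both values might look like the following (the cache sizes here are illustrative, not recommendations, and `...` stands for the rest of the case definition flags):
+
+```shell
+OMP_NUM_THREADS=8 python3 -m benchgc --driver linalg --case matmul ... \
+    --cpu_cache_sizes 49152:2097152:110100480 --max_vector_width 512
+```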
+
 #### Benchdnn driver filling
 
 | driver_name | driver filling parameter |
diff --git a/test/benchgc/setup.py b/test/benchgc/setup.py
index 3d67af539..a9b49e6f0 100644
--- a/test/benchgc/setup.py
+++ b/test/benchgc/setup.py
@@ -26,5 +26,5 @@
     packages=setuptools.find_packages("src")
     + setuptools.find_namespace_packages("../../python_packages/gc_mlir_core"),
     package_data={"gc_mlir": ["_mlir_libs/*.so"]},
-    install_requires=["torch", "numpy", "ml_dtypes"],
+    install_requires=["torch", "numpy"],
 )
diff --git a/test/benchgc/src/benchgc/__main__.py b/test/benchgc/src/benchgc/__main__.py
index 1f6d7261e..d630d5eba 100644
--- a/test/benchgc/src/benchgc/__main__.py
+++ b/test/benchgc/src/benchgc/__main__.py
@@ -124,6 +124,20 @@ def add_common_options(parser: argparse.ArgumentParser):
         help="if we need print the ir during the pass-pipeline",
     )
 
+    parser.add_argument(
+        "--cpu_cache_sizes",
+        required=False,
+        help="set the cpu cache sizes, format: L1:L2:L3",
+        type=str,
+    )
+
+    parser.add_argument(
+        "--max_vector_width",
+        required=False,
+        help="set the cpu max_vector_width",
+        type=int,
+    )
+
     if parser.parse_known_args()[0].driver == "linalg":
         parser.add_argument(
             "--cast",
@@ -269,6 +283,8 @@ def get_module_and_args(flags: argparse.Namespace):
     for arg in args:
         arg.print_verbose(flags.verbose)
 
+    benchgc.mlir.util.attach_dlti(flags, module)
+
     if flags.verbose >= benchgc.util.MODULE_VERBOSE:
         print(module)
     return module, args
diff --git a/test/benchgc/src/benchgc/mlir/util.py b/test/benchgc/src/benchgc/mlir/util.py
index 7161a516b..532b899fc 100644
--- a/test/benchgc/src/benchgc/mlir/util.py
+++ b/test/benchgc/src/benchgc/mlir/util.py
@@ -14,12 +14,15 @@
 # limitations under the License.
 ################################################################################
 
+import argparse
 import ctypes
+import os
 from typing import Any, List
 
 import torch
 from gc_mlir import ir
 from gc_mlir.dialects import arith, func, memref
+from gc_mlir.tools import cpuinfo
 
 # calling python binding consumes a lot of time e.g.
 # get_name()
@@ -152,3 +155,49 @@ def get_kernel_func_from_module(
         if type(f) is func.FuncOp and str(f.name).strip('"') == func_name:
             return f
     raise ValueError("can not find the entry function")
+
+
+def attach_dlti(flags: argparse.Namespace, module: ir.Module):
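+    """Attach a dlti.target_system_spec attribute to the module.
+
+    The attribute describes the target CPU (L1/L2/L3 cache sizes, number of
+    threads, max vector width). Values come from the command-line flags when
+    given, otherwise from the gc_mlir.tools.cpuinfo probes; a module that
+    already carries the attribute is left untouched.
+    """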
+    # skip if the module already has a dlti attr
+    if "dlti.target_system_spec" in module.operation.attributes:
+        return
+    if flags.cpu_cache_sizes:
+        cache_sizes = [int(x) for x in flags.cpu_cache_sizes.strip().split(":")]
+    else:
+        cache_sizes = cpuinfo.get_cache_sizes()
+    if not cache_sizes or len(cache_sizes) != 3:
+        print(
+            "Failed to get CPU cache sizes, please add them manually via --cpu_cache_sizes"
+        )
+        return
+    if flags.max_vector_width:
+        max_vector_width = flags.max_vector_width
+    else:
+        max_vector_width = cpuinfo.get_max_vector_width()
+    if not max_vector_width:
+        print(
+            "Failed to get CPU max vector width, please add it manually via --max_vector_width"
+        )
+        return
+    l1_data_cache_size, l2_cache_size, l3_cache_size = cache_sizes
+    if "OMP_NUM_THREADS" not in os.environ:
+        print("OMP_NUM_THREADS is not found, using 1 as default")
+    num_threads = os.environ.get("OMP_NUM_THREADS", 1)
+
+    dlti_template = f"""
+    module attributes {{
+        dlti.target_system_spec = #dlti.target_system_spec<
+        "CPU": #dlti.target_device_spec<
+            #dlti.dl_entry<"L1_cache_size_in_bytes", {l1_data_cache_size} : ui32>,
+            #dlti.dl_entry<"L2_cache_size_in_bytes", {l2_cache_size} : ui64>,
+            #dlti.dl_entry<"L3_cache_size_in_bytes", {l3_cache_size} : ui64>,
+            #dlti.dl_entry<"num_threads", {num_threads} : i32>,
+            #dlti.dl_entry<"max_vector_width", {max_vector_width} : i64>>
+        >}} {{}}
+    """
+    # parse the spec on a throwaway module, then copy the attribute over
+    with module.context:
+        template_module = ir.Module.parse(dlti_template)
+        module.operation.attributes["dlti.target_system_spec"] = (
+            template_module.operation.attributes["dlti.target_system_spec"]
+        )
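+    # The module now carries the spec, e.g. (with illustrative values):
+    #   module attributes {dlti.target_system_spec = #dlti.target_system_spec<
+    #     "CPU" : #dlti.target_device_spec<
+    #       #dlti.dl_entry<"L1_cache_size_in_bytes", 49152 : ui32>, ...>>}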