Skip to content
This repository was archived by the owner on Aug 30, 2024. It is now read-only.

Commit fd19a44

Browse files
yuchengliu1, zhewang1-intc, luoyu-intel
authored
[BesTLA] New thread pool and hybrid dispatcher (#118)
--------- Co-authored-by: ZheWang <zhe1.wang@intel.com> Co-authored-by: Luo, Yu <yu.luo@intel.com>
1 parent ad3d19e commit fd19a44

29 files changed

+1714
-1262
lines changed

.github/workflows/scripts/formatScan/clangtidy.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@ log_path=${log_dir}/clangtidy.log
1111
cd ${REPO_DIR}
1212
mkdir build
1313
cd build
14-
cmake .. -G Ninja -DNS_USE_CLANG_TIDY=CHECK -DBTLA_USE_OPENMP=OFF
14+
cmake .. -G Ninja -DNS_USE_CLANG_TIDY=CHECK -DBTLA_ENABLE_OPENMP=OFF -DNS_USE_OMP=OFF
1515
ninja 2>&1 | tee ${log_path}
1616

1717
if [[ ! -f ${log_path} ]] || [[ $(grep -c "warning:" ${log_path}) != 0 ]] || [[ $(grep -c "error" ${log_path}) != 0 ]]; then

CMakeLists.txt

Lines changed: 6 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -60,9 +60,9 @@ option(NS_AVX512_VBMI "neural_speed: enable AVX512-VBMI"
6060
option(NS_AVX512_VNNI "neural_speed: enable AVX512-VNNI" OFF)
6161
option(NS_FMA "neural_speed: enable FMA" ON)
6262
option(NS_AMX "neural_speed: enable AMX" OFF)
63+
option(NS_USE_OMP "neural_speed: use OpenMP thread pool." ON)
6364

6465
option(NS_BUILD_TESTS "neural_speed: build tests" ${NS_STANDALONE})
65-
option(NS_BTLA_UT "enable BesTLA's unit tests" OFF)
6666
option(NS_BUILD_EXAMPLES "neural_speed: build examples" ${NS_STANDALONE})
6767
option(NS_USE_CLANG_TIDY "neural_speed: clang-tidy check" OFF)
6868

@@ -135,12 +135,13 @@ if (NS_PYTHON_API)
135135
add_subdirectory(third_party/pybind11)
136136
endif()
137137

138-
if (NS_BTLA_UT)
139-
set(BTLA_UT_ALL ON)
138+
if(NS_USE_OMP)
139+
include(FindOpenMP)
140+
# compile BesTLA's OMPThreading class so that it can be used in ne_layers
141+
set(BTLA_ENABLE_OPENMP ON CACHE BOOL "BesTLA enable compiling OpenMP threading")
142+
add_compile_definitions(NS_USE_OMP)
140143
endif()
141-
include(FindOpenMP)
142144

143-
set(BTLA_USE_OPENMP ON CACHE BOOL "BesTLA use OpenMP")
144145
add_subdirectory(bestla)
145146

146147
add_subdirectory(neural_speed)

CMakePresets.json

Lines changed: 43 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,16 @@
2323
"inherits": "linux-debug",
2424
"cacheVariables": { "CMAKE_BUILD_TYPE": "Release" }
2525
},
26+
{
27+
"name": "linux-release-thread",
28+
"displayName": "Linux Release Thread Pool",
29+
"description": "Release",
30+
"inherits": "linux-debug",
31+
"cacheVariables": {
32+
"CMAKE_BUILD_TYPE": "Release",
33+
"NS_USE_OMP": "OFF"
34+
}
35+
},
2636
{
2737
"name": "windows-base",
2838
"description": "Target Windows with the Visual Studio development environment.",
@@ -49,23 +59,51 @@
4959
"value": "x64",
5060
"strategy": "external"
5161
},
52-
"cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" }
62+
"cacheVariables": {
63+
"CMAKE_BUILD_TYPE": "Debug",
64+
"NS_PROFILING": "ON",
65+
"NS_USE_OMP": "ON",
66+
"BTLA_UT_DEBUG": "ON"
67+
}
5368
},
5469
{
5570
"name": "x64-release",
5671
"displayName": "x64 Release",
5772
"description": "Target Windows (64-bit) with the Visual Studio development environment. (RelWithDebInfo)",
5873
"inherits": "x64-debug",
59-
"cacheVariables": { "CMAKE_BUILD_TYPE": "Release" }
74+
"cacheVariables": {
75+
"CMAKE_BUILD_TYPE": "Release",
76+
"BTLA_UT_DEBUG": "OFF"
77+
}
78+
},
79+
{
80+
"name": "x64-release-thread",
81+
"displayName": "x64 Release without OpenMP",
82+
"description": "Target Windows (64-bit) with the Visual Studio development environment. (RelWithDebInfo)",
83+
"inherits": "x64-release",
84+
"cacheVariables": {
85+
"NS_USE_OMP": "OFF"
86+
}
6087
},
6188
{
6289
"name": "x64-bestla-UT",
6390
"displayName": "x64 BesTLA unit test",
6491
"description": "Target Windows (64-bit) with the Visual Studio development environment. (RelWithDebInfo)",
65-
"inherits": "x64-debug",
92+
"inherits": "x64-release",
6693
"cacheVariables": {
67-
"CMAKE_BUILD_TYPE": "Release",
68-
"NS_BTLA_UT": "ON"
94+
"CMAKE_BUILD_TYPE": "RelWithDebInfo",
95+
"BTLA_UT_ALL": "ON",
96+
"BTLA_UT_BENCHMARK": "ON",
97+
"BTLA_UT_OPENMP": "ON"
98+
}
99+
},
100+
{
101+
"name": "x64-ut-thread",
102+
"displayName": "x64 BesTLA UT without OpenMP",
103+
"description": "Target Windows (64-bit) with the Visual Studio development environment. (RelWithDebInfo)",
104+
"inherits": "x64-bestla-UT",
105+
"cacheVariables": {
106+
"BTLA_UT_OPENMP": "OFF"
69107
}
70108
}
71109
]

bestla/CMakeLists.txt

Lines changed: 23 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@ project(bestla LANGUAGES CXX VERSION 0.1.0)
44
file(GLOB headers ${PROJECT_NAME}/*.h ${PROJECT_NAME}/*.hpp)
55
file(GLOB xbyak_headers ${PROJECT_NAME}/xbyak/*.h ${PROJECT_NAME}/xbyak/*.hpp)
66

7-
option(BTLA_USE_OPENMP "Enable OpenMP thread pool" OFF)
7+
option(BTLA_ENABLE_OPENMP "Compile OpenMP thread pool if OMP can be found" OFF)
88

99
option(BTLA_UT_ALL "Enable all unit tests" OFF)
1010
option(BTLA_UT_DEBUG "Enable debug unit tests" OFF)
@@ -19,7 +19,7 @@ option(BTLA_UT_KERNEL_INTRIN "Enable unit test for intrinsic kernels" OFF)
1919
option(BTLA_UT_KERNEL_WRAPPER "Enable unit test for runtime ISA kernels" OFF)
2020
option(BTLA_UT_NOASAN "Disable sanitize" OFF)
2121
option(BTLA_UT_BENCHMARK "Benchmark ON may take a long time to finish all tests" OFF)
22-
option(BTLA_UT_OPENMP "Use OpenMP" ON)
22+
option(BTLA_UT_OPENMP "Use OpenMP for UT tests" OFF)
2323

2424
add_library(${PROJECT_NAME} INTERFACE)
2525
add_library(neural_speed::${PROJECT_NAME} ALIAS ${PROJECT_NAME})
@@ -30,10 +30,10 @@ target_include_directories(
3030
)
3131

3232

33-
if(BTLA_USE_OPENMP)
34-
message(STATUS "BesTLA using OpenMP")
33+
if(BTLA_ENABLE_OPENMP)
34+
message(STATUS "BesTLA enable OpenMP ThreadPool")
3535
target_compile_definitions(${PROJECT_NAME} INTERFACE BTLA_USE_OPENMP)
36-
endif(BTLA_USE_OPENMP)
36+
endif(BTLA_ENABLE_OPENMP)
3737

3838
if(WIN32)
3939
target_compile_definitions(${PROJECT_NAME} INTERFACE _CRT_SECURE_NO_WARNINGS NOMINMAX)
@@ -64,12 +64,14 @@ endif()
6464

6565
function(add_ut_flag UT_OPTION)
6666
if(${${UT_OPTION}})
67-
target_compile_definitions(${PROJECT_NAME}_ut PRIVATE ${UT_OPTION})
67+
# target_compile_definitions(${PROJECT_NAME}_ut PRIVATE ${UT_OPTION})
68+
add_compile_definitions(${UT_OPTION})
6869
endif()
6970
endfunction()
7071

7172
if(UT_BUILD)
7273
file(GLOB srcs ${PROJECT_NAME}/ut/*.cc ${PROJECT_NAME}/ut/*.cpp) #compile everything even run parts of UTs
74+
list(REMOVE_ITEM srcs ${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}/ut/bestla_benchmark.cpp)
7375
file(GLOB ut_headers ${PROJECT_NAME}/ut/*.h)
7476
include_directories(${PROJECT_NAME})
7577
add_executable(${PROJECT_NAME}_ut ${srcs} ${headers} ${ut_headers})
@@ -96,8 +98,21 @@ if(UT_BUILD)
9698
add_ut_flag(BTLA_UT_KERNEL_INTRIN)
9799
add_ut_flag(BTLA_UT_KERNEL_JIT)
98100
add_ut_flag(BTLA_UT_KERNEL_WRAPPER)
99-
add_ut_flag(BTLA_UT_BENCHMARK)
100-
101101
target_link_libraries(${PROJECT_NAME}_ut PRIVATE ${PROJECT_NAME})
102102
endif(UT_BUILD)
103103

104+
if(BTLA_UT_BENCHMARK)
105+
file(GLOB srcs ${PROJECT_NAME}/ut/bestla_benchmark.cpp) # the benchmark executable is built from bestla_benchmark.cpp only
106+
file(GLOB ut_headers ${PROJECT_NAME}/ut/*.h)
107+
include_directories(${PROJECT_NAME})
108+
add_executable(${PROJECT_NAME}_benchmark ${srcs} ${headers} ${ut_headers})
109+
if(BTLA_UT_OPENMP)
110+
include(FindOpenMP)
111+
target_compile_definitions(${PROJECT_NAME} INTERFACE BTLA_USE_OPENMP)
112+
target_link_libraries(${PROJECT_NAME}_benchmark PRIVATE OpenMP::OpenMP_CXX)
113+
endif()
114+
if(NOT WIN32)
115+
target_link_options(${PROJECT_NAME}_benchmark PRIVATE -lpthread)
116+
endif()
117+
target_link_libraries(${PROJECT_NAME}_benchmark PRIVATE ${PROJECT_NAME})
118+
endif(BTLA_UT_BENCHMARK)

bestla/bestla/bestla.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -31,6 +31,7 @@ enum class BTLA_ISA : uint8_t {
3131
AMX_INT8,
3232
AVX512_FP16,
3333
AVX512_BF16,
34+
ISA_COUNT,
3435
};
3536
enum class BTLA_DTYPE : uint32_t {
3637
EleBitsMask = 0xff,

bestla/bestla/bestla_device.h

Lines changed: 81 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -215,6 +215,7 @@ class CpuDevice {
215215
public:
216216
inline int getThreads() { return numthreads; }
217217
inline int getCores() { return numcores; }
218+
inline uint32_t getL3CacheSize() { return L3Cache; }
218219
inline uint32_t getL2CacheSize() { return L2Cache; }
219220
inline uint32_t getL1CacheSize() { return L1Cache; }
220221
inline uint32_t getL2CacheSize_E() { return E_L2Cache; }
@@ -228,7 +229,7 @@ class CpuDevice {
228229
inline bool AMX_BF16() { return mHasAMX_BF16; }
229230
inline bool AVX512_BF16() { return mHasAVX512_BF16; }
230231
inline bool AVX512_FP16() { return mHasAVX512_FP16; }
231-
inline float getPE() { return (P_core.size() * P_power) / (E_core.size() * E_power); }
232+
inline float* const getPE() { return PE; }
232233
inline size_t getPcoreNum() { return P_core.size(); }
233234
inline size_t getEcoreNum() { return E_core.size(); }
234235
inline size_t getSMTcoreNum() { return SMT_core.size(); }
@@ -328,12 +329,40 @@ class CpuDevice {
328329
}
329330
}
330331
numcores = P_core.size() + E_core.size();
331-
numthreads = P_core.size() * 2 + E_core.size();
332+
numthreads = P_core.size() + E_core.size() + SMT_core.size();
333+
334+
{
335+
// set PE
336+
uint32_t tmp[4];
337+
_cpu.getCpuid(1, tmp);
338+
if (p) printf("!!!\t%x\t%x\t%x\t%x!!!\n", tmp[0], tmp[1], tmp[2], tmp[3]);
339+
const int famliy = (tmp[0] >> 8) & ((1u << 4) - 1); // cpu.extractBit(a[0], 8, 11);
340+
const int extendedModel = (tmp[0] >> 16) & ((1u << 4) - 1); // cpu.extractBit(a[0], 16, 24);
341+
{
342+
for (int i = 0; i < int(BTLA_ISA::ISA_COUNT); i++) PE[i] = 1.0f;
343+
// CPU identification refer to: https://en.wikichip.org/wiki/intel/cpuid
344+
if (famliy == 6) switch (extendedModel) {
345+
case 9: // ALD
346+
PE[int(BTLA_ISA::AVX2)] = 3.0f;
347+
PE[int(BTLA_ISA::AVX_VNNI)] = 5.0f;
348+
break;
349+
case 10: // MTL
350+
PE[int(BTLA_ISA::AVX2)] = 2.2f;
351+
PE[int(BTLA_ISA::AVX_VNNI)] = 3.0f;
352+
break;
353+
case 11: // RPL
354+
PE[int(BTLA_ISA::AVX2)] = 1.8f;
355+
PE[int(BTLA_ISA::AVX_VNNI)] = 2.6f;
356+
break;
357+
}
358+
}
359+
}
332360
} else {
333361
L1Cache = _cpu.getDataCacheSize(0);
334362
L2Cache = _cpu.getDataCacheSize(1);
335363
numthreads = numcores;
336364
}
365+
L3Cache = _cpu.getDataCacheSize(2);
337366
#if FIXED_CACHE
338367
L2Cache = L2Cache >= FIXED_CACHE_SIZE ? FIXED_CACHE_SIZE : L2Cache;
339368
E_L2Cache = E_L2Cache >= FIXED_CACHE_SIZE ? FIXED_CACHE_SIZE : E_L2Cache;
@@ -357,7 +386,7 @@ class CpuDevice {
357386
Xbyak::util::Cpu cpu;
358387
uint32_t tmp[4];
359388
cpu.getCpuid(0x1A, tmp);
360-
int core_type = (tmp[0] >> 24) & ((1u << 7) - 1); // cpu.extractBit(a[0], 24, 31);
389+
int core_type = (tmp[0] >> 24) & ((1u << 8) - 1); // cpu.extractBit(a[0], 24, 31);
361390
switch (core_type) {
362391
case 32:
363392
// printf("Atom\n");
@@ -407,7 +436,7 @@ class CpuDevice {
407436
}
408437
static void core_bond(int core) {
409438
#ifdef _WIN32
410-
SetThreadAffinityMask(GetCurrentThread(), 1 << core);
439+
SetThreadAffinityMask(GetCurrentThread(), 1LL << core);
411440
#else
412441
cpu_set_t cpuset;
413442
CPU_ZERO(&cpuset);
@@ -420,7 +449,7 @@ class CpuDevice {
420449
static void core_bond(std::thread& thread, int core) {
421450
#ifdef _WIN32
422451
HANDLE handle = thread.native_handle();
423-
SetThreadAffinityMask(handle, 1 << core);
452+
SetThreadAffinityMask(handle, 1LL << core);
424453
#else
425454
cpu_set_t cpuset;
426455
CPU_ZERO(&cpuset);
@@ -434,29 +463,69 @@ class CpuDevice {
434463
bool isHybrid() { return mHybrid; }
435464

436465
protected:
437-
uint32_t L2Cache, L1Cache;
466+
uint32_t L2Cache, L1Cache, L3Cache;
438467
bool mHybrid = false;
439468
bool mHasAVX2, mHasAVX_VNNI, mHasAVX, mHasAVX512_VNNI, mHasAMX_INT8, mHasAMX_BF16, mHasAVX512F, mHasAVX512_BF16,
440469
mHasAVX512_FP16;
441470
int numcores;
442471
int numthreads;
443472
std::vector<int> P_core, E_core, SMT_core;
444473
uint32_t E_L2Cache, E_L1Cache;
445-
float P_power = 4.8, E_power = 2.3;
474+
float PE[int(BTLA_ISA::ISA_COUNT)];
446475
};
447476

448477
#define GetCPUDevice() auto _cd = bestla::device::CpuDevice::getInstance();
449478

450-
class CpuBase {
479+
class CpuRuntime {
451480
public:
452-
CpuBase() {
481+
CpuRuntime() = default;
482+
static CpuRuntime& getInstance(int thread) {
483+
static std::map<int, CpuRuntime> instances;
484+
if (instances.count(thread) == 0) instances[thread] = CpuRuntime(thread);
485+
return instances[thread];
486+
}
487+
488+
inline float getPE(const BTLA_ISA isa) {
489+
// printf("GET:%d\t%f\n",int(isa), *cur_PE);
490+
return PE[int(isa)] * P_core_num / E_core_num;
491+
}
492+
493+
inline void adjustPE(const BTLA_ISA isa, const float PE_) {
494+
// printf("Adjust:%d,%f\n",int(isa),PE_);
495+
PE[int(isa)] *= PE_;
496+
}
497+
498+
size_t mL2Cache, mL1Cache, mL2Cache_P = 0, mL1Cache_P = 0, mL2Cache_E = 0, mL1Cache_E = 0;
499+
int P_core_num = 0, E_core_num = 0;
500+
bool mHybrid = false;
501+
502+
private:
503+
CpuRuntime(int thread) {
453504
GetCPUDevice();
454505
mL2Cache = _cd->getL2CacheSize();
455506
mL1Cache = _cd->getL1CacheSize();
456-
mNumThreads = _cd->getThreads();
507+
maxThreads = _cd->getThreads();
508+
mHybrid = false;
509+
if (_cd->isHybrid() && thread > _cd->getPcoreNum()) {
510+
if (thread > _cd->getPcoreNum() + _cd->getEcoreNum()) {
511+
mL1Cache_P = mL1Cache / 2;
512+
mL2Cache_P = mL2Cache / 2;
513+
P_core_num = _cd->getPcoreNum();
514+
E_core_num = _cd->getEcoreNum();
515+
} else {
516+
mL1Cache_P = mL1Cache;
517+
mL2Cache_P = mL2Cache;
518+
P_core_num = _cd->getPcoreNum();
519+
E_core_num = thread - P_core_num;
520+
}
521+
mL1Cache_E = _cd->getL1CacheSize_E();
522+
mL2Cache_E = _cd->getL2CacheSize_E();
523+
mHybrid = true;
524+
memcpy(PE, _cd->getPE(), int(BTLA_ISA::ISA_COUNT) * sizeof(float));
525+
}
457526
}
458-
size_t mL2Cache, mL1Cache;
459-
int mNumThreads;
527+
float PE[int(BTLA_ISA::ISA_COUNT)];
528+
int maxThreads;
460529
};
461530
} // namespace device
462531
} // namespace bestla

0 commit comments

Comments
 (0)