@@ -215,6 +215,7 @@ class CpuDevice {
215
215
public:
216
216
// Returns the thread count stored in `numthreads`.
int getThreads() {
  return numthreads;
}
217
217
// Returns the core count stored in `numcores`.
int getCores() {
  return numcores;
}
218
+ inline uint32_t getL3CacheSize () { return L3Cache; }
218
219
// Returns the cached L2 data-cache size held in `L2Cache`.
uint32_t getL2CacheSize() {
  return L2Cache;
}
219
220
// Returns the cached L1 data-cache size held in `L1Cache`.
uint32_t getL1CacheSize() {
  return L1Cache;
}
220
221
// Returns the value of `E_L2Cache` (presumably the L2 size seen by E-cores; confirm against the constructor).
uint32_t getL2CacheSize_E() {
  return E_L2Cache;
}
@@ -228,7 +229,7 @@ class CpuDevice {
228
229
// Reports the stored `mHasAMX_BF16` capability flag.
bool AMX_BF16() {
  return mHasAMX_BF16;
}
229
230
// Reports the stored `mHasAVX512_BF16` capability flag.
bool AVX512_BF16() {
  return mHasAVX512_BF16;
}
230
231
// Reports the stored `mHasAVX512_FP16` capability flag.
bool AVX512_FP16() {
  return mHasAVX512_FP16;
}
231
- inline float getPE () { return (P_core. size () * P_power) / (E_core. size () * E_power) ; }
232
+ inline float * const getPE () { return PE ; }
232
233
// Returns how many entries the `P_core` list currently holds.
size_t getPcoreNum() {
  return P_core.size();
}
233
234
// Returns how many entries the `E_core` list currently holds.
size_t getEcoreNum() {
  return E_core.size();
}
234
235
// Returns how many entries the `SMT_core` list currently holds.
size_t getSMTcoreNum() {
  return SMT_core.size();
}
@@ -328,12 +329,40 @@ class CpuDevice {
328
329
}
329
330
}
330
331
numcores = P_core.size () + E_core.size ();
331
- numthreads = P_core.size () * 2 + E_core.size ();
332
+ numthreads = P_core.size () + E_core.size () + SMT_core.size ();
333
+
334
+ {
335
+ // set PE
336
+ uint32_t tmp[4 ];
337
+ _cpu.getCpuid (1 , tmp);
338
+ if (p) printf (" !!!\t %x\t %x\t %x\t %x!!!\n " , tmp[0 ], tmp[1 ], tmp[2 ], tmp[3 ]);
339
+ const int famliy = (tmp[0 ] >> 8 ) & ((1u << 4 ) - 1 ); // cpu.extractBit(a[0], 8, 11);
340
+ const int extendedModel = (tmp[0 ] >> 16 ) & ((1u << 4 ) - 1 ); // cpu.extractBit(a[0], 16, 24);
341
+ {
342
+ for (int i = 0 ; i < int (BTLA_ISA::ISA_COUNT); i++) PE[i] = 1 .0f ;
343
+ // CPU identification refer to: https://en.wikichip.org/wiki/intel/cpuid
344
+ if (famliy == 6 ) switch (extendedModel) {
345
+ case 9 : // ALD
346
+ PE[int (BTLA_ISA::AVX2)] = 3 .0f ;
347
+ PE[int (BTLA_ISA::AVX_VNNI)] = 5 .0f ;
348
+ break ;
349
+ case 10 : // MTL
350
+ PE[int (BTLA_ISA::AVX2)] = 2 .2f ;
351
+ PE[int (BTLA_ISA::AVX_VNNI)] = 3 .0f ;
352
+ break ;
353
+ case 11 : // RPL
354
+ PE[int (BTLA_ISA::AVX2)] = 1 .8f ;
355
+ PE[int (BTLA_ISA::AVX_VNNI)] = 2 .6f ;
356
+ break ;
357
+ }
358
+ }
359
+ }
332
360
} else {
333
361
L1Cache = _cpu.getDataCacheSize (0 );
334
362
L2Cache = _cpu.getDataCacheSize (1 );
335
363
numthreads = numcores;
336
364
}
365
+ L3Cache = _cpu.getDataCacheSize (2 );
337
366
#if FIXED_CACHE
338
367
L2Cache = L2Cache >= FIXED_CACHE_SIZE ? FIXED_CACHE_SIZE : L2Cache;
339
368
E_L2Cache = E_L2Cache >= FIXED_CACHE_SIZE ? FIXED_CACHE_SIZE : E_L2Cache;
@@ -357,7 +386,7 @@ class CpuDevice {
357
386
Xbyak::util::Cpu cpu;
358
387
uint32_t tmp[4 ];
359
388
cpu.getCpuid (0x1A , tmp);
360
- int core_type = (tmp[0 ] >> 24 ) & ((1u << 7 ) - 1 ); // cpu.extractBit(a[0], 24, 31);
389
+ int core_type = (tmp[0 ] >> 24 ) & ((1u << 8 ) - 1 ); // cpu.extractBit(a[0], 24, 31);
361
390
switch (core_type) {
362
391
case 32 :
363
392
// printf("Atom\n");
@@ -407,7 +436,7 @@ class CpuDevice {
407
436
}
408
437
static void core_bond (int core) {
409
438
#ifdef _WIN32
410
- SetThreadAffinityMask (GetCurrentThread (), 1 << core);
439
+ SetThreadAffinityMask (GetCurrentThread (), 1LL << core);
411
440
#else
412
441
cpu_set_t cpuset;
413
442
CPU_ZERO (&cpuset);
@@ -420,7 +449,7 @@ class CpuDevice {
420
449
static void core_bond (std::thread& thread, int core) {
421
450
#ifdef _WIN32
422
451
HANDLE handle = thread.native_handle ();
423
- SetThreadAffinityMask (handle, 1 << core);
452
+ SetThreadAffinityMask (handle, 1LL << core);
424
453
#else
425
454
cpu_set_t cpuset;
426
455
CPU_ZERO (&cpuset);
@@ -434,29 +463,69 @@ class CpuDevice {
434
463
// Reports the stored `mHybrid` flag.
bool isHybrid() {
  return mHybrid;
}
435
464
436
465
protected:
437
- uint32_t L2Cache, L1Cache;
466
+ uint32_t L2Cache, L1Cache, L3Cache ;
438
467
bool mHybrid = false ;
439
468
bool mHasAVX2 , mHasAVX_VNNI , mHasAVX , mHasAVX512_VNNI , mHasAMX_INT8 , mHasAMX_BF16 , mHasAVX512F , mHasAVX512_BF16 ,
440
469
mHasAVX512_FP16 ;
441
470
int numcores;
442
471
int numthreads;
443
472
std::vector<int > P_core, E_core, SMT_core;
444
473
uint32_t E_L2Cache, E_L1Cache;
445
- float P_power = 4.8 , E_power = 2.3 ;
474
+ float PE[ int (BTLA_ISA::ISA_COUNT)] ;
446
475
};
447
476
448
477
// Convenience macro: declares a local named `_cd` in the enclosing scope, initialized from
// bestla::device::CpuDevice::getInstance(). The expansion already ends with ';', and it
// introduces the identifier `_cd`, so it can be used at most once per scope.
#define GetCPUDevice () auto _cd = bestla::device::CpuDevice::getInstance();
449
478
450
- class CpuBase {
479
+ class CpuRuntime {
451
480
public:
452
- CpuBase () {
481
+ CpuRuntime () = default ;
482
+ static CpuRuntime& getInstance (int thread) {
483
+ static std::map<int , CpuRuntime> instances;
484
+ if (instances.count (thread) == 0 ) instances[thread] = CpuRuntime (thread);
485
+ return instances[thread];
486
+ }
487
+
488
+ inline float getPE (const BTLA_ISA isa) {
489
+ // printf("GET:%d\t%f\n",int(isa), *cur_PE);
490
+ return PE[int (isa)] * P_core_num / E_core_num;
491
+ }
492
+
493
+ inline void adjustPE (const BTLA_ISA isa, const float PE_) {
494
+ // printf("Adjust:%d,%f\n",int(isa),PE_);
495
+ PE[int (isa)] *= PE_;
496
+ }
497
+
498
+ size_t mL2Cache , mL1Cache , mL2Cache_P = 0 , mL1Cache_P = 0 , mL2Cache_E = 0 , mL1Cache_E = 0 ;
499
+ int P_core_num = 0 , E_core_num = 0 ;
500
+ bool mHybrid = false ;
501
+
502
+ private:
503
+ CpuRuntime (int thread) {
453
504
GetCPUDevice ();
454
505
mL2Cache = _cd->getL2CacheSize ();
455
506
mL1Cache = _cd->getL1CacheSize ();
456
- mNumThreads = _cd->getThreads ();
507
+ maxThreads = _cd->getThreads ();
508
+ mHybrid = false ;
509
+ if (_cd->isHybrid () && thread > _cd->getPcoreNum ()) {
510
+ if (thread > _cd->getPcoreNum () + _cd->getEcoreNum ()) {
511
+ mL1Cache_P = mL1Cache / 2 ;
512
+ mL2Cache_P = mL2Cache / 2 ;
513
+ P_core_num = _cd->getPcoreNum ();
514
+ E_core_num = _cd->getEcoreNum ();
515
+ } else {
516
+ mL1Cache_P = mL1Cache ;
517
+ mL2Cache_P = mL2Cache ;
518
+ P_core_num = _cd->getPcoreNum ();
519
+ E_core_num = thread - P_core_num;
520
+ }
521
+ mL1Cache_E = _cd->getL1CacheSize_E ();
522
+ mL2Cache_E = _cd->getL2CacheSize_E ();
523
+ mHybrid = true ;
524
+ memcpy (PE, _cd->getPE (), int (BTLA_ISA::ISA_COUNT) * sizeof (float ));
525
+ }
457
526
}
458
- size_t mL2Cache , mL1Cache ;
459
- int mNumThreads ;
527
+ float PE[ int (BTLA_ISA::ISA_COUNT)] ;
528
+ int maxThreads ;
460
529
};
461
530
} // namespace device
462
531
} // namespace bestla
0 commit comments