diff --git a/.gitignore b/.gitignore index 489a98f4..8f811bd9 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,4 @@ Release64 .metadata/ html/ latex/ +*.swp diff --git a/cpucounters.cpp b/cpucounters.cpp index 6d0a47f3..e8cb48fe 100644 --- a/cpucounters.cpp +++ b/cpucounters.cpp @@ -89,7 +89,7 @@ int convertUnknownToInt(size_t size, char* value); #endif #undef PCM_UNCORE_PMON_BOX_CHECK_STATUS // debug only -#undef PCM_DEBUG_TOPOLOGY // debug of topoogy enumeration routine +#undef PCM_DEBUG_TOPOLOGY // debug of topology enumeration routine // FreeBSD is much more restrictive about names for semaphores #if defined (__FreeBSD__) @@ -645,28 +645,28 @@ void PCM::initCStateSupportTables() case ATOM_CHERRYTRAIL: case ATOM_APOLLO_LAKE: case ATOM_DENVERTON: - PCM_CSTATE_ARRAY(pkgCStateMsr, PCM_PARAM_PROTECT({0, 0, 0x3F8, 0, 0x3F9, 0, 0x3FA, 0, 0, 0, 0 }) ); + PCM_CSTATE_ARRAY(pkgCStateMsr, PCM_PARAM_PROTECT({0, 0, 0x3F8, 0, 0x3F9, 0, 0x3FA, 0, 0, 0, 0 }) ); case NEHALEM_EP: case NEHALEM: case CLARKDALE: case WESTMERE_EP: case NEHALEM_EX: case WESTMERE_EX: - PCM_CSTATE_ARRAY(pkgCStateMsr, PCM_PARAM_PROTECT({0, 0, 0, 0x3F8, 0, 0, 0x3F9, 0x3FA, 0, 0, 0}) ); + PCM_CSTATE_ARRAY(pkgCStateMsr, PCM_PARAM_PROTECT({0, 0, 0, 0x3F8, 0, 0, 0x3F9, 0x3FA, 0, 0, 0}) ); case SANDY_BRIDGE: case JAKETOWN: case IVY_BRIDGE: case IVYTOWN: - PCM_CSTATE_ARRAY(pkgCStateMsr, PCM_PARAM_PROTECT({0, 0, 0x60D, 0x3F8, 0, 0, 0x3F9, 0x3FA, 0, 0, 0}) ); + PCM_CSTATE_ARRAY(pkgCStateMsr, PCM_PARAM_PROTECT({0, 0, 0x60D, 0x3F8, 0, 0, 0x3F9, 0x3FA, 0, 0, 0}) ); case HASWELL: case HASWELL_2: case HASWELLX: case BDX_DE: case BDX: case KNL: - PCM_CSTATE_ARRAY(pkgCStateMsr, PCM_PARAM_PROTECT({0, 0, 0x60D, 0x3F8, 0, 0, 0x3F9, 0x3FA, 0, 0, 0}) ); + PCM_CSTATE_ARRAY(pkgCStateMsr, PCM_PARAM_PROTECT({0, 0, 0x60D, 0x3F8, 0, 0, 0x3F9, 0x3FA, 0, 0, 0}) ); case SKX: - PCM_CSTATE_ARRAY(pkgCStateMsr, PCM_PARAM_PROTECT({0, 0, 0x60D, 0, 0, 0, 0x3F9, 0, 0, 0, 0}) ); + PCM_CSTATE_ARRAY(pkgCStateMsr, PCM_PARAM_PROTECT({0, 0, 0x60D, 0, 0, 0, 0x3F9, 0, 0, 0, 0}) ); case HASWELL_ULT: case BROADWELL: case SKL: @@ -674,7 +674,7 @@ void PCM::initCStateSupportTables() case KBL: case KBL_1: case BROADWELL_XEON_E3: - PCM_CSTATE_ARRAY(pkgCStateMsr, PCM_PARAM_PROTECT({0, 0, 0x60D, 0x3F8, 0, 0, 0x3F9, 0x3FA, 0x630, 0x631, 0x632}) ); + PCM_CSTATE_ARRAY(pkgCStateMsr, PCM_PARAM_PROTECT({0, 0, 0x60D, 0x3F8, 0, 0, 0x3F9, 0x3FA, 0x630, 0x631, 0x632}) ); default: std::cerr << "PCM error: package C-states support array is not initialized. Package C-states metrics will not be shown." << std::endl; @@ -694,7 +694,7 @@ void PCM::initCStateSupportTables() case WESTMERE_EP: case NEHALEM_EX: case WESTMERE_EX: - PCM_CSTATE_ARRAY(coreCStateMsr, PCM_PARAM_PROTECT({0, 0, 0, 0x3FC, 0, 0, 0x3FD, 0, 0, 0, 0}) ); + PCM_CSTATE_ARRAY(coreCStateMsr, PCM_PARAM_PROTECT({0, 0, 0, 0x3FC, 0, 0, 0x3FD, 0, 0, 0, 0}) ); case SANDY_BRIDGE: case JAKETOWN: case IVY_BRIDGE: @@ -716,11 +716,11 @@ void PCM::initCStateSupportTables() case SKL: case KBL: case KBL_1: - PCM_CSTATE_ARRAY(coreCStateMsr, PCM_PARAM_PROTECT({0, 0, 0, 0x3FC, 0, 0, 0x3FD, 0x3FE, 0, 0, 0}) ); + PCM_CSTATE_ARRAY(coreCStateMsr, PCM_PARAM_PROTECT({0, 0, 0, 0x3FC, 0, 0, 0x3FD, 0x3FE, 0, 0, 0}) ); case KNL: - PCM_CSTATE_ARRAY(coreCStateMsr, PCM_PARAM_PROTECT({0, 0, 0, 0, 0, 0, 0x3FF, 0, 0, 0, 0}) ); + PCM_CSTATE_ARRAY(coreCStateMsr, PCM_PARAM_PROTECT({0, 0, 0, 0, 0, 0, 0x3FF, 0, 0, 0, 0}) ); case SKX: - PCM_CSTATE_ARRAY(coreCStateMsr, PCM_PARAM_PROTECT({0, 0, 0, 0, 0, 0, 0x3FD, 0, 0, 0, 0}) ); + PCM_CSTATE_ARRAY(coreCStateMsr, PCM_PARAM_PROTECT({0, 0, 0, 0, 0, 0, 0x3FD, 0, 0, 0, 0}) ); default: std::cerr << "PCM error: core C-states support array is not initialized. Core C-states metrics will not be shown." << std::endl; PCM_CSTATE_ARRAY(coreCStateMsr, PCM_PARAM_PROTECT({ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }) ); @@ -769,7 +769,8 @@ bool PCM::discoverSystemTopology() PCM_CPUID_INFO cpuid_args; pcm_cpuid(1, cpuid_args); - int apic_ids_per_package = (cpuid_args.array[1] & 0x00FF0000) >> 16, apic_ids_per_core; + int apic_ids_per_package = extract_bits_ui(cpuid_args.array[1], 16, 23); + int apic_ids_per_core; if (apic_ids_per_package == 0) { @@ -779,8 +780,8 @@ bool PCM::discoverSystemTopology() pcm_cpuid(0xb, 0x0, cpuid_args); - if ((cpuid_args.array[2] & 0xFF00) == 0x100) - apic_ids_per_core = cpuid_args.array[1] & 0xFFFF; + if (extract_bits_ui(cpuid_args.array[2], 8, 15) == 0x1) + apic_ids_per_core = extract_bits_ui(cpuid_args.array[1], 0, 15); else apic_ids_per_core = 1; @@ -790,71 +791,23 @@ bool PCM::discoverSystemTopology() return false; } - // init constants for CPU topology leaf 0xB - // adapted from Topology Enumeration Reference code for Intel 64 Architecture - // https://software.intel.com/en-us/articles/intel-64-architecture-processor-topology-enumeration - int wasCoreReported = 0, wasThreadReported = 0; - int subleaf = 0, levelType, levelShift; - unsigned long coreplusSMT_Mask = 0L; - uint32 coreSelectMask = 0, smtSelectMask = 0, smtMaskWidth = 0; - uint32 l2CacheMaskShift = 0, l2CacheMaskWidth; - uint32 pkgSelectMask = (-1), pkgSelectMaskShift = 0; - unsigned long mask; - - do - { - pcm_cpuid(0xb, subleaf, cpuid_args); - if (cpuid_args.array[1] == 0) - { // if EBX ==0 then this subleaf is not valid, we can exit the loop - break; - } - mask = (1<<(16)) - 1; - levelType = (cpuid_args.array[2] & mask) >> 8; - mask = (1<<(5)) - 1; - levelShift = (cpuid_args.array[0] & mask); - switch (levelType) - { - case 1: //level type is SMT, so levelShift is the SMT_Mask_Width - smtSelectMask = ~((-1) << levelShift); - smtMaskWidth = levelShift; - wasThreadReported = 1; - break; - case 2: //level type is Core, so levelShift is the CorePlsuSMT_Mask_Width - coreplusSMT_Mask = ~((-1) << levelShift); - pkgSelectMaskShift = levelShift; - pkgSelectMask = (-1) ^ coreplusSMT_Mask; - wasCoreReported = 1; - break; - default: - break; - } - subleaf++; - } while (1); - - if(wasThreadReported && wasCoreReported) - { - coreSelectMask = coreplusSMT_Mask ^ smtSelectMask; - } - else if (!wasCoreReported && wasThreadReported) - { - pkgSelectMaskShift = smtMaskWidth; - pkgSelectMask = (-1) ^ smtSelectMask; - } - else - { - std::cerr << "ERROR: this should not happen if hardware function normally" << std::endl; - return false; - } + uint32 l2CacheMaskShift = 0; +#ifdef PCM_DEBUG_TOPOLOGY + uint32 threadsSharingL2; +#endif + uint32 l2CacheMaskWidth; pcm_cpuid(0x4, 2, cpuid_args); // get ID for L2 cache - mask = ((1<<(12)) - 1) << (14); // mask with bits 25:14 set to 1 - l2CacheMaskWidth = 1 + ((cpuid_args.array[0] & mask) >> 14); // number of APIC IDs sharing L2 cache + l2CacheMaskWidth = 1 + extract_bits_ui(cpuid_args.array[0],14,25); // number of APIC IDs sharing L2 cache +#ifdef PCM_DEBUG_TOPOLOGY + threadsSharingL2 = l2CacheMaskWidth; +#endif for( ; l2CacheMaskWidth > 1; l2CacheMaskWidth >>= 1) { l2CacheMaskShift++; } #ifdef PCM_DEBUG_TOPOLOGY - std::cerr << "DEBUG: Number of threads sharing L2 cache = " << l2CacheMaskWidth + std::cerr << "DEBUG: Number of threads sharing L2 cache = " << threadsSharingL2 << " [the most significant bit = " << l2CacheMaskShift << "]" << std::endl; #endif @@ -932,11 +885,62 @@ bool PCM::discoverSystemTopology() TopologyEntry entry; #ifdef __linux__ + // init constants for CPU topology leaf 0xB + // adapted from Topology Enumeration Reference code for Intel 64 Architecture + // https://software.intel.com/en-us/articles/intel-64-architecture-processor-topology-enumeration + int wasCoreReported = 0, wasThreadReported = 0; + int subleaf = 0, levelType, levelShift; + //uint32 coreSelectMask = 0, smtSelectMask = 0; + uint32 smtMaskWidth = 0; + //uint32 pkgSelectMask = (-1), pkgSelectMaskShift = 0; + uint32 corePlusSMTMaskWidth = 0; + uint32 coreMaskWidth = 0; + + // This code needs to run affinitized to a single core, how do we make sure of that? + do + { + pcm_cpuid(0xb, subleaf, cpuid_args); + if (cpuid_args.array[1] == 0) + { // if EBX ==0 then this subleaf is not valid, we can exit the loop + break; + } + levelType = extract_bits_ui(cpuid_args.array[2], 8, 15); + levelShift = extract_bits_ui(cpuid_args.array[0], 0, 4); + switch (levelType) + { + case 1: //level type is SMT, so levelShift is the SMT_Mask_Width + smtMaskWidth = levelShift; + wasThreadReported = 1; + break; + case 2: //level type is Core, so levelShift is the CorePlusSMT_Mask_Width + corePlusSMTMaskWidth = levelShift; + wasCoreReported = 1; + break; + default: + break; + } + subleaf++; + } while (1); + + if(wasThreadReported && wasCoreReported) + { + coreMaskWidth = corePlusSMTMaskWidth - smtMaskWidth; + } + else if (!wasCoreReported && wasThreadReported) + { + coreMaskWidth = smtMaskWidth; + } + else + { + std::cerr << "ERROR: Major problem? No leaf 0 under cpuid function 11." << std::endl; + return false; + } + num_cores = readMaxFromSysFS("/sys/devices/system/cpu/present"); if(num_cores == -1) { - std::cerr << "Can not read number of present cores" << std::endl; + std::cerr << "Cannot read number of present cores" << std::endl; return false; } ++num_cores; @@ -945,7 +949,7 @@ bool PCM::discoverSystemTopology() FILE * f_cpuinfo = fopen("/proc/cpuinfo", "r"); if (!f_cpuinfo) { - std::cerr << "Can not open /proc/cpuinfo file." << std::endl; + std::cerr << "Cannot open /proc/cpuinfo file." << std::endl; return false; } @@ -964,10 +968,10 @@ bool PCM::discoverSystemTopology() pcm_cpuid(0xb, 0x0, cpuid_args); int apic_id = cpuid_args.array[3]; - entry.thread_id = (apic_id & smtSelectMask); - entry.core_id = (apic_id & coreSelectMask) >> smtMaskWidth; - entry.socket = (apic_id & pkgSelectMask) >> pkgSelectMaskShift; - entry.tile_id = (apic_id >> l2CacheMaskShift); + entry.thread_id = extract_bits_ui(apic_id, 0, smtMaskWidth-1); + entry.core_id = extract_bits_ui(apic_id, smtMaskWidth, smtMaskWidth+coreMaskWidth-1); + entry.socket = extract_bits_ui(apic_id, smtMaskWidth+coreMaskWidth, 31); + entry.tile_id = extract_bits_ui(apic_id, l2CacheMaskShift, 31); topology[entry.os_id] = entry; socketIdMap[entry.socket] = 0; @@ -983,7 +987,8 @@ bool PCM::discoverSystemTopology() std::map > os_id_by_core, os_id_by_tile, core_id_by_socket; for(auto it = topology.begin(); it != topology.end(); ++it) { - std::cerr << std::left << std::setfill(' ') << std::setw(16) << it->os_id + std::cerr << std::left << std::setfill(' ') + << std::setw(16) << it->os_id << std::setw(16) << it->thread_id << std::setw(16) << it->core_id << std::setw(16) << it->tile_id @@ -1702,8 +1707,8 @@ bool PCM::isCPUModelSupported(int model_) || model_ == BROADWELL || model_ == KNL || model_ == SKL - || model_ == SKX || model_ == KBL + || model_ == SKX ); } @@ -2664,8 +2669,9 @@ bool PCM::PMUinUse() return false; } -const char * PCM::getUArchCodename(int32 cpu_model_) const +const char * PCM::getUArchCodename(const int32 cpu_model_param) const { + auto cpu_model_ = cpu_model_param; if(cpu_model_ < 0) cpu_model_ = this->cpu_model ; @@ -2709,6 +2715,15 @@ const char * PCM::getUArchCodename(int32 cpu_model_) const case KBL: return "Kabylake"; case SKX: + if (cpu_model_param >= 0) + { + // query for specified cpu_model_param, stepping not provided + return "Skylake-SP, Cascade Lake-SP"; + } + if (isCLX()) + { + return "Cascade Lake-SP"; + } return "Skylake-SP"; } return "unknown"; @@ -3177,13 +3192,13 @@ void BasicCounterState::readAndAggregate(std::shared_ptr msr) SMICount += cSMICount; } -PCM::ErrorCode PCM::programServerUncoreMemoryMetrics(int rankA, int rankB) +PCM::ErrorCode PCM::programServerUncoreMemoryMetrics(int rankA, int rankB, bool PMM) { if(MSR.empty() || server_pcicfg_uncore.empty()) return PCM::MSRAccessDenied; for (int i = 0; (i < (int)server_pcicfg_uncore.size()) && MSR.size(); ++i) { - server_pcicfg_uncore[i]->programServerUncoreMemoryMetrics(rankA, rankB); + server_pcicfg_uncore[i]->programServerUncoreMemoryMetrics(rankA, rankB, PMM); } return PCM::Success; @@ -3458,8 +3473,16 @@ void PCM::readAndAggregateUncoreMCCounters(const uint32 socket, CounterStateType server_pcicfg_uncore[socket]->freezeCounters(); result.UncMCNormalReads += server_pcicfg_uncore[socket]->getImcReads(); result.UncMCFullWrites += server_pcicfg_uncore[socket]->getImcWrites(); - result.UncEDCNormalReads += server_pcicfg_uncore[socket]->getEdcReads(); - result.UncEDCFullWrites += server_pcicfg_uncore[socket]->getEdcWrites(); + if (PMMTrafficMetricsAvailable()) + { + result.UncPMMReads += server_pcicfg_uncore[socket]->getPMMReads(); + result.UncPMMWrites += server_pcicfg_uncore[socket]->getPMMWrites(); + } + if (MCDRAMmemoryTrafficMetricsAvailable()) + { + result.UncEDCNormalReads += server_pcicfg_uncore[socket]->getEdcReads(); + result.UncEDCFullWrites += server_pcicfg_uncore[socket]->getEdcWrites(); + } server_pcicfg_uncore[socket]->unfreezeCounters(); } if (LLCReadMissLatencyMetricsAvailable()) @@ -3876,6 +3899,11 @@ ServerUncorePowerState PCM::getServerUncorePowerState(uint32 socket) for(uint32 cnt=0;cnt<4;++cnt) result.EDCCounter[channel][cnt] = server_pcicfg_uncore[socket]->getEDCCounter(channel,cnt); } + for (uint32 controller = 0; controller < (uint32)server_pcicfg_uncore[socket]->getNumMC(); ++controller) + { + for(uint32 cnt=0;cnt<4;++cnt) + result.M2MCounter[controller][cnt] = server_pcicfg_uncore[socket]->getM2MCounter(controller,cnt); + } server_pcicfg_uncore[socket]->unfreezeCounters(); } if(MSR.size()) @@ -3986,12 +4014,21 @@ static const uint32 UPI_DEV_IDS[] = { 0x2058 }; +static const uint32 M2M_DEV_IDS[] = { + 0x2066 +}; + PCM_Util::Mutex ServerPCICFGUncore::socket2busMutex; std::vector > ServerPCICFGUncore::socket2iMCbus; std::vector > ServerPCICFGUncore::socket2UPIbus; +std::vector > ServerPCICFGUncore::socket2M2Mbus; void ServerPCICFGUncore::initSocket2Bus(std::vector > & socket2bus, uint32 device, uint32 function, const uint32 DEV_IDS[], uint32 devIdsSize) { + if (device == PCM_INVALID_DEV_ADDR || function == PCM_INVALID_FUNC_ADDR) + { + return; + } PCM_Util::Mutex::Scope _(socket2busMutex); if(!socket2bus.empty()) return; @@ -4084,9 +4121,11 @@ PciHandleType * ServerPCICFGUncore::createIntelPerfMonDevice(uint32 groupnr_, in ServerPCICFGUncore::ServerPCICFGUncore(uint32 socket_, const PCM * pcm) : iMCbus(-1) , UPIbus(-1) + , M2Mbus(-1) , groupnr(0) , qpi_speed(0) , num_imc(0) + , num_imc_channels1(0) { #define PCM_PCICFG_MC_INIT(controller, channel, arch) \ @@ -4099,6 +4138,14 @@ ServerPCICFGUncore::ServerPCICFGUncore(uint32 socket_, const PCM * pcm) : EDCX_ECLK_REGISTER_DEV_ADDR[controller] = arch##_EDC##controller##_##clock##_REGISTER_DEV_ADDR; \ EDCX_ECLK_REGISTER_FUNC_ADDR[controller] = arch##_EDC##controller##_##clock##_REGISTER_FUNC_ADDR; +#define PCM_PCICFG_M2M_INIT(x, arch) \ + M2M_REGISTER_DEV_ADDR[x] = arch##_M2M_##x##_REGISTER_DEV_ADDR; \ + M2M_REGISTER_FUNC_ADDR[x] = arch##_M2M_##x##_REGISTER_FUNC_ADDR; + + M2M_REGISTER_DEV_ADDR[0] = PCM_INVALID_DEV_ADDR; + M2M_REGISTER_FUNC_ADDR[0] = PCM_INVALID_FUNC_ADDR; + M2M_REGISTER_DEV_ADDR[1] = PCM_INVALID_DEV_ADDR; + M2M_REGISTER_FUNC_ADDR[1] = PCM_INVALID_FUNC_ADDR; if(cpu_model == PCM::JAKETOWN || cpu_model == PCM::IVYTOWN) { @@ -4132,6 +4179,9 @@ ServerPCICFGUncore::ServerPCICFGUncore(uint32 socket_, const PCM * pcm) : PCM_PCICFG_MC_INIT(1, 1, SKX) PCM_PCICFG_MC_INIT(1, 2, SKX) PCM_PCICFG_MC_INIT(1, 3, SKX) + + PCM_PCICFG_M2M_INIT(0, SKX) + PCM_PCICFG_M2M_INIT(1, SKX) } else if(cpu_model == PCM::KNL) { @@ -4161,12 +4211,26 @@ ServerPCICFGUncore::ServerPCICFGUncore(uint32 socket_, const PCM * pcm) : #undef PCM_PCICFG_MC_INIT #undef PCM_PCICFG_EDC_INIT +#undef PCM_PCICFG_M2M_INIT - initSocket2Bus(socket2iMCbus, MCX_CHY_REGISTER_DEV_ADDR[0][0], MCX_CHY_REGISTER_FUNC_ADDR[0][0], IMC_DEV_IDS, (uint32)sizeof(IMC_DEV_IDS) / sizeof(IMC_DEV_IDS[0])); const uint32 total_sockets_ = pcm->getNumSockets(); + initSocket2Bus(socket2M2Mbus, M2M_REGISTER_DEV_ADDR[0], M2M_REGISTER_FUNC_ADDR[0], M2M_DEV_IDS, (uint32)sizeof(M2M_DEV_IDS) / sizeof(M2M_DEV_IDS[0])); + if (total_sockets_ == socket2M2Mbus.size()) + { + groupnr = socket2M2Mbus[socket_].first; + M2Mbus = socket2M2Mbus[socket_].second; + } + + initSocket2Bus(socket2iMCbus, MCX_CHY_REGISTER_DEV_ADDR[0][0], MCX_CHY_REGISTER_FUNC_ADDR[0][0], IMC_DEV_IDS, (uint32)sizeof(IMC_DEV_IDS) / sizeof(IMC_DEV_IDS[0])); + if(total_sockets_ == socket2iMCbus.size()) { + if (total_sockets_ == socket2M2Mbus.size() && socket2iMCbus[socket_].first != socket2M2Mbus[socket_].first) + { + std::cerr << "PCM error: mismatching PCICFG group number for M2M and IMC perfmon devices." << std::endl; + M2Mbus = -1; + } groupnr = socket2iMCbus[socket_].first; iMCbus = socket2iMCbus[socket_].second; @@ -4203,7 +4267,7 @@ ServerPCICFGUncore::ServerPCICFGUncore(uint32 socket_, const PCM * pcm) : PCM_PCICFG_SETUP_MC_HANDLE(0,3) if (!imcHandles.empty()) ++num_imc; // at least one memory controller - const size_t num_imc_channels1 = (size_t)imcHandles.size(); + num_imc_channels1 = (uint32)imcHandles.size(); PCM_PCICFG_SETUP_MC_HANDLE(1,0) PCM_PCICFG_SETUP_MC_HANDLE(1,1) @@ -4241,6 +4305,20 @@ ServerPCICFGUncore::ServerPCICFGUncore(uint32 socket_, const PCM * pcm) : #undef PCM_PCICFG_SETUP_EDC_HANDLE } +#define PCM_PCICFG_SETUP_M2M_HANDLE(x) \ + if (M2Mbus >= 0 && M2M_REGISTER_DEV_ADDR[x] != PCM_INVALID_DEV_ADDR && \ + M2M_REGISTER_FUNC_ADDR[x] != PCM_INVALID_FUNC_ADDR ) \ + { \ + PciHandleType * handle = createIntelPerfMonDevice(groupnr, M2Mbus, \ + M2M_REGISTER_DEV_ADDR[x], M2M_REGISTER_FUNC_ADDR[x], true); \ + if (handle) m2mHandles.push_back(std::shared_ptr(handle));\ + } + + PCM_PCICFG_SETUP_M2M_HANDLE(0) + PCM_PCICFG_SETUP_M2M_HANDLE(1) + +#undef PCM_PCICFG_SETUP_M2M_HANDLE + if (total_sockets_ == 1) { /* * For single socket systems, do not worry at all about QPI ports. This @@ -4250,13 +4328,15 @@ ServerPCICFGUncore::ServerPCICFGUncore(uint32 socket_, const PCM * pcm) : * is possible with single socket systems. */ qpiLLHandles.clear(); - std::cerr << "On the socket detected " << num_imc << " memory controllers with total number of " << imcHandles.size() << " channels. " << std::endl; + std::cerr << "On the socket detected " << num_imc << " memory controllers with total number of " << imcHandles.size() << " channels. " << + m2mHandles.size() << " M2M (mesh to memory) blocks detected."<< std::endl; return; } #ifdef PCM_NOQPI qpiLLHandles.clear(); - std::cerr << num_imc<<" memory controllers detected with total number of "<< imcHandles.size() <<" channels. " << std::endl; + std::cerr << num_imc<<" memory controllers detected with total number of "<< imcHandles.size() <<" channels. " << + m2mHandles.size() << " M2M (mesh to memory) blocks detected."<< std::endl; return; #else @@ -4374,7 +4454,20 @@ ServerPCICFGUncore::ServerPCICFGUncore(uint32 socket_, const PCM * pcm) : #endif std::cerr << "Socket "<getCPUModel(); @@ -4401,7 +4494,23 @@ void ServerPCICFGUncore::programServerUncoreMemoryMetrics(int rankA, int rankB) default: MCCntConfig[0] = MC_CH_PCI_PMON_CTL_EVENT(0x04) + MC_CH_PCI_PMON_CTL_UMASK(3); // monitor reads on counter 0: CAS_COUNT.RD MCCntConfig[1] = MC_CH_PCI_PMON_CTL_EVENT(0x04) + MC_CH_PCI_PMON_CTL_UMASK(12); // monitor writes on counter 1: CAS_COUNT.WR - MCCntConfig[2] = MC_CH_PCI_PMON_CTL_EVENT(0x04) + MC_CH_PCI_PMON_CTL_UMASK(2); // monitor partial writes on counter 2: CAS_COUNT.RD_UNDERFILL, + if (PMM) + { + if (pcm->PMMTrafficMetricsAvailable()) + { + MCCntConfig[2] = MC_CH_PCI_PMON_CTL_EVENT(0xe3); // monitor PMM_RDQ_REQUESTS on counter 2 + MCCntConfig[3] = MC_CH_PCI_PMON_CTL_EVENT(0xe7); // monitor PMM_WPQ_REQUESTS on counter 3 + } + else + { + std::cerr << "PCM Error: PMM metrics are not available on your platform" << std::endl; + return; + } + } + else + { + MCCntConfig[2] = MC_CH_PCI_PMON_CTL_EVENT(0x04) + MC_CH_PCI_PMON_CTL_UMASK(2); // monitor partial writes on counter 2: CAS_COUNT.RD_UNDERFILL, + } } } else { switch(cpu_model) @@ -4435,6 +4544,8 @@ void ServerPCICFGUncore::programServerUncoreMemoryMetrics(int rankA, int rankB) programIMC(MCCntConfig); if(cpu_model == PCM::KNL) programEDC(EDCCntConfig); + programM2M(); + qpiLLHandles.clear(); // no QPI events used return; } @@ -4453,9 +4564,14 @@ void ServerPCICFGUncore::program() EDCCntConfig[0] = MC_CH_PCI_PMON_CTL_EVENT(0x01) + MC_CH_PCI_PMON_CTL_UMASK(1); // monitor reads on counter 0: RPQ EDCCntConfig[1] = MC_CH_PCI_PMON_CTL_EVENT(0x02) + MC_CH_PCI_PMON_CTL_UMASK(1); // monitor reads on counter 1: WPQ break; - default: // check if this should be set for specific processors, like BDX only? + default: MCCntConfig[0] = MC_CH_PCI_PMON_CTL_EVENT(0x04) + MC_CH_PCI_PMON_CTL_UMASK(3); // monitor reads on counter 0: CAS_COUNT.RD MCCntConfig[1] = MC_CH_PCI_PMON_CTL_EVENT(0x04) + MC_CH_PCI_PMON_CTL_UMASK(12); // monitor writes on counter 1: CAS_COUNT.WR + if (pcm->PMMTrafficMetricsAvailable()) + { + MCCntConfig[2] = MC_CH_PCI_PMON_CTL_EVENT(0xe3); // monitor PMM_RDQ_REQUESTS on counter 2 + MCCntConfig[3] = MC_CH_PCI_PMON_CTL_EVENT(0xe7); // monitor PMM_WPQ_REQUESTS on counter 3 + } } programIMC(MCCntConfig); @@ -4540,6 +4656,29 @@ void ServerPCICFGUncore::cleanupQPIHandles() } uint64 ServerPCICFGUncore::getImcReads() +{ + return getImcReadsForChannels((uint32)0, (uint32)imcHandles.size()); +} + +uint64 ServerPCICFGUncore::getImcReadsForController(uint32 controller) +{ + uint32 beginChannel = 0; + uint32 endChannel = 0; + switch (controller) + { + case 0: + beginChannel = 0; + endChannel = num_imc_channels1; + break; + case 1: + beginChannel = num_imc_channels1; + endChannel = (uint32)imcHandles.size(); + break; + } + return getImcReadsForChannels(beginChannel, endChannel); +} + +uint64 ServerPCICFGUncore::getImcReadsForChannels(uint32 beginChannel, uint32 endChannel) { uint64 result = 0; uint64 MC_CH_PCI_PMON_CTR0_ADDR = 0; @@ -4548,19 +4687,17 @@ uint64 ServerPCICFGUncore::getImcReads() const uint32 cpu_model = pcm->getCPUModel(); if (cpu_model == PCM::KNL) { MC_CH_PCI_PMON_CTR0_ADDR = KNX_MC_CH_PCI_PMON_CTR0_ADDR; - } else { + } + else { MC_CH_PCI_PMON_CTR0_ADDR = XPF_MC_CH_PCI_PMON_CTR0_ADDR; } - - // std::cout << "DEBUG: imcHandles.size() = " << imcHandles.size() << std::endl; - for (uint32 i = 0; i < (uint32)imcHandles.size(); ++i) + for (uint32 i = beginChannel; i < endChannel && i < imcHandles.size(); ++i) { uint64 value = 0; imcHandles[i]->read64(MC_CH_PCI_PMON_CTR0_ADDR, &value); - // std::cout << "DEBUG: getImcReads() with fd = " << imcHandles[i]->fd << " value = " << value << std::endl; + // std::cout << "DEBUG: getImcReads() with fd = " << imcHandles[i]->fd << " value = " << value << std::endl; result += value; } - return result; } @@ -4587,6 +4724,26 @@ uint64 ServerPCICFGUncore::getImcWrites() return result; } +uint64 ServerPCICFGUncore::getPMMReads() +{ + uint64 result = 0; + for (uint32 i = 0; i < (uint32)imcHandles.size(); ++i) + { + result += getMCCounter(i, 2); + } + return result; +} + +uint64 ServerPCICFGUncore::getPMMWrites() +{ + uint64 result = 0; + for (uint32 i = 0; i < (uint32)imcHandles.size(); ++i) + { + result += getMCCounter(i, 3); + } + return result; +} + uint64 ServerPCICFGUncore::getEdcReads() { uint64 result = 0; @@ -4886,13 +5043,50 @@ void ServerPCICFGUncore::programEDC(const uint32 * EDCCntConfig) } } +void ServerPCICFGUncore::programM2M() +{ +#if 0 + PCM * pcm = PCM::getInstance(); + const uint32 cpu_model = pcm->getCPUModel(); + if (cpu_model == PCM::SKX) +#endif + { + for (auto & m2mHandle : m2mHandles) + { + // freeze enable + m2mHandle->write32(M2M_PCI_PMON_BOX_CTL_ADDR, UNC_PMON_UNIT_CTL_RSV); + // freeze + m2mHandle->write32(M2M_PCI_PMON_BOX_CTL_ADDR, UNC_PMON_UNIT_CTL_RSV + UNC_PMON_UNIT_CTL_FRZ); + +#ifdef PCM_UNCORE_PMON_BOX_CHECK_STATUS + uint32 val = 0; + m2mHandle->write32(M2M_PCI_PMON_BOX_CTL_ADDR, &val); + if ((val & UNC_PMON_UNIT_CTL_VALID_BITS_MASK) != (extra + UNC_PMON_UNIT_CTL_FRZ)) + { + std::cerr << "ERROR: M2M counter programming seems not to work. M2M_PCI_PMON_BOX_CTL=0x" << std::hex << val << std::endl; + std::cerr << " Please see BIOS options to enable the export of performance monitoring devices." << std::endl; + } +#endif + + m2mHandle->write32(M2M_PCI_PMON_CTL0_ADDR, M2M_PCI_PMON_CTL_EN); + // TAG_HIT.NM_DRD_HIT_* events (CLEAN | DIRTY) + m2mHandle->write32(M2M_PCI_PMON_CTL0_ADDR, M2M_PCI_PMON_CTL_EN + M2M_PCI_PMON_CTL_EVENT(0x2c) + M2M_PCI_PMON_CTL_UMASK(3)); + m2mHandle->write32(M2M_PCI_PMON_CTL3_ADDR, M2M_PCI_PMON_CTL_EN); // CLOCKTICKS + // reset counters values + m2mHandle->write32(M2M_PCI_PMON_BOX_CTL_ADDR, UNC_PMON_UNIT_CTL_RSV + UNC_PMON_UNIT_CTL_FRZ + UNC_PMON_UNIT_CTL_RST_COUNTERS); + + // unfreeze counters + m2mHandle->write32(M2M_PCI_PMON_BOX_CTL_ADDR, UNC_PMON_UNIT_CTL_RSV); + } + } +} + void ServerPCICFGUncore::freezeCounters() { uint64 MC_CH_PCI_PMON_BOX_CTL_ADDR = 0; uint64 EDC_CH_PCI_PMON_BOX_CTL_ADDR = 0; const uint32 cpu_model = PCM::getInstance()->getCPUModel(); const uint32 extra = (cpu_model == PCM::SKX)?UNC_PMON_UNIT_CTL_RSV:UNC_PMON_UNIT_CTL_FRZ_EN; - const uint32 extraIMC = (cpu_model == PCM::SKX)?UNC_PMON_UNIT_CTL_RSV:UNC_PMON_UNIT_CTL_FRZ_EN; if (cpu_model == PCM::KNL) { MC_CH_PCI_PMON_BOX_CTL_ADDR = KNX_MC_CH_PCI_PMON_BOX_CTL_ADDR; EDC_CH_PCI_PMON_BOX_CTL_ADDR = KNX_EDC_CH_PCI_PMON_BOX_CTL_ADDR; @@ -4905,19 +5099,22 @@ void ServerPCICFGUncore::freezeCounters() } for (size_t i = 0; i < (size_t)imcHandles.size(); ++i) { - imcHandles[i]->write32(MC_CH_PCI_PMON_BOX_CTL_ADDR, extraIMC + UNC_PMON_UNIT_CTL_FRZ); + imcHandles[i]->write32(MC_CH_PCI_PMON_BOX_CTL_ADDR, extra + UNC_PMON_UNIT_CTL_FRZ); } for (size_t i = 0; i < (size_t)edcHandles.size(); ++i) { edcHandles[i]->write32(EDC_CH_PCI_PMON_BOX_CTL_ADDR, UNC_PMON_UNIT_CTL_FRZ_EN + UNC_PMON_UNIT_CTL_FRZ); } + for (auto & handle: m2mHandles) + { + handle->write32(M2M_PCI_PMON_BOX_CTL_ADDR, extra + UNC_PMON_UNIT_CTL_FRZ); + } } void ServerPCICFGUncore::unfreezeCounters() { const uint32 cpu_model = PCM::getInstance()->getCPUModel(); const uint32 extra = (cpu_model == PCM::SKX)?UNC_PMON_UNIT_CTL_RSV:UNC_PMON_UNIT_CTL_FRZ_EN; - const uint32 extraIMC = (cpu_model == PCM::SKX)?UNC_PMON_UNIT_CTL_RSV:UNC_PMON_UNIT_CTL_FRZ_EN; uint64 MC_CH_PCI_PMON_BOX_CTL_ADDR = 0; uint64 EDC_CH_PCI_PMON_BOX_CTL_ADDR = 0; if (cpu_model == PCM::KNL) { @@ -4933,12 +5130,16 @@ void ServerPCICFGUncore::unfreezeCounters() } for (size_t i = 0; i < (size_t)imcHandles.size(); ++i) { - imcHandles[i]->write32(MC_CH_PCI_PMON_BOX_CTL_ADDR, extraIMC); + imcHandles[i]->write32(MC_CH_PCI_PMON_BOX_CTL_ADDR, extra); } for (size_t i = 0; i < (size_t)edcHandles.size(); ++i) { edcHandles[i]->write32(EDC_CH_PCI_PMON_BOX_CTL_ADDR, UNC_PMON_UNIT_CTL_FRZ_EN); } + for (auto & handle: m2mHandles) + { + handle->write32(M2M_PCI_PMON_BOX_CTL_ADDR, extra); + } } uint64 ServerPCICFGUncore::getQPIClocks(uint32 port) @@ -5101,6 +5302,33 @@ uint64 ServerPCICFGUncore::getEDCCounter(uint32 channel, uint32 counter) } +uint64 ServerPCICFGUncore::getM2MCounter(uint32 box, uint32 counter) +{ + uint64 result = 0; + + if (box < (uint32)m2mHandles.size()) + { + switch (counter) + { + case 0: + m2mHandles[box]->read64(M2M_PCI_PMON_CTR0_ADDR, &result); + break; + case 1: + m2mHandles[box]->read64(M2M_PCI_PMON_CTR1_ADDR, &result); + break; + case 2: + m2mHandles[box]->read64(M2M_PCI_PMON_CTR2_ADDR, &result); + break; + case 3: + m2mHandles[box]->read64(M2M_PCI_PMON_CTR3_ADDR, &result); + break; + } + } +// std::cout << "DEBUG: read "<< result << " from M2M box "<< box <<" counter " << counter << std::endl; + return result; +} + + uint64 ServerPCICFGUncore::getQPILLCounter(uint32 port, uint32 counter) { uint64 result = 0; @@ -5330,7 +5558,6 @@ void ServerPCICFGUncore::reportQPISpeed() const std::cerr << "Max QPI link " << i << " speed: " << qpi_speed[i] / (1e9) << " GBytes/second (" << qpi_speed[i] / (1e9 * m->getBytesPerLinkTransfer()) << " GT/second)" << std::endl; } - uint64 PCM::CX_MSR_PMON_CTRY(uint32 Cbo, uint32 Ctr) const { if(JAKETOWN == cpu_model || IVYTOWN == cpu_model) diff --git a/cpucounters.h b/cpucounters.h index dea468a2..fe9c787a 100644 --- a/cpucounters.h +++ b/cpucounters.h @@ -93,20 +93,24 @@ struct PCM_API TopologyEntry // decribes a core //! Object to access uncore counters in a socket/processor with microarchitecture codename SandyBridge-EP (Jaketown) or Ivytown-EP or Ivytown-EX class ServerPCICFGUncore { - int32 iMCbus,UPIbus; + int32 iMCbus,UPIbus,M2Mbus; uint32 groupnr; int32 cpu_model; std::vector > imcHandles; std::vector > edcHandles; std::vector > qpiLLHandles; + std::vector > m2mHandles; std::vector qpi_speed; uint32 num_imc; + uint32 num_imc_channels1; // number of memory channels in the first memory controller uint32 MCX_CHY_REGISTER_DEV_ADDR[2][4]; uint32 MCX_CHY_REGISTER_FUNC_ADDR[2][4]; uint32 EDCX_ECLK_REGISTER_DEV_ADDR[8]; uint32 EDCX_ECLK_REGISTER_FUNC_ADDR[8]; uint32 QPI_PORTX_REGISTER_DEV_ADDR[3]; uint32 QPI_PORTX_REGISTER_FUNC_ADDR[3]; + uint32 M2M_REGISTER_DEV_ADDR[2]; + uint32 M2M_REGISTER_FUNC_ADDR[2]; uint32 LINK_PCI_PMON_BOX_CTL_ADDR; uint32 LINK_PCI_PMON_CTL_ADDR[4]; uint32 LINK_PCI_PMON_CTR_ADDR[4]; @@ -114,6 +118,7 @@ class ServerPCICFGUncore static PCM_Util::Mutex socket2busMutex; static std::vector > socket2iMCbus; static std::vector > socket2UPIbus; + static std::vector > socket2M2Mbus; void initSocket2Bus(std::vector > & socket2bus, uint32 device, uint32 function, const uint32 DEV_IDS[], uint32 devIdsSize); ServerPCICFGUncore(); // forbidden @@ -122,6 +127,7 @@ class ServerPCICFGUncore PciHandleType * createIntelPerfMonDevice(uint32 groupnr, int32 bus, uint32 dev, uint32 func, bool checkVendor = false); void programIMC(const uint32 * MCCntConfig); void programEDC(const uint32 * EDCCntConfig); + void programM2M(); typedef std::pair > MemTestParam; void initMemTest(MemTestParam & param); void doMemTest(const MemTestParam & param); @@ -137,9 +143,21 @@ class ServerPCICFGUncore void program(); //! \brief Get the number of integrated controller reads (in cache lines) uint64 getImcReads(); + //! \brief Get the number of integrated controller reads for given controller (in cache lines) + //! \param controller controller ID/number + uint64 getImcReadsForController(uint32 controller); + //! \brief Get the number of integrated controller reads for given channels (in cache lines) + //! \param beginChannel first channel in the range + //! \param endChannel last channel + 1: the range is [beginChannel, endChannel). endChannel is not included. + uint64 getImcReadsForChannels(uint32 beginChannel, uint32 endChannel); //! \brief Get the number of integrated controller writes (in cache lines) uint64 getImcWrites(); + //! \brief Get the number of PMM memory reads (in cache lines) + uint64 getPMMReads(); + //! \brief Get the number of PMM memory writes (in cache lines) + uint64 getPMMWrites(); + //! \brief Get the number of cache lines read by EDC (embedded DRAM controller) uint64 getEdcReads(); //! \brief Get the number of cache lines written by EDC (embedded DRAM controller) @@ -162,7 +180,8 @@ class ServerPCICFGUncore //! \brief Program memory counters (disables programming performance counters) //! \param rankA count DIMM rank1 statistics (disables memory channel monitoring) //! \param rankB count DIMM rank2 statistics (disables memory channel monitoring) - void programServerUncoreMemoryMetrics(int rankA = -1, int rankB = -1); + //! \param PMM monitor PMM bandwidth instead of partial writes + void programServerUncoreMemoryMetrics(int rankA = -1, int rankB = -1, bool PMM = false); //! \brief Get number of QPI LL clocks on a QPI port //! \param port QPI port number @@ -196,6 +215,11 @@ class ServerPCICFGUncore //! \param counter counter number uint64 getQPILLCounter(uint32 port, uint32 counter); + //! \brief Direct read of M2M counter + //! \param box box ID/number + //! \param counter counter number + uint64 getM2MCounter(uint32 box, uint32 counter); + //! \brief Freezes event counting void freezeCounters(); //! \brief Unfreezes event counting @@ -225,6 +249,10 @@ class ServerPCICFGUncore //! \brief Returns the total number of detected memory channels on all integrated memory controllers size_t getNumMCChannels() const { return (size_t)imcHandles.size(); } + //! \brief Returns the total number of detected memory channels on given integrated memory controller + //! \param controller controller number + size_t getNumMCChannels(const uint32 controller) const; + //! \brief Returns the total number of detected memory channels on all embedded DRAM controllers (EDC) size_t getNumEDCChannels() const { return (size_t)edcHandles.size(); } }; @@ -600,6 +628,11 @@ class PCM_API PCM void programLLCReadMissLatencyEvents(); uint64 getCBOCounterState(const uint32 socket, const uint32 ctr_); + bool isCLX() const // Cascade Lake-SP + { + return (PCM::SKX == cpu_model) && (cpu_stepping > 4); + } + public: /*! \brief checks if QOS monitoring support present @@ -699,6 +732,7 @@ class PCM_API PCM /*! \brief Programs uncore memory counters on microarchitectures codename SandyBridge-EP and later Xeon uarch \param rankA count DIMM rank1 statistics (disables memory channel monitoring) \param rankB count DIMM rank2 statistics (disables memory channel monitoring) + \param PMM monitor PMM bandwidth instead of partial writes Call this method before you start using the memory counter routines on microarchitecture codename SandyBridge-EP and later Xeon uarch @@ -707,7 +741,7 @@ class PCM_API PCM program PMUs: Intel(r) VTune(tm), Intel(r) Performance Tuning Utility (PTU). This code may make VTune or PTU measurements invalid. VTune or PTU measurement may make measurement with this code invalid. Please enable either usage of these routines or VTune/PTU/etc. */ - ErrorCode programServerUncoreMemoryMetrics(int rankA = -1, int rankB = -1); + ErrorCode programServerUncoreMemoryMetrics(int rankA = -1, int rankB = -1, bool PMM = false); //! \brief Freezes uncore event counting (works only on microarchitecture codename SandyBridge-EP and IvyTown) void freezeServerUncoreCounters(); @@ -987,6 +1021,33 @@ class PCM_API PCM return 0; } + //! \brief Returns the number of detected memory channels on given integrated memory controllers + //! \param socket socket + //! \param controller controller + size_t getMCChannels(uint32 socket, uint32 controller) const + { + switch (cpu_model) + { + case NEHALEM_EP: + case WESTMERE_EP: + case CLARKDALE: + return 3; + case NEHALEM_EX: + case WESTMERE_EX: + return 4; + case JAKETOWN: + case IVYTOWN: + case HASWELLX: + case BDX_DE: + case SKX: + case BDX: + case KNL: + return (socket < server_pcicfg_uncore.size() && server_pcicfg_uncore[socket].get()) ? (server_pcicfg_uncore[socket]->getNumMCChannels(controller)) : 0; + } + return 0; + } + + //! \brief Returns the total number of detected memory channels on all integrated memory controllers per socket size_t getEDCChannelsPerSocket() const { @@ -1166,7 +1227,7 @@ class PCM_API PCM //! \brief Get a string describing the codename of the processor microarchitecture //! \param cpu_model_ cpu model (if no parameter provided the codename of the detected CPU is returned) - const char * getUArchCodename(int32 cpu_model_ = -1) const; + const char * getUArchCodename(const int32 cpu_model_ = -1) const; //! \brief Get Brand string of processor static std::string getCPUBrandString(); @@ -1180,13 +1241,13 @@ class PCM_API PCM return ( cpu_model == PCM::JAKETOWN || cpu_model == PCM::IVYTOWN - || cpu_model == PCM::SANDY_BRIDGE + || cpu_model == PCM::SANDY_BRIDGE || cpu_model == PCM::IVY_BRIDGE || cpu_model == PCM::HASWELL || original_cpu_model == PCM::ATOM_AVOTON || original_cpu_model == PCM::ATOM_CHERRYTRAIL || original_cpu_model == PCM::ATOM_BAYTRAIL - || original_cpu_model == PCM::ATOM_APOLLO_LAKE + || original_cpu_model == PCM::ATOM_APOLLO_LAKE || original_cpu_model == PCM::ATOM_DENVERTON || cpu_model == PCM::HASWELLX || cpu_model == PCM::BROADWELL @@ -1201,7 +1262,7 @@ class PCM_API PCM bool dramEnergyMetricsAvailable() const { - return ( + return ( cpu_model == PCM::JAKETOWN || cpu_model == PCM::IVYTOWN || cpu_model == PCM::HASWELLX @@ -1214,15 +1275,15 @@ class PCM_API PCM bool packageThermalMetricsAvailable() const { - return packageEnergyMetricsAvailable(); + return packageEnergyMetricsAvailable(); } bool outgoingQPITrafficMetricsAvailable() const { return getQPILinksPerSocket() > 0 && ( - cpu_model == PCM::NEHALEM_EX - || cpu_model == PCM::WESTMERE_EX + cpu_model == PCM::NEHALEM_EX + || cpu_model == PCM::WESTMERE_EX || cpu_model == PCM::JAKETOWN || cpu_model == PCM::IVYTOWN || cpu_model == PCM::HASWELLX @@ -1280,6 +1341,13 @@ class PCM_API PCM ); } + bool PMMTrafficMetricsAvailable() const + { + return ( + isCLX() + ); + } + bool LLCReadMissLatencyMetricsAvailable() const { return ( @@ -1334,10 +1402,10 @@ class PCM_API PCM bool useSkylakeEvents() const { - return PCM::SKL == cpu_model - || PCM::SKX == cpu_model - || PCM::KBL == cpu_model - ; + return PCM::SKL == cpu_model + || PCM::KBL == cpu_model + || PCM::SKX == cpu_model + ; } static double getBytesPerFlit(int32 cpu_model_) @@ -1591,24 +1659,24 @@ inline uint64 RDTSC() inline uint64 RDTSCP() { - uint64 result = 0; + uint64 result = 0; #ifdef _MSC_VER - // Windows - #if _MSC_VER>= 1600 - unsigned int Aux; - result = __rdtscp(&Aux); - #endif + // Windows + #if _MSC_VER>= 1600 + unsigned int Aux; + result = __rdtscp(&Aux); + #endif #else - // Linux and OS X - uint32 high = 0, low = 0; - asm volatile ( - "rdtscp\n\t" - "mov %%edx, %0\n\t" - "mov %%eax, %1\n\t": - "=r" (high), "=r" (low) :: "%rax", "%rcx", "%rdx"); - result = low + (uint64(high)<<32ULL); + // Linux and OS X + uint32 high = 0, low = 0; + asm volatile ( + "rdtscp\n\t" + "mov %%edx, %0\n\t" + "mov %%eax, %1\n\t": + "=r" (high), "=r" (low) :: "%rax", "%rcx", "%rdx"); + result = low + (uint64(high)<<32ULL); #endif - return result; + return result; } /*! \brief Returns QPI LL clock ticks @@ -1710,6 +1778,20 @@ uint64 getMCCounter(uint32 channel, uint32 counter, const CounterStateType & bef return after.MCCounter[channel][counter] - before.MCCounter[channel][counter]; } + +/*! \brief Direct read of Memory2Mesh controller PMU counter (counter meaning depends on the programming: power/performance/etc) + \param counter counter number + \param controller controller number + \param before CPU counter state before the experiment + \param after CPU counter state after the experiment +*/ +template +uint64 getM2MCounter(uint32 controller, uint32 counter, const CounterStateType & before, const CounterStateType & after) +{ + return after.M2MCounter[controller][counter] - before.M2MCounter[controller][counter]; +} + + /*! \brief Direct read of embedded DRAM memory controller counter (counter meaning depends on the programming: power/performance/etc) \param counter counter number \param channel channel number @@ -1819,6 +1901,10 @@ class UncoreCounterState template friend uint64 getBytesWrittenToMC(const CounterStateType & before, const CounterStateType & after); template + friend uint64 getBytesReadFromPMM(const CounterStateType & before, const CounterStateType & after); + template + friend uint64 getBytesWrittenToPMM(const CounterStateType & before, const CounterStateType & after); + template friend uint64 getBytesReadFromEDC(const CounterStateType & before, const CounterStateType & after); template friend uint64 getBytesWrittenToEDC(const CounterStateType & before, const CounterStateType & after); @@ -1836,6 +1922,8 @@ class UncoreCounterState protected: uint64 UncMCFullWrites; uint64 UncMCNormalReads; + uint64 UncPMMWrites; + uint64 UncPMMReads; uint64 UncEDCFullWrites; uint64 UncEDCNormalReads; uint64 UncMCIORequests; @@ -1851,6 +1939,8 @@ class UncoreCounterState UncoreCounterState() : UncMCFullWrites(0), UncMCNormalReads(0), + UncPMMWrites(0), + UncPMMReads(0), UncEDCFullWrites(0), UncEDCNormalReads(0), UncMCIORequests(0), @@ -1868,6 +1958,8 @@ class UncoreCounterState { UncMCFullWrites += o.UncMCFullWrites; UncMCNormalReads += o.UncMCNormalReads; + UncPMMReads += o.UncPMMReads; + UncPMMWrites += o.UncPMMWrites; UncEDCFullWrites += o.UncEDCFullWrites; UncEDCNormalReads += o.UncEDCNormalReads; UncMCIORequests += o.UncMCIORequests; @@ -1891,6 +1983,7 @@ class ServerUncorePowerState : public UncoreCounterState uint64 DRAMClocks[8]; uint64 MCDRAMClocks[16]; uint64 MCCounter[8][4]; // channel X counter + uint64 M2MCounter[2][4]; // M2M/iMC boxes x counter uint64 EDCCounter[8][4]; // EDC controller X counter uint64 PCUCounter[4]; int32 PackageThermalHeadroom; @@ -1909,6 +2002,8 @@ class ServerUncorePowerState : public UncoreCounterState template friend uint64 getMCCounter(uint32 channel, uint32 counter, const CounterStateType & before, const CounterStateType & after); template + friend uint64 getM2MCounter(uint32 controller, uint32 counter, const CounterStateType & before, const CounterStateType & after); + template friend uint64 getEDCCounter(uint32 channel, uint32 counter, const CounterStateType & before, const CounterStateType & after); template friend uint64 getPCUCounter(uint32 counter, const CounterStateType & before, const CounterStateType & after); @@ -1935,7 +2030,10 @@ class ServerUncorePowerState : public UncoreCounterState for (int i = 0; i < 8; ++i) { memset(&(MCCounter[i][0]), 0, 4 * sizeof(uint64)); memset(&(EDCCounter[i][0]), 0, 4 * sizeof(uint64)); - } + } + for (int i = 0; i < 2; ++i) { + memset(&(M2MCounter[i][0]), 0, 4 * sizeof(uint64)); + } } }; @@ -2575,6 +2673,30 @@ uint64 getBytesWrittenToMC(const CounterStateType & before, const CounterStateTy return (after.UncMCFullWrites - before.UncMCFullWrites) * 64; } +/*! \brief Computes number of bytes read from PMM memory + + \param before CPU counter state before the experiment + \param after CPU counter state after the experiment + \return Number of bytes +*/ +template +uint64 getBytesReadFromPMM(const CounterStateType & before, const CounterStateType & after) +{ + return (after.UncPMMReads - before.UncPMMReads) * 64; +} + +/*! \brief Computes number of bytes written to PMM memory + + \param before CPU counter state before the experiment + \param after CPU counter state after the experiment + \return Number of bytes +*/ +template +uint64 getBytesWrittenToPMM(const CounterStateType & before, const CounterStateType & after) +{ + return (after.UncPMMWrites - before.UncPMMWrites) * 64; +} + /*! \brief Computes number of bytes read from MCDRAM memory controllers \param before CPU counter state before the experiment @@ -2855,7 +2977,11 @@ inline uint64 getAllIncomingQPILinkBytes(const SystemCounterState & now) inline double getQPItoMCTrafficRatio(const SystemCounterState & before, const SystemCounterState & after) { const uint64 totalQPI = getAllIncomingQPILinkBytes(before, after); - const uint64 memTraffic = getBytesReadFromMC(before, after) + getBytesWrittenToMC(before, after); + uint64 memTraffic = getBytesReadFromMC(before, after) + getBytesWrittenToMC(before, after); + if (PCM::getInstance()->PMMTrafficMetricsAvailable()) + { + memTraffic += getBytesReadFromPMM(before, after) + getBytesWrittenToPMM(before, after); + } return double(totalQPI) / double(memTraffic); } diff --git a/pcm-core.cpp b/pcm-core.cpp index 0f9eac82..bd0c6871 100644 --- a/pcm-core.cpp +++ b/pcm-core.cpp @@ -295,6 +295,7 @@ int main(int argc, char * argv[]) bool show_partial_core_output = false; std::bitset ycores; + PCM * m = PCM::getInstance(); conf.fixedCfg = NULL; // default diff --git a/pcm-memory.cpp b/pcm-memory.cpp index d6564655..cf74b756 100644 --- a/pcm-memory.cpp +++ b/pcm-memory.cpp @@ -16,7 +16,7 @@ /*! \file pcm-memory.cpp - \brief Example of using CPU counters: implements a performance counter monitoring utility for memory controller channels and DIMMs (ranks) + \brief Example of using CPU counters: implements a performance counter monitoring utility for memory controller channels and DIMMs (ranks) + PMM memory traffic */ #define HACK_TO_REMOVE_DUPLICATE_ERROR #include @@ -46,6 +46,9 @@ #define READ_RANK_B 2 #define WRITE_RANK_B 3 #define PARTIAL 2 +#define PMM_READ 2 +#define PMM_WRITE 3 +#define NM_HIT 0 // NM : Near Memory (DRAM cache) in Memory Mode #define PCM_DELAY_DEFAULT 1.0 // in seconds #define PCM_DELAY_MIN 0.015 // 15 milliseconds is practical on most modern CPUs #define PCM_CALIBRATION_INTERVAL 50 // calibrate clock only every 50th iteration @@ -57,17 +60,24 @@ using namespace std; const uint32 max_sockets = 256; const uint32 max_imc_channels = 8; const uint32 max_edc_channels = 8; +const uint32 max_imc_controllers = 2; typedef struct memdata { float iMC_Rd_socket_chan[max_sockets][max_imc_channels]; float iMC_Wr_socket_chan[max_sockets][max_imc_channels]; + float iMC_PMM_Rd_socket_chan[max_sockets][max_imc_channels]; + float iMC_PMM_Wr_socket_chan[max_sockets][max_imc_channels]; float iMC_Rd_socket[max_sockets]; float iMC_Wr_socket[max_sockets]; + float iMC_PMM_Rd_socket[max_sockets]; + float iMC_PMM_Wr_socket[max_sockets]; + float M2M_NM_read_hit_rate[max_sockets][max_imc_controllers]; float EDC_Rd_socket_chan[max_sockets][max_edc_channels]; float EDC_Wr_socket_chan[max_sockets][max_edc_channels]; float EDC_Rd_socket[max_sockets]; float EDC_Wr_socket[max_sockets]; uint64 partial_write[max_sockets]; + bool PMM; } memdata_t; void print_help(const string prog_name) @@ -80,6 +90,8 @@ void print_help(const string prog_name) cerr << " Supported are: " << endl; cerr << " -h | --help | /h => print this help and exit" << endl; cerr << " -rank=X | /rank=X => monitor DIMM rank X. At most 2 out of 8 total ranks can be monitored simultaneously." << endl; + cerr << " -pmm => monitor PMM memory bandwidth (instead of partial writes)." << endl; + cerr << " -nc | --nochannel | /nc => suppress output for individual channels." << endl; cerr << " -csv[=file.csv] | /csv[=file.csv] => output compact CSV format to screen or" << endl << " to a file, in case filename is provided" << endl; cerr << " -columns=X | /columns=X => Number of columns to display the NUMA Nodes, defaults to 2." << endl; @@ -93,7 +105,7 @@ void print_help(const string prog_name) cerr << endl; } -void printSocketBWHeader(uint32 no_columns, uint32 skt) +void printSocketBWHeader(uint32 no_columns, uint32 skt, const bool show_channel_output) { for (uint32 i=skt; i<(no_columns+skt); ++i) { cout << "|---------------------------------------|"; @@ -107,14 +119,16 @@ void printSocketBWHeader(uint32 no_columns, uint32 skt) cout << "|---------------------------------------|"; } cout << endl; - for (uint32 i=skt; i<(no_columns+skt); ++i) { - cout << "|-- Memory Channel Monitoring --|"; - } - cout << endl; - for (uint32 i=skt; i<(no_columns+skt); ++i) { - cout << "|---------------------------------------|"; + if (show_channel_output) { + for (uint32 i=skt; i<(no_columns+skt); ++i) { + cout << "|-- Memory Channel Monitoring --|"; + } + cout << endl; + for (uint32 i=skt; i<(no_columns+skt); ++i) { + cout << "|---------------------------------------|"; + } + cout << endl; } - cout << endl; } void printSocketRankBWHeader(uint32 no_columns, uint32 skt) @@ -161,6 +175,17 @@ void printSocketChannelBW(PCM *m, memdata_t *md, uint32 no_columns, uint32 skt) cout << "|-- Writes(MB/s): "<iMC_Wr_socket_chan[i][channel]<<" --|"; } cout << endl; + if(md->PMM) + { + for (uint32 i=skt; i<(skt+no_columns); ++i) { + cout << "|-- PMM Reads(MB/s) : "<iMC_PMM_Rd_socket_chan[i][channel]<<" --|"; + } + cout << endl; + for (uint32 i=skt; i<(skt+no_columns); ++i) { + cout << "|-- PMM Writes(MB/s): "<iMC_PMM_Wr_socket_chan[i][channel]<<" --|"; + } + cout << endl; + } } } @@ -190,22 +215,44 @@ void printSocketChannelBW(uint32 no_columns, uint32 skt, uint32 num_imc_channels } } -void printSocketBWFooter(uint32 no_columns, uint32 skt, float* iMC_Rd_socket, float* iMC_Wr_socket, uint64* partial_write) +void printSocketBWFooter(uint32 no_columns, uint32 skt, const memdata_t *md) { for (uint32 i=skt; i<(skt+no_columns); ++i) { - cout << "|-- NODE"<iMC_Rd_socket[i]<<" --|"; } cout << endl; for (uint32 i=skt; i<(skt+no_columns); ++i) { - cout << "|-- NODE"<iMC_Wr_socket[i]<<" --|"; } cout << endl; - for (uint32 i=skt; i<(skt+no_columns); ++i) { - cout << "|-- NODE"<PMM) + { + for (uint32 i=skt; i<(skt+no_columns); ++i) { + cout << "|-- NODE"<iMC_PMM_Rd_socket[i]<<" --|"; + } + cout << endl; + for (uint32 i=skt; i<(skt+no_columns); ++i) { + cout << "|-- NODE"<iMC_PMM_Wr_socket[i]<<" --|"; + } + cout << endl; + for (uint32 ctrl = 0; ctrl < max_imc_controllers; ++ctrl) + { + for (uint32 i=skt; i<(skt+no_columns); ++i) { + cout << "|-- NODE"<M2M_NM_read_hit_rate[i][ctrl]<<" --|"; + } + cout << endl; + } + } + else + { + for (uint32 i=skt; i<(skt+no_columns); ++i) { + cout << "|-- NODE"<partial_write[i]<<" --|"; + } + cout << endl; } - cout << endl; for (uint32 i=skt; i<(skt+no_columns); ++i) { - cout << "|-- NODE"<iMC_Rd_socket[i]+md->iMC_Wr_socket[i]+ + md->iMC_PMM_Rd_socket[i]+md->iMC_PMM_Wr_socket[i])<<" --|"; } cout << endl; for (uint32 i=skt; i<(no_columns+skt); ++i) { @@ -214,7 +261,7 @@ void printSocketBWFooter(uint32 no_columns, uint32 skt, float* iMC_Rd_socket, fl cout << endl; } -void display_bandwidth(PCM *m, memdata_t *md, uint32 no_columns) +void display_bandwidth(PCM *m, memdata_t *md, uint32 no_columns, const bool show_channel_output) { float sysRead = 0.0, sysWrite = 0.0; uint32 numSockets = m->getNumSockets(); @@ -227,12 +274,15 @@ void display_bandwidth(PCM *m, memdata_t *md, uint32 no_columns) // Full row if ( (skt+no_columns) <= numSockets ) { - printSocketBWHeader (no_columns, skt); - printSocketChannelBW(m, md, no_columns, skt); - printSocketBWFooter (no_columns, skt, md->iMC_Rd_socket, md->iMC_Wr_socket, md->partial_write); + printSocketBWHeader (no_columns, skt, show_channel_output); + if (show_channel_output) + printSocketChannelBW(m, md, no_columns, skt); + printSocketBWFooter (no_columns, skt, md); for (uint32 i=skt; i<(skt+no_columns); i++) { sysRead += md->iMC_Rd_socket[i]; sysWrite += md->iMC_Wr_socket[i]; + sysRead += md->iMC_PMM_Rd_socket[i]; + sysWrite += md->iMC_PMM_Wr_socket[i]; } skt += no_columns; } @@ -248,9 +298,10 @@ void display_bandwidth(PCM *m, memdata_t *md, uint32 no_columns) \r|---------------------------------------||---------------------------------------|\n\ \r"; uint32 max_channels = max_imc_channels <= max_edc_channels ? max_edc_channels : max_imc_channels; - float iMC_Rd, iMC_Wr, EDC_Rd, EDC_Wr; - for(uint64 channel = 0; channel < max_channels; ++channel) - { + if (show_channel_output) { + float iMC_Rd, iMC_Wr, EDC_Rd, EDC_Wr; + for(uint64 channel = 0; channel < max_channels; ++channel) + { if (channel <= max_imc_channels) { iMC_Rd = md->iMC_Rd_socket_chan[skt][channel]; iMC_Wr = md->iMC_Wr_socket_chan[skt][channel]; @@ -291,6 +342,7 @@ void display_bandwidth(PCM *m, memdata_t *md, uint32 no_columns) <<" --|\n"; else continue; + } } cout << "\ \r|-- DDR4 Mem Read (MB/s):"<iMC_Rd_socket[skt]<<" --||-- MCDRAM Read (MB/s):"<EDC_Rd_socket[skt]<<" --|\n\ @@ -308,28 +360,54 @@ void display_bandwidth(PCM *m, memdata_t *md, uint32 no_columns) cout << "\ \r|---------------------------------------|\n\ \r|-- Socket "<iMC_Rd_socket_chan[skt][channel] < 0.0 && md->iMC_Wr_socket_chan[skt][channel] < 0.0) //If the channel read neg. value, the channel is not working; skip it. continue; cout << "|-- Mem Ch " << channel <<": Reads (MB/s):" << setw(8) << md->iMC_Rd_socket_chan[skt][channel] <<" --|\n|-- Writes(MB/s):" << setw(8) << md->iMC_Wr_socket_chan[skt][channel] <<" --|\n"; - } + if (md->PMM) + { + cout << "|-- PMM Reads (MB/s):" << setw(8) << md->iMC_PMM_Rd_socket_chan[skt][channel] << " --|\n"; + cout << "|-- PMM Writes(MB/s):" << setw(8) << md->iMC_PMM_Wr_socket_chan[skt][channel] << " --|\n"; + } + } + } cout << "\ - \r|-- NODE"<iMC_Rd_socket[skt]<<" --|\n\ - \r|-- NODE"<iMC_Wr_socket[skt]<<" --|\n\ - \r|-- NODE"<partial_write[skt]<<" --|\n\ - \r|-- NODE"<iMC_Rd_socket[skt]+md->iMC_Wr_socket[skt]<<" --|\n\ + \r|-- NODE"<iMC_Rd_socket[skt]<<" --|\n\ + \r|-- NODE"<iMC_Wr_socket[skt]<<" --|\n"; + if(md->PMM) + { + cout << "\ + \r|-- NODE"<iMC_PMM_Rd_socket[skt]<<" --|\n\ + \r|-- NODE"<iMC_PMM_Wr_socket[skt]<<" --|\n"; + for (uint32 ctrl = 0; ctrl < max_imc_controllers; ++ctrl) + { + cout << "\r|-- NODE"<M2M_NM_read_hit_rate[skt][ctrl]<<" --|\n"; + } + } + else + { + cout << + "\r|-- NODE"<partial_write[skt]<<" --|\n"; + } + cout << + "\r|-- NODE"<iMC_Rd_socket[skt]+md->iMC_Wr_socket[skt]+ + md->iMC_PMM_Rd_socket[skt]+md->iMC_PMM_Wr_socket[skt]<<" --|\n\ \r|---------------------------------------|\n\ \r"; sysRead += md->iMC_Rd_socket[skt]; sysWrite += md->iMC_Wr_socket[skt]; + sysRead += md->iMC_PMM_Rd_socket[skt]; + sysWrite += md->iMC_PMM_Wr_socket[skt]; skt += 1; } } @@ -342,31 +420,42 @@ void display_bandwidth(PCM *m, memdata_t *md, uint32 no_columns) } } -void display_bandwidth_csv_header(PCM *m, memdata_t *md) +void display_bandwidth_csv_header(PCM *m, memdata_t *md, const bool show_channel_output) { uint32 numSockets = m->getNumSockets(); cout << ";;" ; // Time for (uint32 skt=0; skt < numSockets; ++skt) { - for(uint64 channel = 0; channel < max_imc_channels; ++channel) - { - if(md->iMC_Rd_socket_chan[skt][channel] < 0.0 && md->iMC_Wr_socket_chan[skt][channel] < 0.0) //If the channel read neg. value, the channel is not working; skip it. - continue; - cout << "SKT" << skt << ";SKT" << skt << ';'; + if (show_channel_output) { + for(uint64 channel = 0; channel < max_imc_channels; ++channel) + { + if(md->iMC_Rd_socket_chan[skt][channel] < 0.0 && md->iMC_Wr_socket_chan[skt][channel] < 0.0) //If the channel read neg. value, the channel is not working; skip it. + continue; + cout << "SKT" << skt << ";SKT" << skt << ';'; + if (md->PMM) + { + cout << "SKT" << skt << ";SKT" << skt << ';'; + } + } } cout << "SKT"<getCPUModel() != PCM::KNL) - cout << "SKT"<PMM) + cout << "SKT"<MCDRAMmemoryTrafficMetricsAvailable()) { - for(uint64 channel = 0; channel < max_edc_channels; ++channel) - { - if(md->EDC_Rd_socket_chan[skt][channel] < 0.0 && md->EDC_Wr_socket_chan[skt][channel] < 0.0) //If the channel read neg. value, the channel is not working; skip it. - continue; - cout << "SKT" << skt << ";SKT" << skt << ';'; + if (show_channel_output) { + for(uint64 channel = 0; channel < max_edc_channels; ++channel) + { + if(md->EDC_Rd_socket_chan[skt][channel] < 0.0 && md->EDC_Wr_socket_chan[skt][channel] < 0.0) //If the channel read neg. value, the channel is not working; skip it. + continue; + cout << "SKT" << skt << ";SKT" << skt << ';'; + } } cout << "SKT"<iMC_Rd_socket_chan[skt][channel] < 0.0 && md->iMC_Wr_socket_chan[skt][channel] < 0.0) //If the channel read neg. value, the channel is not working; skip it. - continue; - cout << "Ch" <iMC_Rd_socket_chan[skt][channel] < 0.0 && md->iMC_Wr_socket_chan[skt][channel] < 0.0) //If the channel read neg. value, the channel is not working; skip it. + continue; + cout << "Ch" <PMM) + { + cout << "Ch" <getCPUModel() == PCM::KNL) cout << "DDR4 Read (MB/s); DDR4 Write (MB/s); DDR4 Memory (MB/s);"; else - cout << "Mem Read (MB/s);Mem Write (MB/s); P. Write (T/s); Memory (MB/s);"; + { + if(md->PMM) + cout << "Mem Read (MB/s);Mem Write (MB/s); PMM_Read; PMM_Write; Memory (MB/s);"; + else + cout << "Mem Read (MB/s);Mem Write (MB/s); P. Write (T/s); Memory (MB/s);"; + } if (m->MCDRAMmemoryTrafficMetricsAvailable()) { - for(uint64 channel = 0; channel < max_edc_channels; ++channel) - { - if(md->EDC_Rd_socket_chan[skt][channel] < 0.0 && md->EDC_Wr_socket_chan[skt][channel] < 0.0) //If the channel read neg. value, the channel is not working; skip it. - continue; - cout << "EDC_Ch" <EDC_Rd_socket_chan[skt][channel] < 0.0 && md->EDC_Wr_socket_chan[skt][channel] < 0.0) //If the channel read neg. value, the channel is not working; skip it. + continue; + cout << "EDC_Ch" <getNumSockets(); tm tt = pcm_localtime(); @@ -422,31 +525,52 @@ void display_bandwidth_csv(PCM *m, memdata_t *md, uint64 elapsedTime) for (uint32 skt=0; skt < numSockets; ++skt) { - for(uint64 channel = 0; channel < max_imc_channels; ++channel) - { - if(md->iMC_Rd_socket_chan[skt][channel] < 0.0 && md->iMC_Wr_socket_chan[skt][channel] < 0.0) //If the channel read neg. value, the channel is not working; skip it. - continue; - cout <iMC_Rd_socket_chan[skt][channel] << ';' - <iMC_Wr_socket_chan[skt][channel] << ';'; + if (show_channel_output) { + for(uint64 channel = 0; channel < max_imc_channels; ++channel) + { + if(md->iMC_Rd_socket_chan[skt][channel] < 0.0 && md->iMC_Wr_socket_chan[skt][channel] < 0.0) //If the channel read neg. value, the channel is not working; skip it. + continue; + cout <iMC_Rd_socket_chan[skt][channel] << ';' + <iMC_Wr_socket_chan[skt][channel] << ';'; + if(md->PMM) + { + cout <iMC_PMM_Rd_socket_chan[skt][channel] << ';' + <iMC_PMM_Wr_socket_chan[skt][channel] << ';'; + } + } } cout <iMC_Rd_socket[skt] <<';' <iMC_Wr_socket[skt] <<';'; + if(md->PMM) + { + cout <iMC_PMM_Rd_socket[skt] <<';' + <iMC_PMM_Wr_socket[skt] <<';'; + } if (m->getCPUModel() != PCM::KNL) - cout <partial_write[skt] <<';'; + { + if (!md->PMM) + { + cout <partial_write[skt] <<';'; + } + } cout << setw(8) << md->iMC_Rd_socket[skt]+md->iMC_Wr_socket[skt] <<';'; sysRead += md->iMC_Rd_socket[skt]; sysWrite += md->iMC_Wr_socket[skt]; + sysRead += md->iMC_PMM_Rd_socket[skt]; + sysWrite += md->iMC_PMM_Wr_socket[skt]; if (m->MCDRAMmemoryTrafficMetricsAvailable()) { - for(uint64 channel = 0; channel < max_edc_channels; ++channel) - { - if(md->EDC_Rd_socket_chan[skt][channel] < 0.0 && md->EDC_Wr_socket_chan[skt][channel] < 0.0) //If the channel read neg. value, the channel is not working; skip it. - continue; - cout <EDC_Rd_socket_chan[skt][channel] << ';' - <EDC_Wr_socket_chan[skt][channel] << ';'; + if (show_channel_output) { + for(uint64 channel = 0; channel < max_edc_channels; ++channel) + { + if(md->EDC_Rd_socket_chan[skt][channel] < 0.0 && md->EDC_Wr_socket_chan[skt][channel] < 0.0) //If the channel read neg. value, the channel is not working; skip it. + continue; + cout <EDC_Rd_socket_chan[skt][channel] << ';' + <EDC_Wr_socket_chan[skt][channel] << ';'; - } + } + } cout <EDC_Rd_socket[skt] <<';' <EDC_Wr_socket[skt] <<';' <EDC_Rd_socket[skt]+md->EDC_Wr_socket[skt] <<';'; @@ -461,19 +585,27 @@ void display_bandwidth_csv(PCM *m, memdata_t *md, uint64 elapsedTime) <getMCChannelsPerSocket(); //const uint32 num_edc_channels = m->getEDCChannelsPerSocket(); memdata_t md; + md.PMM = PMM; for(uint32 skt = 0; skt < m->getNumSockets(); ++skt) { md.iMC_Rd_socket[skt] = 0.0; md.iMC_Wr_socket[skt] = 0.0; + md.iMC_PMM_Rd_socket[skt] = 0.0; + md.iMC_PMM_Wr_socket[skt] = 0.0; md.EDC_Rd_socket[skt] = 0.0; md.EDC_Wr_socket[skt] = 0.0; md.partial_write[skt] = 0; + for(uint32 i=0; i < max_imc_controllers; ++i) + { + md.M2M_NM_read_hit_rate[skt][i] = 0.; + } + const uint32 numChannels1 = m->getMCChannels(skt, 0); // number of channels in the first controller switch(m->getCPUModel()) { case PCM::KNL: @@ -497,9 +629,12 @@ void calculate_bandwidth(PCM *m, const ServerUncorePowerState uncState1[], const { if(getMCCounter(channel,READ,uncState1[skt],uncState2[skt]) == 0.0 && getMCCounter(channel,WRITE,uncState1[skt],uncState2[skt]) == 0.0) //In case of JKT-EN, there are only three channels. Skip one and continue. { - md.iMC_Rd_socket_chan[skt][channel] = -1.0; - md.iMC_Wr_socket_chan[skt][channel] = -1.0; - continue; + if (!PMM || (getMCCounter(channel,PMM_READ,uncState1[skt],uncState2[skt]) == 0.0 && getMCCounter(channel,PMM_WRITE,uncState1[skt],uncState2[skt]) == 0.0)) + { + md.iMC_Rd_socket_chan[skt][channel] = -1.0; + md.iMC_Wr_socket_chan[skt][channel] = -1.0; + continue; + } } md.iMC_Rd_socket_chan[skt][channel] = (float) (getMCCounter(channel,READ,uncState1[skt],uncState2[skt]) * 64 / 1000000.0 / (elapsedTime/1000.0)); @@ -508,19 +643,42 @@ void calculate_bandwidth(PCM *m, const ServerUncorePowerState uncState1[], const md.iMC_Rd_socket[skt] += md.iMC_Rd_socket_chan[skt][channel]; md.iMC_Wr_socket[skt] += md.iMC_Wr_socket_chan[skt][channel]; - md.partial_write[skt] += (uint64) (getMCCounter(channel,PARTIAL,uncState1[skt],uncState2[skt]) / (elapsedTime/1000.0)); + if(PMM) + { + md.iMC_PMM_Rd_socket_chan[skt][channel] = (float) (getMCCounter(channel,PMM_READ,uncState1[skt],uncState2[skt]) * 64 / 1000000.0 / (elapsedTime/1000.0)); + md.iMC_PMM_Wr_socket_chan[skt][channel] = (float) (getMCCounter(channel,PMM_WRITE,uncState1[skt],uncState2[skt]) * 64 / 1000000.0 / (elapsedTime/1000.0)); + + md.iMC_PMM_Rd_socket[skt] += md.iMC_PMM_Rd_socket_chan[skt][channel]; + md.iMC_PMM_Wr_socket[skt] += md.iMC_PMM_Wr_socket_chan[skt][channel]; + + md.M2M_NM_read_hit_rate[skt][(channel < numChannels1)?0:1] += (float)getMCCounter(channel,READ,uncState1[skt],uncState2[skt]); + } + else + { + md.partial_write[skt] += (uint64) (getMCCounter(channel,PARTIAL,uncState1[skt],uncState2[skt]) / (elapsedTime/1000.0)); + } } } + if (PMM) + { + for(uint32 c = 0; c < max_imc_controllers; ++c) + { + if(md.M2M_NM_read_hit_rate[skt][c] != 0.0) + { + md.M2M_NM_read_hit_rate[skt][c] = ((float)getM2MCounter(c, NM_HIT, uncState1[skt],uncState2[skt]))/ md.M2M_NM_read_hit_rate[skt][c]; + } + } + } } if (csv) { if (csvheader) { - display_bandwidth_csv_header(m, &md); + display_bandwidth_csv_header(m, &md, show_channel_output); csvheader = false; } - display_bandwidth_csv(m, &md, elapsedTime); + display_bandwidth_csv(m, &md, elapsedTime, show_channel_output); } else { - display_bandwidth(m, &md, no_columns); + display_bandwidth(m, &md, no_columns, show_channel_output); } } @@ -611,7 +769,7 @@ int main(int argc, char * argv[]) cerr << endl; double delay = -1.0; - bool csv = false, csvheader=false; + bool csv = false, csvheader=false, show_channel_output=true; uint32 no_columns = DEFAULT_DISPLAY_COLUMNS; // Default number of columns is 2 char * sysCmd = NULL; char ** sysArgv = NULL; @@ -620,6 +778,7 @@ int main(int argc, char * argv[]) int calibrated = PCM_CALIBRATION_INTERVAL - 2; // keeps track is the clock calibration needed #endif int rankA = -1, rankB = -1; + bool PMM = false; unsigned int numberOfIterations = 0; // number of iterations string program = string(argv[0]); @@ -706,6 +865,20 @@ int main(int argc, char * argv[]) } continue; } + if (strncmp(*argv, "--nochannel", 11) == 0 || + strncmp(*argv, "-nc", 3) == 0 || + strncmp(*argv, "/nc", 3) == 0) + { + show_channel_output = false; + continue; + } + + if (strncmp(*argv, "-pmm", 6) == 0 || + strncmp(*argv, "/pmm", 6) == 0) + { + PMM = true; + continue; + } #ifdef _MSC_VER else if (strncmp(*argv, "--uninstallDriver", 17) == 0) @@ -763,7 +936,22 @@ int main(int argc, char * argv[]) cerr << "For processor-level memory bandwidth statistics please use pcm.x" << endl; exit(EXIT_FAILURE); } - PCM::ErrorCode status = m->programServerUncoreMemoryMetrics(rankA, rankB); + if(PMM && (m->PMMTrafficMetricsAvailable() == false)) + { + cerr << "PMM traffic metrics are not available on your processor." << endl; + exit(EXIT_FAILURE); + } + if((rankA >= 0 || rankB >= 0) && PMM) + { + cerr << "PMM traffic metrics are not available on rank level" << endl; + exit(EXIT_FAILURE); + } + if((rankA >= 0 || rankB >= 0) && !show_channel_output) + { + cerr << "Rank level output requires channel output" << endl; + exit(EXIT_FAILURE); + } + PCM::ErrorCode status = m->programServerUncoreMemoryMetrics(rankA, rankB, PMM); switch (status) { case PCM::Success: @@ -868,7 +1056,7 @@ int main(int argc, char * argv[]) if(rankA >= 0 || rankB >= 0) calculate_bandwidth(m,BeforeState,AfterState,AfterTime-BeforeTime,csv,csvheader, no_columns, rankA, rankB); else - calculate_bandwidth(m,BeforeState,AfterState,AfterTime-BeforeTime,csv,csvheader, no_columns); + calculate_bandwidth(m,BeforeState,AfterState,AfterTime-BeforeTime,csv,csvheader, no_columns, PMM, show_channel_output); swap(BeforeTime, AfterTime); swap(BeforeState, AfterState); diff --git a/pcm-sensor.cpp b/pcm-sensor.cpp index 31de34a4..cca79bd4 100644 --- a/pcm-sensor.cpp +++ b/pcm-sensor.cpp @@ -70,6 +70,8 @@ int main() for (uint32 a = 0; a < counters.getNumSockets(); ++a) { cout << "Socket" << a << "/BytesReadFromMC\tfloat" << endl; cout << "Socket" << a << "/BytesWrittenToMC\tfloat" << endl; + cout << "Socket" << a << "/BytesReadFromPMM\tfloat" << endl; + cout << "Socket" << a << "/BytesWrittenToPMM\tfloat" << endl; cout << "Socket" << a << "/Frequency\tfloat" << endl; cout << "Socket" << a << "/IPC\tfloat" << endl; cout << "Socket" << a << "/L2CacheHitRatio\tfloat" << endl; @@ -258,6 +260,13 @@ int main() cout << "read from MC Socket" << i << "\t0\t\tGB" << endl; } } + for (uint32 i = 0; i < counters.getNumSockets(); ++i) { + stringstream c; + c << "Socket" << i << "/BytesReadFromPMM?"; + if (s == c.str()) { + cout << "read from PMM memory on Socket" << i << "\t0\t\tGB" << endl; + } + } for (uint32 i = 0; i < counters.getNumSockets(); ++i) { stringstream c; c << "Socket" << i << "/DRAMEnergy?"; @@ -337,9 +346,9 @@ int main() } for (uint32 i = 0; i < counters.getNumSockets(); ++i) { stringstream c; - c << "Socket" << i << "/BytesWrittenToMC?"; + c << "Socket" << i << "/BytesWrittenToPMM?"; if (s == c.str()) { - cout << "written to MC Socket" << i << "\t0\t\tGB" << endl; + cout << "written to PMM memory on Socket" << i << "\t0\t\tGB" << endl; //cout << "CPU" << i << "\tBytes written to memory channel\t0\t1\t GB" << endl; } } @@ -613,6 +622,8 @@ int main() OUTPUT_SOCKET_METRIC("/ThermalHeadroom", (counters.getSocket(i))) OUTPUT_SOCKET_METRIC("/BytesReadFromMC", (double(counters.getSocket(i)) / 1024 / 1024 / 1024)) OUTPUT_SOCKET_METRIC("/BytesWrittenToMC", (double(counters.getSocket(i)) / 1024 / 1024 / 1024)) + OUTPUT_SOCKET_METRIC("/BytesReadFromPMM", (double(counters.getSocket(i)) / 1024 / 1024 / 1024)) + OUTPUT_SOCKET_METRIC("/BytesWrittenToPMM", (double(counters.getSocket(i)) / 1024 / 1024 / 1024)) OUTPUT_SOCKET_METRIC("/Frequency", (counters.getSocket(i) / 1000000)) OUTPUT_SOCKET_METRIC("/IPC", (counters.getSocket(i))) OUTPUT_SOCKET_METRIC("/L2CacheHitRatio", (counters.getSocket(i))) diff --git a/pcm.cpp b/pcm.cpp index b3092a68..855143cb 100644 --- a/pcm.cpp +++ b/pcm.cpp @@ -179,6 +179,8 @@ void print_output(PCM * m, if (m->memoryTrafficMetricsAvailable()) cout << " READ : bytes read from main memory controller (in GBytes)" << "\n"; if (m->memoryTrafficMetricsAvailable()) cout << " WRITE : bytes written to main memory controller (in GBytes)" << "\n"; if (m->LLCReadMissLatencyMetricsAvailable()) cout << "LLCRDMISSLAT: average latency of last level cache miss for reads and prefetches (in ns)"; + if (m->PMMTrafficMetricsAvailable()) cout << " PMM RD : bytes read from PMM memory (in GBytes)" << "\n"; + if (m->PMMTrafficMetricsAvailable()) cout << " PMM WR : bytes written to PMM memory (in GBytes)" << "\n"; if (m->MCDRAMmemoryTrafficMetricsAvailable()) cout << " MCDRAM READ : bytes read from MCDRAM controller (in GBytes)" << "\n"; if (m->MCDRAMmemoryTrafficMetricsAvailable()) cout << " MCDRAM WRITE : bytes written to MCDRAM controller (in GBytes)" << "\n"; if (m->memoryIOTrafficMetricAvailable()) cout << " IO : bytes read/written due to IO requests to memory controller (in GBytes); this may be an over estimate due to same-cache-line partial requests" << "\n"; @@ -345,7 +347,6 @@ void print_output(PCM * m, { cout << "\n" << "Intel(r) "<< m->xPI() <<" traffic estimation in bytes (data and non-data traffic outgoing from CPU/socket through "<< m->xPI() <<" links):" << "\n" << "\n"; - const uint32 qpiLinks = (uint32)m->getQPILinksPerSocket(); cout << " "; @@ -381,6 +382,8 @@ void print_output(PCM * m, cout << "MEM (GB)->|"; if (m->memoryTrafficMetricsAvailable()) cout << " READ | WRITE |"; + if (m->PMMTrafficMetricsAvailable()) + cout << " PMM RD | PMM WR |"; if (m->MCDRAMmemoryTrafficMetricsAvailable()) cout << " MCDRAM READ | MCDRAM WRITE |"; if (m->memoryIOTrafficMetricAvailable()) @@ -399,6 +402,9 @@ void print_output(PCM * m, if (m->memoryTrafficMetricsAvailable()) cout << " " << setw(5) << getBytesReadFromMC(sktstate1[i], sktstate2[i]) / double(1e9) << " " << setw(5) << getBytesWrittenToMC(sktstate1[i], sktstate2[i]) / double(1e9); + if (m->PMMTrafficMetricsAvailable()) + cout << " " << setw(5) << getBytesReadFromPMM(sktstate1[i], sktstate2[i]) / double(1e9) << + " " << setw(5) << getBytesWrittenToPMM(sktstate1[i], sktstate2[i]) / double(1e9); if (m->MCDRAMmemoryTrafficMetricsAvailable()) cout << " " << setw(11) << getBytesReadFromEDC(sktstate1[i], sktstate2[i]) / double(1e9) << " " << setw(11) << getBytesWrittenToEDC(sktstate1[i], sktstate2[i]) / double(1e9); @@ -424,6 +430,9 @@ void print_output(PCM * m, if (m->memoryTrafficMetricsAvailable()) cout << " " << setw(5) << getBytesReadFromMC(sstate1, sstate2) / double(1e9) << " " << setw(5) << getBytesWrittenToMC(sstate1, sstate2) / double(1e9); + if (m->PMMTrafficMetricsAvailable()) + cout << " " << setw(5) << getBytesReadFromPMM(sstate1, sstate2) / double(1e9) << + " " << setw(5) << getBytesWrittenToPMM(sstate1, sstate2) / double(1e9); if (m->memoryIOTrafficMetricAvailable()) cout << " " << setw(5) << getIORequestBytesFromMC(sstate1, sstate2) / double(1e9); cout << " "; @@ -499,6 +508,9 @@ void print_csv_header(PCM * m, if (m->memoryTrafficMetricsAvailable()) cout << ";;"; + if (m->PMMTrafficMetricsAvailable()) + cout << ";;"; + if (m->MCDRAMmemoryTrafficMetricsAvailable()) cout << ";;"; @@ -540,6 +552,8 @@ void print_csv_header(PCM * m, cout << ";"; if (m->memoryTrafficMetricsAvailable()) cout << ";;"; + if (m->PMMTrafficMetricsAvailable()) + cout << ";;"; if (m->MCDRAMmemoryTrafficMetricsAvailable()) cout << ";;"; } @@ -639,6 +653,9 @@ void print_csv_header(PCM * m, if (m->memoryTrafficMetricsAvailable()) cout << "READ;WRITE;"; + if (m->PMMTrafficMetricsAvailable()) + cout << "PMM_RD;PMM_WR;"; + if (m->MCDRAMmemoryTrafficMetricsAvailable()) cout << "MCDRAM_READ;MCDRAM_WRITE;"; @@ -680,6 +697,8 @@ void print_csv_header(PCM * m, cout << "RMB;"; if (m->memoryTrafficMetricsAvailable()) cout << "READ;WRITE;"; + if (m->PMMTrafficMetricsAvailable()) + cout << "PMM_RD;PMM_WR;"; if (m->MCDRAMmemoryTrafficMetricsAvailable()) cout << "MCDRAM_READ;MCDRAM_WRITE;"; cout << "TEMP;"; @@ -835,6 +854,10 @@ void print_csv(PCM * m, cout << getBytesReadFromMC(sstate1, sstate2) / double(1e9) << ';' << getBytesWrittenToMC(sstate1, sstate2) / double(1e9) << ';'; + if (m->PMMTrafficMetricsAvailable()) + cout << getBytesReadFromPMM(sstate1, sstate2) / double(1e9) << + ';' << getBytesWrittenToPMM(sstate1, sstate2) / double(1e9) << ';'; + if (m->MCDRAMmemoryTrafficMetricsAvailable()) cout << getBytesReadFromEDC(sstate1, sstate2) / double(1e9) << ';' << getBytesWrittenToEDC(sstate1, sstate2) / double(1e9) << ';'; @@ -880,6 +903,9 @@ void print_csv(PCM * m, if (m->memoryTrafficMetricsAvailable()) cout << ';' << getBytesReadFromMC(sktstate1[i], sktstate2[i]) / double(1e9) << ';' << getBytesWrittenToMC(sktstate1[i], sktstate2[i]) / double(1e9); + if (m->PMMTrafficMetricsAvailable()) + cout << ';' << getBytesReadFromPMM(sktstate1[i], sktstate2[i]) / double(1e9) << + ';' << getBytesWrittenToPMM(sktstate1[i], sktstate2[i]) / double(1e9); if (m->MCDRAMmemoryTrafficMetricsAvailable()) cout << ';' << getBytesReadFromEDC(sktstate1[i], sktstate2[i]) / double(1e9) << ';' << getBytesWrittenToEDC(sktstate1[i], sktstate2[i]) / double(1e9); diff --git a/types.h b/types.h index 10a23d87..5e1e0ce9 100644 --- a/types.h +++ b/types.h @@ -574,7 +574,6 @@ struct BecktonUncorePMUCNTCTLRegister #define KNL_EDC7_ECLK_REGISTER_DEV_ADDR (31) #define KNL_EDC7_ECLK_REGISTER_FUNC_ADDR (2) - /** * XPF_ for Xeons: SNB, IVT, HSX, BDW, etc. * KNX_ for Xeon Phi (Knights *) processors @@ -651,6 +650,26 @@ struct BecktonUncorePMUCNTCTLRegister #define QPI_PORT1_MISC_REGISTER_FUNC_ADDR (0) #define QPI_PORT2_MISC_REGISTER_FUNC_ADDR (0) +#define SKX_M2M_0_REGISTER_DEV_ADDR (8) +#define SKX_M2M_0_REGISTER_FUNC_ADDR (0) +#define SKX_M2M_1_REGISTER_DEV_ADDR (9) +#define SKX_M2M_1_REGISTER_FUNC_ADDR (0) + +#define M2M_PCI_PMON_BOX_CTL_ADDR (0x258) + +#define M2M_PCI_PMON_CTL0_ADDR (0x228) +#define M2M_PCI_PMON_CTL1_ADDR (0x230) +#define M2M_PCI_PMON_CTL2_ADDR (0x238) +#define M2M_PCI_PMON_CTL3_ADDR (0x240) + +#define M2M_PCI_PMON_CTR0_ADDR (0x200) +#define M2M_PCI_PMON_CTR1_ADDR (0x208) +#define M2M_PCI_PMON_CTR2_ADDR (0x210) +#define M2M_PCI_PMON_CTR3_ADDR (0x218) + +#define PCM_INVALID_DEV_ADDR (~(uint32)0UL) +#define PCM_INVALID_FUNC_ADDR (~(uint32)0UL) + #define Q_P_PCI_PMON_BOX_CTL_ADDR (0x0F4) #define Q_P_PCI_PMON_CTL3_ADDR (0x0E4) @@ -847,6 +866,15 @@ struct BecktonUncorePMUCNTCTLRegister #define IIO_MSR_PMON_CTL_CH_MASK(x) ((x) << 36ULL) #define IIO_MSR_PMON_CTL_FC_MASK(x) ((x) << 44ULL) +#define M2M_PCI_PMON_CTL_EVENT(x) ((x) << 0) +#define M2M_PCI_PMON_CTL_UMASK(x) ((x) << 8) +#define M2M_PCI_PMON_CTL_RST (1 << 17) +#define M2M_PCI_PMON_CTL_EDGE_DET (1 << 18) +#define M2M_PCI_PMON_CTL_OV_EN (1 << 20) +#define M2M_PCI_PMON_CTL_EN (1 << 22) +#define M2M_PCI_PMON_CTL_INVERT (1 << 23) +#define M2M_PCI_PMON_CTL_THRESH(x) ((x) << 24ULL) + #define UCLK_FIXED_CTL_OV_EN (1 << 20) #define UCLK_FIXED_CTL_EN (1 << 22)