diff --git a/doc/CXL_README.md b/doc/CXL_README.md new file mode 100644 index 00000000..ad9265be --- /dev/null +++ b/doc/CXL_README.md @@ -0,0 +1,15 @@ +PCM can collect CXL bandwidth using the methods below. + +-------------------------------------------------------------------------------- +CXL.mem and CXL.cache traffic +-------------------------------------------------------------------------------- + +Please use pcm-memory utility for monitoring CXL.mem and CXL.cache traffic. pcm-memory will detect available CXL ports and will show traffic per CXL port and protocol (mem and cache) and per direction (read and write). + +-------------------------------------------------------------------------------- +CXL.io traffic +-------------------------------------------------------------------------------- + +pcm-iio utility should be used to monitor CXL.io traffic. pcm-iio will show traffic per CXL device and direction (inbound/outbound, read/write). + + diff --git a/src/cpucounters.cpp b/src/cpucounters.cpp index c0c8efa8..56cdda43 100644 --- a/src/cpucounters.cpp +++ b/src/cpucounters.cpp @@ -2505,6 +2505,40 @@ void PCM::initUncorePMUsDirect() } } } + + if (1) + { + cxlPMUs.resize(num_sockets); + for (uint32 s = 0; s < (uint32)num_sockets; ++s) + { + if (uncorePMUDiscovery.get()) + { + auto createCXLPMU = [this](const uint32 s, const unsigned BoxType, const size_t pos) -> UncorePMU + { + std::vector > CounterControlRegs, CounterValueRegs; + const auto n_regs = uncorePMUDiscovery->getBoxNumRegs(BoxType, s, pos); + const auto unitControlAddr = uncorePMUDiscovery->getBoxCtlAddr(BoxType, s, pos); + const auto unitControlAddrAligned = unitControlAddr & ~4095ULL; + auto handle = std::make_shared(unitControlAddrAligned, CXL_PMON_SIZE, false); + for (size_t r = 0; r < n_regs; ++r) + { + CounterControlRegs.push_back(std::make_shared(handle, uncorePMUDiscovery->getBoxCtlAddr(BoxType, s, pos, r) - unitControlAddrAligned)); + CounterValueRegs.push_back(std::make_shared(handle, 
uncorePMUDiscovery->getBoxCtrAddr(BoxType, s, pos, r) - unitControlAddrAligned)); + } + return UncorePMU(std::make_shared(handle, unitControlAddr - unitControlAddrAligned), CounterControlRegs, CounterValueRegs); + }; + if (getCPUModel() == PCM::SPR) + { + const auto n_units = (std::min)(uncorePMUDiscovery->getNumBoxes(SPR_CXLCM_BOX_TYPE, s), + uncorePMUDiscovery->getNumBoxes(SPR_CXLDP_BOX_TYPE, s)); + for (size_t pos = 0; pos < n_units; ++pos) + { + cxlPMUs[s].push_back(std::make_pair(createCXLPMU(s, SPR_CXLCM_BOX_TYPE, pos), createCXLPMU(s, SPR_CXLDP_BOX_TYPE, pos))); + } + } + } + } + } } #ifdef PCM_USE_PERF @@ -4637,6 +4671,14 @@ void PCM::cleanupUncorePMUs(const bool silent) { pmu.cleanup(); } + for (auto& sPMUs : cxlPMUs) + { + for (auto& pmus : sPMUs) + { + pmus.first.cleanup(); + pmus.second.cleanup(); + } + } for (auto & uncore : serverUncorePMUs) { uncore->cleanupPMUs(); @@ -5208,12 +5250,13 @@ PCM::ErrorCode PCM::programServerUncoreLatencyMetrics(bool enable_pmm) PCM::ErrorCode PCM::programServerUncoreMemoryMetrics(const ServerUncoreMemoryMetrics & metrics, int rankA, int rankB) { - if(MSR.empty() || serverUncorePMUs.empty()) return PCM::MSRAccessDenied; + if (MSR.empty() || serverUncorePMUs.empty()) return PCM::MSRAccessDenied; for (int i = 0; (i < (int)serverUncorePMUs.size()) && MSR.size(); ++i) { - serverUncorePMUs[i]->programServerUncoreMemoryMetrics(metrics, rankA, rankB); + serverUncorePMUs[i]->programServerUncoreMemoryMetrics(metrics, rankA, rankB); } + programCXLCM(); return PCM::Success; } @@ -5495,8 +5538,8 @@ PCM::ErrorCode PCM::program(const RawPMUConfigs& curPMUConfigs_, const bool sile std::cerr << "ERROR: trying to program " << events.programmable.size() << " core PMU counters, which exceeds the max num possible (" << ServerUncoreCounterState::maxCounters << ")."; return PCM::UnknownError; } - uint32 events32[ServerUncoreCounterState::maxCounters] = { 0,0,0,0 }; - uint64 events64[ServerUncoreCounterState::maxCounters] = { 0,0,0,0 }; + 
uint32 events32[ServerUncoreCounterState::maxCounters] = { 0,0,0,0,0,0,0,0 }; + uint64 events64[ServerUncoreCounterState::maxCounters] = { 0,0,0,0,0,0,0,0 }; for (size_t c = 0; c < events.programmable.size() && c < ServerUncoreCounterState::maxCounters; ++c) { events32[c] = (uint32)events.programmable[c].first[0]; @@ -5573,6 +5616,14 @@ PCM::ErrorCode PCM::program(const RawPMUConfigs& curPMUConfigs_, const bool sile { threadMSRConfig = pmuConfig.second; } + else if (type == "cxlcm") + { + programCXLCM(events64); + } + else if (type == "cxldp") + { + programCXLDP(events64); + } else { std::cerr << "ERROR: unrecognized PMU type \"" << type << "\"\n"; @@ -5625,6 +5676,14 @@ void PCM::freezeServerUncoreCounters() } } } + for (auto& sPMUs : cxlPMUs) + { + for (auto& pmus : sPMUs) + { + pmus.first.freeze(UNC_PMON_UNIT_CTL_FRZ_EN); + pmus.second.freeze(UNC_PMON_UNIT_CTL_FRZ_EN); + } + } } void PCM::unfreezeServerUncoreCounters() { @@ -5669,6 +5728,14 @@ void PCM::unfreezeServerUncoreCounters() } } } + for (auto& sPMUs : cxlPMUs) + { + for (auto& pmus : sPMUs) + { + pmus.first.unfreeze(UNC_PMON_UNIT_CTL_FRZ_EN); + pmus.second.unfreeze(UNC_PMON_UNIT_CTL_FRZ_EN); + } + } } void UncoreCounterState::readAndAggregate(std::shared_ptr msr) { @@ -5697,6 +5764,7 @@ SystemCounterState PCM::getSystemCounterState() } } + readAndAggregateCXLCMCounters(result); readQPICounters(result); result.ThermalHeadroom = static_cast(PCM_INVALID_THERMAL_HEADROOM); // not available for system @@ -5747,6 +5815,26 @@ void PCM::readAndAggregateMemoryBWCounters(const uint32 core, CounterStateType & //std::cout << std::flush; } + +template +void PCM::readAndAggregateCXLCMCounters( CounterStateType & result) +{ + + for (size_t socket = 0; socket < getNumSockets(); ++socket) + { + uint64 CXLWriteMem = 0; + uint64 CXLWriteCache = 0; + for (size_t p = 0; p < getNumCXLPorts(socket); ++p) + { + CXLWriteMem += *cxlPMUs[socket][p].first.counterValue[0]; + CXLWriteCache += 
*cxlPMUs[socket][p].first.counterValue[1]; + } + result.CXLWriteMem[socket] = CXLWriteMem; + result.CXLWriteCache[socket] = CXLWriteCache; + } +} + + template void PCM::readAndAggregateUncoreMCCounters(const uint32 socket, CounterStateType & result) { @@ -6280,14 +6368,14 @@ ServerUncoreCounterState PCM::getServerUncoreCounterState(uint32 socket) TemporalThreadAffinity tempThreadAffinity(refCore); for (uint32 cbo = 0; socket < cboPMUs.size() && cbo < cboPMUs[socket].size() && cbo < ServerUncoreCounterState::maxCBOs; ++cbo) { - for (int i = 0; i < ServerUncoreCounterState::maxCounters; ++i) + for (int i = 0; i < ServerUncoreCounterState::maxCounters && size_t(i) < cboPMUs[socket][cbo].size(); ++i) { result.CBOCounter[cbo][i] = *(cboPMUs[socket][cbo].counterValue[i]); } } for (uint32 mdf = 0; socket < mdfPMUs.size() && mdf < mdfPMUs[socket].size() && mdf < ServerUncoreCounterState::maxMDFs; ++mdf) { - for (int i = 0; i < ServerUncoreCounterState::maxCounters; ++i) + for (int i = 0; i < ServerUncoreCounterState::maxCounters && size_t(i) < mdfPMUs[socket][mdf].size(); ++i) { if (mdfPMUs[socket][mdf].counterValue[i].get()) { @@ -6297,14 +6385,14 @@ ServerUncoreCounterState PCM::getServerUncoreCounterState(uint32 socket) } for (uint32 stack = 0; socket < iioPMUs.size() && stack < iioPMUs[socket].size() && stack < ServerUncoreCounterState::maxIIOStacks; ++stack) { - for (int i = 0; i < ServerUncoreCounterState::maxCounters; ++i) + for (int i = 0; i < ServerUncoreCounterState::maxCounters && size_t(i) < iioPMUs[socket][stack].size(); ++i) { result.IIOCounter[stack][i] = *(iioPMUs[socket][stack].counterValue[i]); } } for (uint32 stack = 0; socket < irpPMUs.size() && stack < irpPMUs[socket].size() && stack < ServerUncoreCounterState::maxIIOStacks; ++stack) { - for (int i = 0; i < ServerUncoreCounterState::maxCounters; ++i) + for (int i = 0; i < ServerUncoreCounterState::maxCounters && size_t(i) < irpPMUs[socket][stack].size(); ++i) { if 
(irpPMUs[socket][stack].counterValue[i].get()) { @@ -6317,8 +6405,21 @@ ServerUncoreCounterState PCM::getServerUncoreCounterState(uint32 socket) result.UBOXCounter[i] = *(uboxPMUs[socket].counterValue[i]); result.UncClocks = getUncoreClocks(socket); } - for (int i = 0; i < ServerUncoreCounterState::maxCounters && socket < pcuPMUs.size(); ++i) + for (int i = 0; i < ServerUncoreCounterState::maxCounters && socket < pcuPMUs.size() && size_t(i) < pcuPMUs[socket].size(); ++i) + { result.PCUCounter[i] = *pcuPMUs[socket].counterValue[i]; + } + for (size_t p = 0; p < getNumCXLPorts(socket); ++p) + { + for (int i = 0; i < ServerUncoreCounterState::maxCounters && socket < cxlPMUs.size() && size_t(i) < cxlPMUs[socket][p].first.size(); ++i) + { + result.CXLCMCounter[p][i] = *cxlPMUs[socket][p].first.counterValue[i]; + } + for (int i = 0; i < ServerUncoreCounterState::maxCounters && socket < cxlPMUs.size() && size_t(i) < cxlPMUs[socket][p].second.size(); ++i) + { + result.CXLDPCounter[p][i] = *cxlPMUs[socket][p].second.counterValue[i]; + } + } // std::cout << "values read: " << result.PCUCounter[0] << " " << result.PCUCounter[1] << " " << result.PCUCounter[2] << " " << result.PCUCounter[3] << "\n"; uint64 val=0; //MSR[refCore]->read(MSR_PKG_ENERGY_STATUS,&val); @@ -8958,7 +9059,7 @@ void PCM::programCbo(const uint64 * events, const uint32 opCode, const uint32 nc PCM::program(cboPMUs[i][cbo], events, events + ServerUncoreCounterState::maxCounters, UNC_PMON_UNIT_CTL_FRZ_EN); - for (int c = 0; c < ServerUncoreCounterState::maxCounters; ++c) + for (int c = 0; c < ServerUncoreCounterState::maxCounters && size_t(c) < cboPMUs[i][cbo].size(); ++c) { *cboPMUs[i][cbo].counterValue[c] = 0; } @@ -9048,6 +9149,40 @@ void PCM::controlQATTelemetry(uint32 dev, uint32 operation) } } +void PCM::programCXLCM(const uint64* events) +{ + for (auto & sPMUs : cxlPMUs) + { + for (auto& pmus : sPMUs) + { + pmus.first.initFreeze(UNC_PMON_UNIT_CTL_FRZ_EN); + assert(pmus.first.size() == 8); + 
PCM::program(pmus.first, events, events + 8, UNC_PMON_UNIT_CTL_FRZ_EN); + } + } +} + +void PCM::programCXLDP(const uint64* events) +{ + for (auto& sPMUs : cxlPMUs) + { + for (auto& pmus : sPMUs) + { + pmus.second.initFreeze(UNC_PMON_UNIT_CTL_FRZ_EN); + assert(pmus.second.size() == 4); + PCM::program(pmus.second, events, events + 4, UNC_PMON_UNIT_CTL_FRZ_EN); + } + } +} +void PCM::programCXLCM() +{ + uint64 CXLCMevents[8] = { 0,0,0,0,0,0,0,0 }; + + CXLCMevents[EventPosition::CXL_TxC_MEM] = UNC_PMON_CTL_EVENT(0x02) + UNC_PMON_CTL_UMASK(0x10); // CXLCM_TxC_PACK_BUF_INSERTS.MEM_DATA + CXLCMevents[EventPosition::CXL_TxC_CACHE] = UNC_PMON_CTL_EVENT(0x02) + UNC_PMON_CTL_UMASK(0x04);// CXLCM_TxC_PACK_BUF_INSERTS.CACHE_DATA + + programCXLCM(CXLCMevents); +} void PCM::programIDXAccelCounters(uint32 accel, std::vector &events, std::vector &filters_wq, std::vector &filters_eng, std::vector &filters_tc, std::vector &filters_pgsz, std::vector &filters_xfersz) { uint32 maxCTR = getMaxNumOfIDXAccelCtrs(accel); //limit the number of physical counter to use @@ -9338,6 +9473,26 @@ UncorePMU::UncorePMU(const HWRegisterPtr& unitControl_, fixedCounterValue(fixedCounterValue_), filter{ filter0 , filter1 } { + assert(counterControl.size() == counterValue.size()); +} + +UncorePMU::UncorePMU(const HWRegisterPtr& unitControl_, + const std::vector& counterControl_, + const std::vector& counterValue_, + const HWRegisterPtr& fixedCounterControl_, + const HWRegisterPtr& fixedCounterValue_, + const HWRegisterPtr& filter0, + const HWRegisterPtr& filter1 +): + cpu_model_(0), + unitControl(unitControl_), + counterControl{counterControl_}, + counterValue{counterValue_}, + fixedCounterControl(fixedCounterControl_), + fixedCounterValue(fixedCounterValue_), + filter{ filter0 , filter1 } +{ + assert(counterControl.size() == counterValue.size()); } uint32 UncorePMU::getCPUModel() @@ -9351,9 +9506,9 @@ uint32 UncorePMU::getCPUModel() void UncorePMU::cleanup() { - for (int i = 0; i < 4; ++i) + for (auto& 
cc: counterControl) { - if (counterControl[i].get()) *counterControl[i] = 0; + if (cc.get()) *cc = 0; } if (unitControl.get()) *unitControl = 0; if (fixedCounterControl.get()) *fixedCounterControl = 0; diff --git a/src/cpucounters.h b/src/cpucounters.h index e130e9ce..e68d3e63 100644 --- a/src/cpucounters.h +++ b/src/cpucounters.h @@ -246,8 +246,8 @@ class UncorePMU uint32 getCPUModel(); HWRegisterPtr unitControl; public: - HWRegisterPtr counterControl[4]; - HWRegisterPtr counterValue[4]; + std::vector counterControl; + std::vector counterValue; HWRegisterPtr fixedCounterControl; HWRegisterPtr fixedCounterValue; HWRegisterPtr filter[2]; @@ -266,7 +266,16 @@ class UncorePMU const HWRegisterPtr& filter0 = HWRegisterPtr(), const HWRegisterPtr& filter1 = HWRegisterPtr() ); + UncorePMU(const HWRegisterPtr& unitControl_, + const std::vector & counterControl_, + const std::vector & counterValue_, + const HWRegisterPtr& fixedCounterControl_ = HWRegisterPtr(), + const HWRegisterPtr& fixedCounterValue_ = HWRegisterPtr(), + const HWRegisterPtr& filter0 = HWRegisterPtr(), + const HWRegisterPtr& filter1 = HWRegisterPtr() + ); UncorePMU() : cpu_model_(0U) {} + size_t size() const { return counterControl.size(); } virtual ~UncorePMU() {} bool valid() const { @@ -631,6 +640,7 @@ class PCM_API PCM std::vector > dram_energy_status; std::vector > cboPMUs; std::vector > mdfPMUs; + std::vector>> cxlPMUs; // socket X CXL ports X UNIT {0,1} std::vector > memory_bw_local; std::vector > memory_bw_total; @@ -993,6 +1003,10 @@ class PCM_API PCM void readAndAggregatePackageCStateResidencies(std::shared_ptr msr, CounterStateType & result); public: struct RawPMUConfig; + void programCXLCM(); + template + void readAndAggregateCXLCMCounters(CounterStateType & counterState); + private: template void readMSRs(std::shared_ptr msr, const RawPMUConfig & msrConfig, CounterStateType & result); @@ -1018,7 +1032,7 @@ class PCM_API PCM if (!eventsBegin) return; Iterator curEvent = eventsBegin; const auto 
cpu_model = PCM::getInstance()->getCPUModel(); - for (int c = 0; curEvent != eventsEnd; ++c, ++curEvent) + for (int c = 0; curEvent != eventsEnd && size_t(c) < pmu.size(); ++c, ++curEvent) { auto ctrl = pmu.counterControl[c]; if (ctrl.get() != nullptr) @@ -1041,7 +1055,8 @@ class PCM_API PCM } void programPCU(uint32 * events, const uint64 filter); void programUBOX(const uint64* events); - + void programCXLDP(const uint64* events); + void programCXLCM(const uint64* events); void cleanupUncorePMUs(const bool silent = false); bool isCLX() const // Cascade Lake-SP @@ -1074,7 +1089,9 @@ class PCM_API PCM TOR_OCCUPANCY = 0, TOR_INSERTS = 1, REQUESTS_ALL = 2, - REQUESTS_LOCAL = 3 + REQUESTS_LOCAL = 3, + CXL_TxC_MEM = 0, // works only on counters 0-3 + CXL_TxC_CACHE = 1 // works only on counters 0-3 }; //! check if in secure boot mode bool isSecureBoot() const; @@ -1564,6 +1581,16 @@ class PCM_API PCM //! \return socket identifier int32 getSocketId(uint32 core_id) const { return (int32)topology[core_id].socket; } + + size_t getNumCXLPorts(uint32 socket) const + { + if (socket < cxlPMUs.size()) + { + return cxlPMUs[socket].size(); + } + return 0; + } + //! \brief Returns the number of Intel(r) Quick Path Interconnect(tm) links per socket //! \return number of QPI links per socket uint64 getQPILinksPerSocket() const @@ -2659,6 +2686,30 @@ uint64 getMCCounter(uint32 channel, uint32 counter, const CounterStateType & bef return after.MCCounter[channel][counter] - before.MCCounter[channel][counter]; } +/*! 
\brief Direct read of CXLCM PMU counter (counter meaning depends on the programming: power/performance/etc) + \param counter counter number + \param port port number + \param before CPU counter state before the experiment + \param after CPU counter state after the experiment +*/ +template +uint64 getCXLCMCounter(uint32 port, uint32 counter, const CounterStateType& before, const CounterStateType& after) +{ + return after.CXLCMCounter[port][counter] - before.CXLCMCounter[port][counter]; +} + +/*! \brief Direct read of CXLDP PMU counter (counter meaning depends on the programming: power/performance/etc) + \param counter counter number + \param port port number + \param before CPU counter state before the experiment + \param after CPU counter state after the experiment +*/ +template +uint64 getCXLDPCounter(uint32 port, uint32 counter, const CounterStateType& before, const CounterStateType& after) +{ + return after.CXLDPCounter[port][counter] - before.CXLDPCounter[port][counter]; +} + /*! \brief Direct read of M3UPI PMU counter (counter meaning depends on the programming: power/performance/etc) \param counter counter number \param port UPI port number @@ -3015,7 +3066,8 @@ class ServerUncoreCounterState : public UncoreCounterState maxCBOs = 128, maxMDFs = 128, maxIIOStacks = 16, - maxCounters = 4 + maxCXLPorts = 6, + maxCounters = 8 }; enum EventPosition { @@ -3037,6 +3089,8 @@ class ServerUncoreCounterState : public UncoreCounterState std::array, maxMDFs> MDFCounter; std::array, maxIIOStacks> IIOCounter; std::array, maxIIOStacks> IRPCounter; + std::array, maxCXLPorts> CXLCMCounter; + std::array, maxCXLPorts> CXLDPCounter; std::array UBOXCounter; std::array DRAMClocks; std::array HBMClocks; @@ -3055,6 +3109,10 @@ class ServerUncoreCounterState : public UncoreCounterState template friend uint64 getMCCounter(uint32 channel, uint32 counter, const CounterStateType & before, const CounterStateType & after); template + friend uint64 getCXLCMCounter(uint32 port, uint32 
counter, const CounterStateType& before, const CounterStateType& after); + template + friend uint64 getCXLDPCounter(uint32 port, uint32 counter, const CounterStateType& before, const CounterStateType& after); + template friend uint64 getM3UPICounter(uint32 port, uint32 counter, const CounterStateType& before, const CounterStateType& after); template friend uint64 getCBOCounter(uint32 cbo, uint32 counter, const CounterStateType& before, const CounterStateType& after); @@ -3095,6 +3153,8 @@ class ServerUncoreCounterState : public UncoreCounterState MDFCounter{{}}, IIOCounter{{}}, IRPCounter{{}}, + CXLCMCounter{{}}, + CXLDPCounter{{}}, UBOXCounter{{}}, DRAMClocks{{}}, HBMClocks{{}}, @@ -3212,6 +3272,7 @@ class SystemCounterState : public SocketCounterState } public: + std::vector CXLWriteMem,CXLWriteCache; friend uint64 getIncomingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState & before, const SystemCounterState & after); friend uint64 getIncomingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState & now); friend double getOutgoingQPILinkUtilization(uint32 socketNr, uint32 linkNr, const SystemCounterState & before, const SystemCounterState & after); @@ -3222,6 +3283,8 @@ class SystemCounterState : public SocketCounterState uncoreTSC(0) { PCM * m = PCM::getInstance(); + CXLWriteMem.resize(m->getNumSockets(),0); + CXLWriteCache.resize(m->getNumSockets(),0); incomingQPIPackets.resize(m->getNumSockets(), std::vector((uint32)m->getQPILinksPerSocket(), 0)); outgoingQPIFlits.resize(m->getNumSockets(), @@ -3975,6 +4038,32 @@ uint64 getNumberOfCustomEvents(int32 eventCounterNr, const CounterStateType & be return after.Event[eventCounterNr] - before.Event[eventCounterNr]; } + +/*! 
\brief Computes number of bytes written from CXL Cache + + \param before CPU counter state before the experiment + \param after CPU counter state after the experiment + \return Number of bytes +*/ +//template +inline uint64 getCXLWriteCacheBytes(uint32 socket,const SystemCounterState & before,const SystemCounterState & after) +{ + return (after.CXLWriteCache[socket] - before.CXLWriteCache[socket]) * 64; +} + +/*! \brief Computes number of bytes written from CXL Memory + + \param before CPU counter state before the experiment + \param after CPU counter state after the experiment + \return Number of bytes +*/ +//template +inline uint64 getCXLWriteMemBytes(uint32 socket, const SystemCounterState & before,const SystemCounterState & after) +{ + + return (after.CXLWriteMem[socket] - before.CXLWriteMem[socket]) * 64; +} + /*! \brief Get estimation of QPI data traffic per incoming QPI link Returns an estimation of number of data bytes transferred to a socket over Intel(r) Quick Path Interconnect diff --git a/src/dashboard.cpp b/src/dashboard.cpp index 1462a866..4ed3e756 100644 --- a/src/dashboard.cpp +++ b/src/dashboard.cpp @@ -623,6 +623,11 @@ std::string getPCMDashboardJSON(const PCMDashboardType type, int ns, int nu, int panel->push(t); panel1->push(t); } + for (auto& m : {"CXL Write Mem","CXL Write Cache" }){ + auto t = createTarget(m, "mean(\\\"QPI/UPI Links_QPI Counters Socket " + S + "_" + m + "\\\")/1048576", prometheusCounters(S, m, false) + "/1048576"); + panel->push(t); + panel1->push(t); + } for (std::string m : { "DRAM ", "Persistent Memory " }) { auto t = createTarget(m + "Total", diff --git a/src/pcm-memory.cpp b/src/pcm-memory.cpp index c0ec3c17..a049ade9 100644 --- a/src/pcm-memory.cpp +++ b/src/pcm-memory.cpp @@ -38,6 +38,7 @@ constexpr uint32 max_sockets = 256; uint32 max_imc_channels = ServerUncoreCounterState::maxChannels; const uint32 max_edc_channels = ServerUncoreCounterState::maxChannels; const uint32 max_imc_controllers = 
ServerUncoreCounterState::maxControllers; +bool SPR_CXL = false; typedef struct memdata { float iMC_Rd_socket_chan[max_sockets][ServerUncoreCounterState::maxChannels]{}; @@ -49,6 +50,10 @@ typedef struct memdata { float iMC_Wr_socket[max_sockets]{}; float iMC_PMM_Rd_socket[max_sockets]{}; float iMC_PMM_Wr_socket[max_sockets]{}; + float CXLMEM_Rd_socket_port[max_sockets][ServerUncoreCounterState::maxCXLPorts]{}; + float CXLMEM_Wr_socket_port[max_sockets][ServerUncoreCounterState::maxCXLPorts]{}; + float CXLCACHE_Rd_socket_port[max_sockets][ServerUncoreCounterState::maxCXLPorts]{}; + float CXLCACHE_Wr_socket_port[max_sockets][ServerUncoreCounterState::maxCXLPorts]{}; float iMC_PMM_MemoryMode_Miss_socket[max_sockets]{}; bool iMC_NM_hit_rate_supported{}; float iMC_PMM_MemoryMode_Hit_socket[max_sockets]{}; @@ -184,7 +189,7 @@ void printSocketRankBWHeader_cvt(const uint32 numSockets, const uint32 num_imc_c cout << endl; } -void printSocketChannelBW(PCM * /*m*/, memdata_t *md, uint32 no_columns, uint32 skt) +void printSocketChannelBW(PCM *, memdata_t *md, uint32 no_columns, uint32 skt) { for (uint32 channel = 0; channel < max_imc_channels; ++channel) { // check all the sockets for bad channel "channel" @@ -218,7 +223,7 @@ void printSocketChannelBW(PCM * /*m*/, memdata_t *md, uint32 no_columns, uint32 } } -void printSocketChannelBW(uint32 no_columns, uint32 skt, uint32 num_imc_channels, const ServerUncoreCounterState * uncState1, const ServerUncoreCounterState * uncState2, uint64 elapsedTime, int rankA, int rankB) +void printSocketChannelBW(uint32 no_columns, uint32 skt, uint32 num_imc_channels, const std::vector& uncState1, const std::vector& uncState2, uint64 elapsedTime, int rankA, int rankB) { for (uint32 channel = 0; channel < num_imc_channels; ++channel) { if(rankA >= 0) { @@ -244,8 +249,8 @@ void printSocketChannelBW(uint32 no_columns, uint32 skt, uint32 num_imc_channels } } -void printSocketChannelBW_cvt(const uint32 numSockets, const uint32 num_imc_channels, const 
ServerUncoreCounterState * uncState1, - const ServerUncoreCounterState * uncState2, const uint64 elapsedTime, const int rankA, const int rankB) +void printSocketChannelBW_cvt(const uint32 numSockets, const uint32 num_imc_channels, const std::vector& uncState1, + const std::vector& uncState2, const uint64 elapsedTime, const int rankA, const int rankB) { printDateForCSV(Data); for (uint32 skt = 0 ; skt < numSockets; ++skt) { @@ -263,6 +268,58 @@ void printSocketChannelBW_cvt(const uint32 numSockets, const uint32 num_imc_chan cout << endl; } +uint32 getNumCXLPorts(PCM* m) +{ + static int numPorts = -1; + if (numPorts < 0) + { + for (uint32 s = 0; s < m->getNumSockets(); ++s) + { + numPorts = (std::max)(numPorts, (int)m->getNumCXLPorts(s)); + } + assert(numPorts >= 0); + } + return (uint32)numPorts; +} + +void printSocketCXLBW(PCM* m, memdata_t* md, uint32 no_columns, uint32 skt) +{ + uint32 numPorts = getNumCXLPorts(m); + if (numPorts > 0) + { + for (uint32 i = skt; i < (no_columns + skt); ++i) { + cout << "|---------------------------------------|"; + } + cout << "\n"; + for (uint32 i = skt; i < (no_columns + skt); ++i) { + cout << "|-- CXL Port Monitoring --|"; + } + cout << "\n"; + for (uint32 i = skt; i < (no_columns + skt); ++i) { + cout << "|---------------------------------------|"; + } + cout << "\n"; + } + for (uint32 port = 0; port < numPorts; ++port) { + for (uint32 i = skt; i < (skt + no_columns); ++i) { + cout << "|-- .mem --|"; + } + cout << "\n"; + for (uint32 i = skt; i < (skt + no_columns); ++i) { + cout << "|-- Writes(MB/s): " << setw(8) << md->CXLMEM_Wr_socket_port[i][port] << " --|"; + } + cout << "\n"; + for (uint32 i = skt; i < (skt + no_columns); ++i) { + cout << "|-- .cache --|"; + } + cout << "\n"; + for (uint32 i = skt; i < (skt + no_columns); ++i) { + cout << "|-- hst->dv(MB/s): " << setw(8) << md->CXLCACHE_Wr_socket_port[i][port] << " --|"; + } + cout << "\n"; + } +} + float AD_BW(const memdata_t *md, const uint32 skt) { const auto totalPMM 
= md->iMC_PMM_Rd_socket[skt] + md->iMC_PMM_Wr_socket[skt]; @@ -357,7 +414,7 @@ void printSocketBWFooter(uint32 no_columns, uint32 skt, const memdata_t *md) cout << "\n"; } -void display_bandwidth(PCM *m, memdata_t *md, const uint32 no_columns, const bool show_channel_output, const bool print_update) +void display_bandwidth(PCM *m, memdata_t *md, const uint32 no_columns, const bool show_channel_output, const bool print_update, const float CXL_Read_BW) { float sysReadDRAM = 0.0, sysWriteDRAM = 0.0, sysReadPMM = 0.0, sysWritePMM = 0.0; uint32 numSockets = m->getNumSockets(); @@ -450,6 +507,7 @@ void display_bandwidth(PCM *m, memdata_t *md, const uint32 no_columns, const boo if (show_channel_output) printSocketChannelBW(m, md, no_columns, skt); printSocketBWFooter(no_columns, skt, md); + printSocketCXLBW(m, md, no_columns, skt); for (uint32 i = skt; i < (skt + no_columns); i++) { sysReadDRAM += md->iMC_Rd_socket[i]; @@ -483,6 +541,11 @@ void display_bandwidth(PCM *m, memdata_t *md, const uint32 no_columns, const boo \r|-- System PMM Read Throughput(MB/s):" << setw(14) << sysReadPMM << " --|\n\ \r|-- System PMM Write Throughput(MB/s):" << setw(14) << sysWritePMM << " --|\n"; } + if (SPR_CXL) + { + cout << "\ + \r|-- System CXL Read Throughput(MB/s):" << setw(14) << CXL_Read_BW << " --|\n"; + } cout << "\ \r|-- System Read Throughput(MB/s):" << setw(14) << sysReadDRAM+sysReadPMM << " --|\n\ \r|-- System Write Throughput(MB/s):" << setw(14) << sysWriteDRAM+sysWritePMM << " --|\n\ @@ -491,7 +554,9 @@ void display_bandwidth(PCM *m, memdata_t *md, const uint32 no_columns, const boo } } -void display_bandwidth_csv(PCM *m, memdata_t *md, uint64 /*elapsedTime*/, const bool show_channel_output, const CsvOutputType outputType) +constexpr float CXLBWWrScalingFactor = 0.5; + +void display_bandwidth_csv(PCM *m, memdata_t *md, uint64 /*elapsedTime*/, const bool show_channel_output, const CsvOutputType outputType, const float CXL_Read_BW) { const uint32 numSockets = 
m->getNumSockets(); printDateForCSV(outputType); @@ -695,6 +760,22 @@ void display_bandwidth_csv(PCM *m, memdata_t *md, uint64 /*elapsedTime*/, const sysWriteDRAM += md->EDC_Wr_socket[skt]; }); } + for (uint64 port = 0; port < m->getNumCXLPorts(skt); ++port) + { + choose(outputType, + [printSKT]() { + printSKT(2); + }, + [&port]() { + cout + << "CXL.mem_P" << port << "Write," + << "CXL.cache_P" << port << "hst->dv,"; + }, + [&md, &skt, &port]() { + cout << setw(8) << md->CXLMEM_Wr_socket_port[skt][port] << ',' + << setw(8) << md->CXLCACHE_Wr_socket_port[skt][port] << ','; + }); + } } if (anyPmem(md->metrics)) @@ -714,6 +795,20 @@ void display_bandwidth_csv(PCM *m, memdata_t *md, uint64 /*elapsedTime*/, const }); } + if (SPR_CXL) + { + choose(outputType, + []() { + cout << "System,"; + }, + []() { + cout << "CXLRead,"; + }, + [&]() { + cout << setw(10) << CXL_Read_BW << ','; + }); + } + choose(outputType, []() { cout << "System,System,System\n"; @@ -729,15 +824,16 @@ void display_bandwidth_csv(PCM *m, memdata_t *md, uint64 /*elapsedTime*/, const } void calculate_bandwidth(PCM *m, - const ServerUncoreCounterState uncState1[], - const ServerUncoreCounterState uncState2[], + const std::vector& uncState1, + const std::vector& uncState2, const uint64 elapsedTime, const bool csv, bool & csvheader, uint32 no_columns, const ServerUncoreMemoryMetrics & metrics, const bool show_channel_output, - const bool print_update) + const bool print_update, + const uint64 SPR_CHA_CXL_Count) { //const uint32 num_imc_channels = m->getMCChannelsPerSocket(); //const uint32 num_edc_channels = m->getEDCChannelsPerSocket(); @@ -775,17 +871,24 @@ void calculate_bandwidth(PCM *m, { md.M2M_NM_read_hit_rate[skt][i] = 0.; } + for (size_t p = 0; p < ServerUncoreCounterState::maxCXLPorts; ++p) + { + md.CXLMEM_Rd_socket_port[skt][p] = 0.0; + md.CXLMEM_Wr_socket_port[skt][p] = 0.0; + md.CXLCACHE_Rd_socket_port[skt][p] = 0.0; + md.CXLCACHE_Wr_socket_port[skt][p] = 0.0; + } } + auto toBW = 
[&elapsedTime](const uint64 nEvents) + { + return (float)(nEvents * 64 / 1000000.0 / (elapsedTime / 1000.0)); + }; + for(uint32 skt = 0; skt < m->getNumSockets(); ++skt) { const uint32 numChannels1 = (uint32)m->getMCChannels(skt, 0); // number of channels in the first controller - auto toBW = [&elapsedTime](const uint64 nEvents) - { - return (float)(nEvents * 64 / 1000000.0 / (elapsedTime / 1000.0)); - }; - if (m->HBMmemoryTrafficMetricsAvailable()) { const float scalingFactor = ((float) m->getHBMCASTransferSize()) / float(64.); @@ -927,25 +1030,33 @@ void calculate_bandwidth(PCM *m, { md.iMC_NM_hit_rate[skt] = md.iMC_PMM_MemoryMode_Hit_socket[skt] / all; } + + for (size_t p = 0; p < m->getNumCXLPorts(skt); ++p) + { + md.CXLMEM_Wr_socket_port[skt][p] = CXLBWWrScalingFactor * toBW(getCXLCMCounter((uint32)p, PCM::EventPosition::CXL_TxC_MEM, uncState1[skt], uncState2[skt])); + md.CXLCACHE_Wr_socket_port[skt][p] = CXLBWWrScalingFactor * toBW(getCXLCMCounter((uint32)p, PCM::EventPosition::CXL_TxC_CACHE, uncState1[skt], uncState2[skt])); + } } + const auto CXL_Read_BW = toBW(SPR_CHA_CXL_Count); + if (csv) { if (csvheader) { - display_bandwidth_csv(m, &md, elapsedTime, show_channel_output, Header1); - display_bandwidth_csv(m, &md, elapsedTime, show_channel_output, Header2); + display_bandwidth_csv(m, &md, elapsedTime, show_channel_output, Header1, CXL_Read_BW); + display_bandwidth_csv(m, &md, elapsedTime, show_channel_output, Header2, CXL_Read_BW); csvheader = false; } - display_bandwidth_csv(m, &md, elapsedTime, show_channel_output, Data); + display_bandwidth_csv(m, &md, elapsedTime, show_channel_output, Data, CXL_Read_BW); } else { - display_bandwidth(m, &md, no_columns, show_channel_output, print_update); + display_bandwidth(m, &md, no_columns, show_channel_output, print_update, CXL_Read_BW); } } -void calculate_bandwidth_rank(PCM *m, const ServerUncoreCounterState uncState1[], const ServerUncoreCounterState uncState2[], +void calculate_bandwidth_rank(PCM *m, const 
std::vector & uncState1, const std::vector& uncState2, const uint64 elapsedTime, const bool csv, bool &csvheader, const uint32 no_columns, const int rankA, const int rankB) { uint32 skt = 0; @@ -978,6 +1089,123 @@ void calculate_bandwidth_rank(PCM *m, const ServerUncoreCounterState uncState1[] } } +void readState(std::vector& state) +{ + auto* pcm = PCM::getInstance(); + assert(pcm); + for (uint32 i = 0; i < pcm->getNumSockets(); ++i) + state[i] = pcm->getServerUncoreCounterState(i); +}; + +class CHAEventCollector +{ + std::vector eventGroups; + double delay; + const char* sysCmd; + const MainLoop& mainLoop; + PCM* pcm; + std::vector > MidStates; + size_t curGroup = 0ULL; + uint64 totalCount = 0ULL; + CHAEventCollector() = delete; + CHAEventCollector(const CHAEventCollector&) = delete; + CHAEventCollector & operator = (const CHAEventCollector &) = delete; + + uint64 extractCHATotalCount(const std::vector& before, const std::vector& after) + { + uint64 result = 0; + for (uint32 i = 0; i < pcm->getNumSockets(); ++i) + { + for (uint32 cbo = 0; cbo < pcm->getMaxNumOfCBoxes(); ++cbo) + { + for (uint32 ctr = 0; ctr < 4 && ctr < eventGroups[curGroup].size(); ++ctr) + { + result += getCBOCounter(cbo, ctr, before[i], after[i]); + } + } + } + return result; + } + void programGroup(const size_t group) + { + uint64 events[4] = { 0, 0, 0, 0 }; + assert(group < eventGroups.size()); + for (size_t i = 0; i < 4 && i < eventGroups[group].size(); ++i) + { + events[i] = eventGroups[group][i]; + } + pcm->programCboRaw(events, 0, 0); + } + +public: + CHAEventCollector(const double delay_, const char* sysCmd_, const MainLoop& mainLoop_, PCM* m) : + sysCmd(sysCmd_), + mainLoop(mainLoop_), + pcm(m) + { + assert(pcm); + switch (pcm->getCPUModel()) + { + case PCM::SPR: + eventGroups = { + { + UNC_PMON_CTL_EVENT(0x35) + UNC_PMON_CTL_UMASK(0x01) + UNC_PMON_CTL_UMASK_EXT(0x10C80B82) , // UNC_CHA_TOR_INSERTS.IA_MISS_CRDMORPH_CXL_ACC + UNC_PMON_CTL_EVENT(0x35) + UNC_PMON_CTL_UMASK(0x01) + 
UNC_PMON_CTL_UMASK_EXT(0x10c80782) , // UNC_CHA_TOR_INSERTS.IA_MISS_RFO_CXL_ACC + UNC_PMON_CTL_EVENT(0x35) + UNC_PMON_CTL_UMASK(0x01) + UNC_PMON_CTL_UMASK_EXT(0x10c81782) , // UNC_CHA_TOR_INSERTS.IA_MISS_DRD_CXL_ACC + UNC_PMON_CTL_EVENT(0x35) + UNC_PMON_CTL_UMASK(0x01) + UNC_PMON_CTL_UMASK_EXT(0x10C88782) // UNC_CHA_TOR_INSERTS.IA_MISS_LLCPREFRFO_CXL_ACC + }, + { + UNC_PMON_CTL_EVENT(0x35) + UNC_PMON_CTL_UMASK(0x01) + UNC_PMON_CTL_UMASK_EXT(0x10CCC782) , // UNC_CHA_TOR_INSERTS.IA_MISS_RFO_PREF_CXL_ACC + UNC_PMON_CTL_EVENT(0x35) + UNC_PMON_CTL_UMASK(0x01) + UNC_PMON_CTL_UMASK_EXT(0x10C89782) , // UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_CXL_ACC + UNC_PMON_CTL_EVENT(0x35) + UNC_PMON_CTL_UMASK(0x01) + UNC_PMON_CTL_UMASK_EXT(0x10CCD782) , // UNC_CHA_TOR_INSERTS.IA_MISS_LLCPREFDATA_CXL_ACC + UNC_PMON_CTL_EVENT(0x35) + UNC_PMON_CTL_UMASK(0x01) + UNC_PMON_CTL_UMASK_EXT(0x10CCCF82) // UNC_CHA_TOR_INSERTS.IA_MISS_LLCPREFCODE_CXL_ACC + } + }; + break; + } + + assert(eventGroups.size() > 1); + + delay = delay_ / double(eventGroups.size()); + MidStates.resize(eventGroups.size() - 1); + for (auto& e : MidStates) + { + e.resize(pcm->getNumSockets()); + } + } + + void programFirstGroup() + { + programGroup(0); + } + + void multiplexEvents(const std::vector& BeforeState) + { + for (curGroup = 0; curGroup < eventGroups.size() - 1; ++curGroup) + { + assert(curGroup < MidStates.size()); + calibratedSleep(delay, sysCmd, mainLoop, pcm); + readState(MidStates[curGroup]); // TODO: read only CHA counters (performance optmization) + totalCount += extractCHATotalCount((curGroup > 0) ? 
MidStates[curGroup - 1] : BeforeState, MidStates[curGroup]); + programGroup(curGroup + 1); + readState(MidStates[curGroup]); // TODO: read only CHA counters (performance optmization) + } + + calibratedSleep(delay, sysCmd, mainLoop, pcm); + } + + uint64 getTotalCount(const std::vector& AfterState) + { + return eventGroups.size() * (totalCount + extractCHATotalCount(MidStates.back(), AfterState)); + } + + void reset() + { + totalCount = 0; + } +}; + + PCM_MAIN_NOTHROW; int mainThrows(int argc, char * argv[]) @@ -1014,6 +1242,12 @@ int mainThrows(int argc, char * argv[]) string program = string(argv[0]); PCM * m = PCM::getInstance(); + assert(m); + if (m->getNumSockets() > max_sockets) + { + cerr << "Only systems with up to " << max_sockets << " sockets are supported! Program aborted\n"; + exit(EXIT_FAILURE); + } ServerUncoreMemoryMetrics metrics; metrics = m->PMMTrafficMetricsAvailable() ? Pmem : PartialWrites; @@ -1191,16 +1425,10 @@ int mainThrows(int argc, char * argv[]) PCM::ErrorCode status = m->programServerUncoreMemoryMetrics(metrics, rankA, rankB); m->checkError(status); - if(m->getNumSockets() > max_sockets) - { - cerr << "Only systems with up to " << max_sockets << " sockets are supported! 
Program aborted\n"; - exit(EXIT_FAILURE); - } - max_imc_channels = (pcm::uint32)m->getMCChannelsPerSocket(); - ServerUncoreCounterState * BeforeState = new ServerUncoreCounterState[m->getNumSockets()]; - ServerUncoreCounterState * AfterState = new ServerUncoreCounterState[m->getNumSockets()]; + std::vector BeforeState(m->getNumSockets()); + std::vector AfterState(m->getNumSockets()); uint64 BeforeTime = 0, AfterTime = 0; if ( (sysCmd != NULL) && (delay<=0.0) ) { @@ -1220,13 +1448,24 @@ int mainThrows(int argc, char * argv[]) if( ((delay<1.0) && (delay>0.0)) || (delay<=0.0) ) delay = PCM_DELAY_DEFAULT; } + shared_ptr chaEventCollector; + + SPR_CXL = (PCM::SPR == m->getCPUModel()) && (getNumCXLPorts(m) > 0); + if (SPR_CXL) + { + chaEventCollector = std::make_shared(delay, sysCmd, mainLoop, m); + assert(chaEventCollector.get()); + chaEventCollector->programFirstGroup(); + } + cerr << "Update every " << delay << " seconds\n"; if (csv) cerr << "Read/Write values expressed in (MB/s)" << endl; - for(uint32 i=0; igetNumSockets(); ++i) - BeforeState[i] = m->getServerUncoreCounterState(i); + readState(BeforeState); + + uint64 SPR_CHA_CXL_Event_Count = 0; BeforeTime = m->getTickCount(); @@ -1238,11 +1477,24 @@ int mainThrows(int argc, char * argv[]) { if (enforceFlush || !csv) cout << flush; - calibratedSleep(delay, sysCmd, mainLoop, m); + if (chaEventCollector.get()) + { + chaEventCollector->multiplexEvents(BeforeState); + } + else + { + calibratedSleep(delay, sysCmd, mainLoop, m); + } AfterTime = m->getTickCount(); - for(uint32 i=0; igetNumSockets(); ++i) - AfterState[i] = m->getServerUncoreCounterState(i); + readState(AfterState); + if (chaEventCollector.get()) + { + SPR_CHA_CXL_Event_Count = chaEventCollector->getTotalCount(AfterState); + chaEventCollector->reset(); + chaEventCollector->programFirstGroup(); + readState(AfterState); // TODO: re-read only CHA counters (performance optmization) + } if (!csv) { //cout << "Time elapsed: " << dec << fixed << 
AfterTime-BeforeTime << " ms\n"; @@ -1250,10 +1502,10 @@ int mainThrows(int argc, char * argv[]) } if(rankA >= 0 || rankB >= 0) - calculate_bandwidth_rank(m,BeforeState,AfterState,AfterTime-BeforeTime,csv,csvheader, no_columns, rankA, rankB); + calculate_bandwidth_rank(m,BeforeState, AfterState, AfterTime - BeforeTime, csv, csvheader, no_columns, rankA, rankB); else calculate_bandwidth(m,BeforeState,AfterState,AfterTime-BeforeTime,csv,csvheader, no_columns, metrics, - show_channel_output, print_update); + show_channel_output, print_update, SPR_CHA_CXL_Event_Count); swap(BeforeTime, AfterTime); swap(BeforeState, AfterState); @@ -1265,8 +1517,5 @@ int mainThrows(int argc, char * argv[]) return true; }); - delete[] BeforeState; - delete[] AfterState; - exit(EXIT_SUCCESS); } diff --git a/src/pcm-raw.cpp b/src/pcm-raw.cpp index 2e5d66f5..f29bb4c4 100644 --- a/src/pcm-raw.cpp +++ b/src/pcm-raw.cpp @@ -1536,6 +1536,22 @@ void printTransposed(const PCM::RawPMUConfigs& curPMUConfigs, [&]() { printUncoreRows([](const uint32 u, const uint32 i, const ServerUncoreCounterState& before, const ServerUncoreCounterState& after) { return getIIOCounter(u, i, before, after); }, (uint32)m->getMaxNumOfIIOStacks(), "IIO"); }); } + else if (type == "cxlcm") + { + choose(outputType, + [&]() { printUncoreRows(nullptr, (uint32) ServerUncoreCounterState::maxCXLPorts, "CXLCM"); }, + [&]() { printUncoreRows(nullptr, (uint32) ServerUncoreCounterState::maxCXLPorts, type); }, + [&]() { printUncoreRows([](const uint32 u, const uint32 i, const ServerUncoreCounterState& before, const ServerUncoreCounterState& after) { return getCXLCMCounter(u, i, before, after); }, ServerUncoreCounterState::maxCXLPorts, "CXLCM"); + }); + } + else if (type == "cxldp") + { + choose(outputType, + [&]() { printUncoreRows(nullptr, (uint32) ServerUncoreCounterState::maxCXLPorts, "CXLDP"); }, + [&]() { printUncoreRows(nullptr, (uint32) ServerUncoreCounterState::maxCXLPorts, type); }, + [&]() { printUncoreRows([](const uint32 
u, const uint32 i, const ServerUncoreCounterState& before, const ServerUncoreCounterState& after) { return getCXLDPCounter(u, i, before, after); }, ServerUncoreCounterState::maxCXLPorts, "CXLDP"); + }); + } else { std::cerr << "ERROR: unrecognized PMU type \"" << type << "\"\n"; @@ -1875,6 +1891,42 @@ void print(const PCM::RawPMUConfigs& curPMUConfigs, } } } + else if (type == "cxlcm") + { + for (uint32 s = 0; s < m->getNumSockets(); ++s) + { + for (uint32 p = 0; p < ServerUncoreCounterState::maxCXLPorts; ++p) + { + int i = 0; + for (auto& event : events) + { + choose(outputType, + [s, p]() { cout << "SKT" << s << "CXLCM" << p << separator; }, + [&event, &i]() { if (event.second.empty()) cout << "CXLCMEvent" << i << separator; else cout << event.second << separator; }, + [&]() { cout << getCXLCMCounter(p, i, BeforeUncoreState[s], AfterUncoreState[s]) << separator; }); + ++i; + } + } + } + } + else if (type == "cxldp") + { + for (uint32 s = 0; s < m->getNumSockets(); ++s) + { + for (uint32 p = 0; p < ServerUncoreCounterState::maxCXLPorts; ++p) + { + int i = 0; + for (auto& event : events) + { + choose(outputType, + [s, p]() { cout << "SKT" << s << "CXLDP" << p << separator; }, + [&event, &i]() { if (event.second.empty()) cout << "CXLDPEvent" << i << separator; else cout << event.second << separator; }, + [&]() { cout << getCXLDPCounter(p, i, BeforeUncoreState[s], AfterUncoreState[s]) << separator; }); + ++i; + } + } + } + } else { std::cerr << "ERROR: unrecognized PMU type \"" << type << "\"\n"; diff --git a/src/pcm-sensor-server.cpp b/src/pcm-sensor-server.cpp index e7cedd82..ced46bbd 100644 --- a/src/pcm-sensor-server.cpp +++ b/src/pcm-sensor-server.cpp @@ -439,6 +439,9 @@ class JSONPrinter : Visitor uint32 links = pcm->getQPILinksPerSocket(); for ( uint32 i=0; i < sockets; ++i ) { startObject( std::string( "QPI Counters Socket " ) + std::to_string( i ), BEGIN_OBJECT ); + printCounter( std::string( "CXL Write Cache" ), getCXLWriteCacheBytes (i, before, after ) ); 
+ printCounter( std::string( "CXL Write Mem" ), getCXLWriteMemBytes (i, before, after ) ); + for ( uint32 j=0; j < links; ++j ) { printCounter( std::string( "Incoming Data Traffic On Link " ) + std::to_string( j ), getIncomingQPILinkBytes ( i, j, before, after ) ); printCounter( std::string( "Outgoing Data And Non-Data Traffic On Link " ) + std::to_string( j ), getOutgoingQPILinkBytes ( i, j, before, after ) ); @@ -690,6 +693,8 @@ class PrometheusPrinter : Visitor uint32 links = pcm->getQPILinksPerSocket(); for ( uint32 i=0; i < sockets; ++i ) { addToHierarchy( std::string( "socket=\"" ) + std::to_string( i ) + "\"" ); + printCounter( std::string( "CXL Write Cache" ), getCXLWriteCacheBytes (i, before, after ) ); + printCounter( std::string( "CXL Write Mem" ), getCXLWriteMemBytes (i, before, after ) ); for ( uint32 j=0; j < links; ++j ) { printCounter( std::string( "Incoming Data Traffic On Link " ) + std::to_string( j ), getIncomingQPILinkBytes ( i, j, before, after ) ); printCounter( std::string( "Outgoing Data And Non-Data Traffic On Link " ) + std::to_string( j ), getOutgoingQPILinkBytes ( i, j, before, after ) ); @@ -3317,6 +3322,7 @@ int mainThrows(int argc, char * argv[]) { } do { status = pcmInstance->program(); + switch ( status ) { case PCM::PMUBusy: { @@ -3348,6 +3354,9 @@ int mainThrows(int argc, char * argv[]) { DBG( 1, "Programmed Partial Writes instead of PMEM R/W BW" ); } + //TODO: check return value when its implemented + pcmInstance->programCXLCM(); + #if defined (USE_SSL) if ( useSSL ) { if ( port == 0 ) diff --git a/src/topology.cpp b/src/topology.cpp index 615a3d13..a8cb6c85 100644 --- a/src/topology.cpp +++ b/src/topology.cpp @@ -86,6 +86,7 @@ void Aggregator::dispatch( SystemRoot const& syp ) { } PCM* pcm = PCM::getInstance(); pcm->readQPICounters( sycs_ ); + pcm->readAndAggregateCXLCMCounters( sycs_ ); } Aggregator::Aggregator() diff --git a/src/types.h b/src/types.h index 07a17869..f23a3b21 100644 --- a/src/types.h +++ b/src/types.h @@ 
-1117,6 +1117,8 @@ constexpr auto SPR_CHA_MSR_STEP = 0x10; #define CBO_MSR_PMON_CTL_INVERT (1 << 23) #define CBO_MSR_PMON_CTL_THRESH(x) (x << 24UL) #define UNC_PMON_CTL_UMASK_EXT(x) (uint64(x) << 32ULL) +#define UNC_PMON_CTL_EVENT(x) (x << 0) +#define UNC_PMON_CTL_UMASK(x) (x << 8) #define JKT_CBO_MSR_PMON_BOX_FILTER_OPC(x) (x << 23UL) #define IVTHSX_CBO_MSR_PMON_BOX_FILTER1_OPC(x) (x << 20UL) @@ -1232,6 +1234,8 @@ constexpr auto SPR_M2IOSF_IIO_CTL0 = 0x3002; constexpr auto SPR_M2IOSF_REG_STEP = 0x10; constexpr auto SPR_M2IOSF_NUM = 12; +constexpr auto CXL_PMON_SIZE = 0x1000; + #define IIO_MSR_PMON_CTL_EVENT(x) ((x) << 0) #define IIO_MSR_PMON_CTL_UMASK(x) ((x) << 8) #define IIO_MSR_PMON_CTL_RST (1 << 17) diff --git a/src/uncore_pmu_discovery.h b/src/uncore_pmu_discovery.h index 327c66e3..c8766c90 100644 --- a/src/uncore_pmu_discovery.h +++ b/src/uncore_pmu_discovery.h @@ -12,6 +12,8 @@ namespace pcm { constexpr auto SPR_PCU_BOX_TYPE = 4U; constexpr auto SPR_MDF_BOX_TYPE = 11U; +constexpr auto SPR_CXLCM_BOX_TYPE = 12U; +constexpr auto SPR_CXLDP_BOX_TYPE = 13U; class UncorePMUDiscovery {