
Commit edaf3ed

x86/irq: KVM: Add helper for harvesting PIR to deduplicate KVM and posted MSIs
Now that posted MSI and KVM harvesting of PIR is identical, extract the
code (and posted MSI's wonderful comment) to a common helper.

No functional change intended.

Link: https://lore.kernel.org/r/20250401163447.846608-9-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
1 parent baf68a0 commit edaf3ed
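To make the factored-out pattern concrete outside the kernel tree, here is a minimal, self-contained C11 analogue of the harvesting idiom this commit deduplicates (a sketch only: the names WORDS and harvest_words are invented here, and the kernel's real helper, pi_harvest_pir(), appears in the first diff below). The point is the ordering: read every word with plain loads first, and only then issue an atomic xchg for the words that actually had bits set, so the shared cache line is written as few times as possible.

	#include <stdatomic.h>
	#include <stdbool.h>

	#define WORDS 4	/* a 256-bit PIR is four 64-bit words */

	/* Illustrative analogue of the harvesting pattern; not kernel code. */
	static bool harvest_words(_Atomic unsigned long *pir, unsigned long *vals)
	{
		unsigned long pending = 0;
		int i;

		/* Pass 1: plain reads only, so the cache line is not dirtied. */
		for (i = 0; i < WORDS; i++) {
			vals[i] = atomic_load_explicit(&pir[i], memory_order_relaxed);
			pending |= vals[i];
		}

		if (!pending)
			return false;

		/* Pass 2: atomically clear only the words that had bits set. */
		for (i = 0; i < WORDS; i++) {
			if (vals[i])
				vals[i] = atomic_exchange_explicit(&pir[i], 0,
								   memory_order_acquire);
		}

		return true;
	}

The kernel helper below has the same two-pass shape, using READ_ONCE() and arch_xchg() over NR_PIR_WORDS words.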

File tree: 3 files changed, +69 −61 lines

arch/x86/include/asm/posted_intr.h

Lines changed: 64 additions & 0 deletions
@@ -1,8 +1,13 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _X86_POSTED_INTR_H
 #define _X86_POSTED_INTR_H
+
+#include <asm/cmpxchg.h>
+#include <asm/rwonce.h>
 #include <asm/irq_vectors.h>
 
+#include <linux/bitmap.h>
+
 #define POSTED_INTR_ON	0
 #define POSTED_INTR_SN	1
 
@@ -26,6 +31,65 @@ struct pi_desc {
 	u32 rsvd[6];
 } __aligned(64);
 
+/*
+ * De-multiplexing posted interrupts is on the performance path, the code
+ * below is written to optimize the cache performance based on the following
+ * considerations:
+ * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently
+ *   accessed by both CPU and IOMMU.
+ * 2.During software processing of posted interrupts, the CPU needs to do
+ *   natural width read and xchg for checking and clearing posted interrupt
+ *   request (PIR), a 256 bit field within the PID.
+ * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache
+ *   line when posting interrupts and setting control bits.
+ * 4.The CPU can access the cache line a magnitude faster than the IOMMU.
+ * 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID
+ *   cache line. The cache line states after each operation are as follows,
+ *   assuming a 64-bit kernel:
+ * CPU		IOMMU			PID Cache line state
+ * ---------------------------------------------------------------
+ * read64					exclusive
+ * lock xchg64					modified
+ *		post/atomic swap		invalid
+ * ---------------------------------------------------------------
+ *
+ * To reduce L1 data cache miss, it is important to avoid contention with
+ * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used
+ * when processing posted interrupts in software, e.g. to dispatch interrupt
+ * handlers for posted MSIs, or to move interrupts from the PIR to the vIRR
+ * in KVM.
+ *
+ * In addition, the code is trying to keep the cache line state consistent
+ * as much as possible. e.g. when making a copy and clearing the PIR
+ * (assuming non-zero PIR bits are present in the entire PIR), it does:
+ *     read, read, read, read, xchg, xchg, xchg, xchg
+ * instead of:
+ *     read, xchg, read, xchg, read, xchg, read, xchg
+ */
+static __always_inline bool pi_harvest_pir(unsigned long *pir,
+					   unsigned long *pir_vals)
+{
+	unsigned long pending = 0;
+	int i;
+
+	for (i = 0; i < NR_PIR_WORDS; i++) {
+		pir_vals[i] = READ_ONCE(pir[i]);
+		pending |= pir_vals[i];
+	}
+
+	if (!pending)
+		return false;
+
+	for (i = 0; i < NR_PIR_WORDS; i++) {
+		if (!pir_vals[i])
+			continue;
+
+		pir_vals[i] = arch_xchg(&pir[i], 0);
+	}
+
+	return true;
+}
+
 static inline bool pi_test_and_set_on(struct pi_desc *pi_desc)
 {
 	return test_and_set_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control);
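As a usage sketch (the consumer demo_drain_pir() below is hypothetical, invented for illustration; the real callers are the two sites converted in the following diffs), any code that drains a PIR now harvests into an on-stack copy and iterates the snapshot:

	/* Hypothetical caller of pi_harvest_pir(), for illustration only. */
	static unsigned int demo_drain_pir(unsigned long *pir)
	{
		unsigned long pir_copy[NR_PIR_WORDS];
		unsigned int vec, handled = 0;

		if (!pi_harvest_pir(pir, pir_copy))
			return 0;

		/* The live PIR words are now clear; dispatch from the snapshot. */
		for_each_set_bit(vec, pir_copy, NR_PIR_WORDS * BITS_PER_LONG)
			handled++;

		return handled;
	}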

arch/x86/kernel/irq.c

Lines changed: 3 additions & 47 deletions
@@ -380,58 +380,14 @@ void intel_posted_msi_init(void)
 	this_cpu_write(posted_msi_pi_desc.ndst, destination);
 }
 
-/*
- * De-multiplexing posted interrupts is on the performance path, the code
- * below is written to optimize the cache performance based on the following
- * considerations:
- * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently
- *   accessed by both CPU and IOMMU.
- * 2.During posted MSI processing, the CPU needs to do 64-bit read and xchg
- *   for checking and clearing posted interrupt request (PIR), a 256 bit field
- *   within the PID.
- * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache
- *   line when posting interrupts and setting control bits.
- * 4.The CPU can access the cache line a magnitude faster than the IOMMU.
- * 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID
- *   cache line. The cache line states after each operation are as follows:
- * CPU		IOMMU			PID Cache line state
- * ---------------------------------------------------------------
- * read64					exclusive
- * lock xchg64					modified
- *		post/atomic swap		invalid
- * ---------------------------------------------------------------
- *
- * To reduce L1 data cache miss, it is important to avoid contention with
- * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used
- * to dispatch interrupt handlers.
- *
- * In addition, the code is trying to keep the cache line state consistent
- * as much as possible. e.g. when making a copy and clearing the PIR
- * (assuming non-zero PIR bits are present in the entire PIR), it does:
- *     read, read, read, read, xchg, xchg, xchg, xchg
- * instead of:
- *     read, xchg, read, xchg, read, xchg, read, xchg
- */
 static __always_inline bool handle_pending_pir(unsigned long *pir, struct pt_regs *regs)
 {
-	unsigned long pir_copy[NR_PIR_WORDS], pending = 0;
-	int i, vec = FIRST_EXTERNAL_VECTOR;
-
-	for (i = 0; i < NR_PIR_WORDS; i++) {
-		pir_copy[i] = READ_ONCE(pir[i]);
-		pending |= pir_copy[i];
-	}
+	unsigned long pir_copy[NR_PIR_WORDS];
+	int vec = FIRST_EXTERNAL_VECTOR;
 
-	if (!pending)
+	if (!pi_harvest_pir(pir, pir_copy))
 		return false;
 
-	for (i = 0; i < NR_PIR_WORDS; i++) {
-		if (!pir_copy[i])
-			continue;
-
-		pir_copy[i] = arch_xchg(&pir[i], 0);
-	}
-
 	for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR)
 		call_irq_handler(vec, regs);
 
arch/x86/kvm/lapic.c

Lines changed: 2 additions & 14 deletions
@@ -657,7 +657,7 @@ static u8 count_vectors(void *bitmap)
 
 bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr)
 {
-	unsigned long pir_vals[NR_PIR_WORDS], pending = 0;
+	unsigned long pir_vals[NR_PIR_WORDS];
 	u32 *__pir = (void *)pir_vals;
 	u32 i, vec;
 	u32 irr_val, prev_irr_val;
@@ -666,21 +666,9 @@ bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr)
 	max_updated_irr = -1;
 	*max_irr = -1;
 
-	for (i = 0; i < NR_PIR_WORDS; i++) {
-		pir_vals[i] = READ_ONCE(pir[i]);
-		pending |= pir_vals[i];
-	}
-
-	if (!pending)
+	if (!pi_harvest_pir(pir, pir_vals))
 		return false;
 
-	for (i = 0; i < NR_PIR_WORDS; i++) {
-		if (!pir_vals[i])
-			continue;
-
-		pir_vals[i] = arch_xchg(&pir[i], 0);
-	}
-
 	for (i = vec = 0; i <= 7; i++, vec += 32) {
 		u32 *p_irr = (u32 *)(regs + APIC_IRR + i * 0x10);
 