1
1
/* SPDX-License-Identifier: GPL-2.0 */
2
2
#ifndef _X86_POSTED_INTR_H
3
3
#define _X86_POSTED_INTR_H
4
+
5
+ #include <asm/cmpxchg.h>
6
+ #include <asm/rwonce.h>
4
7
#include <asm/irq_vectors.h>
5
8
9
+ #include <linux/bitmap.h>
10
+
6
11
#define POSTED_INTR_ON 0
7
12
#define POSTED_INTR_SN 1
8
13
@@ -26,6 +31,65 @@ struct pi_desc {
26
31
u32 rsvd [6 ];
27
32
} __aligned (64 );
28
33
34
+ /*
35
+ * De-multiplexing posted interrupts is on the performance path, the code
36
+ * below is written to optimize the cache performance based on the following
37
+ * considerations:
38
+ * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently
39
+ * accessed by both CPU and IOMMU.
40
+ * 2.During software processing of posted interrupts, the CPU needs to do
41
+ * natural width read and xchg for checking and clearing posted interrupt
42
+ * request (PIR), a 256 bit field within the PID.
43
+ * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache
44
+ * line when posting interrupts and setting control bits.
45
+ * 4.The CPU can access the cache line a magnitude faster than the IOMMU.
46
+ * 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID
47
+ * cache line. The cache line states after each operation are as follows,
48
+ * assuming a 64-bit kernel:
49
+ *        CPU           IOMMU              PID cache line state
+ *       ---------------------------------------------------------------
+ *       read64                            exclusive
+ *       lock xchg64                       modified
+ *                     post/atomic swap    invalid
+ *       ---------------------------------------------------------------
55
+ *
56
+ * To reduce L1 data cache miss, it is important to avoid contention with
57
+ * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used
58
+ * when processing posted interrupts in software, e.g. to dispatch interrupt
59
+ * handlers for posted MSIs, or to move interrupts from the PIR to the vIRR
60
+ * in KVM.
61
+ *
62
+ * In addition, the code is trying to keep the cache line state consistent
63
+ * as much as possible. e.g. when making a copy and clearing the PIR
64
+ * (assuming non-zero PIR bits are present in the entire PIR), it does:
65
+ * read, read, read, read, xchg, xchg, xchg, xchg
66
+ * instead of:
67
+ * read, xchg, read, xchg, read, xchg, read, xchg
68
+ */
69
+ static __always_inline bool pi_harvest_pir (unsigned long * pir ,
70
+ unsigned long * pir_vals )
71
+ {
72
+ unsigned long pending = 0 ;
73
+ int i ;
74
+
75
+ for (i = 0 ; i < NR_PIR_WORDS ; i ++ ) {
76
+ pir_vals [i ] = READ_ONCE (pir [i ]);
77
+ pending |= pir_vals [i ];
78
+ }
79
+
80
+ if (!pending )
81
+ return false;
82
+
83
+ for (i = 0 ; i < NR_PIR_WORDS ; i ++ ) {
84
+ if (!pir_vals [i ])
85
+ continue ;
86
+
87
+ pir_vals [i ] = arch_xchg (& pir [i ], 0 );
88
+ }
89
+
90
+ return true;
91
+ }
92
+
29
93
static inline bool pi_test_and_set_on (struct pi_desc * pi_desc )
30
94
{
31
95
return test_and_set_bit (POSTED_INTR_ON , (unsigned long * )& pi_desc -> control );
0 commit comments