HVM-750 guest virtio drivers are racy with respect to interrupts
HVM-751 CONFIGURE_ONLY check in build.sh is wrong
HVM-752 Import qemu-kvm.git barrier changes
rmustacc committed Sep 24, 2012
1 parent a28b557 commit 6d85df9
Showing 6 changed files with 427 additions and 20 deletions.
4 changes: 2 additions & 2 deletions build.sh
@@ -75,9 +75,9 @@ KERNEL_SOURCE="${KERNEL_SOURCE:-$(pwd)/../../illumos}"
CTFBINDIR="$KERNEL_SOURCE"/usr/src/tools/proto/root_i386-nd/opt/onbld/bin/i386
export PATH="$PATH:$CTFBINDIR"

-if [[ -z "CONFIGURE_ONLY" ]]; then
+if [[ -z "$CONFIGURE_ONLY" ]]; then
echo "==> Make"
-gmake
+V=1 gmake
else
echo "Not running make per-request"
fi
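
(Why this matters: with the unexpanded literal, -z tests the non-empty string "CONFIGURE_ONLY" and is always false, so gmake never ran even when CONFIGURE_ONLY was unset. Expanding the variable restores the intended check, and V=1 makes the build verbose.)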
176 changes: 175 additions & 1 deletion hw/virtio-net.c
@@ -26,6 +26,89 @@
#define MAC_TABLE_ENTRIES 64
#define MAX_VLAN (1 << 12) /* Per 802.1Q definition */

/*
* Unfortunately some guest virtio drivers are a little racy with respect to
* when they notify us and when they unmask their respective interrupts.
* Currently we have to work around this in QEMU. While OSes normally work
* around pathological devices, here the virtual device has to work around a
* pathological driver. To put this more concretely, a Linux guest will notify
* the host to do processing work before it unmasks interrupts. Therefore, by
* the time we get to virtio_notify, interrupts on the available ring are still
* masked, so we won't inject the interrupt; the guest will instead wait
* indefinitely for one. This leads to us losing data.
*
* We need to note whether or not we injected an interrupt during a
* virtio_notify. If we did not, and either of the following conditions about
* the ring buffers holds:
*
* o The last available index processed equals the used index
* o The last available index processed does not equal the current
* available index
*
* then we set up a small timer that runs for up to 500 ticks, each tick 10ms
* long. If we reach 500 ticks, we give up and disable the timer; that is a
* valid outcome, because the guest could have transmitted a small number of
* packets, but not enough to actually need an injection. If we get notified,
* i.e. we hit virtio_net_handle_tx_timer, then we stop the timer, because
* we're about to do processing that may inject an interrupt. Finally, on each
* tick we check two conditions. The first is whether the last processed
* available ring index differs from the current available ring index; if it
* does, we effectively call virtqueue_flush as virtio_net_tx_timer would.
* Otherwise, we check whether the last available ring index equals the used
* ring index while interrupts are unmasked; if so, we simply inject the
* interrupt and continue.
*
* This is summarized by the following rough state transition diagram:
*
* Otherwise +---+
* virtqueue_ --+ increment +---* |
* flush() | tick count \|/ | + avail ring
* finishes | +-------------+ | | index >
* without +---*-------------------->| |--+ | last avail
* injecting| | Timer | | index pro-
* an intr. | +-----*-------------| Active | | cessed
* | | | | |-----*-----------+
* | | | +-------------+ |
* | | +- 500 ticks | | |
* | | elapse | *--+ Avail ring |
* | \|/ | | unmasked |
* +-------------+ | | |
* | |<--*-----------+ | +--------+ |
* | Timer | | | | | |
* | Inactive | +- virtio_net_ +---->| Inject | |
* | | handle_tx_ | MSI/x | |
* +-------------+ timer() runs | | |
* ^ ^ +--------+ |
* | | +- always | |
* | | | | |
* | +-----------------------*------------+ |
* | |
* | +- always +------------------+ |
* | | | | |
* +---------------*---------------| Flush Virtqueues |<-----+
* | |
* +------------------+
*/
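
As a reading aid, here is a minimal standalone model (an editor's sketch, not part of this commit) of the decision table the tick handler implements. The model_ names are hypothetical, and the real check in virtqueue_stalled() below additionally consults VRING_USED_F_NO_NOTIFY and vq->inuse, which this sketch omits:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
    uint16_t avail_idx;      /* guest's current available ring index */
    uint16_t used_idx;       /* host's current used ring index */
    uint16_t last_avail_idx; /* last available entry the host consumed */
    int irq_masked;          /* VRING_AVAIL_F_NO_INTERRUPT set? */
} model_vq_t;

/* 0: keep ticking; 1: inject the interrupt; 2: rerun TX processing */
static int
model_check(const model_vq_t *vq)
{
    if (vq->irq_masked)
        return (0);     /* guest hasn't unmasked yet; try again next tick */
    if (vq->last_avail_idx != vq->avail_idx)
        return (2);     /* guest queued more work; flush the ring again */
    if (vq->last_avail_idx != vq->used_idx)
        return (0);     /* host still owes completions; not stalled */
    return (1);         /* ring idle and unmasked: the lost interrupt */
}

int
main(void)
{
    model_vq_t vq = { 8, 8, 8, 1 };
    assert(model_check(&vq) == 0);  /* masked: timer keeps ticking */
    vq.irq_masked = 0;
    assert(model_check(&vq) == 1);  /* the stall described above */
    vq.avail_idx = 9;
    assert(model_check(&vq) == 2);  /* new work arrived instead */
    (void) printf("model ok\n");
    return (0);
}

The REIN_INJECT, REIN_RUN, and REIN_DEADMAN events recorded by the real code correspond to return values 1, 2, and the 500-tick expiry respectively.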


#define REINJECT_TICK_RATE (10000000) /* 10ms in ns */
#define REINJECT_DEADMAN 500 /* 5s in ticks */

typedef enum rein_act {
REIN_INJECT,
REIN_DEADMAN,
REIN_RUN
} rein_act_t;

#define REIN_RING_MAX 64

typedef struct rein_event {
rein_act_t re_act;
hrtime_t re_time;
uint64_t re_other;
struct timeval re_tval;
} rein_event_t;

typedef struct VirtIONet
{
VirtIODevice vdev;
@@ -63,8 +146,78 @@ typedef struct VirtIONet
} mac_table;
uint32_t *vlans;
DeviceState *qdev;
QEMUTimer *rein_timer;
uint32_t rein_timer_ticks;
uint8_t rein_timer_act;
uint32_t rein_ring_idx;
rein_event_t rein_ring[REIN_RING_MAX];
uint64_t rein_n_dead;
uint64_t rein_n_inject;
uint64_t rein_n_rerun;
} VirtIONet;

static void virtio_net_handle_tx_timer(VirtIODevice *, VirtQueue *);

static void virtio_net_rein_event(VirtIONet *n, rein_act_t act, uint64_t other)
{
int index = n->rein_ring_idx;
n->rein_ring_idx = (n->rein_ring_idx + 1) % REIN_RING_MAX;
rein_event_t *rep = n->rein_ring + index;
rep->re_time = gethrtime();
rep->re_act = act;
rep->re_other = other;
(void) gettimeofday(&rep->re_tval, NULL);
}
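
The ring is only ever written here; it appears to exist so that recent reinjection history can be inspected after the fact, e.g. from a core dump. A hypothetical dump helper (an editor's sketch, not part of the commit) would walk it oldest-to-newest:

static void
virtio_net_rein_dump(const VirtIONet *n)
{
    int i;

    /*
     * rein_ring_idx is the next slot to be overwritten, i.e. the
     * oldest record once the ring has wrapped.
     */
    for (i = 0; i < REIN_RING_MAX; i++) {
        const rein_event_t *rep =
            &n->rein_ring[(n->rein_ring_idx + i) % REIN_RING_MAX];
        if (rep->re_time == 0)
            continue;   /* slot never written */
        (void) fprintf(stderr, "act %d at %lld (other %llu)\n",
            (int)rep->re_act, (long long)rep->re_time,
            (unsigned long long)rep->re_other);
    }
}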

static void virtio_net_rein_disable(VirtIONet *n)
{
qemu_del_timer(n->rein_timer);
n->rein_timer_act = 0;
}

static void virtio_net_rein_enable(VirtIONet *n)
{
n->rein_timer_ticks = 0;
qemu_mod_timer(n->rein_timer,
qemu_get_clock(vm_clock) + REINJECT_TICK_RATE);
n->rein_timer_act = 1;
}

static void virtio_net_rein_tick(void *opaque)
{
int ret;
VirtIONet *n = opaque;
assert(n->rein_timer_act);

n->rein_timer_ticks++;

/* Give up, this may be completely reasonable */
if (n->rein_timer_ticks > REINJECT_DEADMAN) {
virtio_net_rein_event(n, REIN_DEADMAN, n->rein_timer_ticks);
virtio_net_rein_disable(n);
n->rein_n_dead++;
return;
}

ret = virtqueue_stalled(n->tx_vq);
if (ret == 1) {
virtio_net_rein_event(n, REIN_INJECT, n->rein_timer_ticks);
virtio_net_rein_disable(n);
n->rein_n_inject++;
return;
} else if (ret == 2) {
virtio_net_rein_event(n, REIN_RUN, n->rein_timer_ticks);
virtio_net_rein_disable(n);
virtio_net_handle_tx_timer(&n->vdev, n->tx_vq);
n->rein_n_rerun++;
return;
}

assert(ret == 0);
qemu_mod_timer(n->rein_timer,
qemu_get_clock(vm_clock) + REINJECT_TICK_RATE);
}

/* TODO
* - we could suppress RX interrupt if we were so inclined.
*/
@@ -707,6 +860,7 @@ static int32_t virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq)
{
VirtQueueElement elem;
int32_t num_packets = 0;
int32_t inject = 1;
if (!(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK)) {
return num_packets;
}
@@ -758,12 +912,16 @@ static int32_t virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq)
len += ret;

virtqueue_push(vq, &elem, len);
-virtio_notify(&n->vdev, vq);
+inject = virtio_notify(&n->vdev, vq);

if (++num_packets >= n->tx_burst) {
break;
}
}

if (inject == 0 && virtqueue_handled(vq))
virtio_net_rein_enable(n);

return num_packets;
}

@@ -777,6 +935,16 @@ static void virtio_net_handle_tx_timer(VirtIODevice *vdev, VirtQueue *vq)
return;
}

/*
* Kill the broken guest timer. We are here because the guest has kicked us
* to send packets, so we don't need to go back and consider injecting it
* with interrupts; we will do that again naturally. We also don't reset the
* tick count, as virtio_net_rein_enable() zeroes it whenever the timer is
* armed again.
*/
if (n->rein_timer_act)
virtio_net_rein_disable(n);

if (n->tx_waiting) {
virtio_queue_set_notification(vq, 1);
qemu_del_timer(n->tx_timer);
@@ -1024,6 +1192,12 @@ VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf,
n->tx_vq = virtio_add_queue(&n->vdev, 256, virtio_net_handle_tx_bh);
n->tx_bh = qemu_bh_new(virtio_net_tx_bh, n);
}
n->rein_timer = qemu_new_timer(vm_clock, virtio_net_rein_tick, n);
n->rein_ring_idx = 0;
bzero(n->rein_ring, sizeof (rein_event_t) * REIN_RING_MAX);
n->rein_n_dead = 0;
n->rein_n_inject = 0;
n->rein_n_rerun = 0;
n->ctrl_vq = virtio_add_queue(&n->vdev, 64, virtio_net_handle_ctrl);
qemu_macaddr_default_if_unset(&conf->macaddr);
memcpy(&n->mac[0], &conf->macaddr, sizeof(n->mac));
85 changes: 72 additions & 13 deletions hw/virtio.c
@@ -17,20 +17,12 @@
#include "qemu-error.h"
#include "virtio.h"
#include "sysemu.h"
+#include "qemu-barrier.h"

/* The alignment to use between consumer and producer parts of vring.
* x86 pagesize again. */
#define VIRTIO_PCI_VRING_ALIGN 4096

-/* QEMU doesn't strictly need write barriers since everything runs in
- * lock-step. We'll leave the calls to wmb() in though to make it obvious for
- * KVM or if kqemu gets SMP support.
- * In any case, we must prevent the compiler from reordering the code.
- * TODO: we likely need some rmb()/mb() as well.
- */
-
-#define wmb() __asm__ __volatile__("": : :"memory")

typedef struct VRingDesc
{
uint64_t addr;
@@ -169,6 +161,13 @@ static inline void vring_used_idx_increment(VirtQueue *vq, uint16_t val)
stw_phys(pa, vring_used_idx(vq) + val);
}

static inline uint16_t vring_used_flags(VirtQueue *vq)
{
target_phys_addr_t pa;
pa = vq->vring.used + offsetof(VRingUsed, flags);
return lduw_phys(pa);
}

static inline void vring_used_flags_set_bit(VirtQueue *vq, int mask)
{
target_phys_addr_t pa;
@@ -235,7 +234,7 @@ void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem,
void virtqueue_flush(VirtQueue *vq, unsigned int count)
{
/* Make sure buffer is written before we update index. */
-wmb();
+smp_wmb();
trace_virtqueue_flush(vq, count);
vring_used_idx_increment(vq, count);
vq->inuse -= count;
@@ -258,6 +257,11 @@ static int virtqueue_num_heads(VirtQueue *vq, unsigned int idx)
idx, vring_avail_idx(vq));
exit(1);
}
/* On success, callers read a descriptor at vq->last_avail_idx.
* Make sure descriptor read does not bypass avail index read. */
if (num_heads) {
smp_rmb();
}

return num_heads;
}
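
The smp_wmb() in virtqueue_flush() and the smp_rmb() here form the usual publish/consume pairing: the producer must make the buffer contents visible before it advances an index, and the consumer must read the index before it dereferences ring contents. A self-contained illustration of that ordering using C11 release/acquire atomics (an editor's sketch for illustration; the qemu-barrier.h primitives predate C11):

#include <assert.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static int buffer[16];
static _Atomic unsigned used_idx;

static void *producer(void *arg)
{
    (void) arg;
    for (unsigned i = 0; i < 16; i++) {
        buffer[i] = (int)i + 1;                 /* fill the entry first... */
        atomic_store_explicit(&used_idx, i + 1, /* ...then publish the index; */
            memory_order_release);              /* this is the smp_wmb() side */
    }
    return NULL;
}

int main(void)
{
    pthread_t t;
    unsigned seen = 0;

    pthread_create(&t, NULL, producer, NULL);
    while (seen < 16) {
        unsigned idx = atomic_load_explicit(&used_idx,
            memory_order_acquire);              /* this is the smp_rmb() side */
        for (; seen < idx; seen++)
            assert(buffer[seen] == (int)seen + 1);  /* entry must be visible */
    }
    pthread_join(t, NULL);
    (void) printf("ordering ok\n");
    return 0;
}

Build with cc -std=c11 -pthread; the acquire load guarantees that any entry at an index the consumer has observed is fully written.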
@@ -291,7 +295,7 @@ static unsigned virtqueue_next_desc(target_phys_addr_t desc_pa,
/* Check they're not leading us off end of descriptors. */
next = vring_desc_next(desc_pa, i);
/* Make sure compiler knows to grab that: we don't want it changing! */
-wmb();
+smp_wmb();

if (next >= max) {
error_report("Desc next is %u", next);
@@ -629,17 +633,20 @@ void virtio_irq(VirtQueue *vq)
virtio_notify_vector(vq->vdev, vq->vector);
}

-void virtio_notify(VirtIODevice *vdev, VirtQueue *vq)
+int virtio_notify(VirtIODevice *vdev, VirtQueue *vq)
{
/* We need to expose used array entries before checking used event. */
smp_mb();
/* Always notify when queue is empty (if the guest acknowledged the NOTIFY_ON_EMPTY feature) */
if ((vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT) &&
(!(vdev->guest_features & (1 << VIRTIO_F_NOTIFY_ON_EMPTY)) ||
(vq->inuse || vring_avail_idx(vq) != vq->last_avail_idx)))
-return;
+return 0;

trace_virtio_notify(vdev, vq);
vdev->isr |= 0x01;
virtio_notify_vector(vdev, vq->vector);
return 1;
}

void virtio_notify_config(VirtIODevice *vdev)
@@ -880,3 +887,55 @@ EventNotifier *virtio_queue_get_host_notifier(VirtQueue *vq)
{
return &vq->host_notifier;
}

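/*
* Editor's note (descriptive comment, not in the commit): this returns
* nonzero when the ring is in one of the two states described in
* hw/virtio-net.c, i.e. either the used index has caught up with the last
* available index we processed, or the guest has queued new available
* entries. virtio-net uses it to decide whether to arm the reinjection
* timer.
*/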
int virtqueue_handled(VirtQueue *vq)
{
smp_mb();
return (vq->last_avail_idx == vring_used_idx(vq) ||
vq->last_avail_idx != vring_avail_idx(vq));
}

/*
* We need to go through and check whether we have hit the 'stalled'
* condition. Due to the way that the virtio driver is implemented in the
* Linux kernel, it will potentially kick the host to process data, disable
* the queue, but not enable interrupts before the host is done processing
* packets. When this happens, all network traffic from the guest ends up
* corked because the guest has disabled the queue and is waiting for an
* interrupt from the host to enable it again. In fact, when in this state, a
* little bit of libproc magic gets us going again rather reliably.
*
* Eventually the guest will unmask interrupts, saying that it wants an
* injection. If we reach a point in time where the last seen available index
* is equal to the available ring index and to the used ring index, then we
* go ahead and inject the interrupt.
*/
int virtqueue_stalled(VirtQueue *vq)
{
smp_mb();

if (vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT)
return (0);

if (vring_used_flags(vq) & VRING_USED_F_NO_NOTIFY)
return (0);

if (vq->inuse)
return (0);

/* We could have also lost the interrupt the other way */
if (vq->last_avail_idx != vring_avail_idx(vq))
return (2);

if (vq->last_avail_idx != vring_used_idx(vq))
return (0);

/*
* Interrupts are enabled and we're at a point in time where we would
* have stalled. Let's go ahead and inject the interrupt.
*/
trace_virtio_notify(vq->vdev, vq);
vq->vdev->isr |= 0x01;
virtio_notify_vector(vq->vdev, vq->vector);
return (1);
}
4 changes: 3 additions & 1 deletion hw/virtio.h
@@ -153,7 +153,7 @@ void virtqueue_map_sg(struct iovec *sg, target_phys_addr_t *addr,
int virtqueue_pop(VirtQueue *vq, VirtQueueElement *elem);
int virtqueue_avail_bytes(VirtQueue *vq, int in_bytes, int out_bytes);

-void virtio_notify(VirtIODevice *vdev, VirtQueue *vq);
+int virtio_notify(VirtIODevice *vdev, VirtQueue *vq);

void virtio_save(VirtIODevice *vdev, QEMUFile *f);

@@ -226,4 +226,6 @@ EventNotifier *virtio_queue_get_guest_notifier(VirtQueue *vq);
EventNotifier *virtio_queue_get_host_notifier(VirtQueue *vq);
void virtio_queue_notify_vq(VirtQueue *vq);
void virtio_irq(VirtQueue *vq);
int virtqueue_stalled(VirtQueue *vq);
int virtqueue_handled(VirtQueue *vq);
#endif
