HVM-750 guest virtio drivers are racy with respect to interrupts
HVM-751 CONFIGURE_ONLY check in build.sh is wrong
HVM-752 Import qemu-kvm.git barrier changes
rmustacc committed Sep 24, 2012
1 parent a28b557 commit 6d85df9
Showing 6 changed files with 427 additions and 20 deletions.
4 changes: 2 additions & 2 deletions build.sh
@@ -75,9 +75,9 @@ KERNEL_SOURCE="${KERNEL_SOURCE:-$(pwd)/../../illumos}"
CTFBINDIR="$KERNEL_SOURCE"/usr/src/tools/proto/root_i386-nd/opt/onbld/bin/i386
export PATH="$PATH:$CTFBINDIR"

-if [[ -z "CONFIGURE_ONLY" ]]; then
+if [[ -z "$CONFIGURE_ONLY" ]]; then
echo "==> Make"
-gmake
+V=1 gmake
else
echo "Not running make per-request"
fi
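
(Why this matters: with the unexpanded literal, -z tests the non-empty string "CONFIGURE_ONLY" and is always false, so gmake never ran even when CONFIGURE_ONLY was unset. Expanding the variable restores the intended check, and V=1 makes the build verbose.)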
176 changes: 175 additions & 1 deletion hw/virtio-net.c
@@ -26,6 +26,89 @@
#define MAC_TABLE_ENTRIES 64
#define MAX_VLAN (1 << 12) /* Per 802.1Q definition */

/*
* Unfortunately some guest virtio drivers are a little racy with respect to
* when they notify us and when they unmask their respective interrupts.
* Currently we have to work around this in QEMU. While OSes normally work
* around pathological devices, here the virtual device has to work around a
* pathological driver. To put this more concretely, a Linux guest will notify
* the host to do processing work before it unmasks interrupts. Therefore, by
* the time we get to virtio_notify, interrupts on the available ring are still
* masked, so we won't inject the interrupt; the guest will instead wait
* indefinitely for one. This leads to us losing data.
*
* We need to note whether or not we injected an interrupt during a
* virtio_notify. If we did not, and either of the following conditions about
* the ring buffers holds:
*
* o The last available index processed equals the used index
* o The last available index processed does not equal the current
* available index
*
* then we set up a small timer that runs for up to 500 ticks, each tick 10ms
* long. If we reach 500 ticks, we give up and disable the timer; that is a
* valid outcome, because the guest could have transmitted a small number of
* packets, but not enough to actually need an injection. If we get notified,
* i.e. we hit virtio_net_handle_tx_timer, then we stop the timer, because
* we're about to do processing that may inject an interrupt. Finally, on each
* tick we check two conditions. The first is whether the last processed
* available ring index differs from the current available ring index; if it
* does, we effectively call virtqueue_flush as virtio_net_tx_timer would.
* Otherwise, we check whether the last available ring index equals the used
* ring index while interrupts are unmasked; if so, we simply inject the
* interrupt and continue.
*
* This is summarized by the following rough state transition diagram:
*
* Otherwise +---+
* virtqueue_ --+ increment +---* |
* flush() | tick count \|/ | + avail ring
* finishes | +-------------+ | | index >
* without +---*-------------------->| |--+ | last avail
* injecting| | Timer | | index pro-
* an intr. | +-----*-------------| Active | | cessed
* | | | | |-----*-----------+
* | | | +-------------+ |
* | | +- 500 ticks | | |
* | | elapse | *--+ Avail ring |
* | \|/ | | unmasked |
* +-------------+ | | |
* | |<--*-----------+ | +--------+ |
* | Timer | | | | | |
* | Inactive | +- virtio_net_ +---->| Inject | |
* | | handle_tx_ | MSI/x | |
* +-------------+ timer() runs | | |
* ^ ^ +--------+ |
* | | +- always | |
* | | | | |
* | +-----------------------*------------+ |
* | |
* | +- always +------------------+ |
* | | | | |
* +---------------*---------------| Flush Virtqueues |<-----+
* | |
* +------------------+
*/
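
As a reading aid, here is a minimal standalone model (an editor's sketch, not part of this commit) of the decision table the tick handler implements. The model_ names are hypothetical, and the real check in virtqueue_stalled() below additionally consults VRING_USED_F_NO_NOTIFY and vq->inuse, which this sketch omits:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
    uint16_t avail_idx;      /* guest's current available ring index */
    uint16_t used_idx;       /* host's current used ring index */
    uint16_t last_avail_idx; /* last available entry the host consumed */
    int irq_masked;          /* VRING_AVAIL_F_NO_INTERRUPT set? */
} model_vq_t;

/* 0: keep ticking; 1: inject the interrupt; 2: rerun TX processing */
static int
model_check(const model_vq_t *vq)
{
    if (vq->irq_masked)
        return (0);     /* guest hasn't unmasked yet; try again next tick */
    if (vq->last_avail_idx != vq->avail_idx)
        return (2);     /* guest queued more work; flush the ring again */
    if (vq->last_avail_idx != vq->used_idx)
        return (0);     /* host still owes completions; not stalled */
    return (1);         /* ring idle and unmasked: the lost interrupt */
}

int
main(void)
{
    model_vq_t vq = { 8, 8, 8, 1 };
    assert(model_check(&vq) == 0);  /* masked: timer keeps ticking */
    vq.irq_masked = 0;
    assert(model_check(&vq) == 1);  /* the stall described above */
    vq.avail_idx = 9;
    assert(model_check(&vq) == 2);  /* new work arrived instead */
    (void) printf("model ok\n");
    return (0);
}

The REIN_INJECT, REIN_RUN, and REIN_DEADMAN events recorded by the real code correspond to return values 1, 2, and the 500-tick expiry respectively.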


#define REINJECT_TICK_RATE (10000000) /* 10ms in ns */
#define REINJECT_DEADMAN 500 /* 5s in ticks */

typedef enum rein_act {
REIN_INJECT,
REIN_DEADMAN,
REIN_RUN
} rein_act_t;

#define REIN_RING_MAX 64

typedef struct rein_event {
rein_act_t re_act;
hrtime_t re_time;
uint64_t re_other;
struct timeval re_tval;
} rein_event_t;

typedef struct VirtIONet
{
VirtIODevice vdev;
@@ -63,8 +146,78 @@ typedef struct VirtIONet
} mac_table;
uint32_t *vlans;
DeviceState *qdev;
QEMUTimer *rein_timer;
uint32_t rein_timer_ticks;
uint8_t rein_timer_act;
uint32_t rein_ring_idx;
rein_event_t rein_ring[REIN_RING_MAX];
uint64_t rein_n_dead;
uint64_t rein_n_inject;
uint64_t rein_n_rerun;
} VirtIONet;

static void virtio_net_handle_tx_timer(VirtIODevice *, VirtQueue *);

static void virtio_net_rein_event(VirtIONet *n, rein_act_t act, uint64_t other)
{
int index = n->rein_ring_idx;
n->rein_ring_idx = (n->rein_ring_idx + 1) % REIN_RING_MAX;
rein_event_t *rep = n->rein_ring + index;
rep->re_time = gethrtime();
rep->re_act = act;
rep->re_other = other;
(void) gettimeofday(&rep->re_tval, NULL);
}
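
The ring is only ever written here; it appears to exist so that recent reinjection history can be inspected after the fact, e.g. from a core dump. A hypothetical dump helper (an editor's sketch, not part of the commit) would walk it oldest-to-newest:

static void
virtio_net_rein_dump(const VirtIONet *n)
{
    int i;

    /*
     * rein_ring_idx is the next slot to be overwritten, i.e. the
     * oldest record once the ring has wrapped.
     */
    for (i = 0; i < REIN_RING_MAX; i++) {
        const rein_event_t *rep =
            &n->rein_ring[(n->rein_ring_idx + i) % REIN_RING_MAX];
        if (rep->re_time == 0)
            continue;   /* slot never written */
        (void) fprintf(stderr, "act %d at %lld (other %llu)\n",
            (int)rep->re_act, (long long)rep->re_time,
            (unsigned long long)rep->re_other);
    }
}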

static void virtio_net_rein_disable(VirtIONet *n)
{
qemu_del_timer(n->rein_timer);
n->rein_timer_act = 0;
}

static void virtio_net_rein_enable(VirtIONet *n)
{
n->rein_timer_ticks = 0;
qemu_mod_timer(n->rein_timer,
qemu_get_clock(vm_clock) + REINJECT_TICK_RATE);
n->rein_timer_act = 1;
}

static void virtio_net_rein_tick(void *opaque)
{
int ret;
VirtIONet *n = opaque;
assert(n->rein_timer_act);

n->rein_timer_ticks++;

/* Give up, this may be completely reasonable */
if (n->rein_timer_ticks > REINJECT_DEADMAN) {
virtio_net_rein_event(n, REIN_DEADMAN, n->rein_timer_ticks);
virtio_net_rein_disable(n);
n->rein_n_dead++;
return;
}

ret = virtqueue_stalled(n->tx_vq);
if (ret == 1) {
virtio_net_rein_event(n, REIN_INJECT, n->rein_timer_ticks);
virtio_net_rein_disable(n);
n->rein_n_inject++;
return;
} else if (ret == 2) {
virtio_net_rein_event(n, REIN_RUN, n->rein_timer_ticks);
virtio_net_rein_disable(n);
virtio_net_handle_tx_timer(&n->vdev, n->tx_vq);
n->rein_n_rerun++;
return;
}

assert(ret == 0);
qemu_mod_timer(n->rein_timer,
qemu_get_clock(vm_clock) + REINJECT_TICK_RATE);
}

/* TODO
* - we could suppress RX interrupt if we were so inclined.
*/
@@ -707,6 +860,7 @@ static int32_t virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq)
{
VirtQueueElement elem;
int32_t num_packets = 0;
int32_t inject = 1;
if (!(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK)) {
return num_packets;
}
@@ -758,12 +912,16 @@ static int32_t virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq)
len += ret;

virtqueue_push(vq, &elem, len);
-virtio_notify(&n->vdev, vq);
+inject = virtio_notify(&n->vdev, vq);

if (++num_packets >= n->tx_burst) {
break;
}
}

if (inject == 0 && virtqueue_handled(vq))
virtio_net_rein_enable(n);

return num_packets;
}

@@ -777,6 +935,16 @@ static void virtio_net_handle_tx_timer(VirtIODevice *vdev, VirtQueue *vq)
return;
}

/*
* Kill the broken guest timer. We are here because the guest has kicked us
* to send packets, so we don't need to go back and consider injecting it
* with interrupts; we will do that again naturally. We also don't reset the
* tick count, as virtio_net_rein_enable() zeroes it whenever the timer is
* armed again.
*/
if (n->rein_timer_act)
virtio_net_rein_disable(n);

if (n->tx_waiting) {
virtio_queue_set_notification(vq, 1);
qemu_del_timer(n->tx_timer);
@@ -1024,6 +1192,12 @@ VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf,
n->tx_vq = virtio_add_queue(&n->vdev, 256, virtio_net_handle_tx_bh);
n->tx_bh = qemu_bh_new(virtio_net_tx_bh, n);
}
n->rein_timer = qemu_new_timer(vm_clock, virtio_net_rein_tick, n);
n->rein_ring_idx = 0;
bzero(n->rein_ring, sizeof (rein_event_t) * REIN_RING_MAX);
n->rein_n_dead = 0;
n->rein_n_inject = 0;
n->rein_n_rerun = 0;
n->ctrl_vq = virtio_add_queue(&n->vdev, 64, virtio_net_handle_ctrl);
qemu_macaddr_default_if_unset(&conf->macaddr);
memcpy(&n->mac[0], &conf->macaddr, sizeof(n->mac));
85 changes: 72 additions & 13 deletions hw/virtio.c
@@ -17,20 +17,12 @@
#include "qemu-error.h"
#include "virtio.h"
#include "sysemu.h"
+#include "qemu-barrier.h"

/* The alignment to use between consumer and producer parts of vring.
* x86 pagesize again. */
#define VIRTIO_PCI_VRING_ALIGN 4096

-/* QEMU doesn't strictly need write barriers since everything runs in
- * lock-step. We'll leave the calls to wmb() in though to make it obvious for
- * KVM or if kqemu gets SMP support.
- * In any case, we must prevent the compiler from reordering the code.
- * TODO: we likely need some rmb()/mb() as well.
- */
-
-#define wmb() __asm__ __volatile__("": : :"memory")

typedef struct VRingDesc
{
uint64_t addr;
@@ -169,6 +161,13 @@ static inline void vring_used_idx_increment(VirtQueue *vq, uint16_t val)
stw_phys(pa, vring_used_idx(vq) + val);
}

static inline uint16_t vring_used_flags(VirtQueue *vq)
{
target_phys_addr_t pa;
pa = vq->vring.used + offsetof(VRingUsed, flags);
return lduw_phys(pa);
}

static inline void vring_used_flags_set_bit(VirtQueue *vq, int mask)
{
target_phys_addr_t pa;
@@ -235,7 +234,7 @@ void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem,
void virtqueue_flush(VirtQueue *vq, unsigned int count)
{
/* Make sure buffer is written before we update index. */
-wmb();
+smp_wmb();
trace_virtqueue_flush(vq, count);
vring_used_idx_increment(vq, count);
vq->inuse -= count;
@@ -258,6 +257,11 @@ static int virtqueue_num_heads(VirtQueue *vq, unsigned int idx)
idx, vring_avail_idx(vq));
exit(1);
}
/* On success, callers read a descriptor at vq->last_avail_idx.
* Make sure descriptor read does not bypass avail index read. */
if (num_heads) {
smp_rmb();
}

return num_heads;
}
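
The smp_wmb() in virtqueue_flush() and the smp_rmb() here form the usual publish/consume pairing: the producer must make the buffer contents visible before it advances an index, and the consumer must read the index before it dereferences ring contents. A self-contained illustration of that ordering using C11 release/acquire atomics (an editor's sketch for illustration; the qemu-barrier.h primitives predate C11):

#include <assert.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static int buffer[16];
static _Atomic unsigned used_idx;

static void *producer(void *arg)
{
    (void) arg;
    for (unsigned i = 0; i < 16; i++) {
        buffer[i] = (int)i + 1;                 /* fill the entry first... */
        atomic_store_explicit(&used_idx, i + 1, /* ...then publish the index; */
            memory_order_release);              /* this is the smp_wmb() side */
    }
    return NULL;
}

int main(void)
{
    pthread_t t;
    unsigned seen = 0;

    pthread_create(&t, NULL, producer, NULL);
    while (seen < 16) {
        unsigned idx = atomic_load_explicit(&used_idx,
            memory_order_acquire);              /* this is the smp_rmb() side */
        for (; seen < idx; seen++)
            assert(buffer[seen] == (int)seen + 1);  /* entry must be visible */
    }
    pthread_join(t, NULL);
    (void) printf("ordering ok\n");
    return 0;
}

Build with cc -std=c11 -pthread; the acquire load guarantees that any entry at an index the consumer has observed is fully written.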
@@ -291,7 +295,7 @@ static unsigned virtqueue_next_desc(target_phys_addr_t desc_pa,
/* Check they're not leading us off end of descriptors. */
next = vring_desc_next(desc_pa, i);
/* Make sure compiler knows to grab that: we don't want it changing! */
-wmb();
+smp_wmb();

if (next >= max) {
error_report("Desc next is %u", next);
@@ -629,17 +633,20 @@ void virtio_irq(VirtQueue *vq)
virtio_notify_vector(vq->vdev, vq->vector);
}

-void virtio_notify(VirtIODevice *vdev, VirtQueue *vq)
+int virtio_notify(VirtIODevice *vdev, VirtQueue *vq)
{
/* We need to expose used array entries before checking used event. */
smp_mb();
/* Always notify when queue is empty (if the guest acknowledged the NOTIFY_ON_EMPTY feature) */
if ((vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT) &&
(!(vdev->guest_features & (1 << VIRTIO_F_NOTIFY_ON_EMPTY)) ||
(vq->inuse || vring_avail_idx(vq) != vq->last_avail_idx)))
-return;
+return 0;

trace_virtio_notify(vdev, vq);
vdev->isr |= 0x01;
virtio_notify_vector(vdev, vq->vector);
return 1;
}

void virtio_notify_config(VirtIODevice *vdev)
@@ -880,3 +887,55 @@ EventNotifier *virtio_queue_get_host_notifier(VirtQueue *vq)
{
return &vq->host_notifier;
}

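/*
* Editor's note (descriptive comment, not in the commit): this returns
* nonzero when the ring is in one of the two states described in
* hw/virtio-net.c, i.e. either the used index has caught up with the last
* available index we processed, or the guest has queued new available
* entries. virtio-net uses it to decide whether to arm the reinjection
* timer.
*/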
int virtqueue_handled(VirtQueue *vq)
{
smp_mb();
return (vq->last_avail_idx == vring_used_idx(vq) ||
vq->last_avail_idx != vring_avail_idx(vq));
}

/*
* We need to go through and check whether we have hit the 'stalled'
* condition. Due to the way that the virtio driver is implemented in the
* Linux kernel, it will potentially kick the host to process data, disable
* the queue, but not enable interrupts before the host is done processing
* packets. When this happens, all network traffic from the guest ends up
* corked because the guest has disabled the queue and is waiting for an
* interrupt from the host to enable it again. In fact, when in this state, a
* little bit of libproc magic gets us going again rather reliably.
*
* Eventually the guest will unmask interrupts, saying that it wants an
* injection. If we reach a point in time where the last seen available index
* is equal to the available ring index and to the used ring index, then we
* go ahead and inject the interrupt.
*/
int virtqueue_stalled(VirtQueue *vq)
{
smp_mb();

if (vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT)
return (0);

if (vring_used_flags(vq) & VRING_USED_F_NO_NOTIFY)
return (0);

if (vq->inuse)
return (0);

/* We could have also lost the interrupt the other way */
if (vq->last_avail_idx != vring_avail_idx(vq))
return (2);

if (vq->last_avail_idx != vring_used_idx(vq))
return (0);

/*
* Interrupts are enabled and we're at a point in time where we would
* have stalled. Let's go ahead and inject the interrupt.
*/
trace_virtio_notify(vq->vdev, vq);
vq->vdev->isr |= 0x01;
virtio_notify_vector(vq->vdev, vq->vector);
return (1);
}
4 changes: 3 additions & 1 deletion hw/virtio.h
@@ -153,7 +153,7 @@ void virtqueue_map_sg(struct iovec *sg, target_phys_addr_t *addr,
int virtqueue_pop(VirtQueue *vq, VirtQueueElement *elem);
int virtqueue_avail_bytes(VirtQueue *vq, int in_bytes, int out_bytes);

-void virtio_notify(VirtIODevice *vdev, VirtQueue *vq);
+int virtio_notify(VirtIODevice *vdev, VirtQueue *vq);

void virtio_save(VirtIODevice *vdev, QEMUFile *f);

@@ -226,4 +226,6 @@ EventNotifier *virtio_queue_get_guest_notifier(VirtQueue *vq);
EventNotifier *virtio_queue_get_host_notifier(VirtQueue *vq);
void virtio_queue_notify_vq(VirtQueue *vq);
void virtio_irq(VirtQueue *vq);
int virtqueue_stalled(VirtQueue *vq);
int virtqueue_handled(VirtQueue *vq);
#endif
