Skip to content

Commit

Permalink
OS-8136 Add DISCARD/TRIM support to bhyve (#268)
Browse files Browse the repository at this point in the history
Contributed by: Allan Jude <allanjude@freebsd.org>
Reviewed by: Mike Gerdts <mike.gerdts@joyent.com>
Reviewed by: Mike Zeller <mike.zeller@joyent.com>
Approved by: Mike Zeller <mike.zeller@joyent.com>
  • Loading branch information
jasonbking committed Mar 18, 2020
1 parent 288b348 commit bfe191c
Show file tree
Hide file tree
Showing 2 changed files with 122 additions and 19 deletions.
15 changes: 13 additions & 2 deletions usr/src/cmd/bhyve/block_if.c
Expand Up @@ -29,7 +29,7 @@
*/

/*
* Copyright 2018 Joyent, Inc.
* Copyright 2020 Joyent, Inc.
*/

#include <sys/cdefs.h>
Expand Down Expand Up @@ -475,6 +475,10 @@ blockif_open(const char *optstr, const char *ident)
off_t size, psectsz, psectoff;
int extra, fd, i, sectsz;
int nocache, sync, ro, candelete, geom, ssopt, pssopt;
#ifdef __FreeBSD__
int nodelete;
#endif

#ifndef WITHOUT_CAPSICUM
cap_rights_t rights;
cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE };
Expand All @@ -487,6 +491,9 @@ blockif_open(const char *optstr, const char *ident)
nocache = 0;
sync = 0;
ro = 0;
#ifdef __FreeBSD__
nodelete = 0;
#endif

/*
* The first element in the optstring is always a pathname.
Expand All @@ -499,6 +506,10 @@ blockif_open(const char *optstr, const char *ident)
continue;
else if (!strcmp(cp, "nocache"))
nocache = 1;
#ifdef __FreeBSD__
else if (!strcmp(cp, "nodelete"))
nodelete = 1;
#endif
else if (!strcmp(cp, "sync") || !strcmp(cp, "direct"))
sync = 1;
else if (!strcmp(cp, "ro"))
Expand Down Expand Up @@ -566,7 +577,7 @@ blockif_open(const char *optstr, const char *ident)
ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff);
strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
arg.len = sizeof(arg.value.i);
if (ioctl(fd, DIOCGATTR, &arg) == 0)
if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0)
candelete = arg.value.i;
if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0)
geom = 1;
Expand Down
126 changes: 109 additions & 17 deletions usr/src/cmd/bhyve/pci_virtio_block.c
Expand Up @@ -3,7 +3,7 @@
*
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
* Copyright (c) 2019 Joyent, Inc.
* Copyright 2020 Joyent, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -39,7 +39,6 @@
* http://www.illumos.org/license/CDDL.
*
* Copyright 2014 Pluribus Networks Inc.
* Copyright 2018 Joyent, Inc.
*/

#include <sys/cdefs.h>
Expand Down Expand Up @@ -69,32 +68,54 @@ __FBSDID("$FreeBSD$");
#include "virtio.h"
#include "block_if.h"

#define VTBLK_RINGSZ 128
#define VTBLK_BSIZE 512
#define VTBLK_RINGSZ 128

_Static_assert(VTBLK_RINGSZ <= BLOCKIF_RING_MAX, "Each ring entry must be able to queue a request");

#define VTBLK_S_OK 0
#define VTBLK_S_IOERR 1
#define VTBLK_S_OK 0
#define VTBLK_S_IOERR 1
#define VTBLK_S_UNSUPP 2

/*
 * Size in bytes of the block device identifier buffer: 20 bytes plus one
 * extra byte (presumably the terminating NUL -- confirm against the users
 * of this macro).  The replacement list is parenthesized so the value
 * stays 21 when the macro is used inside a larger expression.
 */
#define VTBLK_BLK_ID_BYTES (20 + 1)

/* Capability bits */
#define VTBLK_F_SEG_MAX (1 << 2) /* Maximum request segments */
#define VTBLK_F_BLK_SIZE (1 << 6) /* cfg block size valid */
#define VTBLK_F_FLUSH (1 << 9) /* Cache flush support */
#define VTBLK_F_TOPOLOGY (1 << 10) /* Optimal I/O alignment */
#define VTBLK_F_BARRIER (1 << 0) /* Does host support barriers? */
#define VTBLK_F_SIZE_MAX (1 << 1) /* Indicates maximum segment size */
#define VTBLK_F_SEG_MAX (1 << 2) /* Indicates maximum # of segments */
#define VTBLK_F_GEOMETRY (1 << 4) /* Legacy geometry available */
#define VTBLK_F_RO (1 << 5) /* Disk is read-only */
#define VTBLK_F_BLK_SIZE (1 << 6) /* Block size of disk is available */
#define VTBLK_F_SCSI (1 << 7) /* Supports scsi command passthru */
#define VTBLK_F_FLUSH (1 << 9) /* Cache flush support */
#define VTBLK_F_WCE (1 << 9) /* Legacy alias for FLUSH (same bit) */
#define VTBLK_F_TOPOLOGY (1 << 10) /* Topology information is available */
#define VTBLK_F_CONFIG_WCE (1 << 11) /* Writeback mode available in config */
#define VTBLK_F_DISCARD (1 << 13) /* Trim blocks; only advertised when blockif_candelete() is true */
#define VTBLK_F_WRITE_ZEROES (1 << 14) /* Write zeros */

/*
* Host capabilities
*/
#define VTBLK_S_HOSTCAPS \
#define VTBLK_S_HOSTCAPS \
( VTBLK_F_SEG_MAX | \
VTBLK_F_BLK_SIZE | \
VTBLK_F_FLUSH | \
VTBLK_F_TOPOLOGY | \
VIRTIO_RING_F_INDIRECT_DESC ) /* indirect descriptors */

/*
* The current blockif_delete() interface only allows a single delete
* request at a time.
*/
#define VTBLK_MAX_DISCARD_SEG 1

/*
* An arbitrary limit to prevent excessive latency due to large
* delete requests.
*/
#define VTBLK_MAX_DISCARD_SECT ((16 << 20) / VTBLK_BSIZE) /* 16 MiB */

/*
* Config space "registers"
*/
Expand All @@ -115,6 +136,14 @@ struct vtblk_config {
uint32_t opt_io_size;
} vbc_topology;
uint8_t vbc_writeback;
uint8_t unused0[3];
uint32_t max_discard_sectors;
uint32_t max_discard_seg;
uint32_t discard_sector_alignment;
uint32_t max_write_zeroes_sectors;
uint32_t max_write_zeroes_seg;
uint8_t write_zeroes_may_unmap;
uint8_t unused1[3];
} __packed;

/*
Expand All @@ -123,9 +152,14 @@ struct vtblk_config {
struct virtio_blk_hdr {
#define VBH_OP_READ 0
#define VBH_OP_WRITE 1
#define VBH_OP_SCSI_CMD 2
#define VBH_OP_SCSI_CMD_OUT 3
#define VBH_OP_FLUSH 4
#define VBH_OP_FLUSH_OUT 5
#define VBH_OP_IDENT 8
#define VBH_OP_DISCARD 11
#define VBH_OP_WRITE_ZEROES 13

#define VBH_FLAG_BARRIER 0x80000000 /* OR'ed into vbh_type */
uint32_t vbh_type;
uint32_t vbh_ioprio;
Expand All @@ -136,8 +170,8 @@ struct virtio_blk_hdr {
* Debug printf
*/
static int pci_vtblk_debug;
#define DPRINTF(params) if (pci_vtblk_debug) printf params
#define WPRINTF(params) printf params
#define DPRINTF(params) if (pci_vtblk_debug) printf params
#define WPRINTF(params) printf params

struct pci_vtblk_ioreq {
struct blockif_req io_req;
Expand All @@ -146,6 +180,15 @@ struct pci_vtblk_ioreq {
uint16_t io_idx;
};

/*
 * Guest-supplied description of a range to discard.  For VBH_OP_DISCARD,
 * pci_vtblk_proc() reads one of these from the first data descriptor
 * instead of treating that descriptor as I/O data.  The name suggests it
 * is shared with write-zeroes requests, but no such use is visible here.
 *
 * NOTE(review): the C standard leaves bit-field allocation order up to the
 * implementation; this layout presumably relies on `unmap` landing in the
 * least-significant bit of the 32-bit flags word -- confirm for every
 * supported compiler/ABI.
 */
struct virtio_blk_discard_write_zeroes {
/* First sector of the range, in VTBLK_BSIZE-byte units. */
uint64_t sector;
/*
 * Length of the range, in VTBLK_BSIZE-byte units.  Discards larger than
 * VTBLK_MAX_DISCARD_SECT are failed with EINVAL.
 */
uint32_t num_sectors;
/*
 * Request flags.  A discard with either field non-zero is failed with
 * ENOTSUP.
 */
struct {
uint32_t unmap:1;
uint32_t reserved:31;
} flags;
};

/*
* Per-device softc
*/
Expand All @@ -154,6 +197,7 @@ struct pci_vtblk_softc {
pthread_mutex_t vsc_mtx;
struct vqueue_info vbsc_vq;
struct vtblk_config vbsc_cfg;
struct virtio_consts vbsc_consts;
struct blockif_ctxt *bc;
#ifndef __FreeBSD__
int vbsc_wce;
Expand Down Expand Up @@ -243,6 +287,7 @@ pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq)
int writeop, type;
struct iovec iov[BLOCKIF_IOV_MAX + 2];
uint16_t idx, flags[BLOCKIF_IOV_MAX + 2];
struct virtio_blk_discard_write_zeroes *discard;

n = vq_getchain(vq, &idx, iov, BLOCKIF_IOV_MAX + 2, flags);

Expand All @@ -262,7 +307,7 @@ pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq)
vbh = (struct virtio_blk_hdr *)iov[0].iov_base;
memcpy(&io->io_req.br_iov, &iov[1], sizeof(struct iovec) * (n - 2));
io->io_req.br_iovcnt = n - 2;
io->io_req.br_offset = vbh->vbh_sector * DEV_BSIZE;
io->io_req.br_offset = vbh->vbh_sector * VTBLK_BSIZE;
io->io_status = (uint8_t *)iov[--n].iov_base;
assert(iov[n].iov_len == 1);
assert(flags[n] & VRING_DESC_F_WRITE);
Expand All @@ -273,7 +318,7 @@ pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq)
* we don't advertise the capability.
*/
type = vbh->vbh_type & ~VBH_FLAG_BARRIER;
writeop = (type == VBH_OP_WRITE);
writeop = (type == VBH_OP_WRITE || type == VBH_OP_DISCARD);

iolen = 0;
for (i = 1; i < n; i++) {
Expand All @@ -289,7 +334,7 @@ pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq)
io->io_req.br_resid = iolen;

DPRINTF(("virtio-block: %s op, %zd bytes, %d segs, offset %ld\n\r",
writeop ? "write" : "read/ident", iolen, i - 1,
writeop ? "write/discard" : "read/ident", iolen, i - 1,
io->io_req.br_offset));

switch (type) {
Expand All @@ -299,6 +344,46 @@ pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq)
case VBH_OP_WRITE:
err = blockif_write(sc->bc, &io->io_req);
break;
case VBH_OP_DISCARD:
/*
* We currently only support a single request; if the guest
* has submitted a request that doesn't conform to the
* requirements, we return an error.
*/
if (iov[1].iov_len != sizeof (*discard)) {
pci_vtblk_done_locked(io, EINVAL);
return;
}

/* The segments to discard are provided rather than data */
discard = (struct virtio_blk_discard_write_zeroes *)
iov[1].iov_base;

/*
* virtio v1.1 5.2.6.2:
* The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP
* for discard and write zeroes commands if any unknown flag is
* set. Furthermore, the device MUST set the status byte to
* VIRTIO_BLK_S_UNSUPP for discard commands if the unmap flag
* is set.
*
* Currently there are no known flags for a DISCARD request.
*/
if (discard->flags.unmap != 0 || discard->flags.reserved != 0) {
pci_vtblk_done_locked(io, ENOTSUP);
return;
}

/* Make sure the request doesn't exceed our size limit */
if (discard->num_sectors > VTBLK_MAX_DISCARD_SECT) {
pci_vtblk_done_locked(io, EINVAL);
return;
}

io->io_req.br_offset = discard->sector * VTBLK_BSIZE;
io->io_req.br_resid = discard->num_sectors * VTBLK_BSIZE;
err = blockif_delete(sc->bc, &io->io_req);
break;
case VBH_OP_FLUSH:
case VBH_OP_FLUSH_OUT:
err = blockif_flush(sc->bc, &io->io_req);
Expand Down Expand Up @@ -367,6 +452,10 @@ pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
io->io_idx = i;
}

bcopy(&vtblk_vi_consts, &sc->vbsc_consts, sizeof (vtblk_vi_consts));
if (blockif_candelete(sc->bc))
sc->vbsc_consts.vc_hv_caps |= VTBLK_F_DISCARD;

#ifndef __FreeBSD__
/* Disable write cache until FLUSH feature is negotiated */
(void) blockif_set_wce(sc->bc, 0);
Expand All @@ -376,7 +465,7 @@ pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
pthread_mutex_init(&sc->vsc_mtx, NULL);

/* init virtio softc and virtqueues */
vi_softc_linkup(&sc->vbsc_vs, &vtblk_vi_consts, sc, pi, &sc->vbsc_vq);
vi_softc_linkup(&sc->vbsc_vs, &sc->vbsc_consts, sc, pi, &sc->vbsc_vq);
sc->vbsc_vs.vs_mtx = &sc->vsc_mtx;

sc->vbsc_vq.vq_qsize = VTBLK_RINGSZ;
Expand All @@ -394,7 +483,7 @@ pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]);

/* setup virtio block config space */
sc->vbsc_cfg.vbc_capacity = size / DEV_BSIZE; /* 512-byte units */
sc->vbsc_cfg.vbc_capacity = size / VTBLK_BSIZE; /* 512-byte units */
sc->vbsc_cfg.vbc_size_max = 0; /* not negotiated */

/*
Expand All @@ -416,6 +505,9 @@ pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
sc->vbsc_cfg.vbc_topology.min_io_size = 0;
sc->vbsc_cfg.vbc_topology.opt_io_size = 0;
sc->vbsc_cfg.vbc_writeback = 0;
sc->vbsc_cfg.max_discard_sectors = VTBLK_MAX_DISCARD_SECT;
sc->vbsc_cfg.max_discard_seg = VTBLK_MAX_DISCARD_SEG;
sc->vbsc_cfg.discard_sector_alignment = sectsz / VTBLK_BSIZE;

/*
* Should we move some of this into virtio.c? Could
Expand Down

0 comments on commit bfe191c

Please sign in to comment.