Skip to content
Permalink
Browse files
iommufd: IOCTLs for the io_pagetable
Connect the IO Address Space Pagetable to its IOCTL interface. This
exposes most of the functionality in the io_pagetable to userspace.

This is intended to be the core of the generic interface that IOMMUFD will
provide. Every IOMMU driver should be able to implement an iommu_domain
that is compatible with this generic mechanism.

It is also designed to be easy to use for simple non
virtual-machine-monitor users, like DPDK. The kernel provides universal
support for all IOMMUs (no PPC special path), an IOVA allocator, and
allows any number or configuration of devices to be connected to the IOAS,
simplifying these applications.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
  • Loading branch information
jgunthorpe committed Nov 16, 2021
1 parent 60716d1 commit 7188450644ef414f96455c9f15043990762078a6
Show file tree
Hide file tree
Showing 5 changed files with 390 additions and 0 deletions.
@@ -1,4 +1,5 @@
# SPDX-License-Identifier: GPL-2.0-only
# Objects composing the iommufd module: the io_pagetable core, the IOAS
# pagetable ioctl handlers, and the main fd/ioctl dispatch.
obj-$(CONFIG_IOMMUFD) += \
io_pagetable.o \
ioas_pt.o \
iommufd.o
@@ -0,0 +1,262 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES
*/
#include <linux/interval_tree.h>
#include <linux/iommufd.h>
#include <linux/iommu.h>
#include <uapi/linux/iommufd.h>

#include "iommufd_private.h"

/*
 * Resolve a userspace-provided object ID into its IOAS pagetable.
 * NOTE(review): an ERR_PTR from iommufd_get_object() flows through
 * container_of() unchanged — presumably 'obj' is the first member of
 * struct iommufd_ioas_pagetable so callers' IS_ERR() checks still work;
 * confirm against the struct layout.
 */
static inline struct iommufd_ioas_pagetable *
get_ioas_pagetable(struct iommufd_ucmd *ucmd, u32 id)
{
	struct iommufd_object *obj;

	obj = iommufd_get_object(ucmd->ictx, id, IOMMUFD_OBJ_IOAS_PAGETABLE);
	return container_of(obj, struct iommufd_ioas_pagetable, obj);
}

/* Object-ops destructor: tear down the io_pagetable backing this IOAS. */
void iommufd_ioas_pagetable_destroy(struct iommufd_object *obj)
{
	struct iommufd_ioas_pagetable *ioas;

	ioas = container_of(obj, struct iommufd_ioas_pagetable, obj);
	iopt_destroy_table(&ioas->iopt);
}

/*
 * Handler for IOMMU_IOAS_PAGETABLE_ALLOC: create a new, empty IOAS and
 * return its ID to userspace in cmd->out_ioas_id.
 * Returns 0 on success or a negative errno.
 */
int iommufd_ioas_pagetable_alloc(struct iommufd_ucmd *ucmd)
{
	struct iommu_ioas_pagetable_alloc *cmd = ucmd->cmd;
	struct iommufd_ioas_pagetable *ioas;
	int rc;

	/* No flags are defined for this ioctl yet */
	if (cmd->flags)
		return -EOPNOTSUPP;

	ioas = iommufd_object_alloc_ucmd(ucmd, ioas,
					 IOMMUFD_OBJ_IOAS_PAGETABLE);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	rc = iopt_init_table(&ioas->iopt);
	if (rc)
		return rc;

	INIT_LIST_HEAD(&ioas->auto_domains);
	cmd->out_ioas_id = ioas->obj.id;

	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
	if (!rc)
		return 0;

	/* Response copy-out failed; unwind the table we just initialized */
	iopt_destroy_table(&ioas->iopt);
	return rc;
}

/*
 * Handler for IOMMU_IOAS_PAGETABLE_IOVA_RANGES: report the currently valid
 * IOVA ranges of an IOAS to userspace. The ranges are written into the
 * variable-length tail of the user's struct; out_num_iovas always reports
 * the total number of ranges, even when the user buffer was too small to
 * hold them all (so userspace can retry with a bigger buffer).
 * Returns 0 on success or a negative errno.
 */
int iommufd_ioas_pagetable_iova_ranges(struct iommufd_ucmd *ucmd)
{
	struct iommu_ioas_pagetable_iova_ranges __user *uptr = ucmd->ubuffer;
	struct iommu_ioas_pagetable_iova_ranges *cmd = ucmd->cmd;
	struct interval_tree_node *valid_iova;
	struct iommufd_ioas_pagetable *ioaspt;
	u32 max_iovas;
	int rc;

	/*
	 * FIXME: we could recalculate the valid_iova_itree here based on
	 * current information. This would provide a way for userspace to
	 * recover from the narrowing after a domain or group is removed.
	 */

	if (cmd->__reserved)
		return -EOPNOTSUPP;

	/*
	 * The space past the fixed header of the user's struct holds the
	 * output array. It must be a whole number of entries, and the byte
	 * count must be converted to an element count before it is compared
	 * against out_num_iovas below; otherwise the loop would put_user()
	 * far beyond the buffer userspace actually provided.
	 */
	max_iovas = cmd->size - sizeof(*cmd);
	if (max_iovas % sizeof(cmd->out_valid_iovas[0]))
		return -EINVAL;
	max_iovas /= sizeof(cmd->out_valid_iovas[0]);

	ioaspt = get_ioas_pagetable(ucmd, cmd->ioas_id);
	if (IS_ERR(ioaspt))
		return PTR_ERR(ioaspt);

	down_read(&ioaspt->iopt.rwsem);
	/*
	 * out_num_iovas is output-only, but the whole struct was copied in
	 * from userspace; zero it so a garbage input value cannot skew the
	 * bounds check or the count reported back.
	 */
	cmd->out_num_iovas = 0;
	for (valid_iova = interval_tree_iter_first(
		     &ioaspt->iopt.valid_iova_itree, 0, ULONG_MAX);
	     valid_iova;
	     valid_iova = interval_tree_iter_next(valid_iova, 0, ULONG_MAX)) {
		if (cmd->out_num_iovas < max_iovas) {
			rc = put_user((u64)valid_iova->start,
				      &uptr->out_valid_iovas[cmd->out_num_iovas]
					       .start);
			if (rc)
				goto out_put;
			rc = put_user(
				(u64)valid_iova->last,
				&uptr->out_valid_iovas[cmd->out_num_iovas].last);
			if (rc)
				goto out_put;
		}
		cmd->out_num_iovas++;
	}
	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
out_put:
	up_read(&ioaspt->iopt.rwsem);
	iommufd_put_object(&ioaspt->obj);
	return rc;
}

/* Translate uAPI MAP_* flags into kernel IOMMU_* protection bits. */
static int conv_iommu_prot(u32 map_flags)
{
	/*
	 * No manual cache-coherency ioctls are exposed to userspace, and on
	 * most architectures the CPU cache-flush instructions are privileged.
	 * The underlying IOMMU is therefore required to operate CPU-coherently,
	 * so IOMMU_CACHE is always requested.
	 */
	int prot = IOMMU_CACHE;

	if (map_flags & IOMMU_IOAS_PAGETABLE_MAP_READABLE)
		prot |= IOMMU_READ;
	if (map_flags & IOMMU_IOAS_PAGETABLE_MAP_WRITEABLE)
		prot |= IOMMU_WRITE;
	return prot;
}

/*
 * Handler for IOMMU_IOAS_PAGETABLE_MAP: map a range of user memory into the
 * IOAS at either a caller-fixed IOVA or an allocator-chosen one (returned in
 * cmd->iova). Returns 0 on success or a negative errno.
 */
int iommufd_ioas_pagetable_map(struct iommufd_ucmd *ucmd)
{
	struct iommu_ioas_pagetable_map *cmd = ucmd->cmd;
	struct iommufd_ioas_pagetable *ioas;
	int rc;

	if ((cmd->flags & ~(IOMMU_IOAS_PAGETABLE_MAP_FIXED_IOVA |
			    IOMMU_IOAS_PAGETABLE_MAP_WRITEABLE |
			    IOMMU_IOAS_PAGETABLE_MAP_READABLE)) ||
	    cmd->__reserved)
		return -EOPNOTSUPP;
	/* The io_pagetable works in unsigned long; reject 64-bit-only values */
	if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX)
		return -EOVERFLOW;

	ioas = get_ioas_pagetable(ucmd, cmd->ioas_id);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	down_write(&ioas->iopt.rwsem);
	if (!(cmd->flags & IOMMU_IOAS_PAGETABLE_MAP_FIXED_IOVA)) {
		unsigned long alloced_iova;

		/* Let the allocator pick the IOVA and report it back */
		rc = iopt_alloc_iova(&ioas->iopt, &alloced_iova, cmd->length);
		if (rc)
			goto out_unlock;
		cmd->iova = alloced_iova;
	}

	rc = iopt_map_user_pages(&ioas->iopt, cmd->iova,
				 u64_to_user_ptr(cmd->user_va), cmd->length,
				 conv_iommu_prot(cmd->flags));
	if (rc)
		goto out_unlock;
	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
out_unlock:
	up_write(&ioas->iopt.rwsem);
	iommufd_put_object(&ioas->obj);
	return rc;
}

/*
 * Handler for IOMMU_IOAS_PAGETABLE_COPY: map into dst_ioas the same pages
 * already mapped at [src_iova, src_iova+length) in src_ioas, either at a
 * caller-fixed dst_iova or at an allocator-chosen one (reported back in
 * cmd->dst_iova). Returns 0 on success or a negative errno.
 */
int iommufd_ioas_pagetable_copy(struct iommufd_ucmd *ucmd)
{
	struct iommu_ioas_pagetable_copy *cmd = ucmd->cmd;
	struct iommufd_ioas_pagetable *src_ioaspt;
	struct iommufd_ioas_pagetable *dst_ioaspt;
	struct iopt_pages *pages;
	int rc;

	/* Only the MAP_* flags are meaningful here; __reserved is not checked
	 * (NOTE(review): map/unmap reject a non-zero __reserved — confirm
	 * whether this struct has one and should do the same). */
	if ((cmd->flags & ~(IOMMU_IOAS_PAGETABLE_MAP_FIXED_IOVA |
			    IOMMU_IOAS_PAGETABLE_MAP_WRITEABLE |
			    IOMMU_IOAS_PAGETABLE_MAP_READABLE)))
		return -EOPNOTSUPP;
	/* The io_pagetable works in unsigned long; reject 64-bit-only values */
	if (cmd->length >= ULONG_MAX)
		return -EOVERFLOW;

	src_ioaspt = get_ioas_pagetable(ucmd, cmd->src_ioas_id);
	if (IS_ERR(src_ioaspt))
		return PTR_ERR(src_ioaspt);
	/* Grab a reference to the backing pages, then drop the src object —
	 * the pages keep the copy valid even if src is destroyed after this */
	pages = iopt_get_pages(&src_ioaspt->iopt, cmd->src_iova, cmd->length);
	iommufd_put_object(&src_ioaspt->obj);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	dst_ioaspt = get_ioas_pagetable(ucmd, cmd->dst_ioas_id);
	if (IS_ERR(dst_ioaspt))
		return PTR_ERR(dst_ioaspt);

	down_write(&dst_ioaspt->iopt.rwsem);
	if (!(cmd->flags & IOMMU_IOAS_PAGETABLE_MAP_FIXED_IOVA)) {
		unsigned long iova;

		/* Let the dst allocator pick the IOVA; reported via dst_iova */
		rc = iopt_alloc_iova(&dst_ioaspt->iopt, &iova, cmd->length);
		if (rc)
			goto out_unlock;
		cmd->dst_iova = iova;
	}

	/* NOTE(review): 'pages' is not released on the error paths below —
	 * presumably iopt_copy_iova consumes the reference in all cases, or
	 * iopt_get_pages returns a borrowed pointer; otherwise this leaks.
	 * Confirm against the io_pagetable implementation. */
	rc = iopt_copy_iova(&dst_ioaspt->iopt, pages, cmd->dst_iova,
			    conv_iommu_prot(cmd->flags));
	if (rc)
		goto out_unlock;
	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
out_unlock:
	up_write(&dst_ioaspt->iopt.rwsem);
	iommufd_put_object(&dst_ioaspt->obj);
	return rc;
}

/* FIXME: VFIO_DMA_MAP_FLAG_VADDR
* https://lore.kernel.org/kvm/1611939252-7240-1-git-send-email-steven.sistare@oracle.com/
* Wow, what a wild feature. This should be implemetned by allowing a iopt_pages
* to be associated with a memfd. It can then source mapping requests directly
* from a memfd without going through a mm_struct and thus doesn't care that the
* original qemu exec'd itself. The idea that userspace can flip a flag and
* cause kernerl users to block indefinately is unacceptable.
*
* For VFIO compat we implement this in a slightly different way, creating a
* access_user that spans the whole area will immediately stop new faults as
* they will be handled from the xarray. We can then reparent the iopt_pages to
* the new mm_struct and undo the access_user. No blockage of kernel users
* required, does require filling the xarray with pages though.
*/

/*
 * Handler for IOMMU_IOAS_PAGETABLE_UNMAP: remove mappings from an IOAS.
 * The sentinel (iova == 0, length == U64_MAX) means "unmap everything".
 * Returns 0 on success or a negative errno.
 */
int iommufd_ioas_pagetable_unmap(struct iommufd_ucmd *ucmd)
{
	struct iommu_ioas_pagetable_unmap *cmd = ucmd->cmd;
	struct iommufd_ioas_pagetable *ioas;
	int rc;

	ioas = get_ioas_pagetable(ucmd, cmd->ioas_id);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	if (cmd->iova == 0 && cmd->length == U64_MAX) {
		/* Whole-IOAS wipe; no range validation needed */
		rc = iopt_unmap_all(&ioas->iopt);
		goto out_put;
	}

	/* The io_pagetable works in unsigned long; reject 64-bit-only values */
	if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX) {
		rc = -EOVERFLOW;
		goto out_put;
	}
	rc = iopt_unmap_iova(&ioas->iopt, cmd->iova, cmd->length);

out_put:
	iommufd_put_object(&ioas->obj);
	return rc;
}

/* FIXME: VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP I think everything with dirty
* tracking should be in its own ioctl, not muddled in unmap. If we want to
* atomically unmap and get the dirty bitmap it should be a flag in the dirty
* tracking ioctl, not here in unmap. Overall dirty tracking needs a careful
* review along side HW drivers implementing it.
*/
@@ -192,6 +192,10 @@ static int iommufd_fops_release(struct inode *inode, struct file *filp)
}

union ucmd_buffer {
struct iommu_ioas_pagetable_alloc alloc;
struct iommu_ioas_pagetable_iova_ranges iova_ranges;
struct iommu_ioas_pagetable_map map;
struct iommu_ioas_pagetable_unmap unmap;
struct iommu_destroy destroy;
};

@@ -214,6 +218,17 @@ struct iommufd_ioctl_op {
/*
 * Dispatch table mapping each ioctl command to its handler, payload struct,
 * and the last field the kernel reads (for size/compat checking by the
 * IOCTL_OP macro). NOTE(review): if this array is indexed by command number
 * rather than searched, the entry order must match the uAPI command values —
 * do not reorder without checking the dispatcher.
 */
static struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
	IOCTL_OP(IOMMUFD_CMD_DESTROY, iommufd_destroy, struct iommu_destroy,
		 id),
	IOCTL_OP(IOMMU_IOAS_PAGETABLE_ALLOC, iommufd_ioas_pagetable_alloc,
		 struct iommu_ioas_pagetable_alloc, out_ioas_id),
	IOCTL_OP(IOMMU_IOAS_PAGETABLE_COPY, iommufd_ioas_pagetable_copy,
		 struct iommu_ioas_pagetable_copy, src_iova),
	IOCTL_OP(IOMMU_IOAS_PAGETABLE_IOVA_RANGES,
		 iommufd_ioas_pagetable_iova_ranges,
		 struct iommu_ioas_pagetable_iova_ranges, __reserved),
	IOCTL_OP(IOMMU_IOAS_PAGETABLE_MAP, iommufd_ioas_pagetable_map,
		 struct iommu_ioas_pagetable_map, __reserved),
	IOCTL_OP(IOMMU_IOAS_PAGETABLE_UNMAP, iommufd_ioas_pagetable_unmap,
		 struct iommu_ioas_pagetable_unmap, length),
};

static long iommufd_fops_ioctl(struct file *filp, unsigned int cmd,
@@ -282,6 +297,9 @@ struct iommufd_ctx *iommufd_fget(int fd)
}

/* Per-object-type vtable, indexed by enum iommufd_object_type. */
static struct iommufd_object_ops iommufd_object_ops[] = {
	[IOMMUFD_OBJ_IOAS_PAGETABLE] = {
		.destroy = iommufd_ioas_pagetable_destroy,
	},
};

static struct miscdevice iommu_misc_dev = {
@@ -87,6 +87,7 @@ static inline int iommufd_ucmd_respond(struct iommufd_ucmd *ucmd,
/*
 * Types of objects addressable by user-visible IDs. Values index
 * iommufd_object_ops; IOMMUFD_OBJ_ANY aliases NONE for lookups that
 * accept any type.
 */
enum iommufd_object_type {
	IOMMUFD_OBJ_NONE,
	IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE,
	IOMMUFD_OBJ_IOAS_PAGETABLE,
	IOMMUFD_OBJ_MAX,
};

@@ -141,4 +142,21 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
__tmp; \
})

/*
 * The IO Address Space (IOAS) pagetable is a virtual page table backed by the
 * io_pagetable object. It is a user controlled mapping of IOVA -> PFNs. The
 * mapping is copied into all of the associated domains and SW users.
 */
struct iommufd_ioas_pagetable {
	struct iommufd_object obj;	/* refcount/ID bookkeeping; embedded first */
	struct io_pagetable iopt;	/* the IOVA -> PFN mapping itself */
	struct list_head auto_domains;	/* domains auto-created for attached devices */
};

/* ioctl handlers implemented in ioas_pt.c; each takes the decoded command */
int iommufd_ioas_pagetable_alloc(struct iommufd_ucmd *ucmd);
void iommufd_ioas_pagetable_destroy(struct iommufd_object *obj);
int iommufd_ioas_pagetable_iova_ranges(struct iommufd_ucmd *ucmd);
int iommufd_ioas_pagetable_map(struct iommufd_ucmd *ucmd);
int iommufd_ioas_pagetable_copy(struct iommufd_ucmd *ucmd);
int iommufd_ioas_pagetable_unmap(struct iommufd_ucmd *ucmd);
#endif

0 comments on commit 7188450

Please sign in to comment.