Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
mm/memfd: Introduce userspace inaccessible memfd
KVM can use memfd-provided memory for guest memory. For normal userspace accessible memory, KVM userspace (e.g. QEMU) mmaps the memfd into its virtual address space and then tells KVM to use the virtual address to set up the mapping in the secondary page table (e.g. EPT). With confidential computing technologies like Intel TDX, the memfd-provided memory may be encrypted with a special key for a special software domain (e.g. a KVM guest) and is not expected to be directly accessed by userspace. More precisely, userspace access to such encrypted memory may lead to a host crash, so it should be prevented. This patch introduces the userspace inaccessible memfd (created with MFD_INACCESSIBLE). Its memory is inaccessible from userspace through ordinary MMU access (e.g. read/write/mmap) but can be accessed via an in-kernel interface, so KVM can directly interact with core-mm without the need to map the memory into KVM userspace. It provides the semantics required for KVM guest private (encrypted) memory support: a file descriptor with this flag set is going to be used as the source of guest memory in confidential computing environments such as Intel TDX/AMD SEV. KVM userspace is still in charge of the lifecycle of the memfd. It should pass the opened fd to KVM. KVM uses the kernel APIs newly added in this patch to obtain the physical memory address and then populate the secondary page table entries. The userspace inaccessible memfd can be fallocate-ed and hole-punched from userspace. When hole-punching happens, KVM can get notified through inaccessible_notifier, which gives it a chance to remove any mapped entries of the range in the secondary page tables. The userspace inaccessible memfd itself is implemented as a shim layer on top of real memory file systems like tmpfs/hugetlbfs, but this patch only implements tmpfs. The allocated memory is currently marked as unmovable and unevictable; this is required for current confidential usage, but in the future this might be changed.
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com> Link: https://lore.kernel.org/r/20220915142913.2213336-2-chao.p.peng@linux.intel.com
- Loading branch information
Showing
6 changed files
with
270 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,219 @@ | ||
// SPDX-License-Identifier: GPL-2.0 | ||
#include "linux/sbitmap.h" | ||
#include <linux/memfd.h> | ||
#include <linux/pagemap.h> | ||
#include <linux/pseudo_fs.h> | ||
#include <linux/shmem_fs.h> | ||
#include <uapi/linux/falloc.h> | ||
#include <uapi/linux/magic.h> | ||
|
||
struct inaccessible_data { | ||
struct mutex lock; | ||
struct file *memfd; | ||
struct list_head notifiers; | ||
}; | ||
|
||
/*
 * Invoke the ->invalidate() callback of every notifier registered on @data,
 * handing each one the @start and @end values unchanged. The list is walked
 * with data->lock held.
 */
static void inaccessible_notifier_invalidate(struct inaccessible_data *data,
					     pgoff_t start, pgoff_t end)
{
	struct inaccessible_notifier *n;

	mutex_lock(&data->lock);
	list_for_each_entry(n, &data->notifiers, list)
		n->ops->invalidate(n, start, end);
	mutex_unlock(&data->lock);
}
|
||
static int inaccessible_release(struct inode *inode, struct file *file) | ||
{ | ||
struct inaccessible_data *data = inode->i_mapping->private_data; | ||
|
||
fput(data->memfd); | ||
kfree(data); | ||
return 0; | ||
} | ||
|
||
/*
 * ->fallocate() for the shim file: forward the request to the backing memfd
 * and then tell every registered notifier about the affected range.
 *
 * @file:   the inaccessible (shim) file
 * @mode:   FALLOC_FL_* flags, passed through to the backing file
 * @offset: start of the range in bytes
 * @len:    length of the range in bytes
 *
 * Hole punching requires @offset and @len to be page aligned; otherwise
 * -EINVAL is returned without touching the backing file.
 *
 * Returns whatever the backing file's ->fallocate() returns.
 */
static long inaccessible_fallocate(struct file *file, int mode,
				   loff_t offset, loff_t len)
{
	struct inaccessible_data *data = file->f_mapping->private_data;
	struct file *memfd = data->memfd;
	pgoff_t start, end;
	int ret;

	if (mode & FALLOC_FL_PUNCH_HOLE) {
		if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
			return -EINVAL;
	}

	/*
	 * The notifier interface takes page offsets (pgoff_t), not byte
	 * offsets, so convert before notifying. The end is rounded up so a
	 * partially covered last page (possible when not punching a hole) is
	 * still included.
	 */
	start = offset >> PAGE_SHIFT;
	end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT;

	ret = memfd->f_op->fallocate(memfd, mode, offset, len);
	/* Notifiers are called regardless of @ret, as before. */
	inaccessible_notifier_invalidate(data, start, end);
	return ret;
}
|
||
static const struct file_operations inaccessible_fops = { | ||
.release = inaccessible_release, | ||
.fallocate = inaccessible_fallocate, | ||
}; | ||
|
||
static int inaccessible_getattr(struct user_namespace *mnt_userns, | ||
const struct path *path, struct kstat *stat, | ||
u32 request_mask, unsigned int query_flags) | ||
{ | ||
struct inode *inode = d_inode(path->dentry); | ||
struct inaccessible_data *data = inode->i_mapping->private_data; | ||
struct file *memfd = data->memfd; | ||
|
||
return memfd->f_inode->i_op->getattr(mnt_userns, path, stat, | ||
request_mask, query_flags); | ||
} | ||
|
||
static int inaccessible_setattr(struct user_namespace *mnt_userns, | ||
struct dentry *dentry, struct iattr *attr) | ||
{ | ||
struct inode *inode = d_inode(dentry); | ||
struct inaccessible_data *data = inode->i_mapping->private_data; | ||
struct file *memfd = data->memfd; | ||
int ret; | ||
|
||
if (attr->ia_valid & ATTR_SIZE) { | ||
if (memfd->f_inode->i_size) | ||
return -EPERM; | ||
|
||
if (!PAGE_ALIGNED(attr->ia_size)) | ||
return -EINVAL; | ||
} | ||
|
||
ret = memfd->f_inode->i_op->setattr(mnt_userns, | ||
file_dentry(memfd), attr); | ||
return ret; | ||
} | ||
|
||
static const struct inode_operations inaccessible_iops = { | ||
.getattr = inaccessible_getattr, | ||
.setattr = inaccessible_setattr, | ||
}; | ||
|
||
static int inaccessible_init_fs_context(struct fs_context *fc) | ||
{ | ||
if (!init_pseudo(fc, INACCESSIBLE_MAGIC)) | ||
return -ENOMEM; | ||
|
||
fc->s_iflags |= SB_I_NOEXEC; | ||
return 0; | ||
} | ||
|
||
static struct file_system_type inaccessible_fs = { | ||
.owner = THIS_MODULE, | ||
.name = "[inaccessible]", | ||
.init_fs_context = inaccessible_init_fs_context, | ||
.kill_sb = kill_anon_super, | ||
}; | ||
|
||
static struct vfsmount *inaccessible_mnt; | ||
|
||
static __init int inaccessible_init(void) | ||
{ | ||
inaccessible_mnt = kern_mount(&inaccessible_fs); | ||
if (IS_ERR(inaccessible_mnt)) | ||
return PTR_ERR(inaccessible_mnt); | ||
return 0; | ||
} | ||
fs_initcall(inaccessible_init); | ||
|
||
struct file *memfd_mkinaccessible(struct file *memfd) | ||
{ | ||
struct inaccessible_data *data; | ||
struct address_space *mapping; | ||
struct inode *inode; | ||
struct file *file; | ||
|
||
data = kzalloc(sizeof(*data), GFP_KERNEL); | ||
if (!data) | ||
return ERR_PTR(-ENOMEM); | ||
|
||
data->memfd = memfd; | ||
mutex_init(&data->lock); | ||
INIT_LIST_HEAD(&data->notifiers); | ||
|
||
inode = alloc_anon_inode(inaccessible_mnt->mnt_sb); | ||
if (IS_ERR(inode)) { | ||
kfree(data); | ||
return ERR_CAST(inode); | ||
} | ||
|
||
inode->i_mode |= S_IFREG; | ||
inode->i_op = &inaccessible_iops; | ||
inode->i_mapping->private_data = data; | ||
|
||
file = alloc_file_pseudo(inode, inaccessible_mnt, | ||
"[memfd:inaccessible]", O_RDWR, | ||
&inaccessible_fops); | ||
if (IS_ERR(file)) { | ||
iput(inode); | ||
kfree(data); | ||
} | ||
|
||
file->f_flags |= O_LARGEFILE; | ||
|
||
mapping = memfd->f_mapping; | ||
mapping_set_unevictable(mapping); | ||
mapping_set_gfp_mask(mapping, | ||
mapping_gfp_mask(mapping) & ~__GFP_MOVABLE); | ||
|
||
return file; | ||
} | ||
|
||
void inaccessible_register_notifier(struct file *file, | ||
struct inaccessible_notifier *notifier) | ||
{ | ||
struct inaccessible_data *data = file->f_mapping->private_data; | ||
|
||
mutex_lock(&data->lock); | ||
list_add(¬ifier->list, &data->notifiers); | ||
mutex_unlock(&data->lock); | ||
} | ||
EXPORT_SYMBOL_GPL(inaccessible_register_notifier); | ||
|
||
void inaccessible_unregister_notifier(struct file *file, | ||
struct inaccessible_notifier *notifier) | ||
{ | ||
struct inaccessible_data *data = file->f_mapping->private_data; | ||
|
||
mutex_lock(&data->lock); | ||
list_del(¬ifier->list); | ||
mutex_unlock(&data->lock); | ||
} | ||
EXPORT_SYMBOL_GPL(inaccessible_unregister_notifier); | ||
|
||
/*
 * Look up the page at page offset @offset in the backing memfd and report
 * its pfn and order.
 *
 * @file:   inaccessible (shim) file
 * @offset: page offset into the backing file
 * @pfn:    out: pfn of the page returned by shmem_getpage()
 * @order:  out: thp_order() of that page's compound head
 *
 * The page is obtained with shmem_getpage(..., SGP_WRITE), marked uptodate
 * and unlocked before returning. inaccessible_put_pfn() calls put_page() on
 * the page for a pfn; presumably callers pair it with this function -- TODO
 * confirm the reference semantics of shmem_getpage().
 *
 * Returns 0 on success or the error returned by shmem_getpage().
 */
int inaccessible_get_pfn(struct file *file, pgoff_t offset, pfn_t *pfn,
			 int *order)
{
	struct inaccessible_data *data = file->f_mapping->private_data;
	struct inode *backing_inode = file_inode(data->memfd);
	struct page *pg;
	int err;

	err = shmem_getpage(backing_inode, offset, &pg, SGP_WRITE);
	if (err)
		return err;

	*pfn = page_to_pfn_t(pg);
	*order = thp_order(compound_head(pg));
	SetPageUptodate(pg);
	unlock_page(pg);

	return 0;
}
EXPORT_SYMBOL_GPL(inaccessible_get_pfn);
|
||
/*
 * Drop a page reference for @pfn via put_page().
 *
 * @file: unused here
 * @pfn:  pfn whose page is to be released
 *
 * Warns once and does nothing when @pfn does not translate to a page.
 */
void inaccessible_put_pfn(struct file *file, pfn_t pfn)
{
	struct page *pg = pfn_t_to_page(pfn);

	if (!WARN_ON_ONCE(!pg))
		put_page(pg);
}
EXPORT_SYMBOL_GPL(inaccessible_put_pfn);