From 14079b26d8927db4530e2ecb5fef7989c73f0fa1 Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Tue, 7 Jun 2022 10:12:26 -0700 Subject: [PATCH 01/34] ocfs2: kill EBUSY from dlmfs_evict_inode When unlinking a dlmfs, first it will invoke dlmfs_unlink(), and then invoke dlmfs_evict_inode(), user_dlm_destroy_lock() is invoked in both places, the second one from dlmfs_evict_inode() will get EBUSY error because USER_LOCK_IN_TEARDOWN is already set in lockres. This doesn't affect any function, just the error log is annoying. Link: https://lkml.kernel.org/r/20220607171226.86672-1-junxiao.bi@oracle.com Signed-off-by: Junxiao Bi Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/dlmfs/dlmfs.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index e360543ad7e714..8b2020f92b5f07 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -296,17 +296,25 @@ static void dlmfs_evict_inode(struct inode *inode) { int status; struct dlmfs_inode_private *ip; + struct user_lock_res *lockres; + int teardown; clear_inode(inode); mlog(0, "inode %lu\n", inode->i_ino); ip = DLMFS_I(inode); + lockres = &ip->ip_lockres; if (S_ISREG(inode->i_mode)) { - status = user_dlm_destroy_lock(&ip->ip_lockres); - if (status < 0) - mlog_errno(status); + spin_lock(&lockres->l_lock); + teardown = !!(lockres->l_flags & USER_LOCK_IN_TEARDOWN); + spin_unlock(&lockres->l_lock); + if (!teardown) { + status = user_dlm_destroy_lock(lockres); + if (status < 0) + mlog_errno(status); + } iput(ip->ip_parent); goto clear_fields; } From 9757b0b2b8fea04e47a6852b9ff4ddf0588d6e0a Mon Sep 17 00:00:00 2001 From: Gang He Date: Wed, 15 Jun 2022 16:08:21 -0700 Subject: [PATCH 02/34] ocfs2: reflink deadlock when clone file to the same directory simultaneously Running reflink from multiple nodes simultaneously to clone a file to the same directory probably triggers a deadlock issue. For example, there is a three node ocfs2 cluster, each node mounts the ocfs2 file system to /mnt/shared, and run the reflink command from each node repeatedly, like reflink "/mnt/shared/test" \ "/mnt/shared/.snapshots/test.`date +%m%d%H%M%S`.`hostname`" then, reflink command process will be hung on each node, and you can't list this file system directory. The problematic reflink command process is blocked at one node, task:reflink state:D stack: 0 pid: 1283 ppid: 4154 Call Trace: __schedule+0x2fd/0x750 schedule+0x2f/0xa0 schedule_timeout+0x1cc/0x310 ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user] ? 0xffffffffc0e3e000 wait_for_completion+0xba/0x140 ? wake_up_q+0xa0/0xa0 __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2] ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2] ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2] ocfs2_init_security_and_acl+0xbe/0x1d0 [ocfs2] ocfs2_reflink+0x436/0x4c0 [ocfs2] ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2] ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2] ocfs2_ioctl+0x25e/0x670 [ocfs2] do_vfs_ioctl+0xa0/0x680 ksys_ioctl+0x70/0x80 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x5b/0x1e0 The other reflink command processes are blocked at other nodes, task:reflink state:D stack: 0 pid:29759 ppid: 4088 Call Trace: __schedule+0x2fd/0x750 schedule+0x2f/0xa0 schedule_timeout+0x1cc/0x310 ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user] ? 0xffffffffc0b19000 wait_for_completion+0xba/0x140 ? wake_up_q+0xa0/0xa0 __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2] ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2] ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2] ocfs2_mv_orphaned_inode_to_new+0x87/0x7e0 [ocfs2] ocfs2_reflink+0x335/0x4c0 [ocfs2] ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2] ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2] ocfs2_ioctl+0x25e/0x670 [ocfs2] do_vfs_ioctl+0xa0/0x680 ksys_ioctl+0x70/0x80 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x5b/0x1e0 or task:reflink state:D stack: 0 pid:18465 ppid: 4156 Call Trace: __schedule+0x302/0x940 ? usleep_range+0x80/0x80 schedule+0x46/0xb0 schedule_timeout+0xff/0x140 ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user] ? 0xffffffffc0c3b000 __wait_for_common+0xb9/0x170 __ocfs2_cluster_lock.constprop.0+0x1d6/0x860 [ocfs2] ? ocfs2_wait_for_recovery+0x49/0xd0 [ocfs2] ? ocfs2_inode_lock_full_nested+0x30f/0xa50 [ocfs2] ocfs2_inode_lock_full_nested+0x30f/0xa50 [ocfs2] ocfs2_inode_lock_tracker+0xf2/0x2b0 [ocfs2] ? dput+0x32/0x2f0 ocfs2_permission+0x45/0xe0 [ocfs2] inode_permission+0xcc/0x170 link_path_walk.part.0.constprop.0+0x2a2/0x380 ? path_init+0x2c1/0x3f0 path_parentat+0x3c/0x90 filename_parentat+0xc1/0x1d0 ? filename_lookup+0x138/0x1c0 filename_create+0x43/0x160 ocfs2_reflink_ioctl+0xe6/0x380 [ocfs2] ocfs2_ioctl+0x1ea/0x2c0 [ocfs2] ? do_sys_openat2+0x81/0x150 __x64_sys_ioctl+0x82/0xb0 do_syscall_64+0x61/0xb0 The deadlock is caused by multiple acquiring the destination directory inode dlm lock in ocfs2_reflink function, we should acquire this directory inode dlm lock at the beginning, and hold this dlm lock until end of the function. Link: https://lkml.kernel.org/r/20210729110230.18983-1-ghe@suse.com Signed-off-by: Gang He Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/namei.c | 32 +++++++++++++------------------- fs/ocfs2/namei.h | 2 ++ fs/ocfs2/refcounttree.c | 15 +++++++++++---- fs/ocfs2/xattr.c | 12 +----------- fs/ocfs2/xattr.h | 1 + 5 files changed, 28 insertions(+), 34 deletions(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index c75fd54b91854b..e3dd30dd3547ff 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -2489,6 +2489,7 @@ static int ocfs2_prep_new_orphaned_file(struct inode *dir, } int ocfs2_create_inode_in_orphan(struct inode *dir, + struct buffer_head **dir_bh, int mode, struct inode **new_inode) { @@ -2597,13 +2598,16 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, brelse(new_di_bh); - if (!status) - *new_inode = inode; - ocfs2_free_dir_lookup_result(&orphan_insert); - ocfs2_inode_unlock(dir, 1); - brelse(parent_di_bh); + if (!status) { + *new_inode = inode; + *dir_bh = parent_di_bh; + } else { + ocfs2_inode_unlock(dir, 1); + brelse(parent_di_bh); + } + return status; } @@ -2760,11 +2764,11 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, } int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, + struct buffer_head *dir_bh, struct inode *inode, struct dentry *dentry) { int status = 0; - struct buffer_head *parent_di_bh = NULL; handle_t *handle = NULL; struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); struct ocfs2_dinode *dir_di, *di; @@ -2778,14 +2782,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, (unsigned long long)OCFS2_I(dir)->ip_blkno, (unsigned long long)OCFS2_I(inode)->ip_blkno); - status = ocfs2_inode_lock(dir, &parent_di_bh, 1); - if (status < 0) { - if (status != -ENOENT) - mlog_errno(status); - return status; - } - - dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data; + dir_di = (struct ocfs2_dinode *) dir_bh->b_data; if (!dir_di->i_links_count) { /* can't make a file in a deleted directory. */ status = -ENOENT; @@ -2798,7 +2795,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, goto leave; /* get a spot inside the dir. */ - status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh, + status = ocfs2_prepare_dir_for_insert(osb, dir, dir_bh, dentry->d_name.name, dentry->d_name.len, &lookup); if (status < 0) { @@ -2862,7 +2859,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, ocfs2_journal_dirty(handle, di_bh); status = ocfs2_add_entry(handle, dentry, inode, - OCFS2_I(inode)->ip_blkno, parent_di_bh, + OCFS2_I(inode)->ip_blkno, dir_bh, &lookup); if (status < 0) { mlog_errno(status); @@ -2886,10 +2883,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, iput(orphan_dir_inode); leave: - ocfs2_inode_unlock(dir, 1); - brelse(di_bh); - brelse(parent_di_bh); brelse(orphan_dir_bh); ocfs2_free_dir_lookup_result(&lookup); diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h index 9cc891eb874e04..03a2c526e2c1b8 100644 --- a/fs/ocfs2/namei.h +++ b/fs/ocfs2/namei.h @@ -24,6 +24,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, struct buffer_head *orphan_dir_bh, bool dio); int ocfs2_create_inode_in_orphan(struct inode *dir, + struct buffer_head **dir_bh, int mode, struct inode **new_inode); int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, @@ -32,6 +33,7 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, struct inode *inode, struct buffer_head *di_bh, int update_isize, loff_t end); int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, + struct buffer_head *dir_bh, struct inode *new_inode, struct dentry *new_dentry); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index e04358a46b6805..00ce8fe7e32370 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4252,7 +4252,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, { int error, had_lock; struct inode *inode = d_inode(old_dentry); - struct buffer_head *old_bh = NULL; + struct buffer_head *old_bh = NULL, *dir_bh = NULL; struct inode *new_orphan_inode = NULL; struct ocfs2_lock_holder oh; @@ -4260,7 +4260,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, return -EOPNOTSUPP; - error = ocfs2_create_inode_in_orphan(dir, inode->i_mode, + error = ocfs2_create_inode_in_orphan(dir, &dir_bh, inode->i_mode, &new_orphan_inode); if (error) { mlog_errno(error); @@ -4306,13 +4306,15 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, /* If the security isn't preserved, we need to re-initialize them. */ if (!preserve) { - error = ocfs2_init_security_and_acl(dir, new_orphan_inode, + error = ocfs2_init_security_and_acl(dir, dir_bh, + new_orphan_inode, &new_dentry->d_name); if (error) mlog_errno(error); } if (!error) { - error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode, + error = ocfs2_mv_orphaned_inode_to_new(dir, dir_bh, + new_orphan_inode, new_dentry); if (error) mlog_errno(error); @@ -4330,6 +4332,11 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, iput(new_orphan_inode); } + if (dir_bh) { + ocfs2_inode_unlock(dir, 1); + brelse(dir_bh); + } + return error; } diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 95d0611c5fc7d1..3f23e3a5018ce7 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -7203,16 +7203,13 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, /* * Initialize security and acl for a already created inode. * Used for reflink a non-preserve-security file. - * - * It uses common api like ocfs2_xattr_set, so the caller - * must not hold any lock expect i_rwsem. */ int ocfs2_init_security_and_acl(struct inode *dir, + struct buffer_head *dir_bh, struct inode *inode, const struct qstr *qstr) { int ret = 0; - struct buffer_head *dir_bh = NULL; ret = ocfs2_init_security_get(inode, dir, qstr, NULL); if (ret) { @@ -7220,17 +7217,10 @@ int ocfs2_init_security_and_acl(struct inode *dir, goto leave; } - ret = ocfs2_inode_lock(dir, &dir_bh, 0); - if (ret) { - mlog_errno(ret); - goto leave; - } ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL); if (ret) mlog_errno(ret); - ocfs2_inode_unlock(dir, 0); - brelse(dir_bh); leave: return ret; } diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index 00308b57f64f18..b27fd8ba00196a 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -83,6 +83,7 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, struct buffer_head *new_bh, bool preserve_security); int ocfs2_init_security_and_acl(struct inode *dir, + struct buffer_head *dir_bh, struct inode *inode, const struct qstr *qstr); #endif /* OCFS2_XATTR_H */ From d89860c1cb905776b7da77264ba6110e96917324 Mon Sep 17 00:00:00 2001 From: Wangyan Date: Wed, 15 Jun 2022 16:08:21 -0700 Subject: [PATCH 03/34] ocfs2: clear links count in ocfs2_mknod() if an error occurs In this condition, the inode can not be wiped when error happened. ocfs2_mkdir() ->ocfs2_mknod() ->ocfs2_mknod_locked() ->__ocfs2_mknod_locked() ->ocfs2_set_links_count() // i_links_count is 2 -> ... // an error accrue, goto roll_back or leave. ->ocfs2_commit_trans() ->iput(inode) ->evict() ->ocfs2_evict_inode() ->ocfs2_delete_inode() ->ocfs2_inode_lock() ->ocfs2_inode_lock_update() ->ocfs2_refresh_inode() ->set_nlink(); // inode->i_nlink is 2 now. /* if wipe is 0, it will goto bail_unlock_inode */ ->ocfs2_query_inode_wipe() ->if (inode->i_nlink) return; // wipe is 0. /* inode can not be wiped */ ->ocfs2_wipe_inode() So, we need clear links before the transaction committed. Link: http://lkml.kernel.org/r/d8147c41-fb2b-bdf7-b660-1f3c8448c33f@huawei.com Signed-off-by: Yan Wang Reviewed-by: Jun Piao Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Signed-off-by: Andrew Morton --- fs/ocfs2/namei.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index e3dd30dd3547ff..ea27e63ec278fc 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -453,8 +453,12 @@ static int ocfs2_mknod(struct user_namespace *mnt_userns, leave: if (status < 0 && did_quota_inode) dquot_free_inode(inode); - if (handle) + if (handle) { + if (status < 0 && new_fe_bh != NULL) + ocfs2_set_links_count((struct ocfs2_dinode *) + new_fe_bh->b_data, 0); ocfs2_commit_trans(osb, handle); + } ocfs2_inode_unlock(dir, 1); if (did_block_signals) @@ -598,6 +602,8 @@ static int __ocfs2_mknod_locked(struct inode *dir, leave: if (status < 0) { if (*new_fe_bh) { + if (fe) + ocfs2_set_links_count(fe, 0); brelse(*new_fe_bh); *new_fe_bh = NULL; } @@ -2027,8 +2033,12 @@ static int ocfs2_symlink(struct user_namespace *mnt_userns, ocfs2_clusters_to_bytes(osb->sb, 1)); if (status < 0 && did_quota_inode) dquot_free_inode(inode); - if (handle) + if (handle) { + if (status < 0 && new_fe_bh != NULL) + ocfs2_set_links_count((struct ocfs2_dinode *) + new_fe_bh->b_data, 0); ocfs2_commit_trans(osb, handle); + } ocfs2_inode_unlock(dir, 1); if (did_block_signals) From 58cf4b4c8459c702163ed73ef8d10f1d829802ec Mon Sep 17 00:00:00 2001 From: Wangyan Date: Wed, 15 Jun 2022 16:08:21 -0700 Subject: [PATCH 04/34] ocfs2: fix ocfs2 corrupt when iputting an inode In this condition, it will cause an bug on error. ocfs2_mkdir() ->ocfs2_mknod() ->ocfs2_mknod_locked() ->__ocfs2_mknod_locked() //Assume inode->i_generation is genN. ->inode->i_generation = osb->s_next_generation++; // The inode lockres has been initialized. ->ocfs2_populate_inode() ->ocfs2_create_new_inode_locks() ->An error happened, returned value is non-zero // free the start_bit x in bg_blkno ->ocfs2_free_suballoc_bits() ->... /* Another process execute mkdir success in this place, and it occupied the start_bit x in bg_blkno which has been freed before. Its inode->i_generation is genN + 1 */ ->iput(inode) ->evict() ->ocfs2_evict_inode() ->ocfs2_delete_inode() ->ocfs2_inode_lock() ->ocfs2_inode_lock_update() /* Bug on here, genN != genN + 1 */ ->mlog_bug_on_msg(inode->i_generation != le32_to_cpu(fe->i_generation)) So, we need not to reclaim the inode when the inode->ip_inode_lockres has been initialized. It will be freed in iput(). Link: http://lkml.kernel.org/r/ef080ca3-5d74-e276-17a1-d9e7c7e662c9@huawei.com Fixes: b1529a41f777 ("ocfs2: should reclaim the inode if '__ocfs2_mknod_locked' returns an error") Signed-off-by: Yan Wang Reviewed-by: Jun Piao Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Signed-off-by: Andrew Morton --- fs/ocfs2/namei.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index ea27e63ec278fc..7d7f2b8f0554e0 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -640,7 +640,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, status = __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh, parent_fe_bh, handle, inode_ac, fe_blkno, suballoc_loc, suballoc_bit); - if (status < 0) { + if (status < 0 && !(OCFS2_I(inode)->ip_inode_lockres.l_flags & + OCFS2_LOCK_INITIALIZED)) { u64 bg_blkno = ocfs2_which_suballoc_group(fe_blkno, suballoc_bit); int tmp = ocfs2_free_suballoc_bits(handle, inode_ac->ac_inode, inode_ac->ac_bh, suballoc_bit, bg_blkno, 1); From 40baab8f611a929bfe62865fb9b7ed7cf65bb425 Mon Sep 17 00:00:00 2001 From: Dan Moulding Date: Wed, 15 Jun 2022 16:08:22 -0700 Subject: [PATCH 05/34] init: add "hostname" kernel parameter The gethostname system call returns the hostname for the current machine. However, the kernel has no mechanism to initially set the current machine's name in such a way as to guarantee that the first userspace process to call gethostname will receive a meaningful result. It relies on some unspecified userspace process to first call sethostname before gethostname can produce a meaningful name. Traditionally the machine's hostname is set from userspace by the init system. The init system, in turn, often relies on a configuration file (say, /etc/hostname) to provide the value that it will supply in the call to sethostname. Consequently, the file system containing /etc/hostname usually must be available before the hostname will be set. There may, however, be earlier userspace processes that could call gethostname before the file system containing /etc/hostname is mounted. Such a process will get some other, likely meaningless, name from gethostname (such as "(none)", "localhost", or "darkstar"). A real-world example where this can happen, and lead to undesirable results, is with mdadm. When assembling arrays, mdadm distinguishes between "local" arrays and "foreign" arrays. A local array is one that properly belongs to the current machine, and a foreign array is one that is (possibly temporarily) attached to the current machine, but properly belongs to some other machine. To determine if an array is local or foreign, mdadm may compare the "homehost" recorded on the array with the current hostname. If mdadm is run before the root file system is mounted, perhaps because the root file system itself resides on an md-raid array, then /etc/hostname isn't yet available and the init system will not yet have called sethostname, causing mdadm to incorrectly conclude that all of the local arrays are foreign. Solving this problem *could* be delegated to the init system. It could be left up to the init system (including any init system that starts within an initramfs, if one is in use) to ensure that sethostname is called before any other userspace process could possibly call gethostname. However, it may not always be obvious which processes could call gethostname (for example, udev itself might not call gethostname, but it could via udev rules invoke processes that do). Additionally, the init system has to ensure that the hostname configuration value is stored in some place where it will be readily accessible during early boot. Unfortunately, every init system will attempt to (or has already attempted to) solve this problem in a different, possibly incorrect, way. This makes getting consistently working configurations harder for users. I believe it is better for the kernel to provide the means by which the hostname may be set early, rather than making this a problem for the init system to solve. The option to set the hostname during early startup, via a kernel parameter, provides a simple, reliable way to solve this problem. It also could make system configuration easier for some embedded systems. Link: https://lkml.kernel.org/r/20220505180651.22849-2-dmoulding@me.com Signed-off-by: Dan Moulding Cc: Thomas Gleixner Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/admin-guide/kernel-parameters.txt | 13 +++++++++++++ init/version.c | 17 +++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 8090130b544b07..3beeb86eb92e25 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1665,6 +1665,19 @@ Valid parameters: "on", "off" Default: "on" + hostname= [KNL] Set the hostname (aka UTS nodename). + Format: + This allows setting the system's hostname during early + startup. This sets the name returned by gethostname. + Using this parameter to set the hostname makes it + possible to ensure the hostname is correctly set before + any userspace processes run, avoiding the possibility + that a process may call gethostname before the hostname + has been explicitly set, resulting in the calling + process getting an incorrect result. The string must + not exceed the maximum allowed hostname length (usually + 64 characters) and will be truncated otherwise. + hlt [BUGS=ARM,SH] hpet= [X86-32,HPET] option to control HPET usage diff --git a/init/version.c b/init/version.c index 1a356f5493e853..66d237d5629c6a 100644 --- a/init/version.c +++ b/init/version.c @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include #include #include @@ -35,6 +37,21 @@ struct uts_namespace init_uts_ns = { }; EXPORT_SYMBOL_GPL(init_uts_ns); +static int __init early_hostname(char *arg) +{ + size_t bufsize = sizeof(init_uts_ns.name.nodename); + size_t maxlen = bufsize - 1; + + strncpy(init_uts_ns.name.nodename, arg, bufsize); + if (strlen(arg) > maxlen) { + pr_warn("hostname parameter exceeds %zd characters and will be truncated", + maxlen); + init_uts_ns.name.nodename[maxlen] = '\0'; + } + return 0; +} +early_param("hostname", early_hostname); + /* FIXED STRINGS! Don't touch! */ const char linux_banner[] = "Linux version " UTS_RELEASE " (" LINUX_COMPILE_BY "@" From be40a252f289134022919b10522c87b34ef8355e Mon Sep 17 00:00:00 2001 From: Dan Moulding Date: Wed, 15 Jun 2022 16:08:22 -0700 Subject: [PATCH 06/34] init-add-hostname-kernel-parameter-v2 Link: https://lkml.kernel.org/r/20220506060310.7495-2-dmoulding@me.com Signed-off-by: Dan Moulding Signed-off-by: Andrew Morton --- Documentation/admin-guide/kernel-parameters.txt | 4 ++-- init/version.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 3beeb86eb92e25..31fde44d401b8f 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1665,6 +1665,8 @@ Valid parameters: "on", "off" Default: "on" + hlt [BUGS=ARM,SH] + hostname= [KNL] Set the hostname (aka UTS nodename). Format: This allows setting the system's hostname during early @@ -1678,8 +1680,6 @@ not exceed the maximum allowed hostname length (usually 64 characters) and will be truncated otherwise. - hlt [BUGS=ARM,SH] - hpet= [X86-32,HPET] option to control HPET usage Format: { enable (default) | disable | force | verbose } diff --git a/init/version.c b/init/version.c index 66d237d5629c6a..b7f9559d417c74 100644 --- a/init/version.c +++ b/init/version.c @@ -41,12 +41,12 @@ static int __init early_hostname(char *arg) { size_t bufsize = sizeof(init_uts_ns.name.nodename); size_t maxlen = bufsize - 1; + size_t arglen; - strncpy(init_uts_ns.name.nodename, arg, bufsize); - if (strlen(arg) > maxlen) { + arglen = strlcpy(init_uts_ns.name.nodename, arg, bufsize); + if (arglen > maxlen) { pr_warn("hostname parameter exceeds %zd characters and will be truncated", maxlen); - init_uts_ns.name.nodename[maxlen] = '\0'; } return 0; } From 27a435ee68eccbdf1a9744781dab54e2c8008fdb Mon Sep 17 00:00:00 2001 From: Andrew Halaney Date: Wed, 15 Jun 2022 16:08:22 -0700 Subject: [PATCH 07/34] init/main.c: silence some -Wunused-parameter warnings There are a bunch of callbacks with unused arguments, go ahead and silence those so "make KCFLAGS=-W init/main.o" is a little quieter. Here's a little sample: init/main.c:182:43: warning: unused parameter 'str' [-Wunused-parameter] static int __init set_reset_devices(char *str) Link: https://lkml.kernel.org/r/20210519162341.1275452-1-ahalaney@redhat.com Signed-off-by: Andrew Halaney Cc: Rasmus Villemoes Signed-off-by: Andrew Morton --- init/main.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/init/main.c b/init/main.c index 0ee39cdcfcac97..0f452ae3b20f54 100644 --- a/init/main.c +++ b/init/main.c @@ -181,7 +181,7 @@ EXPORT_SYMBOL_GPL(static_key_initialized); unsigned int reset_devices; EXPORT_SYMBOL(reset_devices); -static int __init set_reset_devices(char *str) +static int __init set_reset_devices(char *str __always_unused) { reset_devices = 1; return 1; @@ -231,13 +231,13 @@ static bool __init obsolete_checksetup(char *line) unsigned long loops_per_jiffy = (1<<12); EXPORT_SYMBOL(loops_per_jiffy); -static int __init debug_kernel(char *str) +static int __init debug_kernel(char *str __always_unused) { console_loglevel = CONSOLE_LOGLEVEL_DEBUG; return 0; } -static int __init quiet_kernel(char *str) +static int __init quiet_kernel(char *str __always_unused) { console_loglevel = CONSOLE_LOGLEVEL_QUIET; return 0; @@ -474,7 +474,7 @@ static void __init setup_boot_config(void) get_boot_config_from_initrd(NULL); } -static int __init warn_bootconfig(char *str) +static int __init warn_bootconfig(char *str __always_unused) { pr_warn("WARNING: 'bootconfig' found on the kernel command line but CONFIG_BOOT_CONFIG is not set.\n"); return 0; @@ -503,7 +503,8 @@ static void __init repair_env_string(char *param, char *val) /* Anything after -- gets handed straight to init. */ static int __init set_init_arg(char *param, char *val, - const char *unused, void *arg) + const char *unused __always_unused, + void *arg __always_unused) { unsigned int i; @@ -528,7 +529,8 @@ static int __init set_init_arg(char *param, char *val, * unused parameters (modprobe will find them in /proc/cmdline). */ static int __init unknown_bootoption(char *param, char *val, - const char *unused, void *arg) + const char *unused __always_unused, + void *arg __always_unused) { size_t len = strlen(param); @@ -728,7 +730,8 @@ noinline void __ref rest_init(void) /* Check for early params. */ static int __init do_early_param(char *param, char *val, - const char *unused, void *arg) + const char *unused __always_unused, + void *arg __always_unused) { const struct obs_kernel_param *p; @@ -1347,8 +1350,10 @@ static const char *initcall_level_names[] __initdata = { "late", }; -static int __init ignore_unknown_bootoption(char *param, char *val, - const char *unused, void *arg) +static int __init ignore_unknown_bootoption(char *param __always_unused, + char *val __always_unused, + const char *unused __always_unused, + void *arg __always_unused) { return 0; } @@ -1487,7 +1492,7 @@ void __weak free_initmem(void) free_initmem_default(POISON_FREE_INITMEM); } -static int __ref kernel_init(void *unused) +static int __ref kernel_init(void *unused __always_unused) { int ret; From e7e8754565ca8a35e6b8ae25e90e81d0141ce3cd Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 25 May 2022 12:03:17 -0700 Subject: [PATCH 08/34] checkpatch: add XA_STATE and XA_STATE_ORDER to the macro declaration list XA_STATE() and XA_STATE_ORDER macro uses are declarations. Add them to the declaration macro list to avoid suggesting a blank line after declarations when used. Link: https://lkml.kernel.org/r/144314f4bf2c58cf2336028a75a5127e848abd81.camel@perches.com Signed-off-by: Joe Perches Reported-by: David Howells Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- scripts/checkpatch.pl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 503e8abbb2c1e4..205bf5055acffb 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1042,7 +1042,8 @@ sub build_types { our $declaration_macros = qr{(?x: (?:$Storage\s+)?(?:[A-Z_][A-Z0-9]*_){0,2}(?:DEFINE|DECLARE)(?:_[A-Z0-9]+){1,6}\s*\(| (?:$Storage\s+)?[HLP]?LIST_HEAD\s*\(| - (?:SKCIPHER_REQUEST|SHASH_DESC|AHASH_REQUEST)_ON_STACK\s*\( + (?:SKCIPHER_REQUEST|SHASH_DESC|AHASH_REQUEST)_ON_STACK\s*\(| + (?:$Storage\s+)?(?:XA_STATE|XA_STATE_ORDER)\s*\( )}; our %allow_repeated_words = ( From f8eaac3443ee3ce5252da25665e935a9cbc6d54b Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Tue, 31 May 2022 09:28:54 +0800 Subject: [PATCH 09/34] profiling: fix shift too large makes kernel panic 2d186afd04d6 ("profiling: fix shift-out-of-bounds bugs") limits shift value by [0, BITS_PER_LONG -1], which means [0, 63]. However, syzbot found that the max shift value should be the bit number of (_etext - _stext). If shift is outside of this, the "buffer_bytes" will be zero and will cause kzalloc(0). Then the kernel panics due to dereferencing the returned pointer 16. This can be easily reproduced by passing a large number like 60 to enable profiling and then run readprofile. LOGS: BUG: kernel NULL pointer dereference, address: 0000000000000010 #PF: supervisor write access in kernel mode #PF: error_code(0x0002) - not-present page PGD 6148067 P4D 6148067 PUD 6142067 PMD 0 PREEMPT SMP CPU: 4 PID: 184 Comm: readprofile Not tainted 5.18.0+ #162 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014 RIP: 0010:read_profile+0x104/0x220 RSP: 0018:ffffc900006fbe80 EFLAGS: 00000202 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: ffff888006150000 RSI: 0000000000000001 RDI: ffffffff82aba4a0 RBP: 000000000188bb60 R08: 0000000000000010 R09: ffff888006151000 R10: 0000000000000000 R11: 0000000000000000 R12: ffffffff82aba4a0 R13: 0000000000000000 R14: ffffc900006fbf08 R15: 0000000000020c30 FS: 000000000188a8c0(0000) GS:ffff88803ed00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000010 CR3: 0000000006144000 CR4: 00000000000006e0 Call Trace: proc_reg_read+0x56/0x70 vfs_read+0x9a/0x1b0 ksys_read+0xa1/0xe0 ? fpregs_assert_state_consistent+0x1e/0x40 do_syscall_64+0x3a/0x80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 RIP: 0033:0x4d4b4e RSP: 002b:00007ffebb668d58 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 000000000188a8a0 RCX: 00000000004d4b4e RDX: 0000000000000400 RSI: 000000000188bb60 RDI: 0000000000000003 RBP: 0000000000000003 R08: 000000000000006e R09: 0000000000000000 R10: 0000000000000041 R11: 0000000000000246 R12: 000000000188bb60 R13: 0000000000000400 R14: 0000000000000000 R15: 000000000188bb60 Modules linked in: CR2: 0000000000000010 Killed ---[ end trace 0000000000000000 ]--- Check prof_len in profile_init() to prevent it be zero. Link: https://lkml.kernel.org/r/20220531012854.229439-1-chenzhongjin@huawei.com Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Chen Zhongjin Signed-off-by: Andrew Morton --- kernel/profile.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/profile.c b/kernel/profile.c index 37640a0bd8a3c7..ae82ddfc6a6845 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -109,6 +109,13 @@ int __ref profile_init(void) /* only text is profiled */ prof_len = (_etext - _stext) >> prof_shift; + + if (!prof_len) { + pr_warn("profiling shift: %u too large\n", prof_shift); + prof_on = 0; + return -EINVAL; + } + buffer_bytes = prof_len*sizeof(atomic_t); if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL)) From 2406c082be186dcc964a7438eba23ab4e1241a0f Mon Sep 17 00:00:00 2001 From: Vaibhav Jain Date: Wed, 1 Jun 2022 22:02:43 +0530 Subject: [PATCH 10/34] resource: re-factor page_is_ram() Presently page_is_ram() relies on walk_system_ram_range() that performs a walk on kernel iomem resources hierarchy with a dummy callback __is_ram(). Before calling find_next_iomem_res(), walk_system_ram_range() does some book-keeping which can be avoided for page_is_ram() use-case. Hence this patch proposes to update page_is_ram() to directly call find_next_iomem_res() with minimal book-keeping needed. To avoid allocating a 'struct resource' the patch also updates find_next_iomem_res() to not return -EINVAL in case 'res == NULL'. Instead our 'struct resource *res' is only populated when its not NULL. Link: https://lkml.kernel.org/r/20220601163243.3806231-1-vaibhav@linux.ibm.com Signed-off-by: Vaibhav Jain Cc: Vaibhav Jain Cc: "Aneesh Kumar K . V" Cc: David Hildenbrand Cc: Dan Williams Cc: Miaohe Lin Cc: Muchun Song Signed-off-by: Andrew Morton --- kernel/resource.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/kernel/resource.c b/kernel/resource.c index 34eaee179689a1..ecf6b9a50adc1f 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -311,7 +311,7 @@ EXPORT_SYMBOL(release_resource); * * If a resource is found, returns 0 and @*res is overwritten with the part * of the resource that's within [@start..@end]; if none is found, returns - * -ENODEV. Returns -EINVAL for invalid parameters. + * -ENODEV. * * @start: start address of the resource searched for * @end: end address of same resource @@ -328,9 +328,6 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, { struct resource *p; - if (!res) - return -EINVAL; - if (start >= end) return -EINVAL; @@ -356,7 +353,7 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, break; } - if (p) { + if (p && res) { /* copy data */ *res = (struct resource) { .start = max(start, p->start), @@ -474,18 +471,18 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, return ret; } -static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg) -{ - return 1; -} - /* * This generic page_is_ram() returns true if specified address is * registered as System RAM in iomem_resource list. */ int __weak page_is_ram(unsigned long pfn) { - return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; + const resource_size_t pfn_res = PFN_PHYS(pfn); + + return find_next_iomem_res(pfn_res, + pfn_res + 1, + IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY, + IORES_DESC_NONE, NULL) == 0; } EXPORT_SYMBOL_GPL(page_is_ram); From 4d3d970b1942b2ca4d3a7735c7d57a7527150f4e Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 31 May 2022 15:29:51 -0700 Subject: [PATCH 11/34] lib/list_debug.c: Detect uninitialized lists In some circumstances, attempts are made to add entries to or to remove entries from an uninitialized list. A prime example is amdgpu_bo_vm_destroy(): It is indirectly called from ttm_bo_init_reserved() if that function fails, and tries to remove an entry from a list. However, that list is only initialized in amdgpu_bo_create_vm() after the call to ttm_bo_init_reserved() returned success. This results in crashes such as BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 1 PID: 1479 Comm: chrome Not tainted 5.10.110-15768-g29a72e65dae5 Hardware name: Google Grunt/Grunt, BIOS Google_Grunt.11031.149.0 07/15/2020 RIP: 0010:__list_del_entry_valid+0x26/0x7d ... Call Trace: amdgpu_bo_vm_destroy+0x48/0x8b ttm_bo_init_reserved+0x1d7/0x1e0 amdgpu_bo_create+0x212/0x476 ? amdgpu_bo_user_destroy+0x23/0x23 ? kmem_cache_alloc+0x60/0x271 amdgpu_bo_create_vm+0x40/0x7d amdgpu_vm_pt_create+0xe8/0x24b ... Check if the list's prev and next pointers are NULL to catch such problems. Link: https://lkml.kernel.org/r/20220531222951.92073-1-linux@roeck-us.net Signed-off-by: Guenter Roeck Cc: Steven Rostedt Signed-off-by: Andrew Morton --- lib/list_debug.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/lib/list_debug.c b/lib/list_debug.c index 9daa3fb9d1cd61..d98d43f80958b8 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -20,7 +20,11 @@ bool __list_add_valid(struct list_head *new, struct list_head *prev, struct list_head *next) { - if (CHECK_DATA_CORRUPTION(next->prev != prev, + if (CHECK_DATA_CORRUPTION(prev == NULL, + "list_add corruption. prev is NULL.\n") || + CHECK_DATA_CORRUPTION(next == NULL, + "list_add corruption. next is NULL.\n") || + CHECK_DATA_CORRUPTION(next->prev != prev, "list_add corruption. next->prev should be prev (%px), but was %px. (next=%px).\n", prev, next->prev, next) || CHECK_DATA_CORRUPTION(prev->next != next, @@ -42,7 +46,11 @@ bool __list_del_entry_valid(struct list_head *entry) prev = entry->prev; next = entry->next; - if (CHECK_DATA_CORRUPTION(next == LIST_POISON1, + if (CHECK_DATA_CORRUPTION(next == NULL, + "list_del corruption, %px->next is NULL\n", entry) || + CHECK_DATA_CORRUPTION(prev == NULL, + "list_del corruption, %px->prev is NULL\n", entry) || + CHECK_DATA_CORRUPTION(next == LIST_POISON1, "list_del corruption, %px->next is LIST_POISON1 (%px)\n", entry, LIST_POISON1) || CHECK_DATA_CORRUPTION(prev == LIST_POISON2, From 44b6a09f1869c6471ce1ff5fa6a699d6eba5fc93 Mon Sep 17 00:00:00 2001 From: wuchi Date: Sat, 4 Jun 2022 21:15:02 +0800 Subject: [PATCH 12/34] lib/flex_proportions.c: remove local_irq_ops in fprop_new_period() commit e78d4833c03e28> "lib: Fix possible deadlock in flexible proportion code" adds the local_irq_ops because percpu_counter_{sum |add} ops'lock can cause deadlock by interrupts. Now percpu_counter _{sum|add} ops use raw_spin_(un)lock_irq*, so revert the commit and resolve the conflict. Link: https://lkml.kernel.org/r/20220604131502.5190-1-wuchi.zero@gmail.com Signed-off-by: wuchi Reviewed-by: Jan Kara Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- lib/flex_proportions.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/lib/flex_proportions.c b/lib/flex_proportions.c index 53e7eb1dd76c96..05cccbcf1661a3 100644 --- a/lib/flex_proportions.c +++ b/lib/flex_proportions.c @@ -63,18 +63,13 @@ void fprop_global_destroy(struct fprop_global *p) */ bool fprop_new_period(struct fprop_global *p, int periods) { - s64 events; - unsigned long flags; + s64 events = percpu_counter_sum(&p->events); - local_irq_save(flags); - events = percpu_counter_sum(&p->events); /* * Don't do anything if there are no events. */ - if (events <= 1) { - local_irq_restore(flags); + if (events <= 1) return false; - } write_seqcount_begin(&p->sequence); if (periods < 64) events -= events >> periods; @@ -82,7 +77,6 @@ bool fprop_new_period(struct fprop_global *p, int periods) percpu_counter_add(&p->events, -events); p->period += periods; write_seqcount_end(&p->sequence); - local_irq_restore(flags); return true; } From f57af29c4bb61e4856c189d3c4c008c2a298ff74 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 3 Jun 2022 20:10:12 +0300 Subject: [PATCH 13/34] include/linux/rbtree.h: replace kernel.h with the necessary inclusions When kernel.h is used in the headers it adds a lot into dependency hell, especially when there are circular dependencies are involved. Replace kernel.h inclusion with the list of what is really being used. Link: https://lkml.kernel.org/r/20220603171012.48880-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/rbtree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 235047d7a1b5e8..f7edca369edadd 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -17,9 +17,9 @@ #ifndef _LINUX_RBTREE_H #define _LINUX_RBTREE_H +#include #include -#include #include #include From c5871229df87c75160f40ab0553e450a250b32cb Mon Sep 17 00:00:00 2001 From: Luc Van Oostenryck Date: Sun, 5 Jun 2022 18:07:38 +0200 Subject: [PATCH 14/34] ia64: fix sparse warnings with cmpxchg() & xchg() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On IA64, new sparse's warnings where issued after fixing some __rcu annotations in kernel/bpf/. These new warnings are false positives and appear on IA64 because on this architecture, the macros for cmpxchg() and xchg() make casts that ignore sparse annotations. This patch contains the minimal patch to fix this issue: adding a missing cast and some missing '__force'. Link: https://lore.kernel.org/r/20220601120013.bq5a3ynbkc3hngm5@mail Link: https://lkml.kernel.org/r/20220605160738.79736-1-luc.vanoostenryck@gmail.com Signed-off-by: Luc Van Oostenryck Reported-by: kernel test robot Acked-by: Paul E. McKenney Acked-by: Toke Høiland-Jørgensen Cc: Arnd Bergmann Signed-off-by: Andrew Morton --- arch/ia64/include/uapi/asm/cmpxchg.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/ia64/include/uapi/asm/cmpxchg.h b/arch/ia64/include/uapi/asm/cmpxchg.h index 2c2f3cfeaa77b3..ca2e0268534384 100644 --- a/arch/ia64/include/uapi/asm/cmpxchg.h +++ b/arch/ia64/include/uapi/asm/cmpxchg.h @@ -33,24 +33,24 @@ extern void ia64_xchg_called_with_bad_pointer(void); \ switch (size) { \ case 1: \ - __xchg_result = ia64_xchg1((__u8 *)ptr, x); \ + __xchg_result = ia64_xchg1((__u8 __force *)ptr, x); \ break; \ \ case 2: \ - __xchg_result = ia64_xchg2((__u16 *)ptr, x); \ + __xchg_result = ia64_xchg2((__u16 __force *)ptr, x); \ break; \ \ case 4: \ - __xchg_result = ia64_xchg4((__u32 *)ptr, x); \ + __xchg_result = ia64_xchg4((__u32 __force *)ptr, x); \ break; \ \ case 8: \ - __xchg_result = ia64_xchg8((__u64 *)ptr, x); \ + __xchg_result = ia64_xchg8((__u64 __force *)ptr, x); \ break; \ default: \ ia64_xchg_called_with_bad_pointer(); \ } \ - __xchg_result; \ + (__typeof__ (*(ptr)) __force) __xchg_result; \ }) #ifndef __KERNEL__ @@ -76,42 +76,42 @@ extern long ia64_cmpxchg_called_with_bad_pointer(void); \ switch (size) { \ case 1: \ - _o_ = (__u8) (long) (old); \ + _o_ = (__u8) (long __force) (old); \ break; \ case 2: \ - _o_ = (__u16) (long) (old); \ + _o_ = (__u16) (long __force) (old); \ break; \ case 4: \ - _o_ = (__u32) (long) (old); \ + _o_ = (__u32) (long __force) (old); \ break; \ case 8: \ - _o_ = (__u64) (long) (old); \ + _o_ = (__u64) (long __force) (old); \ break; \ default: \ break; \ } \ switch (size) { \ case 1: \ - _r_ = ia64_cmpxchg1_##sem((__u8 *) ptr, new, _o_); \ + _r_ = ia64_cmpxchg1_##sem((__u8 __force *) ptr, new, _o_); \ break; \ \ case 2: \ - _r_ = ia64_cmpxchg2_##sem((__u16 *) ptr, new, _o_); \ + _r_ = ia64_cmpxchg2_##sem((__u16 __force *) ptr, new, _o_); \ break; \ \ case 4: \ - _r_ = ia64_cmpxchg4_##sem((__u32 *) ptr, new, _o_); \ + _r_ = ia64_cmpxchg4_##sem((__u32 __force *) ptr, new, _o_); \ break; \ \ case 8: \ - _r_ = ia64_cmpxchg8_##sem((__u64 *) ptr, new, _o_); \ + _r_ = ia64_cmpxchg8_##sem((__u64 __force *) ptr, new, _o_); \ break; \ \ default: \ _r_ = ia64_cmpxchg_called_with_bad_pointer(); \ break; \ } \ - (__typeof__(old)) _r_; \ + (__typeof__(old) __force) _r_; \ }) #define cmpxchg_acq(ptr, o, n) \ From fe461ffec57b7663d4ce6b283b1478ed22169d5a Mon Sep 17 00:00:00 2001 From: wuchi Date: Tue, 7 Jun 2022 21:35:56 +0800 Subject: [PATCH 15/34] lib/btree: simplify btree_{lookup|update} btree_{lookup|update} both need to look up node by key, using the common parts(add function btree_lookup_node) to simplify code. Link: https://lkml.kernel.org/r/20220607133556.34732-1-wuchi.zero@gmail.com Signed-off-by: wuchi Cc: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Signed-off-by: Andrew Morton --- lib/btree.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/lib/btree.c b/lib/btree.c index b4cf08a5c26789..a82100c73b5597 100644 --- a/lib/btree.c +++ b/lib/btree.c @@ -238,7 +238,7 @@ static int keyzero(struct btree_geo *geo, unsigned long *key) return 1; } -void *btree_lookup(struct btree_head *head, struct btree_geo *geo, +static void *btree_lookup_node(struct btree_head *head, struct btree_geo *geo, unsigned long *key) { int i, height = head->height; @@ -257,7 +257,16 @@ void *btree_lookup(struct btree_head *head, struct btree_geo *geo, if (!node) return NULL; } + return node; +} +void *btree_lookup(struct btree_head *head, struct btree_geo *geo, + unsigned long *key) +{ + int i; + unsigned long *node; + + node = btree_lookup_node(head, geo, key); if (!node) return NULL; @@ -271,23 +280,10 @@ EXPORT_SYMBOL_GPL(btree_lookup); int btree_update(struct btree_head *head, struct btree_geo *geo, unsigned long *key, void *val) { - int i, height = head->height; - unsigned long *node = head->node; - - if (height == 0) - return -ENOENT; - - for ( ; height > 1; height--) { - for (i = 0; i < geo->no_pairs; i++) - if (keycmp(geo, node, i, key) <= 0) - break; - if (i == geo->no_pairs) - return -ENOENT; - node = bval(geo, node, i); - if (!node) - return -ENOENT; - } + int i; + unsigned long *node; + node = btree_lookup_node(head, geo, key); if (!node) return -ENOENT; From 629d3e0175db10998054ef60f209153b068c2853 Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Wed, 8 Jun 2022 15:35:39 -0700 Subject: [PATCH 16/34] include/uapi/linux/swab.h: move explicit cast outside ternary A cast inside __builtin_constant_p doesn't do anything since it should evaluate as constant at compile time irrespective of this cast. Instead, I moved this cast outside the ternary to ensure the return type is as expected. Additionally, if __HAVE_BUILTIN_BSWAP16__ was not defined then __swab16 is actually returning an `int` not a `u16` due to integer promotion. As Al Viro notes: You *can't* get smaller-than-int out of ? :, same as you can't get it out of addition, etc. This also fixes some clang -Wformat warnings involving default argument promotion. Link: https://github.com/ClangBuiltLinux/linux/issues/378 Link: https://lkml.kernel.org/r/20220608223539.470472-1-justinstitt@google.com Signed-off-by: Justin Stitt Suggested-by: Al Viro Suggested-by: Nathan Chancellor Reviewed-by: Nathan Chancellor Suggested-by: Nick Desaulniers Signed-off-by: Andrew Morton --- include/uapi/linux/swab.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/swab.h b/include/uapi/linux/swab.h index 7272f85d6d6ab5..0723a9cce747c8 100644 --- a/include/uapi/linux/swab.h +++ b/include/uapi/linux/swab.h @@ -102,7 +102,7 @@ static inline __attribute_const__ __u32 __fswahb32(__u32 val) #define __swab16(x) (__u16)__builtin_bswap16((__u16)(x)) #else #define __swab16(x) \ - (__builtin_constant_p((__u16)(x)) ? \ + (__u16)(__builtin_constant_p(x) ? \ ___constant_swab16(x) : \ __fswab16(x)) #endif @@ -115,7 +115,7 @@ static inline __attribute_const__ __u32 __fswahb32(__u32 val) #define __swab32(x) (__u32)__builtin_bswap32((__u32)(x)) #else #define __swab32(x) \ - (__builtin_constant_p((__u32)(x)) ? \ + (__u32)(__builtin_constant_p(x) ? \ ___constant_swab32(x) : \ __fswab32(x)) #endif @@ -128,7 +128,7 @@ static inline __attribute_const__ __u32 __fswahb32(__u32 val) #define __swab64(x) (__u64)__builtin_bswap64((__u64)(x)) #else #define __swab64(x) \ - (__builtin_constant_p((__u64)(x)) ? \ + (__u64)(__builtin_constant_p(x) ? \ ___constant_swab64(x) : \ __fswab64(x)) #endif From 3be83218891cd1e665c7434090a21350ed3bc314 Mon Sep 17 00:00:00 2001 From: wuchi Date: Sat, 11 Jun 2022 21:06:34 +0800 Subject: [PATCH 17/34] lib/debugobjects: fix stat count and optimize debug_objects_mem_init. 1. Var debug_objects_allocated tracks valid kmem_cache_alloc calls, so track it in debug_objects_replace_static_objects. Do similar things in object_cpu_offline. 2. In debug_objects_mem_init, there is no need to call function cpuhp_setup_state_nocalls when debug_objects_enabled = 0 (out of memory). Link: https://lkml.kernel.org/r/20220611130634.99741-1-wuchi.zero@gmail.com Signed-off-by: wuchi Cc: Thomas Gleixner Cc: Christoph Hellwig Cc: Kees Cook Cc: Waiman Long Signed-off-by: Andrew Morton --- lib/debugobjects.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 337d797a714163..6f8e5dd1dcd0c8 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -437,6 +437,7 @@ static int object_cpu_offline(unsigned int cpu) struct debug_percpu_free *percpu_pool; struct hlist_node *tmp; struct debug_obj *obj; + unsigned long flags; /* Remote access is safe as the CPU is dead already */ percpu_pool = per_cpu_ptr(&percpu_obj_pool, cpu); @@ -444,6 +445,12 @@ static int object_cpu_offline(unsigned int cpu) hlist_del(&obj->node); kmem_cache_free(obj_cache, obj); } + + raw_spin_lock_irqsave(&pool_lock, flags); + obj_pool_used -= percpu_pool->obj_free; + debug_objects_freed += percpu_pool->obj_free; + raw_spin_unlock_irqrestore(&pool_lock, flags); + percpu_pool->obj_free = 0; return 0; @@ -1318,6 +1325,8 @@ static int __init debug_objects_replace_static_objects(void) hlist_add_head(&obj->node, &objects); } + debug_objects_allocated += i; + /* * debug_objects_mem_init() is now called early that only one CPU is up * and interrupts have been disabled, so it is safe to replace the @@ -1386,6 +1395,7 @@ void __init debug_objects_mem_init(void) debug_objects_enabled = 0; kmem_cache_destroy(obj_cache); pr_warn("out of memory.\n"); + return; } else debug_objects_selftest(); From 1fdeb461be77283f7d7e447dbf10f04a2b475cac Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 27 May 2022 02:55:34 +0000 Subject: [PATCH 18/34] fs/kernel_read_file: allow to read files up-to ssize_t Patch series "Allow to kexec with initramfs larger than 2G", v2. Currently, the largest initramfs that is supported by kexec_file_load() syscall is 2G. This is because kernel_read_file() returns int, and is limited to INT_MAX or 2G. On the other hand, there are kexec based boot loaders (i.e. u-root), that may need to boot netboot images that might be larger than 2G. The first patch changes the return type from int to ssize_t in kernel_read_file* functions. The second patch increases the maximum initramfs file size to 4G. Tested: verified that can kexec_file_load() works with 4G initramfs on x86_64. This patch (of 2): Currently, the maximum file size that is supported is 2G. This may be too small in some cases. For example, kexec_file_load() system call loads initramfs. In some netboot cases initramfs can be rather large. Allow to use up-to ssize_t bytes. The callers still can limit the maximum file size via buf_size. Link: https://lkml.kernel.org/r/20220527025535.3953665-1-pasha.tatashin@soleen.com Link: https://lkml.kernel.org/r/20220527025535.3953665-2-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Cc: Al Viro Cc: Baoquan He Cc: "Eric W. Biederman" Cc: Greg Thelen Cc: Sasha Levin Signed-off-by: Andrew Morton --- fs/kernel_read_file.c | 38 ++++++++++++++++---------------- include/linux/kernel_read_file.h | 32 +++++++++++++-------------- include/linux/limits.h | 1 + 3 files changed, 36 insertions(+), 35 deletions(-) diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c index 1b07550485b964..5d826274570cab 100644 --- a/fs/kernel_read_file.c +++ b/fs/kernel_read_file.c @@ -29,15 +29,15 @@ * change between calls to kernel_read_file(). * * Returns number of bytes read (no single read will be bigger - * than INT_MAX), or negative on error. + * than SSIZE_MAX), or negative on error. * */ -int kernel_read_file(struct file *file, loff_t offset, void **buf, - size_t buf_size, size_t *file_size, - enum kernel_read_file_id id) +ssize_t kernel_read_file(struct file *file, loff_t offset, void **buf, + size_t buf_size, size_t *file_size, + enum kernel_read_file_id id) { loff_t i_size, pos; - size_t copied; + ssize_t copied; void *allocated = NULL; bool whole_file; int ret; @@ -58,7 +58,7 @@ int kernel_read_file(struct file *file, loff_t offset, void **buf, goto out; } /* The file is too big for sane activities. */ - if (i_size > INT_MAX) { + if (i_size > SSIZE_MAX) { ret = -EFBIG; goto out; } @@ -124,12 +124,12 @@ int kernel_read_file(struct file *file, loff_t offset, void **buf, } EXPORT_SYMBOL_GPL(kernel_read_file); -int kernel_read_file_from_path(const char *path, loff_t offset, void **buf, - size_t buf_size, size_t *file_size, - enum kernel_read_file_id id) +ssize_t kernel_read_file_from_path(const char *path, loff_t offset, void **buf, + size_t buf_size, size_t *file_size, + enum kernel_read_file_id id) { struct file *file; - int ret; + ssize_t ret; if (!path || !*path) return -EINVAL; @@ -144,14 +144,14 @@ int kernel_read_file_from_path(const char *path, loff_t offset, void **buf, } EXPORT_SYMBOL_GPL(kernel_read_file_from_path); -int kernel_read_file_from_path_initns(const char *path, loff_t offset, - void **buf, size_t buf_size, - size_t *file_size, - enum kernel_read_file_id id) +ssize_t kernel_read_file_from_path_initns(const char *path, loff_t offset, + void **buf, size_t buf_size, + size_t *file_size, + enum kernel_read_file_id id) { struct file *file; struct path root; - int ret; + ssize_t ret; if (!path || !*path) return -EINVAL; @@ -171,12 +171,12 @@ int kernel_read_file_from_path_initns(const char *path, loff_t offset, } EXPORT_SYMBOL_GPL(kernel_read_file_from_path_initns); -int kernel_read_file_from_fd(int fd, loff_t offset, void **buf, - size_t buf_size, size_t *file_size, - enum kernel_read_file_id id) +ssize_t kernel_read_file_from_fd(int fd, loff_t offset, void **buf, + size_t buf_size, size_t *file_size, + enum kernel_read_file_id id) { struct fd f = fdget(fd); - int ret = -EBADF; + ssize_t ret = -EBADF; if (!f.file || !(f.file->f_mode & FMODE_READ)) goto out; diff --git a/include/linux/kernel_read_file.h b/include/linux/kernel_read_file.h index 575ffa1031d348..90451e2e12bd19 100644 --- a/include/linux/kernel_read_file.h +++ b/include/linux/kernel_read_file.h @@ -35,21 +35,21 @@ static inline const char *kernel_read_file_id_str(enum kernel_read_file_id id) return kernel_read_file_str[id]; } -int kernel_read_file(struct file *file, loff_t offset, - void **buf, size_t buf_size, - size_t *file_size, - enum kernel_read_file_id id); -int kernel_read_file_from_path(const char *path, loff_t offset, - void **buf, size_t buf_size, - size_t *file_size, - enum kernel_read_file_id id); -int kernel_read_file_from_path_initns(const char *path, loff_t offset, - void **buf, size_t buf_size, - size_t *file_size, - enum kernel_read_file_id id); -int kernel_read_file_from_fd(int fd, loff_t offset, - void **buf, size_t buf_size, - size_t *file_size, - enum kernel_read_file_id id); +ssize_t kernel_read_file(struct file *file, loff_t offset, + void **buf, size_t buf_size, + size_t *file_size, + enum kernel_read_file_id id); +ssize_t kernel_read_file_from_path(const char *path, loff_t offset, + void **buf, size_t buf_size, + size_t *file_size, + enum kernel_read_file_id id); +ssize_t kernel_read_file_from_path_initns(const char *path, loff_t offset, + void **buf, size_t buf_size, + size_t *file_size, + enum kernel_read_file_id id); +ssize_t kernel_read_file_from_fd(int fd, loff_t offset, + void **buf, size_t buf_size, + size_t *file_size, + enum kernel_read_file_id id); #endif /* _LINUX_KERNEL_READ_FILE_H */ diff --git a/include/linux/limits.h b/include/linux/limits.h index b568b9c30bbf58..f6bcc936901071 100644 --- a/include/linux/limits.h +++ b/include/linux/limits.h @@ -7,6 +7,7 @@ #include #define SIZE_MAX (~(size_t)0) +#define SSIZE_MAX ((ssize_t)(SIZE_MAX >> 1)) #define PHYS_ADDR_MAX (~(phys_addr_t)0) #define U8_MAX ((u8)~0U) From cdf84b2608a731b4eb9b64f957dbf67ccfceb02b Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 27 May 2022 02:55:35 +0000 Subject: [PATCH 19/34] kexec_file: increase maximum file size to 4G In some case initrd can be large. For example, it could be a netboot image loaded by u-root, that is kexec'ing into it. The maximum size of initrd is arbitrary set to 2G. Also, the limit is not very obvious because it is hidden behind a generic INT_MAX macro. Theoretically, we could make it LONG_MAX, but it is safer to keep it sane, and just increase it to 4G. Increase the size to 4G, and make it obvious by having a new macro that specifies the maximum file size supported by kexec_file_load() syscall: KEXEC_FILE_SIZE_MAX. Link: https://lkml.kernel.org/r/20220527025535.3953665-3-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Cc: Sasha Levin Cc: "Eric W. Biederman" Cc: Greg Thelen Cc: Al Viro Cc: Baoquan He Signed-off-by: Andrew Morton --- kernel/kexec_file.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 145321a5e798a6..9b2839775c837b 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -31,6 +31,9 @@ static int kexec_calculate_store_digests(struct kimage *image); +/* Maximum size in bytes for kernel/initrd files. */ +#define KEXEC_FILE_SIZE_MAX min_t(s64, 4LL << 30, SSIZE_MAX) + /* * Currently this is the only default function that is exported as some * architectures need it to do additional handlings. @@ -189,11 +192,12 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, const char __user *cmdline_ptr, unsigned long cmdline_len, unsigned flags) { - int ret; + ssize_t ret; void *ldata; ret = kernel_read_file_from_fd(kernel_fd, 0, &image->kernel_buf, - INT_MAX, NULL, READING_KEXEC_IMAGE); + KEXEC_FILE_SIZE_MAX, NULL, + READING_KEXEC_IMAGE); if (ret < 0) return ret; image->kernel_buf_len = ret; @@ -213,7 +217,7 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, /* It is possible that there no initramfs is being loaded */ if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { ret = kernel_read_file_from_fd(initrd_fd, 0, &image->initrd_buf, - INT_MAX, NULL, + KEXEC_FILE_SIZE_MAX, NULL, READING_KEXEC_INITRAMFS); if (ret < 0) goto out; From e7881995974a2970f8f25390656090aacd7e1913 Mon Sep 17 00:00:00 2001 From: cxbing Date: Thu, 9 Jun 2022 07:44:59 -0700 Subject: [PATCH 20/34] delayacct: remove some unused variables Drop the unused variables *done* and *count*. Link: https://lkml.kernel.org/r/20220609144459.86379-1-zhangkkoo@126.com Signed-off-by: cxbing Signed-off-by: Andrew Morton --- tools/accounting/getdelays.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c index e83e6e47a21ea1..938dec0dfaad84 100644 --- a/tools/accounting/getdelays.c +++ b/tools/accounting/getdelays.c @@ -45,7 +45,6 @@ exit(code); \ } while (0) -int done; int rcvbufsz; char name[100]; int dbg; @@ -285,7 +284,6 @@ int main(int argc, char *argv[]) pid_t rtid = 0; int fd = 0; - int count = 0; int write_file = 0; int maskset = 0; char *logfile = NULL; @@ -495,7 +493,6 @@ int main(int argc, char *argv[]) len2 = 0; /* For nested attributes, na follows */ na = (struct nlattr *) NLA_DATA(na); - done = 0; while (len2 < aggr_len) { switch (na->nla_type) { case TASKSTATS_TYPE_PID: @@ -509,7 +506,6 @@ int main(int argc, char *argv[]) printf("TGID\t%d\n", rtid); break; case TASKSTATS_TYPE_STATS: - count++; if (print_delays) print_delayacct((struct taskstats *) NLA_DATA(na)); if (print_io_accounting) From 2b80c411a362cf3fa4d9e6fd6fd431acb855597f Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Sat, 11 Jun 2022 04:21:32 +0100 Subject: [PATCH 21/34] squashfs: extend "page actor" to handle missing pages Patch series "Squashfs: handle missing pages decompressing into page cache". This patchset enables Squashfs to handle missing pages when directly decompressing datablocks into the page cache. Previously if the full set of pages needed was not available, Squashfs would have to fall back to using an intermediate buffer (the older method), which is slower, involving a memcopy, and it introduces contention on a shared buffer. The first patch extends the "page actor" code to handle missing pages. The second patch updates Squashfs_readpage_block() to use the new functionality, and removes the code that falls back to using an intermediate buffer. This patchset is independent of the readahead work, and it is standalone. It can be merged on its own. But the readahead patch for efficiency also needs this patch-set. This patch (of 2): This patch extends the "page actor" code to handle missing pages. Previously if the full set of pages needed to decompress a Squashfs datablock was unavailable, this would cause decompression to fail on the missing pages. In this case direct decompression into the page cache could not be achieved and the code would fall back to using the older intermediate buffer method. With this patch, direct decompression into the page cache can be achieved with missing pages. For "multi-shot" decompressors (zlib, xz, zstd), the page actor will allocate a temporary buffer which is passed to the decompressor, and then freed by the page actor. For "single shot" decompressors (lz4, lzo) which decompress into a contiguous "bounce buffer", and which is then copied into the page cache, it would be pointless to allocate a temporary buffer, memcpy into it, and then free it. For these decompressors -ENOMEM is returned, which signifies that the memcpy for that page should be skipped. This also happens if the data block is uncompressed. Link: https://lkml.kernel.org/r/20220611032133.5743-1-phillip@squashfs.org.uk Link: https://lkml.kernel.org/r/20220611032133.5743-2-phillip@squashfs.org.uk Signed-off-by: Phillip Lougher Cc: Matthew Wilcox (Oracle) Cc: Hsin-Yi Wang Cc: Xiongwei Song Signed-off-by: Andrew Morton --- fs/squashfs/block.c | 10 ++++--- fs/squashfs/decompressor.h | 1 + fs/squashfs/file_direct.c | 21 ++++++++------- fs/squashfs/lz4_wrapper.c | 7 +++-- fs/squashfs/lzo_wrapper.c | 7 +++-- fs/squashfs/page_actor.c | 55 ++++++++++++++++++++++++++++++++------ fs/squashfs/page_actor.h | 21 ++++++++++++--- fs/squashfs/xz_wrapper.c | 11 +++++++- fs/squashfs/zlib_wrapper.c | 12 ++++++++- fs/squashfs/zstd_wrapper.c | 12 ++++++++- 10 files changed, 126 insertions(+), 31 deletions(-) diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 8879d052f96c6a..833aca92301f0e 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -34,12 +34,15 @@ static int copy_bio_to_actor(struct bio *bio, struct squashfs_page_actor *actor, int offset, int req_length) { - void *actor_addr = squashfs_first_page(actor); + void *actor_addr; struct bvec_iter_all iter_all = {}; struct bio_vec *bvec = bvec_init_iter_all(&iter_all); int copied_bytes = 0; int actor_offset = 0; + squashfs_actor_nobuff(actor); + actor_addr = squashfs_first_page(actor); + if (WARN_ON_ONCE(!bio_next_segment(bio, &iter_all))) return 0; @@ -49,8 +52,9 @@ static int copy_bio_to_actor(struct bio *bio, bytes_to_copy = min_t(int, bytes_to_copy, req_length - copied_bytes); - memcpy(actor_addr + actor_offset, bvec_virt(bvec) + offset, - bytes_to_copy); + if (!IS_ERR(actor_addr)) + memcpy(actor_addr + actor_offset, bvec_virt(bvec) + + offset, bytes_to_copy); actor_offset += bytes_to_copy; copied_bytes += bytes_to_copy; diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h index 1b9ccfd0aa519b..19ab608343895f 100644 --- a/fs/squashfs/decompressor.h +++ b/fs/squashfs/decompressor.h @@ -20,6 +20,7 @@ struct squashfs_decompressor { struct bio *, int, int, struct squashfs_page_actor *); int id; char *name; + int alloc_buffer; int supported; }; diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c index a4894cc5944719..5af5802f5626cb 100644 --- a/fs/squashfs/file_direct.c +++ b/fs/squashfs/file_direct.c @@ -47,14 +47,6 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, if (page == NULL) return res; - /* - * Create a "page actor" which will kmap and kunmap the - * page cache pages appropriately within the decompressor - */ - actor = squashfs_page_actor_init_special(page, pages, 0); - if (actor == NULL) - goto out; - /* Try to grab all the pages covered by the Squashfs block */ for (missing_pages = 0, i = 0, n = start_index; i < pages; i++, n++) { page[i] = (n == target_page->index) ? target_page : @@ -89,8 +81,19 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, goto out; } + /* + * Create a "page actor" which will kmap and kunmap the + * page cache pages appropriately within the decompressor + */ + actor = squashfs_page_actor_init_special(msblk, page, pages, 0); + if (actor == NULL) + goto out; + /* Decompress directly into the page cache buffers */ res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor); + + kfree(actor); + if (res < 0) goto mark_errored; @@ -116,7 +119,6 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, put_page(page[i]); } - kfree(actor); kfree(page); return 0; @@ -135,7 +137,6 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, } out: - kfree(actor); kfree(page); return res; } diff --git a/fs/squashfs/lz4_wrapper.c b/fs/squashfs/lz4_wrapper.c index b685b6238316c7..49797729f14383 100644 --- a/fs/squashfs/lz4_wrapper.c +++ b/fs/squashfs/lz4_wrapper.c @@ -119,10 +119,12 @@ static int lz4_uncompress(struct squashfs_sb_info *msblk, void *strm, buff = stream->output; while (data) { if (bytes <= PAGE_SIZE) { - memcpy(data, buff, bytes); + if (!IS_ERR(data)) + memcpy(data, buff, bytes); break; } - memcpy(data, buff, PAGE_SIZE); + if (!IS_ERR(data)) + memcpy(data, buff, PAGE_SIZE); buff += PAGE_SIZE; bytes -= PAGE_SIZE; data = squashfs_next_page(output); @@ -139,5 +141,6 @@ const struct squashfs_decompressor squashfs_lz4_comp_ops = { .decompress = lz4_uncompress, .id = LZ4_COMPRESSION, .name = "lz4", + .alloc_buffer = 0, .supported = 1 }; diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c index cb510a63196836..d216aeefa865ce 100644 --- a/fs/squashfs/lzo_wrapper.c +++ b/fs/squashfs/lzo_wrapper.c @@ -93,10 +93,12 @@ static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm, buff = stream->output; while (data) { if (bytes <= PAGE_SIZE) { - memcpy(data, buff, bytes); + if (!IS_ERR(data)) + memcpy(data, buff, bytes); break; } else { - memcpy(data, buff, PAGE_SIZE); + if (!IS_ERR(data)) + memcpy(data, buff, PAGE_SIZE); buff += PAGE_SIZE; bytes -= PAGE_SIZE; data = squashfs_next_page(output); @@ -116,5 +118,6 @@ const struct squashfs_decompressor squashfs_lzo_comp_ops = { .decompress = lzo_uncompress, .id = LZO_COMPRESSION, .name = "lzo", + .alloc_buffer = 0, .supported = 1 }; diff --git a/fs/squashfs/page_actor.c b/fs/squashfs/page_actor.c index 520d323a99ce67..b23b780d8f42ec 100644 --- a/fs/squashfs/page_actor.c +++ b/fs/squashfs/page_actor.c @@ -7,6 +7,8 @@ #include #include #include +#include "squashfs_fs_sb.h" +#include "decompressor.h" #include "page_actor.h" /* @@ -57,29 +59,62 @@ struct squashfs_page_actor *squashfs_page_actor_init(void **buffer, } /* Implementation of page_actor for decompressing directly into page cache. */ +static void *handle_next_page(struct squashfs_page_actor *actor) +{ + int max_pages = (actor->length + PAGE_SIZE - 1) >> PAGE_SHIFT; + + if (actor->returned_pages == max_pages) + return NULL; + + if ((actor->next_page == actor->pages) || + (actor->next_index != actor->page[actor->next_page]->index)) { + if (actor->alloc_buffer) { + void *tmp_buffer = kmalloc(PAGE_SIZE, GFP_KERNEL); + + if (tmp_buffer) { + actor->tmp_buffer = tmp_buffer; + actor->next_index++; + actor->returned_pages++; + return tmp_buffer; + } + } + + actor->next_index++; + actor->returned_pages++; + return ERR_PTR(-ENOMEM); + } + + actor->next_index++; + actor->returned_pages++; + return actor->pageaddr = kmap_local_page(actor->page[actor->next_page++]); +} + static void *direct_first_page(struct squashfs_page_actor *actor) { - actor->next_page = 1; - return actor->pageaddr = kmap_atomic(actor->page[0]); + return handle_next_page(actor); } static void *direct_next_page(struct squashfs_page_actor *actor) { if (actor->pageaddr) - kunmap_atomic(actor->pageaddr); + kunmap_local(actor->pageaddr); + + kfree(actor->tmp_buffer); + actor->pageaddr = actor->tmp_buffer = NULL; - return actor->pageaddr = actor->next_page == actor->pages ? NULL : - kmap_atomic(actor->page[actor->next_page++]); + return handle_next_page(actor); } static void direct_finish_page(struct squashfs_page_actor *actor) { if (actor->pageaddr) - kunmap_atomic(actor->pageaddr); + kunmap_local(actor->pageaddr); + + kfree(actor->tmp_buffer); } -struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page, - int pages, int length) +struct squashfs_page_actor *squashfs_page_actor_init_special(struct squashfs_sb_info *msblk, + struct page **page, int pages, int length) { struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL); @@ -90,7 +125,11 @@ struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page, actor->page = page; actor->pages = pages; actor->next_page = 0; + actor->returned_pages = 0; + actor->next_index = page[0]->index & ~((1 << (msblk->block_log - PAGE_SHIFT)) - 1); actor->pageaddr = NULL; + actor->tmp_buffer = NULL; + actor->alloc_buffer = msblk->decompressor->alloc_buffer; actor->squashfs_first_page = direct_first_page; actor->squashfs_next_page = direct_next_page; actor->squashfs_finish_page = direct_finish_page; diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h index 2e3073ace0097b..37523c54256fa7 100644 --- a/fs/squashfs/page_actor.h +++ b/fs/squashfs/page_actor.h @@ -45,6 +45,11 @@ static inline void squashfs_finish_page(struct squashfs_page_actor *actor) { /* empty */ } + +static inline void squashfs_actor_nobuff(struct squashfs_page_actor *actor) +{ + /* empty */ +} #else struct squashfs_page_actor { union { @@ -52,17 +57,23 @@ struct squashfs_page_actor { struct page **page; }; void *pageaddr; + void *tmp_buffer; void *(*squashfs_first_page)(struct squashfs_page_actor *); void *(*squashfs_next_page)(struct squashfs_page_actor *); void (*squashfs_finish_page)(struct squashfs_page_actor *); int pages; int length; int next_page; + int alloc_buffer; + int returned_pages; + pgoff_t next_index; }; -extern struct squashfs_page_actor *squashfs_page_actor_init(void **, int, int); -extern struct squashfs_page_actor *squashfs_page_actor_init_special(struct page - **, int, int); +extern struct squashfs_page_actor *squashfs_page_actor_init(void **buffer, + int pages, int length); +extern struct squashfs_page_actor *squashfs_page_actor_init_special( + struct squashfs_sb_info *msblk, + struct page **page, int pages, int length); static inline void *squashfs_first_page(struct squashfs_page_actor *actor) { return actor->squashfs_first_page(actor); @@ -75,5 +86,9 @@ static inline void squashfs_finish_page(struct squashfs_page_actor *actor) { actor->squashfs_finish_page(actor); } +static inline void squashfs_actor_nobuff(struct squashfs_page_actor *actor) +{ + actor->alloc_buffer = 0; +} #endif #endif diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c index 68f6d09bb3a2bc..6c49481a2f8c43 100644 --- a/fs/squashfs/xz_wrapper.c +++ b/fs/squashfs/xz_wrapper.c @@ -131,6 +131,10 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, stream->buf.out_pos = 0; stream->buf.out_size = PAGE_SIZE; stream->buf.out = squashfs_first_page(output); + if (IS_ERR(stream->buf.out)) { + error = PTR_ERR(stream->buf.out); + goto finish; + } for (;;) { enum xz_ret xz_err; @@ -156,7 +160,10 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, if (stream->buf.out_pos == stream->buf.out_size) { stream->buf.out = squashfs_next_page(output); - if (stream->buf.out != NULL) { + if (IS_ERR(stream->buf.out)) { + error = PTR_ERR(stream->buf.out); + break; + } else if (stream->buf.out != NULL) { stream->buf.out_pos = 0; total += PAGE_SIZE; } @@ -171,6 +178,7 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, } } +finish: squashfs_finish_page(output); return error ? error : total + stream->buf.out_pos; @@ -183,5 +191,6 @@ const struct squashfs_decompressor squashfs_xz_comp_ops = { .decompress = squashfs_xz_uncompress, .id = XZ_COMPRESSION, .name = "xz", + .alloc_buffer = 1, .supported = 1 }; diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c index a20e9042146bd6..cbb7afe7bc4679 100644 --- a/fs/squashfs/zlib_wrapper.c +++ b/fs/squashfs/zlib_wrapper.c @@ -62,6 +62,11 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, stream->next_out = squashfs_first_page(output); stream->avail_in = 0; + if (IS_ERR(stream->next_out)) { + error = PTR_ERR(stream->next_out); + goto finish; + } + for (;;) { int zlib_err; @@ -85,7 +90,10 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, if (stream->avail_out == 0) { stream->next_out = squashfs_next_page(output); - if (stream->next_out != NULL) + if (IS_ERR(stream->next_out)) { + error = PTR_ERR(stream->next_out); + break; + } else if (stream->next_out != NULL) stream->avail_out = PAGE_SIZE; } @@ -107,6 +115,7 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, } } +finish: squashfs_finish_page(output); if (!error) @@ -122,6 +131,7 @@ const struct squashfs_decompressor squashfs_zlib_comp_ops = { .decompress = zlib_uncompress, .id = ZLIB_COMPRESSION, .name = "zlib", + .alloc_buffer = 1, .supported = 1 }; diff --git a/fs/squashfs/zstd_wrapper.c b/fs/squashfs/zstd_wrapper.c index c40445dbf38c77..0e407c4d8b3bc3 100644 --- a/fs/squashfs/zstd_wrapper.c +++ b/fs/squashfs/zstd_wrapper.c @@ -80,6 +80,10 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm, out_buf.size = PAGE_SIZE; out_buf.dst = squashfs_first_page(output); + if (IS_ERR(out_buf.dst)) { + error = PTR_ERR(out_buf.dst); + goto finish; + } for (;;) { size_t zstd_err; @@ -104,7 +108,10 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm, if (out_buf.pos == out_buf.size) { out_buf.dst = squashfs_next_page(output); - if (out_buf.dst == NULL) { + if (IS_ERR(out_buf.dst)) { + error = PTR_ERR(out_buf.dst); + break; + } else if (out_buf.dst == NULL) { /* Shouldn't run out of pages * before stream is done. */ @@ -129,6 +136,8 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm, } } +finish: + squashfs_finish_page(output); return error ? error : total_out; @@ -140,5 +149,6 @@ const struct squashfs_decompressor squashfs_zstd_comp_ops = { .decompress = zstd_uncompress, .id = ZSTD_COMPRESSION, .name = "zstd", + .alloc_buffer = 1, .supported = 1 }; From d3d71b53432327e3df8dade98fe476616abf35a8 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Sat, 11 Jun 2022 04:21:33 +0100 Subject: [PATCH 22/34] squashfs: don't use intermediate buffer if pages missing Now that the "page actor" can handle missing pages, we don't have to fall back to using an intermediate buffer in Squashfs_readpage_block() if all the pages necessary can't be obtained. Link: https://lkml.kernel.org/r/20220611032133.5743-3-phillip@squashfs.org.uk Signed-off-by: Phillip Lougher Cc: Hsin-Yi Wang Cc: Matthew Wilcox (Oracle) Cc: Xiongwei Song Signed-off-by: Andrew Morton --- fs/squashfs/file_direct.c | 75 +++++++-------------------------------- 1 file changed, 12 insertions(+), 63 deletions(-) diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c index 5af5802f5626cb..be4b12d31e0c36 100644 --- a/fs/squashfs/file_direct.c +++ b/fs/squashfs/file_direct.c @@ -18,9 +18,6 @@ #include "squashfs.h" #include "page_actor.h" -static int squashfs_read_cache(struct page *target_page, u64 block, int bsize, - int pages, struct page **page, int bytes); - /* Read separately compressed datablock directly into page cache */ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, int expected) @@ -33,7 +30,7 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, int mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1; int start_index = target_page->index & ~mask; int end_index = start_index | mask; - int i, n, pages, missing_pages, bytes, res = -ENOMEM; + int i, n, pages, bytes, res = -ENOMEM; struct page **page; struct squashfs_page_actor *actor; void *pageaddr; @@ -48,44 +45,29 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, return res; /* Try to grab all the pages covered by the Squashfs block */ - for (missing_pages = 0, i = 0, n = start_index; i < pages; i++, n++) { + for (i = 0, n = start_index; n <= end_index; n++) { page[i] = (n == target_page->index) ? target_page : grab_cache_page_nowait(target_page->mapping, n); - if (page[i] == NULL) { - missing_pages++; + if (page[i] == NULL) continue; - } if (PageUptodate(page[i])) { unlock_page(page[i]); put_page(page[i]); - page[i] = NULL; - missing_pages++; + continue; } - } - - if (missing_pages) { - /* - * Couldn't get one or more pages, this page has either - * been VM reclaimed, but others are still in the page cache - * and uptodate, or we're racing with another thread in - * squashfs_readpage also trying to grab them. Fall back to - * using an intermediate buffer. - */ - res = squashfs_read_cache(target_page, block, bsize, pages, - page, expected); - if (res < 0) - goto mark_errored; - goto out; + i++; } + pages = i; + /* * Create a "page actor" which will kmap and kunmap the * page cache pages appropriately within the decompressor */ - actor = squashfs_page_actor_init_special(msblk, page, pages, 0); + actor = squashfs_page_actor_init_special(msblk, page, pages, expected); if (actor == NULL) goto out; @@ -102,12 +84,12 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, goto mark_errored; } - /* Last page may have trailing bytes not filled */ + /* Last page (if present) may have trailing bytes not filled */ bytes = res % PAGE_SIZE; - if (bytes) { - pageaddr = kmap_atomic(page[pages - 1]); + if (page[pages - 1]->index == end_index && bytes) { + pageaddr = kmap_local_page(page[pages - 1]); memset(pageaddr + bytes, 0, PAGE_SIZE - bytes); - kunmap_atomic(pageaddr); + kunmap_local(pageaddr); } /* Mark pages as uptodate, unlock and release */ @@ -140,36 +122,3 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, kfree(page); return res; } - - -static int squashfs_read_cache(struct page *target_page, u64 block, int bsize, - int pages, struct page **page, int bytes) -{ - struct inode *i = target_page->mapping->host; - struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb, - block, bsize); - int res = buffer->error, n, offset = 0; - - if (res) { - ERROR("Unable to read page, block %llx, size %x\n", block, - bsize); - goto out; - } - - for (n = 0; n < pages && bytes > 0; n++, - bytes -= PAGE_SIZE, offset += PAGE_SIZE) { - int avail = min_t(int, bytes, PAGE_SIZE); - - if (page[n] == NULL) - continue; - - squashfs_fill_page(page[n], buffer, offset, avail); - unlock_page(page[n]); - if (page[n] != target_page) - put_page(page[n]); - } - -out: - squashfs_cache_put(buffer); - return res; -} From 80a6b080446b8bdda0456f1247f3f9e5ccef2e61 Mon Sep 17 00:00:00 2001 From: Hsin-Yi Wang Date: Mon, 13 Jun 2022 16:28:00 +0800 Subject: [PATCH 23/34] Revert "squashfs: provide backing_dev_info in order to disable read-ahead" Patch series "Implement readahead for squashfs", v6. Commit 9eec1d897139("squashfs: provide backing_dev_info in order to disable read-ahead") mitigates the performance drop issue for squashfs by closing readahead for it. This patch (of 3): This reverts commit 9eec1d897139e5 ("squashfs: provide backing_dev_info in order to disable read-ahead"). Revert closing the readahead to squashfs since the readahead callback for squashfs is implemented. Link: https://lkml.kernel.org/r/20220613082802.1301238-1-hsinyi@chromium.org Link: https://lkml.kernel.org/r/20220613082802.1301238-2-hsinyi@chromium.org Suggested-by: Xiongwei Song Signed-off-by: Hsin-Yi Wang Cc: Phillip Lougher Cc: Matthew Wilcox (Oracle) Cc: Marek Szyprowski Cc: Zheng Liang Cc: Zhang Yi Cc: Hou Tao Cc: Miao Xie Cc: kernel test robot Signed-off-by: Andrew Morton --- fs/squashfs/super.c | 33 --------------------------------- 1 file changed, 33 deletions(-) diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 6d594ba2ed28ff..32565dafa7f3ba 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -29,7 +29,6 @@ #include #include #include -#include #include "squashfs_fs.h" #include "squashfs_fs_sb.h" @@ -113,24 +112,6 @@ static const struct squashfs_decompressor *supported_squashfs_filesystem( return decompressor; } -static int squashfs_bdi_init(struct super_block *sb) -{ - int err; - unsigned int major = MAJOR(sb->s_dev); - unsigned int minor = MINOR(sb->s_dev); - - bdi_put(sb->s_bdi); - sb->s_bdi = &noop_backing_dev_info; - - err = super_setup_bdi_name(sb, "squashfs_%u_%u", major, minor); - if (err) - return err; - - sb->s_bdi->ra_pages = 0; - sb->s_bdi->io_pages = 0; - - return 0; -} static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) { @@ -146,20 +127,6 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) TRACE("Entered squashfs_fill_superblock\n"); - /* - * squashfs provides 'backing_dev_info' in order to disable read-ahead. For - * squashfs, I/O is not deferred, it is done immediately in read_folio, - * which means the user would always have to wait their own I/O. So the effect - * of readahead is very weak for squashfs. squashfs_bdi_init will set - * sb->s_bdi->ra_pages and sb->s_bdi->io_pages to 0 and close readahead for - * squashfs. - */ - err = squashfs_bdi_init(sb); - if (err) { - errorf(fc, "squashfs init bdi failed"); - return err; - } - sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL); if (sb->s_fs_info == NULL) { ERROR("Failed to allocate squashfs_sb_info\n"); From dcbec26fbbe548ee5555e11b8c21084f00333c2c Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Mon, 13 Jun 2022 16:28:01 +0800 Subject: [PATCH 24/34] squashfs: always build "file direct" version of page actor Squashfs_readahead uses the "file direct" version of the page actor, and so build it unconditionally. Link: https://lkml.kernel.org/r/20220613082802.1301238-3-hsinyi@chromium.org Reported-by: kernel test robot Signed-off-by: Phillip Lougher Signed-off-by: Hsin-Yi Wang Cc: Hou Tao Cc: Marek Szyprowski Cc: Matthew Wilcox (Oracle) Cc: Miao Xie Cc: Xiongwei Song Cc: Zhang Yi Cc: Zheng Liang Signed-off-by: Andrew Morton --- fs/squashfs/Makefile | 4 ++-- fs/squashfs/page_actor.h | 46 ---------------------------------------- 2 files changed, 2 insertions(+), 48 deletions(-) diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile index 7bd9b8b856d0bf..477c89a519ee88 100644 --- a/fs/squashfs/Makefile +++ b/fs/squashfs/Makefile @@ -5,9 +5,9 @@ obj-$(CONFIG_SQUASHFS) += squashfs.o squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o -squashfs-y += namei.o super.o symlink.o decompressor.o +squashfs-y += namei.o super.o symlink.o decompressor.o page_actor.o squashfs-$(CONFIG_SQUASHFS_FILE_CACHE) += file_cache.o -squashfs-$(CONFIG_SQUASHFS_FILE_DIRECT) += file_direct.o page_actor.o +squashfs-$(CONFIG_SQUASHFS_FILE_DIRECT) += file_direct.o squashfs-$(CONFIG_SQUASHFS_DECOMP_SINGLE) += decompressor_single.o squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI) += decompressor_multi.o squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) += decompressor_multi_percpu.o diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h index 37523c54256fa7..24841d28bc0fb8 100644 --- a/fs/squashfs/page_actor.h +++ b/fs/squashfs/page_actor.h @@ -6,51 +6,6 @@ * Phillip Lougher */ -#ifndef CONFIG_SQUASHFS_FILE_DIRECT -struct squashfs_page_actor { - void **page; - int pages; - int length; - int next_page; -}; - -static inline struct squashfs_page_actor *squashfs_page_actor_init(void **page, - int pages, int length) -{ - struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL); - - if (actor == NULL) - return NULL; - - actor->length = length ? : pages * PAGE_SIZE; - actor->page = page; - actor->pages = pages; - actor->next_page = 0; - return actor; -} - -static inline void *squashfs_first_page(struct squashfs_page_actor *actor) -{ - actor->next_page = 1; - return actor->page[0]; -} - -static inline void *squashfs_next_page(struct squashfs_page_actor *actor) -{ - return actor->next_page == actor->pages ? NULL : - actor->page[actor->next_page++]; -} - -static inline void squashfs_finish_page(struct squashfs_page_actor *actor) -{ - /* empty */ -} - -static inline void squashfs_actor_nobuff(struct squashfs_page_actor *actor) -{ - /* empty */ -} -#else struct squashfs_page_actor { union { void **buffer; @@ -91,4 +46,3 @@ static inline void squashfs_actor_nobuff(struct squashfs_page_actor *actor) actor->alloc_buffer = 0; } #endif -#endif From f8f5486b21ee345fe691c8c1c86b2ecf3d9c2650 Mon Sep 17 00:00:00 2001 From: Hsin-Yi Wang Date: Mon, 13 Jun 2022 16:28:02 +0800 Subject: [PATCH 25/34] squashfs: implement readahead Implement readahead callback for squashfs. It will read datablocks which cover pages in readahead request. For a few cases it will not mark page as uptodate, including: - file end is 0. - zero filled blocks. - current batch of pages isn't in the same datablock. - decompressor error. Otherwise pages will be marked as uptodate. The unhandled pages will be updated by readpage later. Link: https://lkml.kernel.org/r/20220613082802.1301238-4-hsinyi@chromium.org Suggested-by: Matthew Wilcox Signed-off-by: Hsin-Yi Wang Reported-by: Matthew Wilcox Reported-by: Phillip Lougher Reported-by: Xiongwei Song Reported-by: Andrew Morton Cc: Hou Tao Cc: kernel test robot Cc: Marek Szyprowski Cc: Miao Xie Cc: Zhang Yi Cc: Zheng Liang Signed-off-by: Andrew Morton --- fs/squashfs/file.c | 92 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 91 insertions(+), 1 deletion(-) diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index a8e495d8eb8600..128ebe9aded87d 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -39,6 +39,7 @@ #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" +#include "page_actor.h" /* * Locate cache slot in range [offset, index] for specified inode. If @@ -495,7 +496,96 @@ static int squashfs_read_folio(struct file *file, struct folio *folio) return 0; } +static void squashfs_readahead(struct readahead_control *ractl) +{ + struct inode *inode = ractl->mapping->host; + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + size_t mask = (1UL << msblk->block_log) - 1; + unsigned short shift = msblk->block_log - PAGE_SHIFT; + loff_t start = readahead_pos(ractl) & ~mask; + size_t len = readahead_length(ractl) + readahead_pos(ractl) - start; + struct squashfs_page_actor *actor; + unsigned int nr_pages = 0; + struct page **pages; + int i, file_end = i_size_read(inode) >> msblk->block_log; + unsigned int max_pages = 1UL << shift; + + readahead_expand(ractl, start, (len | mask) + 1); + + if (file_end == 0) + return; + + pages = kmalloc_array(max_pages, sizeof(void *), GFP_KERNEL); + if (!pages) + return; + + for (;;) { + pgoff_t index; + int res, bsize; + u64 block = 0; + unsigned int expected; + + nr_pages = __readahead_batch(ractl, pages, max_pages); + if (!nr_pages) + break; + + if (readahead_pos(ractl) >= i_size_read(inode)) + goto skip_pages; + + index = pages[0]->index >> shift; + if ((pages[nr_pages - 1]->index >> shift) != index) + goto skip_pages; + + expected = index == file_end ? + (i_size_read(inode) & (msblk->block_size - 1)) : + msblk->block_size; + + bsize = read_blocklist(inode, index, &block); + if (bsize == 0) + goto skip_pages; + + actor = squashfs_page_actor_init_special(msblk, pages, nr_pages, + expected); + if (!actor) + goto skip_pages; + + res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor); + + kfree(actor); + + if (res == expected) { + int bytes; + + /* Last page (if present) may have trailing bytes not filled */ + bytes = res % PAGE_SIZE; + if (pages[nr_pages - 1]->index == file_end && bytes) + memzero_page(pages[nr_pages - 1], bytes, + PAGE_SIZE - bytes); + + for (i = 0; i < nr_pages; i++) { + flush_dcache_page(pages[i]); + SetPageUptodate(pages[i]); + } + } + + for (i = 0; i < nr_pages; i++) { + unlock_page(pages[i]); + put_page(pages[i]); + } + } + + kfree(pages); + return; + +skip_pages: + for (i = 0; i < nr_pages; i++) { + unlock_page(pages[i]); + put_page(pages[i]); + } + kfree(pages); +} const struct address_space_operations squashfs_aops = { - .read_folio = squashfs_read_folio + .read_folio = squashfs_read_folio, + .readahead = squashfs_readahead }; From f71295e16144805a787b4b2d324dd80223224d88 Mon Sep 17 00:00:00 2001 From: Antonio Borneo Date: Mon, 13 Jun 2022 12:00:55 +0200 Subject: [PATCH 26/34] checkpatch: fix incorrect camelcase detection on numeric constant The code fragment below int foo(int *array, int index) { return array[index & 0xFF]; } triggers an incorrect camelcase detection by checking a substring of the hex constant: CHECK: Avoid CamelCase: #3: FILE: test.c:3: + return array[index & 0xFF]; This is caused by passing the whole string "array[index & 0xFF]" to the inner loop that iterates over a "$Ident" match. The numeric constant is not a $Ident as it doesn't start with [A-Za-z_] and should be excluded from the match. Similar issue can be detected with other constants like "1uL", "0xffffU". Force the match to start at word boundary so the $Ident will be properly checked starting from its first char and the constants will be filtered-out. Link: https://lkml.kernel.org/r/20220613100055.77821-1-borneo.antonio@gmail.com Signed-off-by: Antonio Borneo Cc: Joe Perches Cc: Andy Whitcroft Cc: Dwaipayan Ray Cc: Lukas Bulwahn Signed-off-by: Andrew Morton --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 205bf5055acffb..79e759aac543b8 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -5721,7 +5721,7 @@ sub process { $var !~ /^(?:[a-z0-9_]*|[A-Z0-9_]*)?_?[a-z][A-Z](?:_[a-z0-9_]+|_[A-Z0-9_]+)?$/ && #Ignore some three character SI units explicitly, like MiB and KHz $var !~ /^(?:[a-z_]*?)_?(?:[KMGT]iB|[KMGT]?Hz)(?:_[a-z_]+)?$/) { - while ($var =~ m{($Ident)}g) { + while ($var =~ m{\b($Ident)}g) { my $word = $1; next if ($word !~ /[A-Z][a-z]|[a-z][A-Z]/); if ($check) { From 0c0f88e67ec3a58ceb34ba7d84a81f374c9bf024 Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Fri, 10 Jun 2022 09:57:18 +0200 Subject: [PATCH 27/34] fat: add a vfat_rename2() and make existing .rename callback a helper Patch series "fat: add support for the renameat2 RENAME_EXCHANGE flag", v6. The series adds support for the renameat2 system call RENAME_EXCHANGE flag (which allows to atomically replace two paths) to the vfat filesystem code. There are many use cases for this, but we are particularly interested in making possible for vfat filesystems to be part of OSTree [0] deployments. Currently OSTree relies on symbolic links to make the deployment updates an atomic transactional operation. But RENAME_EXCHANGE could be used [1] to achieve a similar level of robustness when using a vfat filesystem. Patch #1 is just a preparatory patch to introduce the RENAME_EXCHANGE support, patch #2 moves some code blocks in vfat_rename() to a set of helper functions, that can be reused by tvfat_rename_exchange() that's added by patch #3 and finally patch #4 adds some kselftests to test it. This patch (of 4): Currently vfat only supports the RENAME_NOREPLACE flag which is handled by the virtual file system layer but doesn't support the RENAME_EXCHANGE flag. Add a vfat_rename2() function to be used as the .rename callback and move the current vfat_rename() handler to a helper. This is in preparation for implementing the RENAME_NOREPLACE flag using a different helper function. Link: https://lkml.kernel.org/r/20220610075721.1182745-1-javierm@redhat.com Link: https://lkml.kernel.org/r/20220610075721.1182745-2-javierm@redhat.com Signed-off-by: Javier Martinez Canillas Acked-by: OGAWA Hirofumi Cc: Christian Kellner Cc: Peter Jones Cc: Chung-Chiang Cheng Cc: Lennart Poettering Cc: Alexander Larsson Cc: Colin Walters Cc: Muhammad Usama Anjum Signed-off-by: Andrew Morton --- fs/fat/namei_vfat.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index c573314806cf82..88ccb2ee3537bc 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -889,9 +889,8 @@ static int vfat_mkdir(struct user_namespace *mnt_userns, struct inode *dir, return err; } -static int vfat_rename(struct user_namespace *mnt_userns, struct inode *old_dir, - struct dentry *old_dentry, struct inode *new_dir, - struct dentry *new_dentry, unsigned int flags) +static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) { struct buffer_head *dotdot_bh; struct msdos_dir_entry *dotdot_de; @@ -902,9 +901,6 @@ static int vfat_rename(struct user_namespace *mnt_userns, struct inode *old_dir, int err, is_dir, update_dotdot, corrupt = 0; struct super_block *sb = old_dir->i_sb; - if (flags & ~RENAME_NOREPLACE) - return -EINVAL; - old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; old_inode = d_inode(old_dentry); new_inode = d_inode(new_dentry); @@ -1021,13 +1017,24 @@ static int vfat_rename(struct user_namespace *mnt_userns, struct inode *old_dir, goto out; } +static int vfat_rename2(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) +{ + if (flags & ~RENAME_NOREPLACE) + return -EINVAL; + + /* VFS already handled RENAME_NOREPLACE, handle it as a normal rename */ + return vfat_rename(old_dir, old_dentry, new_dir, new_dentry); +} + static const struct inode_operations vfat_dir_inode_operations = { .create = vfat_create, .lookup = vfat_lookup, .unlink = vfat_unlink, .mkdir = vfat_mkdir, .rmdir = vfat_rmdir, - .rename = vfat_rename, + .rename = vfat_rename2, .setattr = fat_setattr, .getattr = fat_getattr, .update_time = fat_update_time, From 8ac6399a1277e10038fbb266dab3b2de237c8873 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Fri, 10 Jun 2022 09:57:19 +0200 Subject: [PATCH 28/34] fat: factor out reusable code in vfat_rename() as helper functions The vfat_rename() function is quite big and there are code blocks that can be moved into helper functions. This not only simplify the implementation of that function but also allows these helpers to be reused. For example, the helpers can be used by the handler of the RENAME_EXCHANGE flag once this is implemented in a subsequent patch. Link: https://lkml.kernel.org/r/20220610075721.1182745-3-javierm@redhat.com Signed-off-by: OGAWA Hirofumi Signed-off-by: Javier Martinez Canillas Acked-by: OGAWA Hirofumi Cc: Alexander Larsson Cc: Christian Kellner Cc: Chung-Chiang Cheng Cc: Colin Walters Cc: Lennart Poettering Cc: Muhammad Usama Anjum Cc: Peter Jones Signed-off-by: Andrew Morton --- fs/fat/namei_vfat.c | 89 +++++++++++++++++++++++++++++---------------- 1 file changed, 57 insertions(+), 32 deletions(-) diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 88ccb2ee3537bc..9c04053a8f1cc5 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -889,16 +889,55 @@ static int vfat_mkdir(struct user_namespace *mnt_userns, struct inode *dir, return err; } +static int vfat_get_dotdot_de(struct inode *inode, struct buffer_head **bh, + struct msdos_dir_entry **de) +{ + if (S_ISDIR(inode->i_mode)) { + if (fat_get_dotdot_entry(inode, bh, de)) + return -EIO; + } + return 0; +} + +static int vfat_sync_ipos(struct inode *dir, struct inode *inode) +{ + if (IS_DIRSYNC(dir)) + return fat_sync_inode(inode); + mark_inode_dirty(inode); + return 0; +} + +static int vfat_update_dotdot_de(struct inode *dir, struct inode *inode, + struct buffer_head *dotdot_bh, + struct msdos_dir_entry *dotdot_de) +{ + fat_set_start(dotdot_de, MSDOS_I(dir)->i_logstart); + mark_buffer_dirty_inode(dotdot_bh, inode); + if (IS_DIRSYNC(dir)) + return sync_dirty_buffer(dotdot_bh); + return 0; +} + +static void vfat_update_dir_metadata(struct inode *dir, struct timespec64 *ts) +{ + inode_inc_iversion(dir); + fat_truncate_time(dir, ts, S_CTIME | S_MTIME); + if (IS_DIRSYNC(dir)) + (void)fat_sync_inode(dir); + else + mark_inode_dirty(dir); +} + static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct buffer_head *dotdot_bh; - struct msdos_dir_entry *dotdot_de; + struct msdos_dir_entry *dotdot_de = NULL; struct inode *old_inode, *new_inode; struct fat_slot_info old_sinfo, sinfo; struct timespec64 ts; loff_t new_i_pos; - int err, is_dir, update_dotdot, corrupt = 0; + int err, is_dir, corrupt = 0; struct super_block *sb = old_dir->i_sb; old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; @@ -909,15 +948,13 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, if (err) goto out; - is_dir = S_ISDIR(old_inode->i_mode); - update_dotdot = (is_dir && old_dir != new_dir); - if (update_dotdot) { - if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de)) { - err = -EIO; + if (old_dir != new_dir) { + err = vfat_get_dotdot_de(old_inode, &dotdot_bh, &dotdot_de); + if (err) goto out; - } } + is_dir = S_ISDIR(old_inode->i_mode); ts = current_time(old_dir); if (new_inode) { if (is_dir) { @@ -938,21 +975,15 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, fat_detach(old_inode); fat_attach(old_inode, new_i_pos); - if (IS_DIRSYNC(new_dir)) { - err = fat_sync_inode(old_inode); - if (err) - goto error_inode; - } else - mark_inode_dirty(old_inode); + err = vfat_sync_ipos(new_dir, old_inode); + if (err) + goto error_inode; - if (update_dotdot) { - fat_set_start(dotdot_de, MSDOS_I(new_dir)->i_logstart); - mark_buffer_dirty_inode(dotdot_bh, old_inode); - if (IS_DIRSYNC(new_dir)) { - err = sync_dirty_buffer(dotdot_bh); - if (err) - goto error_dotdot; - } + if (dotdot_de) { + err = vfat_update_dotdot_de(new_dir, old_inode, dotdot_bh, + dotdot_de); + if (err) + goto error_dotdot; drop_nlink(old_dir); if (!new_inode) inc_nlink(new_dir); @@ -962,12 +993,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, old_sinfo.bh = NULL; if (err) goto error_dotdot; - inode_inc_iversion(old_dir); - fat_truncate_time(old_dir, &ts, S_CTIME|S_MTIME); - if (IS_DIRSYNC(old_dir)) - (void)fat_sync_inode(old_dir); - else - mark_inode_dirty(old_dir); + vfat_update_dir_metadata(old_dir, &ts); if (new_inode) { drop_nlink(new_inode); @@ -987,10 +1013,9 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, /* data cluster is shared, serious corruption */ corrupt = 1; - if (update_dotdot) { - fat_set_start(dotdot_de, MSDOS_I(old_dir)->i_logstart); - mark_buffer_dirty_inode(dotdot_bh, old_inode); - corrupt |= sync_dirty_buffer(dotdot_bh); + if (dotdot_de) { + corrupt |= vfat_update_dotdot_de(old_dir, old_inode, dotdot_bh, + dotdot_de); } error_inode: fat_detach(old_inode); From f2d6daf457968aeb1a8dbf05b3b7ffe5530fe6be Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Fri, 10 Jun 2022 09:57:20 +0200 Subject: [PATCH 29/34] fat: add renameat2 RENAME_EXCHANGE flag support The renameat2 RENAME_EXCHANGE flag allows to atomically exchange two paths but is currently not supported by the Linux vfat filesystem driver. Add a vfat_rename_exchange() helper function that implements this support. The super block lock is acquired during the operation to ensure atomicity, and in the error path actions made are reversed also with the mutex held. It makes the operation as transactional as possible, within the limitation impossed by vfat due not having a journal with logs to replay. Link: https://lkml.kernel.org/r/20220610075721.1182745-4-javierm@redhat.com Signed-off-by: Javier Martinez Canillas Acked-by: OGAWA Hirofumi Cc: Alexander Larsson Cc: Christian Kellner Cc: Chung-Chiang Cheng Cc: Colin Walters Cc: Lennart Poettering Cc: Muhammad Usama Anjum Cc: Peter Jones Signed-off-by: Andrew Morton --- fs/fat/namei_vfat.c | 123 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 122 insertions(+), 1 deletion(-) diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 9c04053a8f1cc5..21620054e1c44e 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -1042,13 +1042,134 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; } +static void vfat_exchange_ipos(struct inode *old_inode, struct inode *new_inode, + loff_t old_i_pos, loff_t new_i_pos) +{ + fat_detach(old_inode); + fat_detach(new_inode); + fat_attach(old_inode, new_i_pos); + fat_attach(new_inode, old_i_pos); +} + +static void vfat_move_nlink(struct inode *src, struct inode *dst) +{ + drop_nlink(src); + inc_nlink(dst); +} + +static int vfat_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct buffer_head *old_dotdot_bh = NULL, *new_dotdot_bh = NULL; + struct msdos_dir_entry *old_dotdot_de = NULL, *new_dotdot_de = NULL; + struct inode *old_inode, *new_inode; + struct timespec64 ts = current_time(old_dir); + loff_t old_i_pos, new_i_pos; + int err, corrupt = 0; + struct super_block *sb = old_dir->i_sb; + + old_inode = d_inode(old_dentry); + new_inode = d_inode(new_dentry); + + /* Acquire super block lock for the operation to be atomic */ + mutex_lock(&MSDOS_SB(sb)->s_lock); + + /* if directories are not the same, get ".." info to update */ + if (old_dir != new_dir) { + err = vfat_get_dotdot_de(old_inode, &old_dotdot_bh, + &old_dotdot_de); + if (err) + goto out; + + err = vfat_get_dotdot_de(new_inode, &new_dotdot_bh, + &new_dotdot_de); + if (err) + goto out; + } + + old_i_pos = MSDOS_I(old_inode)->i_pos; + new_i_pos = MSDOS_I(new_inode)->i_pos; + + vfat_exchange_ipos(old_inode, new_inode, old_i_pos, new_i_pos); + + err = vfat_sync_ipos(old_dir, new_inode); + if (err) + goto error_exchange; + err = vfat_sync_ipos(new_dir, old_inode); + if (err) + goto error_exchange; + + /* update ".." directory entry info */ + if (old_dotdot_de) { + err = vfat_update_dotdot_de(new_dir, old_inode, old_dotdot_bh, + old_dotdot_de); + if (err) + goto error_old_dotdot; + } + if (new_dotdot_de) { + err = vfat_update_dotdot_de(old_dir, new_inode, new_dotdot_bh, + new_dotdot_de); + if (err) + goto error_new_dotdot; + } + + /* if cross directory and only one is a directory, adjust nlink */ + if (!old_dotdot_de != !new_dotdot_de) { + if (old_dotdot_de) + vfat_move_nlink(old_dir, new_dir); + else + vfat_move_nlink(new_dir, old_dir); + } + + vfat_update_dir_metadata(old_dir, &ts); + /* if directories are not the same, update new_dir as well */ + if (old_dir != new_dir) + vfat_update_dir_metadata(new_dir, &ts); + +out: + brelse(old_dotdot_bh); + brelse(new_dotdot_bh); + mutex_unlock(&MSDOS_SB(sb)->s_lock); + + return err; + +error_new_dotdot: + if (new_dotdot_de) { + corrupt |= vfat_update_dotdot_de(new_dir, new_inode, + new_dotdot_bh, new_dotdot_de); + } + +error_old_dotdot: + if (old_dotdot_de) { + corrupt |= vfat_update_dotdot_de(old_dir, old_inode, + old_dotdot_bh, old_dotdot_de); + } + +error_exchange: + vfat_exchange_ipos(old_inode, new_inode, new_i_pos, old_i_pos); + corrupt |= vfat_sync_ipos(new_dir, new_inode); + corrupt |= vfat_sync_ipos(old_dir, old_inode); + + if (corrupt < 0) { + fat_fs_error(new_dir->i_sb, + "%s: Filesystem corrupted (i_pos %lld, %lld)", + __func__, old_i_pos, new_i_pos); + } + goto out; +} + static int vfat_rename2(struct user_namespace *mnt_userns, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { - if (flags & ~RENAME_NOREPLACE) + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) return -EINVAL; + if (flags & RENAME_EXCHANGE) { + return vfat_rename_exchange(old_dir, old_dentry, + new_dir, new_dentry); + } + /* VFS already handled RENAME_NOREPLACE, handle it as a normal rename */ return vfat_rename(old_dir, old_dentry, new_dir, new_dentry); } From 0f96a01d0ed12c93627110e2b3cf0c75c6b1de46 Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Fri, 10 Jun 2022 09:57:21 +0200 Subject: [PATCH 30/34] selftests/filesystems: add a vfat RENAME_EXCHANGE test Add a test for the renameat2 RENAME_EXCHANGE support in vfat, but split it in a tool that just does the rename exchange and a script that is run by the kselftests framework on `make TARGETS="filesystems/fat" kselftest`. That way the script can be easily extended to test other file operations. The script creates a 1 MiB disk image, that is then formated with a vfat filesystem and mounted using a loop device. That way all file operations are done on an ephemeral filesystem. Link: https://lkml.kernel.org/r/20220610075721.1182745-5-javierm@redhat.com Signed-off-by: Javier Martinez Canillas Acked-by: Muhammad Usama Anjum Acked-by: OGAWA Hirofumi Cc: Alexander Larsson Cc: Christian Kellner Cc: Chung-Chiang Cheng Cc: Colin Walters Cc: Lennart Poettering Cc: Peter Jones Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + tools/testing/selftests/Makefile | 1 + .../selftests/filesystems/fat/.gitignore | 2 + .../selftests/filesystems/fat/Makefile | 7 ++ .../testing/selftests/filesystems/fat/config | 2 + .../filesystems/fat/rename_exchange.c | 37 +++++++++ .../filesystems/fat/run_fat_tests.sh | 82 +++++++++++++++++++ 7 files changed, 132 insertions(+) create mode 100644 tools/testing/selftests/filesystems/fat/.gitignore create mode 100644 tools/testing/selftests/filesystems/fat/Makefile create mode 100644 tools/testing/selftests/filesystems/fat/config create mode 100644 tools/testing/selftests/filesystems/fat/rename_exchange.c create mode 100644 tools/testing/selftests/filesystems/fat/run_fat_tests.sh diff --git a/MAINTAINERS b/MAINTAINERS index 1fc9ead83d2aa3..addcc0cca3211a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -20891,6 +20891,7 @@ M: OGAWA Hirofumi S: Maintained F: Documentation/filesystems/vfat.rst F: fs/fat/ +F: tools/testing/selftests/filesystems/fat/ VFIO DRIVER M: Alex Williamson diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index de11992dc57763..67668a9fa115ed 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -17,6 +17,7 @@ TARGETS += exec TARGETS += filesystems TARGETS += filesystems/binderfs TARGETS += filesystems/epoll +TARGETS += filesystems/fat TARGETS += firmware TARGETS += fpu TARGETS += ftrace diff --git a/tools/testing/selftests/filesystems/fat/.gitignore b/tools/testing/selftests/filesystems/fat/.gitignore new file mode 100644 index 00000000000000..b89920ed841cce --- /dev/null +++ b/tools/testing/selftests/filesystems/fat/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +rename_exchange diff --git a/tools/testing/selftests/filesystems/fat/Makefile b/tools/testing/selftests/filesystems/fat/Makefile new file mode 100644 index 00000000000000..902033f6ef098b --- /dev/null +++ b/tools/testing/selftests/filesystems/fat/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0 + +TEST_PROGS := run_fat_tests.sh +TEST_GEN_PROGS_EXTENDED := rename_exchange +CFLAGS += -O2 -g -Wall $(KHDR_INCLUDES) + +include ../../lib.mk diff --git a/tools/testing/selftests/filesystems/fat/config b/tools/testing/selftests/filesystems/fat/config new file mode 100644 index 00000000000000..6cf95e787a17b5 --- /dev/null +++ b/tools/testing/selftests/filesystems/fat/config @@ -0,0 +1,2 @@ +CONFIG_BLK_DEV_LOOP=y +CONFIG_VFAT_FS=y diff --git a/tools/testing/selftests/filesystems/fat/rename_exchange.c b/tools/testing/selftests/filesystems/fat/rename_exchange.c new file mode 100644 index 00000000000000..e488ad354fce4f --- /dev/null +++ b/tools/testing/selftests/filesystems/fat/rename_exchange.c @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Program that atomically exchanges two paths using + * the renameat2() system call RENAME_EXCHANGE flag. + * + * Copyright 2022 Red Hat Inc. + * Author: Javier Martinez Canillas + */ + +#define _GNU_SOURCE +#include +#include +#include + +void print_usage(const char *program) +{ + printf("Usage: %s [oldpath] [newpath]\n", program); + printf("Atomically exchange oldpath and newpath\n"); +} + +int main(int argc, char *argv[]) +{ + int ret; + + if (argc != 3) { + print_usage(argv[0]); + exit(EXIT_FAILURE); + } + + ret = renameat2(AT_FDCWD, argv[1], AT_FDCWD, argv[2], RENAME_EXCHANGE); + if (ret) { + perror("rename exchange failed"); + exit(EXIT_FAILURE); + } + + exit(EXIT_SUCCESS); +} diff --git a/tools/testing/selftests/filesystems/fat/run_fat_tests.sh b/tools/testing/selftests/filesystems/fat/run_fat_tests.sh new file mode 100644 index 00000000000000..7f35dc3d15dfa6 --- /dev/null +++ b/tools/testing/selftests/filesystems/fat/run_fat_tests.sh @@ -0,0 +1,82 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Run filesystem operations tests on an 1 MiB disk image that is formatted with +# a vfat filesystem and mounted in a temporary directory using a loop device. +# +# Copyright 2022 Red Hat Inc. +# Author: Javier Martinez Canillas + +set -e +set -u +set -o pipefail + +BASE_DIR="$(dirname $0)" +TMP_DIR="$(mktemp -d /tmp/fat_tests_tmp.XXXX)" +IMG_PATH="${TMP_DIR}/fat.img" +MNT_PATH="${TMP_DIR}/mnt" + +cleanup() +{ + mountpoint -q "${MNT_PATH}" && unmount_image + rm -rf "${TMP_DIR}" +} +trap cleanup SIGINT SIGTERM EXIT + +create_loopback() +{ + touch "${IMG_PATH}" + chattr +C "${IMG_PATH}" >/dev/null 2>&1 || true + + truncate -s 1M "${IMG_PATH}" + mkfs.vfat "${IMG_PATH}" >/dev/null 2>&1 +} + +mount_image() +{ + mkdir -p "${MNT_PATH}" + sudo mount -o loop "${IMG_PATH}" "${MNT_PATH}" +} + +rename_exchange_test() +{ + local rename_exchange="${BASE_DIR}/rename_exchange" + local old_path="${MNT_PATH}/old_file" + local new_path="${MNT_PATH}/new_file" + + echo old | sudo tee "${old_path}" >/dev/null 2>&1 + echo new | sudo tee "${new_path}" >/dev/null 2>&1 + sudo "${rename_exchange}" "${old_path}" "${new_path}" >/dev/null 2>&1 + sudo sync -f "${MNT_PATH}" + grep new "${old_path}" >/dev/null 2>&1 + grep old "${new_path}" >/dev/null 2>&1 +} + +rename_exchange_subdir_test() +{ + local rename_exchange="${BASE_DIR}/rename_exchange" + local dir_path="${MNT_PATH}/subdir" + local old_path="${MNT_PATH}/old_file" + local new_path="${dir_path}/new_file" + + sudo mkdir -p "${dir_path}" + echo old | sudo tee "${old_path}" >/dev/null 2>&1 + echo new | sudo tee "${new_path}" >/dev/null 2>&1 + sudo "${rename_exchange}" "${old_path}" "${new_path}" >/dev/null 2>&1 + sudo sync -f "${MNT_PATH}" + grep new "${old_path}" >/dev/null 2>&1 + grep old "${new_path}" >/dev/null 2>&1 +} + +unmount_image() +{ + sudo umount "${MNT_PATH}" &> /dev/null +} + +create_loopback +mount_image +rename_exchange_test +rename_exchange_subdir_test +unmount_image + +exit 0 From 803d912fd2d4da0c843e81f253f39eee36cc62e3 Mon Sep 17 00:00:00 2001 From: wuchi Date: Sun, 12 Jun 2022 13:20:15 +0800 Subject: [PATCH 31/34] lib/error-inject: convert to DEFINE_SEQ_ATTRIBUTE Use DEFINE_SEQ_ATTRIBUTE helper macro to simplify the code. Link: https://lkml.kernel.org/r/20220612052015.23283-1-wuchi.zero@gmail.com Signed-off-by: wuchi Cc: Masami Hiramatsu (Google) Cc: Martin KaFai Lau Cc: Song Liu Cc: Yonghong Song Cc: John Fastabend Cc: KP Singh Signed-off-by: Andrew Morton --- lib/error-inject.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/lib/error-inject.c b/lib/error-inject.c index 2ff5ef689d7271..4a4f1278c41916 100644 --- a/lib/error-inject.c +++ b/lib/error-inject.c @@ -197,24 +197,14 @@ static int ei_seq_show(struct seq_file *m, void *v) return 0; } -static const struct seq_operations ei_seq_ops = { +static const struct seq_operations ei_sops = { .start = ei_seq_start, .next = ei_seq_next, .stop = ei_seq_stop, .show = ei_seq_show, }; -static int ei_open(struct inode *inode, struct file *filp) -{ - return seq_open(filp, &ei_seq_ops); -} - -static const struct file_operations debugfs_ei_ops = { - .open = ei_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; +DEFINE_SEQ_ATTRIBUTE(ei); static int __init ei_debugfs_init(void) { @@ -224,7 +214,7 @@ static int __init ei_debugfs_init(void) if (!dir) return -ENOMEM; - file = debugfs_create_file("list", 0444, dir, NULL, &debugfs_ei_ops); + file = debugfs_create_file("list", 0444, dir, NULL, &ei_fops); if (!file) { debugfs_remove(dir); return -ENOMEM; From 3672e34988603e2bf1e11e9f35b1504d7a95ac46 Mon Sep 17 00:00:00 2001 From: Stephen Brennan Date: Mon, 16 May 2022 17:05:07 -0700 Subject: [PATCH 32/34] kallsyms: move declarations to internal header Patch series "Expose kallsyms data in vmcoreinfo note". The kernel can be configured to contain a lot of introspection or debugging information built-in, such as ORC for unwinding stack traces, BTF for type information, and of course kallsyms. Debuggers could use this information to navigate a core dump or live system, but they need to be able to find it. This patch series adds the necessary symbols into vmcoreinfo, which would allow a debugger to find and interpret the kallsyms table. Using the kallsyms data, the debugger can then lookup any symbol, allowing it to find ORC, BTF, or any other useful data. This would allow a live kernel, or core dump, to be debugged without any DWARF debuginfo. This is useful for many cases: the debuginfo may not have been generated, or you may not want to deploy the large files everywhere you need them. I've demonstrated a proof of concept for this at LSF/MM+BPF during a lighting talk. Using a work-in-progress branch of the drgn debugger, and an extended set of BTF generated by a patched version of dwarves, I've been able to open a core dump without any DWARF info and do basic tasks such as enumerating slab caches, block devices, tasks, and doing backtraces. I hope this series can be a first step toward a new possibility of "DWARFless debugging". Related discussion around the BTF side of this: https://lore.kernel.org/bpf/586a6288-704a-f7a7-b256-e18a675927df@oracle.com/T/#u Some work-in-progress branches using this feature: https://github.com/brenns10/dwarves/tree/remove_percpu_restriction_1 https://github.com/brenns10/drgn/tree/kallsyms_plus_btf This patch (of 2): To include kallsyms data in the vmcoreinfo note, we must make the symbol declarations visible outside of kallsyms.c. Move these to a new internal header file. Link: https://lkml.kernel.org/r/20220517000508.777145-1-stephen.s.brennan@oracle.com Link: https://lkml.kernel.org/r/20220517000508.777145-2-stephen.s.brennan@oracle.com Signed-off-by: Stephen Brennan Acked-by: Baoquan He Cc: Nick Desaulniers Cc: Dave Young Cc: Kees Cook Cc: Jiri Olsa Cc: Stephen Boyd Cc: Bixuan Cui Cc: David Vernet Cc: Vivek Goyal Cc: Sami Tolvanen Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton --- kernel/kallsyms.c | 23 +---------------------- kernel/kallsyms_internal.h | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 22 deletions(-) create mode 100644 kernel/kallsyms_internal.h diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index fbdf8d3279aca3..510fba0ba5b4a5 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -31,28 +31,7 @@ #include #include -/* - * These will be re-linked against their real values - * during the second link stage. - */ -extern const unsigned long kallsyms_addresses[] __weak; -extern const int kallsyms_offsets[] __weak; -extern const u8 kallsyms_names[] __weak; - -/* - * Tell the compiler that the count isn't in the small data section if the arch - * has one (eg: FRV). - */ -extern const unsigned int kallsyms_num_syms -__section(".rodata") __attribute__((weak)); - -extern const unsigned long kallsyms_relative_base -__section(".rodata") __attribute__((weak)); - -extern const char kallsyms_token_table[] __weak; -extern const u16 kallsyms_token_index[] __weak; - -extern const unsigned int kallsyms_markers[] __weak; +#include "kallsyms_internal.h" /* * Expand a compressed symbol data into the resulting uncompressed string, diff --git a/kernel/kallsyms_internal.h b/kernel/kallsyms_internal.h new file mode 100644 index 00000000000000..2d0c6f2f0243a2 --- /dev/null +++ b/kernel/kallsyms_internal.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef LINUX_KALLSYMS_INTERNAL_H_ +#define LINUX_KALLSYMS_INTERNAL_H_ + +#include + +/* + * These will be re-linked against their real values + * during the second link stage. + */ +extern const unsigned long kallsyms_addresses[] __weak; +extern const int kallsyms_offsets[] __weak; +extern const u8 kallsyms_names[] __weak; + +/* + * Tell the compiler that the count isn't in the small data section if the arch + * has one (eg: FRV). + */ +extern const unsigned int kallsyms_num_syms +__section(".rodata") __attribute__((weak)); + +extern const unsigned long kallsyms_relative_base +__section(".rodata") __attribute__((weak)); + +extern const char kallsyms_token_table[] __weak; +extern const u16 kallsyms_token_index[] __weak; + +extern const unsigned int kallsyms_markers[] __weak; + +#endif // LINUX_KALLSYMS_INTERNAL_H_ From d3a360df501fc6e202022fea6984e731ae936a10 Mon Sep 17 00:00:00 2001 From: Stephen Brennan Date: Mon, 16 May 2022 17:05:08 -0700 Subject: [PATCH 33/34] vmcoreinfo: include kallsyms symbols The internal kallsyms tables contain information which could be quite useful to a debugging tool in the absence of other debuginfo. If kallsyms is enabled, then a debugging tool could parse it and use it as a fallback symbol table. Combined with BTF data, live & post-mortem debuggers can support basic operations without needing a large DWARF debuginfo file available. As many as five symbols are necessary to properly parse kallsyms names and addresses. Add these to the vmcoreinfo note. CONFIG_KALLSYMS_ABSOLUTE_PERCPU does impact the computation of symbol addresses. However, a debugger can infer this configuration value by comparing the address of _stext in the vmcoreinfo with the address computed via kallsyms. So there's no need to include information about this config value in the vmcoreinfo note. To verify that we're still well below the maximum of 4096 bytes, I created a script[1] to compute a rough upper bound on the possible size of vmcoreinfo. On v5.18-rc7, the script reports 3106 bytes, and with this patch, the maximum become 3370 bytes. [1]: https://github.com/brenns10/kernel_stuff/blob/master/vmcoreinfosize/ Link: https://lkml.kernel.org/r/20220517000508.777145-3-stephen.s.brennan@oracle.com Signed-off-by: Stephen Brennan Acked-by: Baoquan He Cc: Bixuan Cui Cc: Dave Young Cc: David Vernet Cc: "Eric W. Biederman" Cc: Jiri Olsa Cc: Kees Cook Cc: Nick Desaulniers Cc: Sami Tolvanen Cc: Stephen Boyd Cc: Vivek Goyal Signed-off-by: Andrew Morton --- kernel/crash_core.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 71122e01623cc2..f64d35e2841198 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -15,6 +15,8 @@ #include +#include "kallsyms_internal.h" + /* vmcoreinfo stuff */ unsigned char *vmcoreinfo_data; size_t vmcoreinfo_size; @@ -480,6 +482,18 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE); #endif +#ifdef CONFIG_KALLSYMS + VMCOREINFO_SYMBOL(kallsyms_names); + VMCOREINFO_SYMBOL(kallsyms_token_table); + VMCOREINFO_SYMBOL(kallsyms_token_index); +#ifdef CONFIG_KALLSYMS_BASE_RELATIVE + VMCOREINFO_SYMBOL(kallsyms_offsets); + VMCOREINFO_SYMBOL(kallsyms_relative_base); +#else + VMCOREINFO_SYMBOL(kallsyms_addresses); +#endif /* CONFIG_KALLSYMS_BASE_RELATIVE */ +#endif /* CONFIG_KALLSYMS */ + arch_crash_save_vmcoreinfo(); update_vmcoreinfo_note(); From 63825aa6d2b0bda447b739dcfa7c4cdb6ff65c83 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 15 Jun 2022 14:22:06 +0300 Subject: [PATCH 34/34] proc: delete unused includes Those aren't necessary after seq files won. Link: https://lkml.kernel.org/r/YqnA3mS7KBt8Z4If@localhost.localdomain Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton --- fs/proc/array.c | 1 - fs/proc/inode.c | 2 -- fs/proc/kmsg.c | 1 - fs/proc/nommu.c | 1 - fs/proc/proc_net.c | 3 --- fs/proc/proc_tty.c | 2 -- fs/proc/root.c | 3 --- fs/proc/vmcore.c | 1 - 8 files changed, 14 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index eb815759842ce4..65fa603422e04d 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -69,7 +69,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 73aeb4e6d32e51..fd40d60169b5a2 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -26,8 +26,6 @@ #include #include -#include - #include "internal.h" static void proc_evict_inode(struct inode *inode) diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index b38ad552887fb5..592e6dc7c1102b 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -15,7 +15,6 @@ #include #include -#include #include extern wait_queue_head_t log_wait; diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index 13452b32e2bd57..4d3493579458f0 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include "internal.h" diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 913e5acefbb66b..bbce6fbe779c8c 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -8,9 +8,6 @@ * * proc net directory handling functions */ - -#include - #include #include #include diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c index c69ff191e5d8f5..5c6a5ceab2f1b7 100644 --- a/fs/proc/proc_tty.c +++ b/fs/proc/proc_tty.c @@ -4,8 +4,6 @@ * * Copyright 1997, Theodore Ts'o */ - -#include #include #include #include diff --git a/fs/proc/root.c b/fs/proc/root.c index c7e3b1350ef84f..5a7d15d197f8e0 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -6,9 +6,6 @@ * * proc root directory handling functions */ - -#include - #include #include #include diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 4eaeb645e75966..f2aa86c421f2d4 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include