eventfd: support delayed wakeup for non-semaphore eventfd to reduce cpu utilization

For a non-semaphore eventfd, if its counter has a nonzero value, then a
read(2) returns 8 bytes containing that value, and the counter's value
is reset to zero. Therefore, in the non-semaphore scenario, N eventfd
writes can be consumed by a single eventfd read.
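
As a brief illustration of these semantics (a minimal standalone sketch, not
part of the patch or of the test program below), several write(2) calls
accumulate in the counter, and a single read(2) returns the accumulated value
and resets it to zero:

 #include <assert.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <unistd.h>
 #include <sys/eventfd.h>

int main(void)
{
	uint64_t val;
	int fd, i;

	fd = eventfd(0, EFD_CLOEXEC);	/* non-semaphore, blocking */
	assert(fd >= 0);

	/* Three writes of 1 accumulate in the counter. */
	for (i = 0; i < 3; i++) {
		val = 1;
		assert(write(fd, &val, sizeof(val)) == sizeof(val));
	}

	/* One read returns the accumulated value (3) and resets the counter. */
	assert(read(fd, &val, sizeof(val)) == sizeof(val));
	printf("read %llu\n", (unsigned long long)val);

	close(fd);
	return 0;
}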

However, the current implementation wakes up the reading thread immediately
in eventfd_write, so cpu utilization increases unnecessarily.

By adding a configurable delay after eventfd_write, these unnecessary
wakeup operations are avoided, thereby reducing cpu utilization.
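
In simplified form (a condensed sketch of the fs/eventfd.c hunk shown in the
diff below, not additional code), the patched wakeup decision in
eventfd_write() becomes:

	if (waitqueue_active(&ctx->wqh)) {
		if (ctx->flags & EFD_SEMAPHORE) {
			/* Semaphore eventfds keep the immediate wakeup. */
			wake_up_locked_poll(&ctx->wqh, EPOLLIN);
		} else if (eventfd_wake_delay_jiffies) {
			/*
			 * Non-semaphore with a configured delay: defer the
			 * wakeup; a single pending delayed work item
			 * coalesces the wakeups for many writes.
			 */
			if (!delayed_work_pending(&ctx->dwork))
				queue_delayed_work(system_unbound_wq,
						   &ctx->dwork,
						   eventfd_wake_delay_jiffies);
		} else {
			/* Delay disabled (the default): behave as before. */
			wake_up_locked_poll(&ctx->wqh, EPOLLIN);
		}
	}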

We used the following test code:

 #include <assert.h>
 #include <errno.h>
 #include <unistd.h>
 #include <stdio.h>
 #include <string.h>
 #include <poll.h>
 #include <sys/eventfd.h>
 #include <sys/prctl.h>

/* Writer: continuously increments a counter and writes it to the eventfd. */
void publish(int fd)
{
	unsigned long long i = 0;
	int ret;

	prctl(PR_SET_NAME,"publish");
	while (1) {
		i++;
		ret = write(fd, &i, sizeof(i));
		if (ret < 0)
			printf("XXX: write error: %s\n", strerror(errno));
	}
}

/* Reader: polls the eventfd and drains the accumulated count on POLLIN. */
void subscribe(int fd)
{
	unsigned long long i = 0;
	struct pollfd pfds[1];
	int ret;

	prctl(PR_SET_NAME,"subscribe");
	pfds[0].fd = fd;
	pfds[0].events = POLLIN;

	usleep(10);
	while(1) {
		ret = poll(pfds, 1, -1);
		if (ret == -1)
			printf("XXX: poll error: %s\n", strerror(errno));
		if(pfds[0].revents & POLLIN)
			read(fd, &i, sizeof(i));
	}
}

int main(int argc, char *argv[])
{
	pid_t pid;
	int fd;

	fd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);	/* non-semaphore eventfd */
	assert(fd >= 0);

	pid = fork();
	if (pid == 0)
		subscribe(fd);
	else if (pid > 0)
		publish(fd);
	else {
		printf("XXX: fork error!\n");
		return -1;
	}

	return 0;
}

 # taskset -c 2-3 ./a.out

The original cpu usage is as follows:
07:02:55 PM  CPU    %usr   %nice    %sys %iowait    %irq   %soft  %steal  %guest  %gnice   %idle
07:02:57 PM  all   16.43    0.00   16.28    0.16    0.00    0.00    0.00    0.00    0.00   67.14
07:02:57 PM    0    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00  100.00
07:02:57 PM    1    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00  100.00
07:02:57 PM    2   29.21    0.00   34.83    1.12    0.00    0.00    0.00    0.00    0.00   34.83
07:02:57 PM    3   51.97    0.00   48.03    0.00    0.00    0.00    0.00    0.00    0.00    0.00

07:02:57 PM  CPU    %usr   %nice    %sys %iowait    %irq   %soft  %steal  %guest  %gnice   %idle
07:02:59 PM  all   18.75    0.00   17.47    2.56    0.00    0.32    0.00    0.00    0.00   60.90
07:02:59 PM    0    6.88    0.00    1.59    5.82    0.00    0.00    0.00    0.00    0.00   85.71
07:02:59 PM    1    1.04    0.00    1.04    2.59    0.00    0.00    0.00    0.00    0.00   95.34
07:02:59 PM    2   26.09    0.00   35.87    0.00    0.00    1.09    0.00    0.00    0.00   36.96
07:02:59 PM    3   52.00    0.00   47.33    0.00    0.00    0.67    0.00    0.00    0.00    0.00

07:02:59 PM  CPU    %usr   %nice    %sys %iowait    %irq   %soft  %steal  %guest  %gnice   %idle
07:03:01 PM  all   16.15    0.00   16.77    0.00    0.00    0.00    0.00    0.00    0.00   67.08
07:03:01 PM    0    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00  100.00
07:03:01 PM    1    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00  100.00
07:03:01 PM    2   27.47    0.00   36.26    0.00    0.00    0.00    0.00    0.00    0.00   36.26
07:03:01 PM    3   51.30    0.00   48.70    0.00    0.00    0.00    0.00    0.00    0.00    0.00

Then, set the new control parameter as follows:
echo 5 > /proc/sys/fs/eventfd_wakeup_delay_msec

The cpu usage was then observed to decrease by more than 20% (cpu #2: 26% -> 0.x%), as follows:

07:03:01 PM  CPU    %usr   %nice    %sys %iowait    %irq   %soft  %steal  %guest  %gnice   %idle
07:03:03 PM  all   10.31    0.00    8.36    0.00    0.00    0.00    0.00    0.00    0.00   81.34
07:03:03 PM    0    0.00    0.00    1.01    0.00    0.00    0.00    0.00    0.00    0.00   98.99
07:03:03 PM    1    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00  100.00
07:03:03 PM    2    0.52    0.00    1.05    0.00    0.00    0.00    0.00    0.00    0.00   98.43
07:03:03 PM    3   56.59    0.00   43.41    0.00    0.00    0.00    0.00    0.00    0.00    0.00

07:03:03 PM  CPU    %usr   %nice    %sys %iowait    %irq   %soft  %steal  %guest  %gnice   %idle
07:03:05 PM  all   10.61    0.00    7.82    0.00    0.00    0.00    0.00    0.00    0.00   81.56
07:03:05 PM    0    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00  100.00
07:03:05 PM    1    0.00    0.00    1.01    0.00    0.00    0.00    0.00    0.00    0.00   98.99
07:03:05 PM    2    0.53    0.00    0.53    0.00    0.00    0.00    0.00    0.00    0.00   98.94
07:03:05 PM    3   58.59    0.00   41.41    0.00    0.00    0.00    0.00    0.00    0.00    0.00

07:03:05 PM  CPU    %usr   %nice    %sys %iowait    %irq   %soft  %steal  %guest  %gnice   %idle
07:03:07 PM  all    8.99    0.00    7.25    0.72    0.00    0.00    0.00    0.00    0.00   83.04
07:03:07 PM    0    0.00    0.00    1.52    2.53    0.00    0.00    0.00    0.00    0.00   95.96
07:03:07 PM    1    0.00    0.00    0.50    0.00    0.00    0.00    0.00    0.00    0.00   99.50
07:03:07 PM    2    0.54    0.00    0.54    0.00    0.00    0.00    0.00    0.00    0.00   98.92
07:03:07 PM    3   57.55    0.00   42.45    0.00    0.00    0.00    0.00    0.00    0.00    0.00

Signed-off-by: Wen Yang <wenyang.linux@foxmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dylan Yudaken <dylany@fb.com>
Cc: David Woodhouse <dwmw@amazon.co.uk>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Fu Wei <wefu@redhat.com>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
taskset authored and intel-lab-lkp committed Apr 16, 2023
1 parent d4425d3 commit ea9214e
Showing 3 changed files with 109 additions and 1 deletion.
13 changes: 13 additions & 0 deletions Documentation/admin-guide/sysctl/fs.rst
@@ -70,6 +70,19 @@ negative dentries which do not map to any files. Instead,
they help speeding up rejection of non-existing files provided
by the users.

eventfd_wakeup_delay_msec
-------------------------
Frequent writes to an eventfd can lead to frequent wakeups of the peer
read process, resulting in significant cpu overhead.
However, for a non-semaphore eventfd, if its counter has a nonzero value,
then a read(2) returns 8 bytes containing that value, and the counter's
value is reset to zero.
So it can be optimized as follows: N eventfd writes can be consumed by a
single eventfd read.
By adding a configurable delay after eventfd_write, these unnecessary
wakeup operations are avoided. The max value is 100 ms.

Default: 0

file-max & file-nr
------------------
78 changes: 77 additions & 1 deletion fs/eventfd.c
@@ -41,6 +41,9 @@ struct eventfd_ctx {
__u64 count;
unsigned int flags;
int id;
#ifdef CONFIG_EVENTFD_WAKEUP_DELAY
struct delayed_work dwork;
#endif
};

__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, unsigned mask)
@@ -95,6 +98,9 @@ static void eventfd_free_ctx(struct eventfd_ctx *ctx)
{
if (ctx->id >= 0)
ida_simple_remove(&eventfd_ida, ctx->id);
#ifdef CONFIG_EVENTFD_WAKEUP_DELAY
flush_delayed_work(&ctx->dwork);
#endif
kfree(ctx);
}

@@ -256,6 +262,28 @@ static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to)
return sizeof(ucnt);
}

#ifdef CONFIG_EVENTFD_WAKEUP_DELAY

static unsigned long eventfd_wake_delay_jiffies;

static void eventfd_delayed_workfn(struct work_struct *work)
{
struct delayed_work *dwork = to_delayed_work(work);
struct eventfd_ctx *ctx = container_of(dwork, struct eventfd_ctx, dwork);

spin_lock_irq(&ctx->wqh.lock);
current->in_eventfd = 1;
if (ctx->count) {
/* waitqueue_active is safe because ctx->wqh.lock is being held here. */
if (waitqueue_active(&ctx->wqh))
wake_up_locked_poll(&ctx->wqh, EPOLLIN);
}
current->in_eventfd = 0;
spin_unlock_irq(&ctx->wqh.lock);
}

#endif

static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
loff_t *ppos)
{
@@ -282,8 +310,27 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
if (likely(res > 0)) {
ctx->count += ucnt;
current->in_eventfd = 1;
if (waitqueue_active(&ctx->wqh))

/* waitqueue_active is safe because ctx->wqh.lock is being held here. */
if (waitqueue_active(&ctx->wqh)) {
#ifdef CONFIG_EVENTFD_WAKEUP_DELAY
if (ctx->flags & EFD_SEMAPHORE)
wake_up_locked_poll(&ctx->wqh, EPOLLIN);
else {
unsigned long delay = eventfd_wake_delay_jiffies;

if (delay) {
if (!delayed_work_pending(&ctx->dwork))
queue_delayed_work(system_unbound_wq,
&ctx->dwork, delay);
} else
wake_up_locked_poll(&ctx->wqh, EPOLLIN);
}
#else
wake_up_locked_poll(&ctx->wqh, EPOLLIN);
#endif
}

current->in_eventfd = 0;
}
spin_unlock_irq(&ctx->wqh.lock);
@@ -406,6 +453,9 @@ static int do_eventfd(unsigned int count, int flags)
ctx->count = count;
ctx->flags = flags;
ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL);
#ifdef CONFIG_EVENTFD_WAKEUP_DELAY
INIT_DELAYED_WORK(&ctx->dwork, eventfd_delayed_workfn);
#endif

flags &= EFD_SHARED_FCNTL_FLAGS;
flags |= O_RDWR;
@@ -438,3 +488,29 @@ SYSCALL_DEFINE1(eventfd, unsigned int, count)
return do_eventfd(count, 0);
}

#ifdef CONFIG_EVENTFD_WAKEUP_DELAY

static const unsigned long eventfd_wake_delay_max = HZ / 10;

static struct ctl_table fs_eventfd_ctl[] = {
{
.procname = "eventfd_wakeup_delay_msec",
.data = &eventfd_wake_delay_jiffies,
.maxlen = sizeof(eventfd_wake_delay_jiffies),
.mode = 0644,
.proc_handler = proc_doulongvec_ms_jiffies_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = (void *)&eventfd_wake_delay_max,
},
{ }
};

static int __init init_fs_eventfd_sysctls(void)
{
register_sysctl_init("fs", fs_eventfd_ctl);
return 0;
}

fs_initcall(init_fs_eventfd_sysctls);

#endif /* CONFIG_EVENTFD_WAKEUP_DELAY */
19 changes: 19 additions & 0 deletions init/Kconfig
@@ -1691,6 +1691,25 @@ config EVENTFD

If unsure, say Y.

if EVENTFD
config EVENTFD_WAKEUP_DELAY
bool "support delayed wakeup for the non-semaphore eventfd" if EXPERT
default n
depends on SYSCTL
help
This option enables the delayed wakeup for the non-semaphore eventfd.
Frequent writes to an eventfd can lead to frequent wakeups of the
peer read process, resulting in significant cpu overhead.
However, for the non-semaphore eventfd, if its counter has a
nonzero value, then a read(2) returns 8 bytes containing that
value, and the counter's value is reset to zero.
By adding a configurable delay after eventfd_write, these unnecessary
wakeup operations are avoided.

If unsure, say N.

endif # EVENTFD

config SHMEM
bool "Use full shmem filesystem" if EXPERT
default y
