Skip to content

Commit

Permalink
block: Change the granularity of io ticks from ms to ns
Browse files Browse the repository at this point in the history
Problem Desc
============
The "iostat" user-space utility was showing %util as 100% for the disks
which has latencies less than a milli-second i.e for latencies in the
range of micro-seconds and below.

Root Cause
==========
The IO accounting in block layer is currently done by updating the
io_ticks in jiffies which is of milli-seconds granularity. Due to this,
for the devices with IO latencies less than a milli-second, the latency
will be accounted as 1 milli-second even-though its in the range of
micro-seconds. This was causing the iostat command to show %util
as 100% which is incorrect.

Recreationg of the issue
========================
Setup
-----
Devices: NVMe 24 devices
Model number: 4610 (Intel)

fio
  • Loading branch information
gulams authored and intel-lab-lkp committed Dec 21, 2022
1 parent 254be08 commit afff703
Show file tree
Hide file tree
Showing 8 changed files with 103 additions and 16 deletions.
26 changes: 22 additions & 4 deletions block/blk-core.c
Expand Up @@ -927,6 +927,18 @@ int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob,
}
EXPORT_SYMBOL_GPL(iocb_bio_iopoll);

/*
* Get the time based upon the available granularity for io accounting
* If the resolution mode is set to precise (2) i.e
* (/sys/block/<device>/queue/iostats = 2), then this will return time
* in nano-seconds else this will return time in jiffies
*/
u64 blk_get_iostat_ticks(struct request_queue *q)
{
return (blk_queue_precise_io_stat(q) ? ktime_get_ns() : jiffies);
}
EXPORT_SYMBOL_GPL(blk_get_iostat_ticks);

void update_io_ticks(struct block_device *part, u64 now, bool end)
{
u64 stamp;
Expand Down Expand Up @@ -968,20 +980,26 @@ EXPORT_SYMBOL(bdev_start_io_acct);
u64 bio_start_io_acct(struct bio *bio)
{
return bdev_start_io_acct(bio->bi_bdev, bio_sectors(bio),
bio_op(bio), jiffies);
bio_op(bio),
blk_get_iostat_ticks(bio->bi_bdev->bd_queue));
}
EXPORT_SYMBOL_GPL(bio_start_io_acct);

void bdev_end_io_acct(struct block_device *bdev, enum req_op op,
u64 start_time)
{
u64 now;
u64 duration;
struct request_queue *q = bdev_get_queue(bdev);
const int sgrp = op_stat_group(op);
u64 now = READ_ONCE(jiffies);
u64 duration = (unsigned long)now -(unsigned long) start_time;
now = blk_get_iostat_ticks(q);;
duration = (unsigned long)now -(unsigned long) start_time;

part_stat_lock();
update_io_ticks(bdev, now, true);
part_stat_add(bdev, nsecs[sgrp], jiffies_to_nsecs(duration));
part_stat_add(bdev, nsecs[sgrp],
(blk_queue_precise_io_stat(q) ? duration :
jiffies_to_nsecs(duration)));
part_stat_local_dec(bdev, in_flight[op_is_write(op)]);
part_stat_unlock();
}
Expand Down
4 changes: 2 additions & 2 deletions block/blk-mq.c
Expand Up @@ -980,7 +980,7 @@ static void __blk_account_io_done(struct request *req, u64 now)
const int sgrp = op_stat_group(req_op(req));

part_stat_lock();
update_io_ticks(req->part, jiffies, true);
update_io_ticks(req->part, blk_get_iostat_ticks(req->q), true);
part_stat_inc(req->part, ios[sgrp]);
part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
part_stat_unlock();
Expand Down Expand Up @@ -1012,7 +1012,7 @@ static void __blk_account_io_start(struct request *rq)
rq->part = rq->q->disk->part0;

part_stat_lock();
update_io_ticks(rq->part, jiffies, false);
update_io_ticks(rq->part, blk_get_iostat_ticks(rq->q), false);
part_stat_unlock();
}

Expand Down
39 changes: 38 additions & 1 deletion block/blk-sysfs.c
Expand Up @@ -307,7 +307,6 @@ queue_##name##_store(struct request_queue *q, const char *page, size_t count) \

QUEUE_SYSFS_BIT_FNS(nonrot, NONROT, 1);
QUEUE_SYSFS_BIT_FNS(random, ADD_RANDOM, 0);
QUEUE_SYSFS_BIT_FNS(iostats, IO_STAT, 0);
QUEUE_SYSFS_BIT_FNS(stable_writes, STABLE_WRITES, 0);
#undef QUEUE_SYSFS_BIT_FNS

Expand Down Expand Up @@ -363,6 +362,44 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page,
return ret;
}

static ssize_t queue_iostats_show(struct request_queue *q, char *page)
{
bool iostat = blk_queue_io_stat(q);
bool precise_iostat = blk_queue_precise_io_stat(q);

return queue_var_show(iostat << precise_iostat, page);
}

static ssize_t queue_iostats_store(struct request_queue *q, const char *page,
size_t count)
{
unsigned long val;
ssize_t ret = -EINVAL;

ret = queue_var_store(&val, page, count);

if (ret < 0)
return ret;

if (val == 2) {
blk_queue_flag_set(QUEUE_FLAG_IO_STAT, q);
blk_queue_flag_set(QUEUE_FLAG_PRECISE_IO_STAT, q);
q->disk->part0->bd_stamp = 0;
}
else if (val == 1) {
blk_queue_flag_set(QUEUE_FLAG_IO_STAT, q);
blk_queue_flag_clear(QUEUE_FLAG_PRECISE_IO_STAT, q);
q->disk->part0->bd_stamp = 0;
}
else if (val == 0) {
blk_queue_flag_clear(QUEUE_FLAG_IO_STAT, q);
blk_queue_flag_clear(QUEUE_FLAG_PRECISE_IO_STAT, q);
q->disk->part0->bd_stamp = 0;
}

return ret;
}

static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page)
{
bool set = test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags);
Expand Down
18 changes: 14 additions & 4 deletions block/genhd.c
Expand Up @@ -956,6 +956,7 @@ ssize_t part_stat_show(struct device *dev,
struct request_queue *q = bdev_get_queue(bdev);
struct disk_stats stat;
unsigned int inflight;
u64 stat_ioticks;

if (queue_is_mq(q))
inflight = blk_mq_in_flight(q, bdev);
Expand All @@ -964,10 +965,13 @@ ssize_t part_stat_show(struct device *dev,

if (inflight) {
part_stat_lock();
update_io_ticks(bdev, jiffies, true);
update_io_ticks(bdev, blk_get_iostat_ticks(q), true);
part_stat_unlock();
}
part_stat_read_all(bdev, &stat);
stat_ioticks = (blk_queue_precise_io_stat(q) ?
div_u64(stat.io_ticks, NSEC_PER_MSEC) :
jiffies_to_msecs(stat.io_ticks));
return sprintf(buf,
"%8lu %8lu %8llu %8u "
"%8lu %8lu %8llu %8u "
Expand All @@ -984,7 +988,7 @@ ssize_t part_stat_show(struct device *dev,
(unsigned long long)stat.sectors[STAT_WRITE],
(unsigned int)div_u64(stat.nsecs[STAT_WRITE], NSEC_PER_MSEC),
inflight,
jiffies_to_msecs(stat.io_ticks),
(unsigned int)stat_ioticks,
(unsigned int)div_u64(stat.nsecs[STAT_READ] +
stat.nsecs[STAT_WRITE] +
stat.nsecs[STAT_DISCARD] +
Expand Down Expand Up @@ -1222,6 +1226,8 @@ static int diskstats_show(struct seq_file *seqf, void *v)
unsigned int inflight;
struct disk_stats stat;
unsigned long idx;
struct request_queue *q;
u64 stat_ioticks;

/*
if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next)
Expand All @@ -1240,12 +1246,16 @@ static int diskstats_show(struct seq_file *seqf, void *v)
else
inflight = part_in_flight(hd);

q = bdev_get_queue(hd);
if (inflight) {
part_stat_lock();
update_io_ticks(hd, jiffies, true);
update_io_ticks(hd, blk_get_iostat_ticks(q), true);
part_stat_unlock();
}
part_stat_read_all(hd, &stat);
stat_ioticks = (blk_queue_precise_io_stat(q) ?
div_u64(stat.io_ticks, NSEC_PER_MSEC) :
jiffies_to_msecs(stat.io_ticks));
seq_printf(seqf, "%4d %7d %pg "
"%lu %lu %lu %u "
"%lu %lu %lu %u "
Expand All @@ -1265,7 +1275,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
(unsigned int)div_u64(stat.nsecs[STAT_WRITE],
NSEC_PER_MSEC),
inflight,
jiffies_to_msecs(stat.io_ticks),
(unsigned int)stat_ioticks,
(unsigned int)div_u64(stat.nsecs[STAT_READ] +
stat.nsecs[STAT_WRITE] +
stat.nsecs[STAT_DISCARD] +
Expand Down
12 changes: 10 additions & 2 deletions drivers/block/drbd/drbd_debugfs.c
Expand Up @@ -97,6 +97,12 @@ static void seq_print_one_request(struct seq_file *m, struct drbd_request *req,
{
/* change anything here, fixup header below! */
unsigned int s = req->rq_state;
unsigned long start_time;
struct request_queue *q = req->device->rq_queue;

start_time = (blk_queue_precise_io_stat(q) ?
nsecs_to_jiffies(req->start_jif) :
req->start_jif);

#define RQ_HDR_1 "epoch\tsector\tsize\trw"
seq_printf(m, "0x%x\t%llu\t%u\t%s",
Expand All @@ -105,7 +111,7 @@ static void seq_print_one_request(struct seq_file *m, struct drbd_request *req,
(s & RQ_WRITE) ? "W" : "R");

#define RQ_HDR_2 "\tstart\tin AL\tsubmit"
seq_printf(m, "\t%d", jiffies_to_msecs(now - req->start_jif));
seq_printf(m, "\t%d", jiffies_to_msecs(now - start_time));
seq_print_age_or_dash(m, s & RQ_IN_ACT_LOG, now - req->in_actlog_jif);
seq_print_age_or_dash(m, s & RQ_LOCAL_PENDING, now - req->pre_submit_jif);

Expand Down Expand Up @@ -171,7 +177,9 @@ static void seq_print_waiting_for_AL(struct seq_file *m, struct drbd_resource *r
/* if the oldest request does not wait for the activity log
* it is not interesting for us here */
if (req && !(req->rq_state & RQ_IN_ACT_LOG))
jif = req->start_jif;
jif = (blk_queue_precise_io_stat(device->rq_queue) ?
nsecs_to_jiffies(req->start_jif):
req->start_jif);
else
req = NULL;
spin_unlock_irq(&device->resource->req_lock);
Expand Down
3 changes: 2 additions & 1 deletion drivers/block/zram/zram_drv.c
Expand Up @@ -2089,6 +2089,7 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
struct zram *zram;
struct bio_vec bv;
u64 start_time;
struct request_queue *q = bdev_get_queue(bdev);

if (PageTransHuge(page))
return -ENOTSUPP;
Expand All @@ -2108,7 +2109,7 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
bv.bv_offset = 0;

start_time = bdev_start_io_acct(bdev->bd_disk->part0,
SECTORS_PER_PAGE, op, jiffies);
SECTORS_PER_PAGE, op, blk_get_iostat_ticks(q));
ret = zram_bvec_rw(zram, &bv, index, offset, op, NULL);
bdev_end_io_acct(bdev->bd_disk->part0, op, start_time);
out:
Expand Down
13 changes: 11 additions & 2 deletions drivers/md/dm.c
Expand Up @@ -482,7 +482,11 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,

u64 dm_start_time_ns_from_clone(struct bio *bio)
{
return jiffies_to_nsecs(clone_to_tio(bio)->io->start_time);
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
u64 start_time_ns = (blk_queue_precise_io_stat(q) ?
clone_to_tio(bio)->io->start_time :
jiffies_to_nsecs(clone_to_tio(bio)->io->start_time));
return start_time_ns;
}
EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone);

Expand All @@ -498,6 +502,11 @@ static void dm_io_acct(struct dm_io *io, bool end)
struct mapped_device *md = io->md;
struct bio *bio = io->orig_bio;
unsigned int sectors;
struct request_queue *q = bdev_get_queue(bio->bi_bdev);

u64 start_time = (blk_queue_precise_io_stat(q) ?
nsecs_to_jiffies(io->start_time) :
io->start_time);

/*
* If REQ_PREFLUSH set, don't account payload, it will be
Expand Down Expand Up @@ -589,7 +598,7 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
io->orig_bio = bio;
io->md = md;
spin_lock_init(&io->lock);
io->start_time = jiffies;
io->start_time = blk_get_iostat_ticks(bio->bi_bdev->bd_queue);
io->flags = 0;

if (static_branch_unlikely(&stats_enabled))
Expand Down
4 changes: 4 additions & 0 deletions include/linux/blkdev.h
Expand Up @@ -571,6 +571,7 @@ struct request_queue {
#define QUEUE_FLAG_NOWAIT 29 /* device supports NOWAIT */
#define QUEUE_FLAG_SQ_SCHED 30 /* single queue style io dispatch */
#define QUEUE_FLAG_SKIP_TAGSET_QUIESCE 31 /* quiesce_tagset skip the queue*/
#define QUEUE_FLAG_PRECISE_IO_STAT 32 /* To enable precise io accounting */

#define QUEUE_FLAG_MQ_DEFAULT ((1UL << QUEUE_FLAG_IO_STAT) | \
(1UL << QUEUE_FLAG_SAME_COMP) | \
Expand All @@ -579,6 +580,7 @@ struct request_queue {
void blk_queue_flag_set(unsigned int flag, struct request_queue *q);
void blk_queue_flag_clear(unsigned int flag, struct request_queue *q);
bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
u64 blk_get_iostat_ticks(struct request_queue *q);

#define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
#define blk_queue_dying(q) test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags)
Expand All @@ -590,6 +592,8 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
#define blk_queue_stable_writes(q) \
test_bit(QUEUE_FLAG_STABLE_WRITES, &(q)->queue_flags)
#define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags)
#define blk_queue_precise_io_stat(q) \
test_bit(QUEUE_FLAG_PRECISE_IO_STAT, &(q)->queue_flags)
#define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags)
#define blk_queue_zone_resetall(q) \
test_bit(QUEUE_FLAG_ZONE_RESETALL, &(q)->queue_flags)
Expand Down

0 comments on commit afff703

Please sign in to comment.