iomap: Add per-block dirty state tracking to improve performance
When filesystem blocksize is less than folio size (either with
mapping_large_folio_support() or with blocksize < pagesize) and the
folio is uptodate in pagecache, even a single-byte write can cause the
entire folio to be written to disk during writeback. This happens
because we currently have no mechanism to track per-block dirty state
within struct iomap_page; we only track uptodate state. With a 64K
pagesize and 4k blocksize, for example, a one-byte write forces all 16
blocks of the folio out to disk.

This patch implements support for tracking per-block dirty state in
the iomap_page->state bitmap. This should improve filesystem write
performance and reduce write amplification.
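
As an illustration for reviewers, here is a small userspace sketch of
the split bitmap layout: bits [0, nr_blocks) hold per-block uptodate
state and bits [nr_blocks, 2 * nr_blocks) hold per-block dirty state,
as in the patch. The helpers below are simplified stand-ins for the
kernel's bitmap_set()/test_bit(), and names like block_set_dirty() are
illustrative only, not the patch's API:

#include <stdio.h>
#include <stdbool.h>

#define NR_BLOCKS	16	/* e.g. one 64K folio of 4k blocks */
#define BITS_PER_LONG	(8 * sizeof(unsigned long))

/*
 * One bitmap, two regions: bits [0, NR_BLOCKS) track per-block
 * uptodate state, bits [NR_BLOCKS, 2 * NR_BLOCKS) track per-block
 * dirty state, mirroring the enlarged iop->state allocation.
 */
static unsigned long state[(2 * NR_BLOCKS + BITS_PER_LONG - 1) /
			   BITS_PER_LONG];

static void set_bit_(unsigned int nr)
{
	state[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}

static bool test_bit_(unsigned int nr)
{
	return state[nr / BITS_PER_LONG] & (1UL << (nr % BITS_PER_LONG));
}

static void block_set_uptodate(unsigned int blk) { set_bit_(blk); }
static void block_set_dirty(unsigned int blk) { set_bit_(blk + NR_BLOCKS); }
static bool block_test_dirty(unsigned int blk) { return test_bit_(blk + NR_BLOCKS); }

int main(void)
{
	unsigned int i;

	/* The whole folio is uptodate in pagecache... */
	for (i = 0; i < NR_BLOCKS; i++)
		block_set_uptodate(i);
	/* ...but a small write lands only in block 3. */
	block_set_dirty(3);

	/* Writeback (cf. iomap_writepage_map()) can now skip clean blocks. */
	for (i = 0; i < NR_BLOCKS; i++)
		if (block_test_dirty(i))
			printf("writeback block %u\n", i);
	return 0;
}

Without the second region, writeback must treat every uptodate block
as dirty and write all 16 blocks; with it, only block 3 goes to disk.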

Performance testing of the below fio workload shows a ~16x
improvement using NVMe with XFS (4k blocksize) on Power (64K
pagesize): fio-reported write bandwidth improved from ~28 MBps to
~452 MBps.

1. <test_randwrite.fio>
[global]
	ioengine=psync
	rw=randwrite
	overwrite=1
	pre_read=1
	direct=0
	bs=4k
	size=1G
	dir=./
	numjobs=8
	fdatasync=1
	runtime=60
	iodepth=64
	group_reporting=1

[fio-run]

2. Our internal performance team also reported that this patch
   improves their database workload performance by around ~83%
   (with XFS on Power).

Reported-by: Aravinda Herle <araherle@in.ibm.com>
Reported-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
riteshharjani authored and intel-lab-lkp committed May 7, 2023
1 parent a67b7a5 commit a05d7e5
Showing 5 changed files with 112 additions and 10 deletions.
2 changes: 1 addition & 1 deletion fs/gfs2/aops.c
@@ -746,7 +746,7 @@ static const struct address_space_operations gfs2_aops = {
 	.writepages = gfs2_writepages,
 	.read_folio = gfs2_read_folio,
 	.readahead = gfs2_readahead,
-	.dirty_folio = filemap_dirty_folio,
+	.dirty_folio = iomap_dirty_folio,
 	.release_folio = iomap_release_folio,
 	.invalidate_folio = iomap_invalidate_folio,
 	.bmap = gfs2_bmap,
115 changes: 108 additions & 7 deletions fs/iomap/buffered-io.c
@@ -52,6 +52,12 @@ static inline void iop_set_range(struct iomap_page *iop, unsigned int start_blk,
 	bitmap_set(iop->state, start_blk, nr_blks);
 }
 
+static inline void iop_clear_range(struct iomap_page *iop,
+		unsigned int start_blk, unsigned int nr_blks)
+{
+	bitmap_clear(iop->state, start_blk, nr_blks);
+}
+
 static inline bool iop_test_block(struct iomap_page *iop, unsigned int block)
 {
 	return test_bit(block, iop->state);
@@ -84,6 +90,16 @@ static bool iop_test_block_uptodate(struct folio *folio, unsigned int block)
 	return iop_test_block(iop, block);
 }
 
+static bool iop_test_block_dirty(struct folio *folio, int block)
+{
+	struct iomap_page *iop = to_iomap_page(folio);
+	struct inode *inode = folio->mapping->host;
+	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
+
+	WARN_ON(!iop);
+	return iop_test_block(iop, block + blks_per_folio);
+}
+
 static void iop_set_range_uptodate(struct inode *inode, struct folio *folio,
 		size_t off, size_t len)
 {
@@ -104,8 +120,42 @@ static void iop_set_range_uptodate(struct inode *inode, struct folio *folio,
 	}
 }
 
+static void iop_set_range_dirty(struct inode *inode, struct folio *folio,
+		size_t off, size_t len)
+{
+	struct iomap_page *iop = to_iomap_page(folio);
+	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
+	unsigned int first_blk = (off >> inode->i_blkbits);
+	unsigned int last_blk = ((off + len - 1) >> inode->i_blkbits);
+	unsigned int nr_blks = last_blk - first_blk + 1;
+	unsigned long flags;
+
+	if (!iop)
+		return;
+	spin_lock_irqsave(&iop->state_lock, flags);
+	iop_set_range(iop, first_blk + blks_per_folio, nr_blks);
+	spin_unlock_irqrestore(&iop->state_lock, flags);
+}
+
+static void iop_clear_range_dirty(struct folio *folio, size_t off, size_t len)
+{
+	struct iomap_page *iop = to_iomap_page(folio);
+	struct inode *inode = folio->mapping->host;
+	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
+	unsigned int first_blk = (off >> inode->i_blkbits);
+	unsigned int last_blk = ((off + len - 1) >> inode->i_blkbits);
+	unsigned int nr_blks = last_blk - first_blk + 1;
+	unsigned long flags;
+
+	if (!iop)
+		return;
+	spin_lock_irqsave(&iop->state_lock, flags);
+	iop_clear_range(iop, first_blk + blks_per_folio, nr_blks);
+	spin_unlock_irqrestore(&iop->state_lock, flags);
+}
+
 static struct iomap_page *iop_alloc(struct inode *inode, struct folio *folio,
-		unsigned int flags)
+		unsigned int flags, bool is_dirty)
 {
 	struct iomap_page *iop = to_iomap_page(folio);
 	unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
@@ -119,12 +169,20 @@ static struct iomap_page *iop_alloc(struct inode *inode, struct folio *folio,
 	else
 		gfp = GFP_NOFS | __GFP_NOFAIL;
 
-	iop = kzalloc(struct_size(iop, state, BITS_TO_LONGS(nr_blocks)),
+	/*
+	 * iop->state tracks two sets of state flags when the
+	 * filesystem block size is smaller than the folio size.
+	 * The first state tracks per-block uptodate and the
+	 * second tracks per-block dirty state.
+	 */
+	iop = kzalloc(struct_size(iop, state, BITS_TO_LONGS(2 * nr_blocks)),
 			gfp);
 	if (iop) {
 		spin_lock_init(&iop->state_lock);
 		if (folio_test_uptodate(folio))
 			iop_set_range(iop, 0, nr_blocks);
+		if (is_dirty)
+			iop_set_range(iop, nr_blocks, nr_blocks);
 		folio_attach_private(folio, iop);
 	}
 	return iop;
@@ -268,7 +326,8 @@ static int iomap_read_inline_data(const struct iomap_iter *iter,
 	if (WARN_ON_ONCE(size > iomap->length))
 		return -EIO;
 	if (offset > 0)
-		iop = iop_alloc(iter->inode, folio, iter->flags);
+		iop = iop_alloc(iter->inode, folio, iter->flags,
+				folio_test_dirty(folio));
 	else
 		iop = to_iomap_page(folio);
 
@@ -306,7 +365,8 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
 		return iomap_read_inline_data(iter, folio);
 
 	/* zero post-eof blocks as the page may be mapped */
-	iop = iop_alloc(iter->inode, folio, iter->flags);
+	iop = iop_alloc(iter->inode, folio, iter->flags,
+			folio_test_dirty(folio));
 	iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen);
 	if (plen == 0)
 		goto done;
@@ -561,6 +621,18 @@ void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
 }
 EXPORT_SYMBOL_GPL(iomap_invalidate_folio);
 
+bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio)
+{
+	struct iomap_page *iop;
+	struct inode *inode = mapping->host;
+	size_t len = i_blocks_per_folio(inode, folio) << inode->i_blkbits;
+
+	iop = iop_alloc(inode, folio, 0, false);
+	iop_set_range_dirty(inode, folio, 0, len);
+	return filemap_dirty_folio(mapping, folio);
+}
+EXPORT_SYMBOL_GPL(iomap_dirty_folio);
+
 static void
 iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
 {
@@ -608,7 +680,8 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
 	    pos + len >= folio_pos(folio) + folio_size(folio))
 		return 0;
 
-	iop = iop_alloc(iter->inode, folio, iter->flags);
+	iop = iop_alloc(iter->inode, folio, iter->flags,
+			folio_test_dirty(folio));
 
 	if ((iter->flags & IOMAP_NOWAIT) && !iop && nr_blocks > 1)
 		return -EAGAIN;
@@ -767,6 +840,7 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
 	if (unlikely(copied < len && !folio_test_uptodate(folio)))
 		return 0;
 	iop_set_range_uptodate(inode, folio, offset_in_folio(folio, pos), len);
+	iop_set_range_dirty(inode, folio, offset_in_folio(folio, pos), copied);
 	filemap_dirty_folio(inode->i_mapping, folio);
 	return copied;
 }
@@ -954,6 +1028,10 @@ static int iomap_write_delalloc_scan(struct inode *inode,
 {
 	while (start_byte < end_byte) {
 		struct folio *folio;
+		struct iomap_page *iop;
+		unsigned int first_blk, last_blk, blks_per_folio, i;
+		loff_t last_byte;
+		u8 blkbits = inode->i_blkbits;
 
 		/* grab locked page */
 		folio = filemap_lock_folio(inode->i_mapping,
@@ -978,6 +1056,28 @@ static int iomap_write_delalloc_scan(struct inode *inode,
 			}
 		}
 
+		/*
+		 * When we have per-block dirty tracking, there can be
+		 * blocks within a folio which are marked uptodate
+		 * but not dirty. In that case it is necessary to punch
+		 * out such blocks to avoid leaking any delalloc blocks.
+		 */
+		iop = to_iomap_page(folio);
+		if (!iop)
+			goto skip_iop_punch;
+		last_byte = min_t(loff_t, end_byte - 1,
+			(folio_next_index(folio) << PAGE_SHIFT) - 1);
+		first_blk = offset_in_folio(folio, start_byte) >>
+			    blkbits;
+		last_blk = offset_in_folio(folio, last_byte) >>
+			   blkbits;
+		blks_per_folio = i_blocks_per_folio(inode, folio);
+		for (i = first_blk; i <= last_blk; i++) {
+			if (!iop_test_block_dirty(folio, i))
+				punch(inode, i << blkbits,
+				      1 << blkbits);
+		}
+skip_iop_punch:
 		/*
 		 * Make sure the next punch start is correctly bound to
 		 * the end of this data range, not the end of the folio.
@@ -1666,7 +1766,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 		struct writeback_control *wbc, struct inode *inode,
 		struct folio *folio, u64 end_pos)
 {
-	struct iomap_page *iop = iop_alloc(inode, folio, 0);
+	struct iomap_page *iop = iop_alloc(inode, folio, 0, true);
 	struct iomap_ioend *ioend, *next;
 	unsigned len = i_blocksize(inode);
 	unsigned nblocks = i_blocks_per_folio(inode, folio);
@@ -1682,7 +1782,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 	 * invalid, grab a new one.
 	 */
 	for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
-		if (iop && !iop_test_block_uptodate(folio, i))
+		if (iop && !iop_test_block_dirty(folio, i))
 			continue;
 
 		error = wpc->ops->map_blocks(wpc, inode, pos);
@@ -1726,6 +1826,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 		}
 	}
 
+	iop_clear_range_dirty(folio, 0, end_pos - folio_pos(folio));
 	folio_start_writeback(folio);
 	folio_unlock(folio);
 
2 changes: 1 addition & 1 deletion fs/xfs/xfs_aops.c
@@ -578,7 +578,7 @@ const struct address_space_operations xfs_address_space_operations = {
 	.read_folio = xfs_vm_read_folio,
 	.readahead = xfs_vm_readahead,
 	.writepages = xfs_vm_writepages,
-	.dirty_folio = filemap_dirty_folio,
+	.dirty_folio = iomap_dirty_folio,
 	.release_folio = iomap_release_folio,
 	.invalidate_folio = iomap_invalidate_folio,
 	.bmap = xfs_vm_bmap,
2 changes: 1 addition & 1 deletion fs/zonefs/file.c
@@ -175,7 +175,7 @@ const struct address_space_operations zonefs_file_aops = {
 	.read_folio = zonefs_read_folio,
 	.readahead = zonefs_readahead,
 	.writepages = zonefs_writepages,
-	.dirty_folio = filemap_dirty_folio,
+	.dirty_folio = iomap_dirty_folio,
 	.release_folio = iomap_release_folio,
 	.invalidate_folio = iomap_invalidate_folio,
 	.migrate_folio = filemap_migrate_folio,
1 change: 1 addition & 0 deletions include/linux/iomap.h
@@ -264,6 +264,7 @@ bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
 struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos);
 bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags);
 void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len);
+bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio);
 int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
 		const struct iomap_ops *ops);
 int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
