Skip to content

Commit

Permalink
bcache: dynamic incremental gc
Browse files Browse the repository at this point in the history
During GC, IO performance can be reduced by half or more.
According to our test data, when an NVMe device is used as the
cache, GC takes about 1ms to handle each node (4k block and
512k bucket).

So GC processes at least 100 nodes each time, which halves
IOPS and increases latency.

This patch adds some cost statistics and tracks the inflight peak.
When the IO depth reaches its maximum, GC sleeps so these requests
can be handled. The GC sleep time is calculated dynamically from gc_cost.

Signed-off-by: Mingzhe Zou <mingzhe.zou@easystack.cn>
  • Loading branch information
zoumingzhe authored and intel-lab-lkp committed Apr 21, 2022
1 parent b253435 commit 9934df9
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 13 deletions.
8 changes: 8 additions & 0 deletions drivers/md/bcache/bcache.h
Expand Up @@ -471,6 +471,14 @@ struct cache {
};

struct gc_stat {
uint64_t gc_cost;
uint64_t sleep_cost;
uint64_t average_cost;
uint64_t start_time;
uint64_t finish_time;
size_t max_inflight;

size_t times;
size_t nodes;
size_t nodes_pre;
size_t key_bytes;
Expand Down
83 changes: 70 additions & 13 deletions drivers/md/bcache/btree.c
Expand Up @@ -88,11 +88,12 @@
* Test module load/unload
*/

#define MAX_NEED_GC 64
#define MAX_SAVE_PRIO 72
#define MAX_GC_TIMES 100
#define MIN_GC_NODES 100
#define GC_SLEEP_MS 100
#define MAX_GC_NODES 1000
#define MAX_GC_PERCENT 10
#define MIN_GC_SLEEP_MS 10
#define MAX_GC_SLEEP_MS 1000

#define PTR_DIRTY_BIT (((uint64_t) 1 << 36))

Expand Down Expand Up @@ -1542,12 +1543,56 @@ static unsigned int btree_gc_count_keys(struct btree *b)
return ret;
}

static size_t btree_gc_min_nodes(struct cache_set *c)
static uint64_t btree_gc_sleep_ms(struct cache_set *c, struct gc_stat *gc)
{
	uint64_t now = local_clock();
	uint64_t elapsed, target_sleep;
	uint64_t ms = 0;

	/*
	 * GC may process very few nodes per pass when IO requests are very
	 * frequent.  A constant sleep (e.g. 100ms) per pass would then make
	 * the whole GC run last a very long time, while a long single pass
	 * hurts IO performance (e.g. 100ms of GC plus 100ms of sleep cuts
	 * IOPS in half).  So derive the sleep time dynamically from the
	 * accumulated gc_cost instead, aiming to keep GC at no more than
	 * MAX_GC_PERCENT of wall-clock time.
	 */
	elapsed = 0;
	if (time_after64(now, gc->start_time))
		elapsed = now - gc->start_time;
	gc->finish_time = elapsed;

	/* Pure GC work is the elapsed wall time minus time already slept. */
	gc->gc_cost = 0;
	if (gc->finish_time > gc->sleep_cost)
		gc->gc_cost = gc->finish_time - gc->sleep_cost;

	/*
	 * Total sleep needed so that gc_cost is MAX_GC_PERCENT of the whole;
	 * subtract what has already been slept, converting ns -> ms.
	 */
	target_sleep = div_u64(gc->gc_cost * (100 - MAX_GC_PERCENT),
			       MAX_GC_PERCENT);
	if (target_sleep > gc->sleep_cost)
		ms = div_u64(target_sleep - gc->sleep_cost, NSEC_PER_MSEC);

	/* Clamp to [MIN_GC_SLEEP_MS, MAX_GC_SLEEP_MS]. */
	if (ms < MIN_GC_SLEEP_MS)
		return MIN_GC_SLEEP_MS;
	if (ms > MAX_GC_SLEEP_MS)
		return MAX_GC_SLEEP_MS;

	return ms;
}

static size_t btree_gc_min_nodes(struct cache_set *c, struct gc_stat *gc)
{
	size_t min_nodes;
	size_t inflight;

	/*
	 * If there are no requests, GC is not stopped: returning more nodes
	 * than remain lets the pass run to completion.  Otherwise we want to
	 * serve increasing IO promptly while tracking the inflight peak;
	 * once the IO depth reaches that running maximum, GC yields after a
	 * single node so the requests can be handled.
	 *
	 * Note: atomic_read() returns int, and inflight is size_t, so a
	 * (should-be-impossible) negative counter would appear as a huge
	 * positive value here; compare against 0 explicitly.
	 */
	inflight = atomic_read(&c->search_inflight);
	if (inflight == 0)
		return max(c->gc_stats.nodes, gc->nodes) + 1;
	if (inflight > gc->max_inflight)
		gc->max_inflight = inflight;
	if (inflight >= gc->max_inflight)
		return 1;

	/*
	 * Since incremental GC sleeps dynamically when front-side I/O
	 * comes in, processing only a constant number (100) of nodes per
	 * pass would make GC last a very long time when there are many
	 * btree nodes, and the front-side I/Os would keep being disturbed.
	 * Instead, aim to finish GC in a constant number (MAX_GC_TIMES)
	 * of passes: with many nodes each pass processes more, with few
	 * nodes each pass processes fewer — but never less than
	 * MIN_GC_NODES and never more than MAX_GC_NODES.
	 */
	min_nodes = c->gc_stats.nodes / MAX_GC_TIMES;
	if (min_nodes < MIN_GC_NODES)
		min_nodes = MIN_GC_NODES;
	if (min_nodes > MAX_GC_NODES)
		min_nodes = MAX_GC_NODES;

	return min_nodes;
}
Expand Down Expand Up @@ -1633,8 +1681,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
r->b = NULL;

if (atomic_read(&b->c->search_inflight) &&
gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c)) {
if (gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c, gc)) {
gc->nodes_pre = gc->nodes;
ret = -EAGAIN;
break;
Expand Down Expand Up @@ -1789,7 +1836,7 @@ static void bch_btree_gc(struct cache_set *c)
struct gc_stat stats;
struct closure writes;
struct btree_op op;
uint64_t start_time = local_clock();
uint64_t sleep_time;

trace_bcache_gc_start(c);

Expand All @@ -1798,24 +1845,34 @@ static void bch_btree_gc(struct cache_set *c)
bch_btree_op_init(&op, SHRT_MAX);

btree_gc_start(c);
stats.start_time = local_clock();

/* if CACHE_SET_IO_DISABLE set, gc thread should stop too */
do {
stats.times++;
ret = bcache_btree_root(gc_root, c, &op, &writes, &stats);
closure_sync(&writes);
cond_resched();

if (ret == -EAGAIN)
sleep_time = btree_gc_sleep_ms(c, &stats);
if (ret == -EAGAIN) {
stats.sleep_cost += sleep_time * NSEC_PER_MSEC;
schedule_timeout_interruptible(msecs_to_jiffies
(GC_SLEEP_MS));
else if (ret)
(sleep_time));
} else if (ret)
pr_warn("gc failed!\n");
} while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags));

bch_btree_gc_finish(c);
wake_up_allocators(c);

bch_time_stats_update(&c->btree_gc_time, start_time);
bch_time_stats_update(&c->btree_gc_time, stats.start_time);
stats.average_cost = stats.gc_cost / stats.nodes;
pr_info("gc %llu times with %llu nodes, sleep %llums, "
"average gc cost %lluus per node",
(uint64_t)stats.times, (uint64_t)stats.nodes,
div_u64(stats.sleep_cost, NSEC_PER_MSEC),
div_u64(stats.average_cost, NSEC_PER_USEC));

stats.key_bytes *= sizeof(uint64_t);
stats.data <<= 9;
Expand Down

0 comments on commit 9934df9

Please sign in to comment.