Skip to content

Commit

Permalink
mon/MDSMonitor: warn when creating fs with default EC data pool
Browse files Browse the repository at this point in the history
The rationale can be found in [1]. The point is that EC pools incur a
significant performance penalty when dealing with small files and xattr
updates. This is because _every_ inode has a corresponding data pool
object with backtrace information stored in its xattr.

[1] doc/cephfs/createfs.rst

Fixes: https://tracker.ceph.com/issues/42450
Signed-off-by: Patrick Donnelly <pdonnell@redhat.com>
(cherry picked from commit 3e0aee5)
  • Loading branch information
batrick committed Feb 5, 2020
1 parent 95baab0 commit 1ee9f2c
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 25 deletions.
57 changes: 33 additions & 24 deletions src/mon/FSCommands.cc
Expand Up @@ -238,12 +238,12 @@ class FsNewHandler : public FileSystemCommandHandler
pg_pool_t const *metadata_pool = mon->osdmon()->osdmap.get_pg_pool(metadata);
ceph_assert(metadata_pool != NULL); // Checked it existed above

int r = _check_pool(mon->osdmon()->osdmap, data, false, force, &ss);
int r = _check_pool(mon->osdmon()->osdmap, data, POOL_DATA_DEFAULT, force, &ss);
if (r < 0) {
return r;
}

r = _check_pool(mon->osdmon()->osdmap, metadata, true, force, &ss);
r = _check_pool(mon->osdmon()->osdmap, metadata, POOL_METADATA, force, &ss);
if (r < 0) {
return r;
}
Expand Down Expand Up @@ -679,7 +679,7 @@ class AddDataPoolHandler : public FileSystemCommandHandler
}
}

int r = _check_pool(mon->osdmon()->osdmap, poolid, false, false, &ss);
int r = _check_pool(mon->osdmon()->osdmap, poolid, POOL_DATA_EXTRA, false, &ss);
if (r != 0) {
return r;
}
Expand Down Expand Up @@ -986,7 +986,7 @@ FileSystemCommandHandler::load(Paxos *paxos)
int FileSystemCommandHandler::_check_pool(
OSDMap &osd_map,
const int64_t pool_id,
bool metadata,
int type,
bool force,
std::stringstream *ss) const
{
Expand All @@ -1000,32 +1000,41 @@ int FileSystemCommandHandler::_check_pool(

const string& pool_name = osd_map.get_pool_name(pool_id);

if (pool->is_erasure() && metadata) {
if (pool->is_erasure()) {
if (type == POOL_METADATA) {
*ss << "pool '" << pool_name << "' (id '" << pool_id << "')"
<< " is an erasure-coded pool. Use of erasure-coded pools"
<< " for CephFS metadata is not permitted";
return -EINVAL;
} else if (pool->is_erasure() && !pool->allows_ecoverwrites()) {
// non-overwriteable EC pools are only acceptable with a cache tier overlay
if (!pool->has_tiers() || !pool->has_read_tier() || !pool->has_write_tier()) {
return -EINVAL;
} else if (type == POOL_DATA_DEFAULT && !force) {
*ss << "pool '" << pool_name << "' (id '" << pool_id << "')"
<< " is an erasure-coded pool, with no overwrite support";
" is an erasure-coded pool."
" Use of an EC pool for the default data pool is discouraged;"
" see the online CephFS documentation for more information."
" Use --force to override.";
return -EINVAL;
}
} else if (!pool->allows_ecoverwrites()) {
// non-overwriteable EC pools are only acceptable with a cache tier overlay
if (!pool->has_tiers() || !pool->has_read_tier() || !pool->has_write_tier()) {
*ss << "pool '" << pool_name << "' (id '" << pool_id << "')"
<< " is an erasure-coded pool, with no overwrite support";
return -EINVAL;
}

// That cache tier overlay must be writeback, not readonly (it's the
// write operations like modify+truncate we care about support for)
const pg_pool_t *write_tier = osd_map.get_pg_pool(
pool->write_tier);
ceph_assert(write_tier != NULL); // OSDMonitor shouldn't allow DNE tier
if (write_tier->cache_mode == pg_pool_t::CACHEMODE_FORWARD
|| write_tier->cache_mode == pg_pool_t::CACHEMODE_READONLY) {
*ss << "EC pool '" << pool_name << "' has a write tier ("
<< osd_map.get_pool_name(pool->write_tier)
<< ") that is configured "
"to forward writes. Use a cache mode such as 'writeback' for "
"CephFS";
return -EINVAL;
// That cache tier overlay must be writeback, not readonly (it's the
// write operations like modify+truncate we care about support for)
const pg_pool_t *write_tier = osd_map.get_pg_pool(
pool->write_tier);
ceph_assert(write_tier != NULL); // OSDMonitor shouldn't allow DNE tier
if (write_tier->cache_mode == pg_pool_t::CACHEMODE_FORWARD
|| write_tier->cache_mode == pg_pool_t::CACHEMODE_READONLY) {
*ss << "EC pool '" << pool_name << "' has a write tier ("
<< osd_map.get_pool_name(pool->write_tier)
<< ") that is configured "
"to forward writes. Use a cache mode such as 'writeback' for "
"CephFS";
return -EINVAL;
}
}
}

Expand Down
7 changes: 6 additions & 1 deletion src/mon/FSCommands.h
Expand Up @@ -30,6 +30,11 @@ class FileSystemCommandHandler : protected CommandHandler
protected:
std::string prefix;

// Role a pool is being vetted for; passed as the `type` argument of
// _check_pool().
enum {
  // Pool is checked as the metadata pool; _check_pool() refuses
  // erasure-coded pools for this role.
  POOL_METADATA = 0,
  // Pool is checked as the data pool supplied when a new file system is
  // created; an erasure-coded pool is refused here unless `force` is set.
  POOL_DATA_DEFAULT = 1,
  // Pool is checked as a data pool being added by AddDataPoolHandler.
  POOL_DATA_EXTRA = 2,
};
/**
* Return 0 if the pool is suitable for use with CephFS, or
* in case of errors return a negative error code, and populate
Expand All @@ -40,7 +45,7 @@ class FileSystemCommandHandler : protected CommandHandler
int _check_pool(
OSDMap &osd_map,
const int64_t pool_id,
bool metadata,
int type,
bool force,
std::stringstream *ss) const;

Expand Down

0 comments on commit 1ee9f2c

Please sign in to comment.