ceph: improve upgrade procedure
When a cluster is updated with a different image version, this triggers
a serialized restart of all the pods. Prior to this commit, no safety
checks were performed and Rook simply hoped for the best outcome.

Now, before restarting a daemon, we check that it can be restarted. Once
it is restarted, we also check that we can proceed with the rest of the
platform. For instance, for monitors we check that they are in quorum,
for OSDs we check that PGs are clean, and for MDSes we make sure they
are all active.

Fixes: rook#2889
Signed-off-by: Sébastien Han <seb@redhat.com>
leseb committed Jun 21, 2019
1 parent 818ad9c commit b28c4af
Showing 13 changed files with 520 additions and 45 deletions.
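
The commit message above describes the new per-daemon flow: verify that the daemon can be stopped, restart it, then verify that the cluster is healthy before moving on to the next one. The minimal Go sketch below illustrates that flow only; the daemon type and the okToStop/restart/okToContinue helpers are invented for this sketch and are not part of this diff.

package main

import "fmt"

type daemon struct {
	kind string // "mon", "osd" or "mds"
	name string
}

// okToStop would ask the cluster whether the daemon can be taken down
// (e.g. the mons stay in quorum without it); stubbed out here.
func okToStop(d daemon) error { return nil }

// restart would delete the pod so it comes back with the new image; stubbed.
func restart(d daemon) error { return nil }

// okToContinue would wait for the post-restart condition: mons back in
// quorum, PGs clean for OSDs, MDS active or standby-replay; stubbed.
func okToContinue(d daemon) error { return nil }

func upgrade(daemons []daemon) error {
	for _, d := range daemons {
		if err := okToStop(d); err != nil {
			return fmt.Errorf("refusing to restart %s %s: %v", d.kind, d.name, err)
		}
		if err := restart(d); err != nil {
			return err
		}
		if err := okToContinue(d); err != nil {
			return fmt.Errorf("cluster not healthy after restarting %s %s: %v", d.kind, d.name, err)
		}
	}
	return nil
}

func main() {
	if err := upgrade([]daemon{{"mon", "a"}, {"osd", "0"}, {"mds", "myfs-a"}}); err != nil {
		fmt.Println("upgrade aborted:", err)
		return
	}
	fmt.Println("all daemons restarted, with health checks in between")
}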
65 changes: 65 additions & 0 deletions pkg/daemon/ceph/client/status.go
@@ -52,6 +52,7 @@ type CephStatus struct {
} `json:"osdmap"`
PgMap PgMap `json:"pgmap"`
MgrMap MgrMap `json:"mgrmap"`
Fsmap Fsmap `json:"fsmap"`
}

type HealthStatus struct {
@@ -123,6 +124,23 @@ type PgStateEntry struct {
Count int `json:"count"`
}

// Fsmap is a struct representing the filesystem map
type Fsmap struct {
Epoch int `json:"epoch"`
ID int `json:"id"`
Up int `json:"up"`
In int `json:"in"`
Max int `json:"max"`
ByRank []struct {
FilesystemID int `json:"filesystem_id"`
Rank int `json:"rank"`
Name string `json:"name"`
Status string `json:"status"`
Gid int `json:"gid"`
} `json:"by_rank"`
UpStandby int `json:"up:standby"`
}
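
The Fsmap struct mirrors the fsmap section of the JSON output of `ceph status`. As a quick illustration (not part of this commit), the snippet below unmarshals a made-up fsmap fragment into a local copy of the struct; all field values are illustrative only.

package main

import (
	"encoding/json"
	"fmt"
)

// Local copy of the Fsmap struct from the diff above, so this snippet is
// self-contained.
type Fsmap struct {
	Epoch  int `json:"epoch"`
	ID     int `json:"id"`
	Up     int `json:"up"`
	In     int `json:"in"`
	Max    int `json:"max"`
	ByRank []struct {
		FilesystemID int    `json:"filesystem_id"`
		Rank         int    `json:"rank"`
		Name         string `json:"name"`
		Status       string `json:"status"`
		Gid          int    `json:"gid"`
	} `json:"by_rank"`
	UpStandby int `json:"up:standby"`
}

func main() {
	// Made-up fsmap fragment; real output comes from the JSON form of `ceph status`.
	raw := `{"epoch":12,"id":1,"up":1,"in":1,"max":1,"up:standby":1,
	         "by_rank":[{"filesystem_id":1,"rank":0,"name":"myfs-a","status":"up:active","gid":14716}]}`

	var fs Fsmap
	if err := json.Unmarshal([]byte(raw), &fs); err != nil {
		panic(err)
	}
	fmt.Printf("rank 0 is %s (%s), standby daemons: %d\n",
		fs.ByRank[0].Name, fs.ByRank[0].Status, fs.UpStandby)
}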

func Status(context *clusterd.Context, clusterName string, debug bool) (CephStatus, error) {
args := []string{"status"}
cmd := NewCephCommand(context, clusterName, args)
@@ -171,3 +189,50 @@ func isClusterClean(status CephStatus) error {

return fmt.Errorf("cluster is not fully clean. PGs: %+v", status.PgMap.PgsByState)
}

// getMDSRank returns the rank of a given MDS
func getMDSRank(status CephStatus, clusterName, mdsName string) (int, error) {
// dummy rank
mdsRank := -1000
for r := range status.Fsmap.ByRank {
if status.Fsmap.ByRank[r].Name == mdsName {
mdsRank = r
}
}
// if the mds is not shown in the map, one reason might be that it is in standby
// if it is not in standby either, then something else is going wrong
if mdsRank < 0 && status.Fsmap.UpStandby < 1 {
// it might seem strange to log an error here since this could also be a warning
// it is a warning until we reach the timeout; this should give the mds enough time to transition its state
// after the timeout we consider that the mds might be gone, or that the timeout was not long enough...
return mdsRank, fmt.Errorf("mds %s not found in fsmap, this likely means mdss are transitioning between active and standby states", mdsName)
}

return mdsRank, nil
}

// MdsActiveOrStandbyReplay returns whether a given MDS is active or in standby
func MdsActiveOrStandbyReplay(context *clusterd.Context, clusterName, mdsName string) error {
status, err := Status(context, clusterName, false)
if err != nil {
return err
}

mdsRank, err := getMDSRank(status, clusterName, mdsName)
if err != nil {
return fmt.Errorf("%+v", err)
}

// this MDS is in standby so let's return immediately
if mdsRank < 0 {
logger.Infof("mds %s is in standby, nothing to check", mdsName)
return nil
}

if status.Fsmap.ByRank[mdsRank].Status == "up:active" || status.Fsmap.ByRank[mdsRank].Status == "up:standby-replay" || status.Fsmap.ByRank[mdsRank].Status == "up:standby" {
logger.Infof("mds %s is %s", mdsName, status.Fsmap.ByRank[mdsRank].Status)
return nil
}

return fmt.Errorf("mds %s is %s, bad state", mdsName, status.Fsmap.ByRank[mdsRank].Status)
}
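
Callers are expected to poll MdsActiveOrStandbyReplay until it succeeds or a timeout expires, as hinted at in the getMDSRank comment above. The sketch below shows one way such a retry loop could look; retryUntilHealthy and the toy check function are hypothetical and not part of this diff.

package main

import (
	"fmt"
	"time"
)

// retryUntilHealthy polls a health check until it succeeds or the timeout
// expires. In Rook, a function such as MdsActiveOrStandbyReplay would play
// the role of `check`; here it is just a parameter so the sketch stays
// self-contained.
func retryUntilHealthy(check func() error, interval, timeout time.Duration) error {
	deadline := time.Now().Add(timeout)
	for {
		err := check()
		if err == nil {
			return nil
		}
		if time.Now().After(deadline) {
			return fmt.Errorf("timed out waiting for daemon to become healthy: %v", err)
		}
		// Until the deadline, a failed check is only a warning: the daemon
		// may simply be transitioning between states (e.g. standby -> active).
		fmt.Printf("not healthy yet, retrying in %s: %v\n", interval, err)
		time.Sleep(interval)
	}
}

func main() {
	attempts := 0
	// Toy check that succeeds on the third attempt.
	check := func() error {
		attempts++
		if attempts < 3 {
			return fmt.Errorf("mds myfs-a is still transitioning")
		}
		return nil
	}
	if err := retryUntilHealthy(check, 100*time.Millisecond, 5*time.Second); err != nil {
		fmt.Println("upgrade check failed:", err)
		return
	}
	fmt.Println("mds healthy, continuing with the next daemon")
}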
14 changes: 14 additions & 0 deletions pkg/daemon/ceph/client/status_test.go

Large diffs are not rendered by default.
