Skip to content

Commit

Permalink
Robustness engine actions with stats and log (#685)
Browse files Browse the repository at this point in the history
* Robustness engine actions with stats and logging

- Add actions to robustness engine
- Actions wrap other functional behavior and serve as a common interface for collecting stats
- Add stats for the engine, both per run and cumulative over time
- Add a log for actions that the engine has executed
- Add recovery logic to re-sync snapshot metadata after a possibly failed engine run (e.g. if metadata wasn't properly persisted).

Current built-in actions:
- snapshot root directory
- restore random snapshot ID into a target restore path
- delete a random snapshot ID
- run GC
- write random files to the local data directory
- delete a random subdirectory under the local data directory
- delete files in a directory
- restore a snapshot ID into the local data directory

Actions are executed according to a set of options, which dictate the relative probabilities of picking a given action, along with ranges for action-specific parameters that can be randomized.
  • Loading branch information
redgoat650 committed Nov 17, 2020
1 parent ade7975 commit 71dcbcf
Show file tree
Hide file tree
Showing 19 changed files with 1,896 additions and 187 deletions.
11 changes: 9 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -205,9 +205,16 @@ endurance-tests: export KOPIA_EXE ?= $(KOPIA_INTEGRATION_EXE)
endurance-tests: build-integration-test-binary $(gotestsum)
$(GO_TEST) $(TEST_FLAGS) -count=1 -parallel $(PARALLEL) -timeout 3600s github.com/kopia/kopia/tests/endurance_test

robustness-tool-tests: $(gotestsum)
robustness-tests: export KOPIA_EXE ?= $(KOPIA_INTEGRATION_EXE)
robustness-tests: build-integration-test-binary $(gotestsum)
FIO_DOCKER_IMAGE=$(FIO_DOCKER_TAG) \
$(GO_TEST) $(TEST_FLAGS) -count=1 -timeout 90s github.com/kopia/kopia/tests/tools/...
$(GO_TEST) -count=1 github.com/kopia/kopia/tests/robustness/robustness_test $(TEST_FLAGS)

robustness-tool-tests: export KOPIA_EXE ?= $(KOPIA_INTEGRATION_EXE)
robustness-tool-tests: build-integration-test-binary $(gotestsum)
KOPIA_EXE=$(KOPIA_INTEGRATION_EXE) \
FIO_DOCKER_IMAGE=$(FIO_DOCKER_TAG) \
$(GO_TEST) -count=1 github.com/kopia/kopia/tests/tools/... github.com/kopia/kopia/tests/robustness/engine/... $(TEST_FLAGS)

stress-test: $(gotestsum)
KOPIA_LONG_STRESS_TEST=1 $(GO_TEST) -count=1 -timeout 200s github.com/kopia/kopia/tests/stress_test
Expand Down
106 changes: 86 additions & 20 deletions tests/robustness/checker/checker.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"io/ioutil"
"log"
"os"
"strconv"
"time"

"github.com/pkg/errors"
Expand All @@ -19,28 +20,45 @@ import (
"github.com/kopia/kopia/tests/robustness/snapmeta"
)

const (
// deleteLimitEnvKey is the name of the environment variable that NewChecker
// reads to populate Checker.DeleteLimit.
deleteLimitEnvKey = "LIVE_SNAP_DELETE_LIMIT"
// defaultDeleteLimit is the delete limit NewChecker falls back to when the
// environment variable's value cannot be parsed as an integer.
defaultDeleteLimit = 10
)

// Checker is an object that can take snapshots and restore them, performing
// a validation for data consistency.
type Checker struct {
// RestoreDir is the temporary directory created by NewChecker for restored data.
RestoreDir string
// snapshotIssuer carries out snapshot operations against the repository
// (for example DeleteSnapshot).
snapshotIssuer snap.Snapshotter
// snapshotMetadataStore holds the checker's snapshot metadata and the
// named indexes of snapshot IDs (GetKeys, AddToIndex, RemoveFromIndex, Delete).
snapshotMetadataStore snapmeta.Store
// validator gathers validation data for a restored path and compares a
// restored path against previously stored validation data.
validator Comparer
// RecoveryMode, when true, makes the checker repair mismatches between its
// metadata and the repository (dropping stale metadata, deleting snapshots
// with no metadata, regenerating missing validation data on restore)
// rather than only counting them as errors. NewChecker initializes it to false.
RecoveryMode bool
// DeleteLimit is the number of recovery-mode snapshot deletions after which
// VerifySnapshotMetadata logs "delete limit reached" and counts an error.
// NOTE(review): in the visible code the deletion still proceeds after the
// limit is hit — confirm whether the limit is meant to stop further deletes.
DeleteLimit int
}

// NewChecker instantiates a new Checker, returning its pointer. A temporary
// directory with the "restore-data-" prefix is created to mount restored data;
// restoreDir is handed to ioutil.TempDir as the parent directory argument.
//
// The checker's DeleteLimit is parsed from the LIVE_SNAP_DELETE_LIMIT
// environment variable. If the value cannot be parsed as an integer (which
// includes the variable being empty or unset), defaultDeleteLimit is used and
// a message is logged. RecoveryMode is initialized to false.
//
// An error is returned only when the temporary directory cannot be created.
func NewChecker(snapIssuer snap.Snapshotter, snapmetaStore snapmeta.Store, validator Comparer) (*Checker, error) {
restoreDir, err := ioutil.TempDir("", "restore-data-")
func NewChecker(snapIssuer snap.Snapshotter, snapmetaStore snapmeta.Store, validator Comparer, restoreDir string) (*Checker, error) {
restoreDir, err := ioutil.TempDir(restoreDir, "restore-data-")
if err != nil {
return nil, err
}

delLimitStr := os.Getenv(deleteLimitEnvKey)

// Any Atoi failure is treated as "no limit configured" rather than as a
// construction error, so the checker falls back to the default.
delLimit, err := strconv.Atoi(delLimitStr)
if err != nil {
log.Printf("using default delete limit %d", defaultDeleteLimit)
delLimit = defaultDeleteLimit
}

return &Checker{
RestoreDir: restoreDir,
snapshotIssuer: snapIssuer,
snapshotMetadataStore: snapmetaStore,
validator: validator,
RecoveryMode: false,
DeleteLimit: delLimit,
}, nil
}

Expand All @@ -53,7 +71,7 @@ func (chk *Checker) Cleanup() {

// GetSnapIDs gets the list of snapshot IDs being tracked by the checker's snapshot store.
// The IDs are the keys the store returns for the allSnapshotsIdxName index.
func (chk *Checker) GetSnapIDs() []string {
return chk.snapshotMetadataStore.GetKeys()
return chk.snapshotMetadataStore.GetKeys(allSnapshotsIdxName)
}

// SnapshotMetadata holds metadata associated with a given snapshot.
Expand All @@ -73,18 +91,7 @@ func (chk *Checker) GetSnapshotMetadata(snapID string) (*SnapshotMetadata, error
// GetLiveSnapIDs gets the list of snapshot IDs being tracked by the checker's snapshot store
// that do not have a deletion time associated with them.
// The list is served from the store's liveSnapshotsIdxName index instead of
// being rebuilt by checking every tracked snapshot ID for deletion.
func (chk *Checker) GetLiveSnapIDs() []string {
snapIDs := chk.GetSnapIDs()

var ret []string

for _, snapID := range snapIDs {
deleted, err := chk.IsSnapshotIDDeleted(snapID)
if err == nil && !deleted {
ret = append(ret, snapID)
}
}

return ret
return chk.snapshotMetadataStore.GetKeys(liveSnapshotsIdxName)
}

// IsSnapshotIDDeleted reports whether the metadata associated with the provided snapshot ID
Expand Down Expand Up @@ -125,14 +132,44 @@ func (chk *Checker) VerifySnapshotMetadata() error {

for _, metaSnapID := range liveSnapsInMetadata {
if _, ok := liveMap[metaSnapID]; !ok {
log.Printf("Metadata present for snapID %v but not found in known metadata", metaSnapID)
errCount++
log.Printf("Metadata present for snapID %v but not found in list of repo snapshots", metaSnapID)

if chk.RecoveryMode {
chk.snapshotMetadataStore.Delete(metaSnapID)
chk.snapshotMetadataStore.RemoveFromIndex(metaSnapID, liveSnapshotsIdxName)
} else {
errCount++
}
}
}

var liveSnapsDeleted int

for _, liveSnapID := range liveSnapsInRepo {
if _, ok := metadataMap[liveSnapID]; !ok {
log.Printf("Live snapshot present for snapID %v but not found in known metadata", liveSnapID)
if _, ok := metadataMap[liveSnapID]; ok {
// Found live snapshot ID in the metadata. No recovery handling needed.
continue
}

log.Printf("Live snapshot present for snapID %v but not found in known metadata", liveSnapID)

if chk.RecoveryMode {
if liveSnapsDeleted >= chk.DeleteLimit {
log.Printf("delete limit (%v) reached", chk.DeleteLimit)
errCount++
}

// Might as well delete the snapshot since we don't have metadata for it
log.Printf("Deleting snapshot ID %s", liveSnapID)

err = chk.snapshotIssuer.DeleteSnapshot(liveSnapID)
if err != nil {
log.Printf("error deleting snapshot: %s", err)
errCount++
}

liveSnapsDeleted++
} else {
errCount++
}
}
Expand Down Expand Up @@ -173,6 +210,9 @@ func (chk *Checker) TakeSnapshot(ctx context.Context, sourceDir string) (snapID
return snapID, err
}

chk.snapshotMetadataStore.AddToIndex(snapID, allSnapshotsIdxName)
chk.snapshotMetadataStore.AddToIndex(snapID, liveSnapshotsIdxName)

return snapID, nil
}

Expand Down Expand Up @@ -211,6 +251,22 @@ func (chk *Checker) RestoreVerifySnapshot(ctx context.Context, snapID, destPath
return err
}

if ssMeta == nil && chk.RecoveryMode {
var b []byte

b, err = chk.validator.Gather(ctx, destPath)
if err != nil {
return err
}

ssMeta = &SnapshotMetadata{
SnapID: snapID,
ValidationData: b,
}

return chk.saveSnapshotMetadata(ssMeta)
}

err = chk.validator.Compare(ctx, destPath, ssMeta.ValidationData, reportOut)
if err != nil {
return err
Expand All @@ -219,6 +275,12 @@ func (chk *Checker) RestoreVerifySnapshot(ctx context.Context, snapID, destPath
return nil
}

// Names of the snapshot-ID indexes the checker maintains in its snapshot
// metadata store.
const (
// deletedSnapshotsIdxName is the index DeleteSnapshot adds a snapshot ID to
// once its metadata has been marked as deleted.
deletedSnapshotsIdxName = "deleted-snapshots-idx"
// liveSnapshotsIdxName is the index of snapshots not yet deleted: TakeSnapshot
// adds to it, DeleteSnapshot removes from it, and GetLiveSnapIDs reads it.
liveSnapshotsIdxName = "live-snapshots-idx"
// allSnapshotsIdxName is the index TakeSnapshot adds every new snapshot ID to;
// GetSnapIDs reads it.
allSnapshotsIdxName = "all-snapshots-idx"
)

// DeleteSnapshot performs the Snapshotter's DeleteSnapshot action, and
// marks the snapshot with the given snapshot ID as deleted.
func (chk *Checker) DeleteSnapshot(ctx context.Context, snapID string) error {
Expand All @@ -232,13 +294,17 @@ func (chk *Checker) DeleteSnapshot(ctx context.Context, snapID string) error {
return err
}

ssMeta.DeletionTime = clock.Now()
ssMeta.DeletionTime = time.Now()
ssMeta.ValidationData = nil

err = chk.saveSnapshotMetadata(ssMeta)
if err != nil {
return err
}

chk.snapshotMetadataStore.AddToIndex(ssMeta.SnapID, deletedSnapshotsIdxName)
chk.snapshotMetadataStore.RemoveFromIndex(ssMeta.SnapID, liveSnapshotsIdxName)

return nil
}

Expand Down

0 comments on commit 71dcbcf

Please sign in to comment.