Skip to content

Commit

Permalink
Native Skyframe support for node restarting
Browse files Browse the repository at this point in the history
Useful for attempting to recover relationships between Skyframe graph
state and external systems, when the evaluation of a Skyframe node has
the side effect of creating that relationship.

Currently, only supported in graph evaluations when reverse dependency
edges are not tracked.

RELNOTES: None.
PiperOrigin-RevId: 202892953
  • Loading branch information
anakanemison authored and Copybara-Service committed Jul 2, 2018
1 parent b916694 commit 4f64b77
Show file tree
Hide file tree
Showing 5 changed files with 97 additions and 34 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,14 @@
import com.google.devtools.build.skyframe.NodeEntry.DirtyState;
import com.google.devtools.build.skyframe.ParallelEvaluatorContext.EnqueueParentBehavior;
import com.google.devtools.build.skyframe.QueryableGraph.Reason;
import com.google.devtools.build.skyframe.SkyFunction.Restart;
import com.google.devtools.build.skyframe.SkyFunctionEnvironment.UndonePreviouslyRequestedDep;
import com.google.devtools.build.skyframe.SkyFunctionException.ReifiedSkyFunctionException;
import java.time.Duration;
import java.util.Collection;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.ForkJoinPool;
import java.util.logging.Logger;
Expand Down Expand Up @@ -179,8 +181,10 @@ private boolean invalidatedByErrorTransience(Collection<SkyKey> depGroup, NodeEn
throws InterruptedException {
return depGroup.size() == 1
&& depGroup.contains(ErrorTransienceValue.KEY)
&& !graph.get(
null, Reason.OTHER, ErrorTransienceValue.KEY).getVersion().atMost(entry.getVersion());
&& !graph
.get(null, Reason.OTHER, ErrorTransienceValue.KEY)
.getVersion()
.atMost(entry.getVersion());
}

private DirtyOutcome maybeHandleDirtyNode(NodeEntry state) throws InterruptedException {
Expand Down Expand Up @@ -363,20 +367,21 @@ public void run() {
Set<SkyKey> oldDeps = state.getAllRemainingDirtyDirectDeps();
SkyFunctionEnvironment env;
try {
evaluatorContext.getProgressReceiver().stateStarting(skyKey,
NodeState.INITIALIZING_ENVIRONMENT);
evaluatorContext
.getProgressReceiver()
.stateStarting(skyKey, NodeState.INITIALIZING_ENVIRONMENT);
env =
new SkyFunctionEnvironment(
skyKey, state.getTemporaryDirectDeps(), oldDeps, evaluatorContext);
} catch (UndonePreviouslyRequestedDep undonePreviouslyRequestedDep) {
// If a previously requested dep is no longer done, restart this node from scratch.
maybeEraseNodeToRestartFromScratch(
skyKey, state, SkyFunction.SENTINEL_FOR_RESTART_FROM_SCRATCH);
restart(skyKey, state);
evaluatorContext.getVisitor().enqueueEvaluation(skyKey);
return;
} finally {
evaluatorContext.getProgressReceiver().stateEnding(skyKey,
NodeState.INITIALIZING_ENVIRONMENT, -1);
evaluatorContext
.getProgressReceiver()
.stateEnding(skyKey, NodeState.INITIALIZING_ENVIRONMENT, -1);
}
SkyFunctionName functionName = skyKey.functionName();
SkyFunction factory =
Expand Down Expand Up @@ -476,7 +481,7 @@ public void run() {
env.doneBuilding();
}

if (maybeEraseNodeToRestartFromScratch(skyKey, state, value)) {
if (maybeHandleRestart(skyKey, state, value)) {
evaluatorContext.getVisitor().enqueueEvaluation(skyKey);
return;
}
Expand Down Expand Up @@ -652,15 +657,36 @@ private String prepareCrashMessage(SkyKey skyKey, Iterable<SkyKey> reverseDeps)
private static final int MAX_REVERSEDEP_DUMP_LENGTH = 1000;
}

private boolean maybeEraseNodeToRestartFromScratch(
SkyKey key, NodeEntry entry, SkyValue returnedValue) {
if (!SkyFunction.SENTINEL_FOR_RESTART_FROM_SCRATCH.equals(returnedValue)) {
/**
* If {@code returnedValue} is a {@link Restart} value, then {@code entry} will be reset, and the
* nodes specified by {@code returnedValue.getAdditionalKeysToRestart()} will be marked changed.
*
* @return {@code returnedValue instanceof Restart}
*/
private boolean maybeHandleRestart(SkyKey key, NodeEntry entry, SkyValue returnedValue)
throws InterruptedException {
if (!(returnedValue instanceof Restart)) {
return false;
}
evaluatorContext
.getGraphInconsistencyReceiver()
.noteInconsistencyAndMaybeThrow(key, /*otherKey=*/ null, Inconsistency.RESET_REQUESTED);
entry.resetForRestartFromScratch();
restart(key, entry);

Restart restart = (Restart) returnedValue;

Map<SkyKey, ? extends NodeEntry> additionalNodesToRestart =
this.evaluatorContext
.getBatchValues(key, Reason.INVALIDATION, restart.getAdditionalKeysToRestart());
for (Entry<SkyKey, ? extends NodeEntry> restartEntry : additionalNodesToRestart.entrySet()) {
evaluatorContext
.getGraphInconsistencyReceiver()
.noteInconsistencyAndMaybeThrow(
restartEntry.getKey(),
/*otherKey=*/ key,
Inconsistency.CHILD_FORCED_REEVALUATION_BY_PARENT);
// Nodes are marked changed to ensure that they run. (Also, marking dirty-but-not-changed
// would fail if the node has no deps, because dirtying works only when nodes have deps.)
restartEntry.getValue().markDirty(/*isChanged=*/ true);
}

// TODO(mschaller): rdeps of children have to be handled here. If the graph does not keep edges,
// nothing has to be done, since there are no reverse deps to keep consistent. If the graph
// keeps edges, it's a harder problem. The reverse deps could just be removed, but in the case
Expand All @@ -670,6 +696,13 @@ private boolean maybeEraseNodeToRestartFromScratch(
return true;
}

private void restart(SkyKey key, NodeEntry entry) {
evaluatorContext
.getGraphInconsistencyReceiver()
.noteInconsistencyAndMaybeThrow(key, /*otherKey=*/ null, Inconsistency.RESET_REQUESTED);
entry.resetForRestartFromScratch();
}

void propagateEvaluatorContextCrashIfAny() {
if (!evaluatorContext.getVisitor().getCrashes().isEmpty()) {
evaluatorContext
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ void noteInconsistencyAndMaybeThrow(
enum Inconsistency {
RESET_REQUESTED,
CHILD_MISSING_FOR_DIRTY_NODE,
CHILD_FORCED_REEVALUATION_BY_PARENT,
CHILD_UNDONE_FOR_BUILDING_NODE
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -362,8 +362,8 @@ DependencyState addReverseDepAndCheckIfDone(@Nullable SkyKey reverseDep)
/**
* Erases all stored work during this evaluation from this entry, namely all temporary direct
* deps. The entry will be as if it had never evaluated at this version. Called after the {@link
* SkyFunction} for this entry returns {@link SkyFunction#SENTINEL_FOR_RESTART_FROM_SCRATCH},
* indicating that something went wrong in external state and the evaluation has to be restarted.
* SkyFunction} for this entry returns {@link SkyFunction.Restart}, indicating that something went
* wrong in external state and the evaluation has to be restarted.
*/
@ThreadSafe
void resetForRestartFromScratch();
Expand Down
42 changes: 32 additions & 10 deletions src/main/java/com/google/devtools/build/skyframe/SkyFunction.java
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
package com.google.devtools.build.skyframe;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableList;
import com.google.devtools.build.lib.concurrent.ThreadSafety.ThreadSafe;
import com.google.devtools.build.lib.events.ExtendedEventHandler;
import com.google.devtools.build.lib.util.GroupedList;
Expand Down Expand Up @@ -46,8 +47,8 @@ public interface SkyFunction {
* another containing that additional context. If it has no such additional context, then it
* should allow its dependency's exception to be thrown through it.
*
* <p>This method may return {@link #SENTINEL_FOR_RESTART_FROM_SCRATCH} in rare circumstances. See
* its docs. Do not return this value unless you know exactly what you are doing.
* <p>This method may return {@link Restart} in rare circumstances. See its docs. Do not return
* values of this type unless you know exactly what you are doing.
*
* @throws SkyFunctionException on failure
* @throws InterruptedException if interrupted
Expand All @@ -69,15 +70,36 @@ SkyValue compute(SkyKey skyKey, Environment env)
String extractTag(SkyKey skyKey);

/**
* Sentinel value for {@link #compute} to return, indicating that something went wrong in external
* state and the evaluation has to be restarted from scratch, ignoring any deps that the {@link
* #compute} function may have declared during evaluation at this version (including deps declared
* during previous calls that returned null). Common causes for returning this would be data loss,
* if a dep's data is not actually available externally. In this case, the {@link SkyFunction}
* will likely dirty the unusable dep to force its re-evalution when re-requested by the restarted
* entry's computation.
* Sentinel {@link SkyValue} type for {@link #compute} to return, indicating that something went
* wrong, and that the evaluation returning this value must be restarted, and the nodes associated
* with the specified keys (which should be direct or transitive dependencies of the failed
* evaluation) must also be restarted.
*
* <p>An intended cause for returning this is external data loss; e.g., if a dependency's
* "done-ness" is intended to mean that certain data is available in an external system, but
* during evaluation of a node that depends on that external data, that data has gone missing, and
* reevaluation of the dependency is expected to repair the discrepancy.
*
* <p>Values of this type will <em>never</em> be returned by {@link Environment}'s getValue
* methods or from {@link NodeEntry#getValue()}.
*
* <p>TODO(mschaller): the ability to specify arbitrary additional keys to restart is error-prone.
* It would be safer to require nodes requesting restarts to provide dependency paths, which the
* framework could efficiently verify before restarting.
*/
SkyValue SENTINEL_FOR_RESTART_FROM_SCRATCH = new SkyValue() {};
interface Restart extends SkyValue {
Restart SELF = ImmutableList::of;

static Restart selfAnd(SkyKey... additionalKeysToRestart) {
return selfAnd(ImmutableList.copyOf(additionalKeysToRestart));
}

static Restart selfAnd(ImmutableList<SkyKey> additionalKeysToRestart) {
return () -> additionalKeysToRestart;
}

ImmutableList<SkyKey> getAdditionalKeysToRestart();
}

/**
* The services provided to the {@link SkyFunction#compute} implementation by the Skyframe
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2283,7 +2283,7 @@ public void resetNodeOnRequest_smoke() throws Exception {
@Override
public SkyValue compute(SkyKey skyKey, Environment env) {
if (numFunctionCalls.getAndIncrement() < 2) {
return SkyFunction.SENTINEL_FOR_RESTART_FROM_SCRATCH;
return Restart.SELF;
}
return expectedValue;
}
Expand Down Expand Up @@ -2363,7 +2363,7 @@ public SkyValue compute(SkyKey skyKey, Environment env) throws InterruptedExcept
}
env.getValues(ImmutableList.of(newlyRequestedDoneDep, newlyRequestedNotDoneDep));
if (numFunctionCalls.get() < 4) {
return SkyFunction.SENTINEL_FOR_RESTART_FROM_SCRATCH;
return Restart.SELF;
} else if (numFunctionCalls.get() == 4) {
if (cleanBuild.get()) {
Preconditions.checkState(
Expand Down Expand Up @@ -4892,20 +4892,27 @@ public void initialize(boolean keepEdges) {
this.driver = getBuildDriver(evaluator);
}

public void setProgressReceiver(TrackingProgressReceiver customProgressReceiver) {
Preconditions.checkState(evaluator == null, "evaluator already initialized");
progressReceiver = customProgressReceiver;
/**
* Sets the {@link #progressReceiver}. {@link #initialize} must be called after this to have any
* effect.
*/
public void setProgressReceiver(TrackingProgressReceiver progressReceiver) {
this.progressReceiver = progressReceiver;
}

/**
* Sets the {@link #graphInconsistencyReceiver}. {@link #initialize} must be called after this
* to have any effect/
* to have any effect.
*/
public void setGraphInconsistencyReceiver(
GraphInconsistencyReceiver graphInconsistencyReceiver) {
this.graphInconsistencyReceiver = graphInconsistencyReceiver;
}

public MemoizingEvaluator getEvaluator() {
return evaluator;
}

public void invalidate() {
differencer.invalidate(getModifiedValues());
clearModifiedValues();
Expand Down

0 comments on commit 4f64b77

Please sign in to comment.