-
-
Notifications
You must be signed in to change notification settings - Fork 264
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[JENKINS-61103] Retry on class resource load failures and introduce timeouts #379
Changes from 6 commits
02f15c0
45acb17
adb3b80
11edae4
23e8617
08ce1b4
5c0bec8
8099a97
e6ddc55
8d8321e
780bdde
24a1f89
dd81a0b
9b505b3
4ef7ad3
d6985ed
3745555
5dbdde7
23d1ba3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -73,6 +73,38 @@ final class RemoteClassLoader extends URLClassLoader { | |
|
||
private static final Logger LOGGER = Logger.getLogger(RemoteClassLoader.class.getName()); | ||
|
||
/** | ||
* Intercept {@link RemoteClassLoader#loadRemoteClass(String, Channel, ClassReference, RemoteClassLoader)} for unit tests. | ||
* See JENKINS-6604 and similar issues. | ||
* Should not be used for any other purpose. | ||
*/ | ||
static Runnable TESTING_CLASS_LOAD; | ||
/** | ||
* Intercept {@link RemoteClassLoader#prefetchClassReference(String, Channel)} for unit tests. | ||
* Should not be used for any other purpose. | ||
*/ | ||
static Runnable TESTING_CLASS_REFERENCE_LOAD; | ||
/** | ||
* Intercept {@link RemoteClassLoader#findResource(String)} for unit tests. | ||
* Should not be used for any other purpose. | ||
*/ | ||
static Runnable TESTING_RESOURCE_LOAD; | ||
|
||
/** | ||
* The amount of time to sleep before retrying an interrupted class load. | ||
* This sleep keeps it from hammering the channel if there is a failure. | ||
* The default value is 100 (ms). | ||
*/ | ||
static int RETRY_SLEEP_DURATION_MILLISECONDS = Integer.getInteger(RemoteClassLoader.class.getName() + "retrySleepDurationMilliseconds", 100); | ||
/** | ||
* The total number of retries for an interrupted class load. | ||
* This makes the operation retry for an extended period of time but eventually timeout. | ||
* Combined with the default value for RETRY_SLEEP_DURATION_MILLISECONDS this gives a default | ||
* timeout of 10 minutes, which is much less than the former (infinite) retry but still a significant | ||
* amount of time. | ||
*/ | ||
static int MAX_RETRIES = Integer.getInteger(RemoteClassLoader.class.getName() + "maxRetries", 10 * 60 * 1000 / RETRY_SLEEP_DURATION_MILLISECONDS); | ||
|
||
/** | ||
* Proxy to the code running on remote end. | ||
* <p> | ||
|
@@ -239,11 +271,9 @@ private Class<?> loadRemoteClass(String name, Channel channel, ClassReference cr | |
// and just retry until it succeeds, but in the end we set the interrupt flag | ||
// back on to let the interrupt in the next earliest occasion. | ||
|
||
while (true) { | ||
for (int tries = 0; tries < MAX_RETRIES; tries++) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe start at |
||
try { | ||
if (TESTING_CLASS_LOAD != null) { | ||
TESTING_CLASS_LOAD.run(); | ||
} | ||
invokeClassLoadTestingHookIfNeeded(); | ||
|
||
if (c != null) { | ||
return c; | ||
|
@@ -269,10 +299,15 @@ private Class<?> loadRemoteClass(String name, Channel channel, ClassReference cr | |
// but we need to remember to set the interrupt flag back on | ||
// before we leave this call. | ||
interrupted = true; | ||
try { | ||
rcl.getClassLoadingLock(name).wait(RETRY_SLEEP_DURATION_MILLISECONDS); | ||
} catch (InterruptedException e) { | ||
// Not much to do if we can't sleep. Run through the tries more quickly. | ||
} | ||
LOGGER.finer("Handling interrupt while loading remote class. Current retry count = " + tries + ", maximum = " + MAX_RETRIES); | ||
} | ||
|
||
// no code is allowed to reach here | ||
} | ||
} | ||
jeffret-b marked this conversation as resolved.
Show resolved
Hide resolved
|
||
throw new ClassNotFoundException("Could not load class " + name + " after " + MAX_RETRIES + " tries."); | ||
} finally { | ||
// process the interrupt later. | ||
if (interrupted) | ||
|
@@ -281,6 +316,17 @@ private Class<?> loadRemoteClass(String name, Channel channel, ClassReference cr | |
} | ||
} | ||
|
||
private void invokeClassLoadTestingHookIfNeeded() throws InterruptedException { | ||
// Testing support only. | ||
if (TESTING_CLASS_LOAD != null) { | ||
TESTING_CLASS_LOAD.run(); | ||
if (Thread.currentThread().isInterrupted()) { | ||
// Otherwise the interrupt isn't recognized and the test doesn't work. | ||
throw new InterruptedException("loading was interrupted."); | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: maybe use a different runnable hook, which would directly throw the exception you need for your test, instead of "hardcoding" the exception here? |
||
} | ||
} | ||
|
||
private ClassReference prefetchClassReference(String name, Channel channel) throws ClassNotFoundException { | ||
ClassReference cr; | ||
cr = prefetchedClasses.remove(name); | ||
|
@@ -294,11 +340,9 @@ private ClassReference prefetchClassReference(String name, Channel channel) thro | |
// and just retry until it succeeds, but in the end we set the interrupt flag | ||
// back on to let the interrupt in the next earliest occasion. | ||
|
||
while (true) { | ||
for (int tries = 0; tries < MAX_RETRIES; tries++) { | ||
try { | ||
if (TESTING_CLASS_REFERENCE_LOAD != null) { | ||
TESTING_CLASS_REFERENCE_LOAD.run(); | ||
} | ||
invokeClassReferenceLoadTestingHookIfNeeded(); | ||
|
||
Map<String, ClassFile2> all = proxy.fetch3(name); | ||
synchronized (prefetchedClasses) { | ||
|
@@ -353,6 +397,12 @@ ClassReference toRef(ClassFile2 cf) { | |
// but we need to remember to set the interrupt flag back on | ||
// before we leave this call. | ||
interrupted = true; | ||
try { | ||
Thread.sleep(RETRY_SLEEP_DURATION_MILLISECONDS); | ||
} catch (InterruptedException e) { | ||
// Not much to do if we can't sleep. Run through the tries more quickly. | ||
} | ||
LOGGER.finer("Handling interrupt while fetching class reference. Current retry count = " + tries + ", maximum = " + MAX_RETRIES); | ||
continue; // JENKINS-19453: retry | ||
} | ||
throw x; | ||
|
@@ -375,13 +425,12 @@ ClassReference toRef(ClassFile2 cf) { | |
return cr; | ||
} | ||
|
||
/** | ||
* Intercept {@link RemoteClassLoader#findClass(String)} to allow unit tests to be written. | ||
* <p> | ||
* See JENKINS-6604 and similar issues | ||
*/ | ||
static Runnable TESTING_CLASS_LOAD; | ||
static Runnable TESTING_CLASS_REFERENCE_LOAD; | ||
private void invokeClassReferenceLoadTestingHookIfNeeded() { | ||
// Testing support only. | ||
if (TESTING_CLASS_REFERENCE_LOAD != null) { | ||
TESTING_CLASS_REFERENCE_LOAD.run(); | ||
} | ||
} | ||
|
||
/** | ||
* Loads class from the byte array. | ||
|
@@ -449,38 +498,75 @@ public URL findResource(String name) { | |
return url; | ||
} | ||
|
||
boolean interrupted = false; | ||
try { | ||
if (resourceMap.containsKey(name)) { | ||
URLish f = resourceMap.get(name); | ||
if (f == null) { | ||
return null; // no such resource | ||
} | ||
URL u = f.toURL(); | ||
if (u != null) { | ||
return u; | ||
} | ||
} | ||
for (int tries = 0; tries < MAX_RETRIES; tries++) { | ||
try { | ||
if (resourceMap.containsKey(name)) { | ||
URLish f = resourceMap.get(name); | ||
if (f == null) { | ||
return null; // no such resource | ||
} | ||
URL u = f.toURL(); | ||
if (u != null) { | ||
return u; | ||
} | ||
} | ||
|
||
long startTime = System.nanoTime(); | ||
invokeResourceLoadTestingHookIfNeeded(); | ||
|
||
ResourceFile r = proxy.getResource2(name); | ||
ResourceImageRef image = null; | ||
if (r != null) { | ||
image = r.image; | ||
} | ||
long startTime = System.nanoTime(); | ||
|
||
channel.resourceLoadingTime.addAndGet(System.nanoTime() - startTime); | ||
channel.resourceLoadingCount.incrementAndGet(); | ||
if (image == null) { | ||
resourceMap.put(name, null); | ||
return null; | ||
ResourceFile r = proxy.getResource2(name); | ||
ResourceImageRef image = null; | ||
if (r != null) { | ||
image = r.image; | ||
} | ||
|
||
channel.resourceLoadingTime.addAndGet(System.nanoTime() - startTime); | ||
channel.resourceLoadingCount.incrementAndGet(); | ||
if (image == null) { | ||
resourceMap.put(name, null); | ||
return null; | ||
} | ||
|
||
URLish res = image.resolveURL(channel, name).get(); | ||
resourceMap.put(name, res); | ||
return res.toURL(); | ||
} catch (IOException | InterruptedException | ExecutionException e) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. move There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This makes me reconsider the previous change to standardize the exceptions through a single method. Was there some reason InterruptedException was handled the way it was? -- But, I don't have much confidence that the previous exception handling was correct or very intentional. I still think the standardization is probably better. |
||
throw new Error("Unable to load resource " + name, e); | ||
} catch (RemotingSystemException x) { | ||
if (x.getCause() instanceof InterruptedException) { | ||
// pretend as if this operation is not interruptible. | ||
// but we need to remember to set the interrupt flag back on | ||
// before we leave this call. | ||
interrupted = true; | ||
try { | ||
Thread.sleep(RETRY_SLEEP_DURATION_MILLISECONDS); | ||
} catch (InterruptedException e) { | ||
// Not much to do if we can't sleep. Run through the tries more quickly. | ||
} | ||
LOGGER.finer("Handling interrupt while finding resource. Current retry count = " + tries + ", maximum = " + MAX_RETRIES); | ||
continue; | ||
} | ||
throw x; | ||
} | ||
|
||
// no code is allowed to reach here | ||
} | ||
} finally { | ||
// process the interrupt later. | ||
if (interrupted) { | ||
Thread.currentThread().interrupt(); | ||
} | ||
} | ||
throw new RuntimeException("Could not load resource " + name + " after " + MAX_RETRIES + " tries."); | ||
} | ||
|
||
URLish res = image.resolveURL(channel, name).get(); | ||
resourceMap.put(name, res); | ||
return res.toURL(); | ||
} catch (IOException | InterruptedException | ExecutionException e) { | ||
throw new Error("Unable to load resource " + name, e); | ||
private void invokeResourceLoadTestingHookIfNeeded() { | ||
// Testing support only. | ||
if (TESTING_RESOURCE_LOAD != null) { | ||
TESTING_RESOURCE_LOAD.run(); | ||
} | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: this forbids
RETRY_SLEEP_DURATION_MILLISECONDS=0
(would divide by zero); I think allowing0
(with some arbitrary default value forMAX_RETRIES
, and someif (... > 0)
conditions around the sleep/wait calls) could be good.