Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[JENKINS-39150] expose diagnostics across all the channels #122

Merged
merged 8 commits into from
Oct 21, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
145 changes: 125 additions & 20 deletions src/main/java/hudson/remoting/Channel.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import hudson.remoting.forward.PortForwarder;
import org.jenkinsci.remoting.CallableDecorator;
import org.jenkinsci.remoting.RoleChecker;
import org.jenkinsci.remoting.nio.NioChannelHub;

import javax.annotation.CheckForNull;
import javax.annotation.Nonnull;
Expand All @@ -45,6 +46,7 @@
import java.lang.ref.WeakReference;
import java.net.URL;
import java.util.Collections;
import java.util.Date;
import java.util.Hashtable;
import java.util.Locale;
import java.util.Map;
Expand Down Expand Up @@ -204,8 +206,40 @@ public class Channel implements VirtualChannel, IChannel, Closeable {
*/
private final Vector<Listener> listeners = new Vector<Listener>();
private int gcCounter;

/**
* Number of {@link Command} objects sent to the other side.
*/
private int commandsSent;

/**
* Number of {@link Command} objects received from the other side.
*
* When a transport is functioning correctly, {@link #commandsSent} of one side
* and {@link #commandsReceived} of the other side should closely match.
*/
private int commandsReceived;

/**
* Timestamp of the last {@link Command} object sent/received, in
* {@link System#currentTimeMillis()} format.
* This can be used as a basis for detecting dead connections.
*
* <p>
* Note that {@link #lastCommandSentAt} doesn't mean
* anything in terms of whether the underlying network was able to send
* the data (for example, if the other end of a socket connection goes down
* without telling us anything, the {@link SocketOutputStream#write(int)} will
* return right away, and the socket only really times out after 10s of minutes.
*/
private volatile long lastCommandSentAt, lastCommandReceivedAt;

/**
* Timestamp of when this channel was connected/created, in
* {@link System#currentTimeMillis()} format.
*/
private final long createdAt = System.currentTimeMillis();

/**
* Total number of nanoseconds spent for remote class loading.
* <p>
Expand Down Expand Up @@ -266,19 +300,6 @@ public class Channel implements VirtualChannel, IChannel, Closeable {
*/
public final Capability remoteCapability;

/**
* When did we receive any data from this slave the last time?
* This can be used as a basis for detecting dead connections.
* <p>
* Note that this doesn't include our sender side of the operation,
* as successfully returning from {@link #send(Command)} doesn't mean
* anything in terms of whether the underlying network was able to send
* the data (for example, if the other end of a socket connection goes down
* without telling us anything, the {@link SocketOutputStream#write(int)} will
* return right away, and the socket only really times out after 10s of minutes.
*/
private volatile long lastHeard;

/**
* Single-thread executor for running pipe I/O operations.
*
Expand Down Expand Up @@ -494,7 +515,8 @@ protected Channel(ChannelBuilder settings, CommandTransport transport) throws IO

transport.setup(this, new CommandReceiver() {
public void handle(Command cmd) {
updateLastHeard();
commandsReceived++;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Risk of overflow?

lastCommandReceivedAt = System.currentTimeMillis();
if (logger.isLoggable(Level.FINE))
logger.fine("Received " + cmd);
try {
Expand All @@ -509,6 +531,7 @@ public void terminate(IOException e) {
Channel.this.terminate(e);
}
});
ACTIVE_CHANNELS.put(this,ref());
}

/**
Expand Down Expand Up @@ -581,6 +604,7 @@ private ExecutorService createPipeWriterExecutor() {

transport.write(cmd, cmd instanceof CloseCommand);
commandsSent++;
lastCommandSentAt = System.currentTimeMillis();
}

/**
Expand Down Expand Up @@ -1148,6 +1172,73 @@ public void dumpPerformanceCounters(PrintWriter w) throws IOException {
w.printf(Locale.ENGLISH, "Resource loading time=%,dms%n", resourceLoadingTime.get() / (1000 * 1000));
}

/**
* Print the diagnostic information.
*
* <p>
* Here's how you interpret these metrics:
*
* <dl>
* <dt>Created
* <dd>
* When the channel was created, which is more or less the same thing as when the channel is connected.
*
* <dt>Commands sent
* <dd>
* Number of {@link Command} objects sent to the other side. More specifically, number of commands
* successfully passed to {@link #transport}, which means data was written to socket with
* {@link ClassicCommandTransport} but just buffered for write with {@link NioChannelHub}.
*
* If you have access to the remoting diagnostics
* of the other side of the channel, then you can compare this with "commandsReceived" metrics of the other
* side to see how many commands are in transit in transport. If {@code commandsSent==commandsReceived},
* then no command is in transit.
*
* <dt>Commands received
* <dd>
* Number of {@link Command} objects received from the other side. More precisely, number
* of commands reported from {@link #transport}. So for example, if data is still buffered somewhere
* in the networking stack, it won't be counted here.
*
* <dt>Last command sent
* <dd>
* The timestamp in which the last command was sent to the other side. The timestamp in which
* {@link #lastCommandSentAt} was updated.
*
* <dt>Last command received
* <dd>
* The timestamp in which the last command was sent to the other side. The timestamp in which
* {@link #lastCommandReceivedAt} was updated.
*
* <dt>Pending outgoing calls
* <dd>
* Number of RPC calls (e.g., method call through a {@linkplain RemoteInvocationHandler proxy})
* that are made but not returned yet. If you have the remoting diagnostics of the other side, you
* can compare this number with "pending incoming calls" on the other side to see how many RPC
* calls are executing vs in flight. "one side's incoming calls" < "the other side's outgoing calls"
* indicates some RPC requests or responses are passing through the network layer, and mismatch
* between "# of commands sent" vs "# of commands received" can give some indication of whether
* it is request or response that's in flight.
*
* <dt>Pending incoming calls
* <dd>
* The reverse of "pending outgoing calls".
* Number of RPC calls (e.g., method call through a {@linkplain RemoteInvocationHandler proxy})
* that the other side has made to us but not yet returned yet.
*
* @since 2.26.3
*/
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the Javadocs @kohsuke 👍

Copy link
Member

@oleg-nenashev oleg-nenashev Oct 24, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does not work in such way

public void dumpDiagnostics(PrintWriter w) throws IOException {
w.printf("Channel %s%n",name);
w.printf(" Created=%s%n", new Date(createdAt));
w.printf(" Commands sent=%d%n", commandsSent);
w.printf(" Commands received=%d%n", commandsReceived);
w.printf(" Last command sent=%s%n", new Date(lastCommandSentAt));
w.printf(" Last command received=%s%n", new Date(lastCommandReceivedAt));
w.printf(" Pending outgoing calls=%d%n", pendingCalls.size());
w.printf(" Pending incoming calls=%d%n", pendingCalls.size());
}

/**
* {@inheritDoc}
*/
Expand Down Expand Up @@ -1468,12 +1559,7 @@ public ExportTable.ExportList startExportRecording() {
*/
public long getLastHeard() {
// TODO - this is not safe against clock skew and is called from jenkins core (and potentially plugins)
return lastHeard;
}

private void updateLastHeard() {
// TODO - this is not safe against clock skew and is called from jenkins core (and potentially plugins)
lastHeard = System.currentTimeMillis();
return lastCommandReceivedAt;
}

/*package*/ static Channel setCurrent(Channel channel) {
Expand All @@ -1498,6 +1584,20 @@ public static Channel current() {
return CURRENT.get();
}

/**
* Calls {@link #dumpDiagnostics(PrintWriter)} across all the active channels in this system.
* Used for diagnostics.
*
* @since 2.26.3
*/
public static void dumpDiagnosticsForAll(PrintWriter w) throws IOException {
for (Ref ref : ACTIVE_CHANNELS.values().toArray(new Ref[0])) {
Channel ch = ref.channel();
if (ch!=null)
ch.dumpDiagnostics(w);
}
}

/**
* Remembers the current "channel" associated for this thread.
*/
Expand All @@ -1522,6 +1622,11 @@ public static Channel current() {
*/
public static final int PIPE_WINDOW_SIZE = Integer.getInteger(Channel.class.getName()+".pipeWindowSize",1024*1024);

/**
* Keep track of active channels in the system for diagnostics purposes.
*/
private static final Map<Channel,Ref> ACTIVE_CHANNELS = Collections.synchronizedMap(new WeakHashMap<Channel, Ref>());

static final Class jarLoaderProxy;

static {
Expand Down
2 changes: 2 additions & 0 deletions src/main/java/hudson/remoting/Command.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@

/**
* One-way command to be sent over to the remote system and executed there.
* This is the smallest unit of message in remoting from one side to another,
* such as "please execute this method" or "here's the return value from an earlier method call".
* This is layer 0, the lower most layer.
*
* <p>
Expand Down
13 changes: 13 additions & 0 deletions src/test/java/hudson/remoting/ChannelTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@

import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.PrintWriter;
import java.io.Serializable;
import java.io.StringWriter;
import java.net.URL;
import java.net.URLClassLoader;
import java.util.concurrent.TimeUnit;
Expand Down Expand Up @@ -163,4 +165,15 @@ public T call() throws RuntimeException {
return t;
}
}

@Bug(39150)
public void testDiagnostics() throws Exception {
StringWriter sw = new StringWriter();
Channel.dumpDiagnosticsForAll(new PrintWriter(sw));
System.out.println(sw);
assertTrue(sw.toString().contains("Channel north"));
assertTrue(sw.toString().contains("Channel south"));
assertTrue(sw.toString().contains("Commands sent=0"));
assertTrue(sw.toString().contains("Commands received=0"));
}
}