8 changes: 8 additions & 0 deletions packages/collector/src/agentConnection.js
@@ -489,11 +489,19 @@ function sendData(path, data, cb, ignore404 = false) {
);

req.setTimeout(agentOpts.requestTimeout, function onTimeout() {
if (req.destroyed) {
Contributor Author:
This resolves the warning

{"level":20,"time":"2025-10-13T15:58:45.423Z","threadId":0,"name":"@instana/collector","msg":"Function callback for sendData: /com.instana.plugin.nodejs.41842 was called 2 times. This time with the following arguments: {"args":[{}]}"}

return;
}

cb(new Error(`Failed to send data to agent via POST ${path}. Ran into a timeout.`));
req.destroy();
});

req.on('error', err => {
if (req.destroyed) {
return;
}

cb(new Error(`Send data to agent via POST ${path}. Request failed: ${err.message}`));
});

30 changes: 24 additions & 6 deletions packages/collector/src/announceCycle/agentready.js
@@ -49,6 +49,9 @@ if (typeof process.env.INSTANA_TRACER_METRICS_INTERVAL === 'string') {
}
}

/** @type {number | null} */
let originalTracingMetricsDelay = null;

/** @type {NodeJS.Timeout} */
let tracingMetricsTimeout = null;

@@ -183,23 +186,38 @@ function sendTracingMetrics() {

agentConnection.sendTracingMetricsToAgent(payload, error => {
if (error) {
- logger.info(
- `Error received while trying to send tracing metrics to agent: ${error?.message}.` +
- ' This will not affect monitoring or tracing.'
- );
- if (typeof error.message === 'string' && error.message.indexOf('Got status code 404')) {
if (typeof error.message === 'string' && error.message.indexOf('Got status code 404') !== -1) {
Contributor Author:
The good old classic bug:

error.message.indexOf('Got status code 404')

That's why

Apparently the version of the Instana host agent on this host does not support the POST /tracermetrics endpoint, will stop sending tracing metrics.

appeared in the logs. But that was wrong. We do not receive a 404 on a timeout error.
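
For anyone who wants to see the trap in isolation: indexOf returns -1 when the substring is absent, and -1 is truthy, so the old condition matched every error message except one starting with the search string at index 0.

'Ran into a timeout.'.indexOf('Got status code 404'); // => -1
if ('Ran into a timeout.'.indexOf('Got status code 404')) {
  // this branch runs for a timeout error, too; only index 0 would be falsy
}
// hence the explicit !== -1 comparison in the fix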

logger.info(
'Apparently the version of the Instana host agent on this host does not support the POST /tracermetrics ' +
- 'endpoint, will stop sending tracing metrics.'
'endpoint, will stop sending tracing metrics. This will not affect monitoring or tracing.'
);

return;
}

logger.warn(
`Error received while trying to send tracing metrics to agent: ${error?.message}.` +
' This will not affect monitoring or tracing. Will retry sending tracing metrics on next interval.'
);

if (originalTracingMetricsDelay === null) {
originalTracingMetricsDelay = tracingMetricsDelay;
}

// Reduce noisiness, increase retry time on error.
tracingMetricsDelay += 1000;
} else if (originalTracingMetricsDelay !== null) {
// on success, restore original delay
tracingMetricsDelay = originalTracingMetricsDelay;
originalTracingMetricsDelay = null;
Contributor Author:

Added a small piece of logic that increases the retry delay for sending tracer metrics to the /tracermetrics endpoint on error. Please review carefully.
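
A small standalone simulation of the added behavior (not part of the PR; the names mirror the variables above, and the 1000 ms starting delay is an assumption):

let tracingMetricsDelay = 1000;
let originalTracingMetricsDelay = null;

function onSendResult(error) {
  if (error) {
    if (originalTracingMetricsDelay === null) {
      originalTracingMetricsDelay = tracingMetricsDelay; // remember the configured delay once
    }
    tracingMetricsDelay += 1000; // back off linearly on every consecutive failure
  } else if (originalTracingMetricsDelay !== null) {
    tracingMetricsDelay = originalTracingMetricsDelay; // restore the configured delay on success
    originalTracingMetricsDelay = null;
  }
}

[new Error('timeout'), new Error('timeout'), null].forEach(onSendResult);
// tracingMetricsDelay progression: 2000 -> 3000 -> back to 1000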

Contributor:

Since this is metrics only, I think this should be fine.

}

scheduleTracingMetrics();
});
}

function scheduleTracingMetrics() {
logger.debug(`Sending tracing metrics to /tracermetrics in ${tracingMetricsDelay} ms...`);
tracingMetricsTimeout = setTimeout(sendTracingMetrics, tracingMetricsDelay);
tracingMetricsTimeout.unref();
}
28 changes: 26 additions & 2 deletions packages/collector/test/apps/agentStub.js
@@ -38,6 +38,7 @@ const logProfiles = process.env.LOG_PROFILES === 'true';
const rejectTraces = process.env.REJECT_TRACES === 'true';
const doesntHandleProfiles = process.env.DOESNT_HANDLE_PROFILES === 'true';
const tracingMetrics = process.env.TRACING_METRICS !== 'false';
let slowMetricsReply = process.env.SLOW_METRICS_REPLY === 'true';
const enableSpanBatching = process.env.ENABLE_SPANBATCHING === 'true';
const kafkaTraceCorrelation = process.env.KAFKA_TRACE_CORRELATION
? process.env.KAFKA_TRACE_CORRELATION === 'true'
@@ -155,6 +156,14 @@ app.delete('/agent/logs', (req, res) => {
app.post(
'/com.instana.plugin.nodejs.:pid',
checkExistenceOfKnownPid(function handleEntityData(req, res) {
if (slowMetricsReply) {
setTimeout(() => {
res.send('OK');
}, 1000 * 2);

return;
}

if (!dropAllData) {
receivedData.metrics.push({
pid: parseInt(req.params.pid, 10),
@@ -235,14 +244,24 @@ app.post(
);

app.post('/tracermetrics', function handleTracermetrics(req, res) {
if (slowMetricsReply) {
setTimeout(() => {
res.send('OK');
}, 1000 * 2);

return;
}

if (!dropAllData) {
receivedData.tracingMetrics.push(req.body);
}

if (!tracingMetrics) {
res.sendStatus(404);
- } else {
- res.send('OK');
return;
}

res.send('OK');
});

app.post('/com.instana.plugin.generic.event', function postEvent(req, res) {
Expand All @@ -259,6 +278,11 @@ app.post('/com.instana.plugin.generic.agent-monitoring-event', function postMoni
res.send('OK');
});

app.put('/make-healthy', (req, res) => {
slowMetricsReply = false;
res.send('OK');
});

function checkExistenceOfKnownPid(fn) {
return (req, res) => {
const pid = req.params.pid;
11 changes: 11 additions & 0 deletions packages/collector/test/apps/agentStubControls.js
@@ -37,6 +37,11 @@ class AgentStubControls {
if (typeof opts.tracingMetrics === 'boolean') {
env.TRACING_METRICS = opts.tracingMetrics.toString();
}

if (opts.slowMetricsReply) {
env.SLOW_METRICS_REPLY = opts.slowMetricsReply.toString();
}

if (opts.enableSpanBatching) {
env.ENABLE_SPANBATCHING = 'true';
}
@@ -184,6 +189,12 @@ class AgentStubControls {
}).then(response => response.json());
}

makeAgentHealthy() {
return fetch(`http://127.0.0.1:${this.agentPort}/make-healthy`, {
method: 'PUT'
});
}

async reset() {
// eslint-disable-next-line no-console
console.log(`[AgentStubControls] reset ${this.agentPort}`);
64 changes: 64 additions & 0 deletions packages/collector/test/tracing/misc/tracing_metrics/test.js
@@ -275,6 +275,70 @@ mochaSuiteFn('tracing/tracing metrics', function () {
expect(tracingMetrics).to.have.lengthOf(1);
});
});

describe('when agent does not respond in time for the metrics endpoints', () => {
const { AgentStubControls } = require('../../../apps/agentStubControls');
const customeAgentControls = new AgentStubControls();
let controls;

before(async () => {
await customeAgentControls.startAgent({
slowMetricsReply: true
});

controls = new ProcessControls({
dirname: __dirname,
agentControls: customeAgentControls,
tracingEnabled: true,
env: {
INSTANA_TRACER_METRICS_INTERVAL: 100,
INSTANA_AGENT_REQUEST_TIMEOUT: 1000
}
});

await controls.startAndWaitForAgentConnection();
});

beforeEach(async () => {
await customeAgentControls.clearReceivedTraceData();
});

after(async () => {
await controls.stop();
await customeAgentControls.stopAgent();
});

it('must reconnect to the agent and transfer metrics successfully', async () => {
const response = await controls.sendRequest({
method: 'POST',
path: '/create-spans'
});

expect(response).to.equal('OK');

await delay(5000);

const tracerMetricsBefore = await fetchTracingMetricsForProcess(customeAgentControls, controls.getPid());
// The agent stub does not store the metrics which have been sent. We simulate a total timeout scenario.
expect(tracerMetricsBefore).to.have.lengthOf(0);

const metricsBefore = await customeAgentControls.getAllMetrics(controls.getPid());
expect(metricsBefore).to.have.a.lengthOf(0);

// We forced the agent to return slow replies for metrics. The collector will
// run into constant timeouts and will try to re-announce in a loop. Now we make the agent healthy again.
await customeAgentControls.makeAgentHealthy();

await testUtils.retry(async () => {
const tracerMetrics = await fetchTracingMetricsForProcess(customeAgentControls, controls.getPid());
// Proof that tracer metrics and pid metrics occurred after the agent became healthy again.
expect(tracerMetrics).to.have.lengthOf.at.least(1);

const metrics = await customeAgentControls.getAllMetrics(controls.getPid());
expect(metrics).to.have.a.lengthOf.at.least(1);
});
});
});
});

async function fetchTracingMetricsForProcess(agentControls, pid) {