From b40f7906a5af636a74e6ffd529c6226994656176 Mon Sep 17 00:00:00 2001 From: Scott Anderson Date: Wed, 26 Nov 2025 17:32:04 -0700 Subject: [PATCH 1/3] fix: updates write responses, suggests exponential backoffs, closes influxdata/DAR#557 --- .../v1-compatibility/swaggerV1Compat.yml | 32 +---- api-docs/influxdb3/cloud-dedicated/v2/ref.yml | 43 ++---- .../v1-compatibility/swaggerV1Compat.yml | 32 +---- api-docs/influxdb3/clustered/v2/ref.yml | 44 ++---- .../troubleshoot-distributed.md | 125 +++++++++++++++++- 5 files changed, 148 insertions(+), 128 deletions(-) diff --git a/api-docs/influxdb3/cloud-dedicated/v1-compatibility/swaggerV1Compat.yml b/api-docs/influxdb3/cloud-dedicated/v1-compatibility/swaggerV1Compat.yml index 128021d194..1168d8b375 100644 --- a/api-docs/influxdb3/cloud-dedicated/v1-compatibility/swaggerV1Compat.yml +++ b/api-docs/influxdb3/cloud-dedicated/v1-compatibility/swaggerV1Compat.yml @@ -130,21 +130,9 @@ paths: schema: $ref: '#/components/schemas/LineProtocolLengthError' '429': - description: Token is temporarily over quota. The Retry-After header describes when to try the write again. - headers: - Retry-After: - description: A non-negative decimal integer indicating the seconds to delay after the response is received. - schema: - type: integer - format: int32 + description: Token is temporarily over quota or ingesters are resource constrained. '503': - description: Server is temporarily unavailable to accept writes. The Retry-After header describes when to try the write again. - headers: - Retry-After: - description: A non-negative decimal integer indicating the seconds to delay after the response is received. - schema: - type: integer - format: int32 + description: Server is temporarily unavailable to accept writes due to too many concurrent requests or insufficient healthy ingesters.
default: description: Internal server error content: @@ -293,13 +281,7 @@ paths: type: string format: binary '429': - description: Token is temporarily over quota. The Retry-After header describes when to try the read again. - headers: - Retry-After: - description: A non-negative decimal integer indicating the seconds to delay after the response is received. - schema: - type: integer - format: int32 + description: Token is temporarily over quota or the querier is resource constrained. default: description: Error processing query content: @@ -479,13 +461,7 @@ paths: type: string format: binary '429': - description: Token is temporarily over quota. The Retry-After header describes when to try the read again. - headers: - Retry-After: - description: A non-negative decimal integer indicating the seconds to delay after the response is received. - schema: - type: integer - format: int32 + description: Token is temporarily over quota or queriers are resource constrained. default: description: Error processing query content: diff --git a/api-docs/influxdb3/cloud-dedicated/v2/ref.yml b/api-docs/influxdb3/cloud-dedicated/v2/ref.yml index b638df94f1..f4b3e76fe5 100644 --- a/api-docs/influxdb3/cloud-dedicated/v2/ref.yml +++ b/api-docs/influxdb3/cloud-dedicated/v2/ref.yml @@ -423,15 +423,8 @@ paths: description: | Service unavailable. - - Returns this error if - the server is temporarily unavailable to accept writes. - - Returns a `Retry-After` header that describes when to try the write again. - headers: - Retry-After: - description: Non-negative decimal integer indicating seconds to wait before retrying the request. - schema: - format: int32 - type: integer + - Returns this error if the server is temporarily unavailable to accept writes due to concurrent request limits or insufficient healthy ingesters. 
+ default: $ref: '#/components/responses/GeneralServerError' summary: Write data @@ -562,18 +555,10 @@ paths: type: string '429': description: | - #### InfluxDB Cloud: - - returns this error if a **read** or **write** request exceeds your - plan's [adjustable service quotas](/influxdb3/cloud-dedicated/account-management/limits/#adjustable-service-quotas) - or if a **delete** request exceeds the maximum - [global limit](/influxdb3/cloud-dedicated/account-management/limits/#global-limits) - - returns `Retry-After` header that describes when to try the write again. - headers: - Retry-After: - description: A non-negative decimal integer indicating the seconds to delay after the response is received. - schema: - format: int32 - type: integer + Too many requests. + + - Returns this error if a **read** or **write** request exceeds rate + limits or if queriers or ingesters are resource constrained. default: content: application/json: @@ -719,21 +704,9 @@ paths: The response body contains details about the [rejected points](/influxdb3/cloud-dedicated/write-data/troubleshoot/#troubleshoot-rejected-points). '429': - description: Token is temporarily over quota. The Retry-After header describes when to try the write again. - headers: - Retry-After: - description: A non-negative decimal integer indicating the seconds to delay after the response is received. - schema: - format: int32 - type: integer + description: Token is temporarily over quota or ingesters are resource constrained. '503': - description: Server is temporarily unavailable to accept writes. The Retry-After header describes when to try the write again. - headers: - Retry-After: - description: A non-negative decimal integer indicating the seconds to delay after the response is received. - schema: - format: int32 - type: integer + description: Server is temporarily unavailable to accept writes due to too many concurrent requests or insufficient healthy ingesters. 
default: content: application/json: diff --git a/api-docs/influxdb3/clustered/v1-compatibility/swaggerV1Compat.yml b/api-docs/influxdb3/clustered/v1-compatibility/swaggerV1Compat.yml index 7735c655de..6e289f1cc1 100644 --- a/api-docs/influxdb3/clustered/v1-compatibility/swaggerV1Compat.yml +++ b/api-docs/influxdb3/clustered/v1-compatibility/swaggerV1Compat.yml @@ -130,21 +130,9 @@ paths: schema: $ref: '#/components/schemas/LineProtocolLengthError' '429': - description: Token is temporarily over quota. The Retry-After header describes when to try the write again. - headers: - Retry-After: - description: A non-negative decimal integer indicating the seconds to delay after the response is received. - schema: - type: integer - format: int32 + description: Token is temporarily over quota or ingesters are resource constrained. '503': - description: Server is temporarily unavailable to accept writes. The Retry-After header describes when to try the write again. - headers: - Retry-After: - description: A non-negative decimal integer indicating the seconds to delay after the response is received. - schema: - type: integer - format: int32 + description: Server is temporarily unavailable to accept writes due to too many concurrent requests or insufficient healthy ingesters. default: description: Internal server error content: @@ -274,13 +262,7 @@ paths: type: string format: binary '429': - description: Token is temporarily over quota. The Retry-After header describes when to try the read again. - headers: - Retry-After: - description: A non-negative decimal integer indicating the seconds to delay after the response is received. - schema: - type: integer - format: int32 + description: Token is temporarily over quota or the querier is resource constrained. default: description: Error processing query content: @@ -441,13 +423,7 @@ paths: type: string format: binary '429': - description: Token is temporarily over quota. The Retry-After header describes when to try the read again. 
- headers: - Retry-After: - description: A non-negative decimal integer indicating the seconds to delay after the response is received. - schema: - type: integer - format: int32 + description: Token is temporarily over quota or queriers are resource constrained. default: description: Error processing query content: diff --git a/api-docs/influxdb3/clustered/v2/ref.yml b/api-docs/influxdb3/clustered/v2/ref.yml index 05507ea497..ba1cf47698 100644 --- a/api-docs/influxdb3/clustered/v2/ref.yml +++ b/api-docs/influxdb3/clustered/v2/ref.yml @@ -419,27 +419,15 @@ paths: '429': description: | Too many requests. - headers: - Retry-After: - description: Non-negative decimal integer indicating seconds to wait before retrying the request. - schema: - format: int32 - type: integer + + - Returns this error if ingesters are resource constrained. '500': $ref: '#/components/responses/InternalServerError' '503': description: | Service unavailable. - - Returns this error if - the server is temporarily unavailable to accept writes. - - Returns a `Retry-After` header that describes when to try the write again. - headers: - Retry-After: - description: Non-negative decimal integer indicating seconds to wait before retrying the request. - schema: - format: int32 - type: integer + - Returns this error if the server is temporarily unavailable to accept writes due to concurrent request limits or insufficient healthy ingesters. default: $ref: '#/components/responses/GeneralServerError' summary: Write data @@ -570,13 +558,9 @@ paths: type: string '429': description: | - Token is temporarily over quota. The Retry-After header describes when to try the write again. - headers: - Retry-After: - description: A non-negative decimal integer indicating the seconds to delay after the response is received. - schema: - format: int32 - type: integer + Too many requests. + + - Returns this error if queriers are resource constrained. 
default: content: application/json: @@ -678,21 +662,9 @@ paths: $ref: '#/components/schemas/LineProtocolLengthError' description: Write has been rejected because the payload is too large. Error message returns max size supported. All data in body was rejected and not written. '429': - description: Token is temporarily over quota. The Retry-After header describes when to try the write again. - headers: - Retry-After: - description: A non-negative decimal integer indicating the seconds to delay after the response is received. - schema: - format: int32 - type: integer + description: The service is temporarily unavailable or ingesters are resource constrained. '503': - description: Server is temporarily unavailable to accept writes. The Retry-After header describes when to try the write again. - headers: - Retry-After: - description: A non-negative decimal integer indicating the seconds to delay after the response is received. - schema: - format: int32 - type: integer + description: Server is temporarily unavailable to accept writes due to too many concurrent requests or insufficient healthy ingesters. 
default: content: application/json: diff --git a/content/shared/influxdb3-write-guides/troubleshoot-distributed.md b/content/shared/influxdb3-write-guides/troubleshoot-distributed.md index 802d518fd3..3c1d5a8cad 100644 --- a/content/shared/influxdb3-write-guides/troubleshoot-distributed.md +++ b/content/shared/influxdb3-write-guides/troubleshoot-distributed.md @@ -5,6 +5,7 @@ Learn how to avoid unexpected results and recover from errors when writing to {{ - [Troubleshoot failures](#troubleshoot-failures) - [Troubleshoot rejected points](#troubleshoot-rejected-points) - [Report write issues](#report-write-issues) +{{% show-in "cloud-dedicated,clustered" %}}- [Implement an exponential backoff strategy](#implement-an-exponential-backoff-strategy){{% /show-in %}} ## Handle write responses @@ -39,7 +40,7 @@ The `message` property of the response body may contain additional details about | `404 "Not found"` | A requested **resource type** (for example, "database"), and **resource name** | A requested resource wasn't found | | `422 "Unprocessable Entity"` | `message` contains details about the error | The data isn't allowed (for example, falls outside of the database's retention period). | | `500 "Internal server error"` | Empty | Default status for an error | -| `503 "Service unavailable"` | Empty | The server is temporarily unavailable to accept writes. The `Retry-After` header contains the number of seconds to wait before trying the write again. | +| `503 "Service unavailable"` | Empty | The server is temporarily unavailable or the requested service is resource constrained. [Implement an exponential backoff strategy](#implement-an-exponential-backoff-strategy). 
| {{% /show-in %}} {{% show-in "cloud-serverless" %}} @@ -346,3 +347,125 @@ Include the support package when contacting InfluxData support through your stan - Business context if the issue affects production systems This comprehensive information will help InfluxData engineers identify root causes and provide targeted solutions for your write issues. + +{{% show-in "cloud-dedicated,clustered" %}} +## Implement an exponential backoff strategy + +Use exponential backoff with jitter for retrying requests that return `429` or `503`. +This reduces load spikes and avoids thundering‑herd problems. + +**Recommended parameters**: + +- Base delay: 1s +- Multiplier: 2 (double each retry) +- Max delay: 30s +- Max retries: 5 (increase only with care) +- Jitter: use "full jitter" (random between 0 and computed delay) + +### Incremental backoff examples + +{{< code-tabs-wrapper >}} +{{% code-tabs %}} +[cURL](#) +[Python](#) +[JavaScript](#) +{{% /code-tabs %}} +{{% code-tab-content %}} + + +```sh +base=1 +max_delay=30 +max_retries=5 + +for attempt in $(seq 0 $max_retries); do + resp_code=$(curl -s -o /dev/null -w "%{http_code}" --request POST "https://{{< influxdb/host >}}/write?db=DB" ...) 
+ if [ "$resp_code" -eq 204 ]; then + echo "Write succeeded" + break + fi + + if [ "$resp_code" -ne 429 ] && [ "$resp_code" -ne 503 ]; then + echo "Non-retryable response: $resp_code" + break + fi + + # compute exponential delay and apply full jitter + delay=$(awk -v b=$base -v a=$attempt 'BEGIN{d=b*(2^a); if(d>30) d=30; print d}') + sleep_seconds=$(awk -v d=$delay 'BEGIN{srand(); printf "%.3f", rand()*d}') + sleep $sleep_seconds +done +``` + +{{% /code-tab-content %}} + +{{% code-tab-content %}} + + +```python +import random +import time +import requests + +base = 1.0 +max_delay = 30.0 +max_retries = 5 + +for attempt in range(max_retries + 1): + r = requests.post(url, headers=headers, data=body, timeout=10) + if r.status_code == 204: + break + if r.status_code not in (429, 503): + raise RuntimeError(f"Non-retryable: {r.status_code} {r.text}") + + # honor Retry-After if present + retry_after = r.headers.get("Retry-After") + retry_delay = float(retry_after) if retry_after else base * (2 ** attempt) + retry_delay = min(retry_delay, max_delay) + + sleep = random.random() * retry_delay # full jitter + time.sleep(sleep) +else: + raise RuntimeError("Max retries exceeded") +``` + +{{% /code-tab-content %}} + +{{% code-tab-content %}} + + +```js +const base = 1000; +const maxDelay = 30000; +const maxRetries = 5; + +async function sleep(ms) { return new Promise(r => setTimeout(r, ms)); } + +for (let attempt = 0; attempt <= maxRetries; attempt++) { + const res = await fetch(url, { method: 'POST', body }); + if (res.status === 204) break; + if (![429, 503].includes(res.status)) throw new Error(`Non-retryable ${res.status}`); + + const ra = res.headers.get('Retry-After'); + let delay = ra ? 
Math.max(Number(ra) * 1000, base * 2 ** attempt) : base * 2 ** attempt; + delay = Math.min(delay, maxDelay); + + const sleepMs = Math.random() * delay; // full jitter + await sleep(sleepMs); +} +``` + +{{% /code-tab-content %}} +{{< /code-tabs-wrapper >}} + +### Incremental backoff best practices + +- Only retry on idempotent or safe request semantics your client supports. +- Retry only for `429` (Too Many Requests) and `503` (Service Unavailable). +- Do not retry on client errors like `400`, `401`, `404`, `422`. +- Cap the delay with `max_delay` to avoid excessively long waits. +- Limit total retries to avoid infinite loops and provide meaningful errors. +- Log retry attempts and backoff delays for observability and debugging. +- Combine backoff with bounded concurrency to avoid overwhelming the server. + +{{% /show-in %}} From b9c08cfc19123311e3c35f85edc1138c111d00e8 Mon Sep 17 00:00:00 2001 From: Scott Anderson Date: Thu, 27 Nov 2025 08:11:32 -0700 Subject: [PATCH 2/3] Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- api-docs/influxdb3/clustered/v2/ref.yml | 2 +- .../troubleshoot-distributed.md | 16 ++++++---------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/api-docs/influxdb3/clustered/v2/ref.yml b/api-docs/influxdb3/clustered/v2/ref.yml index ba1cf47698..a93a582f1f 100644 --- a/api-docs/influxdb3/clustered/v2/ref.yml +++ b/api-docs/influxdb3/clustered/v2/ref.yml @@ -662,7 +662,7 @@ paths: $ref: '#/components/schemas/LineProtocolLengthError' description: Write has been rejected because the payload is too large. Error message returns max size supported. All data in body was rejected and not written. '429': - description: The service is temporarily unavailable or ingesters are resource constrained. + description: Too many requests. The service may be temporarily unavailable or ingesters are resource constrained. 
'503': description: Server is temporarily unavailable to accept writes due to too many concurrent requests or insufficient healthy ingesters. default: diff --git a/content/shared/influxdb3-write-guides/troubleshoot-distributed.md b/content/shared/influxdb3-write-guides/troubleshoot-distributed.md index 3c1d5a8cad..43293c37db 100644 --- a/content/shared/influxdb3-write-guides/troubleshoot-distributed.md +++ b/content/shared/influxdb3-write-guides/troubleshoot-distributed.md @@ -352,7 +352,7 @@ This comprehensive information will help InfluxData engineers identify root caus ## Implement an exponential backoff strategy Use exponential backoff with jitter for retrying requests that return `429` or `503`. -This reduces load spikes and avoids thundering‑herd problems. +This reduces load spikes and avoids thundering-herd problems. **Recommended parameters**: @@ -362,7 +362,7 @@ This reduces load spikes and avoids thundering‑herd problems. - Max retries: 5 (increase only with care) - Jitter: use "full jitter" (random between 0 and computed delay) -### Incremental backoff examples +### Exponential backoff examples {{< code-tabs-wrapper >}} {{% code-tabs %}} @@ -418,11 +418,8 @@ for attempt in range(max_retries + 1): if r.status_code not in (429, 503): raise RuntimeError(f"Non-retryable: {r.status_code} {r.text}") - # honor Retry-After if present - retry_after = r.headers.get("Retry-After") - retry_delay = float(retry_after) if retry_after else base * (2 ** attempt) - retry_delay = min(retry_delay, max_delay) - + # exponential backoff with full jitter + retry_delay = min(base * (2 ** attempt), max_delay) sleep = random.random() * retry_delay # full jitter time.sleep(sleep) else: @@ -446,8 +443,7 @@ for (let attempt = 0; attempt <= maxRetries; attempt++) { if (res.status === 204) break; if (![429, 503].includes(res.status)) throw new Error(`Non-retryable ${res.status}`); - const ra = res.headers.get('Retry-After'); - let delay = ra ? 
Math.max(Number(ra) * 1000, base * 2 ** attempt) : base * 2 ** attempt; + let delay = base * 2 ** attempt; delay = Math.min(delay, maxDelay); const sleepMs = Math.random() * delay; // full jitter @@ -458,7 +454,7 @@ for (let attempt = 0; attempt <= maxRetries; attempt++) { {{% /code-tab-content %}} {{< /code-tabs-wrapper >}} -### Incremental backoff best practices +### Exponential backoff best practices - Only retry on idempotent or safe request semantics your client supports. - Retry only for `429` (Too Many Requests) and `503` (Service Unavailable). From 46eb577c6d4cf71cdb76ac48ee6a91480a0f85c9 Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Fri, 28 Nov 2025 17:40:02 -0600 Subject: [PATCH 3/3] Fix cURL example to use $max_delay variable instead of hardcoded value (#6575) * Initial plan * Fix: use $max_delay variable instead of hardcoded 30 in cURL example Co-authored-by: jstirnaman <212227+jstirnaman@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: jstirnaman <212227+jstirnaman@users.noreply.github.com> --- .../shared/influxdb3-write-guides/troubleshoot-distributed.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/content/shared/influxdb3-write-guides/troubleshoot-distributed.md b/content/shared/influxdb3-write-guides/troubleshoot-distributed.md index 43293c37db..29a2e132c0 100644 --- a/content/shared/influxdb3-write-guides/troubleshoot-distributed.md +++ b/content/shared/influxdb3-write-guides/troubleshoot-distributed.md @@ -391,7 +391,7 @@ for attempt in $(seq 0 $max_retries); do fi # compute exponential delay and apply full jitter - delay=$(awk -v b=$base -v a=$attempt 'BEGIN{d=b*(2^a); if(d>30) d=30; print d}') + delay=$(awk -v b=$base -v a=$attempt -v m=$max_delay 'BEGIN{d=b*(2^a); if(d>m) d=m; print d}') sleep_seconds=$(awk -v d=$delay 'BEGIN{srand(); printf "%.3f", rand()*d}') sleep $sleep_seconds done