Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions docs/operations/performances.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,21 @@ All graphs have been generated with the [loadtest present in the monitoring repo

![](../assets/perfs_scd_lock_notoverlapping.png)
*Non-overlapping requests. Notice the reduction of performance on the right, with a single lock.*

## SCD lock diagnostics logs

To help diagnose latency spikes (for example as discussed in [#1311](https://github.com/interuss/dss/issues/1311)), the SCD subscription lock path emits targeted warning logs when a lock query looks expensive.

The warning is emitted when lock query duration is greater than or equal to `4s`.

The log message is `Expensive SCD lock detected` and includes:
* `global_lock`: Whether global lock mode is enabled for this query
* `duration`: Time spent executing the lock query
* `cell_count`: Number of S2 cells in the request
* `explicit_subscription_id_count`: Number of explicitly provided subscription IDs

Failed lock queries also emit warnings with timing and context:
* `SCD global lock query failed`
* `SCD subscription lock query failed`

These diagnostics are intended to keep normal logs low-noise while surfacing lock contention or unexpectedly large lock scopes.
4 changes: 2 additions & 2 deletions pkg/scd/operational_intents_handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ func (a *Server) DeleteOperationalIntentReference(ctx context.Context, req *rest
subscriptionIds = append(subscriptionIds, *old.SubscriptionID)
}

err = r.LockSubscriptionsOnCells(ctx, old.Cells, subscriptionIds)
err = r.LockSubscriptionsOnCells(ctx, old.Cells, subscriptionIds, old.StartTime, old.EndTime)
if err != nil {
return stacktrace.Propagate(err, "Unable to acquire lock")
}
Expand Down Expand Up @@ -853,7 +853,7 @@ func (a *Server) upsertOperationalIntentReference(ctx context.Context, now time.
subscriptionIds = append(subscriptionIds, validParams.subscriptionID)
}

err = r.LockSubscriptionsOnCells(ctx, validParams.cells, subscriptionIds)
err = r.LockSubscriptionsOnCells(ctx, validParams.cells, subscriptionIds, validParams.uExtent.StartTime, validParams.uExtent.EndTime)
if err != nil {
return stacktrace.Propagate(err, "Unable to acquire lock")
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/scd/repos/repos.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ type Subscription interface {
IncrementNotificationIndices(ctx context.Context, subscriptionIds []dssmodels.ID) ([]int, error)

// LockSubscriptionsOnCells locks the subscriptions of interest on specific cells and, optionnaly, specific subscriptions via their IDs
LockSubscriptionsOnCells(ctx context.Context, cells s2.CellUnion, subscriptionIds []dssmodels.ID) error
LockSubscriptionsOnCells(ctx context.Context, cells s2.CellUnion, subscriptionIds []dssmodels.ID, startTime *time.Time, endTime *time.Time) error

// ListExpiredSubscriptions lists all subscriptions older than the threshold.
// Their age is determined by their end time, or by their update time if they do not have an end time.
Expand Down
54 changes: 50 additions & 4 deletions pkg/scd/store/datastore/subscriptions.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@ import (
"strings"
"time"

"github.com/interuss/dss/pkg/logging"
dssmodels "github.com/interuss/dss/pkg/models"
scdmodels "github.com/interuss/dss/pkg/scd/models"
dsssql "github.com/interuss/dss/pkg/sql"
"github.com/interuss/stacktrace"
"go.uber.org/zap"

"github.com/golang/geo/s2"
)
Expand All @@ -20,6 +22,11 @@ var (
subscriptionFieldsWithoutPrefix string
)

const (
// This threshold keep lock diagnostics low-noise in production while still surfacing likely bottlenecks.
lockQuerySlowThreshold = 4 * time.Second
)

// TODO Update database schema and fields below.
func init() {
subscriptionFieldsWithIndices[0] = "id"
Expand Down Expand Up @@ -378,7 +385,8 @@ func (c *repo) IncrementNotificationIndices(ctx context.Context, subscriptionIds
return orderedIndices, nil
}

func (c *repo) LockSubscriptionsOnCells(ctx context.Context, cells s2.CellUnion, subscriptionIds []dssmodels.ID) error {
func (c *repo) LockSubscriptionsOnCells(ctx context.Context, cells s2.CellUnion, subscriptionIds []dssmodels.ID, startTime *time.Time, endTime *time.Time) error {
logger := logging.WithValuesFromContext(ctx, logging.Logger)

if c.globalLock {

Expand All @@ -391,11 +399,26 @@ func (c *repo) LockSubscriptionsOnCells(ctx context.Context, cells s2.CellUnion,
// Notice that this requieres key entries to be present to work.
const query = `SELECT key FROM scd_locks WHERE key = 0 FOR UPDATE`

start := time.Now()
_, err := c.q.Exec(ctx, query)
duration := time.Since(start)
if err != nil {
logger.Warn("SCD global lock query failed",
zap.Duration("duration", duration),
zap.Error(err),
)
return stacktrace.Propagate(err, "Error in query: %s", query)
}

if duration >= lockQuerySlowThreshold {
logger.Warn("Expensive SCD lock detected",
zap.Bool("global_lock", true),
zap.Duration("duration", duration),
zap.Int("cell_count", len(cells)),
zap.Int("explicit_subscription_id_count", len(subscriptionIds)),
)
}

return nil

}
Expand All @@ -406,8 +429,14 @@ func (c *repo) LockSubscriptionsOnCells(ctx context.Context, cells s2.CellUnion,
FROM
scd_subscriptions
WHERE
cells && $1
OR
(
cells && $1
AND
COALESCE(starts_at <= $4, true)
AND
COALESCE(ends_at >= $3, true)
)
OR
id = ANY($2)
FOR UPDATE
`
Expand All @@ -417,11 +446,28 @@ func (c *repo) LockSubscriptionsOnCells(ctx context.Context, cells s2.CellUnion,
ids[i] = id.String()
}

_, err := c.q.Exec(ctx, query, dsssql.CellUnionToCellIds(cells), ids)
start := time.Now()
_, err := c.q.Exec(ctx, query, dsssql.CellUnionToCellIds(cells), ids, startTime, endTime)
duration := time.Since(start)
if err != nil {
logger.Warn("SCD subscription lock query failed",
zap.Duration("duration", duration),
zap.Int("cell_count", len(cells)),
zap.Int("explicit_subscription_id_count", len(subscriptionIds)),
zap.Error(err),
)
return stacktrace.Propagate(err, "Error in query: %s", query)
}

if duration >= lockQuerySlowThreshold {
logger.Warn("Expensive SCD lock detected",
zap.Bool("global_lock", false),
zap.Duration("duration", duration),
zap.Int("cell_count", len(cells)),
zap.Int("explicit_subscription_id_count", len(subscriptionIds)),
)
}

return nil
}

Expand Down
Loading