diff --git a/collector/tidb/examples/compose/README.md b/collector/tidb/examples/compose/README.md
new file mode 100644
index 0000000..0020ba1
--- /dev/null
+++ b/collector/tidb/examples/compose/README.md
@@ -0,0 +1,45 @@
+# Monitor TiDB with the OpenTelemetry Collector for Cloud Observability
+
+## Overview
+
+TiDB is a distributed, scalable, and highly available NewSQL database. Monitoring TiDB is crucial to ensuring its
+reliability, performance, and scalability. Using the OpenTelemetry Collector, you can efficiently collect and
+transmit TiDB metrics to Cloud Observability, enabling comprehensive analysis and visualization of your TiDB deployment.
+This README provides step-by-step instructions for integrating TiDB metrics with Cloud Observability using the
+OpenTelemetry Collector.
+
+## Prerequisites
+
+* Docker
+* Docker Compose
+* A Cloud Observability account
+* A Cloud Observability [access token][ls-docs-access-token]
+
+## How to set it up
+
+1. **Export your Cloud Observability access token**:
+   ```bash
+   export LS_ACCESS_TOKEN=<your-access-token>
+   ```
+2. **Run the Docker Compose example to spin up TiDB and the OpenTelemetry Collector**:
+   ```bash
+   docker-compose up -d
+   ```
+3. **Access [tidb-vision](https://github.com/pingcap/tidb-vision)** at http://localhost:8010.
+4. **Access the Spark Web UI at http://localhost:8080** and connect to [TiSpark](https://github.com/pingcap/tispark) at
+   spark://127.0.0.1:7077.
+5. **Monitor TiDB metrics in Cloud Observability**: After setup, TiDB metrics will start populating in your Cloud
+   Observability dashboards.
+6. **Shut down the monitoring setup**:
+   ```bash
+   docker-compose down -v
+   ```
+
+## Configuring TiDB for Advanced Monitoring
+
+This guide assumes a standard TiDB setup. TiDB offers various configuration options that can affect the metrics it
+exposes. For detailed configuration options and best practices, always refer to [the official TiDB documentation][tidb-docs].
+
+[ls-docs-access-token]: https://docs.lightstep.com/docs/create-and-manage-access-tokens
+
+[tidb-docs]: https://docs.pingcap.com/tidb/stable/grafana-tidb-dashboard#tidb-monitoring-metrics
diff --git a/collector/tidb/examples/compose/collector.yaml b/collector/tidb/examples/compose/collector.yaml
new file mode 100644
index 0000000..6932e09
--- /dev/null
+++ b/collector/tidb/examples/compose/collector.yaml
@@ -0,0 +1,27 @@
+receivers:
+  prometheus:
+    config:
+      scrape_configs:
+        - job_name: 'tidb-cluster'
+          scrape_interval: 5s
+          honor_labels: true
+          static_configs:
+            - targets: [ 'pushgateway:9091' ]
+
+processors:
+  batch:
+
+exporters:
+  logging:
+    loglevel: debug
+  otlp:
+    endpoint: ingest.lightstep.com:443
+    headers:
+      lightstep-access-token: ${LS_ACCESS_TOKEN}
+
+service:
+  pipelines:
+    metrics:
+      receivers: [ prometheus ]
+      processors: [ batch ]
+      exporters: [ logging, otlp ]
diff --git a/collector/tidb/examples/compose/config/pd.toml b/collector/tidb/examples/compose/config/pd.toml
new file mode 100644
index 0000000..b1562a5
--- /dev/null
+++ b/collector/tidb/examples/compose/config/pd.toml
@@ -0,0 +1,86 @@
+# PD Configuration.
+
+name = "pd"
+data-dir = "default.pd"
+
+client-urls = "http://127.0.0.1:2379"
+# if not set, use ${client-urls}
+advertise-client-urls = ""
+
+peer-urls = "http://127.0.0.1:2380"
+# if not set, use ${peer-urls}
+advertise-peer-urls = ""
+
+initial-cluster = "pd=http://127.0.0.1:2380"
+initial-cluster-state = "new"
+
+lease = 3
+tso-save-interval = "3s"
+
+[security]
+# Path of file that contains list of trusted SSL CAs.
+# if set, following four settings shouldn't be empty
+cacert-path = ""
+# Path of file that contains X509 certificate in PEM format.
+cert-path = ""
+# Path of file that contains X509 key in PEM format.
+key-path = ""
+
+[log]
+level = "error"
+
+# log format, one of json, text, console
+#format = "text"
+
+# disable automatic timestamps in output
+#disable-timestamp = false
+
+# file logging
+[log.file]
+#filename = ""
+# max log file size in MB
+#max-size = 300
+# max log file keep days
+#max-days = 28
+# maximum number of old log files to retain
+#max-backups = 7
+# rotate log by day
+#log-rotate = true
+
+[metric]
+# prometheus client push interval, set "0s" to disable prometheus.
+interval = "15s"
+# prometheus pushgateway address, leaving it empty disables prometheus push.
+address = "pushgateway:9091"
+
+[schedule]
+max-merge-region-size = 0
+split-merge-interval = "1h"
+max-snapshot-count = 3
+max-pending-peer-count = 16
+max-store-down-time = "30m"
+leader-schedule-limit = 4
+region-schedule-limit = 4
+replica-schedule-limit = 8
+merge-schedule-limit = 8
+tolerant-size-ratio = 5.0
+
+# customized schedulers, the format is as below
+# if empty, it will use balance-leader, balance-region, hot-region by default
+# [[schedule.schedulers]]
+# type = "evict-leader"
+# args = ["1"]
+
+[replication]
+# The number of replicas for each region.
+max-replicas = 3
+# The label keys specify the location of a store.
+# The placement priorities are implied by the order of label keys.
+# For example, ["zone", "rack"] means that we should place replicas to
+# different zones first, then to different racks if we don't have enough zones.
+location-labels = []
+
+[label-property]
+# Do not assign region leaders to stores that have these tags.
+# [[label-property.reject-leader]]
+# key = "zone"
+# value = "cn1"
diff --git a/collector/tidb/examples/compose/config/spark-defaults.conf b/collector/tidb/examples/compose/config/spark-defaults.conf
new file mode 100644
index 0000000..76467a5
--- /dev/null
+++ b/collector/tidb/examples/compose/config/spark-defaults.conf
@@ -0,0 +1,2 @@
+spark.tispark.pd.addresses pd0:2379
+spark.sql.extensions org.apache.spark.sql.TiExtensions
diff --git a/collector/tidb/examples/compose/config/tidb.toml b/collector/tidb/examples/compose/config/tidb.toml
new file mode 100644
index 0000000..9f881b4
--- /dev/null
+++ b/collector/tidb/examples/compose/config/tidb.toml
@@ -0,0 +1,239 @@
+# TiDB Configuration.
+
+# TiDB server host.
+host = "0.0.0.0"
+
+# TiDB server port.
+port = 4000
+
+# Registered store name, [tikv, mocktikv]
+store = "mocktikv"
+
+# TiDB storage path.
+path = "/tmp/tidb"
+
+# The socket file to use for connection.
+socket = ""
+
+# Run ddl worker on this tidb-server.
+run-ddl = true
+
+# Schema lease duration. Very dangerous to change; only do so if you know what you are doing.
+lease = "0"
+
+# When creating a table, split a separate region for it. It is recommended to
+# turn off this option if there will be a large number of tables created.
+split-table = true
+
+# The limit of concurrent executed sessions.
+token-limit = 1000
+
+# Only print a log when out of memory quota.
+# Valid options: ["log", "cancel"]
+oom-action = "log"
+
+# Set the memory quota for a query in bytes. Default: 32GB
+mem-quota-query = 34359738368
+
+# Enable coprocessor streaming.
+enable-streaming = false
+
+# Set system variable 'lower_case_table_names'
+lower-case-table-names = 2
+
+[log]
+# Log level: debug, info, warn, error, fatal.
+level = "error"
+
+# Log format, one of json, text, console.
+format = "text" + +# Disable automatic timestamp in output +disable-timestamp = false + +# Stores slow query log into separated files. +slow-query-file = "" + +# Queries with execution time greater than this value will be logged. (Milliseconds) +slow-threshold = 300 + +# Queries with internal result greater than this value will be logged. +expensive-threshold = 10000 + +# Maximum query length recorded in log. +query-log-max-len = 2048 + +# File logging. +[log.file] +# Log file name. +filename = "" + +# Max log file size in MB (upper limit to 4096MB). +max-size = 300 + +# Max log file keep days. No clean up by default. +max-days = 0 + +# Maximum number of old log files to retain. No clean up by default. +max-backups = 0 + +# Rotate log by day +log-rotate = true + +[security] +# Path of file that contains list of trusted SSL CAs for connection with mysql client. +ssl-ca = "" + +# Path of file that contains X509 certificate in PEM format for connection with mysql client. +ssl-cert = "" + +# Path of file that contains X509 key in PEM format for connection with mysql client. +ssl-key = "" + +# Path of file that contains list of trusted SSL CAs for connection with cluster components. +cluster-ssl-ca = "" + +# Path of file that contains X509 certificate in PEM format for connection with cluster components. +cluster-ssl-cert = "" + +# Path of file that contains X509 key in PEM format for connection with cluster components. +cluster-ssl-key = "" + +[status] +# If enable status report HTTP service. +report-status = true + +# TiDB status port. +status-port = 10080 + +# Prometheus pushgateway address, leaves it empty will disable prometheus push. +metrics-addr = "pushgateway:9091" + +# Prometheus client push interval in second, set \"0\" to disable prometheus push. +metrics-interval = 15 + +[performance] +# Max CPUs to use, 0 use number of CPUs in the machine. +max-procs = 0 +# StmtCountLimit limits the max count of statement inside a transaction. +stmt-count-limit = 5000 + +# Set keep alive option for tcp connection. +tcp-keep-alive = true + +# The maximum number of retries when commit a transaction. +retry-limit = 10 + +# Whether support cartesian product. +cross-join = true + +# Stats lease duration, which influences the time of analyze and stats load. +stats-lease = "3s" + +# Run auto analyze worker on this tidb-server. +run-auto-analyze = true + +# Probability to use the query feedback to update stats, 0 or 1 for always false/true. +feedback-probability = 0.0 + +# The max number of query feedback that cache in memory. +query-feedback-limit = 1024 + +# Pseudo stats will be used if the ratio between the modify count and +# row count in statistics of a table is greater than it. +pseudo-estimate-ratio = 0.7 + +[proxy-protocol] +# PROXY protocol acceptable client networks. +# Empty string means disable PROXY protocol, * means all networks. +networks = "" + +# PROXY protocol header read timeout, unit is second +header-timeout = 5 + +[plan-cache] +enabled = false +capacity = 2560 +shards = 256 + +[prepared-plan-cache] +enabled = false +capacity = 100 + +[opentracing] +# Enable opentracing. +enable = false + +# Whether to enable the rpc metrics. +rpc-metrics = false + +[opentracing.sampler] +# Type specifies the type of the sampler: const, probabilistic, rateLimiting, or remote +type = "const" + +# Param is a value passed to the sampler. 
+# Valid values for Param field are: +# - for "const" sampler, 0 or 1 for always false/true respectively +# - for "probabilistic" sampler, a probability between 0 and 1 +# - for "rateLimiting" sampler, the number of spans per second +# - for "remote" sampler, param is the same as for "probabilistic" +# and indicates the initial sampling rate before the actual one +# is received from the mothership +param = 1.0 + +# SamplingServerURL is the address of jaeger-agent's HTTP sampling server +sampling-server-url = "" + +# MaxOperations is the maximum number of operations that the sampler +# will keep track of. If an operation is not tracked, a default probabilistic +# sampler will be used rather than the per operation specific sampler. +max-operations = 0 + +# SamplingRefreshInterval controls how often the remotely controlled sampler will poll +# jaeger-agent for the appropriate sampling strategy. +sampling-refresh-interval = 0 + +[opentracing.reporter] +# QueueSize controls how many spans the reporter can keep in memory before it starts dropping +# new spans. The queue is continuously drained by a background go-routine, as fast as spans +# can be sent out of process. +queue-size = 0 + +# BufferFlushInterval controls how often the buffer is force-flushed, even if it's not full. +# It is generally not useful, as it only matters for very low traffic services. +buffer-flush-interval = 0 + +# LogSpans, when true, enables LoggingReporter that runs in parallel with the main reporter +# and logs all submitted spans. Main Configuration.Logger must be initialized in the code +# for this option to have any effect. +log-spans = false + +# LocalAgentHostPort instructs reporter to send spans to jaeger-agent at this address +local-agent-host-port = "" + +[tikv-client] +# Max gRPC connections that will be established with each tikv-server. +grpc-connection-count = 16 + +# After a duration of this time in seconds if the client doesn't see any activity it pings +# the server to see if the transport is still alive. +grpc-keepalive-time = 10 + +# After having pinged for keepalive check, the client waits for a duration of Timeout in seconds +# and if no activity is seen even after that the connection is closed. +grpc-keepalive-timeout = 3 + +# max time for commit command, must be twice bigger than raft election timeout. +commit-timeout = "41s" + +[binlog] + +# Socket file to write binlog. +binlog-socket = "" + +# WriteTimeout specifies how long it will wait for writing binlog to pump. +write-timeout = "15s" + +# If IgnoreError is true, when writting binlog meets error, TiDB would stop writting binlog, +# but still provide service. +ignore-error = false diff --git a/collector/tidb/examples/compose/config/tikv.toml b/collector/tidb/examples/compose/config/tikv.toml new file mode 100644 index 0000000..b2e59c3 --- /dev/null +++ b/collector/tidb/examples/compose/config/tikv.toml @@ -0,0 +1,499 @@ +# TiKV config template +# Human-readable big numbers: +# File size(based on byte): KB, MB, GB, TB, PB +# e.g.: 1_048_576 = "1MB" +# Time(based on ms): ms, s, m, h +# e.g.: 78_000 = "1.3m" + +# log level: trace, debug, info, warn, error, off. +[log] +level = "error" +# file to store log, write to stderr if it's empty. 
+[log.file] +# filename = "" + +[readpool.storage] +# size of thread pool for high-priority operations +# high-concurrency = 4 +# size of thread pool for normal-priority operations +# normal-concurrency = 4 +# size of thread pool for low-priority operations +# low-concurrency = 4 +# max running high-priority operations, reject if exceed +# max-tasks-high = 8000 +# max running normal-priority operations, reject if exceed +# max-tasks-normal = 8000 +# max running low-priority operations, reject if exceed +# max-tasks-low = 8000 +# size of stack size for each thread pool +# stack-size = "10MB" + +[readpool.coprocessor] +# Notice: if CPU_NUM > 8, default thread pool size for coprocessors +# will be set to CPU_NUM * 0.8. + +# high-concurrency = 8 +# normal-concurrency = 8 +# low-concurrency = 8 +# max-tasks-high = 16000 +# max-tasks-normal = 16000 +# max-tasks-low = 16000 +# stack-size = "10MB" + +[server] +# set listening address. +# addr = "127.0.0.1:20160" +# set advertise listening address for client communication, if not set, use addr instead. +# advertise-addr = "" +# notify capacity, 40960 is suitable for about 7000 regions. +# notify-capacity = 40960 +# maximum number of messages can be processed in one tick. +# messages-per-tick = 4096 + +# compression type for grpc channel, available values are no, deflate and gzip. +# grpc-compression-type = "no" +# size of thread pool for grpc server. +# grpc-concurrency = 4 +# The number of max concurrent streams/requests on a client connection. +# grpc-concurrent-stream = 1024 +# The number of connections with each tikv server to send raft messages. +# grpc-raft-conn-num = 10 +# Amount to read ahead on individual grpc streams. +# grpc-stream-initial-window-size = "2MB" + +# How many snapshots can be sent concurrently. +# concurrent-send-snap-limit = 32 +# How many snapshots can be recv concurrently. +# concurrent-recv-snap-limit = 32 + +# max count of tasks being handled, new tasks will be rejected. +# end-point-max-tasks = 2000 + +# max recursion level allowed when decoding dag expression +# end-point-recursion-limit = 1000 + +# max time to handle coprocessor request before timeout +# end-point-request-max-handle-duration = "60s" + +# the max bytes that snapshot can be written to disk in one second, +# should be set based on your disk performance +# snap-max-write-bytes-per-sec = "100MB" + +# set attributes about this server, e.g. { zone = "us-west-1", disk = "ssd" }. +# labels = {} + +[storage] +# set the path to rocksdb directory. +# data-dir = "/tmp/tikv/store" + +# notify capacity of scheduler's channel +# scheduler-notify-capacity = 10240 + +# maximum number of messages can be processed in one tick +# scheduler-messages-per-tick = 1024 + +# the number of slots in scheduler latches, concurrency control for write. +# scheduler-concurrency = 2048000 + +# scheduler's worker pool size, should increase it in heavy write cases, +# also should less than total cpu cores. +# scheduler-worker-pool-size = 4 + +# When the pending write bytes exceeds this threshold, +# the "scheduler too busy" error is displayed. +# scheduler-pending-write-threshold = "100MB" + +[pd] +# pd endpoints +# endpoints = [] + +[metric] +# the Prometheus client push interval. Setting the value to 0s stops Prometheus client from pushing. +# interval = "15s" +# the Prometheus pushgateway address. Leaving it empty stops Prometheus client from pushing. +address = "pushgateway:9091" +# the Prometheus client push job name. Note: A node id will automatically append, e.g., "tikv_1". 
+# job = "tikv" + +[raftstore] +# true (default value) for high reliability, this can prevent data loss when power failure. +# sync-log = true + +# set the path to raftdb directory, default value is data-dir/raft +# raftdb-path = "" + +# set store capacity, if no set, use disk capacity. +# capacity = 0 + +# notify capacity, 40960 is suitable for about 7000 regions. +# notify-capacity = 40960 + +# maximum number of messages can be processed in one tick. +# messages-per-tick = 4096 + +# Region heartbeat tick interval for reporting to pd. +# pd-heartbeat-tick-interval = "60s" +# Store heartbeat tick interval for reporting to pd. +# pd-store-heartbeat-tick-interval = "10s" + +# When region size changes exceeds region-split-check-diff, we should check +# whether the region should be split or not. +# region-split-check-diff = "6MB" + +# Interval to check region whether need to be split or not. +# split-region-check-tick-interval = "10s" + +# When raft entry exceed the max size, reject to propose the entry. +# raft-entry-max-size = "8MB" + +# Interval to gc unnecessary raft log. +# raft-log-gc-tick-interval = "10s" +# A threshold to gc stale raft log, must >= 1. +# raft-log-gc-threshold = 50 +# When entry count exceed this value, gc will be forced trigger. +# raft-log-gc-count-limit = 72000 +# When the approximate size of raft log entries exceed this value, gc will be forced trigger. +# It's recommanded to set it to 3/4 of region-split-size. +# raft-log-gc-size-limit = "72MB" + +# When a peer hasn't been active for max-peer-down-duration, +# we will consider this peer to be down and report it to pd. +# max-peer-down-duration = "5m" + +# Interval to check whether start manual compaction for a region, +# region-compact-check-interval = "5m" +# Number of regions for each time to check. +# region-compact-check-step = 100 +# The minimum number of delete tombstones to trigger manual compaction. +# region-compact-min-tombstones = 10000 +# Interval to check whether should start a manual compaction for lock column family, +# if written bytes reach lock-cf-compact-threshold for lock column family, will fire +# a manual compaction for lock column family. +# lock-cf-compact-interval = "10m" +# lock-cf-compact-bytes-threshold = "256MB" + +# Interval (s) to check region whether the data are consistent. +# consistency-check-interval = 0 + +# Use delete range to drop a large number of continuous keys. +# use-delete-range = false + +# delay time before deleting a stale peer +# clean-stale-peer-delay = "10m" + +# Interval to cleanup import sst files. +# cleanup-import-sst-interval = "10m" + +[coprocessor] +# When it is true, it will try to split a region with table prefix if +# that region crosses tables. It is recommended to turn off this option +# if there will be a large number of tables created. +# split-region-on-table = true +# When the region's size exceeds region-max-size, we will split the region +# into two which the left region's size will be region-split-size or a little +# bit smaller. +# region-max-size = "144MB" +# region-split-size = "96MB" + +[rocksdb] +# Maximum number of concurrent background jobs (compactions and flushes) +# max-background-jobs = 8 + +# This value represents the maximum number of threads that will concurrently perform a +# compaction job by breaking it into multiple, smaller ones that are run simultaneously. +# Default: 1 (i.e. no subcompactions) +# max-sub-compactions = 1 + +# Number of open files that can be used by the DB. 
You may need to +# increase this if your database has a large working set. Value -1 means +# files opened are always kept open. You can estimate number of files based +# on target_file_size_base and target_file_size_multiplier for level-based +# compaction. +# If max-open-files = -1, RocksDB will prefetch index and filter blocks into +# block cache at startup, so if your database has a large working set, it will +# take several minutes to open the db. +max-open-files = 1024 + +# Max size of rocksdb's MANIFEST file. +# For detailed explanation please refer to https://github.com/facebook/rocksdb/wiki/MANIFEST +# max-manifest-file-size = "20MB" + +# If true, the database will be created if it is missing. +# create-if-missing = true + +# rocksdb wal recovery mode +# 0 : TolerateCorruptedTailRecords, tolerate incomplete record in trailing data on all logs; +# 1 : AbsoluteConsistency, We don't expect to find any corruption in the WAL; +# 2 : PointInTimeRecovery, Recover to point-in-time consistency; +# 3 : SkipAnyCorruptedRecords, Recovery after a disaster; +# wal-recovery-mode = 2 + +# rocksdb write-ahead logs dir path +# This specifies the absolute dir path for write-ahead logs (WAL). +# If it is empty, the log files will be in the same dir as data. +# When you set the path to rocksdb directory in memory like in /dev/shm, you may want to set +# wal-dir to a directory on a persistent storage. +# See https://github.com/facebook/rocksdb/wiki/How-to-persist-in-memory-RocksDB-database +# wal-dir = "/tmp/tikv/store" + +# The following two fields affect how archived write-ahead logs will be deleted. +# 1. If both set to 0, logs will be deleted asap and will not get into the archive. +# 2. If wal-ttl-seconds is 0 and wal-size-limit is not 0, +# WAL files will be checked every 10 min and if total size is greater +# then wal-size-limit, they will be deleted starting with the +# earliest until size_limit is met. All empty files will be deleted. +# 3. If wal-ttl-seconds is not 0 and wal-size-limit is 0, then +# WAL files will be checked every wal-ttl-seconds / 2 and those that +# are older than wal-ttl-seconds will be deleted. +# 4. If both are not 0, WAL files will be checked every 10 min and both +# checks will be performed with ttl being first. +# When you set the path to rocksdb directory in memory like in /dev/shm, you may want to set +# wal-ttl-seconds to a value greater than 0 (like 86400) and backup your db on a regular basis. +# See https://github.com/facebook/rocksdb/wiki/How-to-persist-in-memory-RocksDB-database +# wal-ttl-seconds = 0 +# wal-size-limit = 0 + +# rocksdb max total wal size +# max-total-wal-size = "4GB" + +# Rocksdb Statistics provides cumulative stats over time. +# Turn statistics on will introduce about 5%-10% overhead for RocksDB, +# but it is worthy to know the internal status of RocksDB. +# enable-statistics = true + +# Dump statistics periodically in information logs. +# Same as rocksdb's default value (10 min). +# stats-dump-period = "10m" + +# Due to Rocksdb FAQ: https://github.com/facebook/rocksdb/wiki/RocksDB-FAQ, +# If you want to use rocksdb on multi disks or spinning disks, you should set value at +# least 2MB; +# compaction-readahead-size = 0 + +# This is the maximum buffer size that is used by WritableFileWrite +# writable-file-max-buffer-size = "1MB" + +# Use O_DIRECT for both reads and writes in background flush and compactions +# use-direct-io-for-flush-and-compaction = false + +# Limit the disk IO of compaction and flush. 
Compaction and flush can cause +# terrible spikes if they exceed a certain threshold. Consider setting this to +# 50% ~ 80% of the disk throughput for a more stable result. However, in heavy +# write workload, limiting compaction and flush speed can cause write stalls too. +# rate-bytes-per-sec = 0 + +# Enable or disable the pipelined write +# enable-pipelined-write = true + +# Allows OS to incrementally sync files to disk while they are being +# written, asynchronously, in the background. +# bytes-per-sync = "0MB" + +# Allows OS to incrementally sync WAL to disk while it is being written. +# wal-bytes-per-sync = "0KB" + +# Specify the maximal size of the Rocksdb info log file. If the log file +# is larger than `max_log_file_size`, a new info log file will be created. +# If max_log_file_size == 0, all logs will be written to one log file. +# Default: 1GB +# info-log-max-size = "1GB" + +# Time for the Rocksdb info log file to roll (in seconds). +# If specified with non-zero value, log file will be rolled +# if it has been active longer than `log_file_time_to_roll`. +# Default: 0 (disabled) +# info-log-roll-time = "0" + +# Maximal Rocksdb info log files to be kept. +# Default: 10 +# info-log-keep-log-file-num = 10 + +# This specifies the Rocksdb info LOG dir. +# If it is empty, the log files will be in the same dir as data. +# If it is non empty, the log files will be in the specified dir, +# and the db data dir's absolute path will be used as the log file +# name's prefix. +# Default: empty +# info-log-dir = "" + +# Column Family default used to store actual data of the database. +[rocksdb.defaultcf] +# compression method (if any) is used to compress a block. +# no: kNoCompression +# snappy: kSnappyCompression +# zlib: kZlibCompression +# bzip2: kBZip2Compression +# lz4: kLZ4Compression +# lz4hc: kLZ4HCCompression +# zstd: kZSTD + +# per level compression +# compression-per-level = ["no", "no", "lz4", "lz4", "lz4", "zstd", "zstd"] + +# Approximate size of user data packed per block. Note that the +# block size specified here corresponds to uncompressed data. +# block-size = "64KB" + +# If you're doing point lookups you definitely want to turn bloom filters on, We use +# bloom filters to avoid unnecessary disk reads. Default bits_per_key is 10, which +# yields ~1% false positive rate. Larger bits_per_key values will reduce false positive +# rate, but increase memory usage and space amplification. +# bloom-filter-bits-per-key = 10 + +# false means one sst file one bloom filter, true means evry block has a corresponding bloom filter +# block-based-bloom-filter = false + +# level0-file-num-compaction-trigger = 4 + +# Soft limit on number of level-0 files. We start slowing down writes at this point. +# level0-slowdown-writes-trigger = 20 + +# Maximum number of level-0 files. We stop writes at this point. +# level0-stop-writes-trigger = 36 + +# Amount of data to build up in memory (backed by an unsorted log +# on disk) before converting to a sorted on-disk file. +# write-buffer-size = "128MB" + +# The maximum number of write buffers that are built up in memory. +# max-write-buffer-number = 5 + +# The minimum number of write buffers that will be merged together +# before writing to storage. +# min-write-buffer-number-to-merge = 1 + +# Control maximum total data size for base level (level 1). +# max-bytes-for-level-base = "512MB" + +# Target file size for compaction. 
+# target-file-size-base = "8MB" + +# Max bytes for compaction.max_compaction_bytes +# max-compaction-bytes = "2GB" + +# There are four different algorithms to pick files to compact. +# 0 : ByCompensatedSize +# 1 : OldestLargestSeqFirst +# 2 : OldestSmallestSeqFirst +# 3 : MinOverlappingRatio +# compaction-pri = 3 + +# block-cache used to cache uncompressed blocks, big block-cache can speed up read. +# in normal cases should tune to 30%-50% system's total memory. +# block-cache-size = "1GB" + +# Indicating if we'd put index/filter blocks to the block cache. +# If not specified, each "table reader" object will pre-load index/filter block +# during table initialization. +# cache-index-and-filter-blocks = true + +# Pin level0 filter and index blocks in cache. +# pin-l0-filter-and-index-blocks = true + +# Enable read amplication statistics. +# value => memory usage (percentage of loaded blocks memory) +# 1 => 12.50 % +# 2 => 06.25 % +# 4 => 03.12 % +# 8 => 01.56 % +# 16 => 00.78 % +# read-amp-bytes-per-bit = 0 + +# Pick target size of each level dynamically. +# dynamic-level-bytes = true + +# Options for Column Family write +# Column Family write used to store commit informations in MVCC model +[rocksdb.writecf] +# compression-per-level = ["no", "no", "lz4", "lz4", "lz4", "zstd", "zstd"] +# block-size = "64KB" +# write-buffer-size = "128MB" +# max-write-buffer-number = 5 +# min-write-buffer-number-to-merge = 1 +# max-bytes-for-level-base = "512MB" +# target-file-size-base = "8MB" + +# in normal cases should tune to 10%-30% system's total memory. +# block-cache-size = "256MB" +# level0-file-num-compaction-trigger = 4 +# level0-slowdown-writes-trigger = 20 +# level0-stop-writes-trigger = 36 +# cache-index-and-filter-blocks = true +# pin-l0-filter-and-index-blocks = true +# compaction-pri = 3 +# read-amp-bytes-per-bit = 0 +# dynamic-level-bytes = true + +[rocksdb.lockcf] +# compression-per-level = ["no", "no", "no", "no", "no", "no", "no"] +# block-size = "16KB" +# write-buffer-size = "128MB" +# max-write-buffer-number = 5 +# min-write-buffer-number-to-merge = 1 +# max-bytes-for-level-base = "128MB" +# target-file-size-base = "8MB" +# block-cache-size = "256MB" +# level0-file-num-compaction-trigger = 1 +# level0-slowdown-writes-trigger = 20 +# level0-stop-writes-trigger = 36 +# cache-index-and-filter-blocks = true +# pin-l0-filter-and-index-blocks = true +# compaction-pri = 0 +# read-amp-bytes-per-bit = 0 +# dynamic-level-bytes = true + +[raftdb] +# max-sub-compactions = 1 +max-open-files = 1024 +# max-manifest-file-size = "20MB" +# create-if-missing = true + +# enable-statistics = true +# stats-dump-period = "10m" + +# compaction-readahead-size = 0 +# writable-file-max-buffer-size = "1MB" +# use-direct-io-for-flush-and-compaction = false +# enable-pipelined-write = true +# allow-concurrent-memtable-write = false +# bytes-per-sync = "0MB" +# wal-bytes-per-sync = "0KB" + +# info-log-max-size = "1GB" +# info-log-roll-time = "0" +# info-log-keep-log-file-num = 10 +# info-log-dir = "" + +[raftdb.defaultcf] +# compression-per-level = ["no", "no", "lz4", "lz4", "lz4", "zstd", "zstd"] +# block-size = "64KB" +# write-buffer-size = "128MB" +# max-write-buffer-number = 5 +# min-write-buffer-number-to-merge = 1 +# max-bytes-for-level-base = "512MB" +# target-file-size-base = "8MB" + +# should tune to 256MB~2GB. 
+# block-cache-size = "256MB" +# level0-file-num-compaction-trigger = 4 +# level0-slowdown-writes-trigger = 20 +# level0-stop-writes-trigger = 36 +# cache-index-and-filter-blocks = true +# pin-l0-filter-and-index-blocks = true +# compaction-pri = 0 +# read-amp-bytes-per-bit = 0 +# dynamic-level-bytes = true + +[security] +# set the path for certificates. Empty string means disabling secure connectoins. +# ca-path = "" +# cert-path = "" +# key-path = "" + +[import] +# the directory to store importing kv data. +# import-dir = "/tmp/tikv/import" +# number of threads to handle RPC requests. +# num-threads = 8 +# stream channel window size, stream will be blocked on channel full. +# stream-channel-window = 128 diff --git a/collector/tidb/examples/compose/docker-compose.yaml b/collector/tidb/examples/compose/docker-compose.yaml new file mode 100644 index 0000000..a4d6e73 --- /dev/null +++ b/collector/tidb/examples/compose/docker-compose.yaml @@ -0,0 +1,244 @@ +version: '3.7' + +services: + pd0: + image: pingcap/pd:latest + ports: + - "2379" + volumes: + - ./config/pd.toml:/pd.toml:ro + - pd0-data:/data + - pd0-logs:/logs + command: + - --name=pd0 + - --client-urls=http://0.0.0.0:2379 + - --peer-urls=http://0.0.0.0:2380 + - --advertise-client-urls=http://pd0:2379 + - --advertise-peer-urls=http://pd0:2380 + - --initial-cluster=pd0=http://pd0:2380,pd1=http://pd1:2380,pd2=http://pd2:2380 + - --data-dir=/data/pd0 + - --config=/pd.toml + - --log-file=/logs/pd0.log + restart: on-failure + networks: + - tidb-cluster + pd1: + image: pingcap/pd:latest + ports: + - "2379" + volumes: + - ./config/pd.toml:/pd.toml:ro + - pd1-data:/data + - pd1-logs:/logs + command: + - --name=pd1 + - --client-urls=http://0.0.0.0:2379 + - --peer-urls=http://0.0.0.0:2380 + - --advertise-client-urls=http://pd1:2379 + - --advertise-peer-urls=http://pd1:2380 + - --initial-cluster=pd0=http://pd0:2380,pd1=http://pd1:2380,pd2=http://pd2:2380 + - --data-dir=/data/pd1 + - --config=/pd.toml + - --log-file=/logs/pd1.log + restart: on-failure + networks: + - tidb-cluster + pd2: + image: pingcap/pd:latest + ports: + - "2379" + volumes: + - ./config/pd.toml:/pd.toml:ro + - pd2-data:/data + - pd2-logs:/logs + command: + - --name=pd2 + - --client-urls=http://0.0.0.0:2379 + - --peer-urls=http://0.0.0.0:2380 + - --advertise-client-urls=http://pd2:2379 + - --advertise-peer-urls=http://pd2:2380 + - --initial-cluster=pd0=http://pd0:2380,pd1=http://pd1:2380,pd2=http://pd2:2380 + - --data-dir=/data/pd2 + - --config=/pd.toml + - --log-file=/logs/pd2.log + restart: on-failure + networks: + - tidb-cluster + + + tikv0: + image: pingcap/tikv:latest + volumes: + - ./config/tikv.toml:/tikv.toml:ro + - tikv0-data:/data + - tikv0-logs:/logs + command: + - --addr=0.0.0.0:20160 + - --advertise-addr=tikv0:20160 + - --data-dir=/data/tikv0 + - --pd=pd0:2379,pd1:2379,pd2:2379 + - --config=/tikv.toml + - --log-file=/logs/tikv0.log + depends_on: + - "pd0" + - "pd1" + - "pd2" + restart: on-failure + networks: + - tidb-cluster + tikv1: + image: pingcap/tikv:latest + volumes: + - ./config/tikv.toml:/tikv.toml:ro + - tikv1-data:/data + - tikv1-logs:/logs + command: + - --addr=0.0.0.0:20160 + - --advertise-addr=tikv1:20160 + - --data-dir=/data/tikv1 + - --pd=pd0:2379,pd1:2379,pd2:2379 + - --config=/tikv.toml + - --log-file=/logs/tikv1.log + depends_on: + - "pd0" + - "pd1" + - "pd2" + restart: on-failure + networks: + - tidb-cluster + tikv2: + image: pingcap/tikv:latest + volumes: + - ./config/tikv.toml:/tikv.toml:ro + - tikv2-data:/data + - tikv2-logs:/logs + command: + 
- --addr=0.0.0.0:20160 + - --advertise-addr=tikv2:20160 + - --data-dir=/data/tikv2 + - --pd=pd0:2379,pd1:2379,pd2:2379 + - --config=/tikv.toml + - --log-file=/logs/tikv2.log + depends_on: + - "pd0" + - "pd1" + - "pd2" + restart: on-failure + networks: + - tidb-cluster + + + tidb: + image: pingcap/tidb:latest + ports: + - "4000:4000" + - "10080:10080" + volumes: + - ./config/tidb.toml:/tidb.toml:ro + - tidb-logs:/logs + command: + - --store=tikv + - --path=pd0:2379,pd1:2379,pd2:2379 + - --config=/tidb.toml + - --log-file=/logs/tidb.log + - --advertise-address=tidb + depends_on: + - "tikv0" + - "tikv1" + - "tikv2" + restart: on-failure + networks: + - tidb-cluster + + tispark-master: + image: pingcap/tispark:v2.1.1 + command: + - /opt/spark/sbin/start-master.sh + volumes: + - ./config/spark-defaults.conf:/opt/spark/conf/spark-defaults.conf:ro + environment: + SPARK_MASTER_PORT: 7077 + SPARK_MASTER_WEBUI_PORT: 8080 + ports: + - "7077:7077" + - "8080:8080" + depends_on: + - "tikv0" + - "tikv1" + - "tikv2" + restart: on-failure + networks: + - tidb-cluster + + tispark-slave0: + image: pingcap/tispark:v2.1.1 + command: + - /opt/spark/sbin/start-slave.sh + - spark://tispark-master:7077 + volumes: + - ./config/spark-defaults.conf:/opt/spark/conf/spark-defaults.conf:ro + environment: + SPARK_WORKER_WEBUI_PORT: 38081 + ports: + - "38081:38081" + depends_on: + - tispark-master + restart: on-failure + + tidb-vision: + image: pingcap/tidb-vision:latest + environment: + PD_ENDPOINT: pd0:2379 + ports: + - "8010:8010" + restart: on-failure + networks: + - tidb-cluster + + pushgateway: + image: prom/pushgateway:v0.3.1 + ports: + - "9091:9091" + command: + - --log.level=error + restart: on-failure + networks: + - integrations + - tidb-cluster + otel-collector: + container_name: otel-collector + image: otel/opentelemetry-collector-contrib:0.81.0 + hostname: otel-collector + restart: always + command: [ "--config=/conf/collector.yaml" ] + volumes: + - ./collector.yaml:/conf/collector.yaml:rw + environment: + LS_ACCESS_TOKEN: "${LS_ACCESS_TOKEN}" + depends_on: + - pushgateway + networks: + - integrations + + +networks: + integrations: + tidb-cluster: + +volumes: + pd0-data: + pd0-logs: + pd1-data: + pd1-logs: + pd2-data: + pd2-logs: + tikv0-data: + tikv0-logs: + tikv1-data: + tikv1-logs: + tikv2-data: + tikv2-logs: + tidb-logs: + + + diff --git a/collector/tidb/metrics.csv b/collector/tidb/metrics.csv new file mode 100644 index 0000000..d085686 --- /dev/null +++ b/collector/tidb/metrics.csv @@ -0,0 +1,418 @@ +Name,Description,Unit,DataType,Attributes +br_raw_backup_region_seconds,Backup region latency distributions.,,histogram,"instance,job,le" +etcd_cluster_version,Which version is running. 
1 for 'cluster_version' label with current cluster version,,gauge,"cluster_version,instance,job" +etcd_debugging_auth_revision,The current revision of auth store.,,gauge,"instance,job" +etcd_debugging_disk_backend_commit_rebalance_duration_seconds,The latency distributions of commit.rebalance called by bboltdb backend.,,histogram,"instance,job,le" +etcd_debugging_disk_backend_commit_spill_duration_seconds,The latency distributions of commit.spill called by bboltdb backend.,,histogram,"instance,job,le" +etcd_debugging_disk_backend_commit_write_duration_seconds,The latency distributions of commit.write called by bboltdb backend.,,histogram,"instance,job,le" +etcd_debugging_lease_granted,The total number of granted leases.,,counter,"instance,job" +etcd_debugging_lease_renewed,The number of renewed leases seen by the leader.,,counter,"instance,job" +etcd_debugging_lease_revoked,The total number of revoked leases.,,counter,"instance,job" +etcd_debugging_lease_ttl_total,Bucketed histogram of lease TTLs.,,histogram,"instance,job,le" +etcd_debugging_mvcc_compact_revision,The revision of the last compaction in store.,,gauge,"instance,job" +etcd_debugging_mvcc_current_revision,The current revision of store.,,gauge,"instance,job" +etcd_debugging_mvcc_db_compaction_keys,Total number of db keys compacted.,,counter,"instance,job" +etcd_debugging_mvcc_db_compaction_pause_duration_milliseconds,Bucketed histogram of db compaction pause duration.,,histogram,"instance,job,le" +etcd_debugging_mvcc_db_compaction_total_duration_milliseconds,Bucketed histogram of db compaction total duration.,,histogram,"instance,job,le" +etcd_debugging_mvcc_db_total_size_in_bytes,Total size of the underlying database physically allocated in bytes.,,gauge,"instance,job" +etcd_debugging_mvcc_delete,Total number of deletes seen by this member.,,counter,"instance,job" +etcd_debugging_mvcc_events,Total number of events sent by this member.,,counter,"instance,job" +etcd_debugging_mvcc_index_compaction_pause_duration_milliseconds,Bucketed histogram of index compaction pause duration.,,histogram,"instance,job,le" +etcd_debugging_mvcc_keys_total,Total number of keys.,,gauge,"instance,job" +etcd_debugging_mvcc_pending_events_total,Total number of pending events to be sent.,,gauge,"instance,job" +etcd_debugging_mvcc_put,Total number of puts seen by this member.,,counter,"instance,job" +etcd_debugging_mvcc_range,Total number of ranges seen by this member.,,counter,"instance,job" +etcd_debugging_mvcc_slow_watcher_total,Total number of unsynced slow watchers.,,gauge,"instance,job" +etcd_debugging_mvcc_total_put_size_in_bytes,The total size of put kv pairs seen by this member.,,gauge,"instance,job" +etcd_debugging_mvcc_txn,Total number of txns seen by this member.,,counter,"instance,job" +etcd_debugging_mvcc_watch_stream_total,Total number of watch streams.,,gauge,"instance,job" +etcd_debugging_mvcc_watcher_total,Total number of watchers.,,gauge,"instance,job" +etcd_debugging_server_lease_expired,The total number of expired leases.,,counter,"instance,job" +etcd_debugging_snap_save_marshalling_duration_seconds,The marshalling cost distributions of save called by snapshot.,,histogram,"instance,job,le" +etcd_debugging_snap_save_total_duration_seconds,The total latency distributions of save called by snapshot.,,histogram,"instance,job,le" +etcd_debugging_store_expires,Total number of expired keys.,,counter,"instance,job" +etcd_debugging_store_reads,"Total number of reads action by (get/getRecursive), local to this 
member.",,counter,"action,instance,job" +etcd_debugging_store_watch_requests,Total number of incoming watch requests (new or reestablished).,,counter,"instance,job" +etcd_debugging_store_watchers,Count of currently active watchers.,,gauge,"instance,job" +etcd_debugging_store_writes,Total number of writes (e.g. set/compareAndDelete) seen by this member.,,counter,"action,instance,job" +etcd_disk_backend_commit_duration_seconds,The latency distributions of commit called by backend.,,histogram,"instance,job,le" +etcd_disk_backend_defrag_duration_seconds,The latency distribution of backend defragmentation.,,histogram,"instance,job,le" +etcd_disk_backend_snapshot_duration_seconds,The latency distribution of backend snapshots.,,histogram,"instance,job,le" +etcd_disk_defrag_inflight,"Whether or not defrag is active on the member. 1 means active, 0 means not.",,gauge,"instance,job" +etcd_disk_wal_fsync_duration_seconds,The latency distributions of fsync called by WAL.,,histogram,"instance,job,le" +etcd_disk_wal_write_bytes_total,Total number of bytes written in WAL.,,gauge,"instance,job" +etcd_mvcc_db_open_read_transactions,The number of currently open read transactions,,gauge,"instance,job" +etcd_mvcc_db_total_size_in_bytes,Total size of the underlying database physically allocated in bytes.,,gauge,"instance,job" +etcd_mvcc_db_total_size_in_use_in_bytes,Total size of the underlying database logically in use in bytes.,,gauge,"instance,job" +etcd_mvcc_delete,Total number of deletes seen by this member.,,counter,"instance,job" +etcd_mvcc_hash_duration_seconds,The latency distribution of storage hash operation.,,histogram,"instance,job,le" +etcd_mvcc_hash_rev_duration_seconds,The latency distribution of storage hash by revision operation.,,histogram,"instance,job,le" +etcd_mvcc_put,Total number of puts seen by this member.,,counter,"instance,job" +etcd_mvcc_range,Total number of ranges seen by this member.,,counter,"instance,job" +etcd_mvcc_txn,Total number of txns seen by this member.,,counter,"instance,job" +etcd_network_active_peers,The current number of active peer connections.,,gauge,"Local,Remote,instance,job" +etcd_network_client_grpc_received_bytes,The total number of bytes received from grpc clients.,,counter,"instance,job" +etcd_network_client_grpc_sent_bytes,The total number of bytes sent to grpc clients.,,counter,"instance,job" +etcd_network_peer_received_bytes,The total number of bytes received from peers.,,counter,"From,instance,job" +etcd_network_peer_round_trip_time_seconds,Round-Trip-Time histogram between peers,,histogram,"To,instance,job,le" +etcd_network_peer_sent_bytes,The total number of bytes sent to peers.,,counter,"To,instance,job" +etcd_server_client_requests,The total number of client requests per client version.,,counter,"client_api_version,instance,job,type" +etcd_server_go_version,Which Go version server is running with. 1 for 'server_go_version' label with current version.,,gauge,"instance,job,server_go_version" +etcd_server_has_leader,"Whether or not a leader exists. 1 is existence, 0 is not.",,gauge,"instance,job" +etcd_server_health_failures,The total number of failed health checks,,counter,"instance,job" +etcd_server_health_success,The total number of successful health checks,,counter,"instance,job" +etcd_server_heartbeat_send_failures,The total number of leader heartbeat send failures (likely overloaded from slow disk).,,counter,"instance,job" +etcd_server_id,Server or member ID in hexadecimal format. 
1 for 'server_id' label with current ID.,,gauge,"instance,job,server_id" +etcd_server_is_leader,"Whether or not this member is a leader. 1 if is, 0 otherwise.",,gauge,"instance,job" +etcd_server_is_learner,"Whether or not this member is a learner. 1 if is, 0 otherwise.",,gauge,"instance,job" +etcd_server_leader_changes_seen,The number of leader changes seen.,,counter,"instance,job" +etcd_server_learner_promote_successes,The total number of successful learner promotions while this member is leader.,,counter,"instance,job" +etcd_server_proposals_applied_total,The total number of consensus proposals applied.,,gauge,"instance,job" +etcd_server_proposals_committed_total,The total number of consensus proposals committed.,,gauge,"instance,job" +etcd_server_proposals_failed,The total number of failed proposals seen.,,counter,"instance,job" +etcd_server_proposals_pending,The current number of pending proposals to commit.,,gauge,"instance,job" +etcd_server_quota_backend_bytes,Current backend storage quota size in bytes.,,gauge,"instance,job" +etcd_server_read_indexes_failed,The total number of failed read indexes seen.,,counter,"instance,job" +etcd_server_slow_apply,The total number of slow apply requests (likely overloaded from slow disk).,,counter,"instance,job" +etcd_server_slow_read_indexes,The total number of pending read indexes not in sync with leader's or timed out read index requests.,,counter,"instance,job" +etcd_server_snapshot_apply_in_progress_total,1 if the server is applying the incoming snapshot. 0 if none.,,gauge,"instance,job" +etcd_server_version,Which version is running. 1 for 'server_version' label with current version.,,gauge,"instance,job,server_version" +etcd_snap_db_fsync_duration_seconds,The latency distributions of fsyncing .snap.db file,,histogram,"instance,job,le" +etcd_snap_db_save_total_duration_seconds,The total latency distributions of v3 snapshot save,,histogram,"instance,job,le" +etcd_snap_fsync_duration_seconds,The latency distributions of fsync called by snap.,,histogram,"instance,job,le" +go_cgo_go_to_c_calls_calls,Count of calls made from Go to C by the current process.,,counter,"instance,job" +go_cpu_classes_gc_mark_assist_cpu_seconds,"Estimated total CPU time goroutines spent performing GC tasks to assist the GC and prevent it from falling behind the application. This metric is an overestimate, and not directly comparable to system CPU time measurements. Compare only with other /cpu/classes metrics.",,counter,"instance,job" +go_cpu_classes_gc_mark_dedicated_cpu_seconds,"Estimated total CPU time spent performing GC tasks on processors (as defined by GOMAXPROCS) dedicated to those tasks. This includes time spent with the world stopped due to the GC. This metric is an overestimate, and not directly comparable to system CPU time measurements. Compare only with other /cpu/classes metrics.",,counter,"instance,job" +go_cpu_classes_gc_mark_idle_cpu_seconds,"Estimated total CPU time spent performing GC tasks on spare CPU resources that the Go scheduler could not otherwise find a use for. This should be subtracted from the total GC CPU time to obtain a measure of compulsory GC CPU time. This metric is an overestimate, and not directly comparable to system CPU time measurements. Compare only with other /cpu/classes metrics.",,counter,"instance,job" +go_cpu_classes_gc_pause_cpu_seconds,"Estimated total CPU time spent with the application paused by the GC. 
Even if only one thread is running during the pause, this is computed as GOMAXPROCS times the pause latency because nothing else can be executing. This is the exact sum of samples in /gc/pause:seconds if each sample is multiplied by GOMAXPROCS at the time it is taken. This metric is an overestimate, and not directly comparable to system CPU time measurements. Compare only with other /cpu/classes metrics.",,counter,"instance,job" +go_cpu_classes_gc_total_cpu_seconds,"Estimated total CPU time spent performing GC tasks. This metric is an overestimate, and not directly comparable to system CPU time measurements. Compare only with other /cpu/classes metrics. Sum of all metrics in /cpu/classes/gc.",,counter,"instance,job" +go_cpu_classes_idle_cpu_seconds,"Estimated total available CPU time not spent executing any Go or Go runtime code. In other words, the part of /cpu/classes/total:cpu-seconds that was unused. This metric is an overestimate, and not directly comparable to system CPU time measurements. Compare only with other /cpu/classes metrics.",,counter,"instance,job" +go_cpu_classes_scavenge_assist_cpu_seconds,"Estimated total CPU time spent returning unused memory to the underlying platform in response eagerly in response to memory pressure. This metric is an overestimate, and not directly comparable to system CPU time measurements. Compare only with other /cpu/classes metrics.",,counter,"instance,job" +go_cpu_classes_scavenge_background_cpu_seconds,"Estimated total CPU time spent performing background tasks to return unused memory to the underlying platform. This metric is an overestimate, and not directly comparable to system CPU time measurements. Compare only with other /cpu/classes metrics.",,counter,"instance,job" +go_cpu_classes_scavenge_total_cpu_seconds,"Estimated total CPU time spent performing tasks that return unused memory to the underlying platform. This metric is an overestimate, and not directly comparable to system CPU time measurements. Compare only with other /cpu/classes metrics. Sum of all metrics in /cpu/classes/scavenge.",,counter,"instance,job" +go_cpu_classes_total_cpu_seconds,"Estimated total available CPU time for user Go code or the Go runtime, as defined by GOMAXPROCS. In other words, GOMAXPROCS integrated over the wall-clock duration this process has been executing for. This metric is an overestimate, and not directly comparable to system CPU time measurements. Compare only with other /cpu/classes metrics. Sum of all metrics in /cpu/classes.",,counter,"instance,job" +go_cpu_classes_user_cpu_seconds,"Estimated total CPU time spent running user Go code. This may also include some small amount of time spent in the Go runtime. This metric is an overestimate, and not directly comparable to system CPU time measurements. Compare only with other /cpu/classes metrics.",,counter,"instance,job" +go_gc_cycles_automatic_gc_cycles,Count of completed GC cycles generated by the Go runtime.,,counter,"instance,job" +go_gc_cycles_forced_gc_cycles,Count of completed GC cycles forced by the application.,,counter,"instance,job" +go_gc_cycles_total_gc_cycles,Count of all completed GC cycles.,,counter,"instance,job" +go_gc_duration_seconds,A summary of the GC invocation durations.,,summary,quantile +go_gc_heap_allocs_by_size_bytes,"Distribution of heap allocations by approximate size. 
Note that this does not include tiny objects as defined by /gc/heap/tiny/allocs:objects, only tiny blocks.",,histogram,"instance,job,le" +go_gc_heap_allocs_bytes,Cumulative sum of memory allocated to the heap by the application.,,counter,"instance,job" +go_gc_heap_allocs_objects,"Cumulative count of heap allocations triggered by the application. Note that this does not include tiny objects as defined by /gc/heap/tiny/allocs:objects, only tiny blocks.",,counter,"instance,job" +go_gc_heap_frees_by_size_bytes,"Distribution of freed heap allocations by approximate size. Note that this does not include tiny objects as defined by /gc/heap/tiny/allocs:objects, only tiny blocks.",,histogram,"instance,job,le" +go_gc_heap_frees_bytes,Cumulative sum of heap memory freed by the garbage collector.,,counter,"instance,job" +go_gc_heap_frees_objects,"Cumulative count of heap allocations whose storage was freed by the garbage collector. Note that this does not include tiny objects as defined by /gc/heap/tiny/allocs:objects, only tiny blocks.",,counter,"instance,job" +go_gc_heap_goal_bytes,Heap size target for the end of the GC cycle.,,gauge,"instance,job" +go_gc_heap_objects_objects,"Number of objects, live or unswept, occupying heap memory.",,gauge,"instance,job" +go_gc_heap_tiny_allocs_objects,"Count of small allocations that are packed together into blocks. These allocations are counted separately from other allocations because each individual allocation is not tracked by the runtime, only their block. Each block is already accounted for in allocs-by-size and frees-by-size.",,counter,"instance,job" +go_gc_limiter_last_enabled_gc_cycle,"GC cycle the last time the GC CPU limiter was enabled. This metric is useful for diagnosing the root cause of an out-of-memory error, because the limiter trades memory for CPU time when the GC's CPU time gets too high. This is most likely to occur with use of SetMemoryLimit. The first GC cycle is cycle 1, so a value of 0 indicates that it was never enabled.",,gauge,"instance,job" +go_gc_pauses_seconds,Distribution individual GC-related stop-the-world pause latencies.,,histogram,"instance,job,le" +go_gc_stack_starting_size_bytes,The stack size of new goroutines.,,gauge,"instance,job" +go_goroutines,Number of goroutines that currently exist.,,gauge, +go_info,Information about the Go environment.,,gauge,"instance,job,version" +go_memory_classes_heap_free_bytes,"Memory that is completely free and eligible to be returned to the underlying system, but has not been. This metric is the runtime's estimate of free address space that is backed by physical memory.",,gauge,"instance,job" +go_memory_classes_heap_objects_bytes,Memory occupied by live objects and dead objects that have not yet been marked free by the garbage collector.,,gauge,"instance,job" +go_memory_classes_heap_released_bytes,"Memory that is completely free and has been returned to the underlying system. 
This metric is the runtime's estimate of free address space that is still mapped into the process, but is not backed by physical memory.",,gauge,"instance,job" +go_memory_classes_heap_stacks_bytes,"Memory allocated from the heap that is reserved for stack space, whether or not it is currently in-use.",,gauge,"instance,job" +go_memory_classes_heap_unused_bytes,Memory that is reserved for heap objects but is not currently used to hold heap objects.,,gauge,"instance,job" +go_memory_classes_metadata_mcache_free_bytes,"Memory that is reserved for runtime mcache structures, but not in-use.",,gauge,"instance,job" +go_memory_classes_metadata_mcache_inuse_bytes,Memory that is occupied by runtime mcache structures that are currently being used.,,gauge,"instance,job" +go_memory_classes_metadata_mspan_free_bytes,"Memory that is reserved for runtime mspan structures, but not in-use.",,gauge,"instance,job" +go_memory_classes_metadata_mspan_inuse_bytes,Memory that is occupied by runtime mspan structures that are currently being used.,,gauge,"instance,job" +go_memory_classes_metadata_other_bytes,Memory that is reserved for or used to hold runtime metadata.,,gauge,"instance,job" +go_memory_classes_os_stacks_bytes,Stack memory allocated by the underlying operating system.,,gauge,"instance,job" +go_memory_classes_other_bytes,"Memory used by execution trace buffers, structures for debugging the runtime, finalizer and profiler specials, and more.",,gauge,"instance,job" +go_memory_classes_profiling_buckets_bytes,Memory that is used by the stack trace hash map used for profiling.,,gauge,"instance,job" +go_memory_classes_total_bytes,All memory mapped by the Go runtime into the current process as read-write. Note that this does not include memory mapped by code called via cgo or via the syscall package. 
Sum of all metrics in /memory/classes.,,gauge,"instance,job" +go_memstats_alloc_bytes,Number of bytes allocated and still in use.,,gauge, +go_memstats_buck_hash_sys_bytes,Number of bytes used by the profiling bucket hash table.,,gauge, +go_memstats_frees,Total number of frees.,,counter, +go_memstats_gc_cpu_fraction,The fraction of this program's available CPU time used by the GC since the program started.,,gauge,"instance,job" +go_memstats_gc_sys_bytes,Number of bytes used for garbage collection system metadata.,,gauge, +go_memstats_heap_alloc_bytes,Number of heap bytes allocated and still in use.,,gauge, +go_memstats_heap_idle_bytes,Number of heap bytes waiting to be used.,,gauge, +go_memstats_heap_inuse_bytes,Number of heap bytes that are in use.,,gauge, +go_memstats_heap_objects,Number of allocated objects.,,gauge, +go_memstats_heap_released_bytes,Number of heap bytes released to OS.,,gauge,"instance,job" +go_memstats_heap_sys_bytes,Number of heap bytes obtained from system.,,gauge, +go_memstats_last_gc_time_seconds,Number of seconds since 1970 of last garbage collection.,,gauge, +go_memstats_lookups,Total number of pointer lookups.,,counter, +go_memstats_mallocs,Total number of mallocs.,,counter, +go_memstats_mcache_inuse_bytes,Number of bytes in use by mcache structures.,,gauge, +go_memstats_mcache_sys_bytes,Number of bytes used for mcache structures obtained from system.,,gauge, +go_memstats_mspan_inuse_bytes,Number of bytes in use by mspan structures.,,gauge, +go_memstats_mspan_sys_bytes,Number of bytes used for mspan structures obtained from system.,,gauge, +go_memstats_next_gc_bytes,Number of heap bytes when next garbage collection will take place.,,gauge, +go_memstats_other_sys_bytes,Number of bytes used for other system allocations.,,gauge, +go_memstats_stack_inuse_bytes,Number of bytes in use by the stack allocator.,,gauge, +go_memstats_stack_sys_bytes,Number of bytes obtained from system for stack allocator.,,gauge, +go_memstats_sys_bytes,Number of bytes obtained by system. Sum of all system allocations.,,gauge, +go_sched_gomaxprocs_threads,"The current runtime.GOMAXPROCS setting, or the number of operating system threads that can execute user-level Go code simultaneously.",,gauge,"instance,job" +go_sched_goroutines_goroutines,Count of live goroutines.,,gauge,"instance,job" +go_sched_latencies_seconds,Distribution of the time goroutines have spent in the scheduler in a runnable state before actually running.,,histogram,"instance,job,le" +go_sync_mutex_wait_total_seconds,Approximate cumulative time goroutines have spent blocked on a sync.Mutex or sync.RWMutex. This metric is useful for identifying global changes in lock contention. 
Collect a mutex or block profile using the runtime/pprof package for more detailed contention data.,,counter,"instance,job" +go_threads,Number of OS threads created.,,gauge,"instance,job" +grpc_server_handled,"Total number of RPCs completed on the server, regardless of success or failure.",,counter,"grpc_code,grpc_method,grpc_service,grpc_type,instance,job" +grpc_server_handling_seconds,Histogram of response latency (seconds) of gRPC that had been application-level handled by the server.,,histogram,"grpc_method,grpc_service,grpc_type,instance,job,le" +grpc_server_msg_received,Total number of RPC stream messages received on the server.,,counter,"grpc_method,grpc_service,grpc_type,instance,job" +grpc_server_msg_sent,Total number of gRPC stream messages sent by the server.,,counter,"grpc_method,grpc_service,grpc_type,instance,job" +grpc_server_started,Total number of RPCs started on the server.,,counter,"grpc_method,grpc_service,grpc_type,instance,job" +http_request_duration_microseconds,The HTTP request latencies in microseconds.,,summary,"handler,quantile" +http_request_size_bytes,The HTTP request sizes in bytes.,,summary,"handler,quantile" +http_requests,Total number of HTTP requests made.,,counter,"code,handler,method" +http_response_size_bytes,The HTTP response sizes in bytes.,,summary,"handler,quantile" +os_fd_limit,The file descriptor limit.,,gauge,"instance,job" +os_fd_used,The number of used file descriptors.,,gauge,"instance,job" +pd_checker_event_count,Counter of checker events.,,counter,"instance,job,name,type" +pd_checker_patrol_regions_time,Time spent of patrol checks region.,,gauge,"instance,job" +pd_checker_region_list,Number of region in waiting list,,gauge,"instance,job,type" +pd_client_cmd_handle_cmds_duration_seconds,Bucketed histogram of processing time (s) of handled success cmds.,,histogram,"instance,job,type,le" +pd_client_cmd_handle_failed_cmds_duration_seconds,Bucketed histogram of processing time (s) of failed handled cmds.,,histogram,"instance,job,type,le" +pd_client_request_handle_requests_duration_seconds,Bucketed histogram of processing time (s) of handled requests.,,histogram,"instance,job,type,le" +pd_client_request_handle_tso_batch_size,Bucketed histogram of the batch size of handled requests.,,histogram,"instance,job,le" +pd_client_request_handle_tso_best_batch_size,Bucketed histogram of the best batch size of handled requests.,,histogram,"instance,job,le" +pd_client_request_tso_batch_send_latency,tso batch send latency,,histogram,"instance,job,le" +pd_cluster_bucket_event,Counter of the bucket event,,counter,"event,instance,job" +pd_cluster_health_status,Status of the cluster.,,gauge,"instance,job,name" +pd_cluster_id,Record of id allocator.,,gauge,"instance,job,type" +pd_cluster_metadata,Record critical metadata.,,gauge,"instance,job,type" +pd_cluster_region_event,Counter of the region event,,counter,"event,instance,job" +pd_cluster_status,Status of the cluster.,,gauge,"instance,job,type" +pd_cluster_store_limit,Status of the store limit.,,gauge,"instance,job,store,type" +pd_cluster_store_sync,The state of store sync config,,counter,"address,instance,job,state" +pd_cluster_tso,Record of tso metadata.,,gauge,"dc,instance,job,type" +pd_cluster_tso_gap_millionseconds,The minimal (non-zero) TSO gap for each DC.,,gauge,"dc,instance,job" +pd_cluster_update_stores_stats_time,Time spent of updating store stats.,,gauge,"instance,job" +pd_config_status,Status of the scheduling configurations.,,gauge,"instance,job,type" +pd_hbstream_region_message,Counter of message 
hbstream sent.,,counter,"address,instance,job,status,store,type" +pd_hotcache_flow_queue_status,Status of the hotspot flow queue.,,gauge,"instance,job,type" +pd_hotcache_status,Status of the hotspot.,,gauge,"instance,job,name,store,type" +pd_hotspot_status,Status of the hotspot.,,gauge,"address,instance,job,store,type" +pd_monitor_time_jump_back,Counter of system time jumps backward.,,counter,"instance,job" +pd_region_syncer_status,Inner status of the region syncer.,,gauge,"instance,job,type" +pd_regions_abnormal_peer_duration_seconds,Bucketed histogram of processing time (s) of handled success cmds.,,histogram,"instance,job,type,le" +pd_regions_label_level,Number of regions in the different label level.,,gauge,"instance,job,type" +pd_regions_offline_status,Status of the offline regions.,,gauge,"instance,job,type" +pd_regions_status,Status of the regions.,,gauge,"instance,job,type" +pd_schedule_filter,Counter of the filter,,counter,"action,instance,job,scope,source,target,type" +pd_schedule_finish_operator_steps_duration_seconds,Bucketed histogram of processing time (s) of finished operator step.,,histogram,"instance,job,type,le" +pd_schedule_finish_operators_duration_seconds,Bucketed histogram of processing time (s) of finished operator.,,histogram,"instance,job,type,le" +pd_schedule_labeler_event_counter,Counter of the scheduler label.,,counter,"event,instance,job,type" +pd_schedule_operator_limit,Counter of operator meeting limit,,counter,"instance,job,name,type" +pd_schedule_operator_region_size,Bucketed histogram of the operator region size.,,histogram,"instance,job,type,le" +pd_schedule_operators_count,Counter of schedule operators.,,counter,"event,instance,job,type" +pd_schedule_operators_waiting_count,Counter of schedule waiting operators.,,counter,"event,instance,job,type" +pd_schedule_scatter_operators_count,Counter of region scatter operators.,,counter,"event,instance,job,type" +pd_schedule_store_limit_cost,limit rate cost of store.,,counter,"instance,job,limit_type,store" +pd_schedule_waiting_operators_duration_seconds,Bucketed histogram of waiting time (s) of operator for being promoted.,,histogram,"instance,job,type,le" +pd_scheduler_balance_direction,Counter of direction of balance related schedulers.,,counter,"instance,job,source,target,type" +pd_scheduler_balance_leader,Counter of balance leader scheduler.,,counter,"instance,job,store,type" +pd_scheduler_balance_region,Counter of balance region scheduler.,,counter,"instance,job,store,type" +pd_scheduler_buckets_hot_degree_hist,Bucketed histogram of bucket hot degree,,histogram,"instance,job,le" +pd_scheduler_event_count,Counter of scheduler events.,,counter,"instance,job,name,type" +pd_scheduler_handle_region_heartbeat_duration_seconds,Bucketed histogram of processing time (s) of handled region heartbeat requests.,,histogram,"address,instance,job,store,le" +pd_scheduler_handle_store_heartbeat_duration_seconds,Bucketed histogram of processing time (s) of handled store heartbeat requests.,,histogram,"address,instance,job,store,le" +pd_scheduler_hot_peers_summary,Hot peers summary for each store,,gauge,"instance,job,store,type" +pd_scheduler_inner_status,Inner status of the scheduler.,,gauge,"instance,job,name,type" +pd_scheduler_read_byte_hist,The distribution of region read bytes,,histogram,"instance,job,le" +pd_scheduler_read_key_hist,The distribution of region read keys,,histogram,"instance,job,le" +pd_scheduler_region_heartbeat,Counter of region heartbeat.,,counter,"address,instance,job,status,store,type" 
+pd_scheduler_region_heartbeat_interval_hist,Bucketed histogram of the batch size of handled requests.,,histogram,"instance,job,le" +pd_scheduler_region_heartbeat_latency_seconds,Bucketed histogram of latency (s) of receiving heartbeat.,,histogram,"address,instance,job,store,le" +pd_scheduler_status,Status of the scheduler.,,gauge,"instance,job,kind,type" +pd_scheduler_store_heartbeat_interval_hist,Bucketed histogram of the batch size of handled requests.,,histogram,"instance,job,le" +pd_scheduler_store_status,Store status for schedule,,gauge,"address,instance,job,store,type" +pd_scheduler_write_byte_hist,The distribution of region write bytes,,histogram,"instance,job,le" +pd_scheduler_write_key_hist,The distribution of region write keys,,histogram,"instance,job,le" +pd_server_cluster_state_cpu_usage,CPU usage to determine the cluster state,,gauge,"instance,job" +pd_server_etcd_state,Etcd raft states.,,gauge,"instance,job,type" +pd_server_handle_tso_duration_seconds,Bucketed histogram of processing time (s) of handled tso requests.,,histogram,"instance,job,le" +pd_server_handle_tso_proxy_batch_size,Bucketed histogram of the batch size of handled tso proxy requests.,,histogram,"instance,job,le" +pd_server_handle_tso_proxy_duration_seconds,Bucketed histogram of processing time (s) of handled tso proxy requests.,,histogram,"instance,job,le" +pd_server_info,"Indicate the pd server info, and the value is the start timestamp (s).",,gauge,"hash,instance,job,version" +pd_service_audit_handling_seconds,PD server service handling audit,,histogram,"component,instance,ip,job,method,service,le" +pd_service_maxprocs,The value of GOMAXPROCS.,,gauge,"instance,job" +pd_tso_events,Counter of tso events,,counter,"dc,instance,job,type" +pd_tso_role,"Indicate the PD server role info, whether it's a TSO allocator.",,gauge,"dc,instance,job" +pd_txn_handle_txns_duration_seconds,Bucketed histogram of processing time (s) of handled txns.,,histogram,"instance,job,result,le" +pd_txn_txns_count,Counter of txns.,,counter,"instance,job,result" +process_cpu_seconds,Total user and system CPU time spent in seconds.,,counter, +process_max_fds,Maximum number of open file descriptors.,,gauge, +process_open_fds,Number of open file descriptors.,,gauge, +process_resident_memory_bytes,Resident memory size in bytes.,,gauge, +process_start_time_seconds,Start time of the process since unix epoch in seconds.,,gauge, +process_virtual_memory_bytes,Virtual memory size in bytes.,,gauge, +process_virtual_memory_max_bytes,Maximum amount of virtual memory available in bytes.,,gauge,"instance,job" +promhttp_metric_handler_requests_in_flight,Current number of scrapes being served.,,gauge,"instance,job" +promhttp_metric_handler_requests,Total number of scrapes by HTTP status code.,,counter,"code,instance,job" +pushgateway_build_info,"A metric with a constant '1' value labeled by version, revision, branch, and goversion from which pushgateway was built.",,gauge,"branch,goversion,revision,version" +resource_manager_client_token_request_duration,Bucketed histogram of latency(s) of token request.,,histogram,"instance,job,type,le" +tidb_autoid_operation_duration_seconds,Bucketed histogram of processing time (s) of handled autoid.,,histogram,"instance,job,result,type,le" +tidb_config_status,Status of the TiDB server configurations.,,gauge,"instance,job,type" +tidb_ddl_deploy_syncer_duration_seconds,Bucketed histogram of processing time (s) of deploy syncer,,histogram,"instance,job,result,type,le" +tidb_ddl_handle_job_duration_seconds,Bucketed 
histogram of processing time (s) of handle jobs,,histogram,"instance,job,result,type,le" +tidb_ddl_job_table_duration_seconds,Bucketed histogram of processing time (s) of the 3 DDL job tables,,histogram,"instance,job,type,le" +tidb_ddl_running_job_count,Running DDL jobs count,,gauge,"instance,job,type" +tidb_ddl_update_self_ver_duration_seconds,Bucketed histogram of processing time (s) of update self version,,histogram,"instance,job,result,le" +tidb_ddl_waiting_jobs,Gauge of jobs.,,gauge,"instance,job,type" +tidb_ddl_worker_operation_duration_seconds,Bucketed histogram of processing time (s) of ddl worker operations,,histogram,"action,instance,job,result,type,le" +tidb_ddl_worker_operation,Counter of creating ddl/worker and isowner.,,counter,"instance,job,type" +tidb_distsql_copr_cache,"coprocessor cache hit, evict and miss number",,counter,"instance,job,type" +tidb_distsql_copr_closest_read,counter of total copr read local read hit.,,counter,"instance,job,type" +tidb_distsql_copr_resp_size,copr task response data size in bytes.,,histogram,"instance,job,store,le" +tidb_distsql_handle_query_duration_seconds,Bucketed histogram of processing time (s) of handled queries.,,histogram,"copr_type,instance,job,sql_type,type,le" +tidb_distsql_partial_num,number of partial results for each query.,,histogram,"instance,job,le" +tidb_distsql_scan_keys_num,number of scanned keys for each query.,,histogram,"instance,job,le" +tidb_distsql_scan_keys_partial_num,number of scanned keys for each partial result.,,histogram,"instance,job,le" +tidb_domain_infocache_counters,Counters of infoCache: get/hit.,,counter,"action,instance,job,type" +tidb_domain_load_schema_duration_seconds,Bucketed histogram of processing time (s) in load schema.,,histogram,"instance,job,le" +tidb_domain_load_schema,Counter of load schema,,counter,"instance,job,type" +tidb_domain_load_sysvarcache,Counter of load sysvar cache,,counter,"instance,job,type" +tidb_executor_expensive,Counter of Expensive Executors.,,counter,"instance,job,type" +tidb_executor_phase_duration_seconds,Summary of each execution phase duration.,,summary,"instance,internal,job,phase" +tidb_log_backup_advancer_owner,"If the node is the owner of advancers, set this to `1`, otherwise `0`.",,gauge,"instance,job" +tidb_meta_autoid_duration_seconds,Bucketed histogram of processing time (s) in parse SQL.,,histogram,"instance,job,le" +tidb_meta_operation_duration_seconds,Bucketed histogram of processing time (s) of tidb meta data operations.,,histogram,"instance,job,result,type,le" +tidb_monitor_time_jump_back,Counter of system time jumps backward.,,counter,"instance,job" +tidb_owner_campaign_owner,Counter of campaign owner.,,counter,"instance,job,result,type" +tidb_owner_new_session_duration_seconds,Bucketed histogram of processing time (s) of new session.,,histogram,"instance,job,result,type,le" +tidb_owner_watch_owner,Counter of watch owner.,,counter,"instance,job,result,type" +tidb_plan_replayer_register_task,gauge of plan replayer registered task,,gauge,"instance,job" +tidb_plan_replayer_task,counter of plan replayer captured task,,counter,"instance,job,result,type" +tidb_rm_ema_cpu_usage,exponential moving average of CPU usage,,gauge,"instance,job" +tidb_rm_pool_concurrency,How many concurrency in the pool,,gauge,"instance,job,type" +tidb_server_affected_rows,Counters of server affected rows.,,counter,"instance,job,sql_type" +tidb_server_conn_idle_duration_seconds,Bucketed histogram of connection idle time (s).,,histogram,"in_txn,instance,job,le" 
+tidb_server_connections,Number of connections.,,gauge,"instance,job" +tidb_server_cpu_profile,Counter of cpu profiling,,counter,"instance,job" +tidb_server_critical_error,Counter of critical errors.,,counter,"instance,job" +tidb_server_disconnection,Counter of connections disconnected.,,counter,"instance,job,result" +tidb_server_event,Counter of tidb-server event.,,counter,"instance,job,type" +tidb_server_get_token_duration_seconds,"Duration (us) for getting token, it should be small until concurrency limit is reached.",,histogram,"instance,job,le" +tidb_server_gogc,The value of GOGC,,gauge,"instance,job" +tidb_server_handle_query_duration_seconds,Bucketed histogram of processing time (s) of handled queries.,,histogram,"db,instance,job,sql_type,le" +tidb_server_handshake_error,Counter of handshake errors.,,counter,"instance,job" +tidb_server_info,"Indicate the tidb server info, and the value is the start timestamp (s).",,gauge,"hash,instance,job,version" +tidb_server_load_table_cache_seconds,Duration (us) for loading table cache.,,histogram,"instance,job,le" +tidb_server_maxprocs,The value of GOMAXPROCS.,,gauge,"instance,job" +tidb_server_multi_query_num,The number of queries contained in a multi-query statement.,,histogram,"instance,job,le" +tidb_server_packet_io_bytes,Counters of packet IO bytes.,,counter,"instance,job,type" +tidb_server_pd_api_execution_duration_seconds,Bucketed histogram of all pd api execution time (s),,histogram,"instance,job,type,le" +tidb_server_pd_api_request,Counter of the pd http api requests,,counter,"instance,job,result,type" +tidb_server_plan_cache_instance_memory_usage,Total plan cache memory usage of all sessions in an instance,,gauge,"instance,job,type" +tidb_server_plan_cache_instance_plan_num_total,Counter of plans of all prepared plan caches in an instance,,gauge,"instance,job,type" +tidb_server_plan_cache_miss,Counter of plan cache miss.,,counter,"instance,job,type" +tidb_server_plan_cache,Counter of query using plan cache.,,counter,"instance,job,type" +tidb_server_prepared_stmts,number of prepared statements.,,gauge,"instance,job" +tidb_server_query,Counter of queries.,,counter,"instance,job,result,type" +tidb_server_rc_check_ts_conflict,Counter of WriteConflict caused by RCCheckTS.,,counter,"instance,job,type" +tidb_server_read_from_tablecache,Counter of query read from table cache.,,counter,"instance,job" +tidb_server_slow_query_cop_duration_seconds,Bucketed histogram of all cop processing time (s) of slow queries.,,histogram,"instance,job,sql_type,le" +tidb_server_slow_query_process_duration_seconds,Bucketed histogram of processing time (s) of slow queries.,,histogram,"instance,job,sql_type,le" +tidb_server_slow_query_wait_duration_seconds,Bucketed histogram of all cop waiting time (s) of slow queries.,,histogram,"instance,job,sql_type,le" +tidb_server_tiflash_failed_store,"Status of failed tiflash mpp store: -1 means detector heartbeat, 0 means reachable, 1 means abnormal.",,gauge,"address,instance,job" +tidb_server_tiflash_query,Counter of TiFlash queries.,,counter,"instance,job,result,type" +tidb_server_tokens,The number of concurrently executing sessions,,gauge,"instance,job" +tidb_server_ttl_insert_rows,The count of TTL rows inserted,,counter,"instance,job" +tidb_server_ttl_job_status,The jobs count in the specified status,,gauge,"instance,job,type" +tidb_server_ttl_phase_time,The time spent in each phase,,counter,"instance,job,phase,type" +tidb_server_ttl_processed_expired_rows,The count of expired rows processed in TTL
jobs,,counter,"instance,job,result,sql_type" +tidb_server_ttl_query_duration,Bucketed histogram of processing time (s) of handled TTL queries.,,histogram,"instance,job,result,sql_type,le" +tidb_server_ttl_task_status,The tasks count in the specified status,,gauge,"instance,job,type" +tidb_server_ttl_watermark_delay,Bucketed delay time in seconds for TTL tables.,,gauge,"instance,job,name,type" +tidb_session_compile_duration_seconds,Bucketed histogram of processing time (s) in query optimize.,,histogram,"instance,job,sql_type,le" +tidb_session_execute_duration_seconds,Bucketed histogram of processing time (s) in running executor.,,histogram,"instance,job,sql_type,le" +tidb_session_non_transactional_dml_count,Counter of non-transactional delete,,counter,"instance,job,type" +tidb_session_parse_duration_seconds,Bucketed histogram of processing time (s) in parse SQL.,,histogram,"instance,job,sql_type,le" +tidb_session_restricted_sql,Counter of internal restricted sql.,,counter,"instance,job" +tidb_session_retry_num,Bucketed histogram of session retry count.,,histogram,"instance,job,scope,le" +tidb_session_statement_deadlock_detect_duration_seconds,Bucketed histogram of a statement deadlock detect duration.,,histogram,"instance,job,le" +tidb_session_statement_lock_keys_count,Keys locking for a single statement,,histogram,"instance,job,le" +tidb_session_statement_pessimistic_retry_count,Bucketed histogram of statement pessimistic retry count,,histogram,"instance,job,le" +tidb_session_transaction_duration_seconds,"Bucketed histogram of a transaction execution duration, including retry.",,histogram,"instance,job,scope,txn_mode,type,le" +tidb_session_transaction_fair_locking_usage,The counter of statements and transactions in which fair locking is used or takes effect,,counter,"instance,job,type" +tidb_session_transaction_pessimistic_dml_duration_by_attempt,"Bucketed histogram of duration of pessimistic DMLs, distinguished by first attempt and retries",,histogram,"instance,job,phase,type,le" +tidb_session_transaction_statement_num,Bucketed histogram of statements count in each transaction.,,histogram,"instance,job,scope,txn_mode,type,le" +tidb_session_txn_state_entering_count,How many times transactions enter this state,,counter,"instance,job,type" +tidb_session_txn_state_seconds,Bucketed histogram of different states of a transaction.,,histogram,"has_lock,instance,job,type,le" +tidb_session_validate_read_ts_from_pd_count,Counter of validating read ts by getting a timestamp from PD,,counter,"instance,job" +tidb_sli_small_txn_write_duration_seconds,Bucketed histogram of small transaction write time (s).,,histogram,"instance,job,le" +tidb_sli_tikv_read_throughput,Read throughput of TiKV read in Bytes/s.,,histogram,"instance,job,le" +tidb_sli_tikv_small_read_duration,Read time of TiKV small read.,,histogram,"instance,job,le" +tidb_sli_txn_write_throughput,Bucketed histogram of transaction write throughput (bytes/second).,,histogram,"instance,job,le" +tidb_statistics_auto_analyze_duration_seconds,Bucketed histogram of processing time (s) of auto analyze.,,histogram,"instance,job,le" +tidb_statistics_fast_analyze_status,Bucketed histogram of some stats in fast analyze.,,histogram,"instance,job,sql_type,type,le" +tidb_statistics_high_error_rate_feedback,Counter of query feedback whose actual count is much different than calculated by current statistics,,counter,"instance,job" +tidb_statistics_historical_stats,counter of the historical stats operation,,counter,"instance,job,result,type" 
+tidb_statistics_pseudo_estimation,Counter of pseudo estimation caused by outdated stats.,,counter,"instance,job,type" +tidb_statistics_read_stats_latency_millis,Bucketed histogram of latency time (ms) of stats read during sync-load.,,histogram,"instance,job,le" +tidb_statistics_stats_cache_lru_op,Counter of lru for statsCache operation,,counter,"instance,job,type" +tidb_statistics_stats_cache_lru_val,gauge of stats cache lru value,,gauge,"instance,job,type" +tidb_statistics_stats_healthy,Gauge of stats healthy,,gauge,"instance,job,type" +tidb_statistics_stats_inaccuracy_rate,Bucketed histogram of stats inaccuracy rate.,,histogram,"instance,job,le" +tidb_statistics_sync_load_latency_millis,Bucketed histogram of latency time (ms) of sync load.,,histogram,"instance,job,le" +tidb_statistics_sync_load_timeout,Counter of sync load timeout.,,counter,"instance,job" +tidb_statistics_sync_load,Counter of sync load.,,counter,"instance,job" +tidb_tikvclient_aggressive_locking_count,Counter of keys locked in aggressive locking mode,,counter,"instance,job,type" +tidb_tikvclient_async_commit_txn_counter,Counter of async commit transactions.,,counter,"instance,job,type" +tidb_tikvclient_backoff_seconds,total backoff seconds of a single backoffer.,,histogram,"instance,job,type,le" +tidb_tikvclient_batch_client_no_available_connection,Counter of no available batch client.,,counter,"instance,job" +tidb_tikvclient_batch_client_reset,batch client recycle connection and reconnect duration,,histogram,"instance,job,le" +tidb_tikvclient_batch_client_unavailable_seconds,batch client unavailable,,histogram,"instance,job,le" +tidb_tikvclient_batch_client_wait_connection_establish,batch client wait new connection establish,,histogram,"instance,job,le" +tidb_tikvclient_batch_executor_token_wait_duration,tidb txn token wait duration to process batches,,histogram,"instance,job,le" +tidb_tikvclient_batch_pending_requests,number of requests pending in the batch channel,,histogram,"instance,job,store,le" +tidb_tikvclient_batch_recv_latency,batch recv latency,,histogram,"instance,job,result,le" +tidb_tikvclient_batch_requests,number of requests in one batch,,histogram,"instance,job,store,le" +tidb_tikvclient_batch_send_latency,batch send latency,,histogram,"instance,job,le" +tidb_tikvclient_batch_wait_duration,batch wait duration,,histogram,"instance,job,le" +tidb_tikvclient_batch_wait_overload,event of tikv transport layer overload,,counter,"instance,job" +tidb_tikvclient_commit_txn_counter,Counter of 2PC transactions.,,counter,"instance,job,type" +tidb_tikvclient_cop_duration_seconds,"Run duration of a single coprocessor task, includes backoff time.",,histogram,"instance,job,scope,stale_read,store,le" +tidb_tikvclient_gc_config,Gauge of GC configs.,,gauge,"instance,job,type" +tidb_tikvclient_gc_region_too_many_locks,Counter of gc scan lock request more than once in the same region.,,counter,"instance,job" +tidb_tikvclient_gc_worker_actions,Counter of gc worker actions.,,counter,"instance,job,type" +tidb_tikvclient_grpc_connection_state,State of gRPC connection,,gauge,"connection_id,grpc_state,instance,job,store_ip" +tidb_tikvclient_kv_status_api_count,Counter of access kv status api.,,counter,"instance,job,result" +tidb_tikvclient_load_region_cache_seconds,Load region information duration,,histogram,"instance,job,type,le" +tidb_tikvclient_load_safepoint,Counter of load safepoint.,,counter,"instance,job,type" +tidb_tikvclient_local_latch_wait_seconds,Wait time of a get local latch.,,histogram,"instance,job,le" 
+tidb_tikvclient_lock_cleanup_task,failure statistic of secondary lock cleanup task.,,counter,"instance,job,type" +tidb_tikvclient_lock_resolver_actions,Counter of lock resolver actions.,,counter,"instance,job,type" +tidb_tikvclient_min_safets_gap_seconds,The minimal (non-zero) SafeTS gap for each store.,,gauge,"instance,job,store" +tidb_tikvclient_one_pc_txn_counter,Counter of 1PC transactions.,,counter,"instance,job,type" +tidb_tikvclient_pessimistic_lock_keys_duration,tidb txn pessimistic lock keys duration,,histogram,"instance,job,le" +tidb_tikvclient_prefer_leader_flows_gauge,Counter of flows under PreferLeader mode.,,gauge,"instance,job,store,type" +tidb_tikvclient_prewrite_assertion_count,Counter of assertions used in prewrite requests,,counter,"instance,job,type" +tidb_tikvclient_rawkv_cmd_seconds,Bucketed histogram of processing time of rawkv cmds.,,histogram,"instance,job,type,le" +tidb_tikvclient_rawkv_kv_size_bytes,"Size of key/value to put, in bytes.",,histogram,"instance,job,type,le" +tidb_tikvclient_region_cache_operations,Counter of region cache.,,counter,"instance,job,result,type" +tidb_tikvclient_region_err,Counter of region errors.,,counter,"instance,job,scope,type" +tidb_tikvclient_replica_selector_failure_counter,Counter of the reason why the replica selector cannot yield a potential leader.,,counter,"instance,job,type" +tidb_tikvclient_request_counter,Counter of sending request with multi dimensions.,,counter,"instance,job,scope,source,stale_read,store,type" +tidb_tikvclient_request_retry_times,Bucketed histogram of how many times a region request retries.,,histogram,"instance,job,le" +tidb_tikvclient_request_seconds,Bucketed histogram of sending request duration.,,histogram,"instance,job,scope,stale_read,store,type,le" +tidb_tikvclient_request_time_counter,Counter of request time with multi dimensions.,,counter,"instance,job,scope,source,stale_read,store,type" +tidb_tikvclient_rpc_net_latency_seconds,Bucketed histogram of time difference between TiDB and TiKV.,,histogram,"instance,job,scope,store,le" +tidb_tikvclient_safets_update_counter,Counter of tikv safe_ts being updated.,,counter,"instance,job,result,store" +tidb_tikvclient_stale_read_bytes,Counter of stale read requests bytes,,counter,"direction,instance,job,result" +tidb_tikvclient_stale_read_counter,Counter of stale read hit/miss,,counter,"instance,job,result" +tidb_tikvclient_stale_read_req_counter,Counter of stale read requests,,counter,"instance,job,type" +tidb_tikvclient_store_slow_score,Slow scores of each tikv node based on RPC timecosts,,gauge,"instance,job,store" +tidb_tikvclient_ts_future_wait_seconds,Bucketed histogram of seconds cost for waiting timestamp future.,,histogram,"instance,job,le" +tidb_tikvclient_ttl_lifetime_reach,Counter of ttlManager live too long.,,counter,"instance,job" +tidb_tikvclient_txn_cmd_duration_seconds,Bucketed histogram of processing time of txn cmds.,,histogram,"instance,job,scope,type,le" +tidb_tikvclient_txn_commit_backoff_count,Bucketed histogram of the backoff count in committing a transaction.,,histogram,"instance,job,le" +tidb_tikvclient_txn_commit_backoff_seconds,Bucketed histogram of the total backoff duration in committing a transaction.,,histogram,"instance,job,le" +tidb_tikvclient_txn_heart_beat,Bucketed histogram of the txn_heartbeat request duration.,,histogram,"instance,job,type,le" +tidb_tikvclient_txn_regions_num,Number of regions in a transaction.,,histogram,"instance,job,scope,type,le" +tidb_tikvclient_txn_ttl_manager,Bucketed histogram of the txn ttl 
manager lifetime duration.,,histogram,"instance,job,le" +tidb_tikvclient_txn_write_kv_num,Count of kv pairs to write in a transaction.,,histogram,"instance,job,scope,le" +tidb_tikvclient_txn_write_size_bytes,Size of kv pairs to write in a transaction.,,histogram,"instance,job,scope,le" +tidb_topsql_ignored,"Counter of ignored top-sql metrics (register-sql, register-plan, collect-data and report-data), normally it should be 0.",,counter,"instance,job,type" +tidb_topsql_report_data_total,Bucket histogram of reporting records/sql/plan count to the top-sql agent.,,histogram,"instance,job,type,le" +tidb_topsql_report_duration_seconds,Bucket histogram of reporting time (s) to the top-sql agent,,histogram,"instance,job,result,type,le" +tso_monitor_time_jump_back,Counter of system time jumps backward.,,counter,"instance,job" +tso_server_handle_tso_duration_seconds,Bucketed histogram of processing time (s) of handled tso requests.,,histogram,"instance,job,le" +tso_server_handle_tso_proxy_batch_size,Bucketed histogram of the batch size of handled tso proxy requests.,,histogram,"instance,job,le" +tso_server_handle_tso_proxy_duration_seconds,Bucketed histogram of processing time (s) of handled tso proxy requests.,,histogram,"instance,job,le"
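
The catalog above lists every series this example scrapes, which can be more than you want to export. A minimal sketch of one way to narrow it, assuming your Collector build includes the contrib filter processor and that its include/metric_names syntax is available in your version; the `filter/tidb` name and the regex list are illustrative placeholders, not part of the shipped example:

```yaml
processors:
  # Keep only the metric families of interest; everything else is dropped
  # inside the Collector before it reaches any exporter.
  filter/tidb:
    metrics:
      include:
        match_type: regexp
        metric_names:
          - tidb_server_.*
          - tidb_session_.*
          - pd_scheduler_.*
# To take effect, the processor must also be added ahead of batching in the
# metrics pipeline's processors list of the Collector configuration.
```

Filtering early like this keeps exporter traffic and downstream time-series counts proportional to the metrics you actually chart, at the cost of having to revisit the regexes when you need a family you previously dropped.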