From fb919bd2da764fb8ea237eb5620df9f6febff384 Mon Sep 17 00:00:00 2001 From: ricoberger Date: Sat, 4 Sep 2021 13:29:33 +0200 Subject: [PATCH] Improve query performance for ClickHouse Queries to get logs from ClickHouse over a large time range should now be faster. To achieve this we now return a maximum of roughly 10000 documents from ClickHouse. This required removing the stats endpoint, which returned the overall document count and the buckets data; both are now part of the documents endpoint. With the help of the retrieved buckets we modify the start time of a query, so that large queries become faster. More information about this new approach can be found in the code as an inline comment. For example, queries to get all logs from the last 7 days, which took up to 3 minutes, now take 10 seconds. Note: We decided on the 10000 limit because the default limit in Kibana is 500, so it should be large enough. In the future we can also provide an option in the ClickHouse configuration or an additional field in the Options component to increase this limit. --- CHANGELOG.md | 1 + plugins/clickhouse/clickhouse.go | 54 +------ plugins/clickhouse/pkg/instance/instance.go | 143 ++++++++++-------- .../clickhouse/src/components/page/Logs.tsx | 38 +++-- .../clickhouse/src/components/panel/Logs.tsx | 18 +-- .../src/components/panel/LogsStats.tsx | 116 -------------- plugins/clickhouse/src/utils/interfaces.ts | 8 +- 7 files changed, 120 insertions(+), 258 deletions(-) delete mode 100644 plugins/clickhouse/src/components/panel/LogsStats.tsx diff --git a/CHANGELOG.md b/CHANGELOG.md index 14e199d8f..ed043b140 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -46,6 +46,7 @@ NOTE: As semantic versioning states all 0.y.z releases can contain breaking chan - [#108](https://github.com/kobsio/kobs/pull/108): Improve tooltip position in all nivo charts. - [#121](https://github.com/kobsio/kobs/pull/121): :warning: *Breaking change:* :warning: Allow multiple queries in the panel options for the Elasticsearch plugin. - [#130](https://github.com/kobsio/kobs/pull/130): :warning: *Breaking change:* :warning: Allow multiple queries in the panel options for the Jaeger plugin. +- [#133](https://github.com/kobsio/kobs/pull/133): Improve query performance for getting logs from ClickHouse. 
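To illustrate the approach described in the commit message and in the inline comments further down: a minimal, self-contained Go sketch of how the new start time can be derived from the per-bucket counts. The Bucket fields and the 10000 threshold follow this patch; the function name and the example data are hypothetical.

package main

import (
	"fmt"
	"time"
)

// Bucket mirrors the shape used in this patch: the start of a time interval and
// the number of log documents that fall into it.
type Bucket struct {
	Interval time.Time
	Count    int64
}

// adjustedStartTime walks the buckets from newest to oldest and returns the start
// of the bucket at which the accumulated document count exceeds the limit. If the
// limit is never reached, the original start time is kept.
func adjustedStartTime(buckets []Bucket, timeStart, limit int64) int64 {
	var sum int64
	for i := len(buckets) - 1; i >= 0; i-- {
		sum += buckets[i].Count
		if sum > limit {
			return buckets[i].Interval.Unix()
		}
	}
	return timeStart
}

func main() {
	now := time.Now()
	// Three example buckets, sorted oldest first, as returned by the buckets query.
	buckets := []Bucket{
		{Interval: now.Add(-3 * time.Hour), Count: 8000},
		{Interval: now.Add(-2 * time.Hour), Count: 7000},
		{Interval: now.Add(-1 * time.Hour), Count: 6000},
	}
	// 6000 + 7000 > 10000, so the documents query only has to scan the last two hours.
	fmt.Println(adjustedStartTime(buckets, now.Add(-3*time.Hour).Unix(), 10000))
}

Since the patch always creates 30 buckets, a 7 day range uses buckets of roughly 5.6 hours each (7 * 24 / 30), so the adjusted start time is never more than one bucket older than the point where the 10000 document limit is reached.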
## [v0.5.0](https://github.com/kobsio/kobs/releases/tag/v0.5.0) (2021-08-03) diff --git a/plugins/clickhouse/clickhouse.go b/plugins/clickhouse/clickhouse.go index 2d1591f90..4993b1b85 100644 --- a/plugins/clickhouse/clickhouse.go +++ b/plugins/clickhouse/clickhouse.go @@ -124,7 +124,7 @@ func (router *Router) getLogs(w http.ResponseWriter, r *http.Request) { return } - documents, fields, took, newOffset, err := i.GetLogs(r.Context(), query, parsedLimit, parsedOffset, parsedTimeStart, parsedTimeEnd) + documents, fields, count, took, buckets, newOffset, newTimeStart, err := i.GetLogs(r.Context(), query, parsedLimit, parsedOffset, parsedTimeStart, parsedTimeEnd) if err != nil { errresponse.Render(w, r, err, http.StatusBadRequest, "Could not get logs") return @@ -133,56 +133,19 @@ func (router *Router) getLogs(w http.ResponseWriter, r *http.Request) { data := struct { Documents []map[string]interface{} `json:"documents"` Fields []string `json:"fields"` + Count int64 `json:"count"` Took int64 `json:"took"` + Buckets []instance.Bucket `json:"buckets"` Offset int64 `json:"offset"` + TimeStart int64 `json:"timeStart"` }{ documents, fields, - took, - newOffset, - } - - render.JSON(w, r, data) -} - -func (router *Router) getLogsStats(w http.ResponseWriter, r *http.Request) { - name := chi.URLParam(r, "name") - query := r.URL.Query().Get("query") - timeStart := r.URL.Query().Get("timeStart") - timeEnd := r.URL.Query().Get("timeEnd") - - log.WithFields(logrus.Fields{"name": name, "query": query, "timeStart": timeStart, "timeEnd": timeEnd}).Tracef("getLogsCount") - - i := router.getInstance(name) - if i == nil { - errresponse.Render(w, r, nil, http.StatusBadRequest, "Could not find instance name") - return - } - - parsedTimeStart, err := strconv.ParseInt(timeStart, 10, 64) - if err != nil { - errresponse.Render(w, r, err, http.StatusBadRequest, "Could not parse start time") - return - } - - parsedTimeEnd, err := strconv.ParseInt(timeEnd, 10, 64) - if err != nil { - errresponse.Render(w, r, err, http.StatusBadRequest, "Could not parse end time") - return - } - - count, buckets, err := i.GetLogsStats(r.Context(), query, parsedTimeStart, parsedTimeEnd) - if err != nil { - errresponse.Render(w, r, err, http.StatusBadRequest, "Could not get logs count") - return - } - - data := struct { - Count int64 `json:"count"` - Buckets []instance.Bucket `json:"buckets"` - }{ count, + took, buckets, + newOffset, + newTimeStart, } render.JSON(w, r, data) @@ -220,8 +183,7 @@ func Register(clusters *clusters.Clusters, plugins *plugin.Plugins, config Confi } router.Get("/sql/{name}", router.getSQL) - router.Get("/logs/documents/{name}", router.getLogs) - router.Get("/logs/stats/{name}", router.getLogsStats) + router.Get("/logs/{name}", router.getLogs) return router } diff --git a/plugins/clickhouse/pkg/instance/instance.go b/plugins/clickhouse/pkg/instance/instance.go index 21714ea15..4ccbf40c3 100644 --- a/plugins/clickhouse/pkg/instance/instance.go +++ b/plugins/clickhouse/pkg/instance/instance.go @@ -73,7 +73,9 @@ func (i *Instance) GetSQL(ctx context.Context, query string) ([][]interface{}, [ // GetLogs parses the given query into the sql syntax, which is then run against the ClickHouse instance. The returned // rows are converted into a document schema which can be used by our UI. 
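With the stats endpoint removed, a client now obtains the documents, the total count and the buckets from the single logs endpoint shown above. Below is a hypothetical Go client sketch for the consolidated response; the field names mirror the handler struct in this patch, while the base URL, port and example parameters are assumptions.

package main

import (
	"encoding/json"
	"fmt"
	"net/http"
	"net/url"
)

// logsResponse mirrors the JSON struct rendered by the getLogs handler above.
// The exact JSON shape of a bucket is defined by instance.Bucket, so it is kept
// as a raw message here.
type logsResponse struct {
	Documents []map[string]interface{} `json:"documents"`
	Fields    []string                 `json:"fields"`
	Count     int64                    `json:"count"`
	Took      int64                    `json:"took"`
	Buckets   json.RawMessage          `json:"buckets"`
	Offset    int64                    `json:"offset"`
	TimeStart int64                    `json:"timeStart"`
}

func main() {
	params := url.Values{}
	params.Set("query", "namespace='kube-system'")
	params.Set("timeStart", "1630226973")
	params.Set("timeEnd", "1630831773")
	params.Set("limit", "100")
	params.Set("offset", "")

	// Hypothetical base URL; the kobs UI calls /api/plugins/clickhouse/logs/{name}.
	resp, err := http.Get("http://localhost:15220/api/plugins/clickhouse/logs/clickhouse?" + params.Encode())
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var data logsResponse
	if err := json.NewDecoder(resp.Body).Decode(&data); err != nil {
		panic(err)
	}

	// offset and timeStart are passed back on "load more" requests, so the count
	// and bucket queries only run for the first page.
	fmt.Printf("%d documents in %d ms\n", data.Count, data.Took)
}

The timeStart in the response is what allows the UI's getNextPageParam change later in this patch to reuse the adjusted start time for subsequent pages.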
-func (i *Instance) GetLogs(ctx context.Context, query string, limit, offset, timeStart, timeEnd int64) ([]map[string]interface{}, []string, int64, int64, error) { +func (i *Instance) GetLogs(ctx context.Context, query string, limit, offset, timeStart, timeEnd int64) ([]map[string]interface{}, []string, int64, int64, []Bucket, int64, int64, error) { + var count int64 + var buckets []Bucket var documents []map[string]interface{} fields := defaultFields queryStartTime := time.Now() @@ -85,12 +87,80 @@ func (i *Instance) GetLogs(ctx context.Context, query string, limit, offset, tim if query != "" { parsedQuery, err := parseLogsQuery(query) if err != nil { - return nil, nil, 0, offset, err + return nil, nil, 0, 0, nil, offset, timeStart, err } conditions = fmt.Sprintf("AND %s", parsedQuery) } + // The count of documents and the buckets are only needed for the first query where the offset is 0. For the + // following queries we can reuse the data returned by the first query, because the number of documents shouldn't + // change in the selected time range. + if offset == 0 { + // Determine the number of documents which are available in the user's selected time range. We are using the same + // query as for getting the documents, but we are skipping the limit and offset parameters. + sqlQueryCount := fmt.Sprintf("SELECT count(*) FROM %s.logs WHERE timestamp >= ? AND timestamp <= ? %s SETTINGS skip_unavailable_shards = 1", i.database, conditions) + log.WithFields(logrus.Fields{"query": sqlQueryCount, "timeStart": timeStart, "timeEnd": timeEnd}).Tracef("sql count query") + rowsCount, err := i.client.QueryContext(ctx, sqlQueryCount, time.Unix(timeStart, 0), time.Unix(timeEnd, 0)) + if err != nil { + return nil, nil, 0, 0, nil, offset, timeStart, err + } + defer rowsCount.Close() + + for rowsCount.Next() { + if err := rowsCount.Scan(&count); err != nil { + return nil, nil, 0, 0, nil, offset, timeStart, err + } + } + + // Now we are creating 30 buckets for the selected time range and counting the documents in each bucket. This is used + // to render the distribution chart, which shows how many documents/rows are available within a bucket. + interval := (timeEnd - timeStart) / 30 + sqlQueryBuckets := fmt.Sprintf("SELECT toStartOfInterval(timestamp, INTERVAL %d second) AS interval_data , count(*) AS count_data FROM %s.logs WHERE timestamp >= ? AND timestamp <= ? %s GROUP BY interval_data SETTINGS skip_unavailable_shards = 1", interval, i.database, conditions) + log.WithFields(logrus.Fields{"query": sqlQueryBuckets, "timeStart": timeStart, "timeEnd": timeEnd}).Tracef("sql buckets query") + rowsBuckets, err := i.client.QueryContext(ctx, sqlQueryBuckets, time.Unix(timeStart, 0), time.Unix(timeEnd, 0)) + if err != nil { + return nil, nil, 0, 0, nil, offset, timeStart, err + } + defer rowsBuckets.Close() + + for rowsBuckets.Next() { + var intervalData time.Time + var countData int64 + + if err := rowsBuckets.Scan(&intervalData, &countData); err != nil { + return nil, nil, 0, 0, nil, offset, timeStart, err + } + + buckets = append(buckets, Bucket{ + Interval: intervalData, + IntervalFormatted: intervalData.Format("01-02 15:04:05"), + Count: countData, + }) + } + + sort.Slice(buckets, func(i, j int) bool { + return buckets[i].Interval.Before(buckets[j].Interval) + }) + + // We are only returning the first 10000 documents in chunks of the given limit, to speed up the following + // query to get the documents. 
For that we are looping through the sorted buckets and using the timestamp from + // the bucket where the sum of all newer buckets exceeds 10000 documents. + // This new start time is then also returned in the response and can be used for the "load more" call as the new + // start date. In these follow-up calls the start time isn't changed again, because we are skipping the count + // and bucket queries. + // NOTE: If a user has problems with this limit in the future, we can provide an option for this via the + // config.yaml file or maybe even better via an additional field in the Options component in the React UI. + var bucketCount int64 + for i := len(buckets) - 1; i >= 0; i-- { + bucketCount = bucketCount + buckets[i].Count + if bucketCount > 10000 { + timeStart = buckets[i].Interval.Unix() + break + } + } + } + // Now we are building and executing our sql query. We always return all fields from the logs table, where the // timestamp of a row is within the selected query range and the parsed query. We also order all the results by the // timestamp field and limiting the results / using a offset for pagination. @@ -98,7 +168,7 @@ func (i *Instance) GetLogs(ctx context.Context, query string, limit, offset, tim log.WithFields(logrus.Fields{"query": sqlQuery, "timeStart": timeStart, "timeEnd": timeEnd}).Tracef("sql query") rows, err := i.client.QueryContext(ctx, sqlQuery, time.Unix(timeStart, 0), time.Unix(timeEnd, 0)) if err != nil { - return nil, nil, 0, offset, err + return nil, nil, 0, 0, nil, offset, timeStart, err } defer rows.Close() @@ -111,7 +181,7 @@ func (i *Instance) GetLogs(ctx context.Context, query string, limit, offset, tim for rows.Next() { var r Row if err := rows.Scan(&r.Timestamp, &r.Cluster, &r.Namespace, &r.App, &r.Pod, &r.Container, &r.Host, &r.FieldsString.Key, &r.FieldsString.Value, &r.FieldsNumber.Key, &r.FieldsNumber.Value, &r.Log); err != nil { - return nil, nil, 0, offset, err + return nil, nil, 0, 0, nil, offset, timeStart, err } var document map[string]interface{} @@ -145,72 +215,11 @@ func (i *Instance) GetLogs(ctx context.Context, query string, limit, offset, tim } if err := rows.Err(); err != nil { - return nil, nil, 0, offset, err + return nil, nil, 0, 0, nil, offset, timeStart, err } sort.Strings(fields) - return documents, fields, time.Now().Sub(queryStartTime).Milliseconds(), offset + limit, nil -} - -// GetLogsStats returns the number of documents, which could be returned by the user provided query and the distribution -// of the logs over the selected time range. -func (i *Instance) GetLogsStats(ctx context.Context, query string, timeStart, timeEnd int64) (int64, []Bucket, error) { - var count int64 - var buckets []Bucket - - conditions := "" - if query != "" { - parsedQuery, err := parseLogsQuery(query) - if err != nil { - return 0, nil, err - } - - conditions = fmt.Sprintf("AND %s", parsedQuery) - } - - sqlQueryCount := fmt.Sprintf("SELECT count(*) FROM %s.logs WHERE timestamp >= ? AND timestamp <= ?
%s SETTINGS skip_unavailable_shards = 1", i.database, conditions) - log.WithFields(logrus.Fields{"query": sqlQueryCount, "timeStart": timeStart, "timeEnd": timeEnd}).Tracef("sql count query") - rowsCount, err := i.client.QueryContext(ctx, sqlQueryCount, time.Unix(timeStart, 0), time.Unix(timeEnd, 0)) - if err != nil { - return 0, nil, err - } - defer rowsCount.Close() - - for rowsCount.Next() { - if err := rowsCount.Scan(&count); err != nil { - return 0, nil, err - } - } - - interval := (timeEnd - timeStart) / 30 - sqlQueryBuckets := fmt.Sprintf("SELECT toStartOfInterval(timestamp, INTERVAL %d second) AS interval_data , count(*) AS count_data FROM %s.logs WHERE timestamp >= ? AND timestamp <= ? %s GROUP BY interval_data SETTINGS skip_unavailable_shards = 1", interval, i.database, conditions) - log.WithFields(logrus.Fields{"query": sqlQueryBuckets, "timeStart": timeStart, "timeEnd": timeEnd}).Tracef("sql buckets query") - rowsBuckets, err := i.client.QueryContext(ctx, sqlQueryBuckets, time.Unix(timeStart, 0), time.Unix(timeEnd, 0)) - if err != nil { - return 0, nil, err - } - defer rowsBuckets.Close() - - for rowsBuckets.Next() { - var intervalData time.Time - var countData int64 - - if err := rowsBuckets.Scan(&intervalData, &countData); err != nil { - return 0, nil, err - } - - buckets = append(buckets, Bucket{ - Interval: intervalData, - IntervalFormatted: intervalData.Format("01-02 15:04:05"), - Count: countData, - }) - } - - sort.Slice(buckets, func(i, j int) bool { - return buckets[i].Interval.Before(buckets[j].Interval) - }) - - return count, buckets, nil + return documents, fields, count, time.Now().Sub(queryStartTime).Milliseconds(), buckets, offset + limit, timeStart, nil } // New returns a new ClickHouse instance for the given configuration. diff --git a/plugins/clickhouse/src/components/page/Logs.tsx b/plugins/clickhouse/src/components/page/Logs.tsx index cc34b1682..36e870e95 100644 --- a/plugins/clickhouse/src/components/page/Logs.tsx +++ b/plugins/clickhouse/src/components/page/Logs.tsx @@ -5,7 +5,11 @@ import { Button, ButtonVariant, Card, + CardActions, CardBody, + CardHeader, + CardHeaderMain, + CardTitle, Grid, GridItem, Spinner, @@ -16,9 +20,9 @@ import { useHistory } from 'react-router-dom'; import { ILogsData } from '../../utils/interfaces'; import { IPluginTimes } from '@kobsio/plugin-core'; +import LogsChart from '../panel/LogsChart'; import LogsDocuments from '../panel/LogsDocuments'; import LogsFields from './LogsFields'; -import LogsStats from '../panel/LogsStats'; interface IPageLogsProps { name: string; @@ -43,10 +47,11 @@ const PageLogs: React.FunctionComponent = ({ ['clickhouse/logs', query, times], async ({ pageParam }) => { try { + console.log(pageParam); const response = await fetch( - `/api/plugins/clickhouse/logs/documents/${name}?query=${encodeURIComponent(query)}&timeStart=${ - times.timeStart - }&timeEnd=${times.timeEnd}&limit=100&offset=${pageParam || ''}`, + `/api/plugins/clickhouse/logs/${name}?query=${encodeURIComponent(query)}&timeStart=${ + pageParam && pageParam.timeStart ? pageParam.timeStart : times.timeStart + }&timeEnd=${times.timeEnd}&limit=100&offset=${pageParam && pageParam.offset ? 
pageParam.offset : ''}`, { method: 'get', }, @@ -67,7 +72,9 @@ const PageLogs: React.FunctionComponent = ({ } }, { - getNextPageParam: (lastPage, pages) => lastPage.offset, + getNextPageParam: (lastPage, pages) => { + return { offset: lastPage.offset, timeStart: lastPage.timeStart }; + }, keepPreviousData: true, }, ); @@ -111,14 +118,19 @@ const PageLogs: React.FunctionComponent = ({ - + + + + + {data.pages[0].count} Documents in {data.pages[0].took} Milliseconds + + + {isFetching && } + + + + +

 

diff --git a/plugins/clickhouse/src/components/panel/Logs.tsx b/plugins/clickhouse/src/components/panel/Logs.tsx index 029ab15aa..6fce4367e 100644 --- a/plugins/clickhouse/src/components/panel/Logs.tsx +++ b/plugins/clickhouse/src/components/panel/Logs.tsx @@ -16,8 +16,8 @@ import React, { useState } from 'react'; import { ILogsData, IQuery } from '../../utils/interfaces'; import { IPluginTimes, PluginCard } from '@kobsio/plugin-core'; import LogsActions from './LogsActions'; +import LogsChart from './LogsChart'; import LogsDocuments from '../panel/LogsDocuments'; -import LogsStats from './LogsStats'; interface ILogsProps { name: string; @@ -50,7 +50,7 @@ const Logs: React.FunctionComponent = ({ } const response = await fetch( - `/api/plugins/clickhouse/logs/documents/${name}?query=${encodeURIComponent(selectedQuery.query)}&timeStart=${ + `/api/plugins/clickhouse/logs/${name}?query=${encodeURIComponent(selectedQuery.query)}&timeStart=${ times.timeStart }&timeEnd=${times.timeEnd}&limit=100&offset=${pageParam || ''}`, { @@ -138,15 +138,11 @@ const Logs: React.FunctionComponent = ({ ) : data && data.pages.length > 0 ? (
- {showChart && selectedQuery.query ? ( - + {showChart ? ( +
+ +

 

+
) : null} diff --git a/plugins/clickhouse/src/components/panel/LogsStats.tsx b/plugins/clickhouse/src/components/panel/LogsStats.tsx deleted file mode 100644 index 6140430ad..000000000 --- a/plugins/clickhouse/src/components/panel/LogsStats.tsx +++ /dev/null @@ -1,116 +0,0 @@ -import { - Alert, - AlertActionLink, - AlertVariant, - Card, - CardActions, - CardBody, - CardHeader, - CardHeaderMain, - CardTitle, - Spinner, -} from '@patternfly/react-core'; -import { QueryObserverResult, useQuery } from 'react-query'; -import React from 'react'; - -import { ILogsStats } from '../../utils/interfaces'; -import { IPluginTimes } from '@kobsio/plugin-core'; -import LogsChart from './LogsChart'; - -interface ILogsStatsProps { - name: string; - query: string; - times: IPluginTimes; - took: number; - isFetchingDocuments: boolean; - isPanel: boolean; -} - -const LogsStats: React.FunctionComponent = ({ - name, - query, - times, - took, - isFetchingDocuments, - isPanel, -}: ILogsStatsProps) => { - const { isError, isLoading, data, error, refetch } = useQuery( - ['clickhouse/logsstats', query, times], - async () => { - try { - const response = await fetch( - `/api/plugins/clickhouse/logs/stats/${name}?query=${encodeURIComponent(query)}&timeStart=${ - times.timeStart - }&timeEnd=${times.timeEnd}`, - { - method: 'get', - }, - ); - const json = await response.json(); - - if (response.status >= 200 && response.status < 300) { - return json; - } else { - if (json.error) { - throw new Error(json.error); - } else { - throw new Error('An unknown error occured'); - } - } - } catch (err) { - throw err; - } - }, - ); - - if (isPanel) { - if (data && data.buckets) { - return ( -
- -

 

-
- ); - } else { - return null; - } - } - - return ( - - - - - {data && data.count ? data.count : 0} Documents in {took} Milliseconds - - - {isFetchingDocuments && } - - - {isLoading ? ( -
- -
- ) : isError ? ( - - > => refetch()}> - Retry - - - } - > -

{error?.message}

-
- ) : data ? ( - - ) : null} -
-
- ); -}; - -export default LogsStats; diff --git a/plugins/clickhouse/src/utils/interfaces.ts b/plugins/clickhouse/src/utils/interfaces.ts index df4f246a1..9574cf3f4 100644 --- a/plugins/clickhouse/src/utils/interfaces.ts +++ b/plugins/clickhouse/src/utils/interfaces.ts @@ -25,9 +25,12 @@ export interface IQuery { // ILogsData is the interface of the data returned from our Go API for the logs view of the ClickHouse plugin. export interface ILogsData { offset: number; + timeStart: number; + count?: number; took?: number; fields?: string[]; documents?: IDocument[]; + buckets?: IBucket[]; } export interface IDocument { @@ -35,11 +38,6 @@ export interface IDocument { [key: string]: any; } -export interface ILogsStats { - count?: number; - buckets?: IBucket[]; -} - export interface IBucket extends BarDatum { interval: string; count: number;