In [None]:
# Install packages

!pip install elasticsearch

Collecting elasticsearch
  Downloading elasticsearch-8.12.0-py3-none-any.whl (431 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 431.9/431.9 kB 2.6 MB/s eta 0:00:00
Collecting elastic-transport<9,>=8 (from elasticsearch)
  Downloading elastic_transport-8.12.0-py3-none-any.whl (59 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 59.9/59.9 kB 2.1 MB/s eta 0:00:00
Installing collected packages: elastic-transport, elasticsearch
Successfully installed elastic-transport-8.12.0 elasticsearch-8.12.0


In [None]:
# Import packages

from elasticsearch import Elasticsearch
from elasticsearch.client import EsqlClient
from getpass import getpass

In [None]:
# Connect to Elasticsearch and create client instance

# Both credentials are read with getpass so the typed values are not echoed
# into the cell output.
# https://www.elastic.co/search-labs/tutorials/install-elasticsearch/elastic-cloud#finding-your-cloud-id
ELASTIC_CLOUD_ID = getpass("Elastic Cloud ID: ")

# https://www.elastic.co/search-labs/tutorials/install-elasticsearch/elastic-cloud#creating-an-api-key
ELASTIC_API_KEY = getpass("Elastic API Key: ")

# Create the client instance; `client` is reused by the ES|QL client and by the
# enrich-policy setup cell further down.
client = Elasticsearch(
    # For local development, use the commented `hosts` argument below
    # in place of the Cloud credentials:
    # hosts=["http://localhost:9200"]
    cloud_id=ELASTIC_CLOUD_ID,
    api_key=ELASTIC_API_KEY,
)

Elastic Cloud ID: ··········
Elastic API Key: ··········


In [None]:
# Create ES|QL client instance

# Wraps the connected `client`; every query cell below calls `esql.query(...)`.
esql = EsqlClient(client)

In [None]:
# Suppress specific Elasticsearch warnings about default limit of [500] that pollute responses

import warnings
from elasticsearch import ElasticsearchWarning

# Only the "default limit" notice is silenced; any other ElasticsearchWarning
# (e.g. deprecations) stays visible. `message` is a case-insensitive regex that
# must match from the start of the warning text, hence the leading ".*".
# The limit value is matched with \d+ so the filter does not depend on the
# exact number reported by the server.
warnings.filterwarnings(
    'ignore',
    message=r".*default limit of \[\d+\]",
    category=ElasticsearchWarning,
)

In [None]:
# Format response to return human-readable tables

def format_response(response_data):
    """Print an ES|QL response as a left-aligned, pipe-separated text table.

    Args:
        response_data: Mapping with a ``columns`` list (each item has a
            ``name`` key) and a ``values`` list of rows, where each row is a
            sequence holding one cell per column.

    Returns:
        None. The header, a dashed separator and one line per row are written
        to standard output.
    """
    column_names = [col['name'] for col in response_data['columns']]

    # Stringify every cell once, up front. A padding spec such as "{:<12}"
    # raises TypeError when applied directly to None (null cells) or to a list
    # (multi-valued cells), so rows must be text before they are formatted.
    rows = [[str(cell) for cell in row] for row in response_data['values']]

    # Each column is as wide as its header or its widest cell, whichever is longer.
    column_widths = [
        max(len(name), max((len(row[i]) for row in rows), default=0))
        for i, name in enumerate(column_names)
    ]
    row_format = " | ".join("{:<" + str(width) + "}" for width in column_widths)

    print(row_format.format(*column_names))
    # Separator spans all columns plus the 3-character " | " joints between them.
    print("-" * sum(column_widths) + "-" * (len(column_widths) - 1) * 3)
    for row in rows:
        print(row_format.format(*row))

In [None]:
# Your first ES|QL query!

# A bare FROM returns the rows of `sample_data` unchanged.
# NOTE(review): no cell visible in this notebook creates or populates
# `sample_data` — presumably it must already exist in the cluster; confirm.
esql_query = "FROM sample_data"

# Send the query and print the returned columns/values as a text table.
response = esql.query(query=esql_query)
format_response(response)

@timestamp               | client_ip    | event_duration | message              
--------------------------------------------------------------------------------
2023-10-23T12:15:03.360Z | 172.21.2.162 | 3450233        | Connected to 10.1.0.3
2023-10-23T12:27:28.948Z | 172.21.2.113 | 2764889        | Connected to 10.1.0.2
2023-10-23T13:33:34.937Z | 172.21.0.5   | 1232382        | Disconnected         
2023-10-23T13:51:54.732Z | 172.21.3.15  | 725448         | Connection error     
2023-10-23T13:52:55.015Z | 172.21.3.15  | 8268153        | Connection error     
2023-10-23T13:53:55.832Z | 172.21.3.15  | 5033755        | Connection error     
2023-10-23T13:55:01.543Z | 172.21.3.15  | 1756467        | Connected to 10.1.0.1


## Processing commands

A source command can be followed by one or more [processing commands](https://www.elastic.co/guide/en/elasticsearch/reference/current/esql-commands.html#esql-processing-commands), separated by a pipe character: `|`. Processing commands change an input table by adding, removing, or changing rows and columns. Processing commands can perform filtering, projection, aggregation, and more.

For example, you can use the `LIMIT` command to limit the number of rows that are returned, up to a maximum of 10,000 rows:

In [None]:
# LIMIT caps the number of returned rows at 3.
esql_query = """
FROM sample_data
| LIMIT 3
"""

# Run the query and print the result as a text table.
response = esql.query(query=esql_query)
format_response(response)

@timestamp               | client_ip    | event_duration | message              
--------------------------------------------------------------------------------
2023-10-23T12:15:03.360Z | 172.21.2.162 | 3450233        | Connected to 10.1.0.3
2023-10-23T12:27:28.948Z | 172.21.2.113 | 2764889        | Connected to 10.1.0.2
2023-10-23T13:33:34.937Z | 172.21.0.5   | 1232382        | Disconnected         


### Sort a table

Another processing command is the [`SORT`](https://www.elastic.co/guide/en/elasticsearch/reference/current/esql-commands.html#esql-sort) command. By default, the rows returned by `FROM` don’t have a defined sort order. Use the `SORT` command to sort rows on one or more columns:

In [None]:
# SORT ... DESC orders the rows by @timestamp, newest first.
esql_query = """
FROM sample_data
| SORT @timestamp DESC
"""

# Run the query and print the result as a text table.
response = esql.query(query=esql_query)
format_response(response)

@timestamp               | client_ip    | event_duration | message              
--------------------------------------------------------------------------------
2023-10-23T13:55:01.543Z | 172.21.3.15  | 1756467        | Connected to 10.1.0.1
2023-10-23T13:53:55.832Z | 172.21.3.15  | 5033755        | Connection error     
2023-10-23T13:52:55.015Z | 172.21.3.15  | 8268153        | Connection error     
2023-10-23T13:51:54.732Z | 172.21.3.15  | 725448         | Connection error     
2023-10-23T13:33:34.937Z | 172.21.0.5   | 1232382        | Disconnected         
2023-10-23T12:27:28.948Z | 172.21.2.113 | 2764889        | Connected to 10.1.0.2
2023-10-23T12:15:03.360Z | 172.21.2.162 | 3450233        | Connected to 10.1.0.3


### Query the data

Use the [`WHERE`](https://www.elastic.co/guide/en/elasticsearch/reference/current/esql-commands.html#esql-where) command to query the data. For example, to find all events with a duration longer than 5ms:

In [None]:
# event_duration is stored in nanoseconds, so 5000000 is the 5 ms threshold.
esql_query = """
FROM sample_data
| WHERE event_duration > 5000000
"""

# Run the query and print the result as a text table.
response = esql.query(query=esql_query)
format_response(response)

@timestamp               | client_ip   | event_duration | message         
--------------------------------------------------------------------------
2023-10-23T13:52:55.015Z | 172.21.3.15 | 8268153        | Connection error
2023-10-23T13:53:55.832Z | 172.21.3.15 | 5033755        | Connection error


`WHERE` supports several [operators](https://www.elastic.co/guide/en/elasticsearch/reference/current/esql-functions-operators.html#esql-operators).

For example, you can use [`LIKE`](https://www.elastic.co/guide/en/elasticsearch/reference/current/esql-functions-operators.html#esql-like-operator) to run a wildcard query against the message column:

In [None]:
# LIKE with the * wildcard keeps rows whose message starts with "Connected".
esql_query = """
FROM sample_data
| WHERE message LIKE "Connected*"
"""

# Run the query and print the result as a text table.
response = esql.query(query=esql_query)
format_response(response)

@timestamp               | client_ip    | event_duration | message              
--------------------------------------------------------------------------------
2023-10-23T12:15:03.360Z | 172.21.2.162 | 3450233        | Connected to 10.1.0.3
2023-10-23T12:27:28.948Z | 172.21.2.113 | 2764889        | Connected to 10.1.0.2
2023-10-23T13:55:01.543Z | 172.21.3.15  | 1756467        | Connected to 10.1.0.1


### More processing commands

There are many other processing commands, like [`KEEP`](https://www.elastic.co/guide/en/elasticsearch/reference/current/esql-commands.html#esql-keep "KEEP") and [`DROP`](https://www.elastic.co/guide/en/elasticsearch/reference/current/esql-commands.html#esql-drop "DROP") to keep or drop columns, [`ENRICH`](https://www.elastic.co/guide/en/elasticsearch/reference/current/esql-commands.html#esql-enrich "ENRICH") to enrich a table with data from indices in Elasticsearch, and [`DISSECT`](https://www.elastic.co/guide/en/elasticsearch/reference/current/esql-commands.html#esql-dissect "DISSECT") and [`GROK`](https://www.elastic.co/guide/en/elasticsearch/reference/current/esql-commands.html#esql-grok "GROK") to process data. Refer to [Processing commands](https://www.elastic.co/guide/en/elasticsearch/reference/current/esql-commands.html#esql-processing-commands "Processing commands") for an overview.

## Chain processing commands

You can chain processing commands, separated by a pipe character: `|`. Each
processing command works on the output table of the previous command. The result
of a query is the table produced by the final processing command.

The following example first sorts the table on `@timestamp`, and next limits the
result set to 3 rows:

In [None]:
# Command order matters: sort newest-first, then keep the first 3 rows of the
# sorted table.
esql_query = """
FROM sample_data
| SORT @timestamp DESC
| LIMIT 3
"""

# Run the query and print the result as a text table.
response = esql.query(query=esql_query)
format_response(response)

> ℹ️ The order of processing commands is important. First limiting the result set to 3 rows before sorting those 3 rows would most likely return a result that is different than this example, where the sorting comes before the limit.

## Compute values

Use the [`EVAL`](https://www.elastic.co/guide/en/elasticsearch/reference/current/esql-commands.html#esql-eval "EVAL") command to append columns to a table, with calculated values. For example, the following query appends a `duration_ms` column. The values in the column are computed by dividing `event_duration` by 1,000,000. In other words: `event_duration` converted from nanoseconds to milliseconds.

In [None]:
# EVAL appends duration_ms: event_duration (nanoseconds) divided by 1000000.0
# gives milliseconds.
esql_query = """
FROM sample_data
| EVAL duration_ms = event_duration/1000000.0
"""

# Run the query and print the result as a text table.
response = esql.query(query=esql_query)
format_response(response)

`EVAL` supports several [functions](https://www.elastic.co/guide/en/elasticsearch/reference/current/esql-functions-operators.html#esql-functions). For example, to round a number to the closest number with the specified number of digits, use the [`ROUND`](https://www.elastic.co/guide/en/elasticsearch/reference/current/esql-functions-operators.html#esql-round "ROUND") function:

In [None]:
# Same nanoseconds-to-milliseconds conversion, rounded to 1 decimal digit.
esql_query = """
FROM sample_data
| EVAL duration_ms = ROUND(event_duration/1000000.0, 1)
"""

# Run the query and print the result as a text table.
response = esql.query(query=esql_query)
format_response(response)

## Calculate statistics

You can also use ES|QL to aggregate your data. Use the [`STATS ... BY`](https://www.elastic.co/guide/en/elasticsearch/reference/current/esql-commands.html#esql-stats-by "STATS ... BY") command to calculate statistics.

For example, to calculate the median duration:

In [None]:
# STATS computes the median of event_duration and names the result
# median_duration.
esql_query = """
FROM sample_data
| STATS median_duration = MEDIAN(event_duration)
"""

# Run the query and print the result as a text table.
response = esql.query(query=esql_query)
format_response(response)

You can calculate multiple stats with one command:

In [None]:
# Several aggregations can be listed in one STATS command, separated by commas.
esql_query = """
FROM sample_data
| STATS median_duration = MEDIAN(event_duration), max_duration = MAX(event_duration)
"""

# Run the query and print the result as a text table.
response = esql.query(query=esql_query)
format_response(response)

Use `BY` to group calculated stats by one or more columns. For example, to calculate the median duration per client IP:

In [None]:
# BY client_ip groups the aggregation: one median_duration per client IP.
esql_query = """
FROM sample_data
| STATS median_duration = MEDIAN(event_duration) BY client_ip
"""

# Run the query and print the result as a text table.
response = esql.query(query=esql_query)
format_response(response)

## Access columns

You can access columns by their name. If a name contains special characters, [it needs to be quoted](https://www.elastic.co/guide/en/elasticsearch/reference/current/esql-syntax.html#esql-identifiers "Identifiers") with backticks (`` ` ``).

Assigning an explicit name to a column created by `EVAL` or `STATS` is optional. If you don’t provide a name, the new column name is equal to the function expression. For example:

In [None]:
# No column name is assigned here, so the new column is named after the
# expression itself: event_duration/1000000.0
esql_query = """
FROM sample_data
| EVAL event_duration/1000000.0
"""

# Run the query and print the result as a text table.
response = esql.query(query=esql_query)
format_response(response)

In this query, `EVAL` adds a new column named `event_duration/1000000.0`. Because its name contains special characters, to access this column, quote it with backticks:

In [None]:
# The auto-generated column name contains special characters, so STATS has to
# reference it quoted in backticks.
esql_query = """
FROM sample_data
| EVAL event_duration/1000000.0
| STATS MEDIAN(`event_duration/1000000.0`)
"""
# Run the query and print the result as a text table.
response = esql.query(query=esql_query)
format_response(response)

## Create a histogram

To track statistics over time, ES|QL enables you to create histograms using the [`AUTO_BUCKET`](https://www.elastic.co/guide/en/elasticsearch/reference/current/esql-functions-operators.html#esql-auto_bucket "AUTO_BUCKET") function. `AUTO_BUCKET` creates human-friendly bucket sizes and returns a value for each row that corresponds to the resulting bucket the row falls into.

For example, to create hourly buckets for the data on October 23rd:

In [None]:
# Keep only @timestamp, then tag each row with its bucket. Asking for 24
# buckets across the whole of 2023-10-23 is what yields hourly buckets here.
esql_query = """
FROM sample_data
| KEEP @timestamp
| EVAL bucket = AUTO_BUCKET (@timestamp, 24, "2023-10-23T00:00:00Z", "2023-10-23T23:59:59Z")
"""
# Run the query and print the result as a text table.
response = esql.query(query=esql_query)
format_response(response)

Combine `AUTO_BUCKET` with [`STATS ... BY`](https://www.elastic.co/guide/en/elasticsearch/reference/current/esql-commands.html#esql-stats-by "STATS ... BY") to create a histogram. For example, to count the number of events per hour:

In [None]:
# Group rows by their hourly bucket and count the events in each one.
esql_query = """
FROM sample_data
| KEEP @timestamp, event_duration
| EVAL bucket = AUTO_BUCKET (@timestamp, 24, "2023-10-23T00:00:00Z", "2023-10-23T23:59:59Z")
| STATS COUNT(*) BY bucket
"""
# Run the query and print the result as a text table.
response = esql.query(query=esql_query)
format_response(response)

Or the median duration per hour:

In [None]:
# Same hourly buckets, aggregated with MEDIAN(event_duration) instead of a count.
esql_query = """
FROM sample_data
| KEEP @timestamp, event_duration
| EVAL bucket = AUTO_BUCKET (@timestamp, 24, "2023-10-23T00:00:00Z", "2023-10-23T23:59:59Z")
| STATS median_duration = MEDIAN(event_duration) BY bucket
"""
# Run the query and print the result as a text table.
response = esql.query(query=esql_query)
format_response(response)

## Enrich data

ES|QL enables you to [enrich](https://www.elastic.co/guide/en/elasticsearch/reference/current/esql-enrich-data.html "Data enrichment") a table with data from indices in Elasticsearch, using the [`ENRICH`](https://www.elastic.co/guide/en/elasticsearch/reference/current/esql-commands.html#esql-enrich "ENRICH") command.

> ℹ️ Before you can use `ENRICH`, you first need to [create](https://www.elastic.co/guide/en/elasticsearch/reference/current/esql-enrich-data.html#esql-create-enrich-policy "Create an enrich policy") and [execute](https://www.elastic.co/guide/en/elasticsearch/reference/current/esql-enrich-data.html#esql-execute-enrich-policy "Execute the enrich policy") an [enrich policy](https://www.elastic.co/guide/en/elasticsearch/reference/current/esql-enrich-data.html#esql-enrich-policy). We provide a demo environment at [ela.st/ql](https://ela.st/ql/), where an enrich policy called `clientip_policy` has already been created and executed, if you just want to see how it works.

The following requests create and execute a policy called `clientip_policy`. The policy links an IP address to an environment ("Development", "QA", or "Production").



In [None]:
# Define the mapping: both fields are keywords so the policy can do exact
# matches on client_ip and return env verbatim
mapping = {
    "mappings": {
        "properties": {
            "client_ip": {"type": "keyword"},
            "env": {"type": "keyword"}
        }
    }
}

# Create the index with the mapping
client.indices.create(index="clientips", body=mapping)

# Prepare bulk data: alternating action line ({"index": {}}) and document
bulk_data = [
    {"index": {}},
    {"client_ip": "172.21.0.5", "env": "Development"},
    {"index": {}},
    {"client_ip": "172.21.2.113", "env": "QA"},
    {"index": {}},
    {"client_ip": "172.21.2.162", "env": "QA"},
    {"index": {}},
    {"client_ip": "172.21.3.15", "env": "Production"},
    {"index": {}},
    {"client_ip": "172.21.3.16", "env": "Production"}
]

# Bulk index the data. refresh=True makes the documents searchable before the
# request returns, so the policy execution below sees all of them instead of
# racing the index's periodic refresh.
client.bulk(index="clientips", body=bulk_data, refresh=True)

# Define the enrich policy: look up rows by client_ip and add the env field
policy = {
    "match": {
        "indices": "clientips",
        "match_field": "client_ip",
        "enrich_fields": ["env"]
    }
}

# Put the enrich policy
client.enrich.put_policy(name='clientip_policy', body=policy)

# Execute the enrich policy and block until it has finished. The next cell uses
# the policy in an ENRICH command straight away, so returning before execution
# completes would make "Run All" race against the policy build.
client.enrich.execute_policy(name='clientip_policy', wait_for_completion=True)

After creating and executing a policy, you can use it with the `ENRICH` command:

In [None]:
# client_ip is converted with TO_STRING before the lookup — presumably because
# the policy's match_field is mapped as keyword in `clientips`; confirm.
# ENRICH then adds the env column from clientip_policy, matching on client_ip.
esql_query = """
FROM sample_data
| KEEP @timestamp, client_ip, event_duration
| EVAL client_ip = TO_STRING(client_ip)
| ENRICH clientip_policy ON client_ip WITH env
"""
# Run the query and print the result as a text table.
response = esql.query(query=esql_query)
format_response(response)

You can use the new `env` column that’s added by the `ENRICH` command in subsequent commands. For example, to calculate the median duration per environment:

In [None]:
# Same enrichment as the previous cell, then aggregate: one median_duration
# per env value added by ENRICH.
esql_query = """
FROM sample_data
| KEEP @timestamp, client_ip, event_duration
| EVAL client_ip = TO_STRING(client_ip)
| ENRICH clientip_policy ON client_ip WITH env
| STATS median_duration = MEDIAN(event_duration) BY env
"""
# Run the query and print the result as a text table.
response = esql.query(query=esql_query)
format_response(response)

For more about data enrichment with ES|QL, refer to [Data enrichment](https://www.elastic.co/guide/en/elasticsearch/reference/current/esql-enrich-data.html "Data enrichment").

## Process data

Your data may contain unstructured strings that you want to [structure](https://www.elastic.co/guide/en/elasticsearch/reference/current/esql-process-data-with-dissect-and-grok.html "Data processing with DISSECT and GROK") to make it easier to analyze the data. For example, the sample data contains log messages like:

```
"Connected to 10.1.0.3"
```

By extracting the IP address from these messages, you can determine which IP has accepted the most client connections.

To structure unstructured strings at query time, you can use the ES|QL [`DISSECT`](https://www.elastic.co/guide/en/elasticsearch/reference/current/esql-commands.html#esql-dissect "DISSECT") and [`GROK`](https://www.elastic.co/guide/en/elasticsearch/reference/current/esql-commands.html#esql-grok "GROK") commands. `DISSECT` works by breaking up a string using a delimiter-based pattern. `GROK` works similarly, but uses regular expressions. This makes `GROK` more powerful, but generally also slower.

In this case, no regular expressions are needed, as the `message` is straightforward: "Connected to ", followed by the server IP. To match this string, you can use the following `DISSECT` command:

In [None]:
# DISSECT extracts the text after "Connected to " into a new server_ip column;
# rows whose message does not match the pattern get a null server_ip.
esql_query = """
FROM sample_data
| DISSECT message "Connected to %{server_ip}"
"""
# Run the query and print the result as a text table.
response = esql.query(query=esql_query)
format_response(response)

This adds a `server_ip` column to those rows that have a `message` that matches this pattern. For other rows, the value of `server_ip` is `null`.

You can use the new `server_ip` column that’s added by the `DISSECT` command in subsequent commands. For example, to determine how many connections each server has accepted:

In [None]:
# Filter to "Connected to" messages first so that rows without a server_ip are
# left out of the per-server counts.
esql_query = """
FROM sample_data
| WHERE STARTS_WITH(message, "Connected to")
| DISSECT message "Connected to %{server_ip}"
| STATS COUNT(*) BY server_ip
"""
# Run the query and print the result as a text table.
response = esql.query(query=esql_query)
format_response(response)

> ℹ️ To learn more about data processing with ES|QL, refer to [Data processing with DISSECT and GROK](https://www.elastic.co/guide/en/elasticsearch/reference/current/esql-process-data-with-dissect-and-grok.html "Data processing with DISSECT and GROK").

## Learn more

To learn more about ES|QL, refer to:
- [_Learning ES|QL_](https://www.elastic.co/guide/en/elasticsearch/reference/current/esql-language.html "Learning ES|QL")
- [_Using ES|QL_](https://www.elastic.co/guide/en/elasticsearch/reference/current/esql-using.html "Using ES|QL")