## Environment preparation

In [1]:
# Load variables from the `.env` file into the process environment.
# `override=True` asks python-dotenv to let `.env` values replace variables that are already set.
import dotenv
dotenv.load_dotenv(override=True)

True

In [2]:
# Not necessary, but useful for live-reloading changes to Obsinthe itself.
%load_ext autoreload
%autoreload 2

In [3]:
# Some core dependencies.
import os
from datetime import timedelta, datetime

# For printing values to notebook.
from IPython.display import display, HTML

# For talking to Prometheus
from obsinthe.prometheus.client import Client
from obsinthe.prometheus.loader import Loader

# For simulating alerts data (when not connected to live source)
from obsinthe.testing.prometheus.client import MockedClient
from obsinthe.testing.prometheus.alerts import AlertsDatasetBuilder

# For merging daily data into a single dataset
from obsinthe.prometheus.data import intervals_concat_days

# For alerts clustering
from obsinthe.alerts.grouping import alerts_groups_one_hot, alerts_clustering_dbscan

# For visualization
from obsinthe.vis.alerts import plot_alerts_timeline
from obsinthe.vis.clustering import plot_clustering

# To avoid some issues when rendering Plotly on export to HTML
import cufflinks
cufflinks.go_offline()

## Data loading

In [4]:
# Date range to load the data for (both bounds are naive datetimes).
# NOTE(review): the displayed results carry a +00:00 offset — confirm how the
# loader interprets these naive values.
START = datetime(year=2024, month=3, day=19, hour=10, minute=10)
END = datetime(year=2024, month=3, day=23)

In [5]:
# Query a live Prometheus when `PROM_URL` is set; otherwise fall back to simulated alerts.

# Replace with your instance, e.g. "https://prometheus.example.com".
PROM_URL = ""

if PROM_URL:
    # The token is read from the `PROM_TOKEN` environment variable; add it to the `.env` file.
    client = Client(url=PROM_URL, token=os.environ.get("PROM_TOKEN"))
else:
    # No endpoint configured: use the mocked client backed by a generated alerts dataset.
    client = MockedClient(AlertsDatasetBuilder(START, END))
    client.mock_setup((START, END))

# Run the alerts query through whichever client was selected above.
loader = Loader(client)

# `ALERTS[24h:1m]` is a subquery: a 24h range evaluated at 1m resolution.
alerts_ranges_collection = loader.interval_query("ALERTS[24h:1m]", START, END)

# Preview the first dataset of the collection.
alerts_ranges_collection[0].df

  0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0,alertname,severity,instance_id,values
0,TargetDown,warning,15,"[1710857100.0, 1.0, 1710857160.0, 1.0, 1710857..."
1,KubeDeploymentReplicasMismatch,warning,15,"[1710857040.0, 1.0, 1710857100.0, 1.0, 1710857..."
2,KubeNodeNotReady,critical,15,"[1710857220.0, 1.0, 1710857280.0, 1.0, 1710857..."
3,noise_h,warning,15,"[1710857280.0, 1.0, 1710857340.0, 1.0, 1710857..."
4,TargetDown,warning,32,"[1710886380.0, 1.0, 1710886440.0, 1.0, 1710886..."
...,...,...,...,...
79,ElasticsearchJVMHeapUseHigh,info,57,"[1710879060.0, 1.0, 1710879120.0, 1.0, 1710879..."
80,ElasticsearchJVMHeapUseHigh,info,57,"[1710879300.0, 1.0, 1710879360.0, 1.0, 1710879..."
81,ElasticsearchJVMHeapUseHigh,info,57,"[1710879840.0, 1.0]"
82,ElasticsearchJVMHeapUseHigh,info,57,"[1710880080.0, 1.0, 1710880140.0, 1.0, 1710880..."


## Data transformation

In [6]:
# Convert every range dataset in the collection into an intervals dataset
# (the displayed frame has `start`/`end` columns instead of raw `values`).
# NOTE(review): the 1-minute argument presumably matches the `1m` step of the query — confirm.
alerts_intervals_collection = alerts_ranges_collection.fmap(
    lambda ranges_ds: ranges_ds.to_intervals_ds(timedelta(minutes=1))
)

# Preview the first converted dataset.
alerts_intervals_collection[0].df

Unnamed: 0,alertname,severity,instance_id,start,end
0,TargetDown,warning,15,2024-03-19 14:05:00+00:00,2024-03-19 14:31:00+00:00
1,KubeDeploymentReplicasMismatch,warning,15,2024-03-19 14:04:00+00:00,2024-03-19 14:31:00+00:00
2,KubeNodeNotReady,critical,15,2024-03-19 14:07:00+00:00,2024-03-19 14:34:00+00:00
3,noise_h,warning,15,2024-03-19 14:08:00+00:00,2024-03-19 14:35:00+00:00
4,TargetDown,warning,32,2024-03-19 22:13:00+00:00,2024-03-19 22:42:00+00:00
...,...,...,...,...,...
79,ElasticsearchJVMHeapUseHigh,info,57,2024-03-19 20:11:00+00:00,2024-03-19 20:13:00+00:00
80,ElasticsearchJVMHeapUseHigh,info,57,2024-03-19 20:15:00+00:00,2024-03-19 20:19:00+00:00
81,ElasticsearchJVMHeapUseHigh,info,57,2024-03-19 20:24:00+00:00,2024-03-19 20:24:00+00:00
82,ElasticsearchJVMHeapUseHigh,info,57,2024-03-19 20:28:00+00:00,2024-03-19 20:33:00+00:00


In [7]:
# Merge the per-day datasets into a single intervals dataset.
concatenated_days = intervals_concat_days(alerts_intervals_collection)

# Apply the 1-minute resolution correction; in the displayed output the interval
# ends move one minute later compared with the per-day preview.
alerts_intervals = concatenated_days.correct_for_resolution(timedelta(minutes=1))
alerts_intervals.df

  0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0,alertname,severity,instance_id,start,end
0,TargetDown,warning,15,2024-03-19 14:05:00+00:00,2024-03-19 14:32:00+00:00
1,KubeDeploymentReplicasMismatch,warning,15,2024-03-19 14:04:00+00:00,2024-03-19 14:32:00+00:00
2,KubeNodeNotReady,critical,15,2024-03-19 14:07:00+00:00,2024-03-19 14:35:00+00:00
3,noise_h,warning,15,2024-03-19 14:08:00+00:00,2024-03-19 14:36:00+00:00
4,TargetDown,warning,32,2024-03-19 22:13:00+00:00,2024-03-19 22:43:00+00:00
...,...,...,...,...,...
470,ElasticsearchJVMHeapUseHigh,info,30,2024-03-22 04:05:00+00:00,2024-03-22 04:06:00+00:00
471,ElasticsearchJVMHeapUseHigh,info,30,2024-03-22 04:08:00+00:00,2024-03-22 04:10:00+00:00
472,noise_s,warning,30,2024-03-22 03:43:00+00:00,2024-03-22 04:08:00+00:00
473,ElasticsearchClusterNotHealthy,warning,36,2024-03-22 23:53:00+00:00,2024-03-23 00:01:00+00:00


## Identifying groups of alerts

In [8]:
def alert_id(a):
    """Build an alert identifier of the form ``<alertname>-<instance_id>``.

    Defined with ``def`` rather than a lambda bound to a name, so the function
    carries a proper ``__name__`` in tracebacks and can hold this docstring.

    Args:
        a: Record supporting item access for the ``alertname`` and
            ``instance_id`` keys.

    Returns:
        The two values joined by a dash, as a string.
    """
    return f"{a['alertname']}-{a['instance_id']}"

In [9]:
# Timeline of all loaded alerts.
plot_alerts_timeline(alerts_intervals, alert_id=alert_id).show()

# Timeline narrowed down to the rows of a single instance.
instance_1_intervals = alerts_intervals.fmap(
    lambda frame: frame.query("instance_id == '1'")
)
plot_alerts_timeline(instance_1_intervals, alert_id=alert_id).show()

In [10]:
# Group alerts starting within the provided tolerance (grouped by `instance_id`)
# and turn the data into a one-hot encoding.
start_tolerance = timedelta(minutes=3)
one_hot = alerts_groups_one_hot(
    alerts_intervals, groupby_columns=["instance_id"], group_tolerance=start_tolerance
)

one_hot

alertname,ElasticsearchClusterNotHealthy,ElasticsearchJVMHeapUseHigh,KubeDeploymentReplicasMismatch,KubeNodeNotReady,TargetDown,noise_b,noise_c,noise_d,noise_e,noise_f,...,noise_p,noise_q,noise_r,noise_s,noise_t,noise_u,noise_v,noise_w,noise_x,noise_y
group_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1-0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1-1,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10-0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10-1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10-2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95-4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
95-5,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96-0,0,0,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98-0,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Apply clustering algorithm

In [11]:
# Cluster the one-hot encoded alert groups.
# NOTE(review): `n_neighbors`/`min_dist` look like settings for an embedding step
# rather than DBSCAN itself — confirm against `alerts_clustering_dbscan`; if that
# step is stochastic, the plot may differ between runs.
clustering_options = dict(eps=1, n_neighbors=2, min_samples=2, min_dist=0.1)
ac = alerts_clustering_dbscan(one_hot, **clustering_options)

# Last expression of the cell: the notebook renders the returned plot.
plot_clustering(ac)

## Additional data cleanup

In [12]:
# Showcase an instance whose alerts are flapping.
flapping_instance_intervals = alerts_intervals.fmap(
    lambda frame: frame.query("instance_id == '10'")
)
fig_flap = plot_alerts_timeline(
    flapping_instance_intervals, alert_id=alert_id, height=600
)

# Heading first, then the figure.
display(HTML("<h3>With flapping</h3>"))
fig_flap.show()

In [13]:
# Reduce the flapping by merging overlaps with a positive (30-minute) threshold.
# NOTE(review): presumably intervals separated by less than the threshold get
# merged — confirm against `merge_overlaps`.
flap_threshold = timedelta(minutes=30)
alerts_intervals_reduced_flap = alerts_intervals.merge_overlaps(threshold=flap_threshold)

# Same instance as in the flapping showcase, now using the reduced intervals.
reduced_instance_intervals = alerts_intervals_reduced_flap.fmap(
    lambda frame: frame.query("instance_id == '10'")
)
fig_noflap = plot_alerts_timeline(
    reduced_instance_intervals, alert_id=alert_id, height=600
)

display(HTML("<h3>Without flapping</h3>"))
fig_noflap.show()

## Re-apply the clustering after the cleanup

In [14]:
# One-hot encode the flap-reduced intervals with the same grouping settings as
# before, so the result can be compared with the previous version.
one_hot_noflap = alerts_groups_one_hot(
    alerts_intervals_reduced_flap,
    group_tolerance=timedelta(minutes=3),
    groupby_columns=["instance_id"],
)

# Cluster the flap-reduced data using the same parameters as the first clustering.
ac_noflap = alerts_clustering_dbscan(
    one_hot_noflap, eps=1, n_neighbors=2, min_samples=2, min_dist=0.1
)

# Render both clusterings one after the other, each under its own heading.
for heading_html, clustering in (
    ("<h3>With flapping</h3>", ac),
    ("<h3>Without flapping</h3>", ac_noflap),
):
    display(HTML(heading_html))
    plot_clustering(clustering).show()