In [6]:
%pip install --quiet pandas numpy requests pyarrow mlxtend pyyaml
# If you’ll pull the topology with Python client (optional):
# %pip install --quiet kubernetes openshift


Note: you may need to restart the kernel to use updated packages.


In [7]:
from pathlib import Path
import os, json, time, math, pandas as pd, numpy as np, requests

# Storage locations
DATA_DIR = Path("data"); DATA_DIR.mkdir(parents=True, exist_ok=True)
UNIFIED_DIR = DATA_DIR / "unified_logs"; UNIFIED_DIR.mkdir(exist_ok=True, parents=True)
INCIDENTS_DIR = Path("incidents"); INCIDENTS_DIR.mkdir(exist_ok=True, parents=True)
RULES_DIR = Path("rules"); RULES_DIR.mkdir(exist_ok=True, parents=True)

# Loki connection
LOKI_URL = os.environ.get("LOKI_URL", "https://logging-loki-openshift-logging.apps.rhoai.ocp-poc-demo.com")
LOKI_TOKEN = os.environ.get("LOKI_TOKEN","sha256~0ojgYvJZK8VO_w5ew30iXkQIe8pZt7VzujI3lc3Z3SU")  # or None if not needed

# Pull window (UTC). Adjust!
END = pd.Timestamp.utcnow()
START = END - pd.Timedelta("90min")

print("Window:", START, "→", END)


Window: 2025-09-10 14:13:46.094889+00:00 → 2025-09-10 15:43:46.094889+00:00


In [8]:
# Cell 1.1 — Loki query_range helper (tenant-aware + token + better diagnostics)
import os, pandas as pd, requests, urllib3

# ---- Config (adjust to your cluster) ----
LOKI_BASE       = os.environ.get(
    "LOKI_BASE",
    "https://logging-loki-openshift-logging.apps.rhoai.ocp-poc-demo.com"
)  # e.g. https://logging-loki-...apps.cluster.com
LOKI_TENANT     = os.environ.get("LOKI_TENANT", "application")
LOKI_TOKEN     = os.environ.get("LOKI_TOKEN", "sha256~0ojgYvJZK8VO_w5ew30iXkQIe8pZt7VzujI3lc3Z3SU")           # from: oc whoami -t
LOKI_ORG_ID     = os.environ.get("LOKI_ORG_ID")  # often same as tenant (optional)
LOKI_INSECURE   = os.environ.get("LOKI_INSECURE", "true").lower() in ("1","true","yes")
LOKI_BASIC_USER = os.environ.get("LOKI_BASIC_USER")  # rarely used
LOKI_BASIC_PASS = os.environ.get("LOKI_BASIC_PASS")

if LOKI_INSECURE:
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

_session = requests.Session()
_default_headers = {"Accept": "application/json"}
if LOKI_TOKEN:
    _default_headers["Authorization"] = f"Bearer {LOKI_TOKEN}"
if LOKI_ORG_ID:
    _default_headers["X-Scope-OrgID"] = LOKI_ORG_ID


def _debug_response(resp):
    ct = resp.headers.get("Content-Type", "")
    preview = (resp.text or "")[:500]
    return f"HTTP {resp.status_code} CT={ct} URL={resp.url}\nBody (first 500):\n{preview}"



In [9]:
# Helpers for label discovery in your Loki tenant

def loki_labels():
    """
    List all label keys available in the current tenant.
    """
    url = f"{LOKI_BASE.rstrip('/')}/api/logs/v1/{LOKI_TENANT}/loki/api/v1/labels"
    r = _session.get(
        url,
        headers=_default_headers,
        timeout=30,
        verify=not LOKI_INSECURE,
        allow_redirects=False
    )
    r.raise_for_status()
    return r.json().get("data", [])

def loki_label_values(label):
    """
    List values for a given label key (e.g., namespaces, pods).
    """
    url = f"{LOKI_BASE.rstrip('/')}/api/logs/v1/{LOKI_TENANT}/loki/api/v1/label/{label}/values"
    r = _session.get(
        url,
        headers=_default_headers,
        timeout=30,
        verify=not LOKI_INSECURE,
        allow_redirects=False
    )
    r.raise_for_status()
    return r.json().get("data", [])

def loki_query_range(expr, start_ts, end_ts, step="15s", limit=5000, direction="forward"):
    """
    Calls: {LOKI_BASE}/api/logs/v1/{LOKI_TENANT}/loki/api/v1/query_range
    Returns DataFrame with columns: ts, line, and all series labels.
    """
    if not LOKI_BASE:
        raise RuntimeError("Set LOKI_BASE to your Loki gateway URL.")

    url = f"{LOKI_BASE.rstrip('/')}/api/logs/v1/{LOKI_TENANT}/loki/api/v1/query_range"
    params = {
        "query": expr,
        "start": int(pd.Timestamp(start_ts).value),  # already ns
        "end": int(pd.Timestamp(end_ts).value),      # already ns
        "step": step,
        "limit": str(limit),
        "direction": direction,
    }

    auth = (LOKI_BASIC_USER, LOKI_BASIC_PASS) if (LOKI_BASIC_USER and LOKI_BASIC_PASS) else None
    r = _session.get(
        url, params=params, headers=_default_headers, timeout=60,
        verify=not LOKI_INSECURE, auth=auth, allow_redirects=False
    )
    if not r.ok:
        raise RuntimeError("Loki query_range failed:\n" + _debug_response(r))
    if "application/json" not in r.headers.get("Content-Type","").lower():
        raise RuntimeError("Unexpected non-JSON response from Loki:\n" + _debug_response(r))

    payload = r.json()
    data = payload.get("data", {}).get("result", [])

    def _parse_ns(ts_val):
        # Loki returns ns since epoch as a string (sometimes scientific notation).
        s = str(ts_val)
        try:
            # fast path if it’s plain digits
            ns = int(s)
        except ValueError:
            # fallback for "1.7575129300981494e+18" style
            ns = int(float(s))
        return pd.to_datetime(ns, unit="ns", utc=True)

    rows = []
    for series in data:
        labels = series.get("metric", {})
        for ts, line in series.get("values", []):
            rows.append({"ts": _parse_ns(ts), "line": line, **labels})

    return pd.DataFrame(rows)



In [10]:
labels = loki_labels()
print("Available labels:", labels[:20])

if "log_type" in labels:
    print("log_type values:", loki_label_values("log_type"))

if "namespace" in labels:
    print("namespace values:", loki_label_values("namespace")[:10])


Available labels: ['k8s_container_name', 'k8s_namespace_name', 'k8s_node_name', 'k8s_pod_name', 'kubernetes_container_name', 'kubernetes_host', 'kubernetes_namespace_name', 'kubernetes_pod_name', 'log_type', 'openshift_log_type']
log_type values: ['application']


In [11]:
# Define selectors for your environment
LOGQL_APP   = r'{log_type="application"}'
LOGQL_INFRA = r'{log_type="application", k8s_namespace_name="openshift-kube-apiserver"}'
LOGQL_EVENT = r'{log_type="application", k8s_namespace_name="default"} |= "Warning"'
LOGQL_AUDIT = r'{log_type="application", k8s_namespace_name="openshift-apiserver"}'


In [12]:
df_app = loki_query_range(LOGQL_APP, START, END)
#df_infra = loki_query_range(LOGQL_INFRA, START, END)
#df_evt = loki_query_range(LOGQL_EVENT, START, END)
#df_audit = loki_query_range(LOGQL_AUDIT, START, END)
#print(len(df_app), len(df_infra), len(df_evt), len(df_audit))
print(len(df_app))

5000


In [13]:
df_app

Unnamed: 0,ts,line
0,2025-09-10 14:37:51.922941430+00:00,"{""@timestamp"":""2025-09-10T14:37:51.922941430Z""..."
1,2025-09-10 14:37:52.218583208+00:00,"{""@timestamp"":""2025-09-10T14:37:52.218583208Z""..."
2,2025-09-10 14:37:52.218768776+00:00,"{""@timestamp"":""2025-09-10T14:37:52.218768776Z""..."
3,2025-09-10 14:37:52.218806614+00:00,"{""@timestamp"":""2025-09-10T14:37:52.218806614Z""..."
4,2025-09-10 14:37:52.218820290+00:00,"{""@timestamp"":""2025-09-10T14:37:52.218820290Z""..."
...,...,...
4995,2025-09-10 15:26:55.486423440+00:00,"{""@timestamp"":""2025-09-10T15:26:55.486423440Z""..."
4996,2025-09-10 15:26:55.486452263+00:00,"{""@timestamp"":""2025-09-10T15:26:55.486452263Z""..."
4997,2025-09-10 15:26:55.491212198+00:00,"{""@timestamp"":""2025-09-10T15:26:55.491212198Z""..."
4998,2025-09-10 15:26:55.521759138+00:00,"{""@timestamp"":""2025-09-10T15:26:55.521759138Z""..."


In [14]:
# Your labels: k8s_namespace_name, k8s_pod_name, k8s_node_name, log_type
import pandas as pd

def normalize_level(line: str) -> str:
    s = (line or "").lower()
    if any(w in s for w in ["error", "exception", "fail", "backoff", "oomkilled", "notready"]): return "error"
    if "warn" in s or "throttle" in s: return "warn"
    return "info"

def extract_code(line: str):
    for c in ("500","502","503","504","404","401","429"):
        if f" {c} " in f" {line} ":
            try: return int(c)
            except: pass
    return None

def extract_route(line: str):
    for verb in (" GET ", " POST ", " PUT ", " PATCH ", " DELETE "):
        if verb in f" {line} ":
            try: return line.split(verb,1)[-1].split()[0]
            except: return None
    return None

def project_unified(df: pd.DataFrame, source_guess: str):
    ns   = df.get("k8s_namespace_name") or df.get("kubernetes_namespace_name")
    pod  = df.get("k8s_pod_name")       or df.get("kubernetes_pod_name")
    node = df.get("k8s_node_name")      or df.get("kubernetes_host")
    src  = source_guess  # for now all your data is log_type="application"
    out = pd.DataFrame({
        "ts": df["ts"],
        "source": src,
        "namespace": ns if ns is not None else None,
        "pod": pod if pod is not None else None,
        "node": node if node is not None else None,
        "level": df["line"].map(normalize_level),
        "verb": None,  # fill if your audit lines have it
        "code": df["line"].map(extract_code),
        "route": df["line"].map(extract_route),
        "msg": df["line"].astype(str).str.slice(0, 400),
        "container_restart": df["line"].str.contains("Restarted container", case=False, na=False).astype(int),
        "rollout_in_window": df["line"].str.contains("Scaled up replica set|deployment created|rollout", case=False, na=False).astype(float),
    })
    return out


In [15]:
from pathlib import Path

parts = []
if not df_app.empty:   parts.append(project_unified(df_app, "app"))
if not df_infra.empty: parts.append(project_unified(df_infra, "infra"))
if not df_evt.empty:   parts.append(project_unified(df_evt, "event"))
if not df_audit.empty: parts.append(project_unified(df_audit, "audit"))

unified = pd.concat(parts, ignore_index=True) if parts else pd.DataFrame(columns=[
    "ts","source","namespace","pod","node","level","verb","code","route","msg","container_restart","rollout_in_window"
])
unified = unified.sort_values("ts").reset_index(drop=True)

unified_dir = Path("data/unified_logs"); unified_dir.mkdir(parents=True, exist_ok=True)
unified_path = unified_dir / "latest.parquet"
unified.to_parquet(unified_path, index=False)

print("Unified rows:", len(unified))
unified.head(5)

NameError: name 'df_infra' is not defined

In [None]:
def build_episodes(df: pd.DataFrame, window="10min", keys=("namespace","pod","node")):
    df = df.copy()
    df["ts"] = pd.to_datetime(df["ts"], utc=True)
    df.set_index("ts", inplace=True)
    episodes = []
    for wstart, wdf in df.groupby(pd.Grouper(freq=window)):
        if wdf.empty: continue
        wend = wstart + pd.to_timedelta(window)
        grp_cols = [k for k in keys if k in wdf.columns]
        groups = dict(tuple(wdf.groupby(grp_cols, dropna=False))) if grp_cols else {"_": wdf}
        for gkey, gdf in groups.items():
            total = len(gdf)
            errors = (gdf["level"]=="error").sum()
            err_ratio = (errors/total) if total else 0.0
            restarts = gdf.get("container_restart", pd.Series([0]*total, index=gdf.index)).sum()
            http5xx = (gdf.get("code", pd.Series(dtype=float))>=500).sum()
            rollout = 1.0 if (gdf.get("rollout_in_window", pd.Series(dtype=float))>0).any() else 0.0
            entities = {}
            for col in ["namespace","pod","node"]:
                vals = [v for v in gdf[col].astype(str).dropna().unique().tolist() if v and v!="None"]
                if vals: entities[col] = vals
            episodes.append({
                "episode_id": f"{int(wstart.value)}::{hash(str(gkey)) & 0xfffffff:07x}",
                "start": wstart, "end": wend,
                "entities": entities,
                "features": {"count": float(total), "error_ratio": float(err_ratio), "restarts": float(restarts), "http5xx": float(http5xx), "rollout_in_window": rollout},
            })
    return episodes

eps = build_episodes(unified, window="10min")
print("Episodes:", len(eps))
pd.DataFrame([{"id": e["episode_id"], **e["features"], **{f"ent_{k}": v for k,v in e["entities"].items()}} for e in eps]).head(10)


Episodes: 8


Unnamed: 0,id,count,error_ratio,restarts,http5xx,rollout_in_window
0,1757511600000000000::6294c1c,143.0,0.020979,0.0,0.0,0.0
1,1757512200000000000::6294c1c,869.0,0.050633,0.0,0.0,0.0
2,1757512800000000000::6294c1c,598.0,0.041806,0.0,0.0,0.0
3,1757513400000000000::6294c1c,849.0,0.031802,0.0,0.0,0.0
4,1757514000000000000::6294c1c,1019.0,0.042198,0.0,0.0,0.0
5,1757514600000000000::6294c1c,594.0,0.045455,0.0,0.0,0.0
6,1757515200000000000::6294c1c,705.0,0.041135,0.0,0.0,0.0
7,1757515800000000000::6294c1c,223.0,0.053812,0.0,0.0,0.0


In [None]:
# If you still have `eps` in memory from build_episodes(...):
import pandas as pd

epi_dbg = pd.DataFrame([
    {
        "id": e["episode_id"],
        "start": e["start"],
        "end": e["end"],
        "count": e["features"]["count"],
        "error_ratio": e["features"]["error_ratio"],
        "restarts": e["features"]["restarts"],
        "http5xx": e["features"]["http5xx"],
        "rollout_in_window": e["features"]["rollout_in_window"],
        "ent_namespace": ",".join(e["entities"].get("namespace", [])),
        "ent_pod": ",".join(e["entities"].get("pod", [])),
        "ent_node": ",".join(e["entities"].get("node", [])),
    }
    for e in eps
]).sort_values("start")
epi_dbg.head(10)


Unnamed: 0,id,start,end,count,error_ratio,restarts,http5xx,rollout_in_window,ent_namespace,ent_pod,ent_node
0,1757511600000000000::6294c1c,2025-09-10 13:40:00+00:00,2025-09-10 13:50:00+00:00,143.0,0.020979,0.0,0.0,0.0,,,
1,1757512200000000000::6294c1c,2025-09-10 13:50:00+00:00,2025-09-10 14:00:00+00:00,869.0,0.050633,0.0,0.0,0.0,,,
2,1757512800000000000::6294c1c,2025-09-10 14:00:00+00:00,2025-09-10 14:10:00+00:00,598.0,0.041806,0.0,0.0,0.0,,,
3,1757513400000000000::6294c1c,2025-09-10 14:10:00+00:00,2025-09-10 14:20:00+00:00,849.0,0.031802,0.0,0.0,0.0,,,
4,1757514000000000000::6294c1c,2025-09-10 14:20:00+00:00,2025-09-10 14:30:00+00:00,1019.0,0.042198,0.0,0.0,0.0,,,
5,1757514600000000000::6294c1c,2025-09-10 14:30:00+00:00,2025-09-10 14:40:00+00:00,594.0,0.045455,0.0,0.0,0.0,,,
6,1757515200000000000::6294c1c,2025-09-10 14:40:00+00:00,2025-09-10 14:50:00+00:00,705.0,0.041135,0.0,0.0,0.0,,,
7,1757515800000000000::6294c1c,2025-09-10 14:50:00+00:00,2025-09-10 15:00:00+00:00,223.0,0.053812,0.0,0.0,0.0,,,


In [None]:
import json
import pandas as pd
import re

# Try to parse a few rows to see real structure:
print(unified["msg"].head(3).tolist())  # optional preview

def _maybe_json(s):
    s = s.strip()
    if not s or s[0] not in "{[":
        return None
    try:
        return json.loads(s)
    except Exception:
        return None

def normalize_level_from_json(obj, fallback_line: str):
    # common keys across frameworks
    for k in ["level","severity","loglevel","lvl","logger_level"]:
        if k in obj:
            return str(obj[k]).lower()
    # fallback from line content
    s = fallback_line.lower()
    if any(w in s for w in ["error","exception","fail","backoff","oomkilled","notready"]): return "error"
    if "warn" in s or "throttle" in s: return "warn"
    return "info"

def extract_code_from_json(obj, fallback_line: str):
    # common http status fields
    for k in ["status","status_code","code","http_status","response_status"]:
        if k in obj:
            try: return int(obj[k])
            except: pass
    # fallback text sniff
    m = re.search(r"\s(1\d{2}|2\d{2}|3\d{2}|4\d{2}|5\d{2})\s", " " + fallback_line + " ")
    if m:
        try: return int(m.group(1))
        except: pass
    return None

def extract_route_from_json(obj, fallback_line: str):
    for k in ["path","route","url","request_path","request","endpoint"]:
        if k in obj and isinstance(obj[k], str):
            return obj[k].split("?")[0]
    # fallback: look for “ GET /foo ” etc.
    m = re.search(r"\s(?:GET|POST|PUT|PATCH|DELETE)\s+(\S+)", " " + fallback_line + " ")
    return m.group(1) if m else None

def project_unified_stronger(df: pd.DataFrame, source_guess: str):
    # your labels:
    ns   = df.get("k8s_namespace_name")      or df.get("kubernetes_namespace_name")
    pod  = df.get("k8s_pod_name")            or df.get("kubernetes_pod_name")
    node = df.get("k8s_node_name")           or df.get("kubernetes_host")

    objs = df["line"].map(_maybe_json)

    level = [
        normalize_level_from_json(o, ln) if o is not None else
        ("error" if any(w in ln.lower() for w in ["error","exception","fail","backoff","oomkilled","notready"]) else
         "warn" if "warn" in ln.lower() or "throttle" in ln.lower() else "info")
        for o, ln in zip(objs, df["line"])
    ]
    code = [
        extract_code_from_json(o, ln) if o is not None else extract_code_from_json({}, ln)
        for o, ln in zip(objs, df["line"])
    ]
    route = [
        extract_route_from_json(o, ln) if o is not None else extract_route_from_json({}, ln)
        for o, ln in zip(objs, df["line"])
    ]

    container_restart = df["line"].str.contains("Restarted container", case=False, na=False).astype(int)

    # rollout hints (tune for your org’s messages)
    rollout_hit = df["line"].str.contains(
        r"Scaled up replica set|deployment (created|updated|rolled out)|Rollout", 
        case=False, na=False, regex=True
    ).astype(float)

    out = pd.DataFrame({
        "ts": df["ts"],
        "source": source_guess,
        "namespace": ns if ns is not None else None,
        "pod": pod if pod is not None else None,
        "node": node if node is not None else None,
        "level": level,
        "verb": None,  # fill if you have audit verbs later
        "code": code,
        "route": route,
        "msg": df["line"].astype(str).str.slice(0, 400),
        "container_restart": container_restart,
        "rollout_in_window": rollout_hit,
    })
    return out



['{"@timestamp":"2025-09-10T13:46:28.130418503Z","hostname":"rhoai-sno","kubernetes":{"annotations":{"k8s.ovn.org/pod-networks":"{\\"default\\":{\\"ip_addresses\\":[\\"10.128.0.79/23\\"],\\"mac_address\\":\\"0a:58:0a:80:00:4f\\",\\"gateway_ips\\":[\\"10.128.0.1\\"],\\"routes\\":[{\\"dest\\":\\"10.128.0.0/14\\",\\"nextHop\\":\\"10.128.0.1\\"},{\\"dest\\":\\"172.30.0.0/16\\",\\"nextHop\\":\\"10.128.0.1\\"},{\\"dest\\":\\"169.254.0.5/32', '{"@timestamp":"2025-09-10T13:46:28.132786364Z","hostname":"rhoai-sno","kubernetes":{"annotations":{"k8s.ovn.org/pod-networks":"{\\"default\\":{\\"ip_addresses\\":[\\"10.128.0.79/23\\"],\\"mac_address\\":\\"0a:58:0a:80:00:4f\\",\\"gateway_ips\\":[\\"10.128.0.1\\"],\\"routes\\":[{\\"dest\\":\\"10.128.0.0/14\\",\\"nextHop\\":\\"10.128.0.1\\"},{\\"dest\\":\\"172.30.0.0/16\\",\\"nextHop\\":\\"10.128.0.1\\"},{\\"dest\\":\\"169.254.0.5/32', '{"@timestamp":"2025-09-10T13:46:28.135940156Z","hostname":"rhoai-sno","kubernetes":{"annotations":{"k8s.ovn.org/pod-netw

In [None]:
parts = []
if not df_app.empty:   parts.append(project_unified_stronger(df_app, "app"))
if not df_infra.empty: parts.append(project_unified_stronger(df_infra, "infra"))
if not df_evt.empty:   parts.append(project_unified_stronger(df_evt, "event"))
if not df_audit.empty: parts.append(project_unified_stronger(df_audit, "audit"))

unified = pd.concat(parts, ignore_index=True)
unified = unified.sort_values("ts").reset_index(drop=True)
print(len(unified))
unified.head(8)


  rollout_hit = df["line"].str.contains(


5000


Unnamed: 0,ts,source,namespace,pod,node,level,verb,code,route,msg,container_restart,rollout_in_window
0,2025-09-10 13:46:28.130418503+00:00,app,,,,info,,,,"{""@timestamp"":""2025-09-10T13:46:28.130418503Z""...",0,0.0
1,2025-09-10 13:46:28.132786364+00:00,app,,,,info,,,,"{""@timestamp"":""2025-09-10T13:46:28.132786364Z""...",0,0.0
2,2025-09-10 13:46:28.135940156+00:00,app,,,,info,,,,"{""@timestamp"":""2025-09-10T13:46:28.135940156Z""...",0,0.0
3,2025-09-10 13:46:41.969324786+00:00,app,,,,default,,,,"{""@timestamp"":""2025-09-10T13:46:41.969324786Z""...",0,0.0
4,2025-09-10 13:46:41.969516446+00:00,app,,,,default,,,,"{""@timestamp"":""2025-09-10T13:46:41.969516446Z""...",0,0.0
5,2025-09-10 13:46:41.969604381+00:00,app,,,,default,,,,"{""@timestamp"":""2025-09-10T13:46:41.969604381Z""...",0,0.0
6,2025-09-10 13:46:41.969744595+00:00,app,,,,default,,,,"{""@timestamp"":""2025-09-10T13:46:41.969744595Z""...",0,0.0
7,2025-09-10 13:46:49.352347414+00:00,app,,,,info,,,,"{""@timestamp"":""2025-09-10T13:46:49.352347414Z""...",0,0.0


In [None]:
unified["code"].dropna().astype(int).value_counts().head(10)


Series([], Name: count, dtype: int64)

In [None]:
unified.query("pod.notna()").groupby("pod")["level"].apply(lambda s: (s=="error").mean()).sort_values(ascending=False).head(10)


Series([], Name: level, dtype: object)

In [None]:
[e for e in eps if e["features"]["http5xx"] > 0][:3]


[]