In [1]:
%pip install --quiet pandas numpy requests pyarrow mlxtend pyyaml
# If you’ll pull the topology with Python client (optional):
# %pip install --quiet kubernetes openshift


Note: you may need to restart the kernel to use updated packages.


In [2]:
from pathlib import Path
import os, json, time, math, pandas as pd, numpy as np, requests

# Storage locations
DATA_DIR = Path("data"); DATA_DIR.mkdir(parents=True, exist_ok=True)
UNIFIED_DIR = DATA_DIR / "unified_logs"; UNIFIED_DIR.mkdir(exist_ok=True, parents=True)
INCIDENTS_DIR = Path("incidents"); INCIDENTS_DIR.mkdir(exist_ok=True, parents=True)
RULES_DIR = Path("rules"); RULES_DIR.mkdir(exist_ok=True, parents=True)

# Loki connection
LOKI_URL = os.environ.get("LOKI_URL", "https://logging-loki-openshift-logging.apps.rhoai.ocp-poc-demo.com")
LOKI_TOKEN = os.environ.get("LOKI_TOKEN","<REDACTED>")  # or None if not needed

# Pull window (UTC). Adjust!
END = pd.Timestamp.utcnow()
START = END - pd.Timedelta("90min")

print("Window:", START, "→", END)


Window: 2025-09-10 16:32:03.845028+00:00 → 2025-09-10 18:02:03.845028+00:00


In [None]:
# Cell 1.1 — Loki query_range helper (tenant-aware + token + better diagnostics)
import os, pandas as pd, requests, urllib3

# ---- Config (adjust to your cluster) ----
LOKI_BASE       = os.environ.get(
    "LOKI_BASE",
    "https://logging-loki-openshift-logging.apps.rhoai.ocp-poc-demo.com"
)  # e.g. https://logging-loki-...apps.cluster.com
LOKI_TENANT     = os.environ.get("LOKI_TENANT", "application")
LOKI_TOKEN     = os.environ.get("LOKI_TOKEN", "<REDACTED>")           # from: oc whoami -t
LOKI_ORG_ID     = os.environ.get("LOKI_ORG_ID")  # often same as tenant (optional)
LOKI_INSECURE   = os.environ.get("LOKI_INSECURE", "true").lower() in ("1","true","yes")
LOKI_BASIC_USER = os.environ.get("LOKI_BASIC_USER")  # rarely used
LOKI_BASIC_PASS = os.environ.get("LOKI_BASIC_PASS")

if LOKI_INSECURE:
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

_session = requests.Session()
_default_headers = {"Accept": "application/json"}
if LOKI_TOKEN:
    _default_headers["Authorization"] = f"Bearer {LOKI_TOKEN}"
if LOKI_ORG_ID:
    _default_headers["X-Scope-OrgID"] = LOKI_ORG_ID


def _debug_response(resp):
    ct = resp.headers.get("Content-Type", "")
    preview = (resp.text or "")[:500]
    return f"HTTP {resp.status_code} CT={ct} URL={resp.url}\nBody (first 500):\n{preview}"



In [4]:
# Helpers for label discovery in your Loki tenant

def loki_labels():
    """
    List all label keys available in the current tenant.
    """
    url = f"{LOKI_BASE.rstrip('/')}/api/logs/v1/{LOKI_TENANT}/loki/api/v1/labels"
    r = _session.get(
        url,
        headers=_default_headers,
        timeout=30,
        verify=not LOKI_INSECURE,
        allow_redirects=False
    )
    r.raise_for_status()
    return r.json().get("data", [])

def loki_label_values(label):
    """
    List values for a given label key (e.g., namespaces, pods).
    """
    url = f"{LOKI_BASE.rstrip('/')}/api/logs/v1/{LOKI_TENANT}/loki/api/v1/label/{label}/values"
    r = _session.get(
        url,
        headers=_default_headers,
        timeout=30,
        verify=not LOKI_INSECURE,
        allow_redirects=False
    )
    r.raise_for_status()
    return r.json().get("data", [])

def loki_query_range(expr, start_ts, end_ts, step="15s", limit=5000, direction="forward"):
    """
    Calls: {LOKI_BASE}/api/logs/v1/{LOKI_TENANT}/loki/api/v1/query_range
    Returns DataFrame with columns: ts, line, and all series labels.
    """
    if not LOKI_BASE:
        raise RuntimeError("Set LOKI_BASE to your Loki gateway URL.")

    url = f"{LOKI_BASE.rstrip('/')}/api/logs/v1/{LOKI_TENANT}/loki/api/v1/query_range"
    params = {
        "query": expr,
        "start": int(pd.Timestamp(start_ts).value),  # already ns
        "end": int(pd.Timestamp(end_ts).value),      # already ns
        "step": step,
        "limit": str(limit),
        "direction": direction,
    }

    auth = (LOKI_BASIC_USER, LOKI_BASIC_PASS) if (LOKI_BASIC_USER and LOKI_BASIC_PASS) else None
    r = _session.get(
        url, params=params, headers=_default_headers, timeout=60,
        verify=not LOKI_INSECURE, auth=auth, allow_redirects=False
    )
    if not r.ok:
        raise RuntimeError("Loki query_range failed:\n" + _debug_response(r))
    if "application/json" not in r.headers.get("Content-Type","").lower():
        raise RuntimeError("Unexpected non-JSON response from Loki:\n" + _debug_response(r))

    payload = r.json()
    data = payload.get("data", {}).get("result", [])

    def _parse_ns(ts_val):
        # Loki returns ns since epoch as a string (sometimes scientific notation).
        s = str(ts_val)
        try:
            # fast path if it’s plain digits
            ns = int(s)
        except ValueError:
            # fallback for "1.7575129300981494e+18" style
            ns = int(float(s))
        return pd.to_datetime(ns, unit="ns", utc=True)

    rows = []
    for series in data:
        labels = series.get("metric", {})
        for ts, line in series.get("values", []):
            rows.append({"ts": _parse_ns(ts), "line": line, **labels})

    return pd.DataFrame(rows)



In [5]:
labels = loki_labels()
print("Available labels:", labels[:20])

if "log_type" in labels:
    print("log_type values:", loki_label_values("log_type"))

if "namespace" in labels:
    print("namespace values:", loki_label_values("namespace")[:10])


Available labels: ['k8s_container_name', 'k8s_namespace_name', 'k8s_node_name', 'k8s_pod_name', 'kubernetes_container_name', 'kubernetes_host', 'kubernetes_namespace_name', 'kubernetes_pod_name', 'log_type', 'openshift_log_type']
log_type values: ['application']


In [6]:
# Define selectors for your environment
LOGQL_APP   = r'{log_type="application"}'
LOGQL_INFRA = r'{log_type="application", k8s_namespace_name="openshift-kube-apiserver"}'
LOGQL_EVENT = r'{log_type="application", k8s_namespace_name="default"} |= "Warning"'
LOGQL_AUDIT = r'{log_type="application", k8s_namespace_name="openshift-apiserver"}'


In [7]:
df_app = loki_query_range(LOGQL_APP, START, END)
df_infra = loki_query_range(LOGQL_INFRA, START, END)
df_evt = loki_query_range(LOGQL_EVENT, START, END)
df_audit = loki_query_range(LOGQL_AUDIT, START, END)
print(len(df_app), len(df_infra), len(df_evt), len(df_audit))

5000 0 0 0


In [8]:
df_app['line']

0       {"@timestamp":"2025-09-10T16:41:21.426970749Z"...
1       {"@timestamp":"2025-09-10T16:41:21.568561508Z"...
2       {"@timestamp":"2025-09-10T16:41:21.568863033Z"...
3       {"@timestamp":"2025-09-10T16:41:21.568926952Z"...
4       {"@timestamp":"2025-09-10T16:41:21.568953310Z"...
                              ...                        
4995    {"@timestamp":"2025-09-10T17:41:08.635628755Z"...
4996    {"@timestamp":"2025-09-10T17:41:08.635720825Z"...
4997    {"@timestamp":"2025-09-10T17:41:08.639424428Z"...
4998    {"@timestamp":"2025-09-10T17:41:08.674922351Z"...
4999    {"@timestamp":"2025-09-10T17:41:25.078345158Z"...
Name: line, Length: 5000, dtype: object

In [9]:
import json, re
import pandas as pd

def _maybe_json(s: str):
    if not isinstance(s, str): return None
    s = s.strip()
    if not s or s[0] not in "{[": return None
    try:
        return json.loads(s)
    except Exception:
        return None

def _get_any(obj, keys):
    for k in keys:
        cur = obj
        try:
            for part in k.split("."):
                if isinstance(cur, dict) and part in cur:
                    cur = cur[part]
                else:
                    raise KeyError
            return cur
        except Exception:
            continue
    return None

def _normalize_level(obj, line: str):
    if isinstance(obj, dict):
        v = _get_any(obj, ["level","severity","loglevel","lvl","logger_level"])
        if v is not None: return str(v).lower()
    s = (line or "").lower()
    if any(w in s for w in ["error","exception","fail","backoff","oomkilled","notready"]): return "error"
    if "warn" in s or "throttle" in s: return "warn"
    return "info"

def _extract_code(obj, line: str):
    if isinstance(obj, dict):
        v = _get_any(obj, ["status","status_code","code","http.status","response.status"])
        try:
            if v is not None: return int(v)
        except Exception:
            pass
    m = re.search(r"\s(1\d{2}|2\d{2}|3\d{2}|4\d{2}|5\d{2})\s", " " + (line or "") + " ")
    return int(m.group(1)) if m else None

def _extract_route(obj, line: str):
    if isinstance(obj, dict):
        v = _get_any(obj, ["path","route","url","request_path","http.path","request.url","endpoint"])
        if isinstance(v, str): return v.split("?")[0]
    m = re.search(r"\s(?:GET|POST|PUT|PATCH|DELETE)\s+(\S+)", " " + (line or "") + " ")
    return m.group(1) if m else None

def _label_or_json(series_or_none, objs, json_keys):
    """
    Returns a Series: prefer label column if present; otherwise pull from JSON.
    """
    if series_or_none is not None:
        return series_or_none
    vals = []
    for o in objs:
        v = _get_any(o, json_keys) if isinstance(o, dict) else None
        vals.append(v)
    return pd.Series(vals, index=objs.index if isinstance(objs, pd.Series) else None)

def project_unified_stronger(df: pd.DataFrame, source_guess: str) -> pd.DataFrame:
    # Parse JSON bodies up front
    objs = df["line"].map(_maybe_json)

    # Prefer label columns; if missing, use JSON keys commonly used by fluent-bit/vector
    ns_series   = df.get("k8s_namespace_name") or df.get("kubernetes_namespace_name")
    pod_series  = df.get("k8s_pod_name")       or df.get("kubernetes_pod_name")
    node_series = df.get("k8s_node_name")      or df.get("kubernetes_host")

    namespace = _label_or_json(ns_series,   objs, ["kubernetes.namespace_name", "k8s.namespace.name", "k8s.ns", "namespace"])
    pod       = _label_or_json(pod_series,  objs, ["kubernetes.pod_name", "k8s.pod.name", "pod"])
    node      = _label_or_json(node_series, objs, ["kubernetes.host", "kubernetes.node_name", "k8s.node.name", "node"])

    level = [_normalize_level(o, ln) for o, ln in zip(objs, df["line"])]
    code  = [_extract_code(o, ln)   for o, ln in zip(objs, df["line"])]
    route = [_extract_route(o, ln)  for o, ln in zip(objs, df["line"])]

    container_restart = df["line"].str.contains(r"\bRestarted container\b", case=False, na=False).astype(int)
    rollout_hit = df["line"].str.contains(
        r"Scaled up replica set|deployment (created|updated|rolled out)|\brollout\b",
        case=False, na=False, regex=True
    ).astype(float)

    return pd.DataFrame({
        "ts": df["ts"],
        "source": source_guess,
        "namespace": namespace,
        "pod": pod,
        "node": node,
        "level": level,
        "verb": None,
        "code": code,
        "route": route,
        "msg": df["line"].astype(str).str.slice(0, 400),
        "container_restart": container_restart,
        "rollout_in_window": rollout_hit,
    })


In [10]:
from pathlib import Path
import pandas as pd

print("Sizes -> app/infra/event/audit:",
      len(df_app), len(df_infra), len(df_evt), len(df_audit))
print("Projector in use:", project_unified.__name__)

parts = []
if not df_app.empty:   parts.append(project_unified_stronger(df_app, "app"))
if not df_infra.empty: parts.append(project_unified_stronger(df_infra, "infra"))
if not df_evt.empty:   parts.append(project_unified_stronger(df_evt, "event"))
if not df_audit.empty: parts.append(project_unified_stronger(df_audit, "audit"))
unified = pd.concat(parts, ignore_index=True)



# Dtypes & cleanup
unified["ts"] = pd.to_datetime(unified["ts"], utc=True, errors="coerce")
unified = unified.dropna(subset=["ts"]).sort_values("ts").reset_index(drop=True)

# Make sure numeric columns are numeric
for col in ("container_restart",):
    unified[col] = pd.to_numeric(unified[col], errors="coerce").fillna(0).astype("int64")
for col in ("code",):
    unified[col] = pd.to_numeric(unified[col], errors="coerce")

# Write outputs
unified_dir = Path("data/unified_logs"); unified_dir.mkdir(parents=True, exist_ok=True)
unified_path = unified_dir / "latest.parquet"
unified.to_parquet(unified_path, index=False)     # %pip install pyarrow if needed
unified_csv = unified_dir / "latest.csv"
unified.to_csv(unified_csv, index=False)

print("Unified rows:", len(unified))
print("Nulls by column:\n", unified.isna().mean().round(3))
print("Level distribution:\n", unified["level"].value_counts(dropna=False).head(10))
print("HTTP status sample:\n", unified["code"].dropna().astype(int).value_counts().head(10))
display(unified.head(5))


Sizes -> app/infra/event/audit: 5000 0 0 0


NameError: name 'project_unified' is not defined

In [None]:
print("Namespaces sample:", unified["namespace"].dropna().unique()[:10])
print("Pods sample:", unified["pod"].dropna().unique()[:10])
unified.head(5)

Namespaces sample: ['istio-system' 'nvidia-gpu-operator' 'redhat-ods-applications'
 'redhat-ods-operator']
Pods sample: ['istiod-data-science-smcp-6ffc7f559-qgwqq' 'gpu-feature-discovery-9dqf2'
 'gpu-operator-8468cd9bf-wrn7p' 'istio-egressgateway-6844bcfb86-mcwj5'
 'istio-ingressgateway-56d4957dc4-68gvd'
 'codeflare-operator-manager-744b8b6b85-dcdk7'
 'kueue-controller-manager-5b76bb944d-p47dj'
 'modelmesh-controller-77545d9594-wfcff'
 'odh-model-controller-7bdb5d4bdd-5psq7'
 'odh-notebook-controller-manager-7f6b79cfd5-mpngr']


Unnamed: 0,ts,source,namespace,pod,node,level,verb,code,route,msg,container_restart,rollout_in_window
0,2025-09-10 16:10:45.457688715+00:00,app,istio-system,istiod-data-science-smcp-6ffc7f559-qgwqq,,info,,,,"{""@timestamp"":""2025-09-10T16:10:45.457688715Z""...",0,0.0
1,2025-09-10 16:10:45.578677538+00:00,app,istio-system,istiod-data-science-smcp-6ffc7f559-qgwqq,,info,,,,"{""@timestamp"":""2025-09-10T16:10:45.578677538Z""...",0,0.0
2,2025-09-10 16:10:45.578849106+00:00,app,istio-system,istiod-data-science-smcp-6ffc7f559-qgwqq,,info,,,,"{""@timestamp"":""2025-09-10T16:10:45.578849106Z""...",0,0.0
3,2025-09-10 16:10:45.578886849+00:00,app,istio-system,istiod-data-science-smcp-6ffc7f559-qgwqq,,info,,,,"{""@timestamp"":""2025-09-10T16:10:45.578886849Z""...",0,0.0
4,2025-09-10 16:10:45.578902150+00:00,app,istio-system,istiod-data-science-smcp-6ffc7f559-qgwqq,,info,,,,"{""@timestamp"":""2025-09-10T16:10:45.578902150Z""...",0,0.0


In [None]:
def build_episodes(df: pd.DataFrame, window="10min", keys=("namespace","pod","node")):
    df = df.copy()
    df["ts"] = pd.to_datetime(df["ts"], utc=True)
    df.set_index("ts", inplace=True)
    episodes = []
    for wstart, wdf in df.groupby(pd.Grouper(freq=window)):
        if wdf.empty: continue
        wend = wstart + pd.to_timedelta(window)
        grp_cols = [k for k in keys if k in wdf.columns]
        groups = dict(tuple(wdf.groupby(grp_cols, dropna=False))) if grp_cols else {"_": wdf}
        for gkey, gdf in groups.items():
            total = len(gdf)
            errors = (gdf["level"]=="error").sum()
            err_ratio = (errors/total) if total else 0.0
            restarts = gdf.get("container_restart", pd.Series([0]*total, index=gdf.index)).sum()
            http5xx = (gdf.get("code", pd.Series(dtype=float))>=500).sum()
            rollout = 1.0 if (gdf.get("rollout_in_window", pd.Series(dtype=float))>0).any() else 0.0
            entities = {}
            for col in ["namespace","pod","node"]:
                vals = [v for v in gdf[col].astype(str).dropna().unique().tolist() if v and v!="None"]
                if vals: entities[col] = vals
            episodes.append({
                "episode_id": f"{int(wstart.value)}::{hash(str(gkey)) & 0xfffffff:07x}",
                "start": wstart, "end": wend,
                "entities": entities,
                "features": {"count": float(total), "error_ratio": float(err_ratio), "restarts": float(restarts), "http5xx": float(http5xx), "rollout_in_window": rollout},
            })
    return episodes

eps = build_episodes(unified, window="10min")
print("Episodes:", len(eps))
pd.DataFrame([{"id": e["episode_id"], **e["features"], **{f"ent_{k}": v for k,v in e["entities"].items()}} for e in eps]).head(10)


Episodes: 8


Unnamed: 0,id,count,error_ratio,restarts,http5xx,rollout_in_window
0,1757511600000000000::6294c1c,143.0,0.020979,0.0,0.0,0.0
1,1757512200000000000::6294c1c,869.0,0.050633,0.0,0.0,0.0
2,1757512800000000000::6294c1c,598.0,0.041806,0.0,0.0,0.0
3,1757513400000000000::6294c1c,849.0,0.031802,0.0,0.0,0.0
4,1757514000000000000::6294c1c,1019.0,0.042198,0.0,0.0,0.0
5,1757514600000000000::6294c1c,594.0,0.045455,0.0,0.0,0.0
6,1757515200000000000::6294c1c,705.0,0.041135,0.0,0.0,0.0
7,1757515800000000000::6294c1c,223.0,0.053812,0.0,0.0,0.0


In [None]:
# If you still have `eps` in memory from build_episodes(...):
import pandas as pd

epi_dbg = pd.DataFrame([
    {
        "id": e["episode_id"],
        "start": e["start"],
        "end": e["end"],
        "count": e["features"]["count"],
        "error_ratio": e["features"]["error_ratio"],
        "restarts": e["features"]["restarts"],
        "http5xx": e["features"]["http5xx"],
        "rollout_in_window": e["features"]["rollout_in_window"],
        "ent_namespace": ",".join(e["entities"].get("namespace", [])),
        "ent_pod": ",".join(e["entities"].get("pod", [])),
        "ent_node": ",".join(e["entities"].get("node", [])),
    }
    for e in eps
]).sort_values("start")
epi_dbg.head(10)


Unnamed: 0,id,start,end,count,error_ratio,restarts,http5xx,rollout_in_window,ent_namespace,ent_pod,ent_node
0,1757511600000000000::6294c1c,2025-09-10 13:40:00+00:00,2025-09-10 13:50:00+00:00,143.0,0.020979,0.0,0.0,0.0,,,
1,1757512200000000000::6294c1c,2025-09-10 13:50:00+00:00,2025-09-10 14:00:00+00:00,869.0,0.050633,0.0,0.0,0.0,,,
2,1757512800000000000::6294c1c,2025-09-10 14:00:00+00:00,2025-09-10 14:10:00+00:00,598.0,0.041806,0.0,0.0,0.0,,,
3,1757513400000000000::6294c1c,2025-09-10 14:10:00+00:00,2025-09-10 14:20:00+00:00,849.0,0.031802,0.0,0.0,0.0,,,
4,1757514000000000000::6294c1c,2025-09-10 14:20:00+00:00,2025-09-10 14:30:00+00:00,1019.0,0.042198,0.0,0.0,0.0,,,
5,1757514600000000000::6294c1c,2025-09-10 14:30:00+00:00,2025-09-10 14:40:00+00:00,594.0,0.045455,0.0,0.0,0.0,,,
6,1757515200000000000::6294c1c,2025-09-10 14:40:00+00:00,2025-09-10 14:50:00+00:00,705.0,0.041135,0.0,0.0,0.0,,,
7,1757515800000000000::6294c1c,2025-09-10 14:50:00+00:00,2025-09-10 15:00:00+00:00,223.0,0.053812,0.0,0.0,0.0,,,
