# ðŸ”Ž Loki â†’ Parquet â†’ Episodes (with optional RCA)

This notebook pulls logs from a tenant-aware Loki gateway (OpenShift LokiStack compatible),
normalizes them to a unified table (`latest.parquet`), builds 10â€‘minute episodes, and prints quick stats.
It includes robust auth/SSL handling and JSON-first log parsing.


## 0) Setup

In [1]:
# If needed, install deps (uncomment to run once)
# %pip install --quiet pandas numpy requests pyarrow mlxtend pyyaml


In [2]:
from pathlib import Path
import os, pandas as pd, numpy as np

# --- Storage locations
DATA_DIR = Path("data"); DATA_DIR.mkdir(parents=True, exist_ok=True)
UNIFIED_DIR = DATA_DIR / "unified_logs"; UNIFIED_DIR.mkdir(exist_ok=True, parents=True)
INCIDENTS_DIR = Path("incidents"); INCIDENTS_DIR.mkdir(exist_ok=True, parents=True)
RULES_DIR = Path("rules"); RULES_DIR.mkdir(exist_ok=True, parents=True)

# --- Time window (adjust as needed)
END   = pd.Timestamp.utcnow()
START = END - pd.Timedelta("90min")
print("Window:", START, "â†’", END)


Window: 2025-09-10 16:41:46.517605+00:00 â†’ 2025-09-10 18:11:46.517605+00:00


## 1) Loki helpers (tenantâ€‘aware + token + SSL toggle)

In [None]:
# Cell 1.1 â€” config + session + diagnostics
import pandas as pd, requests, urllib3

# ---- Config (set env vars or edit here) ----
LOKI_BASE       = os.environ.get("LOKI_BASE", "https://logging-loki-openshift-logging.apps.rhoai.ocp-poc-demo.com")
LOKI_TENANT     = os.environ.get("LOKI_TENANT", "application")   # application|infrastructure|audit
LOKI_TOKEN      = os.environ.get("LOKI_TOKEN", "<REDACTED>")                   # e.g. export LOKI_TOKEN="$(oc whoami -t)"
LOKI_ORG_ID     = os.environ.get("LOKI_ORG_ID")                  # often same as tenant (some gateways require it)
LOKI_INSECURE   = os.environ.get("LOKI_INSECURE", "true").lower() in ("1","true","yes")
LOKI_BASIC_USER = os.environ.get("LOKI_BASIC_USER")
LOKI_BASIC_PASS = os.environ.get("LOKI_BASIC_PASS")

if LOKI_INSECURE:
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

_session = requests.Session()
_default_headers = {"Accept": "application/json"}
if LOKI_TOKEN:
    _default_headers["Authorization"] = f"Bearer {LOKI_TOKEN}"
if LOKI_ORG_ID:
    _default_headers["X-Scope-OrgID"] = LOKI_ORG_ID

def _debug_response(resp):
    ct = resp.headers.get("Content-Type", "")
    preview = (resp.text or "")[:500]
    return f"HTTP {resp.status_code} CT={ct} URL={resp.url}\nBody (first 500):\n{preview}"


In [4]:
# Cell 1.2 â€” healthy ping & label discovery
def loki_ping():
    url = f"{LOKI_BASE.rstrip('/')}/api/logs/v1/{LOKI_TENANT}/loki/api/v1/labels"
    r = _session.get(url, headers=_default_headers, timeout=30, verify=not LOKI_INSECURE, allow_redirects=False)
    if not r.ok or "application/json" not in r.headers.get("Content-Type","").lower():
        raise RuntimeError("Ping failed:\n" + _debug_response(r))
    return r.json()

def loki_labels():
    url = f"{LOKI_BASE.rstrip('/')}/api/logs/v1/{LOKI_TENANT}/loki/api/v1/labels"
    r = _session.get(url, headers=_default_headers, timeout=30, verify=not LOKI_INSECURE, allow_redirects=False)
    r.raise_for_status()
    return r.json().get("data", [])

def loki_label_values(label):
    url = f"{LOKI_BASE.rstrip('/')}/api/logs/v1/{LOKI_TENANT}/loki/api/v1/label/{label}/values"
    r = _session.get(url, headers=_default_headers, timeout=30, verify=not LOKI_INSECURE, allow_redirects=False)
    r.raise_for_status()
    return r.json().get("data", [])

print(loki_ping())  # should print JSON; if not, fix token/tenant/headers
print("Some labels:", sorted(loki_labels())[:20])


{'status': 'success', 'data': ['k8s_container_name', 'k8s_namespace_name', 'k8s_node_name', 'k8s_pod_name', 'kubernetes_container_name', 'kubernetes_host', 'kubernetes_namespace_name', 'kubernetes_pod_name', 'log_type', 'openshift_log_type']}
Some labels: ['k8s_container_name', 'k8s_namespace_name', 'k8s_node_name', 'k8s_pod_name', 'kubernetes_container_name', 'kubernetes_host', 'kubernetes_namespace_name', 'kubernetes_pod_name', 'log_type', 'openshift_log_type']


In [5]:
# Cell 1.3 â€” range query with safe nanoseconds parsing
def loki_query_range(expr, start_ts, end_ts, step="15s", limit=5000, direction="forward"):
    if not LOKI_BASE:
        raise RuntimeError("Set LOKI_BASE to your Loki gateway URL.")
    url = f"{LOKI_BASE.rstrip('/')}/api/logs/v1/{LOKI_TENANT}/loki/api/v1/query_range"
    params = {
        "query": expr,
        "start": int(pd.Timestamp(start_ts).value),  # ns since epoch
        "end": int(pd.Timestamp(end_ts).value),
        "step": step,
        "limit": str(limit),
        "direction": direction,
    }
    auth = (LOKI_BASIC_USER, LOKI_BASIC_PASS) if (LOKI_BASIC_USER and LOKI_BASIC_PASS) else None
    r = _session.get(url, params=params, headers=_default_headers, timeout=60,
                     verify=not LOKI_INSECURE, auth=auth, allow_redirects=False)
    if r.is_redirect or r.status_code in (301,302,303,307,308):
        raise RuntimeError("Auth redirect detected; provide a valid token.\n" + _debug_response(r))
    if not r.ok:
        raise RuntimeError("Loki query_range failed:\n" + _debug_response(r))
    if "application/json" not in r.headers.get("Content-Type","").lower():
        raise RuntimeError("Non-JSON response from Loki:\n" + _debug_response(r))

    payload = r.json()
    data = payload.get("data", {}).get("result", [])

    def _parse_ns(ts_val):
        s = str(ts_val)
        try:
            ns = int(s)
        except ValueError:
            ns = int(float(s))
        return pd.to_datetime(ns, unit="ns", utc=True)

    rows = []
    for series in data:
        labels = series.get("metric", {})
        for ts, line in series.get("values", []):
            rows.append({"ts": _parse_ns(ts), "line": line, **labels})
    return pd.DataFrame(rows)


## 2) Define selectors (LOGQL) and pull a small window

In [6]:
# Define selectors for your environment
# Tip: run loki_labels() to see available keys. You have: log_type, k8s_* and kubernetes_* variants.
LOGQL_APP   = r'{log_type="application"}'
# If you have infra/audit on this tenant, adjust below; otherwise keep as app-only for testing.
LOGQL_INFRA = r'{log_type="application", k8s_namespace_name="openshift-kube-apiserver"}'
LOGQL_EVENT = r'{log_type="application", k8s_namespace_name="default"} |= "Warning"'
LOGQL_AUDIT = r'{log_type="application", k8s_namespace_name="openshift-apiserver"}'

df_app   = loki_query_range(LOGQL_APP,   START, END, step="30s", limit=5000)
df_infra = loki_query_range(LOGQL_INFRA, START, END, step="30s", limit=5000)
df_evt   = loki_query_range(LOGQL_EVENT, START, END, step="30s", limit=5000)
df_audit = loki_query_range(LOGQL_AUDIT, START, END, step="30s", limit=5000)
print("Sizes -> app/infra/event/audit:", len(df_app), len(df_infra), len(df_evt), len(df_audit))


Sizes -> app/infra/event/audit: 5000 0 0 0


In [7]:
# Optional: chunked query for bigger windows
def chunked_query(expr, start_ts, end_ts, chunk="15m", **kwargs):
    out = []
    t0 = pd.Timestamp(start_ts); t1 = pd.Timestamp(end_ts)
    step = pd.Timedelta(chunk)
    cur = t0
    while cur < t1:
        cur_end = min(cur + step, t1)
        df = loki_query_range(expr, cur, cur_end, **kwargs)
        out.append(df)
        cur = cur_end
    return pd.concat(out, ignore_index=True) if out else pd.DataFrame()
# Example: df_app = chunked_query(LOGQL_APP, START, END, chunk="10m", step="15s", limit=5000)


## 3) JSONâ€‘first projector (populate namespace/pod/node + basic HTTP fields)

In [8]:
import json, re

def _maybe_json(s: str):
    if not isinstance(s, str): return None
    s = s.strip()
    if not s or s[0] not in "{[": return None
    try:
        return json.loads(s)
    except Exception:
        return None

def _get_any(obj, keys):
    for k in keys:
        cur = obj
        try:
            for part in k.split("."):
                if isinstance(cur, dict) and part in cur:
                    cur = cur[part]
                else:
                    raise KeyError
            return cur
        except Exception:
            continue
    return None

def _normalize_level(obj, line: str):
    if isinstance(obj, dict):
        v = _get_any(obj, ["level","severity","loglevel","lvl","logger_level"])
        if v is not None: return str(v).lower()
    s = (line or "").lower()
    if any(w in s for w in ["error","exception","fail","backoff","oomkilled","notready"]): return "error"
    if "warn" in s or "throttle" in s: return "warn"
    return "info"

def _extract_code(obj, line: str):
    if isinstance(obj, dict):
        v = _get_any(obj, ["status","status_code","code","http.status","response.status"])
        try:
            if v is not None: return int(v)
        except Exception:
            pass
    m = re.search(r"\s(1\d{2}|2\d{2}|3\d{2}|4\d{2}|5\d{2})\s", " " + (line or "") + " ")
    return int(m.group(1)) if m else None

def _extract_route(obj, line: str):
    if isinstance(obj, dict):
        v = _get_any(obj, ["path","route","url","request_path","http.path","request.url","endpoint"])
        if isinstance(v, str): return v.split("?")[0]
    m = re.search(r"\s(?:GET|POST|PUT|PATCH|DELETE)\s+(\S+)", " " + (line or "") + " ")
    return m.group(1) if m else None

def _label_or_json(series_or_none, objs, json_keys):
    if series_or_none is not None:
        return series_or_none
    vals = []
    for o in objs:
        v = _get_any(o, json_keys) if isinstance(o, dict) else None
        vals.append(v)
    import pandas as pd
    return pd.Series(vals)

def project_unified_stronger(df: pd.DataFrame, source_guess: str) -> pd.DataFrame:
    objs = df["line"].map(_maybe_json)

    ns_series   = df.get("k8s_namespace_name") or df.get("kubernetes_namespace_name")
    pod_series  = df.get("k8s_pod_name")       or df.get("kubernetes_pod_name")
    node_series = df.get("k8s_node_name")      or df.get("kubernetes_host")

    namespace = _label_or_json(ns_series,   objs, ["kubernetes.namespace_name","k8s.namespace.name","k8s.ns","namespace"])
    pod       = _label_or_json(pod_series,  objs, ["kubernetes.pod_name","k8s.pod.name","pod"])
    node      = _label_or_json(node_series, objs, ["kubernetes.host","kubernetes.node_name","k8s.node.name","node"])

    level = [_normalize_level(o, ln) for o, ln in zip(objs, df["line"])]
    code  = [_extract_code(o, ln)    for o, ln in zip(objs, df["line"])]
    route = [_extract_route(o, ln)   for o, ln in zip(objs, df["line"])]

    container_restart = df["line"].str.contains(r"\bRestarted container\b", case=False, na=False).astype(int)
    rollout_hit = df["line"].str.contains(
        r"Scaled up replica set|deployment (created|updated|rolled out)|\brollout\b",
        case=False, na=False, regex=True
    ).astype(float)

    return pd.DataFrame({
        "ts": df["ts"],
        "source": source_guess,
        "namespace": namespace,
        "pod": pod,
        "node": node,
        "level": level,
        "verb": None,
        "code": code,
        "route": route,
        "msg": df["line"].astype(str).str.slice(0, 400),
        "container_restart": container_restart,
        "rollout_in_window": rollout_hit,
    })


## 4) Concat, write `latest.parquet`, and show quick stats

In [9]:
from pathlib import Path

print("Sizes -> app/infra/event/audit:", len(df_app), len(df_infra), len(df_evt), len(df_audit))

parts = []
if not df_app.empty:   parts.append(project_unified_stronger(df_app, "app"))
if not df_infra.empty: parts.append(project_unified_stronger(df_infra, "infra"))
if not df_evt.empty:   parts.append(project_unified_stronger(df_evt, "event"))
if not df_audit.empty: parts.append(project_unified_stronger(df_audit, "audit"))

if parts:
    unified = pd.concat(parts, ignore_index=True)
else:
    unified = pd.DataFrame(columns=[
        "ts","source","namespace","pod","node","level","verb","code","route","msg",
        "container_restart","rollout_in_window"
    ])

# Dtypes & cleanup
unified["ts"] = pd.to_datetime(unified["ts"], utc=True, errors="coerce")
unified = unified.dropna(subset=["ts"]).sort_values("ts").reset_index(drop=True)
unified["container_restart"] = pd.to_numeric(unified["container_restart"], errors="coerce").fillna(0).astype("int64")
unified["code"] = pd.to_numeric(unified["code"], errors="coerce")

# Write
unified_path = UNIFIED_DIR / "latest.parquet"
unified.to_parquet(unified_path, index=False)
unified_csv = UNIFIED_DIR / "latest.csv"
unified.to_csv(unified_csv, index=False)

print("Unified rows:", len(unified))
print("Nulls by column:\n", unified.isna().mean().round(3))
print("Level distribution:\n", unified["level"].value_counts(dropna=False).head(10))
print("HTTP status sample:\n", unified["code"].dropna().astype(int).value_counts().head(10))
unified.head(8)


Sizes -> app/infra/event/audit: 5000 0 0 0


  rollout_hit = df["line"].str.contains(


Unified rows: 5000
Nulls by column:
 ts                   0.0
source               0.0
namespace            0.0
pod                  0.0
node                 1.0
level                0.0
verb                 1.0
code                 1.0
route                1.0
msg                  0.0
container_restart    0.0
rollout_in_window    0.0
dtype: float64
Level distribution:
 level
info       2877
default    1875
error       143
debug        91
warn         14
Name: count, dtype: int64
HTTP status sample:
 Series([], Name: count, dtype: int64)


Unnamed: 0,ts,source,namespace,pod,node,level,verb,code,route,msg,container_restart,rollout_in_window
0,2025-09-10 16:41:47.499431695+00:00,app,nvidia-gpu-operator,nvidia-node-status-exporter-hz7qw,,info,,,,"{""@timestamp"":""2025-09-10T16:41:47.499431695Z""...",0,0.0
1,2025-09-10 16:41:47.503167951+00:00,app,nvidia-gpu-operator,nvidia-node-status-exporter-hz7qw,,info,,,,"{""@timestamp"":""2025-09-10T16:41:47.503167951Z""...",0,0.0
2,2025-09-10 16:41:47.506351140+00:00,app,nvidia-gpu-operator,nvidia-node-status-exporter-hz7qw,,info,,,,"{""@timestamp"":""2025-09-10T16:41:47.506351140Z""...",0,0.0
3,2025-09-10 16:41:49.308207451+00:00,app,redhat-ods-applications,odh-model-controller-7bdb5d4bdd-5psq7,,info,,,,"{""@timestamp"":""2025-09-10T16:41:49.308207451Z""...",0,0.0
4,2025-09-10 16:41:49.308454762+00:00,app,redhat-ods-applications,odh-model-controller-7bdb5d4bdd-5psq7,,info,,,,"{""@timestamp"":""2025-09-10T16:41:49.308454762Z""...",0,0.0
5,2025-09-10 16:41:49.308454762+00:00,app,redhat-ods-applications,odh-model-controller-7bdb5d4bdd-5psq7,,debug,,,,"{""@timestamp"":""2025-09-10T16:41:49.308454762Z""...",0,0.0
6,2025-09-10 16:41:49.308454762+00:00,app,redhat-ods-applications,odh-model-controller-7bdb5d4bdd-5psq7,,info,,,,"{""@timestamp"":""2025-09-10T16:41:49.308454762Z""...",0,0.0
7,2025-09-10 16:41:49.308476949+00:00,app,redhat-ods-applications,odh-model-controller-7bdb5d4bdd-5psq7,,info,,,,"{""@timestamp"":""2025-09-10T16:41:49.308476949Z""...",0,0.0


## 5) Build 10â€‘minute episodes (namespace/pod/node groups)

In [10]:
def build_episodes(df: pd.DataFrame, window="10min", keys=("namespace","pod","node")):
    df = df.copy()
    df["ts"] = pd.to_datetime(df["ts"], utc=True)
    df.set_index("ts", inplace=True)
    episodes = []
    for wstart, wdf in df.groupby(pd.Grouper(freq=window)):
        if wdf.empty: continue
        wend = wstart + pd.to_timedelta(window)
        grp_cols = [k for k in keys if k in wdf.columns]
        groups = dict(tuple(wdf.groupby(grp_cols, dropna=False))) if grp_cols else {"_": wdf}
        for gkey, gdf in groups.items():
            total = len(gdf)
            errors = (gdf["level"]=="error").sum()
            err_ratio = (errors/total) if total else 0.0
            restarts = gdf.get("container_restart", pd.Series([0]*total, index=gdf.index)).sum()
            http5xx = (gdf.get("code", pd.Series(dtype=float))>=500).sum()
            rollout = 1.0 if (gdf.get("rollout_in_window", pd.Series(dtype=float))>0).any() else 0.0
            entities = {}
            for col in ["namespace","pod","node"]:
                vals = [v for v in gdf[col].astype(str).dropna().unique().tolist() if v and v!="None"]
                if vals: entities[col] = vals
            episodes.append({
                "episode_id": f"{int(wstart.value)}::{hash(str(gkey)) & 0xfffffff:07x}",
                "start": wstart, "end": wend,
                "entities": entities,
                "features": {"count": float(total), "error_ratio": float(err_ratio), "restarts": float(restarts), "http5xx": float(http5xx), "rollout_in_window": rollout},
            })
    return episodes

eps = build_episodes(unified, window="10min")
print("Episodes:", len(eps))
import pandas as pd
epi_dbg = pd.DataFrame([
    {"id": e["episode_id"], **e["features"], 
     "ent_namespace": ",".join(e["entities"].get("namespace", [])),
     "ent_pod": ",".join(e["entities"].get("pod", [])),
     "ent_node": ",".join(e["entities"].get("node", [])),}
    for e in eps
]).sort_values("id")
epi_dbg.head(12)


Episodes: 67


Unnamed: 0,id,count,error_ratio,restarts,http5xx,rollout_in_window,ent_namespace,ent_pod,ent_node
2,1757522400000000000::045587e,24.0,0.0,0.0,0.0,0.0,nvidia-gpu-operator,gpu-feature-discovery-9dqf2,
5,1757522400000000000::0678d23,3.0,0.0,0.0,0.0,0.0,redhat-ods-applications,kueue-controller-manager-5b76bb944d-p47dj,
1,1757522400000000000::0bfe967,5.0,0.0,0.0,0.0,0.0,istio-system,istiod-data-science-smcp-6ffc7f559-qgwqq,
0,1757522400000000000::1017c48,1.0,0.0,0.0,0.0,0.0,istio-system,istio-ingressgateway-56d4957dc4-68gvd,
8,1757522400000000000::2c6ecbd,32.0,0.0,0.0,0.0,0.0,redhat-ods-applications,rhods-dashboard-79f7447f6d-xl5w8,
3,1757522400000000000::2d29ce9,100.0,0.0,0.0,0.0,0.0,nvidia-gpu-operator,gpu-operator-8468cd9bf-wrn7p,
7,1757522400000000000::52bfd5c,32.0,0.0,0.0,0.0,0.0,redhat-ods-applications,rhods-dashboard-79f7447f6d-5q7p5,
4,1757522400000000000::d8be8be,27.0,0.0,0.0,0.0,0.0,nvidia-gpu-operator,nvidia-node-status-exporter-hz7qw,
9,1757522400000000000::da6958a,50.0,0.02,0.0,0.0,0.0,redhat-ods-operator,rhods-operator-6fd6c57948-78jgv,
6,1757522400000000000::ef22124,384.0,0.054688,0.0,0.0,0.0,redhat-ods-applications,odh-model-controller-7bdb5d4bdd-5psq7,
