# 🔎 Loki (multi-tenant) → Parquet → Episodes (Auto-load)

In [1]:
# %pip install --quiet pandas numpy requests pyarrow
from pathlib import Path
import os, pandas as pd, numpy as np

# Local output locations. Both directories are created up front so that later
# cells can write into them without checking for existence.
DATA_DIR = Path("data")
DATA_DIR.mkdir(parents=True, exist_ok=True)
UNIFIED_DIR = DATA_DIR / "unified_logs"
UNIFIED_DIR.mkdir(parents=True, exist_ok=True)

# Query window: the six hours ending "now", as timezone-aware UTC timestamps.
# Timestamp.now(tz="UTC") is the supported replacement for the deprecated
# Timestamp.utcnow() and yields the same tz-aware value.
END   = pd.Timestamp.now(tz="UTC")
START = END - pd.Timedelta("6h")
print("Window:", START, "→", END)


Window: 2025-09-10 13:22:19.780245+00:00 → 2025-09-10 19:22:19.780245+00:00


## Step 1: Helpers

In [None]:
import requests, urllib3

# Connection settings come from the environment so that no credential has to
# live in the notebook itself.
#   LOKI_BASE     - base URL of the Loki gateway (a trailing "/" is dropped so
#                   that URL building never produces "//").
#   LOKI_TOKEN    - bearer token; when unset or blank no Authorization header
#                   is sent at all.
#   LOKI_INSECURE - "1"/"true"/"yes" turns TLS certificate verification off.
LOKI_BASE       = os.environ.get("LOKI_BASE", "https://logging-loki-openshift-logging.apps.rhoai.ocp-poc-demo.com").rstrip("/")
LOKI_TOKEN      = os.environ.get("LOKI_TOKEN", "").strip()
# NOTE(review): verification is OFF by default here; export LOKI_INSECURE=false
# when the gateway presents a certificate the client can validate.
LOKI_INSECURE   = os.environ.get("LOKI_INSECURE", "true").strip().lower() in ("1","true","yes")

if LOKI_INSECURE:
    # Unverified HTTPS requests would otherwise emit one warning per call.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# One shared session and header set reused by every Loki helper below.
_session = requests.Session()
_headers = {"Accept": "application/json"}
if LOKI_TOKEN:
    _headers["Authorization"] = f"Bearer {LOKI_TOKEN}"

def _debug_response(resp):
    return f"HTTP {resp.status_code} {resp.url}\n{(resp.text or '')[:300]}"


In [3]:
def loki_ping_tenant(tenant: str, timeout: float = 30):
    """Request the label list of one Loki tenant.

    Args:
        tenant: Tenant path segment inserted into the gateway URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled gateway would block the notebook indefinitely.

    Returns:
        The decoded JSON body of the labels endpoint.

    Raises:
        Whatever ``raise_for_status()`` raises for a non-success status, plus
        any connection or timeout error raised by the session.
    """
    url = f"{LOKI_BASE}/api/logs/v1/{tenant}/loki/api/v1/labels"
    r = _session.get(url, headers=_headers, verify=not LOKI_INSECURE, timeout=timeout)
    r.raise_for_status()
    return r.json()


In [4]:
def _loki_ts_to_ns(ts):
    """Convert one Loki timestamp to integer nanoseconds without float rounding."""
    try:
        # Log entries carry the timestamp as a decimal string of nanoseconds;
        # int() keeps every digit, whereas going through float() would round
        # to the nearest representable double and shift the timestamp.
        return int(ts)
    except (TypeError, ValueError):
        # Fallback for values that are not plain integers (e.g. "1.5e9").
        return int(float(ts))


def loki_query_range_tenant(tenant: str, expr, start_ts, end_ts, step="15s", limit=5000, timeout=30):
    """Run one ``query_range`` request against a tenant and tabulate the result.

    A single request is made; at most ``limit`` entries are requested and no
    pagination is attempted, so a result with exactly ``limit`` rows may be
    truncated.

    Args:
        tenant: Tenant path segment inserted into the gateway URL.
        expr: Query expression sent as the ``query`` parameter.
        start_ts, end_ts: Window bounds; anything ``pd.Timestamp`` accepts.
            They are sent as integer nanoseconds since the epoch.
        step: Value of the ``step`` parameter.
        limit: Maximum number of entries to request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A DataFrame with a tz-aware UTC ``ts`` column, a ``line`` column and
        one column per label. An empty result still has ``ts`` and ``line``.

    Raises:
        RuntimeError: If the response status is not OK; the message contains
            the status, URL and the start of the body.
    """
    url = f"{LOKI_BASE}/api/logs/v1/{tenant}/loki/api/v1/query_range"
    params = {
        "query": expr,
        "start": int(pd.Timestamp(start_ts).value),
        "end": int(pd.Timestamp(end_ts).value),
        "step": step,
        "limit": str(limit),
    }
    r = _session.get(url, params=params, headers=_headers, verify=not LOKI_INSECURE, timeout=timeout)
    if not r.ok:
        raise RuntimeError(_debug_response(r))
    data = (r.json().get("data") or {}).get("result") or []
    rows = []
    for series in data:
        # Log-stream results keep their labels under "stream"; "metric" is
        # still honoured for results shaped like metric series.
        raw_labels = series.get("stream") or series.get("metric") or {}
        # A label literally named "ts" or "line" must not overwrite the
        # columns that hold the entry itself.
        labels = {k: v for k, v in raw_labels.items() if k not in ("ts", "line")}
        for ts, line in series.get("values", []):
            rows.append({"ts": _loki_ts_to_ns(ts), "line": line, **labels})
    if not rows:
        # Same shape as the fallback frame used when a fetch fails.
        return pd.DataFrame(columns=["ts", "line"])
    frame = pd.DataFrame(rows)
    # Convert the whole column at once instead of once per row.
    frame["ts"] = pd.to_datetime(frame["ts"], unit="ns", utc=True)
    return frame


## Step 2: Safe fetch

In [5]:
def tenant_wildcard_selector(tenant: str):
    """Build a match-everything stream selector for a tenant.

    The tenant's label list is fetched and the first label found in a fixed
    preference order is used with a ``=~".+"`` matcher. If the label lookup
    fails for any reason, or none of the preferred labels is present, the
    selector falls back to ``job``.
    """
    preferred = (
        "k8s_namespace_name", "namespace",
        "k8s_pod_name", "pod",
        "k8s_node_name", "node",
        "job", "log_type",
    )
    try:
        available = set(loki_ping_tenant(tenant).get("data", []))
    except Exception:
        # Best effort: an unreachable labels endpoint just means "no labels".
        available = set()
    chosen = next((name for name in preferred if name in available), None)
    if chosen is None:
        return '{job=~".+"}'
    return f'{{{chosen}=~".+"}}'

def safe_fetch(tenant: str, start, end):
    """Fetch one tenant's logs for a window without letting errors escape.

    The selector and the resulting row count are printed as progress. If
    anything in the lookup or query raises, the error is printed and an empty
    frame with ``ts`` and ``line`` columns is returned instead.
    """
    try:
        selector = tenant_wildcard_selector(tenant)
        print(f"[{tenant}] {selector}")
        fetched = loki_query_range_tenant(tenant, selector, start, end)
        print(f"[{tenant}] rows {len(fetched)}")
    except Exception as err:
        print(f"[{tenant}] failed → {err}")
        fetched = pd.DataFrame(columns=["ts","line"])
    return fetched

# Pull the same window from each of the three tenants. safe_fetch never raises
# for ordinary errors, so a failing tenant simply yields an empty frame.
df_app = safe_fetch("application", start=START, end=END)
df_infra = safe_fetch("infrastructure", start=START, end=END)
df_audit = safe_fetch("audit", start=START, end=END)


[application] {k8s_namespace_name=~".+"}
[application] rows 5000
[infrastructure] {k8s_namespace_name=~".+"}
[infrastructure] rows 5000
[audit] {k8s_node_name=~".+"}
[audit] rows 5000


## Step 3: Projector

In [6]:
import json, re

def _to_text(v):
    if isinstance(v,str): return v
    try: return json.dumps(v)
    except: return str(v)

def project_unified_stronger(df, src):
    """Project a fetched frame onto the unified four-column layout.

    The output has ``ts`` (taken from ``df`` when present), ``source`` (the
    given ``src`` on every row), ``msg`` (each ``line`` value rendered as text,
    stored with the pandas ``string`` dtype) and ``level`` (always ``"info"``).
    """
    messages = df["line"].map(_to_text)
    projected = {
        "ts": df.get("ts"),
        "source": src,
        "msg": messages.astype("string"),
        "level": "info",
    }
    return pd.DataFrame(projected)


## Step 4: Concat/write

In [7]:
# Project every non-empty tenant frame onto the unified layout, tagging each
# with a short source name, then stack them into one frame.
parts = [
    project_unified_stronger(frame, label)
    for label, frame in (("app", df_app), ("infra", df_infra), ("audit", df_audit))
    if not frame.empty
]
if parts:
    unified = pd.concat(parts, ignore_index=True)
else:
    # Nothing was fetched: keep the expected columns so later cells still work.
    unified = pd.DataFrame(columns=["ts","source","msg","level"])

# Persist the snapshot that the episodes cell can reload later.
unified.to_parquet(UNIFIED_DIR/"latest.parquet", index=False)
print("Unified rows:", len(unified))


Unified rows: 15000


## Step 5: Episodes auto-load

In [8]:
# Reuse the in-memory frame when it has rows; otherwise fall back to the last
# Parquet snapshot so this cell can run without re-fetching from Loki.
if 'unified' not in globals() or unified.empty:
    if (UNIFIED_DIR/"latest.parquet").exists():
        unified = pd.read_parquet(UNIFIED_DIR/"latest.parquet")
        print("Loaded unified from file:", unified.shape)
    elif 'unified' not in globals():
        # Nothing in memory and nothing on disk: define an empty frame so the
        # code below reports zero episodes instead of raising NameError.
        unified = pd.DataFrame(columns=["ts","source","msg","level"])

def build_episodes(df, window="10min"):
    """Count rows per fixed time window.

    Args:
        df: Frame with a ``ts`` column; values are parsed as UTC timestamps and
            rows whose ``ts`` cannot be parsed are ignored. ``df`` itself is
            not modified.
        window: Pandas frequency string giving the window length.

    Returns:
        A list of ``{"start": <window start>, "count": <rows>}`` dicts in
        chronological order, containing only windows with at least one row.
        An empty frame gives an empty list.
    """
    if df.empty:
        return []
    stamps = pd.to_datetime(df["ts"], utc=True, errors="coerce").dropna()
    if stamps.empty:
        return []
    # Count on the timestamps alone: the result then does not depend on which
    # other columns the frame has (a frame holding only ``ts`` is counted too),
    # and no per-window sub-frame has to be built.
    per_window = (
        pd.Series(1, index=pd.DatetimeIndex(stamps))
        .groupby(pd.Grouper(freq=window))
        .size()
    )
    return [
        {"start": start, "count": int(n)}
        for start, n in per_window.items()
        if n > 0
    ]

# Bucket the unified log rows into fixed windows and report how many
# non-empty windows were found.
eps = build_episodes(unified)
print("Episodes:", len(eps))
# The trailing expression renders the first few episodes as a table.
pd.DataFrame(eps).head(5)


Episodes: 2


Unnamed: 0,start,count
0,2025-09-10 19:10:00+00:00,2407
1,2025-09-10 19:20:00+00:00,12593
