In [5]:
# Inspect transactions.jsonl (single cell)
import json
from itertools import islice

EXPECTED = [
    "ts","timestamp","service_id","peer_id",
    "req_count","error_count","p50_latency_ms","p95_latency_ms",
    "bytes_in","bytes_out"
]

def inspect_jsonl(path: str, n: int = 5):
    keys_union, samples = set(), []
    with open(path, "r", encoding="utf-8") as f:
        for line in islice(f, n):
            if not line.strip(): 
                continue
            obj = json.loads(line)
            samples.append(obj)
            keys_union.update(obj.keys())
    keys_union = sorted(keys_union)
    print("=== HEADER / TOP-LEVEL KEYS ===")
    print(keys_union)
    print("\n=== EXPECTED FIELD CHECK ===")
    present = [k for k in EXPECTED if k in keys_union]
    missing = [k for k in EXPECTED if k not in keys_union]
    print("Present:", present)
    print("Missing:", missing)
    print(f"\n=== FIRST {len(samples)} ROWS ===")
    for i, obj in enumerate(samples, 1):
        print(f"\n--- Row {i} ---")
        print({k: obj.get(k) for k in keys_union})

inspect_jsonl("transactions.jsonl", n=5)


=== HEADER / TOP-LEVEL KEYS ===
['tenant/id', 'transaction/consumer/id', 'transaction/consumer/name', 'transaction/cost', 'transaction/data', 'transaction/id', 'transaction/response', 'transaction/supplier/id', 'transaction/supplier/name', 'transaction/time']

=== EXPECTED FIELD CHECK ===
Present: []
Missing: ['ts', 'timestamp', 'service_id', 'peer_id', 'req_count', 'error_count', 'p50_latency_ms', 'p95_latency_ms', 'bytes_in', 'bytes_out']

=== FIRST 5 ROWS ===

--- Row 1 ---
{'tenant/id': 'DEMO', 'transaction/consumer/id': 'SELENE', 'transaction/consumer/name': 'Selene Customer Warehouse', 'transaction/cost': 41.5, 'transaction/data': 0.3, 'transaction/id': 't0x78b44bd3-GWVR7-O9HZF', 'transaction/response': 'success', 'transaction/supplier/id': 'AWS', 'transaction/supplier/name': 'Amazon Web Services', 'transaction/time': '2024-06-01 00:00:00'}

--- Row 2 ---
{'tenant/id': 'DEMO', 'transaction/consumer/id': 'SELENE', 'transaction/consumer/name': 'Selene Customer Warehouse', 'transact

In [6]:
# Build hourly features from transactions.jsonl (single cell)
import json
from collections import defaultdict
from datetime import datetime
import numpy as np

# --- helpers ---
def floor_to_hour(ts: str):
    """Parse timestamp and floor to the hour, return 'YYYY-MM-DD HH:00:00' or None."""
    if not ts:
        return None
    fmts = [
        "%Y-%m-%d %H:%M:%S",
        "%Y-%m-%dT%H:%M:%S",
        "%Y-%m-%d %H:%M:%S.%f",
        "%Y-%m-%dT%H:%M:%S.%f",
    ]
    for fmt in fmts:
        try:
            dt = datetime.strptime(ts, fmt)
            dt = dt.replace(minute=0, second=0, microsecond=0)
            return dt.strftime("%Y-%m-%d %H:%M:%S")
        except Exception:
            continue
    return None

def is_success(resp):
    """Return True if response indicates success."""
    if resp is None:
        return False
    s = str(resp).strip().lower()
    return s in {"success", "ok", "200", "true", "passed"}

# --- aggregate per (consumer_id, supplier_id, hour) ---
agg = defaultdict(lambda: {
    "n": 0,
    "success": 0,
    "cost_sum": 0.0,
    "cost_sqsum": 0.0,   # optional variance later
    "data_sum": 0.0,
    "data_sqsum": 0.0,   # optional variance later
})

path = "transactions.jsonl"  # adjust if needed
rows_read = 0
with open(path, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue
        obj = json.loads(line)

        ts = obj.get("transaction/time")
        bucket = floor_to_hour(ts)
        if bucket is None:
            continue

        consumer = obj.get("transaction/consumer/id") or ""
        supplier = obj.get("transaction/supplier/id") or ""
        if not consumer or not supplier:
            continue

        cost = obj.get("transaction/cost", 0.0) or 0.0
        data = obj.get("transaction/data", 0.0) or 0.0
        resp = obj.get("transaction/response")

        key = (consumer, supplier, bucket)
        a = agg[key]
        a["n"] += 1
        a["success"] += 1 if is_success(resp) else 0
        a["cost_sum"] += float(cost)
        a["cost_sqsum"] += float(cost) ** 2
        a["data_sum"] += float(data)
        a["data_sqsum"] += float(data) ** 2
        rows_read += 1

# --- build feature matrix ---
keys = []
rows = []
for key, a in agg.items():
    n = float(a["n"])
    succ = float(a["success"])
    err = max(n - succ, 0.0)
    err_rate = (err / n) if n > 0 else 0.0

    cost_sum = a["cost_sum"]
    data_sum = a["data_sum"]
    cost_mean = (cost_sum / n) if n > 0 else 0.0
    data_mean = (data_sum / n) if n > 0 else 0.0

    # features: [req_count, error_rate, cost_sum, cost_mean, data_sum, data_mean]
    feat = [n, err_rate, cost_sum, cost_mean, data_sum, data_mean]
    rows.append(feat)
    keys.append(key)

X = np.array(rows, dtype=float) if rows else np.zeros((0, 6), dtype=float)

print(f"Rows read from JSONL: {rows_read}")
print(f"Aggregated windows: {len(keys)}")
print(f"Feature matrix shape: {X.shape}  # [samples, features]")

# show a small preview
for i in range(min(5, len(keys))):
    (consumer, supplier, bucket) = keys[i]
    print(f"{i:02d} key={keys[i]}  feats={X[i].tolist()}")


Rows read from JSONL: 7254656
Aggregated windows: 1540175
Feature matrix shape: (1540175, 6)  # [samples, features]
00 key=('SELENE', 'AWS', '2024-06-01 00:00:00')  feats=[71.0, 0.0, 1630.9500000000007, 22.97112676056339, 25.650000000000002, 0.36126760563380284]
01 key=('SELENE', 'DBRCKS', '2024-06-01 00:00:00')  feats=[71.0, 0.0, 3083.45, 43.42887323943662, 85.50000000000001, 1.2042253521126762]
02 key=('SELENE', 'SNWFLK', '2024-06-01 00:00:00')  feats=[71.0, 0.0, 1618.500000000001, 22.795774647887338, 65.4, 0.9211267605633804]
03 key=('FRDETCT', 'SELENE', '2024-06-01 00:00:00')  feats=[1.0, 0.0, 116.2, 116.2, 1.65, 1.65]
04 key=('HMFRP', 'AZURE', '2024-06-01 00:00:00')  feats=[71.0, 0.0, 1361.2, 19.171830985915495, 8.250000000000002, 0.11619718309859157]


In [7]:
# Task 1: Naive Bayes baseline (weak labels + metrics)
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import average_precision_score, precision_recall_curve, f1_score, accuracy_score

assert 'X' in globals() and 'keys' in globals(), "Run the feature cell first."

# Features: [req_count, error_rate, cost_sum, cost_mean, data_sum, data_mean]
er = X[:, 1]
cmean = X[:, 3]
dmean = X[:, 5]

def robust_z(x):
    med = np.median(x)
    mad = np.median(np.abs(x - med)) + 1e-9
    return np.abs(x - med) / (1.4826 * mad)

z_c = robust_z(cmean)
z_d = robust_z(dmean)

# Weak labels: anomaly if high error_rate OR cost/data mean outlier
y = ((er > 0.10) | (z_c > 3.0) | (z_d > 3.0)).astype(int)

# Time-based split (80/20) by hour bucket string
idx = np.arange(len(keys))
idx = idx[np.argsort([k[2] for k in keys])]
cut = int(len(idx) * 0.8)
train_idx, valid_idx = idx[:cut], idx[cut:] if cut > 0 else (idx, idx)

Xtr, ytr = X[train_idx], y[train_idx]
Xva, yva = X[valid_idx], y[valid_idx]

model = Pipeline([
    ("scaler", StandardScaler()),
    ("nb", GaussianNB())
]).fit(Xtr, ytr)

if len(Xva) and len(np.unique(yva)) > 1:
    proba = model.predict_proba(Xva)[:, 1]
    pr_auc = average_precision_score(yva, proba)
    prec, rec, thr = precision_recall_curve(yva, proba)
    f1s = (2 * prec * rec) / (prec + rec + 1e-9)
    bi = int(np.argmax(f1s))
    best_thr = thr[bi-1] if bi > 0 and (bi-1) < len(thr) else 0.5
    yhat = (proba >= best_thr).astype(int)
    acc_pos = accuracy_score(yva[yva == 1], yhat[yva == 1]) if np.any(yva == 1) else 0.0
    acc_mean = accuracy_score(yva, yhat)
    print({
        "valid_size": int(len(yva)),
        "pos_rate_valid": float(yva.mean()),
        "PR_AUC": float(pr_auc),
        "F1": float(f1_score(yva, yhat)),
        "Precision": float(prec[bi]),
        "Recall": float(rec[bi]),
        "Acc_pos": float(acc_pos),
        "Acc_mean": float(acc_mean),
        "threshold": float(best_thr)
    })
else:
    print({"note": "Validation split has only one class; try adjusting weak labels or split."})


{'valid_size': 308035, 'pos_rate_valid': 0.0848767185547097, 'PR_AUC': 0.9774733165108653, 'F1': 0.9235196292766784, 'Precision': 0.9028533849694933, 'Recall': 0.9451902849493211, 'Acc_pos': 0.9451902849493211, 'Acc_mean': 0.9867125488986641, 'threshold': 0.9806067807015026}


In [8]:
# Save Task 1 metrics to CSV + JSON (one-row table)
import csv, json
import numpy as np
from sklearn.metrics import average_precision_score, precision_recall_curve, f1_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline

assert 'X' in globals() and 'keys' in globals(), "Run feature cell first."

# Rebuild weak labels (same as before)
er = X[:,1]; cmean = X[:,3]; dmean = X[:,5]
def robust_z(x):
    med = np.median(x); mad = np.median(np.abs(x - med)) + 1e-9
    return np.abs(x - med) / (1.4826 * mad)
z_c = robust_z(cmean); z_d = robust_z(dmean)
y = ((er > 0.10) | (z_c > 3.0) | (z_d > 3.0)).astype(int)

# Time split
idx = np.arange(len(keys)); idx = idx[np.argsort([k[2] for k in keys])]
cut = int(len(idx)*0.8)
train_idx, valid_idx = idx[:cut], idx[cut:]
Xtr, ytr = X[train_idx], y[train_idx]
Xva, yva = X[valid_idx], y[valid_idx]

# Train NB
model = Pipeline([("scaler", StandardScaler()), ("nb", GaussianNB())]).fit(Xtr, ytr)

# Metrics
proba = model.predict_proba(Xva)[:,1]
pr_auc = average_precision_score(yva, proba)
prec, rec, thr = precision_recall_curve(yva, proba)
f1s = (2*prec*rec)/(prec+rec+1e-9)
bi = int(np.argmax(f1s))
best_thr = thr[bi-1] if bi>0 and (bi-1)<len(thr) else 0.5
yhat = (proba >= best_thr).astype(int)
acc_pos = accuracy_score(yva[yva==1], yhat[yva==1]) if np.any(yva==1) else 0.0
acc_mean = accuracy_score(yva, yhat)
row = {
    "Feature Set / Variant": "All features (NB)",
    "Precision": float(prec[bi]),
    "Recall": float(rec[bi]),
    "F1": float(f1_score(yva, yhat)),
    "PR-AUC": float(pr_auc),
    "Accuracy (+ve)": float(acc_pos),
    "Accuracy (mean)": float(acc_mean),
    "Threshold": float(best_thr),
    "Valid size": int(len(yva)),
    "Pos rate (valid)": float(yva.mean())
}

# Write CSV
csv_path = "task1_results.csv"
with open(csv_path, "w", newline="", encoding="utf-8") as f:
    w = csv.DictWriter(f, fieldnames=list(row.keys()))
    w.writeheader(); w.writerow(row)

# Write JSON
json_path = "task1_results.json"
with open(json_path, "w", encoding="utf-8") as f:
    json.dump({"results": [row]}, f, indent=2)

print("Saved:", csv_path, "and", json_path)
print(row)


Saved: task1_results.csv and task1_results.json
{'Feature Set / Variant': 'All features (NB)', 'Precision': 0.9028533849694933, 'Recall': 0.9451902849493211, 'F1': 0.9235196292766784, 'PR-AUC': 0.9774733165108653, 'Accuracy (+ve)': 0.9451902849493211, 'Accuracy (mean)': 0.9867125488986641, 'Threshold': 0.9806067807015026, 'Valid size': 308035, 'Pos rate (valid)': 0.0848767185547097}


In [9]:
# Task 2 — Service similarity (cosine over consumer/supplier profiles)
import json
import numpy as np
from collections import defaultdict
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity

assert 'X' in globals() and 'keys' in globals(), "Run the feature cell first."

k = 5  # top-k neighbors

# Accumulate per service, separately for consumer and supplier roles.
# For each service we keep:
# [n_cons, err_wsum_cons, cost_wsum_cons, data_wsum_cons,
#  n_sup,  err_wsum_sup,  cost_wsum_sup,  data_wsum_sup]
acc = defaultdict(lambda: np.zeros(8, dtype=float))

for (cons, supp, _), row in zip(keys, X):
    n, err_rate, _cost_sum, cost_mean, _data_sum, data_mean = row

    # consumer role
    a = acc[cons]
    a[0] += n
    a[1] += err_rate * n
    a[2] += cost_mean * n
    a[3] += data_mean * n

    # supplier role
    b = acc[supp]
    b[4] += n
    b[5] += err_rate * n
    b[6] += cost_mean * n
    b[7] += data_mean * n

services = sorted(acc.keys())
mat = np.zeros((len(services), 8), dtype=float)

# Convert weighted sums to means where appropriate
for i, s in enumerate(services):
    v = acc[s].copy()
    # consumer means
    v[1] = v[1] / (v[0] + 1e-9)
    v[2] = v[2] / (v[0] + 1e-9)
    v[3] = v[3] / (v[0] + 1e-9)
    # supplier means
    v[5] = v[5] / (v[4] + 1e-9)
    v[6] = v[6] / (v[4] + 1e-9)
    v[7] = v[7] / (v[4] + 1e-9)
    mat[i] = v

# Cosine similarity on normalized vectors
Xn = normalize(mat)
S = cosine_similarity(Xn)

neighbors = {}
for i, s in enumerate(services):
    order = np.argsort(-S[i])
    top = [(services[j], float(S[i, j])) for j in order[1:k+1]]  # skip self
    neighbors[s] = top

print("Services:", len(services), "Vector shape:", mat.shape)
for i in range(min(5, len(services))):
    print(services[i], "->", neighbors[services[i]][:k])

with open("task2_neighbors.json", "w", encoding="utf-8") as f:
    json.dump({"k": k, "neighbors": neighbors}, f, indent=2)
print("Saved task2_neighbors.json")


Services: 87 Vector shape: (87, 8)
ABTEST -> [('NOTIFY', 0.9999999999809024), ('MKTDB', 0.9999999999349333), ('DYNPRC', 0.9999999998786389), ('XCHATTR', 0.9999999997867001), ('TAXCALC', 0.9999999996770227)]
AIRFLW -> [('HMFRP', 0.999999998609385), ('YELLOWS', 0.9999999953721767), ('KAFKA', 0.9999999888122127), ('DDSD', 0.9999999793005887), ('ELASTIC', 0.9999996950612356)]
ANALAPI -> [('AUTOML', 0.9999992614814076), ('RISKMG', 0.9999991143549807), ('BILLING', 0.9999989627174771), ('GLOBCMP', 0.9999989224119094), ('SUPCHN', 0.9999988448202543)]
APIGWY -> [('SNAPINT', 0.9999999992863867), ('MULTILNG', 0.9999999971777137), ('SLSDB', 0.9999999962098263), ('TAXCALC', 0.9999999958172383), ('XCHATTR', 0.9999999955270946)]
ATLAS -> [('DATAHUB', 0.9999896440875172), ('MDLREG', 0.9999718818442569), ('JPNXPP', 0.999951212040954), ('SPARK', 0.9999474668239722), ('PANDRA', 0.9999318254451391)]
Saved task2_neighbors.json


In [10]:
# Task 2 tables: neighbors CSV + KMeans clustering (silhouette) summary
import json, csv
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

assert 'services' in globals() and 'mat' in globals(), "Run the Task 2 cell first."

# 1) Neighbors CSV (top-5)
Xn = normalize(mat)
S = cosine_similarity(Xn)

k = 5
rows = []
for i, s in enumerate(services):
    order = np.argsort(-S[i])
    nbrs = [(services[j], float(S[i,j])) for j in order[1:k+1]]
    rows.append({
        "service": s,
        "n1": nbrs[0][0], "s1": nbrs[0][1],
        "n2": nbrs[1][0], "s2": nbrs[1][1],
        "n3": nbrs[2][0], "s3": nbrs[2][1],
        "n4": nbrs[3][0], "s4": nbrs[3][1],
        "n5": nbrs[4][0], "s5": nbrs[4][1],
    })

with open("task2_neighbors.csv", "w", newline="", encoding="utf-8") as f:
    fieldnames = ["service","n1","s1","n2","s2","n3","s3","n4","s4","n5","s5"]
    w = csv.DictWriter(f, fieldnames=fieldnames)
    w.writeheader()
    for r in rows: w.writerow(r)

# 2) Clustering summary (silhouette across k)
ks = [3,4,5,6,7,8,9,10]
summary = []
best = (None, -1.0, None)  # (k, score, labels)

for k_ in ks:
    km = KMeans(n_clusters=k_, random_state=42, n_init=10)
    labels = km.fit_predict(Xn)
    sil = silhouette_score(Xn, labels, metric="cosine")
    summary.append({"Representation":"Role means (8-dim)", "Method":"KMeans", "k":k_, "Silhouette":float(sil)})
    if sil > best[1]:
        best = (k_, sil, labels)

with open("task2_clustering_results.csv", "w", newline="", encoding="utf-8") as f:
    fieldnames = ["Representation","Method","k","Silhouette"]
    w = csv.DictWriter(f, fieldnames=fieldnames)
    w.writeheader()
    for r in summary: w.writerow(r)

# Save best-k cluster assignments
best_k, best_sil, best_labels = best
with open(f"task2_clusters_k{best_k}.csv", "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(["service","cluster"])
    for s, lbl in zip(services, best_labels):
        w.writerow([s, int(lbl)])

print("Saved: task2_neighbors.csv")
print("Saved: task2_clustering_results.csv")
print(f"Best k: {best_k}  Silhouette(cosine): {best_sil:.4f}  -> task2_clusters_k{best_k}.csv")


Saved: task2_neighbors.csv
Saved: task2_clustering_results.csv
Best k: 8  Silhouette(cosine): 0.9835  -> task2_clusters_k8.csv


In [11]:
# Task 2 tables: neighbors CSV + KMeans clustering (silhouette) summary
import json, csv
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

assert 'services' in globals() and 'mat' in globals(), "Run the Task 2 cell first."

# 1) Neighbors CSV (top-5)
Xn = normalize(mat)
S = cosine_similarity(Xn)

k = 5
rows = []
for i, s in enumerate(services):
    order = np.argsort(-S[i])
    nbrs = [(services[j], float(S[i,j])) for j in order[1:k+1]]
    rows.append({
        "service": s,
        "n1": nbrs[0][0], "s1": nbrs[0][1],
        "n2": nbrs[1][0], "s2": nbrs[1][1],
        "n3": nbrs[2][0], "s3": nbrs[2][1],
        "n4": nbrs[3][0], "s4": nbrs[3][1],
        "n5": nbrs[4][0], "s5": nbrs[4][1],
    })

with open("task2_neighbors.csv", "w", newline="", encoding="utf-8") as f:
    fieldnames = ["service","n1","s1","n2","s2","n3","s3","n4","s4","n5","s5"]
    w = csv.DictWriter(f, fieldnames=fieldnames)
    w.writeheader()
    for r in rows: w.writerow(r)

# 2) Clustering summary (silhouette across k)
ks = [3,4,5,6,7,8,9,10]
summary = []
best = (None, -1.0, None)  # (k, score, labels)

for k_ in ks:
    km = KMeans(n_clusters=k_, random_state=42, n_init=10)
    labels = km.fit_predict(Xn)
    sil = silhouette_score(Xn, labels, metric="cosine")
    summary.append({"Representation":"Role means (8-dim)", "Method":"KMeans", "k":k_, "Silhouette":float(sil)})
    if sil > best[1]:
        best = (k_, sil, labels)

with open("task2_clustering_results.csv", "w", newline="", encoding="utf-8") as f:
    fieldnames = ["Representation","Method","k","Silhouette"]
    w = csv.DictWriter(f, fieldnames=fieldnames)
    w.writeheader()
    for r in summary: w.writerow(r)

# Save best-k cluster assignments
best_k, best_sil, best_labels = best
with open(f"task2_clusters_k{best_k}.csv", "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(["service","cluster"])
    for s, lbl in zip(services, best_labels):
        w.writerow([s, int(lbl)])

print("Saved: task2_neighbors.csv")
print("Saved: task2_clustering_results.csv")
print(f"Best k: {best_k}  Silhouette(cosine): {best_sil:.4f}  -> task2_clusters_k{best_k}.csv")


Saved: task2_neighbors.csv
Saved: task2_clustering_results.csv
Best k: 8  Silhouette(cosine): 0.9835  -> task2_clusters_k8.csv


In [12]:
# Build a report-ready Markdown with Task 1 & Task 2 tables
import csv, os, json

task1_csv = "task1_results.csv"
task2_neighbors_csv = "task2_neighbors.csv"
task2_cluster_csv = "task2_clustering_results.csv"
out_md = "tables_for_report.md"

# --- load Task 1 (one row) ---
with open(task1_csv, "r", encoding="utf-8") as f:
    r = list(csv.DictReader(f))
t1 = r[0] if r else {}

# --- load Task 2 neighbors (limit to 10 services for the doc) ---
rows_nbr = []
if os.path.exists(task2_neighbors_csv):
    with open(task2_neighbors_csv, "r", encoding="utf-8") as f:
        rows_nbr = list(csv.DictReader(f))
rows_nbr = rows_nbr[:10]

# --- load Task 2 clustering summary ---
rows_clu = []
if os.path.exists(task2_cluster_csv):
    with open(task2_cluster_csv, "r", encoding="utf-8") as f:
        rows_clu = list(csv.DictReader(f))

md = []

md.append("# Results Tables\n")

# Task 1 table
md.append("## Task 1 — Insight Detection (Naive Bayes Baseline)\n")
md.append("| Feature Set / Variant | Precision | Recall | F1 | PR-AUC | Accuracy (+ve) | Accuracy (mean) | Threshold | Valid size | Pos rate (valid) |")
md.append("|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|")
md.append(f"| {t1.get('Feature Set / Variant','')} | {t1.get('Precision','')} | {t1.get('Recall','')} | {t1.get('F1','')} | {t1.get('PR-AUC','')} | {t1.get('Accuracy (+ve)','')} | {t1.get('Accuracy (mean)','')} | {t1.get('Threshold','')} | {t1.get('Valid size','')} | {t1.get('Pos rate (valid)','')} |")
md.append("")

# Task 2 neighbors table (sample)
md.append("## Task 2 — Service Similarity (Top-5 Neighbors, sample)")
md.append("| service | n1 | s1 | n2 | s2 | n3 | s3 | n4 | s4 | n5 | s5 |")
md.append("|---|---|---:|---|---:|---|---:|---|---:|---|---:|")
for r in rows_nbr:
    md.append(f"| {r['service']} | {r['n1']} | {r['s1']} | {r['n2']} | {r['s2']} | {r['n3']} | {r['s3']} | {r['n4']} | {r['s4']} | {r['n5']} | {r['s5']} |")
md.append("")

# Task 2 clustering summary
md.append("## Task 2 — Clustering Summary (Silhouette, cosine)")
if rows_clu:
    md.append("| Representation | Method | k | Silhouette |")
    md.append("|---|---|---:|---:|")
    for r in rows_clu:
        md.append(f"| {r['Representation']} | {r['Method']} | {r['k']} | {r['Silhouette']} |")
else:
    md.append("_No clustering summary file found._")

with open(out_md, "w", encoding="utf-8") as f:
    f.write("\n".join(md))

print("Saved:", out_md)
# preview first ~40 lines
with open(out_md, "r", encoding="utf-8") as f:
    for i, line in zip(range(40), f):
        print(line.rstrip())


Saved: tables_for_report.md
# Results Tables

## Task 1 — Insight Detection (Naive Bayes Baseline)

| Feature Set / Variant | Precision | Recall | F1 | PR-AUC | Accuracy (+ve) | Accuracy (mean) | Threshold | Valid size | Pos rate (valid) |
|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|
| All features (NB) | 0.9028533849694933 | 0.9451902849493211 | 0.9235196292766784 | 0.9774733165108653 | 0.9451902849493211 | 0.9867125488986641 | 0.9806067807015026 | 308035 | 0.0848767185547097 |

## Task 2 — Service Similarity (Top-5 Neighbors, sample)
| service | n1 | s1 | n2 | s2 | n3 | s3 | n4 | s4 | n5 | s5 |
|---|---|---:|---|---:|---|---:|---|---:|---|---:|
| ABTEST | NOTIFY | 0.9999999999809024 | MKTDB | 0.9999999999349333 | DYNPRC | 0.9999999998786389 | XCHATTR | 0.9999999997867001 | TAXCALC | 0.9999999996770227 |
| AIRFLW | HMFRP | 0.999999998609385 | YELLOWS | 0.9999999953721767 | KAFKA | 0.9999999888122127 | DDSD | 0.9999999793005887 | ELASTIC | 0.9999996950612356 |
| ANALAPI | AUTOML 

In [13]:
# Task 1 — Top-20 incidents (ranked by proba * cost_sum)
import csv
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import average_precision_score, precision_recall_curve

assert 'X' in globals() and 'keys' in globals(), "Run the feature cell first."

# Rebuild weak labels (same as earlier)
er = X[:,1]; cmean = X[:,3]; dmean = X[:,5]
def robust_z(x):
    med = np.median(x); mad = np.median(np.abs(x - med)) + 1e-9
    return np.abs(x - med) / (1.4826 * mad)
z_c = robust_z(cmean); z_d = robust_z(dmean)
y = ((er > 0.10) | (z_c > 3.0) | (z_d > 3.0)).astype(int)

# Time-based split (80/20)
idx = np.arange(len(keys))
idx = idx[np.argsort([k[2] for k in keys])]
cut = int(len(idx)*0.8)
train_idx, valid_idx = idx[:cut], idx[cut:]

Xtr, ytr = X[train_idx], y[train_idx]
Xva, yva = X[valid_idx], y[valid_idx]

model = Pipeline([("scaler", StandardScaler()), ("nb", GaussianNB())]).fit(Xtr, ytr)

proba = model.predict_proba(Xva)[:,1]
prec, rec, thr = precision_recall_curve(yva, proba)
f1s = (2*prec*rec)/(prec+rec+1e-9)
bi = int(np.argmax(f1s))
best_thr = thr[bi-1] if bi>0 and (bi-1)<len(thr) else 0.5
pred = (proba >= best_thr).astype(int)

# Rank by probability * cost_sum (impact proxy)
cost_sum = Xva[:,2]
score = proba * (cost_sum + 1e-9)
order = np.argsort(-score)

top_k = 20
rows = []
for r in order[:top_k]:
    cons, supp, bucket = keys[valid_idx[r]]
    req_count, error_rate, cost_sum, cost_mean, data_sum, data_mean = Xva[r].tolist()
    rows.append({
        "time_bucket": bucket,
        "consumer_id": cons,
        "supplier_id": supp,
        "probability": float(proba[r]),
        "predicted_anomaly": int(pred[r]),
        "weak_label": int(yva[r]),
        "req_count": float(req_count),
        "error_rate": float(error_rate),
        "cost_sum": float(cost_sum),
        "cost_mean": float(cost_mean),
        "data_sum": float(data_sum),
        "data_mean": float(data_mean)
    })

out_path = "task1_top_incidents.csv"
with open(out_path, "w", newline="", encoding="utf-8") as f:
    w = csv.DictWriter(f, fieldnames=list(rows[0].keys()))
    w.writeheader()
    for rr in rows: w.writerow(rr)

print("Saved:", out_path)
print("Top 3 preview:")
for rr in rows[:3]:
    print(rr)


Saved: task1_top_incidents.csv
Top 3 preview:
{'time_bucket': '2025-05-12 02:00:00', 'consumer_id': 'ECOMLP', 'supplier_id': 'MCSCBT', 'probability': 1.0, 'predicted_anomaly': 1, 'weak_label': 1, 'req_count': 6.0, 'error_rate': 0.0, 'cost_sum': 3539.9500000000007, 'cost_mean': 589.9916666666668, 'data_sum': 70.8, 'data_mean': 11.799999999999999}
{'time_bucket': '2025-03-30 01:00:00', 'consumer_id': 'CUSTPRT', 'supplier_id': 'MCSCBT', 'probability': 1.0, 'predicted_anomaly': 1, 'weak_label': 1, 'req_count': 6.0, 'error_rate': 0.0, 'cost_sum': 3465.25, 'cost_mean': 577.5416666666666, 'data_sum': 73.05000000000001, 'data_mean': 12.175000000000002}
{'time_bucket': '2025-03-25 05:00:00', 'consumer_id': 'ECOMLP', 'supplier_id': 'MCSCBT', 'probability': 1.0, 'predicted_anomaly': 1, 'weak_label': 1, 'req_count': 5.0, 'error_rate': 0.0, 'cost_sum': 2921.6, 'cost_mean': 584.3199999999999, 'data_sum': 59.25, 'data_mean': 11.85}


In [14]:
# Build concise LLM-style summaries (template) for Top-20 incidents
import csv, math
import numpy as np

assert 'X' in globals(), "Run the feature cell first."

# Globals from X to compute robust baselines
# Features: [req_count, error_rate, cost_sum, cost_mean, data_sum, data_mean]
cmean_all = X[:,3]; dmean_all = X[:,5]; er_all = X[:,1]

def robust_stats(x):
    med = float(np.median(x))
    mad = float(np.median(np.abs(x - med))) + 1e-9
    return med, mad

c_med, c_mad = robust_stats(cmean_all)
d_med, d_mad = robust_stats(dmean_all)
e_med, e_mad = robust_stats(er_all)

def rz(val, med, mad):
    return (abs(val - med) / (1.4826 * mad)) if mad > 0 else 0.0

def conf_str(p):
    return "high" if p >= 0.9 else ("medium" if p >= 0.7 else "low")

rows = []
with open("task1_top_incidents.csv", "r", encoding="utf-8") as f:
    rows = list(csv.DictReader(f))

lines = ["# Top Incidents — Brief Summaries\n"]
for r in rows:
    tb   = r["time_bucket"]
    cons = r["consumer_id"]
    supp = r["supplier_id"]
    proba = float(r["probability"])
    pred  = int(r["predicted_anomaly"])
    lab   = int(r["weak_label"])
    req   = float(r["req_count"])
    er    = float(r["error_rate"])
    csum  = float(r["cost_sum"])
    cmean = float(r["cost_mean"])
    dsum  = float(r["data_sum"])
    dmean = float(r["data_mean"])

    z_c = rz(cmean, c_med, c_mad)
    z_d = rz(dmean, d_med, d_mad)
    z_e = rz(er,    e_med, e_mad)

    reasons = []
    if er > max(0.10, e_med + 2*1.4826*e_mad): reasons.append(f"error rate spike ({er:.1%})")
    if z_c > 3: reasons.append(f"cost/tx high (z={z_c:.1f})")
    if z_d > 3: reasons.append(f"data/tx high (z={z_d:.1f})")
    if not reasons:
        reasons.append("unusual pattern vs baseline")

    lines.append(f"## {tb} — {cons} → {supp}")
    lines.append(f"- **Anomaly score**: p={proba:.3f} (confidence: {conf_str(proba)}), predicted={pred}, weak_label={lab}")
    lines.append(f"- **Impact**: req={req:.0f}, cost_sum=${csum:,.2f}, cost/tx={cmean:.2f}, data_sum={dsum:.2f}, data/tx={dmean:.2f}")
    lines.append(f"- **Signals**: {', '.join(reasons)}")
    lines.append(f"- **Next checks**: verify recent deployments/config for `{supp}`, inspect capacity/quotas, correlate with upstream `{cons}` traffic and provider billing.")
    lines.append("")

out_md = "task1_incident_summaries.md"
with open(out_md, "w", encoding="utf-8") as f:
    f.write("\n".join(lines))

print("Saved:", out_md)
print("\nPreview:\n")
print("\n".join(lines[:18]))


Saved: task1_incident_summaries.md

Preview:

# Top Incidents — Brief Summaries

## 2025-05-12 02:00:00 — ECOMLP → MCSCBT
- **Anomaly score**: p=1.000 (confidence: high), predicted=1, weak_label=1
- **Impact**: req=6, cost_sum=$3,539.95, cost/tx=589.99, data_sum=70.80, data/tx=11.80
- **Signals**: cost/tx high (z=14.7), data/tx high (z=8.2)
- **Next checks**: verify recent deployments/config for `MCSCBT`, inspect capacity/quotas, correlate with upstream `ECOMLP` traffic and provider billing.

## 2025-03-30 01:00:00 — CUSTPRT → MCSCBT
- **Anomaly score**: p=1.000 (confidence: high), predicted=1, weak_label=1
- **Impact**: req=6, cost_sum=$3,465.25, cost/tx=577.54, data_sum=73.05, data/tx=12.18
- **Signals**: cost/tx high (z=14.4), data/tx high (z=8.4)
- **Next checks**: verify recent deployments/config for `MCSCBT`, inspect capacity/quotas, correlate with upstream `CUSTPRT` traffic and provider billing.

## 2025-03-25 05:00:00 — ECOMLP → MCSCBT
- **Anomaly score**: p=1.000 (confidence: 

In [16]:
# Task 1 variants: Counts-only (NB), All features (NB), PCA(3)+NB)
import csv
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import average_precision_score, precision_recall_curve, f1_score, accuracy_score

assert 'X' in globals() and 'keys' in globals(), "Run the feature cell first."

# --- weak labels (same as before) ---
er = X[:,1]; cmean = X[:,3]; dmean = X[:,5]
def robust_z(x):
    med = np.median(x); mad = np.median(np.abs(x - med)) + 1e-9
    return np.abs(x - med) / (1.4826 * mad)
z_c = robust_z(cmean); z_d = robust_z(dmean)
y = ((er > 0.10) | (z_c > 3.0) | (z_d > 3.0)).astype(int)

# time-based split
idx = np.arange(len(keys))
idx = idx[np.argsort([k[2] for k in keys])]
cut = int(len(idx)*0.8)
train_idx, valid_idx = idx[:cut], idx[cut:]
Xtr_all, ytr = X[train_idx], y[train_idx]
Xva_all, yva = X[valid_idx], y[valid_idx]

def eval_model(model, Xtr, ytr, Xva, yva, label):
    model.fit(Xtr, ytr)
    proba = model.predict_proba(Xva)[:,1]
    pr_auc = average_precision_score(yva, proba) if len(np.unique(yva))>1 else 0.0
    prec, rec, thr = precision_recall_curve(yva, proba)
    f1s = (2*prec*rec)/(prec+rec+1e-9)
    bi = int(np.argmax(f1s))
    best_thr = thr[bi-1] if bi>0 and (bi-1)<len(thr) else 0.5
    yhat = (proba >= best_thr).astype(int)
    acc_pos = accuracy_score(yva[yva==1], yhat[yva==1]) if np.any(yva==1) else 0.0
    acc_mean = accuracy_score(yva, yhat)
    return {
        "Feature Set / Variant": label,
        "Precision": float(prec[bi]),
        "Recall": float(rec[bi]),
        "F1": float(f1_score(yva, yhat)),
        "PR-AUC": float(pr_auc),
        "Accuracy (+ve)": float(acc_pos),
        "Accuracy (mean)": float(acc_mean),
        "Threshold": float(best_thr),
        "Valid size": int(len(yva)),
        "Pos rate (valid)": float(yva.mean())
    }

rows = []

# Variant 1: Counts only (req_count, error_rate) + NB
rows.append(
    eval_model(
        Pipeline([("scaler", StandardScaler()), ("nb", GaussianNB())]),
        Xtr_all[:, [0,1]], ytr,
        Xva_all[:, [0,1]], yva,
        "Counts only (NB)"
    )
)

# Variant 2: All features (NB)
rows.append(
    eval_model(
        Pipeline([("scaler", StandardScaler()), ("nb", GaussianNB())]),
        Xtr_all, ytr, Xva_all, yva,
        "All features (NB)"
    )
)

# Variant 3: PCA(3) + NB
rows.append(
    eval_model(
        Pipeline([("scaler", StandardScaler()), ("pca", PCA(n_components=3, random_state=42)), ("nb", GaussianNB())]),
        Xtr_all, ytr, Xva_all, yva,
        "PCA(3) + NB"
    )
)

# Save multi-row CSV
out_csv = "task1_results_variants.csv"
with open(out_csv, "w", newline="", encoding="utf-8") as f:
    w = csv.DictWriter(f, fieldnames=list(rows[0].keys()))
    w.writeheader()
    for r in rows: w.writerow(r)

print("Saved:", out_csv)
for r in rows: print(r)


Saved: task1_results_variants.csv
{'Feature Set / Variant': 'Counts only (NB)', 'Precision': 0.10568402532383772, 'Recall': 0.8179001721170396, 'F1': 0.1830056388612474, 'PR-AUC': 0.1021308496893949, 'Accuracy (+ve)': 0.9179575444635686, 'Accuracy (mean)': 0.3043420390540036, 'Threshold': 0.3514217132994965, 'Valid size': 308035, 'Pos rate (valid)': 0.0848767185547097}
{'Feature Set / Variant': 'All features (NB)', 'Precision': 0.9028533849694933, 'Recall': 0.9451902849493211, 'F1': 0.9235196292766784, 'PR-AUC': 0.9774733165108653, 'Accuracy (+ve)': 0.9451902849493211, 'Accuracy (mean)': 0.9867125488986641, 'Threshold': 0.9806067807015026, 'Valid size': 308035, 'Pos rate (valid)': 0.0848767185547097}
{'Feature Set / Variant': 'PCA(3) + NB', 'Precision': 0.6810904129423886, 'Recall': 0.9355517307324537, 'F1': 0.7882821186290465, 'PR-AUC': 0.6950759672884285, 'Accuracy (+ve)': 0.9355517307324537, 'Accuracy (mean)': 0.9573457561640722, 'Threshold': 0.154903618666408, 'Valid size': 308035,

In [17]:
# Build FINAL report tables (includes Task 1 variants + Task 2)
import csv, os

t1_var_csv = "task1_results_variants.csv"
t2_nbr_csv = "task2_neighbors.csv"
t2_clu_csv = "task2_clustering_results.csv"
out_md = "tables_for_report_full.md"

# --- load Task 1 variants ---
with open(t1_var_csv, "r", encoding="utf-8") as f:
    t1_rows = list(csv.DictReader(f))

# --- load Task 2 neighbors (sample 10) ---
rows_nbr = []
if os.path.exists(t2_nbr_csv):
    with open(t2_nbr_csv, "r", encoding="utf-8") as f:
        rows_nbr = list(csv.DictReader(f))[:10]

# --- load Task 2 clustering summary ---
rows_clu = []
if os.path.exists(t2_clu_csv):
    with open(t2_clu_csv, "r", encoding="utf-8") as f:
        rows_clu = list(csv.DictReader(f))

lines = []
lines.append("# Results Tables (Final)\n")

# ---- Task 1 variants table ----
hdr = ["Feature Set / Variant","Precision","Recall","F1","PR-AUC","Accuracy (+ve)","Accuracy (mean)","Threshold","Valid size","Pos rate (valid)"]
lines.append("## Task 1 — Insight Detection (Naive Bayes Variants)\n")
lines.append("| " + " | ".join(hdr) + " |")
lines.append("|" + "|".join(["---"] + [":---:" for _ in hdr[1:]]) + "|")
for r in t1_rows:
    lines.append("| " + " | ".join(str(r[h]) for h in hdr) + " |")
lines.append("")

# ---- Task 2 neighbors (sample) ----
lines.append("## Task 2 — Service Similarity (Top-5 Neighbors, sample)")
nbr_hdr = ["service","n1","s1","n2","s2","n3","s3","n4","s4","n5","s5"]
lines.append("| " + " | ".join(nbr_hdr) + " |")
lines.append("|" + "|".join(["---","---",":---:","---",":---:","---",":---:","---",":---:","---",":---:"]) + "|")
for r in rows_nbr:
    lines.append("| " + " | ".join(r[h] for h in nbr_hdr) + " |")
lines.append("")

# ---- Task 2 clustering summary ----
lines.append("## Task 2 — Clustering Summary (Silhouette, cosine)")
if rows_clu:
    lines.append("| Representation | Method | k | Silhouette |")
    lines.append("|---|---|---:|---:|")
    for r in rows_clu:
        lines.append(f"| {r['Representation']} | {r['Method']} | {r['k']} | {r['Silhouette']} |")
else:
    lines.append("_No clustering summary file found._")

with open(out_md, "w", encoding="utf-8") as f:
    f.write("\n".join(lines))

print("Saved:", out_md)


Saved: tables_for_report_full.md


In [18]:
# Inspect daily_metrics.jsonl (schema + first rows)
import json
from itertools import islice

def inspect_jsonl(path: str, n: int = 5):
    keys_union, samples = set(), []
    with open(path, "r", encoding="utf-8") as f:
        for line in islice(f, n):
            if not line.strip(): 
                continue
            obj = json.loads(line)
            samples.append(obj)
            keys_union.update(obj.keys())
    keys = sorted(keys_union)
    print("=== KEYS ===")
    print(keys)
    print(f"\n=== FIRST {len(samples)} ROWS ===")
    for i, obj in enumerate(samples, 1):
        print(f"\n--- Row {i} ---")
        print({k: obj.get(k) for k in keys})

inspect_jsonl("daily_metrics.jsonl", n=5)


=== KEYS ===
['app/id', 'app/name', 'daily/label', 'daily/metric', 'daily/time', 'daily/value', 'tenant/id']

=== FIRST 5 ROWS ===

--- Row 1 ---
{'app/id': 'SELENE', 'app/name': 'Selene Customer Warehouse', 'daily/label': 'Cost', 'daily/metric': 'cost', 'daily/time': '2024-06-01T00:00:00', 'daily/value': 158758.2500000005, 'tenant/id': 'DEMO'}

--- Row 2 ---
{'app/id': 'SELENE', 'app/name': 'Selene Customer Warehouse', 'daily/label': 'Value', 'daily/metric': 'value', 'daily/time': '2024-06-01T00:00:00', 'daily/value': 4646.0999999999985, 'tenant/id': 'DEMO'}

--- Row 3 ---
{'app/id': 'SELENE', 'app/name': 'Selene Customer Warehouse', 'daily/label': 'Data Used', 'daily/metric': 'data_used', 'daily/time': '2024-06-01T00:00:00', 'daily/value': 4646.100000000008, 'tenant/id': 'DEMO'}

--- Row 4 ---
{'app/id': 'SELENE', 'app/name': 'Selene Customer Warehouse', 'daily/label': 'Data Sent', 'daily/metric': 'data_sent', 'daily/time': '2024-06-01T00:00:00', 'daily/value': 4646.0999999999985, 't

In [20]:
# Pivot daily_metrics.jsonl -> daily_features.csv (per-app per-day)
import json, csv
from collections import defaultdict
from datetime import datetime

path = "daily_metrics.jsonl"

def to_day(s: str):
    for fmt in ("%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S"):
        try:
            return datetime.strptime(s, fmt).strftime("%Y-%m-%d")
        except Exception:
            pass
    return None

# Accumulate values per (app_id, day, metric)
agg = defaultdict(lambda: defaultdict(float))
apps = set()
metrics_seen = set()
rows_read = 0

with open(path, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue
        obj = json.loads(line)
        app = obj.get("app/id")
        day = to_day(obj.get("daily/time", ""))
        metric = obj.get("daily/metric")
        val = obj.get("daily/value", 0.0) or 0.0
        if not app or not day or not metric:
            continue
        agg[(app, day)][metric] += float(val)
        apps.add(app)
        metrics_seen.add(metric)
        rows_read += 1

metrics_base = ["cost", "value", "data_used", "data_sent", "requests_made"]
ordered_metrics = [m for m in metrics_base if m in metrics_seen] + \
                  sorted([m for m in metrics_seen if m not in metrics_base])

# Build rows + simple derived features
out_rows = []
for (app, day), mvals in agg.items():
    row = {"app_id": app, "day": day}
    for m in ordered_metrics:
        row[m] = mvals.get(m, 0.0)

    req  = row.get("requests_made", 0.0)
    cost = row.get("cost", 0.0)
    val  = row.get("value", 0.0)
    data_u = row.get("data_used", 0.0)

    row["cost_per_request"] = (cost / max(req, 1.0)) if "cost" in ordered_metrics else 0.0
    row["value_per_cost"]   = (val / max(cost, 1e-9)) if ("value" in ordered_metrics and "cost" in ordered_metrics) else 0.0
    row["data_per_request"] = (data_u / max(req, 1.0)) if "data_used" in ordered_metrics else 0.0

    out_rows.append(row)

fieldnames = ["app_id", "day"] + ordered_metrics + ["cost_per_request", "value_per_cost", "data_per_request"]
csv_path = "daily_features.csv"

with open(csv_path, "w", newline="", encoding="utf-8") as f:
    w = csv.DictWriter(f, fieldnames=fieldnames)
    w.writeheader()
    for r in out_rows:
        w.writerow(r)

print("Rows read:", rows_read)
print("Apps:", len(apps))
print("Days (unique):", len({d for _, d in agg.keys()}))
print("Metrics included:", ordered_metrics)
print("Saved:", csv_path)

print("\nPreview:")
for r in out_rows[:5]:
    print({k: r.get(k) for k in fieldnames})


Rows read: 444570
Apps: 87
Days (unique): 365
Metrics included: ['cost', 'value', 'data_used', 'data_sent', 'requests_made', 'cost_per_request_made', 'cost_per_request_received', 'data_per_request', 'data_sent_per_received', 'data_used_per_received', 'requests_per_business_hour', 'requests_per_hour', 'requests_received', 'value_per_cost']
Saved: daily_features.csv

Preview:
{'app_id': 'SELENE', 'day': '2024-06-01', 'cost': 158758.2500000005, 'value': 4646.0999999999985, 'data_used': 4646.100000000008, 'data_sent': 4646.0999999999985, 'requests_made': 5490.0, 'cost_per_request_made': 28.917714025501002, 'cost_per_request_received': 86.753142076503, 'data_per_request': 0.8462841530054659, 'data_sent_per_received': 2.538852459016393, 'data_used_per_received': 2.5388524590163977, 'requests_per_business_hour': 296.8, 'requests_per_hour': 228.75, 'requests_received': 1830.0, 'value_per_cost': 0.02926525078224271, 'cost_per_request': 28.917714025501002}
{'app_id': 'AWS', 'day': '2024-06-01', 

In [21]:
# Inspect monthly_metrics.jsonl (schema + first rows)
import json
from itertools import islice

def inspect_jsonl(path: str, n: int = 5):
    keys_union, samples = set(), []
    with open(path, "r", encoding="utf-8") as f:
        for line in islice(f, n):
            if not line.strip(): 
                continue
            obj = json.loads(line)
            samples.append(obj)
            keys_union.update(obj.keys())
    keys = sorted(keys_union)
    print("=== KEYS ===")
    print(keys)
    print(f"\n=== FIRST {len(samples)} ROWS ===")
    for i, obj in enumerate(samples, 1):
        print(f"\n--- Row {i} ---")
        print({k: obj.get(k) for k in keys})

inspect_jsonl("monthly_metrics.jsonl", n=5)


=== KEYS ===
['app/id', 'app/name', 'monthly/created', 'monthly/label', 'monthly/metric', 'monthly/value', 'tenant/id']

=== FIRST 5 ROWS ===

--- Row 1 ---
{'app/id': 'SELENE', 'app/name': 'Selene Customer Warehouse', 'monthly/created': '2024-08-01T00:00:00', 'monthly/label': 'Cost', 'monthly/metric': 'cost', 'monthly/value': 1219070.7999999553, 'tenant/id': 'DEMO'}

--- Row 2 ---
{'app/id': 'SELENE', 'app/name': 'Selene Customer Warehouse', 'monthly/created': '2024-08-01T00:00:00', 'monthly/label': 'Value', 'monthly/metric': 'value', 'monthly/value': 35701.04999999981, 'tenant/id': 'DEMO'}

--- Row 3 ---
{'app/id': 'SELENE', 'app/name': 'Selene Customer Warehouse', 'monthly/created': '2024-08-01T00:00:00', 'monthly/label': 'Data Used', 'monthly/metric': 'data_used', 'monthly/value': 35701.049999999675, 'tenant/id': 'DEMO'}

--- Row 4 ---
{'app/id': 'SELENE', 'app/name': 'Selene Customer Warehouse', 'monthly/created': '2024-08-01T00:00:00', 'monthly/label': 'Data Sent', 'monthly/metri

In [22]:
# Pivot monthly_metrics.jsonl -> monthly_features.csv (per-app per-month)
import json, csv
from collections import defaultdict
from datetime import datetime

path = "monthly_metrics.jsonl"

def to_month(s: str):
    for fmt in ("%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S"):
        try:
            return datetime.strptime(s, fmt).strftime("%Y-%m")
        except Exception:
            pass
    return None

# Accumulate values per (app_id, month, metric)
agg = defaultdict(lambda: defaultdict(float))
apps = set()
metrics_seen = set()
rows_read = 0

with open(path, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue
        obj = json.loads(line)
        app = obj.get("app/id")
        month = to_month(obj.get("monthly/created", ""))  # timestamp field
        metric = obj.get("monthly/metric")
        val = obj.get("monthly/value", 0.0) or 0.0
        if not app or not month or not metric:
            continue
        agg[(app, month)][metric] += float(val)
        apps.add(app)
        metrics_seen.add(metric)
        rows_read += 1

metrics_base = ["cost","value","data_used","data_sent","requests_made","requests_received"]
ordered_metrics = [m for m in metrics_base if m in metrics_seen] + \
                  sorted([m for m in metrics_seen if m not in metrics_base])

# Build rows + simple derived features
out_rows = []
for (app, month), mvals in agg.items():
    row = {"app_id": app, "month": month}
    for m in ordered_metrics:
        row[m] = mvals.get(m, 0.0)

    req  = row.get("requests_made", 0.0)
    cost = row.get("cost", 0.0)
    val  = row.get("value", 0.0)
    data_u = row.get("data_used", 0.0)

    row["cost_per_request"] = (cost / max(req, 1.0)) if "cost" in ordered_metrics else 0.0
    row["value_per_cost"]   = (val / max(cost, 1e-9)) if ("value" in ordered_metrics and "cost" in ordered_metrics) else 0.0
    row["data_per_request"] = (data_u / max(req, 1.0)) if "data_used" in ordered_metrics else 0.0

    out_rows.append(row)

fieldnames = ["app_id","month"] + ordered_metrics + ["cost_per_request","value_per_cost","data_per_request"]
csv_path = "monthly_features.csv"

with open(csv_path, "w", newline="", encoding="utf-8") as f:
    w = csv.DictWriter(f, fieldnames=fieldnames)
    w.writeheader()
    for r in out_rows:
        w.writerow(r)

print("Rows read:", rows_read)
print("Apps:", len(apps))
print("Months (unique):", len({m for _, m in agg.keys()}))
print("Metrics included:", ordered_metrics)
print("Saved:", csv_path)

print("\nPreview:")
for r in out_rows[:5]:
    print({k: r.get(k) for k in fieldnames})


Rows read: 18270
Apps: 87
Months (unique): 10
Metrics included: ['cost', 'value', 'data_used', 'data_sent', 'requests_made', 'requests_received', 'cost_per_request_made', 'cost_per_request_received', 'data_per_request', 'data_sent_per_received', 'data_used_per_received', 'outage_cost', 'outage_count', 'outage_duration', 'outage_efficiency', 'outage_frequency', 'outage_impact', 'outage_severity', 'rate_of_return', 'requests_per_business_hour', 'requests_per_hour']
Saved: monthly_features.csv

Preview:
{'app_id': 'SELENE', 'month': '2024-08', 'cost': 1219070.7999999553, 'value': 35701.04999999981, 'data_used': 35701.049999999675, 'data_sent': 35701.04999999981, 'requests_made': 41964.0, 'requests_received': 13988.0, 'cost_per_request_made': 29.050395577160312, 'cost_per_request_received': 87.15118673148093, 'data_per_request': 0.8507542179010503, 'data_sent_per_received': 2.5522626537031603, 'data_used_per_received': 2.552262653703151, 'outage_cost': 0.0, 'outage_count': 2.0, 'outage_dur

In [None]:
# Build a ranked WATCHLIST of apps (combine Task 1 anomalies + monthly trends)
import csv
import numpy as np
from collections import defaultdict
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_curve, average_precision_score

assert 'X' in globals() and 'keys' in globals(), "Run the transactions feature cell first."

# ---------- Re-train Task 1 model on same split ----------
er = X[:,1]; cmean = X[:,3]; dmean = X[:,5]
def robust_z(x):
    med = np.median(x); mad = np.median(np.abs(x - med)) + 1e-9
    return np.abs(x - med) / (1.4826 * mad)
y = ((er > 0.10) | (robust_z(cmean) > 3.0) | (robust_z(dmean) > 3.0)).astype(int)

idx = np.arange(len(keys))
idx = idx[np.argsort([k[2] for k in keys])]
cut = int(len(idx)*0.8)
train_idx, valid_idx = idx[:cut], idx[cut:]
Xtr, ytr = X[train_idx], y[train_idx]
Xva, yva = X[valid_idx], y[valid_idx]

model = Pipeline([("scaler", StandardScaler()), ("nb", GaussianNB())]).fit(Xtr, ytr)
proba = model.predict_proba(Xva)[:,1]
prec, rec, thr = precision_recall_curve(yva, proba)
f1s = (2*prec*rec)/(prec+rec+1e-9)
bi = int(np.argmax(f1s))
best_thr = thr[bi-1] if bi>0 and (bi-1) < len(thr) else 0.5
pred = (proba >= best_thr).astype(int)

# ---------- Aggregate anomalies per app (consumer & supplier roles) ----------
cons_stats = defaultdict(lambda: {"count":0, "score":0.0})
supp_stats = defaultdict(lambda: {"count":0, "score":0.0})

for va_idx, glob_idx in enumerate(valid_idx):
    cons, supp, _bucket = keys[glob_idx]
    req_count, error_rate, cost_sum, cost_mean, data_sum, data_mean = Xva[va_idx]
    p = proba[va_idx]; yhat = pred[va_idx]
    impact = p * (cost_sum + 1e-9)
    if yhat == 1:
        cons_stats[cons]["count"] += 1
        supp_stats[supp]["count"] += 1
    cons_stats[cons]["score"] += impact
    supp_stats[supp]["score"] += impact

# ---------- Monthly trends (latest + MoM change) ----------
monthly_path = "monthly_features.csv"
monthly = defaultdict(dict)  # monthly[app][month] = metrics dict
try:
    with open(monthly_path, "r", encoding="utf-8") as f:
        r = csv.DictReader(f)
        for row in r:
            app = row["app_id"]; month = row["month"]
            monthly[app][month] = row
except FileNotFoundError:
    pass  # if monthly not present, we will just skip trend columns

def latest_mom(app):
    if app not in monthly or not monthly[app]:
        return ("", "", "", "", "")
    months = sorted(monthly[app].keys())  # 'YYYY-MM' sorts lexicographically
    latest = months[-1]
    prev = months[-2] if len(months) >= 2 else None
    r_latest = monthly[app][latest]
    def fget(k):
        try: return float(r_latest.get(k, "0") or 0.0)
        except: return 0.0
    cost_latest = fget("cost")
    vpc_latest = fget("value_per_cost")
    oc_latest  = float(r_latest.get("outage_count", 0.0) or 0.0)
    od_latest  = float(r_latest.get("outage_duration", 0.0) or 0.0)
    if prev:
        try:
            cost_prev = float((monthly[app][prev].get("cost", "0") or 0.0))
            mom = (cost_latest - cost_prev) / (cost_prev + 1e-9)
        except:
            mom = ""
    else:
        mom = ""
    return (latest, cost_latest, mom, vpc_latest, (oc_latest, od_latest))

# ---------- Build watchlist rows ----------
apps = set(list(cons_stats.keys()) + list(supp_stats.keys()) + list(monthly.keys()))
rows = []
for app in apps:
    c = cons_stats.get(app, {"count":0,"score":0.0})
    s = supp_stats.get(app, {"count":0,"score":0.0})
    latest, cost_latest, mom, vpc, outages = latest_mom(app)
    oc, od = (outages if outages else (0.0, 0.0))
    rows.append({
        "app_id": app,
        "anomalies_consumer": int(c["count"]),
        "score_consumer": float(c["score"]),
        "anomalies_supplier": int(s["count"]),
        "score_supplier": float(s["score"]),
        "anomalies_total": int(c["count"] + s["count"]),
        "score_total": float(c["score"] + s["score"]),
        "latest_month": latest,
        "cost_latest": float(cost_latest) if cost_latest != "" else "",
        "cost_mom_pct": float(mom) if mom != "" else "",
        "value_per_cost_latest": float(vpc) if vpc != "" else "",
        "outage_count_latest": float(oc),
        "outage_duration_latest": float(od)
    })

# Rank by score_total desc
rows.sort(key=lambda r: r["score_total"], reverse=True)

# Save CSV + preview
out_csv = "watchlist_apps.csv"
with open(out_csv, "w", newline="", encoding="utf-8") as f:
    fn = list(rows[0].keys()) if rows else []
    w = csv.DictWriter(f, fieldnames=fn)
    w.writeheader()
    for r in rows: w.writerow(r)

print("Saved:", out_csv)
print("Top 10 preview:")
for r in rows[:10]:
    print(r)


Saved: watchlist_apps.csv
Top 10 preview:
{'app_id': 'MCSCBT', 'anomalies_consumer': 3010, 'score_consumer': 2247204.864172169, 'anomalies_supplier': 3069, 'score_supplier': 2627282.000003058, 'anomalies_total': 6079, 'score_total': 4874486.864175227, 'latest_month': '2025-05', 'cost_latest': 1522083.049999968, 'cost_mom_pct': 0.07695582850548877, 'value_per_cost_latest': 0.015829359639738805, 'outage_count_latest': 83.0, 'outage_duration_latest': 13955.0}
{'app_id': 'CLVMDL', 'anomalies_consumer': 0, 'score_consumer': 4555.476827580488, 'anomalies_supplier': 6695, 'score_supplier': 2512970.4724194533, 'anomalies_total': 6695, 'score_total': 2517525.949247034, 'latest_month': '2025-05', 'cost_latest': 1249448.799999955, 'cost_mom_pct': 0.06576045593726959, 'value_per_cost_latest': 0.01809545937376611, 'outage_count_latest': 287.0, 'outage_duration_latest': 85770.0}
{'app_id': 'CSTSEG', 'anomalies_consumer': 0, 'score_consumer': 10762.728691977702, 'anomalies_supplier': 3332, 'score_sup

In [26]:
# Build a concise, slide-ready report: report_pack.md
import csv, os

t1_var_csv   = "task1_results_variants.csv"
t2_nbr_csv   = "task2_neighbors.csv"
t2_clu_csv   = "task2_clustering_results.csv"
inc_md       = "task1_incident_summaries.md"
watch_csv    = "watchlist_apps.csv"

out_md = "report_pack.md"
lines = []

lines += [
    "# Trufflow 1B — Data Product Insight Recommendation",
    "",
    "## Summary",
    "- **Task 1 (Insight Detection, NB)**: strong PR-AUC and F1 on weak labels; threshold tuned by F1.",
    "- **Task 2 (Service Similarity)**: cosine NN and KMeans; high silhouette with compact role vectors.",
    "- **Watchlist**: ranked by anomaly impact × cost; includes latest month KPIs.",
    "",
    "Artifacts: `task1_results_variants.csv`, `task1_top_incidents.csv`, `task2_neighbors.csv`, `task2_clustering_results.csv`, `watchlist_apps.csv`.",
    ""
]

# Task 1 table
if os.path.exists(t1_var_csv):
    with open(t1_var_csv, "r", encoding="utf-8") as f:
        rows = list(csv.DictReader(f))
    hdr = ["Feature Set / Variant","Precision","Recall","F1","PR-AUC","Accuracy (+ve)","Accuracy (mean)","Threshold","Valid size","Pos rate (valid)"]
    lines += ["## Task 1 — Insight Detection (Naive Bayes Variants)", ""]
    lines += ["| " + " | ".join(hdr) + " |"]
    lines += ["|" + "|".join(["---"] + [":---:" for _ in hdr[1:]]) + "|"]
    for r in rows:
        lines += ["| " + " | ".join(str(r[h]) for h in hdr) + " |"]
    lines += [""]

# Task 2 neighbors (sample)
if os.path.exists(t2_nbr_csv):
    with open(t2_nbr_csv, "r", encoding="utf-8") as f:
        nbr_rows = list(csv.DictReader(f))[:10]
    lines += ["## Task 2 — Service Similarity (Top-5 Neighbors, sample)", ""]
    hdr = ["service","n1","s1","n2","s2","n3","s3","n4","s4","n5","s5"]
    lines += ["| " + " | ".join(hdr) + " |"]
    lines += ["|---|---|---:|---|---:|---|---:|---|---:|---|---:|"]
    for r in nbr_rows:
        lines += ["| " + " | ".join(r[h] for h in hdr) + " |"]
    lines += [""]

# Task 2 clustering summary
if os.path.exists(t2_clu_csv):
    with open(t2_clu_csv, "r", encoding="utf-8") as f:
        clu_rows = list(csv.DictReader(f))
    lines += ["## Task 2 — Clustering Summary (Silhouette, cosine)", ""]
    lines += ["| Representation | Method | k | Silhouette |"]
    lines += ["|---|---|---:|---:|"]
    for r in clu_rows:
        lines += [f"| {r['Representation']} | {r['Method']} | {r['k']} | {r['Silhouette']} |"]
    lines += [""]

# Watchlist Top-20
if os.path.exists(watch_csv):
    with open(watch_csv, "r", encoding="utf-8") as f:
        wrows = list(csv.DictReader(f))[:20]
    lines += ["## Watchlist — Top 20 Apps (by anomaly impact × cost)", ""]
    hdr = ["app_id","anomalies_total","score_total","latest_month","cost_latest","cost_mom_pct","value_per_cost_latest","outage_count_latest","outage_duration_latest"]
    lines += ["| " + " | ".join(hdr) + " |"]
    lines += ["|---|---:|---:|---|---:|---:|---:|---:|---:|"]
    for r in wrows:
        lines += ["| " + " | ".join(str(r.get(h, "")) for h in hdr) + " |"]
    lines += [""]

# Top incident summaries (sample)
if os.path.exists(inc_md):
    lines += ["## Top Incidents — Brief Summaries (sample)", ""]
    with open(inc_md, "r", encoding="utf-8") as f:
        for i, line in zip(range(40), f):  # ~first 40 lines
            lines.append(line.rstrip())
    lines += ["", f"_See full: `{inc_md}`_"]

with open(out_md, "w", encoding="utf-8") as f:
    f.write("\n".join(lines))

print("Saved:", out_md)
with open(out_md, "r", encoding="utf-8") as f:
    for i, line in zip(range(30), f):
        print(line.rstrip())


Saved: report_pack.md
# Trufflow 1B — Data Product Insight Recommendation

## Summary
- **Task 1 (Insight Detection, NB)**: strong PR-AUC and F1 on weak labels; threshold tuned by F1.
- **Task 2 (Service Similarity)**: cosine NN and KMeans; high silhouette with compact role vectors.
- **Watchlist**: ranked by anomaly impact × cost; includes latest month KPIs.

Artifacts: `task1_results_variants.csv`, `task1_top_incidents.csv`, `task2_neighbors.csv`, `task2_clustering_results.csv`, `watchlist_apps.csv`.

## Task 1 — Insight Detection (Naive Bayes Variants)

| Feature Set / Variant | Precision | Recall | F1 | PR-AUC | Accuracy (+ve) | Accuracy (mean) | Threshold | Valid size | Pos rate (valid) |
|---|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
| Counts only (NB) | 0.10568402532383772 | 0.8179001721170396 | 0.1830056388612474 | 0.1021308496893949 | 0.9179575444635686 | 0.3043420390540036 | 0.3514217132994965 | 308035 | 0.0848767185547097 |
| All features (NB) | 0.90285338496949

In [27]:
# PR curve for Task 1 (uses X, keys from earlier)
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_curve, average_precision_score, f1_score

assert 'X' in globals() and 'keys' in globals(), "Run the feature cell first."

# Rebuild weak labels
er = X[:,1]; cmean = X[:,3]; dmean = X[:,5]
def robust_z(x):
    med = np.median(x); mad = np.median(np.abs(x - med)) + 1e-9
    return np.abs(x - med) / (1.4826 * mad)
y = ((er > 0.10) | (robust_z(cmean) > 3.0) | (robust_z(dmean) > 3.0)).astype(int)

# Time-based split
idx = np.arange(len(keys))
idx = idx[np.argsort([k[2] for k in keys])]
cut = int(len(idx)*0.8)
train_idx, valid_idx = idx[:cut], idx[cut:]
Xtr, ytr = X[train_idx], y[train_idx]
Xva, yva = X[valid_idx], y[valid_idx]

# Train NB
model = Pipeline([("scaler", StandardScaler()), ("nb", GaussianNB())]).fit(Xtr, ytr)

# PR data
proba = model.predict_proba(Xva)[:,1]
prec, rec, thr = precision_recall_curve(yva, proba)
f1s = (2*prec*rec)/(prec+rec+1e-9)
bi = int(np.argmax(f1s))
best_thr = thr[bi-1] if bi>0 and (bi-1)<len(thr) else 0.5
best_f1 = float(f1s[bi])
ap = float(average_precision_score(yva, proba))

# Try to plot; if matplotlib unavailable, save CSV instead
plotted = False
try:
    import matplotlib.pyplot as plt
    plt.figure()
    plt.plot(rec, prec, label=f"PR curve (AP={ap:.3f})")
    # Mark best F1 point
    plt.scatter([rec[bi]], [prec[bi]], s=40)
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title("Task 1 — Precision–Recall")
    plt.legend()
    plt.tight_layout()
    plt.savefig("task1_pr_curve.png", dpi=150)
    plt.show()
    print("Saved figure: task1_pr_curve.png")
    plotted = True
except Exception as e:
    pass

if not plotted:
    import csv
    with open("task1_pr_curve.csv", "w", newline="", encoding="utf-8") as f:
        w = csv.writer(f); w.writerow(["recall","precision"])
        for r_, p_ in zip(rec, prec): w.writerow([float(r_), float(p_)])
    print("matplotlib unavailable — saved curve points to task1_pr_curve.csv")

print({
    "avg_precision": ap,
    "best_F1": best_f1,
    "best_point": {"precision": float(prec[bi]), "recall": float(rec[bi])},
    "best_threshold": float(best_thr)
})


matplotlib unavailable — saved curve points to task1_pr_curve.csv
{'avg_precision': 0.9774733165108653, 'best_F1': 0.9235368856651477, 'best_point': {'precision': 0.9028533849694933, 'recall': 0.9451902849493211}, 'best_threshold': 0.9806067807015026}


In [28]:
# Confusion matrix + classification report at best-F1 threshold (Task 1)
import numpy as np, csv, json
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (
    precision_recall_curve, average_precision_score,
    confusion_matrix, classification_report,
    accuracy_score, precision_score, recall_score, f1_score
)

assert 'X' in globals() and 'keys' in globals(), "Run the feature cell first."

# Rebuild weak labels (same rule used before)
er = X[:,1]; cmean = X[:,3]; dmean = X[:,5]
def robust_z(x):
    med = np.median(x); mad = np.median(np.abs(x - med)) + 1e-9
    return np.abs(x - med) / (1.4826 * mad)
y = ((er > 0.10) | (robust_z(cmean) > 3.0) | (robust_z(dmean) > 3.0)).astype(int)

# Time-based split
idx = np.arange(len(keys))
idx = idx[np.argsort([k[2] for k in keys])]
cut = int(len(idx)*0.8)
train_idx, valid_idx = idx[:cut], idx[cut:]
Xtr, ytr = X[train_idx], y[train_idx]
Xva, yva = X[valid_idx], y[valid_idx]

# Train & score
model = Pipeline([("scaler", StandardScaler()), ("nb", GaussianNB())]).fit(Xtr, ytr)
proba = model.predict_proba(Xva)[:,1]

# Best-F1 threshold (same as PR cell)
prec, rec, thr = precision_recall_curve(yva, proba)
f1s = (2*prec*rec)/(prec+rec+1e-9)
bi = int(np.argmax(f1s))
best_thr = thr[bi-1] if bi>0 and (bi-1)<len(thr) else 0.5

# Predictions at chosen threshold
yhat = (proba >= best_thr).astype(int)

# Confusion matrix + metrics
cm = confusion_matrix(yva, yhat, labels=[0,1])
TN, FP, FN, TP = int(cm[0,0]), int(cm[0,1]), int(cm[1,0]), int(cm[1,1])
metrics = {
    "threshold": float(best_thr),
    "avg_precision": float(average_precision_score(yva, proba)),
    "accuracy": float(accuracy_score(yva, yhat)),
    "precision": float(precision_score(yva, yhat, zero_division=0)),
    "recall": float(recall_score(yva, yhat, zero_division=0)),
    "f1": float(f1_score(yva, yhat, zero_division=0)),
    "pos_rate_valid": float(yva.mean()),
    "pred_pos_rate": float(yhat.mean()),
    "TN": TN, "FP": FP, "FN": FN, "TP": TP
}

print("CONFUSION (labels=[0,1]):")
print(cm)
print("\nMETRICS:")
print(metrics)

print("\nCLASSIFICATION REPORT:")
print(classification_report(yva, yhat, digits=4))

# Save matrix + summary
with open("task1_confusion_matrix.csv", "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f); w.writerow(["", "Pred_0", "Pred_1"])
    w.writerow(["Actual_0", TN, FP])
    w.writerow(["Actual_1", FN, TP])

with open("task1_confusion_summary.json", "w", encoding="utf-8") as f:
    json.dump(metrics, f, indent=2)

print("\nSaved: task1_confusion_matrix.csv, task1_confusion_summary.json")


CONFUSION (labels=[0,1]):
[[279230   2660]
 [  1433  24712]]

METRICS:
{'threshold': 0.9806067807015026, 'avg_precision': 0.9774733165108653, 'accuracy': 0.9867125488986641, 'precision': 0.9028204004091772, 'recall': 0.9451902849493211, 'f1': 0.9235196292766784, 'pos_rate_valid': 0.0848767185547097, 'pred_pos_rate': 0.08886003213920496, 'TN': 279230, 'FP': 2660, 'FN': 1433, 'TP': 24712}

CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           0     0.9949    0.9906    0.9927    281890
           1     0.9028    0.9452    0.9235     26145

    accuracy                         0.9867    308035
   macro avg     0.9489    0.9679    0.9581    308035
weighted avg     0.9871    0.9867    0.9869    308035


Saved: task1_confusion_matrix.csv, task1_confusion_summary.json


In [30]:
# Milestone1_key_tables.md 
import os, csv, json

out_md = "milestone1_key_tables.md"

def fmt(x, nd=4):
    try:
        f = float(x)
        if abs(f) >= 1000:
            return f"{f:,.{nd}f}".rstrip("0").rstrip(".")
        return f"{f:.{nd}f}".rstrip("0").rstrip(".")
    except:
        return str(x)

# ---------- Transactions counts ----------
tx_rows = "N/A"
if os.path.exists("transactions.jsonl"):
    try:
        with open("transactions.jsonl", "r", encoding="utf-8") as f:
            tx_rows = sum(1 for _ in f)
    except Exception:
        pass

# From memory if available
agg_windows = None
feat_shape = None
try:
    agg_windows = len(keys)
    feat_shape = f"({X.shape[0]:,}, {X.shape[1]})"
except:
    pass

# ---------- Services count from neighbors ----------
services_count = "N/A"
if os.path.exists("task2_neighbors.csv"):
    with open("task2_neighbors.csv", "r", encoding="utf-8") as f:
        services_count = f"{sum(1 for _ in csv.DictReader(f)):,}"

# ---------- Daily & monthly counts ----------
def count_lines(path):
    try:
        with open(path, "r", encoding="utf-8") as f:
            return sum(1 for _ in f)
    except:
        return None

daily_rows = count_lines("daily_metrics.jsonl")
monthly_rows = count_lines("monthly_metrics.jsonl")

days_unique = "N/A"
if os.path.exists("daily_features.csv"):
    seen = set()
    with open("daily_features.csv", "r", encoding="utf-8") as f:
        for r in csv.DictReader(f): seen.add(r["day"])
    days_unique = f"{len(seen):,}"

months_unique = "N/A"
if os.path.exists("monthly_features.csv"):
    seen = set()
    with open("monthly_features.csv", "r", encoding="utf-8") as f:
        for r in csv.DictReader(f): seen.add(r["month"])
    months_unique = f"{len(seen):,}"

# ---------- Task 1 variants ----------
t1_rows = []
if os.path.exists("task1_results_variants.csv"):
    with open("task1_results_variants.csv", "r", encoding="utf-8") as f:
        t1_rows = list(csv.DictReader(f))

# ---------- Task 1 confusion summary ----------
conf = {}
if os.path.exists("task1_confusion_summary.json"):
    with open("task1_confusion_summary.json", "r", encoding="utf-8") as f:
        conf = json.load(f)

# ---------- Task 1 top incidents (from transactions)
top_inc = []
if os.path.exists("task1_top_incidents.csv"):
    with open("task1_top_incidents.csv", "r", encoding="utf-8") as f:
        top_inc = list(csv.DictReader(f))[:10]

# ---------- Task 2 clustering sweep & neighbors ----------
clu_rows = []
best_k, best_sil = None, None
if os.path.exists("task2_clustering_results.csv"):
    with open("task2_clustering_results.csv", "r", encoding="utf-8") as f:
        clu_rows = list(csv.DictReader(f))
    if clu_rows:
        best = max(clu_rows, key=lambda r: float(r["Silhouette"]))
        best_k, best_sil = best["k"], best["Silhouette"]

nbr_rows = []
if os.path.exists("task2_neighbors.csv"):
    with open("task2_neighbors.csv", "r", encoding="utf-8") as f:
        nbr_rows = list(csv.DictReader(f))[:15]

# ---------- Watchlist top 5 ----------
watch_top = []
if os.path.exists("watchlist_apps.csv"):
    with open("watchlist_apps.csv", "r", encoding="utf-8") as f:
        watch_top = list(csv.DictReader(f))[:5]

# ---------------- Compose Markdown ----------------
lines = []
lines.append("# Key Results — Milestone 1\n")

# Transactions (explicit)
lines += [
"## Transactions — Coverage",
"| Item | Value |",
"|---|---:|",
f"| Transactions rows read | **{(tx_rows if isinstance(tx_rows,int) else tx_rows):,}** |" if isinstance(tx_rows, int) else f"| Transactions rows read | **{tx_rows}** |",
f"| Aggregated hourly windows | **{agg_windows:,}** |" if isinstance(agg_windows, int) else "| Aggregated hourly windows | **N/A** |",
f"| Feature matrix shape | **{feat_shape or 'N/A'}** |",
""
]

# Data & feature engineering (daily/monthly + services)
lines += [
"## Data & Feature Engineering (Daily/Monthly)",
"| Item | Value |",
"|---|---:|",
f"| Unique services represented | **{services_count}** |",
f"| Daily metrics rows / days | **{(daily_rows and format(daily_rows, ',') or 'N/A')} / {days_unique}** |",
f"| Monthly metrics rows / months | **{(monthly_rows and format(monthly_rows, ',') or 'N/A')} / {months_unique}** |",
""
]

# Task 1 variants
lines += [
"## Task 1 — Naive Bayes (baseline + variants)",
"| Feature Set / Variant | Precision | Recall | F1 | PR-AUC | Accuracy (mean) | Threshold | Valid size | Pos rate (valid) |",
"|---|---:|---:|---:|---:|---:|---:|---:|---:|",
]
for r in t1_rows:
    lines.append(
        f"| {r['Feature Set / Variant']} | {fmt(r['Precision'])} | {fmt(r['Recall'])} | {fmt(r['F1'])} | {fmt(r['PR-AUC'])} | "
        f"{fmt(r['Accuracy (mean)'])} | {fmt(r['Threshold'])} | {int(float(r['Valid size'])):,} | {fmt(r['Pos rate (valid)'])} |"
    )
lines.append("")

# Task 1 PR/confusion
if conf:
    lines += [
    "## Task 1 — PR/Threshold & Confusion (at best-F1 threshold)",
    "| Metric | Value |",
    "|---|---:|",
    f"| Best threshold (by F1) | **{fmt(conf.get('threshold'))}** |",
    f"| Average precision (PR-AUC) | **{fmt(conf.get('avg_precision'))}** |",
    f"| Accuracy (mean) | **{fmt(conf.get('accuracy'))}** |",
    f"| Precision / Recall | **{fmt(conf.get('precision'))} / {fmt(conf.get('recall'))}** |",
    f"| F1 | **{fmt(conf.get('f1'))}** |",
    f"| Validation positive rate | **{fmt(conf.get('pos_rate_valid'))}** |",
    f"| Predicted positive rate | **{fmt(conf.get('pred_pos_rate'))}** |",
    f"| Confusion matrix (valid) | **TN={int(conf.get('TN',0)):,}  FP={int(conf.get('FP',0)):,}  FN={int(conf.get('FN',0)):,}  TP={int(conf.get('TP',0)):,}** |",
    ""
    ]

# Task 1 Top Incidents (transactions-derived)
if top_inc:
    lines += [
    "## Task 1 — Top Incidents (from transactions, Top 10)",
    "| time_bucket | consumer_id | supplier_id | probability | predicted_anomaly | weak_label | req_count | error_rate | cost_sum | cost_mean | data_sum | data_mean |",
    "|---|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|",
    ]
    for r in top_inc:
        lines.append(
            f"| {r['time_bucket']} | {r['consumer_id']} | {r['supplier_id']} | {fmt(r['probability'],3)} | {r['predicted_anomaly']} | {r['weak_label']} | "
            f"{fmt(r['req_count'],0)} | {fmt(r['error_rate'],3)} | {fmt(r['cost_sum'],2)} | {fmt(r['cost_mean'],2)} | {fmt(r['data_sum'],2)} | {fmt(r['data_mean'],2)} |"
        )
    lines.append("")

# Task 2 summary
lines += [
"## Task 2 — Service Similarity & Clustering (Cosine)",
"| Item | Value |",
"|---|---:|",
f"| Services (vectors) | **{services_count}** |",
"| Role vector dimensionality | **8** |",
f"| Best k (KMeans, cosine) | **{best_k or 'N/A'}** |",
f"| Silhouette (cosine) @ best k | **{fmt(best_sil) if best_sil else 'N/A'}** |",
""
]

# Task 2 clustering sweep (full)
if clu_rows:
    lines += [
    "### Clustering Sweep (Silhouette over k)",
    "| k | Silhouette (cosine) |",
    "|---:|---:|",
    ]
    for r in clu_rows:
        lines.append(f"| {r['k']} | {fmt(r['Silhouette'])} |")
    lines.append("")

# Task 2 nearest neighbors (sample)
if nbr_rows:
    lines += [
    "### Nearest Neighbors (Top-5 per service, sample of 15)",
    "| service | n1 | s1 | n2 | s2 | n3 | s3 | n4 | s4 | n5 | s5 |",
    "|---|---|---:|---|---:|---|---:|---|---:|---|---:|",
    ]
    for r in nbr_rows:
        lines.append("| " + " | ".join([r[h] for h in ["service","n1","s1","n2","s2","n3","s3","n4","s4","n5","s5"]]) + " |")
    lines.append("")

# Watchlist Top-5
if watch_top:
    lines += [
    "## Watchlist — Top 5 Apps by Anomaly Impact × Cost (validation slice)",
    "| # | App | Anomalies (total) | Score_total | Latest month | Cost_latest | MoM cost % | Value/Cost (latest) | Outages (count / duration) |",
    "|---:|---|---:|---:|---|---:|---:|---:|---:|",
    ]
    for i, r in enumerate(watch_top, 1):
        lines.append(
            f"| {i} | {r['app_id']} | {int(float(r['anomalies_total'])):,} | {fmt(r['score_total'],2)} | {r['latest_month']} | "
            f"{fmt(r.get('cost_latest',''),2)} | {fmt(r.get('cost_mom_pct',''))} | {fmt(r.get('value_per_cost_latest',''),5)} | "
            f"{int(float(r.get('outage_count_latest',0))):,} / {int(float(r.get('outage_duration_latest',0))):,} |"
        )
    lines.append("")

# Write + preview
with open(out_md, "w", encoding="utf-8") as f:
    f.write("\n".join(lines))

print("Saved:", out_md)
with open(out_md, "r", encoding="utf-8") as f:
    for i, line in zip(range(40), f):
        print(line.rstrip())


Saved: milestone1_key_tables.md
# Key Results — Milestone 1

## Transactions — Coverage
| Item | Value |
|---|---:|
| Transactions rows read | **7,254,656** |
| Aggregated hourly windows | **1,540,175** |
| Feature matrix shape | **(1,540,175, 6)** |

## Data & Feature Engineering (Daily/Monthly)
| Item | Value |
|---|---:|
| Unique services represented | **87** |
| Daily metrics rows / days | **666,855 / 365** |
| Monthly metrics rows / months | **18,270 / 10** |

## Task 1 — Naive Bayes (baseline + variants)
| Feature Set / Variant | Precision | Recall | F1 | PR-AUC | Accuracy (mean) | Threshold | Valid size | Pos rate (valid) |
|---|---:|---:|---:|---:|---:|---:|---:|---:|
| Counts only (NB) | 0.1057 | 0.8179 | 0.183 | 0.1021 | 0.3043 | 0.3514 | 308,035 | 0.0849 |
| All features (NB) | 0.9029 | 0.9452 | 0.9235 | 0.9775 | 0.9867 | 0.9806 | 308,035 | 0.0849 |
| PCA(3) + NB | 0.6811 | 0.9356 | 0.7883 | 0.6951 | 0.9573 | 0.1549 | 308,035 | 0.0849 |

## Task 1 — PR/Threshold & Confusion 