In [1]:
import re
import ast
from pathlib import Path
from typing import Dict, List, Tuple
import json
import pandas as pd
from collections import defaultdict

# Run configuration for the log-parsing cell.
FOLDER = Path("Tucsan_Sent")          # folder whose *.txt logs are parsed
OUTPUT_DIR = FOLDER / "_parsed"       # destination for the optional CSV/JSON exports
SPLIT_POSITION = True                 # forwarded to parse_folder(split_position=...)
SAVE_CSVS = False                     # when True, write one CSV per parsed file
SAVE_JSON = False                     # when True, dump the run-window dicts as JSON

# Matches one "INFO - RabbitMQ | Published message: {...}" log line and captures:
#   log_time - the leading "YYYY-MM-DD HH:MM:SS,mmm" timestamp
#   payload  - the "{...}" literal, greedy up to the last "}" on the line
# The pattern is anchored at both ends (only trailing whitespace is tolerated).
LINE_RE = re.compile(
    r"^(?P<log_time>\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2},\d{3})\s+-\s+INFO\s+-\s+RabbitMQ\s+\|\s+Published message:\s+(?P<payload>\{.*\})\s*$"
)

# Column order applied by parse_file: the names below that exist in the parsed
# frame come first, in this order; every other column follows afterwards.
PREFERRED_ORDER = [
    "_log_time",
    "timeStamp", "id", "cType",
    "position", "pos_x", "pos_y", "pos_z",
    "heading", "speed",
    "width", "lengthf", "lengthb", "height", "accMax",
    "t_veh_sent", "t_meta_rev", "t_meta_sent",
    "t_intel_rev", "t_intel_sent",
    "t_veh_rev",
]

def parse_file(file_path: Path, split_position: bool = True) -> pd.DataFrame:
    """Parse one RabbitMQ publish log into a DataFrame, one row per message.

    Only lines matching ``LINE_RE`` are considered. The captured payload is
    evaluated with ``ast.literal_eval``; lines whose payload cannot be
    evaluated, or does not evaluate to a dict, are skipped.

    Args:
        file_path: Log file to read (decoded as UTF-8, undecodable bytes ignored).
        split_position: When True and the payload's ``position`` is a string of
            three comma-separated numbers, add ``pos_x``/``pos_y``/``pos_z``.

    Returns:
        DataFrame with a ``_log_time`` datetime column plus the payload keys,
        ordered by ``PREFERRED_ORDER`` first. Empty if nothing matched.
    """
    records = []
    with file_path.open("r", encoding="utf-8", errors="ignore") as f:
        for raw in f:
            m = LINE_RE.match(raw.strip())
            if not m:
                continue

            # LINE_RE already ends the payload at the last "}" on the line, so
            # re-trimming at rfind("}") could never recover a failed parse.
            try:
                payload = ast.literal_eval(m.group("payload"))
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                continue
            # "{1, 2}" is a valid literal but a set; only mappings become rows.
            if not isinstance(payload, dict):
                continue

            payload["_log_time"] = m.group("log_time")

            position = payload.get("position")
            if split_position and isinstance(position, str):
                try:
                    x_str, y_str, z_str = [t.strip() for t in position.split(",")]
                    payload["pos_x"] = float(x_str)
                    payload["pos_y"] = float(y_str)
                    payload["pos_z"] = float(z_str)
                except ValueError:
                    # Best effort: wrong component count or a non-numeric part
                    # leaves the remaining pos_* keys unset for this row.
                    pass

            records.append(payload)

    df = pd.DataFrame.from_records(records)

    if "_log_time" in df.columns:
        df["_log_time"] = pd.to_datetime(
            df["_log_time"], format="%Y-%m-%d %H:%M:%S,%f", errors="coerce"
        )

    if not df.empty:
        ordered = [c for c in PREFERRED_ORDER if c in df.columns]
        remaining = [c for c in df.columns if c not in ordered]
        df = df[ordered + remaining]

    return df

def parse_folder(folder_path: Path, split_position: bool = True) -> Dict[str, pd.DataFrame]:
    """Parse every ``*.txt`` file directly inside ``folder_path``.

    Files are visited in sorted path order; the result maps each file name to
    the DataFrame produced by ``parse_file`` for it.
    """
    return {
        log_path.name: parse_file(log_path, split_position=split_position)
        for log_path in sorted(folder_path.glob("*.txt"))
    }

def get_kph_key(file_name: str) -> str:
    """Return a normalized ``"<N>kph"`` key parsed from ``file_name``.

    A "kph" suffix is searched for across the whole name first; "kmh" is only
    tried when no "kph" match exists. Returns ``"unknown"`` if neither is found.
    """
    for pattern in (r"(\d{1,3})\s*kph", r"(\d{1,3})\s*kmh"):
        found = re.search(pattern, file_name, flags=re.IGNORECASE)
        if found is not None:
            return f"{int(found.group(1))}kph"
    return "unknown"

def extract_run_window(df: pd.DataFrame) -> Tuple[float, float]:
    """Return the (earliest, latest) numeric ``timeStamp`` found in ``df``.

    Values are coerced with ``pd.to_numeric``; entries that cannot be
    converted are ignored. The extremes are taken from the coerced numbers,
    so string-typed timestamps are compared numerically, not lexicographically.

    Raises:
        ValueError: if ``df`` is None/empty, lacks a ``timeStamp`` column, or
            holds no value convertible to a number.
    """
    if df is None or df.empty:
        raise ValueError("Empty DataFrame; no timeStamp values present.")
    if "timeStamp" not in df.columns:
        raise ValueError("Column 'timeStamp' not found in DataFrame.")
    stamps = pd.to_numeric(df["timeStamp"], errors="coerce").dropna()
    if stamps.empty:
        raise ValueError("No valid numeric 'timeStamp' values.")
    return float(stamps.min()), float(stamps.max())

def save_dataframes_as_csv(dfs_by_file: Dict[str, pd.DataFrame], out_dir: Path) -> None:
    """Write each DataFrame to ``out_dir/<stem of its key>.csv`` without the index.

    ``out_dir`` is created (including parents) when missing.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    for source_name, frame in dfs_by_file.items():
        csv_name = Path(source_name).stem + ".csv"
        # "utf-8-sig" prepends a BOM, which helps Excel show Unicode correctly.
        frame.to_csv(out_dir / csv_name, index=False, encoding="utf-8-sig")

# --- Driver: parse every log, derive each run's timestamp window, group by speed ---
if not FOLDER.exists():
    raise FileNotFoundError(f"Folder not found: {FOLDER.resolve()}")

dfs = parse_folder(FOLDER, split_position=SPLIT_POSITION)

if SAVE_CSVS:
    save_dataframes_as_csv(dfs, OUTPUT_DIR)

by_kph: Dict[str, List[List[float]]] = defaultdict(list)
by_kph_with_files: Dict[str, List[List[object]]] = defaultdict(list)

for fname, df in dfs.items():
    if df is not None and not df.empty:
        try:
            start_ts, end_ts = extract_run_window(df)
        except Exception as e:
            print(f"[WARN] Skipping {fname}: {e}")
        else:
            key = get_kph_key(fname)
            by_kph[key].append([start_ts, end_ts])
            by_kph_with_files[key].append([fname, start_ts, end_ts])

# Order each speed's runs by their start timestamp.
for windows in by_kph.values():
    windows.sort(key=lambda x: x[0])
for named_windows in by_kph_with_files.values():
    named_windows.sort(key=lambda x: x[1])

# Both dicts receive their keys together, so one ordering serves both listings:
# numeric speeds ascending, with "unknown" last.
ordered_kph = sorted(
    by_kph,
    key=lambda x: (x == "unknown", 0 if x == "unknown" else int(x.replace("kph", ""))),
)

print("=== Run windows by kph (lists of [start, end]) ===")
for k in ordered_kph:
    print(f"{k} = {by_kph[k]}")

print("\n=== Run windows by kph (with filenames) ===")
for k in ordered_kph:
    print(f"{k}:")
    for fname, s, e in by_kph_with_files[k]:
        print(f"  - {fname}: [{s}, {e}]")

if SAVE_JSON:
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    for json_name, grouped in (
        ("run_windows_by_kph.json", by_kph),
        ("run_windows_by_kph_with_files.json", by_kph_with_files),
    ):
        with (OUTPUT_DIR / json_name).open("w", encoding="utf-8") as f:
            json.dump(grouped, f, ensure_ascii=False, indent=2)

=== Run windows by kph (lists of [start, end]) ===
20kph = [[1757914226.6105194, 1757917563.0991616], [1757914226.6105194, 1757914420.6993194], [1757914226.6105194, 1757914550.887972], [1757914226.6105194, 1757914667.5927155], [1757914226.6105194, 1757914786.275123], [1757914226.6105194, 1757914923.5330007], [1757914226.6105194, 1757916150.7172039], [1757917557.797726, 1757917675.8044224], [1757917557.797726, 1757917787.7237718], [1757917557.797726, 1757917900.7781775], [1757917557.797726, 1757918013.5039613]]
30kph = [[1757914226.6105194, 1757915024.5590355], [1757914226.6105194, 1757915121.9400544], [1757914226.6105194, 1757915234.0027125], [1757914226.6105194, 1757915329.2626548], [1757914226.6105194, 1757915425.3140006], [1757917557.797726, 1757918125.5372443], [1757917557.797726, 1757918237.9573307], [1757917557.797726, 1757918350.3041887], [1757917557.797726, 1757918433.1740332], [1757917557.797726, 1757918531.0669913]]
40kph = [[1757914226.6105194, 1757915528.026441], [175791422

In [2]:
import json
from pathlib import Path
import pandas as pd

def load_jsonl_to_df(path: Path, split_position: bool = True) -> pd.DataFrame:
    """Load a log whose lines each embed one JSON object into a DataFrame.

    For every non-empty line, the text from the first "{" to the last "}" is
    decoded with ``json.loads``; lines without such a span, or whose span is
    not valid JSON, are skipped.

    Args:
        path: File to read (decoded as UTF-8, undecodable bytes ignored).
        split_position: When True and a record's ``position`` is a string of
            three comma-separated numbers, add ``pos_x``/``pos_y``/``pos_z``.

    Returns:
        DataFrame with one row per decoded object. When at least one record
        has a ``timeStamp`` key, a ``_ts_datetime`` column holds it interpreted
        as seconds since the epoch (NaT where missing or not convertible).
        Known columns are placed first in a fixed preferred order.

    Raises:
        FileNotFoundError: if ``path`` does not exist.
    """
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path.resolve()}")

    records = []
    ts_seconds = []  # parallel to ``records``: float seconds, NaN when unusable
    saw_timestamp = False

    with path.open("r", encoding="utf-8", errors="ignore") as f:
        for raw in f:
            line = raw.strip()
            start = line.find("{")
            end = line.rfind("}")
            # Covers empty lines, a missing brace, and "}" preceding "{".
            if start == -1 or end <= start:
                continue
            try:
                obj = json.loads(line[start:end + 1])
            except json.JSONDecodeError:
                continue

            position = obj.get("position")
            if split_position and isinstance(position, str):
                try:
                    x_str, y_str, z_str = [t.strip() for t in position.split(",")]
                    obj["pos_x"] = float(x_str)
                    obj["pos_y"] = float(y_str)
                    obj["pos_z"] = float(z_str)
                except ValueError:
                    # Best effort: wrong component count or a non-numeric part
                    # leaves the remaining pos_* keys unset for this row.
                    pass

            seconds = float("nan")
            if "timeStamp" in obj:
                saw_timestamp = True
                try:
                    seconds = float(obj["timeStamp"])
                except (TypeError, ValueError, OverflowError):
                    pass
            ts_seconds.append(seconds)

            records.append(obj)

    df = pd.DataFrame.from_records(records)

    if saw_timestamp:
        # One vectorized conversion instead of one pd.to_datetime call per row.
        df["_ts_datetime"] = pd.to_datetime(
            pd.Series(ts_seconds, index=df.index, dtype="float64"),
            unit="s",
            errors="coerce",
        )

    preferred = [
        "_ts_datetime", "timeStamp", "cType", "id",
        "position", "pos_x", "pos_y", "pos_z",
        "heading", "speed", "width", "lengthf", "lengthb", "height", "accMax",
        "t_veh_sent", "t_meta_rev", "t_vir_obj_sent", "t_vir_obj_rev",
        "t_meta_sent", "t_intel_rev", "t_intel_sent", "t_veh_rev"
    ]
    if not df.empty:
        ordered = [c for c in preferred if c in df.columns]
        remaining = [c for c in df.columns if c not in ordered]
        df = df[ordered + remaining]

    return df

base = Path(".")

# Notebook variable name -> log file that populates it.
files = {
    var_name: base / file_name
    for var_name, file_name in (
        ("df_avante_sent", "0915_Avante_Sent.txt"),
        ("df_sedan_to_vehicle", "0915_sedan_to_vehicle.txt"),
        ("df_tucsan_received", "0915_Tucsan_Received.txt"),
    )
}

# Publish every loaded frame as a module-level variable.
for var_name, path in files.items():
    globals()[var_name] = load_jsonl_to_df(path, split_position=True)

# Quick sanity prints: row count plus at most the first ten column names.
for var_name in files:
    df = globals()[var_name]
    shown_columns = list(df.columns)[:10]
    ellipsis_mark = "..." if df.shape[1] > 10 else ""
    print(f"{var_name}: {len(df)} rows, columns = {shown_columns}{ellipsis_mark}")

df_avante_sent: 326140 rows, columns = ['_ts_datetime', 'timeStamp', 'cType', 'id', 'position', 'pos_x', 'pos_y', 'pos_z', 'heading', 'speed']...
df_sedan_to_vehicle: 108263 rows, columns = ['_ts_datetime', 'timeStamp', 'cType', 'id', 'position', 'pos_x', 'pos_y', 'pos_z', 'heading', 'speed']...
df_tucsan_received: 14556 rows, columns = ['_ts_datetime', 'timeStamp', 'cType', 'id', 'position', 'pos_x', 'pos_y', 'pos_z', 'heading', 'speed']...


In [3]:
import re
from pathlib import Path
from typing import Dict
import pandas as pd

# Folder scanned for receiver-side *.txt logs.
VRT_FOLDER = Path("Virtual_Received_By_Tucsan")  # folder next to the notebook
# Passed as traffic_id_keep: only log lines with this traffic_id are kept.
TRAFFIC_ID_KEEP = 3

# Matches one receiver log line of the form
#   [INFO] [<ros_time>]: traffic_id: <int>, t_veh_sent: <num>, current: <num>, latency: <num>
# The three numeric fields also accept exponent notation (e.g. "9.5e-05"),
# which float() parses; without it such lines would be silently dropped.
_VRT_LINE_RE = re.compile(
    r"""^\[INFO\]\s+\[(?P<ros_time>\d+(?:\.\d+)?)\]:\s*
        traffic_id:\s*(?P<traffic_id>\d+),\s*
        t_veh_sent:\s*(?P<t_veh_sent>-?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?),\s*
        current:\s*(?P<current>-?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?),\s*
        latency:\s*(?P<latency>-?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?)
        \s*$""",
    re.VERBOSE,
)

def _kph_from_name(name: str) -> str:
    """Return ``"<N>kph"`` for the first "<digits>kph"/"<digits>kmh" in ``name``, else ``"unknown"``."""
    hit = re.search(r"(\d{1,3})\s*(kph|kmh)", name, flags=re.IGNORECASE)
    if hit is None:
        return "unknown"
    return f"{int(hit.group(1))}kph"

def parse_virtual_received_file(path: Path, traffic_id_keep: int = 3) -> pd.DataFrame:
    """Parse one receiver log, keeping only rows for ``traffic_id_keep``.

    Blank lines, lines containing "speed_suggest", and lines that do not
    match ``_VRT_LINE_RE`` are skipped.

    Args:
        path: Log file to read (decoded as UTF-8, undecodable bytes ignored).
        traffic_id_keep: Only lines whose ``traffic_id`` equals this are kept.

    Returns:
        DataFrame sorted by ``_ros_time`` with columns ``source_file``, ``kph``,
        ``_ros_datetime``, ``_ros_time``, ``traffic_id``, ``t_veh_sent``,
        ``current`` and ``latency``; an empty DataFrame when nothing matched.
    """
    # Both values depend only on the file, so compute them once, not per row.
    source_file = path.name
    kph = _kph_from_name(source_file)

    rows = []
    with path.open("r", encoding="utf-8", errors="ignore") as f:
        for raw in f:
            line = raw.strip()
            if not line or "speed_suggest" in line:  # explicitly ignore suggestion lines
                continue
            m = _VRT_LINE_RE.match(line)
            if not m:
                continue
            traffic_id = int(m.group("traffic_id"))
            if traffic_id != traffic_id_keep:
                continue
            rows.append({
                "source_file": source_file,
                "kph": kph,
                "_ros_time": float(m.group("ros_time")),
                "traffic_id": traffic_id,
                "t_veh_sent": float(m.group("t_veh_sent")),
                "current": float(m.group("current")),
                "latency": float(m.group("latency")),
            })

    df = pd.DataFrame.from_records(rows)
    if df.empty:
        return df

    df["_ros_datetime"] = pd.to_datetime(df["_ros_time"], unit="s", errors="coerce")
    # Every row dict has the same keys, so this list is the complete column set.
    columns = ["source_file", "kph", "_ros_datetime", "_ros_time",
               "traffic_id", "t_veh_sent", "current", "latency"]
    # Stable sort: rows with equal _ros_time keep their order from the file.
    return (
        df[columns]
        .sort_values("_ros_time", kind="mergesort")
        .reset_index(drop=True)
    )

def parse_virtual_received_folder(folder: Path, traffic_id_keep: int = 3) -> Dict[str, pd.DataFrame]:
    """Parse every ``*.txt`` file directly inside ``folder``.

    Files are visited in sorted path order; the result maps each file name to
    the DataFrame returned by ``parse_virtual_received_file`` for it.
    """
    return {
        log_path.name: parse_virtual_received_file(log_path, traffic_id_keep=traffic_id_keep)
        for log_path in sorted(folder.glob("*.txt"))
    }

# --- Driver: parse all receiver logs and report how many lines each one kept ---
if not VRT_FOLDER.exists():
    raise FileNotFoundError(f"Folder not found: {VRT_FOLDER.resolve()}")

vrt_dfs_by_file = parse_virtual_received_folder(VRT_FOLDER, traffic_id_keep=TRAFFIC_ID_KEEP)

resolved_folder = VRT_FOLDER.resolve()
file_total = len(vrt_dfs_by_file)
print(f"Parsed {file_total} files from {resolved_folder}")
for name, df in vrt_dfs_by_file.items():
    kept_rows = len(df)
    print(f" - {name}: kept {kept_rows} lines (traffic_id={TRAFFIC_ID_KEEP})")


Parsed 17 files from C:\Users\user\Desktop\mOS\mOS_Data_Analysis\Packet Loss\Virtual_Received_By_Tucsan
 - speed_sueggest_250915_001_20kph+vir.txt: kept 820 lines (traffic_id=3)
 - speed_sueggest_250915_001_30kph+vir.txt: kept 541 lines (traffic_id=3)
 - speed_sueggest_250915_001_40kph+vir.txt: kept 431 lines (traffic_id=3)
 - speed_sueggest_250915_001_50kph+vir.txt: kept 367 lines (traffic_id=3)
 - speed_sueggest_250915_002_20kph+vir.txt: kept 757 lines (traffic_id=3)
 - speed_sueggest_250915_002_30kph+vir.txt: kept 597 lines (traffic_id=3)
 - speed_sueggest_250915_002_40kph+vir.txt: kept 427 lines (traffic_id=3)
 - speed_sueggest_250915_002_50kph+vir.txt: kept 364 lines (traffic_id=3)
 - speed_sueggest_250915_003_20kph+vir.txt: kept 704 lines (traffic_id=3)
 - speed_sueggest_250915_003_30kph+vir.txt: kept 856 lines (traffic_id=3)
 - speed_sueggest_250915_003_40kph+vir.txt: kept 433 lines (traffic_id=3)
 - speed_sueggest_250915_004_20kph+vir.txt: kept 754 lines (traffic_id=3)
 - speed

In [4]:
import math
import numpy as np
import pandas as pd
import re

# Tolerance (seconds) handed to the greedy matcher when pairing t_veh_sent values.
TOL_SEC = 1e-6

# Both inputs come from earlier cells; stop with a pointer to the missing step.
for _required, _message in (
    ("df_sedan_to_vehicle", "df_sedan_to_vehicle not found. Load 0915_sedan_to_vehicle.txt first."),
    ("vrt_dfs_by_file", "vrt_dfs_by_file not found. Parse Virtual_Received_By_Tucsan first."),
):
    if _required not in globals():
        raise NameError(_message)

if "t_veh_sent" not in df_sedan_to_vehicle.columns:
    raise KeyError("df_sedan_to_vehicle missing 't_veh_sent' column.")
# Coerce the sender timestamps to numbers in place; unparsable entries become NaN.
_sender_times = pd.to_numeric(df_sedan_to_vehicle["t_veh_sent"], errors="coerce")
df_sedan_to_vehicle["t_veh_sent"] = _sender_times

# Empty receiver frames are tolerated; non-empty ones must carry the column.
for _name, _df in vrt_dfs_by_file.items():
    if _df.empty or "t_veh_sent" in _df.columns:
        continue
    raise KeyError(f"Receiver df '{_name}' missing 't_veh_sent' column.")

def _kph_from_name(name: str) -> str:
    """Normalize the first "<digits>kph" or "<digits>kmh" token in ``name`` to ``"<N>kph"``.

    Matching is case-insensitive; ``"unknown"`` is returned when no token exists.
    """
    found = re.search(r"(\d{1,3})\s*(kph|kmh)", name, flags=re.IGNORECASE)
    if found:
        digits = found.group(1)
        return f"{int(digits)}kph"
    return "unknown"

def _speed_sort_key(k: str):
    """Sort key for speed labels: numeric "<N>kph" ascending, ``"unknown"`` last.

    Returns ``(k == "unknown", N)`` for a well-formed "<N>kph" string and
    ``(k == "unknown", math.inf)`` for anything else.
    """
    is_unknown = k == "unknown"
    if isinstance(k, str) and k.endswith("kph") and k[:-3].isdigit():
        return (is_unknown, int(k.replace("kph", "")))
    return (is_unknown, math.inf)

def _unique_sorted_positive(series: pd.Series) -> np.ndarray:
    """Return the distinct, strictly positive numeric values of ``series``, ascending.

    Entries that cannot be converted to a number are dropped. The result is a
    float array, empty when nothing qualifies.
    """
    numeric = pd.to_numeric(series, errors="coerce").dropna().astype(float)
    positive = numeric[numeric > 0]
    if positive.empty:
        return np.array([], dtype=float)
    # np.unique returns its result sorted, so no separate sort is required.
    return np.unique(positive.to_numpy())

def _greedy_match_count(a: np.ndarray, b: np.ndarray, tol: float) -> int:
    """Count one-to-one pairs between ``a`` and ``b`` that differ by at most ``tol``.

    Both arrays are walked front to back with one cursor each: a pair within
    ``tol`` consumes both entries, otherwise the cursor on the smaller value
    advances. Callers in this notebook pass ascending arrays.
    """
    pos_a = pos_b = 0
    hits = 0
    len_a, len_b = len(a), len(b)
    while pos_a < len_a and pos_b < len_b:
        gap = a[pos_a] - b[pos_b]
        if abs(gap) <= tol:
            hits += 1
            pos_a += 1
            pos_b += 1
            continue
        if gap < 0:
            pos_a += 1
        else:
            pos_b += 1
    return hits

# One result row per receiver file: sender stamps inside the receiver's
# [first, last] t_veh_sent window are matched against the received stamps.
rows_srv2cav = []

for recv_name, recv_df in sorted(vrt_dfs_by_file.items()):
    recv_vals = _unique_sorted_positive(recv_df.get("t_veh_sent", pd.Series(dtype=float)))

    # Defaults describe a receiver file without any usable t_veh_sent value.
    row = {
        "receiver_file": recv_name,
        "speed": _kph_from_name(recv_name),
        "start_t_veh_sent": np.nan,
        "end_t_veh_sent": np.nan,
        "sender_count": np.nan,
        "receiver_count": 0,
        "matched_count": 0,
        "packet_loss_pct": np.nan,
        "tol_sec": TOL_SEC,
    }

    if recv_vals.size > 0:
        start_t = float(recv_vals[0])
        end_t = float(recv_vals[-1])

        # Inclusive on both ends, like the >= / <= pair it replaces.
        in_window = df_sedan_to_vehicle["t_veh_sent"].between(start_t, end_t)
        sender_vals = _unique_sorted_positive(df_sedan_to_vehicle.loc[in_window, "t_veh_sent"])

        sender_count = int(sender_vals.size)
        receiver_count = int(recv_vals.size)
        matched = _greedy_match_count(sender_vals, recv_vals, tol=TOL_SEC)

        if sender_count > 0:
            pkt_loss_pct = 100.0 * (1.0 - matched / sender_count)
        else:
            pkt_loss_pct = np.nan

        row.update(
            start_t_veh_sent=start_t,
            end_t_veh_sent=end_t,
            sender_count=sender_count,
            receiver_count=receiver_count,
            matched_count=matched,
            packet_loss_pct=pkt_loss_pct,
        )

    rows_srv2cav.append(row)

pktloss_server_to_cav_runs = pd.DataFrame(rows_srv2cav)

def _q1(x):
    """Return the 0.25 quantile of ``x`` (first quartile)."""
    return x.quantile(0.25)


def _q3(x):
    """Return the 0.75 quantile of ``x`` (third quartile)."""
    return x.quantile(0.75)

# Per-speed distribution of packet loss; runs with undefined loss are excluded.
_loss_by_speed = (
    pktloss_server_to_cav_runs
    .dropna(subset=["packet_loss_pct"])
    .groupby("speed")["packet_loss_pct"]
)
pktloss_server_to_cav_by_speed = _loss_by_speed.agg(
    mean="mean", std="std", min="min", q1=_q1, q3=_q3, max="max"
).reset_index()
pktloss_server_to_cav_by_speed = pktloss_server_to_cav_by_speed.sort_values(
    by="speed", key=lambda s: s.map(_speed_sort_key)
)

pd.set_option("display.float_format", lambda v: f"{v:0.6f}")

_run_columns = [
    "receiver_file", "speed", "start_t_veh_sent", "end_t_veh_sent",
    "sender_count", "receiver_count", "matched_count", "packet_loss_pct", "tol_sec",
]
print("=== Metaverse Server -> CAV packet loss per receiver file ===")
print(pktloss_server_to_cav_runs[_run_columns])

print("\n=== Packet loss summary by speed (Server -> CAV) ===")
print(pktloss_server_to_cav_by_speed)

=== Metaverse Server -> CAV packet loss per receiver file ===
                              receiver_file  speed  start_t_veh_sent  \
0   speed_sueggest_250915_001_20kph+vir.txt  20kph 1757917458.993619   
1   speed_sueggest_250915_001_30kph+vir.txt  30kph 1757918013.503961   
2   speed_sueggest_250915_001_40kph+vir.txt  40kph 1757918531.066991   
3   speed_sueggest_250915_001_50kph+vir.txt  50kph 1757919030.999043   
4   speed_sueggest_250915_002_20kph+vir.txt  20kph 1757917562.989060   
5   speed_sueggest_250915_002_30kph+vir.txt  30kph 1757918125.537244   
6   speed_sueggest_250915_002_40kph+vir.txt  40kph 1757918619.511653   
7   speed_sueggest_250915_002_50kph+vir.txt  50kph 1757919112.457446   
8   speed_sueggest_250915_003_20kph+vir.txt  20kph 1757917675.744356   
9   speed_sueggest_250915_003_30kph+vir.txt  30kph 1757918237.957331   
10  speed_sueggest_250915_003_40kph+vir.txt  40kph 1757918733.455906   
11  speed_sueggest_250915_004_20kph+vir.txt  20kph 1757917787.603899   
12

In [5]:
import numpy as np
import pandas as pd

# Presumably the margin (seconds) by which the sender window is widened for the
# "expanded" comparison.
# NOTE(review): confirm the margins actually applied in the loop below match this value.
EXPAND_SEC = 0.05

# Tolerance (seconds) passed to _greedy_match when pairing t_veh_sent values.
TOL_SEC = 1e-6

def _uniq_sorted_pos(x: pd.Series) -> np.ndarray:
    """Return the distinct values of ``x`` that are numeric and > 0, in ascending order.

    Non-numeric entries are discarded; an empty float array is returned when
    no value remains.
    """
    cleaned = pd.to_numeric(x, errors="coerce").dropna().astype(float)
    kept = cleaned.loc[cleaned.gt(0)]
    # np.unique already yields sorted output.
    return np.array([], dtype=float) if kept.empty else np.unique(kept.values)

def _greedy_match(a: np.ndarray, b: np.ndarray, tol: float) -> int:
    """Count one-to-one pairs between ``a`` and ``b`` whose difference is within ``tol``.

    Each element of ``a`` is compared against a forward-only cursor into
    ``b``; a matched pair consumes both entries. Callers in this notebook
    pass ascending arrays.
    """
    hits = 0
    cursor = 0
    limit = len(b)
    for value in a:
        # Skip entries of b that lie more than tol below the current value.
        while cursor < limit:
            gap = value - b[cursor]
            if abs(gap) <= tol or gap < 0:
                break
            cursor += 1
        if cursor >= limit:
            break
        if abs(value - b[cursor]) <= tol:
            hits += 1
            cursor += 1
    return hits

# Window check: for each receiver file, compare the loss measured against
# sender stamps inside the exact receiver window [r0, r1] with the loss
# measured against a slightly widened sender window.

# Leading margin (seconds) of the widened window; the trailing margin is EXPAND_SEC.
# NOTE(review): the leading margin (0.09) differs from EXPAND_SEC (0.05);
# both values are kept as they were hard-coded - confirm the asymmetry is intended.
EXPAND_BEFORE_SEC = 0.09

# The sender timeline is the same for every receiver file, so build it once
# instead of re-deriving it on each iteration.
sender_vals_full = _uniq_sorted_pos(df_sedan_to_vehicle["t_veh_sent"])

rows_check = []
for recv_name, recv_df in sorted(vrt_dfs_by_file.items()):
    recv_vals = _uniq_sorted_pos(recv_df.get("t_veh_sent", pd.Series(dtype=float)))
    if recv_vals.size == 0:
        # No usable receiver stamps: counts are zero and loss is undefined.
        rows_check.append({
            "receiver_file": recv_name,
            "sender_in_recv_count": np.nan,
            "receiver_count": 0,
            "matched_in_recv": 0,
            "loss_pct_in_recv": np.nan,
            "sender_in_exp_count": np.nan,
            "matched_in_exp": 0,
            "loss_pct_in_exp": np.nan,
        })
        continue

    r0, r1 = float(recv_vals[0]), float(recv_vals[-1])

    sender_in_recv = sender_vals_full[(sender_vals_full >= r0) & (sender_vals_full <= r1)]
    sender_in_exp = sender_vals_full[
        (sender_vals_full >= r0 - EXPAND_BEFORE_SEC) & (sender_vals_full <= r1 + EXPAND_SEC)
    ]

    matched_in_recv = _greedy_match(sender_in_recv, recv_vals, TOL_SEC)
    loss_pct_in_recv = 100.0 * (1.0 - matched_in_recv / len(sender_in_recv)) if len(sender_in_recv) > 0 else np.nan

    matched_in_exp = _greedy_match(sender_in_exp, recv_vals, TOL_SEC)
    loss_pct_in_exp = 100.0 * (1.0 - matched_in_exp / len(sender_in_exp)) if len(sender_in_exp) > 0 else np.nan

    rows_check.append({
        "receiver_file": recv_name,
        "sender_in_recv_count": int(len(sender_in_recv)),
        "receiver_count": int(len(recv_vals)),
        "matched_in_recv": int(matched_in_recv),
        "loss_pct_in_recv": loss_pct_in_recv,
        "sender_in_exp_count": int(len(sender_in_exp)),
        "matched_in_exp": int(matched_in_exp),
        "loss_pct_in_exp": loss_pct_in_exp,
    })

srv2cav_window_check = pd.DataFrame(rows_check)
pd.set_option("display.float_format", lambda v: f"{v:0.6f}")
print(srv2cav_window_check)

                              receiver_file  sender_in_recv_count  \
0   speed_sueggest_250915_001_20kph+vir.txt                   319   
1   speed_sueggest_250915_001_30kph+vir.txt                   206   
2   speed_sueggest_250915_001_40kph+vir.txt                   160   
3   speed_sueggest_250915_001_50kph+vir.txt                   129   
4   speed_sueggest_250915_002_20kph+vir.txt                   286   
5   speed_sueggest_250915_002_30kph+vir.txt                   248   
6   speed_sueggest_250915_002_40kph+vir.txt                   162   
7   speed_sueggest_250915_002_50kph+vir.txt                   158   
8   speed_sueggest_250915_003_20kph+vir.txt                   290   
9   speed_sueggest_250915_003_30kph+vir.txt                   358   
10  speed_sueggest_250915_003_40kph+vir.txt                   187   
11  speed_sueggest_250915_004_20kph+vir.txt                   290   
12  speed_sueggest_250915_004_30kph+vir.txt                   222   
13  speed_sueggest_250915_004_40kp

In [6]:
import re
import numpy as np
import pandas as pd

# Guard: this cell reads srv2cav_window_check, so stop early with a clear
# message when that name is absent from the notebook globals.
if "srv2cav_window_check" not in globals():
    raise NameError("srv2cav_window_check not found. Run the window-check cell first.")

def _kph_from_name(name: str) -> str:
    """Return ``"<N>kph"`` parsed from ``str(name)``, or ``"unknown"`` without a speed token.

    A token is 1-3 digits followed by "kph" or "kmh" (case-insensitive).
    """
    text = str(name)
    speed = re.search(r"(\d{1,3})\s*(kph|kmh)", text, flags=re.IGNORECASE)
    return "unknown" if speed is None else f"{int(speed.group(1))}kph"

def _speed_sort_key(k: str):
    """Sort key for speed labels: ``(k == "unknown", N)`` for "<N>kph" labels.

    Any label whose numeric part cannot be parsed (including ``"unknown"``)
    maps to ``(True, 10**9)`` so it sorts after every numeric speed.
    """
    try:
        is_unknown = k == "unknown"
        speed = int(k.replace("kph", ""))
    except Exception:
        return (True, 10**9)
    return (is_unknown, speed)

def _q1(x):
    """First quartile: the 0.25 quantile of ``x``."""
    lower_quartile = x.quantile(0.25)
    return lower_quartile


def _q3(x):
    """Third quartile: the 0.75 quantile of ``x``."""
    upper_quartile = x.quantile(0.75)
    return upper_quartile

# Derived per-file columns: speed label, retention (100 - loss) and the
# expanded-minus-receiver deltas.
df_sum = srv2cav_window_check.assign(
    speed=srv2cav_window_check["receiver_file"].map(_kph_from_name),
    retention_in_recv=100.0 - srv2cav_window_check["loss_pct_in_recv"],
    retention_in_exp=100.0 - srv2cav_window_check["loss_pct_in_exp"],
    delta_loss_pct=srv2cav_window_check["loss_pct_in_exp"] - srv2cav_window_check["loss_pct_in_recv"],
    delta_matched=srv2cav_window_check["matched_in_exp"] - srv2cav_window_check["matched_in_recv"],
    delta_sender=srv2cav_window_check["sender_in_exp_count"] - srv2cav_window_check["sender_in_recv_count"],
)

# Statistics reported for each loss column, in output-column order.
_LOSS_STATS = (
    ("mean", "mean"), ("std", "std"), ("min", "min"), ("q1", _q1),
    ("median", "median"), ("q3", _q3), ("max", "max"),
)

_recv_spec = {
    "n_files": ("receiver_file", "count"),
    "sender_in_recv_mean": ("sender_in_recv_count", "mean"),
    "receiver_count_mean": ("receiver_count", "mean"),
}
_recv_spec.update(
    {f"loss_recv_{_label}": ("loss_pct_in_recv", _func) for _label, _func in _LOSS_STATS}
)

_exp_spec = {
    "n_files": ("receiver_file", "count"),
    "sender_in_exp_mean": ("sender_in_exp_count", "mean"),
}
_exp_spec.update(
    {f"loss_exp_{_label}": ("loss_pct_in_exp", _func) for _label, _func in _LOSS_STATS}
)
_exp_spec.update({
    "pct_runs_more_matched": ("delta_matched", lambda s: 100.0 * (s > 0).mean()),
    "pct_runs_less_matched": ("delta_matched", lambda s: 100.0 * (s < 0).mean()),
    "delta_loss_mean": ("delta_loss_pct", "mean"),
    "delta_loss_median": ("delta_loss_pct", "median"),
})

_by_speed = df_sum.groupby("speed", dropna=False)

summary_recv = _by_speed.agg(**_recv_spec).reset_index()
summary_recv = summary_recv.sort_values(by="speed", key=lambda s: s.map(_speed_sort_key))

summary_exp = _by_speed.agg(**_exp_spec).reset_index()
summary_exp = summary_exp.sort_values(by="speed", key=lambda s: s.map(_speed_sort_key))

_quartiles = [0.25, 0.5, 0.75]
overall_recv = df_sum["loss_pct_in_recv"].describe(percentiles=_quartiles).rename("loss_in_recv_desc")
overall_exp = df_sum["loss_pct_in_exp"].describe(percentiles=_quartiles).rename("loss_in_exp_desc")

pd.set_option("display.float_format", lambda v: f"{v:0.6f}")

print("=== Per-speed summary (Receiver window) ===")
print(summary_recv)

print("\n=== Per-speed summary (Expanded window) ===")
print(summary_exp)

print("\n=== Overall loss distribution — Receiver window ===")
print(overall_recv)

print("\n=== Overall loss distribution — Expanded window ===")
print(overall_exp)

print("\n=== Quick diagnostics ===")
_delta_matched = df_sum["delta_matched"]
for _verb, _mask in (("increased", _delta_matched > 0), ("decreased", _delta_matched < 0)):
    print(f"Files where expanded window {_verb} matched count: {_mask.sum()} / {len(df_sum)} "
          f"({100.0*_mask.mean():.2f}%)")
print(f"Mean Δloss (expanded - receiver) across all files: {df_sum['delta_loss_pct'].mean():.6f} %")

=== Per-speed summary (Receiver window) ===
   speed  n_files  sender_in_recv_mean  receiver_count_mean  loss_recv_mean  \
0  20kph        5           294.400000           294.400000        0.000000   
1  30kph        5           250.800000           250.800000        0.000000   
2  40kph        5           177.200000           177.200000        0.000000   
3  50kph        2           143.500000           143.500000        0.000000   

   loss_recv_std  loss_recv_min  loss_recv_q1  loss_recv_median  loss_recv_q3  \
0       0.000000       0.000000      0.000000          0.000000      0.000000   
1       0.000000       0.000000      0.000000          0.000000      0.000000   
2       0.000000       0.000000      0.000000          0.000000      0.000000   
3       0.000000       0.000000      0.000000          0.000000      0.000000   

   loss_recv_max  
0       0.000000  
1       0.000000  
2       0.000000  
3       0.000000  

=== Per-speed summary (Expanded window) ===
   speed  n_fi