In [7]:
# --------------------------------------------------------------
#  IMPORTS
# --------------------------------------------------------------
import pathlib
import re
from typing import List, Tuple

import pandas as pd
import numpy as np

# --------------------------------------------------------------
#  HELPER – extract the case identifier (the part before the first '_')
# --------------------------------------------------------------
def _case_id_from_filename(fname: str) -> int | str:
    """
    Return the part of *fname* before the first underscore.
    If it can be converted to an int the integer is returned,
    otherwise the raw string is returned.
    """
    stem = pathlib.Path(fname).stem          # drop the final extension
    case_part = stem.split("_")[0]
    try:
        return int(case_part)
    except ValueError:
        return case_part


# --------------------------------------------------------------
#  1️⃣  LOAD *.time files  (keep only the `time` column)
# --------------------------------------------------------------
# Matches the ``real`` line of a timing report.  The optional ``<N>m`` prefix
# and trailing ``s`` cover the minutes+seconds layout (``real 1m30.500s``)
# in addition to plain seconds (``real 90.50``).
_REAL_LINE_RE = re.compile(
    r"^\s*real\s+(?:(?P<minutes>[0-9]+)m)?(?P<seconds>[0-9]*\.?[0-9]+)s?"
)


def _parse_time_value(p: pathlib.Path) -> float:
    """
    Read a ``*.time`` file and return its ``real`` time in seconds.

    Two layouts of the ``real`` line are understood:

    * plain seconds, e.g. ``real 529.60``
    * minutes and seconds, e.g. ``real 8m49.600s`` (returned as 529.6)

    Only the first matching line is used.

    Parameters
    ----------
    p : pathlib.Path
        Path of the timing file; it is read as UTF-8 text.

    Returns
    -------
    float
        Elapsed time in seconds, or ``nan`` when the file contains no
        ``real`` line.
    """
    with p.open("r", encoding="utf-8") as f:
        for line in f:
            m = _REAL_LINE_RE.match(line)
            if m is None:
                continue
            seconds = float(m.group("seconds"))
            minutes = m.group("minutes")
            if minutes is not None:
                # Without this the minutes figure alone would be returned
                # as if it were the number of seconds.
                seconds += 60.0 * int(minutes)
            return seconds
    # If the file does not contain a `real` line
    return float("nan")


def load_time_dir(dir_path: pathlib.Path) -> pd.DataFrame:
    """
    Load all ``*.time`` files in *dir_path* into one table.

    Parameters
    ----------
    dir_path : pathlib.Path
        Directory that is searched (non-recursively) for ``*.time`` files.

    Returns
    -------
    pd.DataFrame
        One row per file with the columns

        ``case_id``   – int (or str) extracted from the file name
        ``time``      – float, the ``real`` value read from the file
        ``filename``  – original file name (for traceability)

        The three columns are present even when no file matched, so the
        result can always be merged on ``case_id``.
    """
    records: List[dict] = [
        {
            "case_id": _case_id_from_filename(p.name),
            "time": _parse_time_value(p),
            "filename": p.name,
        }
        for p in sorted(dir_path.glob("*.time"))
    ]

    # Naming the columns explicitly keeps the schema stable for an empty
    # record list; otherwise the frame would have no columns at all and a
    # later merge on ``case_id`` would fail with a KeyError.
    df = pd.DataFrame(records, columns=["case_id", "time", "filename"])

    # Sort by numeric case_id when possible – makes later merges easier.
    # (Mixed int/str ids give an object column and keep file-name order.)
    if not df.empty and pd.api.types.is_integer_dtype(df["case_id"]):
        df = df.sort_values("case_id").reset_index(drop=True)

    return df


# --------------------------------------------------------------
#  2️⃣  LOAD size‑listing files (plaintext_data.txt, hhe_data.txt)
# --------------------------------------------------------------
def _load_size_file(p: pathlib.Path) -> pd.DataFrame:
    """
    Generic loader for the two size-listing files.

    Expected format per non-blank line: ``<size_kb> <filename>``
    (whitespace-separated; the file name may itself contain spaces).

    Parameters
    ----------
    p : pathlib.Path
        Path of the listing file; it is read as UTF-8 text.

    Returns
    -------
    pd.DataFrame
        Columns ``case_id``, ``size_kb`` and ``filename``.  The columns are
        present even when the file holds no entries.

    Raises
    ------
    ValueError
        If a non-blank line does not consist of an integer size followed by
        a file name.  The message names the file and the line number.
    """
    records: List[dict] = []

    with p.open("r", encoding="utf-8") as f:
        for lineno, raw in enumerate(f, start=1):
            entry = raw.strip()
            if not entry:
                continue
            try:
                size_str, filename = entry.split(maxsplit=1)
                size_kb = int(size_str)
            except ValueError as err:
                # Same exception type as before, but with enough context
                # to locate the offending line.
                raise ValueError(
                    f"{p!s}:{lineno}: expected '<size_kb> <filename>', "
                    f"got {entry!r}"
                ) from err

            records.append(
                {
                    "case_id": _case_id_from_filename(filename),
                    "size_kb": size_kb,
                    "filename": filename,
                }
            )

    # Explicit columns keep the schema stable when no entry was read, so
    # the result can always be merged on ``case_id``.
    df = pd.DataFrame(records, columns=["case_id", "size_kb", "filename"])

    if not df.empty and pd.api.types.is_integer_dtype(df["case_id"]):
        df = df.sort_values("case_id").reset_index(drop=True)

    return df


def load_plaintext_sizes(root_dir: pathlib.Path) -> pd.DataFrame:
    """
    Read ``plaintext_data.txt`` from *root_dir* as a size table.

    Raises ``FileNotFoundError`` when the listing is not a regular file;
    otherwise the parsing is delegated to ``_load_size_file``.
    """
    listing = root_dir / "plaintext_data.txt"
    if listing.is_file():
        return _load_size_file(listing)
    raise FileNotFoundError(f"{listing!s} does not exist.")


def load_hhe_sizes(root_dir: pathlib.Path) -> pd.DataFrame:
    """
    Read ``hhe_data.txt`` from *root_dir* as a size table.

    Raises ``FileNotFoundError`` when the listing is not a regular file;
    otherwise the parsing is delegated to ``_load_size_file``.
    """
    listing = root_dir / "hhe_data.txt"
    if listing.is_file():
        return _load_size_file(listing)
    raise FileNotFoundError(f"{listing!s} does not exist.")


# --------------------------------------------------------------
#  3️⃣  CALCULATE SPEEDS (kb / second)
# --------------------------------------------------------------
def _merge_time_and_size(
    time_df: pd.DataFrame, size_df: pd.DataFrame, suffix: str
) -> pd.DataFrame:
    """
    Merge a timing DataFrame with a size DataFrame on ``case_id``.
    Adds a ``speed_kb_per_s`` column (size / time).

    Parameters
    ----------
    time_df : pd.DataFrame
        Must contain ``case_id`` and ``time``.
    size_df : pd.DataFrame
        Must contain ``case_id`` and ``size_kb``.
    suffix : str
        Used to label the two file-name columns: ``filename_<suffix>``
        (from *time_df*) and ``filename_<suffix>_size`` (from *size_df*).
        The speed column is always called ``speed_kb_per_s``.

    Returns
    -------
    pd.DataFrame
        ``case_id``, the two renamed ``filename`` columns, ``size_kb``,
        ``time`` and ``speed_kb_per_s``, restricted to the cases present in
        both inputs (inner join).  Rows whose ``time`` is zero or NaN get a
        NaN speed.
    """
    merged = pd.merge(
        time_df,
        size_df,
        on="case_id",
        how="inner",
        suffixes=("_time", "_size"),
    )

    # Protect against division by zero: mask zero durations with NaN so the
    # speed becomes NaN instead of inf.  ``where`` keeps the column numeric,
    # so later mean()/median() calls work on a float column.
    elapsed = merged["time"]
    merged["speed_kb_per_s"] = merged["size_kb"] / elapsed.where(elapsed != 0)

    # Rename columns for readability
    merged = merged.rename(
        columns={
            "filename_time": f"filename_{suffix}",
            "filename_size": f"filename_{suffix}_size",
        }
    )

    # Re-order columns (optional, but makes the table tidy); columns that
    # are absent from the inputs are simply skipped.
    cols = [
        "case_id",
        f"filename_{suffix}",
        f"filename_{suffix}_size",
        "size_kb",
        "time",
        "speed_kb_per_s",
    ]
    merged = merged[[c for c in cols if c in merged.columns]]

    return merged


def calculate_hhe_upload_speed(root_dir: pathlib.Path) -> pd.DataFrame:
    """
    Build the speed table (kb/s) for the HHE **upload** phase.

    Timings come from ``hhe_upload_time/`` under *root_dir*.  They are
    paired with the *plaintext* size listing, because the upload phase
    concerns the transfer of the plaintext files.
    """
    return _merge_time_and_size(
        load_time_dir(root_dir / "hhe_upload_time"),
        load_plaintext_sizes(root_dir),  # plaintext sizes, not HHE sizes
        suffix="hhe_upload",
    )


def calculate_hhe_evaluate_speed(root_dir: pathlib.Path) -> pd.DataFrame:
    """
    Build the speed table (kb/s) for the HHE **evaluation** phase.

    Timings come from ``hhe_evaluate_time/`` under *root_dir* and are
    paired with the sizes listed in ``hhe_data.txt``.
    """
    return _merge_time_and_size(
        load_time_dir(root_dir / "hhe_evaluate_time"),
        load_hhe_sizes(root_dir),
        suffix="hhe_evaluate",
    )


def calculate_hhe_evaluate_speed_using_plaintext(
    root_dir: pathlib.Path,
) -> pd.DataFrame:
    """
    Build the speed table (kb/s) for the HHE evaluation phase, measured
    against the **original plaintext file sizes**.

    The result expresses how many kilobytes of *original* data are
    processed per second during the evaluation step: the timings from
    ``hhe_evaluate_time/`` are paired with ``plaintext_data.txt``.
    """
    evaluate_times = load_time_dir(root_dir / "hhe_evaluate_time")
    plaintext_sizes = load_plaintext_sizes(root_dir)
    return _merge_time_and_size(
        evaluate_times,
        plaintext_sizes,
        suffix="hhe_evaluate_plaintext",
    )


def calculate_plaintext_upload_speed(root_dir: pathlib.Path) -> pd.DataFrame:
    """
    Build the speed table (kb/s) for the **plaintext upload** phase.

    Timings come from ``plaintext_upload_time/`` under *root_dir* and are
    paired with the sizes listed in ``plaintext_data.txt``.
    """
    return _merge_time_and_size(
        load_time_dir(root_dir / "plaintext_upload_time"),
        load_plaintext_sizes(root_dir),
        suffix="plaintext_upload",
    )

In [8]:
# --------------------------------------------------------------
#  4️⃣  QUICK DEMO (run after the definitions above)
# --------------------------------------------------------------
# `root` must point at the folder that directly contains the three timing
# directories (hhe_upload_time/, hhe_evaluate_time/, plaintext_upload_time/)
# and the two size listings (plaintext_data.txt, hhe_data.txt).
root = pathlib.Path("ESADA_4percent")   # <-- change if needed

# ------------------------------------------------------------------
# Load the raw tables (optional – the calculate_* functions below reload
# everything themselves; these five frames are only kept for inspection
# and are not referenced again in this cell)
# ------------------------------------------------------------------
hhe_upload_df, hhe_eval_df, plaintext_upload_df, plaintext_sizes_df, hhe_sizes_df = (
    load_time_dir(root / "hhe_upload_time"),
    load_time_dir(root / "hhe_evaluate_time"),
    load_time_dir(root / "plaintext_upload_time"),
    load_plaintext_sizes(root),
    load_hhe_sizes(root),
)

# ------------------------------------------------------------------
# Compute speeds (kb/s) – one table per phase, plus a second table for the
# evaluation phase that is based on the plaintext sizes
# ------------------------------------------------------------------
hhe_upload_speed_df               = calculate_hhe_upload_speed(root)
hhe_evaluate_speed_df             = calculate_hhe_evaluate_speed(root)
hhe_evaluate_speed_plaintext_df   = calculate_hhe_evaluate_speed_using_plaintext(root)
plaintext_upload_speed_df         = calculate_plaintext_upload_speed(root)

# ------------------------------------------------------------------
# Show the first rows of each speed table
# NOTE(review): `display` is not imported in this file – presumably the
# IPython/Jupyter built-in; confirm before running as a plain script.
# ------------------------------------------------------------------
display(hhe_upload_speed_df.head())
display(hhe_evaluate_speed_df.head())
display(hhe_evaluate_speed_plaintext_df.head())
display(plaintext_upload_speed_df.head())

# ------------------------------------------------------------------
# OPTIONAL: overall statistics (mean/median speed)
# ------------------------------------------------------------------
def speed_summary(df: pd.DataFrame, label: str) -> pd.DataFrame:
    """
    Summarise the ``speed_kb_per_s`` column of *df* in a one-row table.

    The row is indexed by *label* and holds the column's mean and median
    under ``mean_kb_per_s`` and ``median_kb_per_s``.
    """
    speeds = df["speed_kb_per_s"]
    stats = {
        "mean_kb_per_s": [speeds.mean()],
        "median_kb_per_s": [speeds.median()],
    }
    return pd.DataFrame(stats, index=[label])

# One summary row per speed table, stacked in the order listed here.
summary = pd.concat(
    [
        speed_summary(frame, row_label)
        for row_label, frame in (
            ("hhe_upload", hhe_upload_speed_df),
            ("hhe_evaluate_hhe_sizes", hhe_evaluate_speed_df),
            ("hhe_evaluate_plaintext_sizes", hhe_evaluate_speed_plaintext_df),
            ("plaintext_upload", plaintext_upload_speed_df),
        )
    ]
)

display(summary)

Unnamed: 0,case_id,filename_hhe_upload,filename_hhe_upload_size,size_kb,time,speed_kb_per_s
0,1,1_data.txt.time,1_data.txt,60,529.6,0.113293
1,2,2_data.txt.time,2_data.txt,68,598.92,0.113538
2,3,3_data.txt.time,3_data.txt,76,631.3,0.120387
3,4,4_data.txt.time,4_data.txt,76,648.43,0.117206
4,5,5_data.txt.time,5_data.txt,68,562.04,0.120988


Unnamed: 0,case_id,filename_hhe_evaluate,filename_hhe_evaluate_size,size_kb,time,speed_kb_per_s
0,1,1_0aedab90-1656-42e1-b1f9-e6f731849929.bin.time,1_0aedab90-1656-42e1-b1f9-e6f731849929.bin,119496,243.82,490.099254
1,2,2_0aedab90-1656-42e1-b1f9-e6f731849929.bin.time,2_0aedab90-1656-42e1-b1f9-e6f731849929.bin,137320,271.77,505.2802
2,3,3_0aedab90-1656-42e1-b1f9-e6f731849929.bin.time,3_0aedab90-1656-42e1-b1f9-e6f731849929.bin,148028,298.87,495.292268
3,4,4_0aedab90-1656-42e1-b1f9-e6f731849929.bin.time,4_0aedab90-1656-42e1-b1f9-e6f731849929.bin,151596,293.08,517.251262
4,5,5_0aedab90-1656-42e1-b1f9-e6f731849929.bin.time,5_0aedab90-1656-42e1-b1f9-e6f731849929.bin,131968,261.52,504.61915


Unnamed: 0,case_id,filename_hhe_evaluate_plaintext,filename_hhe_evaluate_plaintext_size,size_kb,time,speed_kb_per_s
0,1,1_0aedab90-1656-42e1-b1f9-e6f731849929.bin.time,1_data.txt,60,243.82,0.246083
1,2,2_0aedab90-1656-42e1-b1f9-e6f731849929.bin.time,2_data.txt,68,271.77,0.250212
2,3,3_0aedab90-1656-42e1-b1f9-e6f731849929.bin.time,3_data.txt,76,298.87,0.254291
3,4,4_0aedab90-1656-42e1-b1f9-e6f731849929.bin.time,4_data.txt,76,293.08,0.259315
4,5,5_0aedab90-1656-42e1-b1f9-e6f731849929.bin.time,5_data.txt,68,261.52,0.260018


Unnamed: 0,case_id,filename_plaintext_upload,filename_plaintext_upload_size,size_kb,time,speed_kb_per_s
0,1,1_data.txt.time,1_data.txt,60,0.19,315.789474
1,2,2_data.txt.time,2_data.txt,68,0.23,295.652174
2,3,3_data.txt.time,3_data.txt,76,0.21,361.904762
3,4,4_data.txt.time,4_data.txt,76,0.27,281.481481
4,5,5_data.txt.time,5_data.txt,68,0.17,400.0


Unnamed: 0,mean_kb_per_s,median_kb_per_s
hhe_upload,0.118875,0.118789
hhe_evaluate_hhe_sizes,498.663115,505.2802
hhe_evaluate_plaintext_sizes,0.257314,0.257028
plaintext_upload,291.826947,300.0
