In [7]:
# /colab_notebooks/week3_step1_bootstrap.ipynb
# Week 3 — Step 1: initialize week_3 and validate Week 2 prerequisites

from __future__ import annotations

import os
import json
from dataclasses import dataclass, asdict
from datetime import datetime
from typing import Dict

# ----------------------------- Paths ----------------------------- #
DRIVE_MOUNT_PT = "/content/drive"
PROJECT_ROOT   = f"{DRIVE_MOUNT_PT}/MyDrive/Colab Notebooks/New_cyber_project"

DATASET_ROOT = os.path.join(PROJECT_ROOT, "dataset")

W2_OUT = os.path.join(PROJECT_ROOT, "week_2", "outputs")
W3_ROOT = os.path.join(PROJECT_ROOT, "week_3")
W3_NB   = os.path.join(W3_ROOT, "notebooks")
W3_OUT  = os.path.join(W3_ROOT, "outputs")

REQUIRED_W2 = {
    "session_meta":        os.path.join(W2_OUT, "session_meta.json"),
    "eda_files":           os.path.join(W2_OUT, "eda_files.csv"),
    "eda_overview":        os.path.join(W2_OUT, "eda_overview.csv"),
    "column_profile":      os.path.join(W2_OUT, "column_profile.csv"),
    "dtype_summary":       os.path.join(W2_OUT, "dtype_summary.csv"),
    "target_availability": os.path.join(W2_OUT, "target_availability.csv"),
    "validation_report":   os.path.join(W2_OUT, "validation_report.json"),
    "week2_report_md":     os.path.join(W2_OUT, "week2_report.md"),
}

# ----------------------------- Models ---------------------------- #
@dataclass(frozen=True)
class SessionMeta:
    """Immutable description of the Week 3 session, serialized to session_meta.json."""
    project_root: str  # project root folder on the mounted drive
    dataset_root: str  # folder holding the dataset
    week_2_outputs: Dict[str, str]  # Week 2 artifact name -> file path
    week_3: Dict[str, str]  # Week 3 folder paths keyed by "root", "outputs", "notebooks"
    created_at: str  # UTC creation time as an ISO-8601 string with a trailing "Z"
    notes: str  # free-form note about this session

# ---------------------------- Helpers ---------------------------- #
def ensure_drive() -> None:
    """Mount Google Drive at ``DRIVE_MOUNT_PT`` unless something is already mounted there."""
    # Imported inside the function, as in the rest of this cell's design: the
    # module is only needed when this helper actually runs.
    from google.colab import drive  # type: ignore

    already_mounted = os.path.ismount(DRIVE_MOUNT_PT)
    if already_mounted:
        return
    drive.mount(DRIVE_MOUNT_PT)

def ensure_dirs() -> None:
    """Create the Week 3 notebooks and outputs folders (no error if they exist)."""
    for folder in (W3_NB, W3_OUT):
        os.makedirs(folder, exist_ok=True)

def validate_w2() -> Dict[str, str]:
    """Check that every Week 2 artifact listed in ``REQUIRED_W2`` exists on disk.

    Returns:
        The ``REQUIRED_W2`` mapping (artifact name -> path) when all files exist.

    Raises:
        FileNotFoundError: naming every artifact key whose file is missing.
    """
    absent = []
    for name, path in REQUIRED_W2.items():
        if os.path.isfile(path):
            continue
        absent.append(name)
    if not absent:
        return REQUIRED_W2
    raise FileNotFoundError("Week 2 prerequisites not found: " + ", ".join(absent))

def write_session_meta(resolved: Dict[str, str]) -> str:
    """Write the Week 3 session metadata file and return its path.

    Args:
        resolved: Week 2 artifact name -> path mapping to record in the file.

    Returns:
        Path of the written ``session_meta.json`` inside ``W3_OUT``.
    """
    # Local import: the cell's top-level import only brings in ``datetime``.
    from datetime import timezone

    # ``datetime.utcnow()`` is deprecated; take an aware UTC "now" and format it
    # to the same "YYYY-MM-DDTHH:MM:SSZ" text the previous code produced.
    created_at = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    meta = SessionMeta(
        project_root=PROJECT_ROOT,
        dataset_root=DATASET_ROOT,
        week_2_outputs=resolved,
        week_3={"root": W3_ROOT, "outputs": W3_OUT, "notebooks": W3_NB},
        created_at=created_at,
        notes="Week 3 initialized.",
    )
    out_path = os.path.join(W3_OUT, "session_meta.json")
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(asdict(meta), f, indent=2)
    return out_path

def main() -> None:
    """Run Step 1: mount the drive, create folders, validate Week 2, write metadata."""
    ensure_drive()
    ensure_dirs()
    meta_path = write_session_meta(validate_w2())
    for caption, value in (("week_3 root", W3_ROOT), ("session meta", meta_path)):
        print(f"{caption}: {value}")

if __name__ == "__main__":
    main()


week_3 root: /content/drive/MyDrive/Colab Notebooks/New_cyber_project/week_3
session meta: /content/drive/MyDrive/Colab Notebooks/New_cyber_project/week_3/outputs/session_meta.json


  created_at=datetime.utcnow().isoformat(timespec="seconds") + "Z",


In [8]:
# /colab_notebooks/week3_step2_manifest.ipynb
# Week 3 — Step 2: build unified dataset manifest with per-file row estimates

from __future__ import annotations

import os
import json
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

import pandas as pd

try:
    import pyarrow.parquet as pq
except Exception:
    raise ImportError("pyarrow is required for Parquet row counting. Install with: pip install pyarrow")

# ----------------------------- Paths ----------------------------- #
DRIVE_MOUNT_PT = "/content/drive"
PROJECT_ROOT   = f"{DRIVE_MOUNT_PT}/MyDrive/Colab Notebooks/New_cyber_project"

W1_OUT = os.path.join(PROJECT_ROOT, "week_1", "outputs")
W3_OUT = os.path.join(PROJECT_ROOT, "week_3", "outputs")

INVENTORY_CSV = os.path.join(W1_OUT, "data_inventory_paths.csv")
SCHEMA_JSON   = os.path.join(W1_OUT, "schema_preview.json")

MANIFEST_CSV  = os.path.join(W3_OUT, "manifest.csv")

os.makedirs(W3_OUT, exist_ok=True)

# ----------------------------- Models ---------------------------- #
@dataclass(frozen=True)
class FileEntry:
    """One row of the data inventory CSV."""
    domain: str  # value of the inventory's "domain" column
    path: str  # value of the inventory's "path" column
    extension: str  # "extension" column, lower-cased by the reader
    size_bytes: Optional[int]  # "size_bytes" as int, or None when it cannot be converted

@dataclass(frozen=True)
class Targets:
    """Target columns detected among one file's column names."""
    has_label: bool  # True when a label-like column was found
    has_type: bool  # True when a type-like column was found
    label_col: Optional[str]  # original name of the label-like column, if any
    type_col: Optional[str]  # original name of the type-like column, if any

# ----------------------------- Helpers --------------------------- #
def _ensure_inputs() -> None:
    """Raise ``FileNotFoundError`` for the first missing Week 1 input file."""
    for required in (INVENTORY_CSV, SCHEMA_JSON):
        if os.path.isfile(required):
            continue
        raise FileNotFoundError(f"Required file not found: {required}")

def _read_inventory() -> List[FileEntry]:
    """Load the inventory CSV into a list of ``FileEntry`` records.

    Raises:
        ValueError: when the inventory has no rows.
    """
    inventory = pd.read_csv(INVENTORY_CSV)
    if inventory.empty:
        raise ValueError("Inventory contains no entries.")

    def _size_or_none(raw):
        # NaN or an absent value cannot be turned into an int; report it as None.
        try:
            return int(raw)
        except Exception:
            return None

    entries: List[FileEntry] = []
    for _, row in inventory.iterrows():
        entries.append(
            FileEntry(
                domain=str(row["domain"]),
                path=str(row["path"]),
                extension=str(row["extension"]).lower(),
                size_bytes=_size_or_none(row.get("size_bytes")),
            )
        )
    return entries

def _read_schema_columns() -> Dict[str, List[str]]:
    """Map each file path in the schema preview JSON to its list of column names.

    Records without a non-empty "path" or whose "columns" is not a list are skipped.

    Returns:
        Dict of path -> column names (each coerced to ``str``).
    """
    # Context manager so the handle is closed deterministically; the previous
    # ``json.load(open(...))`` left it to the garbage collector.
    with open(SCHEMA_JSON, "r") as fh:
        data = json.load(fh)

    mapping: Dict[str, List[str]] = {}
    for rec in data:
        p = str(rec.get("path", ""))
        cols = rec.get("columns", [])
        if p and isinstance(cols, list):
            mapping[p] = [str(c) for c in cols]
    return mapping

def _infer_targets(cols: List[str]) -> Targets:
    """Detect label-like and type-like columns by case-insensitive name.

    Label candidates are tried in the order label, labels, target, class;
    type candidates in the order type, attack_cat. The original spelling of
    the matching column is what gets reported.
    """
    by_lower = {name.lower(): name for name in cols}

    def _first_present(*keys):
        for key in keys:
            hit = by_lower.get(key)
            if hit:
                return hit
        return None

    label_name = _first_present("label", "labels", "target", "class")
    type_name = _first_present("type", "attack_cat")
    return Targets(
        has_label=label_name is not None,
        has_type=type_name is not None,
        label_col=label_name,
        type_col=type_name,
    )

def _count_rows_csv(path: str, chunksize: int = 200_000) -> int:
    total = 0
    for chunk in pd.read_csv(path, usecols=None, chunksize=chunksize, low_memory=True):
        total += len(chunk)
    return total

def _count_rows_parquet(path: str) -> int:
    """Return the row count of a Parquet file by summing its row-group metadata."""
    meta = pq.ParquetFile(path).metadata
    n_rows = 0
    for group_idx in range(meta.num_row_groups):
        n_rows += meta.row_group(group_idx).num_rows
    return n_rows

def _estimate_rows(entry: FileEntry) -> int:
    """Count rows for a ``.csv`` or ``.parquet`` entry; any other extension yields 0."""
    counters = {
        ".csv": _count_rows_csv,
        ".parquet": _count_rows_parquet,
    }
    counter = counters.get(entry.extension)
    if counter is None:
        return 0
    return counter(entry.path)

def _fmt_mb(nbytes: Optional[int]) -> float:
    if nbytes is None:
        return 0.0
    return round(nbytes / (1024 * 1024), 3)

# ------------------------------ Main ----------------------------- #
def main() -> None:
    """Build ``manifest.csv``: one row per inventory file with size, rows and targets.

    Files whose row count cannot be computed still get a manifest row with
    ``est_rows`` 0, but the failure is now printed instead of being swallowed,
    so a 0 caused by an error can be told apart from a genuinely empty file.
    """
    _ensure_inputs()
    entries = _read_inventory()
    schema_cols = _read_schema_columns()

    rows = []
    failures = []  # (path, reason) for every file whose rows could not be counted
    for e in entries:
        cols = schema_cols.get(e.path, [])
        t = _infer_targets(cols)
        try:
            est_rows = _estimate_rows(e)
        except Exception as exc:
            # Best effort is kept (the manifest is still produced), but the
            # reason is remembered and reported below.
            est_rows = 0
            failures.append((e.path, f"{type(exc).__name__}: {exc}"))
        rows.append(
            {
                "domain": e.domain,
                "path": e.path,
                "extension": e.extension,
                "size_bytes": e.size_bytes if e.size_bytes is not None else "",
                "size_mb": _fmt_mb(e.size_bytes),
                "est_rows": int(est_rows),
                "has_label_col": t.has_label,
                "has_type_col": t.has_type,
                "label_col": t.label_col or "",
                "type_col": t.type_col or "",
            }
        )

    manifest = pd.DataFrame(rows).sort_values(["domain", "extension", "path"]).reset_index(drop=True)
    manifest.to_csv(MANIFEST_CSV, index=False)
    print(f"manifest: {MANIFEST_CSV}")
    # optional glance
    by_domain = (
        manifest.groupby("domain")
        .agg(files=("path", "count"),
             total_rows=("est_rows", "sum"),
             total_size_mb=("size_mb", "sum"),
             files_with_label=("has_label_col", "sum"),
             files_with_type=("has_type_col", "sum"))
        .reset_index()
    )
    print(by_domain.to_string(index=False))

    for failed_path, reason in failures:
        print(f"[warn] row count failed for {failed_path}: {reason}")

if __name__ == "__main__":
    main()


  for chunk in pd.read_csv(path, usecols=None, chunksize=chunksize, low_memory=True):
  for chunk in pd.read_csv(path, usecols=None, chunksize=chunksize, low_memory=True):
  for chunk in pd.read_csv(path, usecols=None, chunksize=chunksize, low_memory=True):
  for chunk in pd.read_csv(path, usecols=None, chunksize=chunksize, low_memory=True):
  for chunk in pd.read_csv(path, usecols=None, chunksize=chunksize, low_memory=True):
  for chunk in pd.read_csv(path, usecols=None, chunksize=chunksize, low_memory=True):
  for chunk in pd.read_csv(path, usecols=None, chunksize=chunksize, low_memory=True):


manifest: /content/drive/MyDrive/Colab Notebooks/New_cyber_project/week_3/outputs/manifest.csv
 domain  files  total_rows  total_size_mb  files_with_label  files_with_type
    IoT      7     3606134        166.995                 7                7
  Linux      6     5855329        316.272                 6                6
Network     13    12339021       1755.110                13               13
Windows      2       64342         59.484                 2                2


  for chunk in pd.read_csv(path, usecols=None, chunksize=chunksize, low_memory=True):


In [3]:
# /colab_notebooks/week3_step3_feature_plan.ipynb
# Week 3 — Step 3: derive common feature set + write config

from __future__ import annotations

import os
import json
from dataclasses import dataclass, asdict
from typing import Dict, List, Set

import pandas as pd

# ----------------------------- Paths ----------------------------- #
DRIVE_MOUNT_PT = "/content/drive"
PROJECT_ROOT   = f"{DRIVE_MOUNT_PT}/MyDrive/Colab Notebooks/New_cyber_project"

W1_OUT = os.path.join(PROJECT_ROOT, "week_1", "outputs")
W2_OUT = os.path.join(PROJECT_ROOT, "week_2", "outputs")
W3_OUT = os.path.join(PROJECT_ROOT, "week_3", "outputs")

SCHEMA_JSON = os.path.join(W1_OUT, "schema_preview.json")
COLPROF_CSV = os.path.join(W2_OUT, "column_profile.csv")

FEATURES_CSV = os.path.join(W3_OUT, "features_common.csv")
CONFIG_JSON  = os.path.join(W3_OUT, "config.json")

os.makedirs(W3_OUT, exist_ok=True)

# ----------------------------- Params ---------------------------- #
COVERAGE_THRESHOLD = 80.0   # % of files in a domain that must contain the column
RANDOM_SEED = 42
MAX_SAMPLE_ROWS = 10_000

# ----------------------------- Models ---------------------------- #
@dataclass(frozen=True)
class FeaturePlan:
    """Settings written to config.json by this step."""
    coverage_threshold: float  # coverage percentage a column must reach (COVERAGE_THRESHOLD)
    random_seed: int  # seed value recorded in the config (RANDOM_SEED)
    max_sample_rows: int  # row cap recorded in the config (MAX_SAMPLE_ROWS)
    targets: Dict[str, List[str]]  # {"candidate_targets": [...]} from the schema preview
    drop_patterns: List[str]  # regular expressions for column names to drop
    features_common_path: str  # path of the features_common.csv written by this step

# ---------------------------- Helpers ---------------------------- #
def _ensure_inputs() -> None:
    """Raise ``FileNotFoundError`` for the first missing input (schema, then column profile)."""
    for required in (SCHEMA_JSON, COLPROF_CSV):
        if os.path.isfile(required):
            continue
        raise FileNotFoundError(f"Required file not found: {required}")

def _infer_targets_from_schema() -> Dict[str, List[str]]:
    """Collect known target column names from the schema preview JSON.

    Every record's "candidate_targets" is gathered; only names whose lowercase
    form is one of label, labels, target, class, type, attack_cat are kept, in
    their original spelling.

    Returns:
        ``{"candidate_targets": [...]}`` with the kept names de-duplicated and
        sorted case-insensitively.
    """
    # Context manager so the handle is closed deterministically; the previous
    # ``json.load(open(...))`` left it to the garbage collector.
    with open(SCHEMA_JSON, "r") as fh:
        data = json.load(fh)

    # Collect all candidate target names encountered
    all_targets: Set[str] = set()
    for rec in data:
        for c in rec.get("candidate_targets", []) or []:
            all_targets.add(str(c))

    # Normalize common spellings; keep original names as options
    low = {t.lower(): t for t in all_targets}
    normalized = [
        low[key]
        for key in ("label", "labels", "target", "class", "type", "attack_cat")
        if key in low
    ]
    # Ensure deduplicated, stable order
    targets = sorted(set(normalized), key=str.lower)
    return {"candidate_targets": targets}

def _compute_common_features(colprof: pd.DataFrame, thr: float) -> pd.DataFrame:
    # Keep columns that are well-covered within each domain
    ok = colprof[colprof["pct_coverage"] >= thr]
    # Intersect column names across all domains
    by_domain = {d: set(df["column"].astype(str)) for d, df in ok.groupby("domain")}
    domains = sorted(by_domain.keys())
    common = set.intersection(*(by_domain[d] for d in domains)) if domains else set()
    # Build a tidy table with per-domain coverage for the common set
    subset = colprof[colprof["column"].isin(common)].copy()
    pivot = subset.pivot_table(index="column", columns="domain", values="pct_coverage", aggfunc="max")
    pivot = pivot.reset_index().sort_values("column").reset_index(drop=True)
    return pivot

# ------------------------------ Main ----------------------------- #
def main() -> None:
    """Run Step 3: write the common-feature table, then the config JSON.

    Raises:
        ValueError: when the column profile CSV has no rows.
    """
    _ensure_inputs()

    profile = pd.read_csv(COLPROF_CSV)
    if profile.empty:
        raise ValueError("column_profile.csv is empty.")

    # The feature table is written before the schema is read for targets.
    common_tbl = _compute_common_features(profile, COVERAGE_THRESHOLD)
    common_tbl.to_csv(FEATURES_CSV, index=False)

    candidate_targets = _infer_targets_from_schema()

    # Column-name regexes recorded in the config for later steps.
    # NOTE(review): ".*ip$" matches any name ending in "ip", not only IP
    # address columns - confirm that is intended.
    name_patterns = [
        r"^ts$",
        r"^date$",
        r"^time$",
        r"^timestamp$",
        r"(^|_)id$",
        r".*ip$",
        r".*uri$",
        r".*user[_-]?agent.*",
        r"^dns_.*",
    ]

    plan = FeaturePlan(
        coverage_threshold=COVERAGE_THRESHOLD,
        random_seed=RANDOM_SEED,
        max_sample_rows=MAX_SAMPLE_ROWS,
        targets=candidate_targets,
        drop_patterns=name_patterns,
        features_common_path=FEATURES_CSV,
    )
    with open(CONFIG_JSON, "w") as cfg_file:
        json.dump(asdict(plan), cfg_file, indent=2)

    for caption, written in (
        ("features_common:", FEATURES_CSV),
        ("config:         ", CONFIG_JSON),
    ):
        print(f"{caption} {written}")

if __name__ == "__main__":
    main()


features_common: /content/drive/MyDrive/Colab Notebooks/New_cyber_project/week_3/outputs/features_common.csv
config:          /content/drive/MyDrive/Colab Notebooks/New_cyber_project/week_3/outputs/config.json


In [4]:
# /colab_notebooks/week3_step4_materialize_training_data.ipynb
# Week 3 — Step 4: build per-domain & ALL training-ready Parquet from common features

from __future__ import annotations

import os
import re
import json
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

import pandas as pd

try:
    import pyarrow as pa
    import pyarrow.parquet as pq
except Exception:
    raise ImportError("pyarrow is required. Install with: pip install pyarrow")

# ----------------------------- Paths ----------------------------- #

DRIVE_MOUNT_PT = "/content/drive"
PROJECT_ROOT   = f"{DRIVE_MOUNT_PT}/MyDrive/Colab Notebooks/New_cyber_project"

W1_OUT = os.path.join(PROJECT_ROOT, "week_1", "outputs")
W2_OUT = os.path.join(PROJECT_ROOT, "week_2", "outputs")
W3_OUT = os.path.join(PROJECT_ROOT, "week_3", "outputs")

INVENTORY_CSV = os.path.join(W1_OUT, "data_inventory_paths.csv")
SCHEMA_JSON   = os.path.join(W1_OUT, "schema_preview.json")
FEATURES_CSV  = os.path.join(W3_OUT, "features_common.csv")
CONFIG_JSON   = os.path.join(W3_OUT, "config.json")

OUT_DIR       = os.path.join(W3_OUT, "training_data")
os.makedirs(OUT_DIR, exist_ok=True)

# ----------------------------- Params ---------------------------- #

CSV_CHUNKSIZE = 200_000  # controlled memory for large CSVs

# ----------------------------- Models ---------------------------- #

@dataclass(frozen=True)
class FileEntry:
    """One row of the data inventory CSV."""
    domain: str  # value of the inventory's "domain" column
    path: str  # value of the inventory's "path" column
    extension: str  # "extension" column, lower-cased by the reader

@dataclass(frozen=True)
class TargetCols:
    """Source column names of the targets found in one file."""
    label_src: Optional[str]  # original name of the label-like column, or None
    type_src: Optional[str]  # original name of the type-like column, or None

# ----------------------------- Helpers --------------------------- #

def _ensure_inputs() -> None:
    """Raise ``FileNotFoundError`` for the first required input that is missing."""
    required = (INVENTORY_CSV, SCHEMA_JSON, FEATURES_CSV, CONFIG_JSON)
    first_missing = next((p for p in required if not os.path.isfile(p)), None)
    if first_missing is not None:
        raise FileNotFoundError(f"Required file not found: {first_missing}")

def _read_inventory() -> List[FileEntry]:
    """Load the inventory CSV into ``FileEntry`` records (extension lower-cased).

    Raises:
        ValueError: when the inventory has no rows.
    """
    inventory = pd.read_csv(INVENTORY_CSV)
    if inventory.empty:
        raise ValueError("Inventory contains no entries.")

    entries: List[FileEntry] = []
    for _, row in inventory.iterrows():
        entries.append(
            FileEntry(
                domain=str(row["domain"]),
                path=str(row["path"]),
                extension=str(row["extension"]).lower(),
            )
        )
    return entries

def _read_schema_map() -> Dict[str, List[str]]:
    """Map each file path in the schema preview JSON to its list of column names.

    Records without a non-empty "path" or whose "columns" is not a list are skipped.

    Returns:
        Dict of path -> column names (each coerced to ``str``).
    """
    # Context manager so the handle is closed deterministically; the previous
    # ``json.load(open(...))`` left it to the garbage collector.
    with open(SCHEMA_JSON, "r") as fh:
        data = json.load(fh)

    m: Dict[str, List[str]] = {}
    for rec in data:
        p = str(rec.get("path", ""))
        cols = rec.get("columns", [])
        if p and isinstance(cols, list):
            m[p] = [str(c) for c in cols]
    return m

def _read_config() -> Dict:
    """Load and return the parsed contents of the config JSON file."""
    # Context manager so the handle is closed deterministically; the previous
    # ``json.load(open(...))`` left it to the garbage collector.
    with open(CONFIG_JSON, "r") as fh:
        return json.load(fh)

def _load_common_features() -> List[str]:
    """Return the values of the "column" field of features_common.csv as strings.

    Raises:
        ValueError: when the CSV has no "column" field.
    """
    features = pd.read_csv(FEATURES_CSV)
    has_column_field = "column" in features.columns
    if not has_column_field:
        # expects a "column" field (from Week 3 Step 3)
        raise ValueError("features_common.csv must contain a 'column' field.")
    return list(map(str, features["column"].tolist()))

def _compile_drops(patterns: List[str]) -> List[re.Pattern]:
    return [re.compile(p, re.I) for p in patterns]

def _should_drop(name: str, drops: List[re.Pattern]) -> bool:
    for pat in drops:
        if pat.search(name):
            return True
    return False

def _find_targets(cols: List[str]) -> TargetCols:
    """Locate label-like and type-like columns by case-insensitive name.

    Label candidates are tried in the order label, labels, target, class;
    type candidates in the order type, attack_cat. The original spelling of
    the matching column is returned.
    """
    by_lower = {name.lower(): name for name in cols}

    def _first_present(*keys):
        for key in keys:
            hit = by_lower.get(key)
            if hit:
                return hit
        return None

    return TargetCols(
        label_src=_first_present("label", "labels", "target", "class"),
        type_src=_first_present("type", "attack_cat"),
    )

def _normalize_targets(df: pd.DataFrame, t: TargetCols) -> pd.DataFrame:
    # Rename present target columns to standardized 'label' / 'type'
    if t.label_src and t.label_src in df.columns and "label" not in df.columns:
        df = df.rename(columns={t.label_src: "label"})
    if t.type_src and t.type_src in df.columns and "type" not in df.columns:
        df = df.rename(columns={t.type_src: "type"})
    return df

def _ensure_columns(df: pd.DataFrame, ordered_cols: List[str]) -> pd.DataFrame:
    # Add missing columns as NA, then reorder
    missing = [c for c in ordered_cols if c not in df.columns]
    for c in missing:
        df[c] = pd.NA
    return df[ordered_cols]

def _to_arrow_table(df: pd.DataFrame) -> pa.Table:
    """Convert a pandas frame to an Arrow table without carrying the index over.

    No schema is passed, so the Arrow schema is inferred from ``df`` on each
    call; this function alone does not keep it identical between batches.
    """
    arrow_tbl = pa.Table.from_pandas(df, preserve_index=False)
    return arrow_tbl

# ----------------------------- Core I/O --------------------------- #

class DomainWriters:
    """Manages per-domain Parquet writers and a combined 'ALL' writer.

    A writer is created lazily on the first batch it receives and its schema
    is taken from that batch. Every later batch is cast to that schema before
    being written, so a batch whose inferred Arrow types differ (for example
    an all-NA column) is converted where Arrow allows it rather than rejected
    as a schema mismatch.

    NOTE(review): if the first batch of a writer has an all-NA column, the
    schema inferred for it may not accept real values from later batches -
    confirm against the data, or pin an explicit schema.
    """
    def __init__(self, out_dir: str, desired_cols: List[str]) -> None:
        self.out_dir = out_dir
        self.desired_cols = desired_cols
        self.domain_writers: Dict[str, pq.ParquetWriter] = {}
        self.all_writer: Optional[pq.ParquetWriter] = None
        self.schemas: Dict[str, pa.Schema] = {}
        self.all_schema: Optional[pa.Schema] = None
        self.rows_written: Dict[str, int] = {}
        self.files_processed: Dict[str, int] = {}

    def _path_for(self, domain: str) -> str:
        """Output path of the Parquet file for one domain."""
        return os.path.join(self.out_dir, f"train_ready_{domain}.parquet")

    def _all_path(self) -> str:
        """Output path of the combined Parquet file."""
        return os.path.join(self.out_dir, "train_ready_ALL.parquet")

    @staticmethod
    def _conform(tbl: pa.Table, schema: pa.Schema) -> pa.Table:
        """Return ``tbl`` cast to ``schema``; unchanged when the schemas already match."""
        if tbl.schema.equals(schema):
            return tbl
        return tbl.cast(schema)

    def write(self, domain: str, batch_df: pd.DataFrame) -> None:
        """Append one batch to the domain's file and to the combined file.

        Args:
            domain: domain the batch belongs to; also the value of the 'domain'
                column added for the combined file when the batch has none.
            batch_df: rows to append.
        """
        # Domain writer
        tbl = _to_arrow_table(batch_df)
        if domain not in self.domain_writers:
            self.schemas[domain] = tbl.schema
            self.domain_writers[domain] = pq.ParquetWriter(
                self._path_for(domain), self.schemas[domain], compression="snappy"
            )
            self.files_processed[domain] = 0
        self.domain_writers[domain].write_table(self._conform(tbl, self.schemas[domain]))
        # Count rows only once they have actually reached the domain file.
        self.rows_written[domain] = self.rows_written.get(domain, 0) + len(batch_df)

        # ALL writer (add domain col)
        if "domain" not in batch_df.columns:
            batch_df = batch_df.assign(domain=domain)
        tbl_all = _to_arrow_table(batch_df)

        if self.all_writer is None:
            self.all_schema = tbl_all.schema
            self.all_writer = pq.ParquetWriter(self._all_path(), self.all_schema, compression="snappy")
        self.all_writer.write_table(self._conform(tbl_all, self.all_schema))

    def bump_file_counter(self, domain: str) -> None:
        """Increase the count of source files processed for ``domain`` by one."""
        self.files_processed[domain] = self.files_processed.get(domain, 0) + 1

    def close(self) -> None:
        """Close every open writer.

        All writers are attempted even if one fails to close; the first
        failure is re-raised afterwards.
        """
        first_error: Optional[BaseException] = None
        for w in [*self.domain_writers.values(), self.all_writer]:
            if w is None:
                continue
            try:
                w.close()
            except Exception as exc:
                if first_error is None:
                    first_error = exc
        if first_error is not None:
            raise first_error

# ------------------------------ Main ----------------------------- #

def main() -> None:
    """Materialize per-domain and combined training Parquet files.

    For every inventory file the common, non-dropped columns plus any target
    columns are read in chunks (CSV) or row groups (Parquet), targets are
    renamed to 'label'/'type', missing columns are added, and the batch is
    handed to the writers. A build report (one row per source file, including
    an ``error`` field for files that failed) and a per-domain summary are
    written next to the Parquet files.
    """
    _ensure_inputs()

    cfg = _read_config()
    common_cols = _load_common_features()
    drops = _compile_drops(cfg.get("drop_patterns", []))

    inv = _read_inventory()
    schema_map = _read_schema_map()

    # Desired final columns: common features + standardized targets ('label','type')
    desired_cols = [c for c in common_cols if not _should_drop(c, drops)]
    desired_cols += ["label", "type"]  # may be absent in some files; we add as NA
    desired_cols = list(dict.fromkeys(desired_cols))  # stable dedupe

    writers = DomainWriters(OUT_DIR, desired_cols)
    report_rows: List[Dict[str, object]] = []

    # try/finally so the Parquet writers are closed even if something
    # unexpected escapes the per-file handling below.
    try:
        for e in inv:
            cols = schema_map.get(e.path, [])
            t = _find_targets(cols)
            has_any = bool(t.label_src or t.type_src)

            # Determine read columns for this file: common∩file + present targets
            file_cols = set(cols)
            read_cols = [c for c in common_cols if c in file_cols and not _should_drop(c, drops)]
            if t.label_src and t.label_src in file_cols:
                read_cols.append(t.label_src)
            if t.type_src and t.type_src in file_cols:
                read_cols.append(t.type_src)
            read_cols = list(dict.fromkeys(read_cols))

            total_rows = 0
            written_rows = 0
            error = ""  # stays empty when the file was processed without an exception

            try:
                if e.extension == ".csv":
                    if read_cols:
                        for chunk in pd.read_csv(e.path, usecols=read_cols, chunksize=CSV_CHUNKSIZE, low_memory=True):
                            total_rows += len(chunk)
                            chunk = _normalize_targets(chunk, t)
                            chunk = _ensure_columns(chunk, writers.desired_cols)
                            writers.write(e.domain, chunk)
                            written_rows += len(chunk)
                    else:
                        # no selected columns; still count rows
                        for chunk in pd.read_csv(e.path, usecols=None, chunksize=CSV_CHUNKSIZE, low_memory=True):
                            total_rows += len(chunk)
                elif e.extension == ".parquet":
                    if read_cols:
                        pf = pq.ParquetFile(e.path)
                        for rg in range(pf.metadata.num_row_groups):
                            tbl = pf.read_row_group(rg, columns=read_cols)
                            df = tbl.to_pandas()
                            total_rows += len(df)
                            df = _normalize_targets(df, t)
                            df = _ensure_columns(df, writers.desired_cols)
                            writers.write(e.domain, df)
                            written_rows += len(df)
                    else:
                        pf = pq.ParquetFile(e.path)
                        total_rows = sum(pf.metadata.row_group(i).num_rows for i in range(pf.metadata.num_row_groups))
            except Exception as exc:
                # Keep going with the next file, but record why this one
                # stopped; the counts reflect what was handled before the error.
                error = f"{type(exc).__name__}: {exc}"

            if written_rows > 0:
                writers.bump_file_counter(e.domain)

            report_rows.append({
                "domain": e.domain,
                "path": e.path,
                "extension": e.extension,
                "had_any_target_col": has_any,
                "selected_cols": ",".join(read_cols),
                "rows_total_seen": int(total_rows),
                "rows_written": int(written_rows),
                "error": error,
            })
    finally:
        writers.close()

    # Write a compact build report
    build_report = pd.DataFrame(report_rows).sort_values(["domain", "rows_written"], ascending=[True, False])
    build_report.to_csv(os.path.join(OUT_DIR, "build_report.csv"), index=False)

    # Domain summary
    dom_summary = (
        build_report.groupby("domain")
        .agg(files_processed=("path", "count"),
             files_with_rows=("rows_written", lambda s: int((s > 0).sum())),
             rows_total_seen=("rows_total_seen", "sum"),
             rows_written=("rows_written", "sum"))
        .reset_index()
    )
    dom_summary.to_csv(os.path.join(OUT_DIR, "build_summary.csv"), index=False)

    print(os.path.join(OUT_DIR, "train_ready_ALL.parquet"))
    for dom in sorted(dom_summary["domain"].unique()):
        print(os.path.join(OUT_DIR, f"train_ready_{dom}.parquet"))
    print(os.path.join(OUT_DIR, "build_summary.csv"))

    n_failed = sum(1 for r in report_rows if r["error"])
    if n_failed:
        print(f"[warn] {n_failed} file(s) raised an error; see the 'error' column in build_report.csv")

if __name__ == "__main__":
    main()


/content/drive/MyDrive/Colab Notebooks/New_cyber_project/week_3/outputs/training_data/train_ready_ALL.parquet
/content/drive/MyDrive/Colab Notebooks/New_cyber_project/week_3/outputs/training_data/train_ready_IoT.parquet
/content/drive/MyDrive/Colab Notebooks/New_cyber_project/week_3/outputs/training_data/train_ready_Linux.parquet
/content/drive/MyDrive/Colab Notebooks/New_cyber_project/week_3/outputs/training_data/train_ready_Network.parquet
/content/drive/MyDrive/Colab Notebooks/New_cyber_project/week_3/outputs/training_data/train_ready_Windows.parquet
/content/drive/MyDrive/Colab Notebooks/New_cyber_project/week_3/outputs/training_data/build_summary.csv


In [5]:
# /colab_notebooks/week3_step5_validate_training_data.ipynb
# Week 3 — Step 5: validate materialized Parquet datasets

from __future__ import annotations

import os
import json
from dataclasses import dataclass
from typing import Dict, List, Tuple

import pandas as pd
import pyarrow.parquet as pq

# ----------------------------- Paths ----------------------------- #
DRIVE_MOUNT_PT = "/content/drive"
PROJECT_ROOT   = f"{DRIVE_MOUNT_PT}/MyDrive/Colab Notebooks/New_cyber_project"

W3_OUT        = os.path.join(PROJECT_ROOT, "week_3", "outputs")
TRAIN_DIR     = os.path.join(W3_OUT, "training_data")
FEATURES_CSV  = os.path.join(W3_OUT, "features_common.csv")
CONFIG_JSON   = os.path.join(W3_OUT, "config.json")

SUMMARY_CSV   = os.path.join(TRAIN_DIR, "validation_summary.csv")
DETAILS_CSV   = os.path.join(TRAIN_DIR, "validation_details.csv")

# ---------------------------- Helpers ---------------------------- #
def _ensure_inputs() -> None:
    """Verify the feature list, the config and the combined Parquet file exist.

    Raises:
        FileNotFoundError: listing every required artifact that is missing.
    """
    required = [
        FEATURES_CSV,
        CONFIG_JSON,
        os.path.join(TRAIN_DIR, "train_ready_ALL.parquet"),
    ]
    missing = [path for path in required if not os.path.isfile(path)]
    if missing:
        raise FileNotFoundError("Missing required artifact(s): " + ", ".join(missing))

def _load_expected_columns() -> List[str]:
    """Return the columns the materialized Parquet files are expected to contain.

    The list is built the same way the materialization step builds its output
    columns: the "column" values of features_common.csv, minus names matching
    any ``drop_patterns`` regex from the config (case-insensitive), plus the
    standardized targets 'label' and 'type', de-duplicated in order.

    Raises:
        ValueError: when features_common.csv has no "column" field.
    """
    import re  # local import: this cell does not import ``re`` at the top

    feats = pd.read_csv(FEATURES_CSV)
    if "column" not in feats.columns:
        raise ValueError("features_common.csv must have a 'column' column.")

    with open(CONFIG_JSON, "r") as fh:
        cfg = json.load(fh)
    drops = [re.compile(p, re.I) for p in cfg.get("drop_patterns", [])]

    names = feats["column"].astype(str).tolist()
    # Dropped columns are never written, so they must not be reported as missing.
    kept = [c for c in names if not any(d.search(c) for d in drops)]
    # standardized targets if present; dict.fromkeys keeps first occurrences only
    return list(dict.fromkeys(kept + ["label", "type"]))

def _list_parquet_targets() -> Dict[str, str]:
    """Map dataset name -> Parquet path for the files to validate.

    Per-domain files are included only when they exist on disk; the combined
    "ALL" entry is always included, last.
    """
    candidates = {
        dom: os.path.join(TRAIN_DIR, f"train_ready_{dom}.parquet")
        for dom in ("Windows", "Linux", "Network", "IoT")
    }
    found = {dom: path for dom, path in candidates.items() if os.path.isfile(path)}
    found["ALL"] = os.path.join(TRAIN_DIR, "train_ready_ALL.parquet")
    return found

def _pq_schema_cols(path: str) -> List[str]:
    """Return the column names of a Parquet file, in schema order."""
    arrow_schema = pq.ParquetFile(path).schema_arrow
    return list(arrow_schema.names)

def _pq_rowcount(path: str) -> int:
    """Return the row count of a Parquet file by summing its row-group metadata."""
    meta = pq.ParquetFile(path).metadata
    n_rows = 0
    for group_idx in range(meta.num_row_groups):
        n_rows += meta.row_group(group_idx).num_rows
    return n_rows

def _value_counts_parquet(path: str, col: str, max_unique: int = 15) -> Tuple[int, Dict[str, int]]:
    """Count non-null values of one column of a Parquet file, one row group at a time.

    Args:
        path: Parquet file to read.
        col: column to count.
        max_unique: number of most frequent values to keep.

    Returns:
        ``(non_null_total, counts)`` where ``counts`` maps the string form of
        each of the ``max_unique`` most frequent values to its count. When the
        file has no such column the result is ``(0, {})``.
    """
    pf = pq.ParquetFile(path)
    # The schema does not change between row groups, so check it once.
    if col not in pf.schema_arrow.names:
        return 0, {}

    counts: Dict[str, int] = {}
    total = 0
    for i in range(pf.metadata.num_row_groups):
        s = pf.read_row_group(i, columns=[col]).to_pandas()[col]
        # int(): keep the total a plain Python int, as the signature promises,
        # instead of letting it become a numpy integer.
        total += int(s.notna().sum())
        vc = s.dropna().astype(str).value_counts()
        for k, v in vc.items():
            counts[k] = counts.get(k, 0) + int(v)
    # keep top max_unique
    counts = dict(sorted(counts.items(), key=lambda kv: kv[1], reverse=True)[:max_unique])
    return total, counts

# ------------------------------ Main ----------------------------- #
def main() -> None:
    """Validate the materialized Parquet files and write two CSV reports.

    For each dataset found, records its row and column counts, which expected
    columns are missing, which columns are unexpected, and the non-null counts
    and most frequent values of 'label' and 'type'.
    """
    _ensure_inputs()
    expected_cols = _load_expected_columns()
    # Columns that do not count as unexpected: the expected ones plus 'domain',
    # which may appear in ALL. Built once as a set instead of per column checked.
    allowed_cols = set(expected_cols) | {"domain"}

    files = _list_parquet_targets()
    rows_summary = []
    rows_details = []

    for dom, path in files.items():
        if not os.path.isfile(path):
            continue

        cols = _pq_schema_cols(path)
        nrows = _pq_rowcount(path)

        missing_expected = [c for c in expected_cols if c not in cols]
        unexpected = [c for c in cols if c not in allowed_cols]

        has_label = "label" in cols
        has_type  = "type" in cols

        label_nonnull = type_nonnull = 0
        label_vc = {}
        type_vc = {}

        if has_label:
            label_nonnull, label_vc = _value_counts_parquet(path, "label")
        if has_type:
            type_nonnull, type_vc = _value_counts_parquet(path, "type")

        rows_summary.append({
            "dataset": dom,
            "path": path,
            "rows": nrows,
            "ncols": len(cols),
            "has_label": has_label,
            "has_type": has_type,
            "label_nonnull": label_nonnull,
            "type_nonnull": type_nonnull,
            "missing_expected_cols": len(missing_expected),
            "unexpected_cols": len(unexpected),
        })

        rows_details.append({
            "dataset": dom,
            "path": path,
            "columns": ",".join(cols),
            "missing_expected_cols": ",".join(missing_expected),
            "unexpected_cols": ",".join(unexpected),
            "label_top_values": json.dumps(label_vc),
            "type_top_values": json.dumps(type_vc),
        })

    pd.DataFrame(rows_summary).sort_values("dataset").to_csv(SUMMARY_CSV, index=False)
    pd.DataFrame(rows_details).sort_values("dataset").to_csv(DETAILS_CSV, index=False)

    print(f"validation_summary: {SUMMARY_CSV}")
    print(f"validation_details: {DETAILS_CSV}")

if __name__ == "__main__":
    main()
# /colab_notebooks/week3_step5_validate_training_data.ipynb
# Week 3 — Step 5: validate materialized Parquet datasets

from __future__ import annotations

import os
import json
from dataclasses import dataclass
from typing import Dict, List, Tuple

import pandas as pd
import pyarrow.parquet as pq

# ----------------------------- Paths ----------------------------- #
DRIVE_MOUNT_PT = "/content/drive"
PROJECT_ROOT   = f"{DRIVE_MOUNT_PT}/MyDrive/Colab Notebooks/New_cyber_project"

W3_OUT        = os.path.join(PROJECT_ROOT, "week_3", "outputs")
TRAIN_DIR     = os.path.join(W3_OUT, "training_data")
FEATURES_CSV  = os.path.join(W3_OUT, "features_common.csv")
CONFIG_JSON   = os.path.join(W3_OUT, "config.json")

SUMMARY_CSV   = os.path.join(TRAIN_DIR, "validation_summary.csv")
DETAILS_CSV   = os.path.join(TRAIN_DIR, "validation_details.csv")

# ---------------------------- Helpers ---------------------------- #
def _ensure_inputs() -> None:
    """Verify the feature list, the config and the combined Parquet file exist.

    Raises:
        FileNotFoundError: listing every required artifact that is missing.
    """
    required = [
        FEATURES_CSV,
        CONFIG_JSON,
        os.path.join(TRAIN_DIR, "train_ready_ALL.parquet"),
    ]
    missing = [path for path in required if not os.path.isfile(path)]
    if missing:
        raise FileNotFoundError("Missing required artifact(s): " + ", ".join(missing))

def _load_expected_columns() -> List[str]:
    """Return the ordered, duplicate-free list of columns every dataset should have.

    The list is the ``column`` field of FEATURES_CSV (first occurrence wins)
    followed by the standardized targets ``label`` and ``type``.

    Returns:
        Column names as strings, each appearing exactly once.

    Raises:
        ValueError: if FEATURES_CSV has no ``column`` header.
    """
    feats = pd.read_csv(FEATURES_CSV)
    if "column" not in feats.columns:
        raise ValueError("features_common.csv must have a 'column' column.")
    # Empty cells are parsed as NaN; drop them so astype(str) does not create
    # a phantom expected column literally named "nan".
    names = feats["column"].dropna().astype(str).tolist()
    # De-duplicate AFTER adding the targets: if the features file already lists
    # 'label' or 'type', appending them again would double-count them as missing.
    return list(dict.fromkeys(names + ["label", "type"]))

def _list_parquet_targets() -> Dict[str, str]:
    """Map dataset names to the Parquet files that should be validated.

    Per-domain files (``train_ready_<domain>.parquet`` in TRAIN_DIR) are
    included only when they exist on disk. The combined ``ALL`` entry is
    always included, whether or not the file exists.

    Returns:
        Dict of dataset name -> absolute file path, domains first, ``ALL`` last.
    """
    candidates = {
        name: os.path.join(TRAIN_DIR, f"train_ready_{name}.parquet")
        for name in ("Windows", "Linux", "Network", "IoT")
    }
    selected = {name: loc for name, loc in candidates.items() if os.path.isfile(loc)}
    selected["ALL"] = os.path.join(TRAIN_DIR, "train_ready_ALL.parquet")
    return selected

def _pq_schema_cols(path: str) -> List[str]:
    """Return the column names of the Parquet file at ``path``, in schema order.

    Only the file's Arrow schema is consulted; no row data is read here.
    """
    arrow_schema = pq.ParquetFile(path).schema_arrow
    return list(arrow_schema.names)

def _pq_rowcount(path: str) -> int:
    """Return the total row count of the Parquet file at ``path``.

    The count is obtained by summing ``num_rows`` over the row groups listed
    in the file metadata.
    """
    meta = pq.ParquetFile(path).metadata
    n_rows = 0
    for group_idx in range(meta.num_row_groups):
        n_rows += meta.row_group(group_idx).num_rows
    return n_rows

def _value_counts_parquet(path: str, col: str, max_unique: int = 15) -> Tuple[int, Dict[str, int]]:
    """Tally one column of a Parquet file, one row group at a time.

    Args:
        path: Parquet file to read.
        col: Column to tally.
        max_unique: Maximum number of distinct values kept in the result.

    Returns:
        ``(non_null_count, counts)`` where ``counts`` maps each value
        (converted to ``str``) to its frequency, limited to the
        ``max_unique`` most frequent values in descending order. Returns
        ``(0, {})`` when ``col`` is not in the file's schema.
    """
    pf = pq.ParquetFile(path)
    # The schema is file-wide, so the membership test is done once up front
    # rather than being repeated for every row group.
    if col not in pf.schema_arrow.names:
        return 0, {}

    counts: Dict[str, int] = {}
    total = 0
    for i in range(pf.metadata.num_row_groups):
        s = pf.read_row_group(i, columns=[col]).to_pandas()[col]
        # Series.sum() yields a NumPy integer; cast so the function really
        # returns the plain int its signature promises.
        total += int(s.notna().sum())
        for k, v in s.dropna().astype(str).value_counts().items():
            counts[k] = counts.get(k, 0) + int(v)
    # keep top max_unique
    top = dict(sorted(counts.items(), key=lambda kv: kv[1], reverse=True)[:max_unique])
    return total, top

# ------------------------------ Main ----------------------------- #
def main() -> None:
    """Validate the materialized Parquet datasets and write two CSV reports.

    For each file returned by ``_list_parquet_targets`` that exists on disk,
    the schema is compared against the expected column list, rows are counted,
    and the ``label`` / ``type`` columns are tallied when present. One row per
    dataset is written to SUMMARY_CSV (counts and flags) and to DETAILS_CSV
    (column names and JSON-encoded top values), both sorted by dataset name.
    The two report paths are printed at the end.

    Raises:
        FileNotFoundError: propagated from ``_ensure_inputs``.
        ValueError: propagated from ``_load_expected_columns``.
    """
    _ensure_inputs()
    expected_cols = _load_expected_columns()
    # 'domain' may appear in ALL. Build the allow-set once instead of
    # re-concatenating a list for every column of every file.
    allowed = set(expected_cols) | {"domain"}

    files = _list_parquet_targets()
    rows_summary = []
    rows_details = []

    for dom, path in files.items():
        if not os.path.isfile(path):
            continue

        cols = _pq_schema_cols(path)
        present = set(cols)
        nrows = _pq_rowcount(path)

        # Lists keep the original ordering for the report; sets are only used
        # for the membership tests.
        missing_expected = [c for c in expected_cols if c not in present]
        unexpected = [c for c in cols if c not in allowed]

        has_label = "label" in present
        has_type  = "type" in present

        label_nonnull = type_nonnull = 0
        label_vc = {}
        type_vc = {}

        if has_label:
            label_nonnull, label_vc = _value_counts_parquet(path, "label")
        if has_type:
            type_nonnull, type_vc = _value_counts_parquet(path, "type")

        rows_summary.append({
            "dataset": dom,
            "path": path,
            "rows": nrows,
            "ncols": len(cols),
            "has_label": has_label,
            "has_type": has_type,
            "label_nonnull": label_nonnull,
            "type_nonnull": type_nonnull,
            "missing_expected_cols": len(missing_expected),
            "unexpected_cols": len(unexpected),
        })

        rows_details.append({
            "dataset": dom,
            "path": path,
            "columns": ",".join(cols),
            "missing_expected_cols": ",".join(missing_expected),
            "unexpected_cols": ",".join(unexpected),
            "label_top_values": json.dumps(label_vc),
            "type_top_values": json.dumps(type_vc),
        })

    pd.DataFrame(rows_summary).sort_values("dataset").to_csv(SUMMARY_CSV, index=False)
    pd.DataFrame(rows_details).sort_values("dataset").to_csv(DETAILS_CSV, index=False)

    print(f"validation_summary: {SUMMARY_CSV}")
    print(f"validation_details: {DETAILS_CSV}")

# Run the validation when this cell is executed; in a notebook kernel
# __name__ is "__main__", so main() is invoked immediately.
if __name__ == "__main__":
    main()
# /colab_notebooks/week3_step5_validate_training_data.ipynb
# Week 3 — Step 5: validate materialized Parquet datasets

from __future__ import annotations

import os
import json
from dataclasses import dataclass
from typing import Dict, List, Tuple

import pandas as pd
import pyarrow.parquet as pq

# ----------------------------- Paths ----------------------------- #
# Drive mount point and the project folder located beneath it.
DRIVE_MOUNT_PT = "/content/drive"
PROJECT_ROOT   = f"{DRIVE_MOUNT_PT}/MyDrive/Colab Notebooks/New_cyber_project"

# Week 3 artifacts consumed by this cell. FEATURES_CSV is read for the
# expected column list; CONFIG_JSON is only checked for existence here.
W3_OUT        = os.path.join(PROJECT_ROOT, "week_3", "outputs")
TRAIN_DIR     = os.path.join(W3_OUT, "training_data")
FEATURES_CSV  = os.path.join(W3_OUT, "features_common.csv")
CONFIG_JSON   = os.path.join(W3_OUT, "config.json")

# Reports written by main(); they live in TRAIN_DIR next to the Parquet files.
SUMMARY_CSV   = os.path.join(TRAIN_DIR, "validation_summary.csv")
DETAILS_CSV   = os.path.join(TRAIN_DIR, "validation_details.csv")

# ---------------------------- Helpers ---------------------------- #
def _ensure_inputs() -> None:
    """Fail fast when a required Week 3 artifact is absent.

    Checks, in order, FEATURES_CSV, CONFIG_JSON and
    ``train_ready_ALL.parquet`` inside TRAIN_DIR.

    Raises:
        FileNotFoundError: if any of them is not an existing regular file;
            the message lists every missing path, comma-separated.
    """
    required = [
        FEATURES_CSV,
        CONFIG_JSON,
        os.path.join(TRAIN_DIR, "train_ready_ALL.parquet"),
    ]
    absent = [candidate for candidate in required if not os.path.isfile(candidate)]
    if absent:
        raise FileNotFoundError("Missing required artifact(s): " + ", ".join(absent))

def _load_expected_columns() -> List[str]:
    """Return the ordered, duplicate-free list of columns every dataset should have.

    The list is the ``column`` field of FEATURES_CSV (first occurrence wins)
    followed by the standardized targets ``label`` and ``type``.

    Returns:
        Column names as strings, each appearing exactly once.

    Raises:
        ValueError: if FEATURES_CSV has no ``column`` header.
    """
    feats = pd.read_csv(FEATURES_CSV)
    if "column" not in feats.columns:
        raise ValueError("features_common.csv must have a 'column' column.")
    # Empty cells are parsed as NaN; drop them so astype(str) does not create
    # a phantom expected column literally named "nan".
    names = feats["column"].dropna().astype(str).tolist()
    # De-duplicate AFTER adding the targets: if the features file already lists
    # 'label' or 'type', appending them again would double-count them as missing.
    return list(dict.fromkeys(names + ["label", "type"]))

def _list_parquet_targets() -> Dict[str, str]:
    """Map dataset names to the Parquet files that should be validated.

    Per-domain files (``train_ready_<domain>.parquet`` in TRAIN_DIR) are
    included only when they exist on disk. The combined ``ALL`` entry is
    always included, whether or not the file exists.

    Returns:
        Dict of dataset name -> absolute file path, domains first, ``ALL`` last.
    """
    candidates = {
        name: os.path.join(TRAIN_DIR, f"train_ready_{name}.parquet")
        for name in ("Windows", "Linux", "Network", "IoT")
    }
    selected = {name: loc for name, loc in candidates.items() if os.path.isfile(loc)}
    selected["ALL"] = os.path.join(TRAIN_DIR, "train_ready_ALL.parquet")
    return selected

def _pq_schema_cols(path: str) -> List[str]:
    """Return the column names of the Parquet file at ``path``, in schema order.

    Only the file's Arrow schema is consulted; no row data is read here.
    """
    arrow_schema = pq.ParquetFile(path).schema_arrow
    return list(arrow_schema.names)

def _pq_rowcount(path: str) -> int:
    """Return the total row count of the Parquet file at ``path``.

    The count is obtained by summing ``num_rows`` over the row groups listed
    in the file metadata.
    """
    meta = pq.ParquetFile(path).metadata
    n_rows = 0
    for group_idx in range(meta.num_row_groups):
        n_rows += meta.row_group(group_idx).num_rows
    return n_rows

def _value_counts_parquet(path: str, col: str, max_unique: int = 15) -> Tuple[int, Dict[str, int]]:
    """Tally one column of a Parquet file, one row group at a time.

    Args:
        path: Parquet file to read.
        col: Column to tally.
        max_unique: Maximum number of distinct values kept in the result.

    Returns:
        ``(non_null_count, counts)`` where ``counts`` maps each value
        (converted to ``str``) to its frequency, limited to the
        ``max_unique`` most frequent values in descending order. Returns
        ``(0, {})`` when ``col`` is not in the file's schema.
    """
    pf = pq.ParquetFile(path)
    # The schema is file-wide, so the membership test is done once up front
    # rather than being repeated for every row group.
    if col not in pf.schema_arrow.names:
        return 0, {}

    counts: Dict[str, int] = {}
    total = 0
    for i in range(pf.metadata.num_row_groups):
        s = pf.read_row_group(i, columns=[col]).to_pandas()[col]
        # Series.sum() yields a NumPy integer; cast so the function really
        # returns the plain int its signature promises.
        total += int(s.notna().sum())
        for k, v in s.dropna().astype(str).value_counts().items():
            counts[k] = counts.get(k, 0) + int(v)
    # keep top max_unique
    top = dict(sorted(counts.items(), key=lambda kv: kv[1], reverse=True)[:max_unique])
    return total, top

# ------------------------------ Main ----------------------------- #
def main() -> None:
    """Validate the materialized Parquet datasets and write two CSV reports.

    For each file returned by ``_list_parquet_targets`` that exists on disk,
    the schema is compared against the expected column list, rows are counted,
    and the ``label`` / ``type`` columns are tallied when present. One row per
    dataset is written to SUMMARY_CSV (counts and flags) and to DETAILS_CSV
    (column names and JSON-encoded top values), both sorted by dataset name.
    The two report paths are printed at the end.

    Raises:
        FileNotFoundError: propagated from ``_ensure_inputs``.
        ValueError: propagated from ``_load_expected_columns``.
    """
    _ensure_inputs()
    expected_cols = _load_expected_columns()
    # 'domain' may appear in ALL. Build the allow-set once instead of
    # re-concatenating a list for every column of every file.
    allowed = set(expected_cols) | {"domain"}

    files = _list_parquet_targets()
    rows_summary = []
    rows_details = []

    for dom, path in files.items():
        if not os.path.isfile(path):
            continue

        cols = _pq_schema_cols(path)
        present = set(cols)
        nrows = _pq_rowcount(path)

        # Lists keep the original ordering for the report; sets are only used
        # for the membership tests.
        missing_expected = [c for c in expected_cols if c not in present]
        unexpected = [c for c in cols if c not in allowed]

        has_label = "label" in present
        has_type  = "type" in present

        label_nonnull = type_nonnull = 0
        label_vc = {}
        type_vc = {}

        if has_label:
            label_nonnull, label_vc = _value_counts_parquet(path, "label")
        if has_type:
            type_nonnull, type_vc = _value_counts_parquet(path, "type")

        rows_summary.append({
            "dataset": dom,
            "path": path,
            "rows": nrows,
            "ncols": len(cols),
            "has_label": has_label,
            "has_type": has_type,
            "label_nonnull": label_nonnull,
            "type_nonnull": type_nonnull,
            "missing_expected_cols": len(missing_expected),
            "unexpected_cols": len(unexpected),
        })

        rows_details.append({
            "dataset": dom,
            "path": path,
            "columns": ",".join(cols),
            "missing_expected_cols": ",".join(missing_expected),
            "unexpected_cols": ",".join(unexpected),
            "label_top_values": json.dumps(label_vc),
            "type_top_values": json.dumps(type_vc),
        })

    pd.DataFrame(rows_summary).sort_values("dataset").to_csv(SUMMARY_CSV, index=False)
    pd.DataFrame(rows_details).sort_values("dataset").to_csv(DETAILS_CSV, index=False)

    print(f"validation_summary: {SUMMARY_CSV}")
    print(f"validation_details: {DETAILS_CSV}")

# Run the validation when this cell is executed; in a notebook kernel
# __name__ is "__main__", so main() is invoked immediately.
if __name__ == "__main__":
    main()
# /colab_notebooks/week3_step5_validate_training_data.ipynb
# Week 3 — Step 5: validate materialized Parquet datasets

from __future__ import annotations

import os
import json
from dataclasses import dataclass
from typing import Dict, List, Tuple

import pandas as pd
import pyarrow.parquet as pq

# ----------------------------- Paths ----------------------------- #
# Drive mount point and the project folder located beneath it.
DRIVE_MOUNT_PT = "/content/drive"
PROJECT_ROOT   = f"{DRIVE_MOUNT_PT}/MyDrive/Colab Notebooks/New_cyber_project"

# Week 3 artifacts consumed by this cell. FEATURES_CSV is read for the
# expected column list; CONFIG_JSON is only checked for existence here.
W3_OUT        = os.path.join(PROJECT_ROOT, "week_3", "outputs")
TRAIN_DIR     = os.path.join(W3_OUT, "training_data")
FEATURES_CSV  = os.path.join(W3_OUT, "features_common.csv")
CONFIG_JSON   = os.path.join(W3_OUT, "config.json")

# Reports written by main(); they live in TRAIN_DIR next to the Parquet files.
SUMMARY_CSV   = os.path.join(TRAIN_DIR, "validation_summary.csv")
DETAILS_CSV   = os.path.join(TRAIN_DIR, "validation_details.csv")

# ---------------------------- Helpers ---------------------------- #
def _ensure_inputs() -> None:
    """Fail fast when a required Week 3 artifact is absent.

    Checks, in order, FEATURES_CSV, CONFIG_JSON and
    ``train_ready_ALL.parquet`` inside TRAIN_DIR.

    Raises:
        FileNotFoundError: if any of them is not an existing regular file;
            the message lists every missing path, comma-separated.
    """
    required = [
        FEATURES_CSV,
        CONFIG_JSON,
        os.path.join(TRAIN_DIR, "train_ready_ALL.parquet"),
    ]
    absent = [candidate for candidate in required if not os.path.isfile(candidate)]
    if absent:
        raise FileNotFoundError("Missing required artifact(s): " + ", ".join(absent))

def _load_expected_columns() -> List[str]:
    """Return the ordered, duplicate-free list of columns every dataset should have.

    The list is the ``column`` field of FEATURES_CSV (first occurrence wins)
    followed by the standardized targets ``label`` and ``type``.

    Returns:
        Column names as strings, each appearing exactly once.

    Raises:
        ValueError: if FEATURES_CSV has no ``column`` header.
    """
    feats = pd.read_csv(FEATURES_CSV)
    if "column" not in feats.columns:
        raise ValueError("features_common.csv must have a 'column' column.")
    # Empty cells are parsed as NaN; drop them so astype(str) does not create
    # a phantom expected column literally named "nan".
    names = feats["column"].dropna().astype(str).tolist()
    # De-duplicate AFTER adding the targets: if the features file already lists
    # 'label' or 'type', appending them again would double-count them as missing.
    return list(dict.fromkeys(names + ["label", "type"]))

def _list_parquet_targets() -> Dict[str, str]:
    """Map dataset names to the Parquet files that should be validated.

    Per-domain files (``train_ready_<domain>.parquet`` in TRAIN_DIR) are
    included only when they exist on disk. The combined ``ALL`` entry is
    always included, whether or not the file exists.

    Returns:
        Dict of dataset name -> absolute file path, domains first, ``ALL`` last.
    """
    candidates = {
        name: os.path.join(TRAIN_DIR, f"train_ready_{name}.parquet")
        for name in ("Windows", "Linux", "Network", "IoT")
    }
    selected = {name: loc for name, loc in candidates.items() if os.path.isfile(loc)}
    selected["ALL"] = os.path.join(TRAIN_DIR, "train_ready_ALL.parquet")
    return selected

def _pq_schema_cols(path: str) -> List[str]:
    """Return the column names of the Parquet file at ``path``, in schema order.

    Only the file's Arrow schema is consulted; no row data is read here.
    """
    arrow_schema = pq.ParquetFile(path).schema_arrow
    return list(arrow_schema.names)

def _pq_rowcount(path: str) -> int:
    """Return the total row count of the Parquet file at ``path``.

    The count is obtained by summing ``num_rows`` over the row groups listed
    in the file metadata.
    """
    meta = pq.ParquetFile(path).metadata
    n_rows = 0
    for group_idx in range(meta.num_row_groups):
        n_rows += meta.row_group(group_idx).num_rows
    return n_rows

def _value_counts_parquet(path: str, col: str, max_unique: int = 15) -> Tuple[int, Dict[str, int]]:
    """Tally one column of a Parquet file, one row group at a time.

    Args:
        path: Parquet file to read.
        col: Column to tally.
        max_unique: Maximum number of distinct values kept in the result.

    Returns:
        ``(non_null_count, counts)`` where ``counts`` maps each value
        (converted to ``str``) to its frequency, limited to the
        ``max_unique`` most frequent values in descending order. Returns
        ``(0, {})`` when ``col`` is not in the file's schema.
    """
    pf = pq.ParquetFile(path)
    # The schema is file-wide, so the membership test is done once up front
    # rather than being repeated for every row group.
    if col not in pf.schema_arrow.names:
        return 0, {}

    counts: Dict[str, int] = {}
    total = 0
    for i in range(pf.metadata.num_row_groups):
        s = pf.read_row_group(i, columns=[col]).to_pandas()[col]
        # Series.sum() yields a NumPy integer; cast so the function really
        # returns the plain int its signature promises.
        total += int(s.notna().sum())
        for k, v in s.dropna().astype(str).value_counts().items():
            counts[k] = counts.get(k, 0) + int(v)
    # keep top max_unique
    top = dict(sorted(counts.items(), key=lambda kv: kv[1], reverse=True)[:max_unique])
    return total, top

# ------------------------------ Main ----------------------------- #
def main() -> None:
    """Validate the materialized Parquet datasets and write two CSV reports.

    For each file returned by ``_list_parquet_targets`` that exists on disk,
    the schema is compared against the expected column list, rows are counted,
    and the ``label`` / ``type`` columns are tallied when present. One row per
    dataset is written to SUMMARY_CSV (counts and flags) and to DETAILS_CSV
    (column names and JSON-encoded top values), both sorted by dataset name.
    The two report paths are printed at the end.

    Raises:
        FileNotFoundError: propagated from ``_ensure_inputs``.
        ValueError: propagated from ``_load_expected_columns``.
    """
    _ensure_inputs()
    expected_cols = _load_expected_columns()
    # 'domain' may appear in ALL. Build the allow-set once instead of
    # re-concatenating a list for every column of every file.
    allowed = set(expected_cols) | {"domain"}

    files = _list_parquet_targets()
    rows_summary = []
    rows_details = []

    for dom, path in files.items():
        if not os.path.isfile(path):
            continue

        cols = _pq_schema_cols(path)
        present = set(cols)
        nrows = _pq_rowcount(path)

        # Lists keep the original ordering for the report; sets are only used
        # for the membership tests.
        missing_expected = [c for c in expected_cols if c not in present]
        unexpected = [c for c in cols if c not in allowed]

        has_label = "label" in present
        has_type  = "type" in present

        label_nonnull = type_nonnull = 0
        label_vc = {}
        type_vc = {}

        if has_label:
            label_nonnull, label_vc = _value_counts_parquet(path, "label")
        if has_type:
            type_nonnull, type_vc = _value_counts_parquet(path, "type")

        rows_summary.append({
            "dataset": dom,
            "path": path,
            "rows": nrows,
            "ncols": len(cols),
            "has_label": has_label,
            "has_type": has_type,
            "label_nonnull": label_nonnull,
            "type_nonnull": type_nonnull,
            "missing_expected_cols": len(missing_expected),
            "unexpected_cols": len(unexpected),
        })

        rows_details.append({
            "dataset": dom,
            "path": path,
            "columns": ",".join(cols),
            "missing_expected_cols": ",".join(missing_expected),
            "unexpected_cols": ",".join(unexpected),
            "label_top_values": json.dumps(label_vc),
            "type_top_values": json.dumps(type_vc),
        })

    pd.DataFrame(rows_summary).sort_values("dataset").to_csv(SUMMARY_CSV, index=False)
    pd.DataFrame(rows_details).sort_values("dataset").to_csv(DETAILS_CSV, index=False)

    print(f"validation_summary: {SUMMARY_CSV}")
    print(f"validation_details: {DETAILS_CSV}")

# Run the validation when this cell is executed; in a notebook kernel
# __name__ is "__main__", so main() is invoked immediately.
if __name__ == "__main__":
    main()


validation_summary: /content/drive/MyDrive/Colab Notebooks/New_cyber_project/week_3/outputs/training_data/validation_summary.csv
validation_details: /content/drive/MyDrive/Colab Notebooks/New_cyber_project/week_3/outputs/training_data/validation_details.csv
validation_summary: /content/drive/MyDrive/Colab Notebooks/New_cyber_project/week_3/outputs/training_data/validation_summary.csv
validation_details: /content/drive/MyDrive/Colab Notebooks/New_cyber_project/week_3/outputs/training_data/validation_details.csv
validation_summary: /content/drive/MyDrive/Colab Notebooks/New_cyber_project/week_3/outputs/training_data/validation_summary.csv
validation_details: /content/drive/MyDrive/Colab Notebooks/New_cyber_project/week_3/outputs/training_data/validation_details.csv
validation_summary: /content/drive/MyDrive/Colab Notebooks/New_cyber_project/week_3/outputs/training_data/validation_summary.csv
validation_details: /content/drive/MyDrive/Colab Notebooks/New_cyber_project/week_3/outputs/train