In [2]:
# /colab_notebooks/week1_step1_setup.ipynb

import os
from google.colab import drive

DRIVE_MOUNT_PT = "/content/drive"
if not os.path.ismount(DRIVE_MOUNT_PT):
    drive.mount(DRIVE_MOUNT_PT)

PROJECT_ROOT  = f"{DRIVE_MOUNT_PT}/MyDrive/Colab Notebooks/New_cyber_project"
DATASET_ROOT  = os.path.join(PROJECT_ROOT, "dataset")
WEEK_ROOT     = os.path.join(PROJECT_ROOT, "week_1")
WEEK_OUTPUT   = os.path.join(WEEK_ROOT, "outputs")
WEEK_NB       = os.path.join(WEEK_ROOT, "notebooks")

INPUT_FOLDERS = {
    "Windows": os.path.join(DATASET_ROOT, "Processed_Windows_dataset"),
    "Linux":   os.path.join(DATASET_ROOT, "Processed_Linux_dataset"),
    "Network": os.path.join(DATASET_ROOT, "Processed_Network_dataset"),
    "IoT":     os.path.join(DATASET_ROOT, "Processed_IoT_dataset"),
}

os.makedirs(WEEK_OUTPUT, exist_ok=True)
os.makedirs(WEEK_NB, exist_ok=True)

print("project:", PROJECT_ROOT)
print("dataset:", DATASET_ROOT)
print("week_1 outputs:", WEEK_OUTPUT)


Mounted at /content/drive
project: /content/drive/MyDrive/Colab Notebooks/New_cyber_project
dataset: /content/drive/MyDrive/Colab Notebooks/New_cyber_project/dataset
week_1 outputs: /content/drive/MyDrive/Colab Notebooks/New_cyber_project/week_1/outputs


In [3]:
# /colab_notebooks/week1_step2_verify_dirs.ipynb

import os

required = {
    "PROJECT_ROOT": PROJECT_ROOT,
    "DATASET_ROOT": DATASET_ROOT,
    "WEEK_ROOT": WEEK_ROOT,
    "WEEK_OUTPUT": WEEK_OUTPUT,
    **{f"INPUT:{k}": v for k, v in INPUT_FOLDERS.items()},
}

missing = [name for name, path in required.items() if not os.path.isdir(path)]
for name, path in required.items():
    flag = "OK" if os.path.isdir(path) else "MISSING"
    print(f"{flag:8s} {name:14s} -> {path}")

if missing:
    raise FileNotFoundError("Missing folders: " + ", ".join(missing))


OK       PROJECT_ROOT   -> /content/drive/MyDrive/Colab Notebooks/New_cyber_project
OK       DATASET_ROOT   -> /content/drive/MyDrive/Colab Notebooks/New_cyber_project/dataset
OK       WEEK_ROOT      -> /content/drive/MyDrive/Colab Notebooks/New_cyber_project/week_1
OK       WEEK_OUTPUT    -> /content/drive/MyDrive/Colab Notebooks/New_cyber_project/week_1/outputs
OK       INPUT:Windows  -> /content/drive/MyDrive/Colab Notebooks/New_cyber_project/dataset/Processed_Windows_dataset
OK       INPUT:Linux    -> /content/drive/MyDrive/Colab Notebooks/New_cyber_project/dataset/Processed_Linux_dataset
OK       INPUT:Network  -> /content/drive/MyDrive/Colab Notebooks/New_cyber_project/dataset/Processed_Network_dataset
OK       INPUT:IoT      -> /content/drive/MyDrive/Colab Notebooks/New_cyber_project/dataset/Processed_IoT_dataset


In [4]:
# /colab_notebooks/week1_step3a_week_structure.ipynb

import os

# Idempotent creation; separate step for clarity across weeks
os.makedirs(WEEK_OUTPUT, exist_ok=True)
os.makedirs(WEEK_NB, exist_ok=True)
print("ready:", WEEK_OUTPUT, "|", WEEK_NB)


ready: /content/drive/MyDrive/Colab Notebooks/New_cyber_project/week_1/outputs | /content/drive/MyDrive/Colab Notebooks/New_cyber_project/week_1/notebooks


In [5]:
# /colab_notebooks/week1_step3b_inventory_paths.ipynb

import os, glob
import pandas as pd
from datetime import datetime

ALLOWED_EXTS = {".csv", ".parquet"}

def _ext(p): return os.path.splitext(p)[1].lower()
def _mb(n):  return round(((n or 0)/1024/1024), 3)

rows = []
for domain, root in INPUT_FOLDERS.items():
    for p in glob.glob(os.path.join(root, "**", "*"), recursive=True):
        if not os.path.isfile(p):
            continue
        e = _ext(p)
        if e not in ALLOWED_EXTS:
            continue
        try:
            st = os.stat(p)
            size_b = st.st_size
            mtime  = datetime.fromtimestamp(st.st_mtime).isoformat(timespec="seconds")
        except FileNotFoundError:
            size_b, mtime = None, None
        rows.append({
            "domain": domain,
            "path": p,
            "extension": e,
            "size_bytes": size_b,
            "size_mb": _mb(size_b),
            "modified": mtime,
        })

inv = pd.DataFrame(rows).sort_values(["domain","extension","path"]).reset_index(drop=True)
out_csv = os.path.join(WEEK_OUTPUT, "data_inventory_paths.csv")
inv.to_csv(out_csv, index=False)

print("inventory:", out_csv, "| files:", len(inv))


inventory: /content/drive/MyDrive/Colab Notebooks/New_cyber_project/week_1/outputs/data_inventory_paths.csv | files: 28


In [6]:
# /colab_notebooks/week1_step4_schema_preview.ipynb

from __future__ import annotations

import os, json, re
from typing import List, Dict, Any, Optional
import pandas as pd

INVENTORY_CSV = os.path.join(WEEK_OUTPUT, "data_inventory_paths.csv")
SCHEMA_JSON   = os.path.join(WEEK_OUTPUT, "schema_preview.json")
ERRORS_CSV    = os.path.join(WEEK_OUTPUT, "schema_errors.csv")

if not os.path.isfile(INVENTORY_CSV):
    raise FileNotFoundError(f"Required file not found: {INVENTORY_CSV}")

inv = pd.read_csv(INVENTORY_CSV)
if inv.empty:
    raise ValueError("Inventory contains no entries.")

MAX_ROWS = 10_000
TARGET_RE = re.compile(r"^(label|labels|target|class|attack_cat|type)$", re.I)

def read_sample(path: str, ext: str, nrows: int) -> Optional[pd.DataFrame]:
    try:
        if ext == ".csv":
            return pd.read_csv(path, nrows=nrows, low_memory=True)
        if ext == ".parquet":
            return pd.read_parquet(path).head(nrows)
        return None
    except Exception:
        return None

def candidate_targets(cols: List[str]) -> List[str]:
    return [c for c in cols if TARGET_RE.match(c)]

schema_records: List[Dict[str, Any]] = []
error_records: List[Dict[str, Any]] = []

for _, r in inv.iterrows():
    path = str(r["path"]); ext = str(r["extension"]).lower(); dom = str(r["domain"])
    df = read_sample(path, ext, MAX_ROWS)
    if df is None or df.empty:
        error_records.append({"domain": dom, "path": path, "extension": ext, "error": "ReadError"})
        continue
    cols = list(df.columns)
    schema_records.append({
        "domain": dom,
        "path": path,
        "extension": ext,
        "sample_rows": int(min(len(df), MAX_ROWS)),
        "ncols": int(len(cols)),
        "columns": cols,
        "dtypes": {c: str(df[c].dtype) for c in cols},
        "candidate_targets": candidate_targets(cols),
    })

with open(SCHEMA_JSON, "w") as f:
    json.dump(schema_records, f, indent=2)
if error_records:
    pd.DataFrame(error_records).to_csv(ERRORS_CSV, index=False)

print("schema:", SCHEMA_JSON, "| ok:", len(schema_records), "| errors:", len(error_records))


  return pd.read_csv(path, nrows=nrows, low_memory=True)
  return pd.read_csv(path, nrows=nrows, low_memory=True)


schema: /content/drive/MyDrive/Colab Notebooks/New_cyber_project/week_1/outputs/schema_preview.json | ok: 28 | errors: 0


In [7]:
# /colab_notebooks/week1_step5_summary.ipynb

import os, json
import pandas as pd

INVENTORY_CSV = os.path.join(WEEK_OUTPUT, "data_inventory_paths.csv")
SCHEMA_JSON   = os.path.join(WEEK_OUTPUT, "schema_preview.json")
ERRORS_CSV    = os.path.join(WEEK_OUTPUT, "schema_errors.csv")
OUT_CSV       = os.path.join(WEEK_OUTPUT, "summary.csv")

if not os.path.isfile(INVENTORY_CSV):
    raise FileNotFoundError(f"Required file not found: {INVENTORY_CSV}")
if not os.path.isfile(SCHEMA_JSON):
    raise FileNotFoundError(f"Required file not found: {SCHEMA_JSON}")

inv = pd.read_csv(INVENTORY_CSV)
schema_ok = pd.DataFrame(json.load(open(SCHEMA_JSON))) if os.path.getsize(SCHEMA_JSON) > 2 else pd.DataFrame()
schema_err = pd.read_csv(ERRORS_CSV) if os.path.isfile(ERRORS_CSV) else pd.DataFrame(columns=["domain","path","extension","error"])

if schema_ok.empty:
    schema_ok = pd.DataFrame(columns=["domain","path","extension","candidate_targets","ncols","sample_rows"])

def has_target(v) -> bool:
    if isinstance(v, list):
        return len(v) > 0
    if isinstance(v, str):
        return v.strip() not in ("[]", "", "nan")
    return False

schema_ok["has_target"] = schema_ok["candidate_targets"].apply(has_target)

totals = inv.groupby("domain").size().rename("total_files").to_frame()
okc    = schema_ok.groupby("domain").size().rename("ok_files").to_frame()
errs   = schema_err.groupby("domain").size().rename("error_files").to_frame()
with_y = schema_ok.groupby("domain")["has_target"].sum().rename("files_with_target").to_frame()
avg_nc = schema_ok.groupby("domain")["ncols"].mean().round(2).rename("avg_ncols").to_frame()
med_sr = schema_ok.groupby("domain")["sample_rows"].median().rename("median_sample_rows").to_frame()

summary = (totals.join(okc, how="left")
                 .join(errs, how="left")
                 .join(with_y, how="left")
                 .join(avg_nc, how="left")
                 .join(med_sr, how="left")
                 .fillna(0)
                 .reset_index())

summary["ok_files"] = summary["ok_files"].astype(int)
summary["error_files"] = summary["error_files"].astype(int)
summary["files_with_target"] = summary["files_with_target"].astype(int)
summary["pct_with_target"] = (summary["files_with_target"] / summary["total_files"] * 100).round(2)

cols = ["domain","total_files","ok_files","error_files","files_with_target","pct_with_target","avg_ncols","median_sample_rows"]
summary = summary[cols]
summary.to_csv(OUT_CSV, index=False)

print("summary:", OUT_CSV)


summary: /content/drive/MyDrive/Colab Notebooks/New_cyber_project/week_1/outputs/summary.csv


In [None]:
# /colab_notebooks/week2_step3_column_profile.ipynb

from __future__ import annotations

import os
from collections import Counter, defaultdict
from dataclasses import dataclass
from typing import Dict, Iterable, List, Optional

import pandas as pd

DRIVE_MOUNT_PT = "/content/drive"
PROJECT_ROOT   = f"{DRIVE_MOUNT_PT}/MyDrive/Colab Notebooks/New_cyber_project"
W1_OUT         = os.path.join(PROJECT_ROOT, "week_1", "outputs")
W2_OUT         = os.path.join(PROJECT_ROOT, "week_2", "outputs")

INVENTORY_CSV       = os.path.join(W1_OUT, "data_inventory_paths.csv")
COLUMN_PROFILE_CSV  = os.path.join(W2_OUT, "column_profile.csv")
DTYPE_SUMMARY_CSV   = os.path.join(W2_OUT, "dtype_summary.csv")

MAX_SAMPLE_ROWS = 10_000  # as requested

@dataclass(frozen=True)
class FileEntry:
    domain: str
    path: str
    extension: str

@dataclass
class FileSchema:
    domain: str
    path: str
    columns: List[str]
    dtypes: Dict[str, str]

def ensure_inputs() -> None:
    if not os.path.isfile(INVENTORY_CSV):
        raise FileNotFoundError(f"Required file not found: {INVENTORY_CSV}")
    os.makedirs(W2_OUT, exist_ok=True)

def read_inventory() -> List[FileEntry]:
    df = pd.read_csv(INVENTORY_CSV)
    if df.empty:
        raise ValueError("Inventory contains no entries.")
    return [
        FileEntry(str(r["domain"]), str(r["path"]), str(r["extension"]).lower())
        for _, r in df.iterrows()
    ]

def read_sample(path: str, extension: str, nrows: int) -> Optional[pd.DataFrame]:
    try:
        if extension == ".csv":
            return pd.read_csv(path, nrows=nrows, low_memory=True)
        if extension == ".parquet":
            return pd.read_parquet(path).head(nrows)
        return None
    except Exception:
        return None

def infer_file_schema(e: FileEntry) -> Optional[FileSchema]:
    df = read_sample(e.path, e.extension, MAX_SAMPLE_ROWS)
    if df is None or df.empty:
        return None
    cols = list(df.columns)
    dtypes = {c: str(df[c].dtype) for c in cols}
    return FileSchema(e.domain, e.path, cols, dtypes)

def collect_schemas(entries: Iterable[FileEntry]) -> List[FileSchema]:
    out: List[FileSchema] = []
    for e in entries:
        fs = infer_file_schema(e)
        if fs is not None:
            out.append(fs)
    if not out:
        raise ValueError("No readable files were found during sampling.")
    return out

def build_column_profile(schemas: List[FileSchema]) -> pd.DataFrame:
    by_domain_files = defaultdict(set)
    by_domain_col_files = defaultdict(lambda: defaultdict(set))
    by_domain_col_dtype = defaultdict(lambda: defaultdict(Counter))

    for fs in schemas:
        by_domain_files[fs.domain].add(fs.path)
        for c in fs.columns:
            by_domain_col_files[fs.domain][c].add(fs.path)
            by_domain_col_dtype[fs.domain][c].update([fs.dtypes.get(c, "unknown")])

    rows: List[Dict[str, object]] = []
    for dom, files in by_domain_files.items():
        total = len(files)
        for col, file_set in by_domain_col_files[dom].items():
            modal_dtype = by_domain_col_dtype[dom][col].most_common(1)[0][0]
            rows.append({
                "domain": dom,
                "column": col,
                "files_with_column": len(file_set),
                "total_files_in_domain": total,
                "pct_coverage": round(len(file_set) / total * 100.0, 2) if total else 0.0,
                "modal_dtype": modal_dtype,
            })

    df = pd.DataFrame(rows)
    if df.empty:
        df = pd.DataFrame(columns=["domain","column","files_with_column","total_files_in_domain","pct_coverage","modal_dtype"])
    return df.sort_values(["domain","pct_coverage","column"], ascending=[True, False, True]).reset_index(drop=True)

def build_dtype_summary(schemas: List[FileSchema]) -> pd.DataFrame:
    by_domain = defaultdict(Counter)
    for fs in schemas:
        by_domain[fs.domain].update(fs.dtypes.values())
    rows = [{"domain": d, "dtype": t, "count": int(c)} for d, ctr in by_domain.items() for t, c in ctr.most_common()]
    df = pd.DataFrame(rows)
    if df.empty:
        df = pd.DataFrame(columns=["domain","dtype","count"])
    return df.sort_values(["domain","count"], ascending=[True, False]).reset_index(drop=True)

def main() -> None:
    ensure_inputs()
    entries = read_inventory()
    schemas = collect_schemas(entries)
    column_profile = build_column_profile(schemas)
    dtype_summary = build_dtype_summary(schemas)
    column_profile.to_csv(COLUMN_PROFILE_CSV, index=False)
    dtype_summary.to_csv(DTYPE_SUMMARY_CSV, index=False)
    print(f"column_profile: {COLUMN_PROFILE_CSV}")
    print(f"dtype_summary:  {DTYPE_SUMMARY_CSV}")

if __name__ == "__main__":
    main()
