# PreVisor EDA (additional)

This notebook loads the processed datasets (if present) and builds a few quick visualizations for the defense.

In [None]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" theme to the plots drawn in this notebook.
sns.set_theme(style="whitegrid")

# The project root is taken to be the parent of the current working directory
# (NOTE(review): this assumes the notebook is launched from a direct
# subfolder of the project - confirm).
root = Path(".").resolve().parent
print("Project root:", root)

# Gather every "*_processed.csv" file under data/runtime/datasets, in sorted
# path order. The list is simply empty when nothing matches.
datasets_dir = root.joinpath("data", "runtime", "datasets")
paths = list(datasets_dir.glob("*_processed.csv"))
paths.sort()
dataset_names = [csv_path.name for csv_path in paths]
print("Datasets:", dataset_names)


In [None]:
# Read each processed CSV and stack the results into one frame. Results
# larger than the row cap are reduced to a fixed-seed random subset so the
# plots stay quick to render.
frames = [pd.read_csv(csv_path, low_memory=False) for csv_path in paths]

if frames:
    combined = pd.concat(frames, ignore_index=True)
    total_rows = len(combined)
    print("Rows:", total_rows)
    # Optional down-sampling; the fixed seed keeps the subset repeatable.
    row_cap = 50000
    if total_rows > row_cap:
        combined = combined.sample(n=row_cap, random_state=42)
        print("Sampled rows:", len(combined))
else:
    # Nothing to load: fall back to an empty frame so later cells still run.
    print("No processed datasets found. Run utils/process_data.py first.")
    combined = pd.DataFrame()


In [None]:
# Choose the label column: the first candidate name (in priority order) that
# exists in the combined frame, or None when none of them is present.
label_candidates = ["Attack Type", "Label", "classification"]
present_labels = [name for name in label_candidates if name in combined.columns]
label_col = present_labels[0] if present_labels else None
# Bare expression so the notebook echoes the selected column.
label_col


In [None]:
# Bar chart of the 20 most frequent values in the label column.
if not label_col:
    print("No label column found in data.")
else:
    vc = combined[label_col].value_counts().head(20)
    ax = vc.plot.bar(figsize=(10, 4), title="Top labels")
    ax.set(xlabel="Label", ylabel="Count")
    plt.tight_layout()
    plt.show()


In [None]:
# Bar chart of the 20 most frequent destination ports, drawn only when the
# data has a "dest_port" column.
port_col = None
if "dest_port" in combined.columns:
    port_col = "dest_port"

if port_col is None:
    print("dest_port column not found.")
else:
    vc = combined[port_col].value_counts().head(20)
    ax = vc.plot.bar(figsize=(10, 4), title="Top destination ports")
    ax.set(xlabel="Port", ylabel="Count")
    plt.tight_layout()
    plt.show()


In [None]:
# Heatmap of pairwise correlations between the numeric columns, computed on
# a fixed-seed random sample of at most 3000 rows.
num = combined.select_dtypes(include=[np.number])
if num.empty:
    print("No numeric columns to plot.")
else:
    sample_size = min(3000, len(num))
    sample = num.sample(n=sample_size, random_state=42)
    corr = sample.corr(numeric_only=True)
    plt.figure(figsize=(10, 8))
    # center=0 puts the midpoint of the diverging colormap at zero correlation.
    sns.heatmap(corr, cmap="coolwarm", center=0)
    plt.title("Feature correlation (sample)")
    plt.tight_layout()
    plt.show()
