In [69]:
from pathlib import Path
import cv2
import numpy as np
import pandas as pd
import pyshark
import librosa
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.pipeline import Pipeline
from joblib import dump, load
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

In [70]:
import zipfile
import os

zip_path = "augmented_data.zip"
out_dir  = "augmented_data"

os.makedirs(out_dir, exist_ok=True)

with zipfile.ZipFile(zip_path, "r") as zip_ref:
    zip_ref.extractall(out_dir)

print(f"Extracted all files from {zip_path} → {out_dir}/")

Extracted all files from augmented_data.zip → augmented_data/


In [71]:
!pip install pyshark

[0m

In [72]:
!pip install scapy

[0m

In [73]:
# 01-imports.py ───────────────────────────────────────────────────────────────
from pathlib import Path
from collections import Counter
import json, math, itertools, time

import pandas as pd
import numpy as np
from tqdm.auto import tqdm

from scapy.all import rdpcap   # needs sudo on some platforms
import cv2

import torch
yolo = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)  # first run downloads weights
yolo.conf = 0.25  # confidence threshold


Using cache found in /root/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2025-6-10 Python-3.10.13 torch-2.2.1 CUDA:0 (NVIDIA A100-SXM4-80GB, 81156MiB)

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 


In [74]:
# 02-pcap-features.py ─────────────────────────────────────────────────────────
def extract_pcap_features(pcap_path: Path) -> dict:
    """
    Lightweight flow-level summary of a PCAP.
    Returns a feature dict (flat, numeric) that can go straight into a DataFrame.
    """
    packets = rdpcap(str(pcap_path))
    if not packets:
        return {"fname": pcap_path.name, "tot_pkts": 0, "tot_bytes": 0}

    t0, t1 = packets[0].time, packets[-1].time
    duration = max(t1 - t0, 1e-6)

    sizes = [len(pkt) for pkt in packets]
    protocols = Counter(pkt.name for pkt in packets)
    l4 = Counter(pkt.lastlayer().name for pkt in packets)

    ips = [(pkt[0][1].src, pkt[0][1].dst)   # naive; works for IPv4 in most traces
           for pkt in packets if pkt.haslayer('IP')]

    src_ips = Counter(src for src, _ in ips)
    dst_ips = Counter(dst for _, dst in ips)

    feats = {
        "fname"        : pcap_path.name,
        "tot_pkts"     : len(packets),
        "tot_bytes"    : sum(sizes),
        "mean_pkt_len" : float(np.mean(sizes)),
        "std_pkt_len"  : float(np.std(sizes)),
        "duration"     : duration,
        "pkts_per_sec" : len(packets)/duration,
        "bytes_per_sec": sum(sizes)/duration,
        "uniq_src_ip"  : len(src_ips),
        "uniq_dst_ip"  : len(dst_ips),
        # Top-3 protocol ratios
        **{f"proto_{p}": protocols[p]/len(packets) for p in ("TCP", "UDP", "ICMP")},
        **{f"l4_{l}": l4[l]/len(packets) for l in ("Raw", "HTTP")},  # extend as you wish
    }
    return feats


In [75]:
# 03-video-features.py ────────────────────────────────────────────────────────
def _calc_entropy(hist):
    p = hist / (hist.sum() + 1e-12)
    return -(p * np.log2(p + 1e-12)).sum()

def extract_video_features(video_path: Path,
                           frame_skip: int = 15,
                           max_frames: int = 600) -> dict:
    """
    * Motion magnitude (simple frame-diff)
    * Color histogram entropy
    * YOLO object counts (knife, gun, person, etc.)  — default model ≈ COCO
    Returns a dict of numeric features.
    """
    cap = cv2.VideoCapture(str(video_path))
    fps   = cap.get(cv2.CAP_PROP_FPS) or 30
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    w, h = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Stats we accumulate
    motion_vals, entropy_vals = [], []
    yolo_counts = Counter()

    last_gray = None
    processed = 0
    with torch.no_grad():
        for i in range(frame_count):
            ret, frame = cap.read()
            if not ret: break
            if i % frame_skip: continue
            processed += 1
            if processed > max_frames: break

            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            if last_gray is not None:
                diff = cv2.absdiff(gray, last_gray)
                motion_vals.append(diff.mean())
            last_gray = gray

            # histogram entropy
            hist = cv2.calcHist([frame],[0],None,[256],[0,256]).flatten()
            entropy_vals.append(_calc_entropy(hist))

            # object detection
            results = yolo(frame, size=640)
            for cls in results.pred[0][:,5].tolist():  # integer class ids
                yolo_counts[int(cls)] += 1

    cap.release()

    # Map YOLO class ids to human labels (subset)
    coco_names = yolo.names
    weapon_labels = {"knife", "sports ball", "baseball bat", "baseball glove"}
    person_labels = {"person"}

    counts = Counter({coco_names[k]: v for k, v in yolo_counts.items()
                      if coco_names[k] in weapon_labels | person_labels})

    feats = {
        "fname"           : video_path.name,
        "fps"             : fps,
        "frames_proc"     : processed,
        "mean_motion"     : float(np.mean(motion_vals)) if motion_vals else 0.0,
        "std_motion"      : float(np.std(motion_vals))  if motion_vals else 0.0,
        "mean_entropy"    : float(np.mean(entropy_vals)) if entropy_vals else 0.0,
        "std_entropy"     : float(np.std(entropy_vals))  if entropy_vals else 0.0,
        # object counts (use .get to default to 0)
        "persons"         : counts.get("person", 0),
        "knives"          : counts.get("knife", 0),
        "bats"            : counts.get("baseball bat", 0),
        "balls"           : counts.get("sports ball", 0),
    }
    return feats


In [18]:
# 04-batch-runner.py  (revised) ──────────────────────────────────────────────
VIDEO_EXTS = {".mp4", ".avi", ".mov", ".mkv"}
PCAP_EXTS  = {".pcap", ".pcapng"}

def list_files(root: Path, exts) -> list[Path]:
    """Recursively list files whose suffix (lower-case) is in `exts`."""
    return sorted(p for p in root.rglob("*") if p.suffix.lower() in exts)

def build_feature_table(pcap_dir: Path, video_dir: Path) -> pd.DataFrame:
    # ---------- PCAP ----------
    pcap_paths = list_files(pcap_dir, PCAP_EXTS)
    if not pcap_paths:
        print(f"[WARN] No PCAPs found under {pcap_dir.resolve()}")
    pcap_feats = [extract_pcap_features(p) for p in tqdm(pcap_paths, desc="PCAP")]
    df_pcap = (
        pd.DataFrame(pcap_feats).set_index("fname").add_prefix("pcap_")
        if pcap_feats else pd.DataFrame()
    )

    # ---------- VIDEO ----------
    vid_paths = list_files(video_dir, VIDEO_EXTS)
    if not vid_paths:
        print(f"[WARN] No videos found under {video_dir.resolve()}")
    video_feats = [extract_video_features(v) for v in tqdm(vid_paths, desc="VIDEO")]
    df_video = (
        pd.DataFrame(video_feats).set_index("fname").add_prefix("vid_")
        if video_feats else pd.DataFrame()
    )

    # ---------- MERGE ----------
    dfs = [df for df in (df_pcap, df_video) if not df.empty]
    if not dfs:
        raise RuntimeError("No files processed – double-check your folder paths & extensions.")
    return (
        pd.concat(dfs, axis=0, sort=False)
          .reset_index()
          .rename(columns={"index": "file"})
    )


In [77]:
# 05-run-feature-extraction.py ───────────────────────────────────────────────
# <<< EDIT THESE >>>  (absolute or relative paths)
# --------------------------------------------------------------------------
# 00-constants.py (add after your imports)
JUNK_PREFIXES = {"._"}                      # macOS resource forks
VIDEO_EXTS    = {".mp4", ".avi", ".mov", ".mkv"}
PCAP_EXTS     = {".pcap", ".pcapng", ".pcap.gz", ".pcapng.gz"}
# --------------------------------------------------------------------------

# --------------------------------------------------------------------------
# util: list_files()  – add a junk-file filter
def list_files(root: Path, exts) -> list[Path]:
    return sorted(
        p for p in root.rglob("*")
        if p.suffix.lower() in exts and not p.name.startswith(tuple(JUNK_PREFIXES))
    )
# --------------------------------------------------------------------------

# --------------------------------------------------------------------------
# util: _pyshark_summary() – auto-adapt to PyShark version
import inspect
def _pyshark_summary(path: Path, max_pkts: int = 100_000) -> dict:
    kwargs = dict(keep_packets=False, override_prefs={"transport_layer": "true"})
    if "decode_tunnels" in inspect.signature(pyshark.FileCapture).parameters:
        kwargs["decode_tunnels"] = True

    cap = pyshark.FileCapture(str(path), **kwargs)

    bytes_total = pkts = 0
    protocols   = Counter(); src_ips = Counter(); dst_ips = Counter()
    try:
        for pkt in cap:
            pkts += 1
            bytes_total += int(getattr(pkt.frame_info, "len", 0))
            protos = getattr(pkt.frame_info, "protocols", "")
            protocols[protos.split(":")[-1]] += 1
            if hasattr(pkt, "ip"):
                src_ips[pkt.ip.src] += 1
                dst_ips[pkt.ip.dst] += 1
            if pkts >= max_pkts:
                break
    finally:
        cap.close()        # ensures _running_processes exists before __del__
    return dict(tot_pkts=pkts,
                tot_bytes=bytes_total,
                uniq_src_ip=len(src_ips),
                uniq_dst_ip=len(dst_ips),
                **{f"proto_{p}": protocols[p]/pkts for p in ("tcp", "udp", "icmp") if pkts})
# --------------------------------------------------------------------------

def extract_pcap_features(pcap_path: Path) -> dict:
    """Robust extractor with gzip+pcapng support and PyShark fallback."""
    header = _sniff_magic(pcap_path)
    try:
        raw = _open_maybe_gzip(pcap_path)
        # ---------- Try Scapy ----------
        if raw[:4] in MAGIC_PCAP | MAGIC_PCAPNG:
            packets = rdpcap(io.BytesIO(raw))   # we can feed bytes-like object
        else:
            raise Scapy_Exception("Unrecognised magic")
        if not packets:                       # empty file?
            raise Scapy_Exception("0 packets")
        t0, t1 = packets[0].time, packets[-1].time
        duration = max(t1 - t0, 1e-6)
        sizes = [len(pkt) for pkt in packets]
        protocols = Counter(pkt.name for pkt in packets)
        l4 = Counter(pkt.lastlayer().name for pkt in packets)
        ips  = [(pkt[0][1].src, pkt[0][1].dst) for pkt in packets if pkt.haslayer('IP')]
        src_ips, dst_ips = Counter(src for src, _ in ips), Counter(dst for _, dst in ips)
        feats_core = dict(
            tot_pkts=len(packets),
            tot_bytes=sum(sizes),
            mean_pkt_len=float(np.mean(sizes)),
            std_pkt_len=float(np.std(sizes)),
            duration=duration,
            pkts_per_sec=len(packets)/duration,
            bytes_per_sec=sum(sizes)/duration,
            uniq_src_ip=len(src_ips),
            uniq_dst_ip=len(dst_ips),
        )
        feats_proto = {f"proto_{p}": protocols[p]/len(packets) for p in ("TCP","UDP","ICMP")}
        feats_l4    = {f"l4_{l}": l4[l]/len(packets) for l in ("Raw","HTTP")}
        feats = {**feats_core, **feats_proto, **feats_l4}

    except Exception as scapy_err:
        # ---------- PyShark fallback ----------
        try:
            feats = _pyshark_summary(pcap_path)
            feats.setdefault("mean_pkt_len", 0)
            feats.setdefault("std_pkt_len", 0)
            feats.setdefault("duration",     0)
            feats.setdefault("pkts_per_sec", 0)
            feats.setdefault("bytes_per_sec",0)
        except Exception as py_err:
            print(f"[ERROR] {pcap_path.name}: {scapy_err} ; fallback failed ({py_err})")
            return {"fname": pcap_path.name, "tot_pkts": 0, "tot_bytes": 0, "error": 1}

    feats["fname"] = pcap_path.name
    feats.setdefault("error", 0)
    return feats

# quick sanity


PCAP_FOLDER  = Path("augmented_data/augmented_data/pcap")        # e.g. Path("/home/me/captures")
VIDEO_FOLDER = Path("augmented_data/augmented_data/video")        # e.g. Path("/home/me/cctv")
print("PCAP good files :", len(list_files(PCAP_FOLDER, PCAP_EXTS)))
print("Video candidates:", len(list_files(VIDEO_FOLDER, VIDEO_EXTS)))

df = build_feature_table(PCAP_FOLDER, VIDEO_FOLDER)
# print("Final df shape:", df.shape)
# df[df["pcap_error"] == 1].head()        # ⇠ optional: see which pcaps failed


PCAP good files : 120
Video candidates: 81


PCAP:   0%|          | 0/120 [00:00<?, ?it/s]

VIDEO:   0%|          | 0/81 [00:00<?, ?it/s]

In [78]:
print(">>> ACTUAL SHAPE:", df.shape)      # should print (80, 26)

df.head()                                 # 5 example rows

>>> ACTUAL SHAPE: (201, 26)


Unnamed: 0,fname,pcap_tot_pkts,pcap_tot_bytes,pcap_mean_pkt_len,pcap_std_pkt_len,...,vid_std_entropy,vid_persons,vid_knives,vid_bats,vid_balls
0,291c-delays.pcap,587.0,472874.0,805.577513,457.67332,...,,,,,
1,291c-drops.pcap,568.0,455519.0,801.97007,458.50093,...,,,,,
2,291c-speed1.1.pcap,587.0,472874.0,805.577513,457.67332,...,,,,,
3,A-speed1.1.pcap,281.0,238761.0,849.683274,441.244587,...,,,,,
4,A-speed2.pcap,281.0,238761.0,849.683274,441.244587,...,,,,,


In [86]:
def extract_video_features(video_path: Path,
                           target_frames: int = 60,
                           max_frames: int = 2000,
                           yolo_model=yolo,               # can inject custom model
                           yolo_conf: float | None = None):
    """
    Sample enough frames to hit `target_frames`, but stop at `max_frames`.
    Returns a flat feature dict (all numeric except 'fname').
    """
    if yolo_conf is not None:
        yolo_model.conf = yolo_conf

    cap = cv2.VideoCapture(str(video_path))
    if not cap.isOpened():
        return {"fname": video_path.name, "vid_error": 1}

    fps   = cap.get(cv2.CAP_PROP_FPS) or 30
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) or 1
    skip  = max(total // target_frames, 1)               # adaptive sampling stride

    motion_vals, entropy_vals = [], []
    counts = Counter(); last_gray = None
    processed = 0

    with torch.no_grad():
        for idx in range(total):
            ret, frame = cap.read()
            if not ret:
                break
            if idx % skip:
                continue
            processed += 1
            if processed > max_frames:
                break

            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            if last_gray is not None:
                motion_vals.append(cv2.absdiff(gray, last_gray).mean())
            last_gray = gray

            hist = cv2.calcHist([frame],[0],None,[256],[0,256]).flatten()
            entropy_vals.append(_calc_entropy(hist))

            res = yolo_model(frame, size=640)
            for cls in res.pred[0][:, 5].tolist():
                counts[int(cls)] += 1

    cap.release()

    coco = yolo_model.names
    feats = {
        "fname"           : video_path.name,
        "vid_fps"         : fps,
        "vid_frames_total": total,
        "vid_frames_proc" : processed,
        "vid_mean_motion" : float(np.mean(motion_vals)) if motion_vals else 0,
        "vid_std_motion"  : float(np.std(motion_vals))  if motion_vals else 0,
        "vid_mean_entropy": float(np.mean(entropy_vals)) if entropy_vals else 0,
        "vid_std_entropy" : float(np.std(entropy_vals))  if entropy_vals else 0,
        "vid_persons"     : counts.get(next((k for k,v in coco.items() if v=="person"), -1), 0),
        "vid_knives"      : counts.get(next((k for k,v in coco.items() if v=="knife"),  -1), 0),
        "vid_bats"        : counts.get(next((k for k,v in coco.items() if v=="baseball bat"), -1), 0),
        "vid_balls"       : counts.get(next((k for k,v in coco.items() if v=="sports ball"), -1), 0),
        "vid_error"       : 0,
    }
    return feats


In [97]:
print(df.columns)

Index(['fname', 'pcap_tot_pkts', 'pcap_tot_bytes', 'pcap_mean_pkt_len',
       'pcap_std_pkt_len', 'pcap_duration', 'pcap_pkts_per_sec',
       'pcap_bytes_per_sec', 'pcap_uniq_src_ip', 'pcap_uniq_dst_ip',
       'pcap_proto_TCP', 'pcap_proto_UDP', 'pcap_proto_ICMP', 'pcap_l4_Raw',
       'pcap_l4_HTTP', 'pcap_error', 'vid_fps', 'vid_frames_proc',
       'vid_mean_motion', 'vid_std_motion', 'vid_mean_entropy',
       'vid_std_entropy', 'vid_persons', 'vid_knives', 'vid_bats', 'vid_balls',
       'label'],
      dtype='object')


In [100]:
video_df = df[df['vid_frames_proc'].notnull()]
print(video_df[['vid_frames_proc', 'vid_persons', 'vid_mean_motion']].head())

     vid_frames_proc  vid_persons  vid_mean_motion
120              6.0          5.0        12.119514
121              6.0          5.0        11.025236
122              3.0          3.0         8.958849
123              2.0          2.0        13.239454
124              4.0          4.0         7.380272


In [106]:
import pandas as pd
import numpy as np

# --- 1. Define label column and validate ----------------------------
LABEL_COL = "label"
assert LABEL_COL in df.columns, f"❌ '{LABEL_COL}' column missing"

# --- 2. Identify non-feature/meta columns ---------------------------
# Only drop columns that exist
maybe_non_features = ["fname", "file_type", LABEL_COL]
NON_FEATURES = [col for col in maybe_non_features if col in df.columns]

# --- 3. Create feature matrix and label vector ----------------------
X = df.drop(columns=NON_FEATURES).astype(np.float32)
y = df[LABEL_COL].astype(int)

# Optional: fill missing values (e.g., video features missing for .pcap files)
X = X.fillna(0.0)

# --- 4. Inspect shape -----------------------------------------------
print("✅ Feature matrix shape:", X.shape)
print("✅ Label vector shape   :", y.shape)

✅ Feature matrix shape: (201, 25)
✅ Label vector shape   : (201,)


In [107]:

pcap_only = df[df['vid_frames_proc'].isnull()]
video_only = df[df['vid_frames_proc'].notnull()]

In [115]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_res, y_res)

y_pred = model.predict(X_test)

print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n📊 Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\n📝 Classification Report:")
print(classification_report(y_test, y_pred))


✅ Accuracy: 0.9803921568627451

📊 Confusion Matrix:
[[48  0]
 [ 1  2]]

📝 Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99        48
           1       1.00      0.67      0.80         3

    accuracy                           0.98        51
   macro avg       0.99      0.83      0.89        51
weighted avg       0.98      0.98      0.98        51



In [112]:
import matplotlib.pyplot as plt

importances = model.feature_importances_
feature_names = X.columns
indices = np.argsort(importances)[::-1]

plt.figure(figsize=(10, 6))
plt.title("Top 10 Feature Importances")
plt.bar(range(10), importances[indices[:10]])
plt.xticks(range(10), feature_names[indices[:10]], rotation=45, ha="right")
plt.tight_layout()
plt.show()

In [116]:
# 08-feature-importance.py ───────────────────────────────────────────────────
importances = pd.Series(clf.feature_importances_, index=X.columns)\
               .sort_values(ascending=False)

print("Top-15 influential features:")
display(importances.head(15))

# Optional: visual bar plot
import matplotlib.pyplot as plt
plt.figure()
importances.head(25).plot(kind='barh')
plt.gca().invert_yaxis()
plt.title("Feature importance – RandomForest")
plt.tight_layout()
plt.show()


Top-15 influential features:


vid_mean_entropy      0.114471
pcap_mean_pkt_len     0.100396
pcap_tot_pkts         0.089721
pcap_tot_bytes        0.074945
vid_frames_proc       0.071043
pcap_duration         0.065078
vid_std_entropy       0.059878
vid_mean_motion       0.055707
vid_std_motion        0.054616
vid_persons           0.054607
pcap_bytes_per_sec    0.054310
pcap_l4_Raw           0.053589
pcap_std_pkt_len      0.050567
pcap_pkts_per_sec     0.049447
pcap_uniq_dst_ip      0.022556
dtype: float64

In [117]:
print("Label distribution:")
print(df["label"].value_counts())


Label distribution:
0    191
1     10
Name: label, dtype: int64


In [63]:
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting sklearn-compat<1,>=0.1 (from imbalanced-learn->imblearn)
  Downloading sklearn_compat-0.1.3-py3-none-any.whl.metadata (18 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m238.4/238.4 kB[0m [31m745.3 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hDownloading sklearn_compat-0.1.3-py3-none-any.whl (18 kB)
Installing collected packages: sklearn-compat, imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.13.0 imblearn-0.0 sklearn-compat-0.1.3
[0m

In [66]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.27.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-3.0.2-py3-none-manylinux_2_28_x86_64.whl (253.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.9/253.9 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0mm
[?25hDownloading nvidia_nccl_cu12-2.27.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (322.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m322.4/322.4 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m00:01[0mm00:01[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.27.3 xgboost-3.0.2
[0m

In [95]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

ratio = (y == 0).sum() / (y == 1).sum()          # 76 / 4 = 19

xgb = XGBClassifier(
    n_estimators=800,
    max_depth=3,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    scale_pos_weight = ratio,                     # **key line**
    eval_metric = "aucpr",
    random_state = 42,
    n_jobs = -1
)

skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
auc = cross_val_score(xgb, X, y, cv=skf, scoring="average_precision")
print("PR-AUC per fold:", auc.round(3), " mean =", auc.mean().round(3))


PR-AUC per fold: [      0.091       0.125        0.05       0.083]  mean = 0.087


In [96]:
from sklearn.ensemble import IsolationForest
iso = IsolationForest(contamination=0.05, random_state=42)
iso.fit(X)

df["anomaly"] = -iso.score_samples(X)          # higher = more suspicious
top = df.sort_values("anomaly", ascending=False).head(10)
display(top[["fname", "anomaly"]])

ValueError: Length of values (80) does not match length of index (201)