
# IsolationForest Anomaly Detection — Notebook

This notebook shows how to **train** and **run inference** using your project modules:
- `src/pipelines.py`
- `src/utils.py`
- `src/visualize.py` (optional plots)

> Run this notebook from the **project root** so relative paths resolve (e.g., `configs/config.yaml`, `data/train.csv`).


In [None]:

# (Optional) Install dependencies if needed in your environment
# %pip install -r requirements.txt


In [None]:

import os, sys
from pathlib import Path
import pandas as pd

# Make the local `src` package importable: register the current working
# directory (expected to be the project root) on sys.path exactly once.
project_root = Path.cwd()
_root_entry = str(project_root)
if _root_entry not in sys.path:
    sys.path.append(_root_entry)

from src.pipelines import build_pipeline
from src.utils import load_config, ensure_features, default_meta, save_meta, annotate_with_iforest
from joblib import dump, load


In [None]:

# Load configuration
# The path is relative, so it resolves against the current working directory.
cfg_path = Path("configs/config.yaml")

# Raise explicitly instead of using `assert`: assert statements are removed
# when Python runs with -O, which would silently skip this check.
if not cfg_path.exists():
    raise FileNotFoundError(
        "configs/config.yaml not found. Please create it or adjust the path."
    )

cfg = load_config(cfg_path)
cfg  # last expression of the cell: displays the loaded configuration


In [None]:

# === Train ===
# Read training settings from the config. "features" is required; every other
# key falls back to the default given here.
features = cfg["features"]
contamination = float(cfg.get("contamination", 0.1))
random_state = int(cfg.get("random_state", 42))
imputer_strategy = cfg.get("imputer_strategy", "median")
scaler = cfg.get("scaler", "standard")

train_csv = Path("data/train.csv")
# Explicit raise rather than `assert`: asserts are stripped under `python -O`.
if not train_csv.exists():
    raise FileNotFoundError(
        "data/train.csv not found. Place your training CSV there or change the path."
    )

df_train = pd.read_csv(train_csv)
# Project helper from src/utils.py; presumably rejects a frame that lacks one
# of the configured feature columns — confirm against its implementation.
ensure_features(df_train, features)

pipe = build_pipeline(
    features=features,
    contamination=contamination,
    random_state=random_state,
    imputer_strategy=imputer_strategy,
    scaler=scaler,
)

pipe.fit(df_train)

model_path = Path(cfg.get("model_path", "models/iforest_pipeline.joblib"))
meta_path = Path(cfg.get("meta_path", "models/iforest_meta.json"))
# The two artifacts may be configured into different directories, so make
# sure both parent folders exist before writing.
model_path.parent.mkdir(parents=True, exist_ok=True)
meta_path.parent.mkdir(parents=True, exist_ok=True)

dump(pipe, model_path)
save_meta(default_meta(cfg, model_path), meta_path)

print("✅ Trained and saved:", model_path)
print("📝 Meta saved:", meta_path)


In [None]:

# === Inference ===
# Re-derive the model location and feature list from the config so this cell
# also works in a fresh kernel where the Train cell has not been executed
# (only the "Load configuration" cell is required).
features = cfg["features"]
model_path = Path(cfg.get("model_path", "models/iforest_pipeline.joblib"))
if not model_path.exists():
    raise FileNotFoundError(
        f"{model_path} not found. Run the Train cell first or adjust 'model_path' in the config."
    )

# NOTE: joblib.load unpickles arbitrary Python objects; only load model files
# that you produced yourself or otherwise trust.
pipe_loaded = load(model_path)

test_csv = Path("data/new_data.csv")
# Explicit raise rather than `assert`: asserts are stripped under `python -O`.
if not test_csv.exists():
    raise FileNotFoundError(
        "data/new_data.csv not found. Place your scoring CSV there or change the path."
    )

df_new = pd.read_csv(test_csv)
# Project helper from src/utils.py; presumably rejects a frame that lacks one
# of the configured feature columns — confirm against its implementation.
ensure_features(df_new, features)

df_annot = annotate_with_iforest(df_new, pipe_loaded, features)
df_annot.head()  # last expression of the cell: preview the annotated rows


In [None]:

# === Visualizations (Plotly) ===
import plotly.express as px
from src.visualize import plot_feature_box, plot_pair_scatter, plot_score_hist, plot_scatter_melted

# Each helper is called with show=True and its return value is kept in a
# figN variable so the figure can be reused later in the notebook.

# fig1: combined melted scatter (index vs values), colored by variable,
# with anomalies drawn as symbols.
fig1 = plot_scatter_melted(df_annot, features, show=True)

# fig2: distribution of the scores.
fig2 = plot_score_hist(df_annot, show=True)

# fig3: pairwise scatter of the first two features (e.g., RHOB vs NPHI);
# skipped, leaving fig3 undefined, when fewer than two features exist.
if len(features) > 1:
    fig3 = plot_pair_scatter(df_annot, y=features[0], x=features[1], show=True)

# fig4: box plots over the melted features, drawing all points.
fig4 = plot_feature_box(df_annot, features, points="all", show=True)


In [None]:

# Persist the annotated frame next to the notebook (relative to the current
# working directory), without the index column, and report the absolute path.
out_csv = Path("scored_from_notebook.csv")
df_annot.to_csv(out_csv, index=False)
_written_to = out_csv.resolve()
print("💾 Wrote:", _written_to)



## Run CLI scripts from the notebook (alternative)
If you prefer, you can call the existing scripts directly by uncommenting the commands in the cell below:


In [None]:

# !python scripts/train.py --data data/train.csv --config configs/config.yaml
# !python scripts/inference.py --data data/new_data.csv --config configs/config.yaml --out scored.csv --plot
