# Step 5 — First model (interpretable baseline)

This notebook trains a simple model on aggregated anomaly features.

Inputs:
- `data/features/anomaly_features.parquet`
- `data/labels/segment_labels.csv` with columns: `segment_id`, `label` (1=break, 0=no break)

Output:
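- `models/leak_risk_model.pkl` — a joblib-serialized dict with keys `model` (the fitted classifier) and `feature_cols` (the list of input column names)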

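Metrics:
- ROC AUC on the held-out 20% test split
- Recall@K (default K=5%): the share of all test-set breaks that fall within the top K% of segments ranked by predicted risk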


In [None]:
from pathlib import Path

try:
    import pandas as pd
    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import roc_auc_score
    import joblib
except ImportError as exc:
    raise SystemExit('Missing deps. Run: pip install pandas pyarrow scikit-learn joblib') from exc


In [None]:
# Load features
features_path = Path('data/features/anomaly_features.parquet')
if not features_path.exists():
    raise SystemExit('Missing features. Run Step 4 to generate anomaly_features.parquet.')

features = pd.read_parquet(features_path)
if 'segment_id' not in features.columns:
    raise SystemExit('Features file has no segment_id column; cannot join labels.')

# Load labels
labels_path = Path('data/labels/segment_labels.csv')
if not labels_path.exists():
    raise SystemExit('Missing labels. Create data/labels/segment_labels.csv with segment_id,label.')

labels = pd.read_csv(labels_path)

# Fail with a readable message instead of a KeyError / cast error further down.
missing_label_cols = {'segment_id', 'label'} - set(labels.columns)
if missing_label_cols:
    raise SystemExit(f'Labels file is missing required columns: {sorted(missing_label_cols)}')
if labels['label'].isna().any():
    raise SystemExit('Labels file contains empty label values. Every row needs label 0 or 1.')

# Merge only the join key and the target: any other column in the labels file
# would otherwise be carried into df and picked up as a model feature.
# Identical repeated rows are collapsed so a segment's features are not
# duplicated by the merge.
label_table = labels[['segment_id', 'label']].drop_duplicates()
if label_table['segment_id'].duplicated().any():
    raise SystemExit('Conflicting labels found for the same segment_id. Fix segment_labels.csv.')

df = features.merge(label_table, on='segment_id', how='inner')
if df.empty:
    raise SystemExit('No labeled rows after merge. Check segment_id alignment.')

df['label'] = df['label'].astype(int)
df.head()


In [None]:
# Feature columns: every column of df that is not an identifier, a date or the target
drop_cols = {'label', 'break_date', 'segment_id', 'break_id'}
feature_cols = df.columns[~df.columns.isin(drop_cols)].tolist()

X, y = df[feature_cols], df['label']

# Hold out 20% of the rows for evaluation, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,  # same label proportions in the train and test partitions
    test_size=0.2,
    random_state=42,
)


In [None]:
# Train baseline model: a depth-limited random forest with a fixed seed,
# constructed and fitted on the training partition in one expression.
model = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
    min_samples_leaf=5,
    max_depth=8,
    n_estimators=300,
).fit(X_train, y_train)

# Second predict_proba column is used as the break score
# (assumes both labels 0 and 1 are present in y_train — TODO confirm).
proba = model.predict_proba(X_test)[:, 1]

# Ranking quality on the held-out rows
auc = roc_auc_score(y_test, proba)
print('ROC AUC:', round(auc, 4))


In [None]:
# Recall@K
def recall_at_k(y_true, y_score, k_frac=0.05):
    """Fraction of all positives that fall in the top ``k_frac`` of rows by score.

    Rows are ranked by ``y_score`` from highest to lowest and the first
    ``k = max(1, floor(n * k_frac))`` rows are inspected, so at least one row
    is always inspected.

    Args:
        y_true: 1-D array-like of labels. Positives are counted by summing,
            so they are expected to be encoded as 1 and negatives as 0.
        y_score: 1-D array-like of scores, same length as ``y_true``.
        k_frac: Share of rows to inspect (0.05 means the top 5%).

    Returns:
        float: positives found in the top k divided by the total number of
        positives. The denominator is clamped to 1, so the result is 0.0
        when ``y_true`` contains no positives.

    Raises:
        ValueError: if ``y_true`` and ``y_score`` differ in length.
    """
    labels = np.asarray(y_true)
    scores = np.asarray(y_score)
    if len(labels) != len(scores):
        raise ValueError(
            f'y_true and y_score must have the same length, got {len(labels)} and {len(scores)}'
        )

    n = len(labels)
    # Round before truncating: a product such as 100 * 0.29 evaluates to
    # 28.999999999999996 in floating point, and a bare int() would drop a row.
    k = max(1, int(round(n * k_frac, 9)))

    order = np.argsort(scores)[::-1]
    topk = labels[order[:k]]
    return float(topk.sum() / max(1, labels.sum()))

# Recall among the top 5% of test rows ranked by predicted score
r5 = recall_at_k(y_score=proba, y_true=y_test.values, k_frac=0.05)
print('Recall@5%:', round(r5, 4))


In [None]:
# Save model: persist the estimator together with its input column list as one bundle
out_path = Path('models/leak_risk_model.pkl')
out_dir = out_path.parent
out_dir.mkdir(exist_ok=True, parents=True)  # create models/ on first run

bundle = {'model': model, 'feature_cols': feature_cols}
joblib.dump(bundle, out_path)

print(f'Wrote {out_path}')
