# Ensemble Anomaly Detection

**Мета:** Об'єднати результати всіх методів в єдиний risk score.

**Методи:**
1. Rule-based (44 rules) — процедурні порушення
2. Statistical (Benford, Z-score) — числові аномалії
3. Isolation Forest — глобальні outliers
4. LOF — локальні outliers

**Ensemble підхід:**
- Weighted voting: кожен метод голосує з вагою
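- Consensus: скільки методів позначили (flagged) тендер — рахуються Rule-based, Statistical та Isolation Forest; LOF обчислюється лише на вибірці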
- Final risk score: нормалізована комбінація

**Pipeline:**
1. Rule-based ✓
2. Statistical ✓
3. Isolation Forest ✓
4. LOF ✓
5. **Ensemble** ← current

In [None]:
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

from src.data_loader import load_tenders, load_bids, load_buyers, load_suppliers
from src.detectors.rule_based import RuleBasedDetector
from src.detectors.statistical import StatisticalDetector
from src.detectors.ml_based import IsolationForestDetector, LOFDetector

# ------------------------------------------------------------
# Run configuration
# ------------------------------------------------------------
YEARS = [2022, 2023, 2024, 2025]  # years handed to the tender and bid loaders
CONTAMINATION = 0.05              # contamination passed to both ML detectors
LOF_SAMPLE_SIZE = 500_000         # LOF runs on a random sample of this many tenders
# ------------------------------------------------------------

# Make sure the output folders exist before anything is written to them.
for out_dir in ('../results/figures/ensemble', '../results'):
    Path(out_dir).mkdir(parents=True, exist_ok=True)

# Plot styling shared by the figures produced below.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6), 'font.size': 11})

print(f"Configuration: YEARS={YEARS}, CONTAMINATION={CONTAMINATION}")

## 1. Завантаження даних

In [None]:
# Load every input table; tenders and bids are restricted to the configured years.
print("Loading data...")
tenders = load_tenders(years=YEARS)
bids = load_bids(years=YEARS)
buyers = load_buyers()
suppliers = load_suppliers()

# Row counts as a quick sanity check of what was loaded.
print(
    f"\nDataset:",
    f"  Tenders: {len(tenders):,}",
    f"  Bids: {len(bids):,}",
    f"  Buyers: {len(buyers):,}",
    f"  Suppliers: {len(suppliers):,}",
    sep="\n",
)

## 2. Run All Detectors

In [None]:
# Detector 1: rule-based checks over tenders, buyers and bids.
banner = "="*60
print(banner, "1. RULE-BASED DETECTOR", banner, sep="\n")

rule_detector = RuleBasedDetector()
rule_results = rule_detector.detect(tenders, buyers_df=buyers, bids_df=bids)

# Number of tenders whose rule score is 6 or more.
n_rule_flagged = (rule_results['rule_risk_score'] >= 6).sum()
print(f"\nRule-based anomalies (score >= 6): {n_rule_flagged:,}")

In [None]:
# Detector 2: statistical checks over tenders and bids.
banner = "="*60
print(banner, "2. STATISTICAL DETECTOR", banner, sep="\n")

stat_detector = StatisticalDetector()
stat_results = stat_detector.detect(tenders, bids_df=bids)

# Number of tenders whose statistical score is 3 or more.
n_stat_flagged = (stat_results['stat_score'] >= 3).sum()
print(f"\nStatistical anomalies (score >= 3): {n_stat_flagged:,}")

In [None]:
# Detector 3: Isolation Forest over the extended feature set.
banner = "="*60
print(banner, "3. ISOLATION FOREST", banner, sep="\n")

# Feature names grouped by the table they are keyed under.
tender_features = [
    "tender_value", "price_change_pct", "number_of_tenderers",
    "is_single_bidder", "is_competitive",
    "is_weekend", "is_q4", "is_december",
]
buyer_features = [
    "single_bidder_rate", "competitive_rate",
    "avg_discount_pct", "supplier_diversity_index",
]
supplier_features = ["total_awards", "total_value"]

# Shared with the LOF cell, which passes the same mapping to its detector.
features_extended = {
    "tender": tender_features,
    "buyer": buyer_features,
    "supplier": supplier_features,
}

if_params = dict(
    contamination=CONTAMINATION,
    n_estimators=100,
    random_state=42,
    features=features_extended,
)
if_detector = IsolationForestDetector(**if_params)
if_results = if_detector.fit_detect(tenders, buyers_df=buyers, suppliers_df=suppliers)

n_if_flagged = if_results['if_anomaly'].sum()
print(f"\nIF anomalies: {n_if_flagged:,}")

In [None]:
# 4. LOF (on sample)
print("="*60)
print("4. LOCAL OUTLIER FACTOR (sample)")
print("="*60)

# LOF is fitted on a random sample of the tenders rather than the full set.
# Cap the sample at the number of loaded tenders: DataFrame.sample raises
# ValueError when asked for more rows than exist (sampling without
# replacement), which would break the notebook for a smaller YEARS selection.
lof_sample_n = min(LOF_SAMPLE_SIZE, len(tenders))
tenders_sample = tenders.sample(n=lof_sample_n, random_state=42)
print(f"LOF sample size: {len(tenders_sample):,}")

lof_detector = LOFDetector(
    n_neighbors=20,
    contamination=CONTAMINATION,
    features=features_extended,
)
# Only the sampled tenders are scored here; nothing in this cell extends the
# LOF result to tenders outside the sample.
lof_results = lof_detector.fit_detect(tenders_sample, buyers_df=buyers, suppliers_df=suppliers)
print(f"\nLOF anomalies: {lof_results['lof_anomaly'].sum():,}")

## 3. Merge All Results

In [None]:
# Build one frame per tender that carries every detector's output.
print("Merging results...")

# Tender attributes kept next to the scores.
base_columns = [
    'tender_id', 'tender_value', 'procurement_method', 'year',
    'buyer_id', 'supplier_id', 'is_single_bidder', 'is_competitive',
]
ensemble = tenders[base_columns].copy()

# Detector outputs in merge order: rule-based, statistical, IF, LOF.
# Every join is a left join on tender_id, so a tender that is absent from a
# detector's result (e.g. outside the LOF sample) gets NaN in those columns.
detector_outputs = [
    (rule_results, ['tender_id', 'rule_risk_score', 'rule_risk_level']),
    (stat_results, ['tender_id', 'stat_score']),
    (if_results, ['tender_id', 'if_score', 'if_anomaly']),
    (lof_results, ['tender_id', 'lof_score', 'lof_anomaly']),
]
for detector_frame, wanted in detector_outputs:
    ensemble = ensemble.merge(detector_frame[wanted], on='tender_id', how='left')

n_with_lof = ensemble['lof_score'].notna().sum()
print(f"Ensemble dataset: {len(ensemble):,} tenders")
print(f"With LOF scores: {n_with_lof:,}")

## 4. Normalize Scores

In [None]:
# Normalize all scores to 0-1 range
print("Normalizing scores to 0-1 range...")

# Rule score: scale by the observed maximum.
# Guard the division: when the maximum is 0 (no rule fired) or NaN (no data),
# dividing would yield NaN/inf and poison ensemble_score downstream. In that
# case keep the column at zero (NaN entries stay NaN).
rule_max = ensemble['rule_risk_score'].max()
if rule_max > 0:
    ensemble['rule_score_norm'] = ensemble['rule_risk_score'] / rule_max
else:
    ensemble['rule_score_norm'] = ensemble['rule_risk_score'] * 0.0

# Stat score: same max-scaling with the same guard.
stat_max = ensemble['stat_score'].max()
if stat_max > 0:
    ensemble['stat_score_norm'] = ensemble['stat_score'] / stat_max
else:
    ensemble['stat_score_norm'] = ensemble['stat_score'] * 0.0

# IF score: copied as-is.
# NOTE(review): assumes the detector already emits values in 0-1 — confirm
# against IsolationForestDetector; the printed range below makes this visible.
ensemble['if_score_norm'] = ensemble['if_score']

# LOF score: copied as-is (NaN for tenders without a LOF result).
# NOTE(review): same 0-1 assumption as for IF — confirm against LOFDetector.
ensemble['lof_score_norm'] = ensemble['lof_score']

print(f"\nScore ranges (normalized):")
print(f"  Rule: 0 - {ensemble['rule_score_norm'].max():.2f}")
print(f"  Stat: 0 - {ensemble['stat_score_norm'].max():.2f}")
print(f"  IF:   0 - {ensemble['if_score_norm'].max():.2f}")
print(f"  LOF:  0 - {ensemble['lof_score_norm'].max():.2f}")

## 5. Compute Ensemble Score

In [None]:
# Ensemble scoring: a weighted mean of the normalized scores plus a count of
# how many methods flagged each tender.

# Method 1: Weighted average of the rule, stat and IF scores.
# LOF is not part of this score (it is only merged for the sampled tenders).
WEIGHTS = {
    'rule': 1.0,   # Process violations
    'stat': 0.8,   # Statistical anomalies
    'if': 1.0,     # Global outliers
}

# Dividing by the weight total keeps the result on the same scale as the inputs.
ensemble['ensemble_score'] = (
    ensemble['rule_score_norm'] * WEIGHTS['rule'] +
    ensemble['stat_score_norm'] * WEIGHTS['stat'] +
    ensemble['if_score_norm'] * WEIGHTS['if']
) / sum(WEIGHTS.values())

# Method 2: Consensus count (how many methods flagged)
# Each flag is a 0/1 integer; a missing score compares False and becomes 0.
ensemble['rule_flag'] = (ensemble['rule_risk_score'] >= 6).astype(int)
ensemble['stat_flag'] = (ensemble['stat_score'] >= 3).astype(int)
# Build the IF flag the same way as the other two instead of copying the raw
# column: after the left merge it may hold NaN (or be bool/object typed),
# which would turn consensus_count into NaN for those tenders.
# Assumes 1/True marks an anomaly, as the IF cell sums this column as a count.
ensemble['if_flag'] = (ensemble['if_anomaly'] == 1).astype(int)

ensemble['consensus_count'] = (
    ensemble['rule_flag'] +
    ensemble['stat_flag'] +
    ensemble['if_flag']
)

# With LOF (for sample only): tenders without a LOF result contribute 0.
ensemble['consensus_with_lof'] = ensemble['consensus_count'] + ensemble['lof_anomaly'].fillna(0)

print("Ensemble scores computed.")
print(f"\nWeights: {WEIGHTS}")

In [None]:
# Map a tender's consensus count and ensemble score to a risk label.
def assign_risk_level(row):
    """Return 'critical', 'high', 'medium' or 'low' for one tender row.

    *row* is any mapping-like object with 'consensus_count' and
    'ensemble_score' entries. Three or more votes is 'critical'; below that,
    a level is reached either by its vote count or by its score threshold
    (2 votes or score >= 0.7 for 'high', 1 vote or score >= 0.4 for 'medium').
    """
    votes = row['consensus_count']
    if votes >= 3:
        return 'critical'

    # Checked from most to least severe; the score is only looked up when
    # the vote count alone does not settle the level.
    tiers = (('high', 2, 0.7), ('medium', 1, 0.4))
    for label, vote_count, min_score in tiers:
        if votes == vote_count or row['ensemble_score'] >= min_score:
            return label
    return 'low'

# Label every tender row by row, then report how many fall into each level.
ensemble['ensemble_risk'] = ensemble.apply(assign_risk_level, axis=1)

risk_dist = ensemble['ensemble_risk'].value_counts()
n_tenders = len(ensemble)
print("\nENSEMBLE RISK DISTRIBUTION:")
for risk_level in ('critical', 'high', 'medium', 'low'):
    # A level that no tender received is reported as 0.
    n_rows = risk_dist.get(risk_level, 0)
    share = n_rows / n_tenders * 100
    print(f"  {risk_level:10} {n_rows:>10,} ({share:>5.2f}%)")

## 6. Consensus Analysis

In [None]:
# How many tenders were flagged by each number of methods.
print("CONSENSUS BREAKDOWN (Rule + Stat + IF):")
consensus_dist = ensemble['consensus_count'].value_counts().sort_index()
n_tenders = len(ensemble)
for n_flags, n_rows in consensus_dist.items():
    share = n_rows / n_tenders * 100
    methods = f"{n_flags}/3 methods"
    print(f"  {methods}: {n_rows:>10,} ({share:>5.2f}%)")

# Tenders on which all three methods agree; reused by the cells below.
critical_consensus = ensemble[ensemble['consensus_count'] == 3]
critical_value = critical_consensus['tender_value']
print(f"\nCRITICAL (all 3 methods): {len(critical_consensus):,} tenders")
print(f"  Total value: {critical_value.sum() / 1e9:.2f} B UAH")
print(f"  Mean value: {critical_value.mean() / 1e6:.2f} M UAH")

In [None]:
# Visualize consensus: bar chart of vote counts next to a pie of risk levels.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Consensus distribution
# One colour per consensus level 0..3, in that order. Bar colours are applied
# positionally, so make sure every level has a slot: without the reindex a
# level with no tenders would shift the remaining bars onto the wrong colour.
colors = ['#2ca02c', '#ffbb78', '#ff7f0e', '#d62728']
plot_counts = consensus_dist.reindex([0, 1, 2, 3], fill_value=0)
axes[0].bar(plot_counts.index, plot_counts.values, color=colors)
axes[0].set_xlabel('Number of Methods Flagged')
axes[0].set_ylabel('Number of Tenders')
axes[0].set_title('Consensus Distribution')
axes[0].set_xticks([0, 1, 2, 3])
axes[0].set_xticklabels(['0 (Normal)', '1 Method', '2 Methods', '3 Methods (Critical)'])

# Risk level pie: slices follow risk_order, each with its own fixed colour.
risk_colors = {'critical': '#d62728', 'high': '#ff7f0e', 'medium': '#ffbb78', 'low': '#2ca02c'}
risk_order = ['critical', 'high', 'medium', 'low']
risk_values = [risk_dist.get(r, 0) for r in risk_order]
axes[1].pie(risk_values, labels=risk_order, colors=[risk_colors[r] for r in risk_order],
            autopct='%1.1f%%', startangle=90)
axes[1].set_title('Ensemble Risk Distribution')

# Save before show so the file is written from the finished figure.
plt.tight_layout()
plt.savefig('../results/figures/ensemble/consensus_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

## 7. Method Contribution Analysis

In [None]:
# Per-method flag totals inside the critical set. Every critical tender has
# all three flags set, hence the fixed "(100%)".
print("METHOD FLAGS IN CRITICAL TENDERS (consensus=3):")
print(f"  Rule flagged: {critical_consensus['rule_flag'].sum():,} (100%)")
print(f"  Stat flagged: {critical_consensus['stat_flag'].sum():,} (100%)")
print(f"  IF flagged: {critical_consensus['if_flag'].sum():,} (100%)")

# Boolean masks per method, computed once. "Set" and "not set" are separate
# comparisons (== 1 / == 0) rather than negations of each other.
rule_on, rule_off = ensemble['rule_flag'] == 1, ensemble['rule_flag'] == 0
stat_on, stat_off = ensemble['stat_flag'] == 1, ensemble['stat_flag'] == 0
if_on, if_off = ensemble['if_flag'] == 1, ensemble['if_flag'] == 0

# Exactly one method flagged.
rule_only = (rule_on & stat_off & if_off).sum()
stat_only = (rule_off & stat_on & if_off).sum()
if_only = (rule_off & stat_off & if_on).sum()

# Exactly two methods flagged.
rule_stat = (rule_on & stat_on & if_off).sum()
rule_if = (rule_on & stat_off & if_on).sum()
stat_if = (rule_off & stat_on & if_on).sum()

print("\nMETHOD OVERLAP:")
print(f"  Only Rule: {rule_only:,}")
print(f"  Only Stat: {stat_only:,}")
print(f"  Only IF: {if_only:,}")
print(f"  Rule + Stat: {rule_stat:,}")
print(f"  Rule + IF: {rule_if:,}")
print(f"  Stat + IF: {stat_if:,}")
print(f"  All three: {len(critical_consensus):,}")

In [None]:
# Venn-style bar chart: one bar per combination of methods that flagged a tender.
# Labels map to the overlap counts computed in the previous cell.
overlap_data = {
    'Only Rule': rule_only,
    'Only Stat': stat_only,
    'Only IF': if_only,
    'Rule+Stat': rule_stat,
    'Rule+IF': rule_if,
    'Stat+IF': stat_if,
    'All Three': len(critical_consensus),
}

fig, ax = plt.subplots(figsize=(12, 5))
# One colour per bar, in the same order as the overlap_data entries.
colors = ['#1f77b4', '#2ca02c', '#ff7f0e', '#9467bd', '#8c564b', '#e377c2', '#d62728']
bars = ax.bar(overlap_data.keys(), overlap_data.values(), color=colors)
ax.set_ylabel('Number of Tenders')
ax.set_title('Method Overlap Analysis')
plt.xticks(rotation=45, ha='right')

# Add value labels: the count is written centred just above each bar.
for bar in bars:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{int(height):,}', ha='center', va='bottom', fontsize=9)

# Write the figure to disk before displaying it.
plt.tight_layout()
plt.savefig('../results/figures/ensemble/method_overlap.png', dpi=150, bbox_inches='tight')
plt.show()

## 8. Critical Tenders Analysis

In [None]:
# Side-by-side comparison of critical tenders and tenders no method flagged.
banner = "="*60
print(banner, "CRITICAL TENDERS CHARACTERISTICS", banner, sep="\n")

# "Normal" means zero votes; reused by the procurement-method cell.
normal = ensemble[ensemble['consensus_count'] == 0]

print(f"\n{'Metric':<30} {'Critical':>15} {'Normal':>15}")
print("-"*60)

# Tender value: mean in millions, median in thousands.
crit_values = critical_consensus['tender_value']
norm_values = normal['tender_value']
print(f"{'Mean tender value (M UAH)':<30} {crit_values.mean()/1e6:>15,.2f} {norm_values.mean()/1e6:>15,.2f}")
print(f"{'Median tender value (K UAH)':<30} {crit_values.median()/1e3:>15,.1f} {norm_values.median()/1e3:>15,.1f}")

# Competition indicators: column means expressed as percentages.
crit_single = critical_consensus['is_single_bidder'].mean()
norm_single = normal['is_single_bidder'].mean()
print(f"{'Single bidder rate (%)':<30} {crit_single*100:>15.1f} {norm_single*100:>15.1f}")
crit_competitive = critical_consensus['is_competitive'].mean()
norm_competitive = normal['is_competitive'].mean()
print(f"{'Competitive rate (%)':<30} {crit_competitive*100:>15.1f} {norm_competitive*100:>15.1f}")

# Raw (un-normalized) detector scores.
crit_rule, norm_rule = critical_consensus['rule_risk_score'].mean(), normal['rule_risk_score'].mean()
print(f"{'Mean rule score':<30} {crit_rule:>15.1f} {norm_rule:>15.1f}")
crit_stat, norm_stat = critical_consensus['stat_score'].mean(), normal['stat_score'].mean()
print(f"{'Mean stat score':<30} {crit_stat:>15.1f} {norm_stat:>15.1f}")
crit_if, norm_if = critical_consensus['if_score'].mean(), normal['if_score'].mean()
print(f"{'Mean IF score':<30} {crit_if:>15.3f} {norm_if:>15.3f}")

In [None]:
# Share of each procurement method among critical vs normal tenders (in %).
print("\nPROCUREMENT METHOD:")
critical_method = critical_consensus['procurement_method'].value_counts(normalize=True) * 100
normal_method = normal['procurement_method'].value_counts(normalize=True) * 100

# Only methods that occur among critical tenders are listed.
for method, c_pct in critical_method.items():
    n_pct = normal_method.get(method, 0)
    # The ratio is reported as 0 when the method never occurs among normal tenders.
    if n_pct > 0:
        ratio = c_pct / n_pct
    else:
        ratio = 0
    print(f"  {method}: {c_pct:.1f}% (vs {n_pct:.1f}% normal) - {ratio:.1f}x")

In [None]:
# Year distribution: critical tenders per year and their share of that year's tenders.
print("\nYEAR DISTRIBUTION:")
critical_year = critical_consensus['year'].value_counts().sort_index()
# Count tenders per year once, instead of re-filtering the whole frame on
# every loop iteration. Every year in critical_year is present here because
# critical_consensus is a subset of ensemble.
tenders_per_year = ensemble['year'].value_counts()
for year, count in critical_year.items():
    total_year = tenders_per_year[year]
    pct = count / total_year * 100
    print(f"  {year}: {count:,} ({pct:.2f}% of year)")

## 9. Top Risky Entities

In [None]:
# Ten buyers with the most critical tenders.
print("TOP 10 BUYERS BY CRITICAL TENDERS:")

# Per buyer: number of critical tenders, their total value, mean ensemble score.
buyer_stats = critical_consensus.groupby('buyer_id').agg({
    'tender_id': 'count',
    'tender_value': 'sum',
    'ensemble_score': 'mean'
})
top_buyers_critical = buyer_stats.sort_values('tender_id', ascending=False).head(10)

# Attach the buyer's name and region; buyers missing from the lookup keep NaN.
buyer_labels = buyers[['buyer_id', 'buyer_name', 'buyer_region']]
top_buyers_critical = top_buyers_critical.reset_index().merge(
    buyer_labels, on='buyer_id', how='left'
)

for _, row in top_buyers_critical.iterrows():
    # Names are cut to 50 characters; a missing name is shown as N/A.
    if pd.notna(row['buyer_name']):
        name = str(row['buyer_name'])[:50]
    else:
        name = 'N/A'
    print(f"  {row['tender_id']:>5,} tenders | {row['tender_value']/1e6:>10,.1f}M UAH | {name}")

In [None]:
# Ten suppliers with the most critical tenders.
print("\nTOP 10 SUPPLIERS BY CRITICAL TENDERS:")

# Per supplier: number of critical tenders, their total value, mean ensemble score.
supplier_stats = critical_consensus.groupby('supplier_id').agg({
    'tender_id': 'count',
    'tender_value': 'sum',
    'ensemble_score': 'mean'
})
top_suppliers_critical = supplier_stats.sort_values('tender_id', ascending=False).head(10)

# Attach the supplier's name; suppliers missing from the lookup keep NaN.
supplier_labels = suppliers[['supplier_id', 'supplier_name']]
top_suppliers_critical = top_suppliers_critical.reset_index().merge(
    supplier_labels, on='supplier_id', how='left'
)

for _, row in top_suppliers_critical.iterrows():
    # Names are cut to 50 characters; a missing name is shown as N/A.
    if pd.notna(row['supplier_name']):
        name = str(row['supplier_name'])[:50]
    else:
        name = 'N/A'
    print(f"  {row['tender_id']:>5,} tenders | {row['tender_value']/1e6:>10,.1f}M UAH | {name}")

## 10. Save Results

In [None]:
# Columns written to both CSV files: identifiers, raw detector scores,
# then the ensemble outputs.
output_cols = (
    ['tender_id', 'tender_value', 'procurement_method', 'year', 'buyer_id', 'supplier_id']
    + ['rule_risk_score', 'stat_score', 'if_score', 'lof_score']
    + ['ensemble_score', 'consensus_count', 'ensemble_risk']
)

# Every tender.
full_export = ensemble[output_cols]
full_export.to_csv('../results/ensemble_results.csv', index=False)
print(f"Saved full results: results/ensemble_results.csv ({len(ensemble):,} rows)")

# Only tenders flagged by all three methods.
critical_export = critical_consensus[output_cols]
critical_export.to_csv('../results/critical_tenders.csv', index=False)
print(f"Saved critical tenders: results/critical_tenders.csv ({len(critical_consensus):,} rows)")

## 11. Summary

In [None]:
# Final recap of everything computed above.
banner = "="*60
print(banner, "ENSEMBLE SUMMARY", banner, sep="\n")

n_tenders = len(ensemble)
first_year, last_year = YEARS[0], YEARS[-1]
print(f"\nDataset: {n_tenders:,} tenders ({first_year}-{last_year})")

# Flag totals per method; the LOF figure comes from the LOF result itself.
print(
    f"\nMethods combined:",
    f"  1. Rule-based: {ensemble['rule_flag'].sum():,} high-risk",
    f"  2. Statistical: {ensemble['stat_flag'].sum():,} high-risk",
    f"  3. Isolation Forest: {ensemble['if_flag'].sum():,} anomalies",
    f"  4. LOF (sample): {lof_results['lof_anomaly'].sum():,} anomalies",
    sep="\n",
)

# Count, share and total tender value (billions) per risk level.
print(f"\nENSEMBLE RISK LEVELS:")
for risk_level in ('critical', 'high', 'medium', 'low'):
    n_rows = risk_dist.get(risk_level, 0)
    share = n_rows / n_tenders * 100
    in_level = ensemble['ensemble_risk'] == risk_level
    value = ensemble.loc[in_level, 'tender_value'].sum() / 1e9
    print(f"  {risk_level:10} {n_rows:>10,} ({share:>5.2f}%) - {value:>8.2f}B UAH")

critical_value = critical_consensus['tender_value']
print(
    f"\nCRITICAL TENDERS (all 3 methods agree):",
    f"  Count: {len(critical_consensus):,}",
    f"  Total value: {critical_value.sum()/1e9:.2f}B UAH",
    f"  Mean value: {critical_value.mean()/1e6:.2f}M UAH",
    f"  Single bidder rate: {critical_consensus['is_single_bidder'].mean()*100:.1f}%",
    sep="\n",
)

print("\n" + banner)

## Висновки

### Ensemble підхід:
- **Consensus voting**: кількість методів, що flagged тендер
- **Weighted score**: зважена комбінація нормалізованих scores
- **Risk levels**: critical (3/3), high (2/3 або ensemble score ≥ 0.7), medium (1/3 або ensemble score ≥ 0.4), low (решта)

### Ключові результати:
- **Critical** (всі 3 методи згодні) — найвища впевненість для аудиту
- Методи доповнюють один одного (низька кореляція)
- Різні типи аномалій покриваються різними методами

### Збережені файли:
- `results/ensemble_results.csv` — всі тендери з scores
- `results/critical_tenders.csv` — лише critical (consensus=3)

### Наступні кроки:
- Детальний аналіз top buyers/suppliers
- DBSCAN для виявлення картелів
- Звіт для thesis