# Rule-Based Anomaly Detection (Baseline)

**Мета:** Застосувати експертні правила для виявлення аномалій у закупівлях.

Цей notebook є **baseline** для порівняння з ML-методами.

## Рівень 1: Red Flags

| Правило | Серйозність | Опис |
|---------|-------------|------|
| Переплата | КРИТИЧНА | award > tender value |
| Single bidder + низька знижка | КРИТИЧНА | Один учасник і знижка < 2% |
| Open з одним учасником | ВИСОКА | Конкурентний тендер без конкуренції |
| Публікація у вихідний | СЕРЕДНЯ | Знижена видимість |
| Q4 rush | СЕРЕДНЯ | Тиск бюджетного року |

In [None]:
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.data_loader import (
    load_tenders, load_open_tenders, load_buyers,
    memory_usage, data_overview
)
from src.detectors.rule_based import (
    RuleBasedDetector, detect_contract_splitting, detect_buyer_supplier_pairs
)
from src.config import Thresholds

# Notebook-wide display defaults: plot theme, default figure size, and the
# number of DataFrame columns pandas shows before truncating the output.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})
pd.options.display.max_columns = 50

## 1. Data Overview

In [None]:
# Print one "<name>: <size>" line per entry returned by data_overview().
# NOTE(review): the original comment says this works without loading the
# data itself -- confirm against src.data_loader.
overview = data_overview()
overview_lines = [f"{dataset}: {footprint}" for dataset, footprint in overview.items()]
for line in overview_lines:
    print(line)

## 2. Load Data

Для швидкого тестування почнемо з одного року.

In [None]:
# Load the 2023 tenders only (the original note puts this at roughly 3.6M
# records) and report what memory_usage() says about the loaded frame.
tenders = load_tenders(years=2023)
tenders_memory = memory_usage(tenders)
print(f"\nMemory usage: {tenders_memory}")

In [None]:
# Count tenders per procurement method and print each count together with
# its percentage of all loaded tenders.
method_dist = tenders['procurement_method'].value_counts()
total_tenders = len(tenders)

print("Procurement methods:")
for method, count in method_dist.items():
    share_pct = count / total_tenders * 100
    print(f"  {method}: {count:,} ({share_pct:.1f}%)")

## 3. Apply Rule-Based Detection

In [None]:
# Build the rule-based detector with its default configuration and run it
# over the loaded tenders; `results` is reused by the later analysis cells.
detector = RuleBasedDetector()
results = detector.detect(tenders)

processed_count = len(results)
print(f"\nProcessed {processed_count:,} tenders")

In [None]:
# Fetch the detector's flag summary table and print it without the index
# column. `summary` is reused by the plotting and export cells.
summary = detector.summary()
summary_text = summary.to_string(index=False)
print("\n=== RED FLAGS SUMMARY ===", summary_text, sep="\n")

In [None]:
# Fetch the detector's risk-level table and print it without the index
# column. `risk_dist` is reused by the plotting cell.
risk_dist = detector.risk_distribution()
risk_text = risk_dist.to_string(index=False)
print("\n=== RISK LEVELS ===")
print(risk_text)

In [None]:
# Visualize flags: per-flag counts as horizontal bars (left) and the share
# of each risk level as a pie chart (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Severity/level name -> color. Used by both charts so that a given level is
# drawn in the same color on each of them.
colors = {'critical': 'red', 'high': 'orange', 'medium': 'yellow', 'low': 'green'}

# Flags by count (ascending sort puts the most frequent flag at the top of
# the horizontal bar chart); unknown severities are drawn in gray.
summary_sorted = summary.sort_values('count', ascending=True)
bar_colors = [colors.get(s, 'gray') for s in summary_sorted['severity']]

axes[0].barh(summary_sorted['flag'], summary_sorted['count'], color=bar_colors)
axes[0].set_xlabel('Count')
axes[0].set_title('Red Flags by Count')

# Risk level pie.
# Pick each wedge color from its own label instead of relying on the row
# order of risk_dist: a positional palette silently mislabels levels when
# rows are ordered differently or a level is absent.
risk_labels = [str(level).lower() for level in risk_dist['risk_level']]
if risk_labels and all(label in colors for label in risk_labels):
    risk_colors = [colors[label] for label in risk_labels]
else:
    # Level names are not the ones in `colors` (or there is nothing to
    # plot): keep the fixed low -> critical palette as before.
    risk_colors = ['green', 'yellow', 'orange', 'red']

axes[1].pie(risk_dist['count'], labels=risk_dist['risk_level'],
            colors=risk_colors, autopct='%1.1f%%')
axes[1].set_title('Risk Level Distribution')

plt.tight_layout()
plt.show()

## 4. High-Risk Tenders Analysis

In [None]:
# Select tenders whose rule-based score is at least 4 and report how large
# a share of all processed tenders they make up.
high_risk = detector.get_high_risk(min_score=4)
high_risk_pct = len(high_risk) / len(results) * 100
print(f"High-risk tenders: {len(high_risk):,} ({high_risk_pct:.2f}%)")

In [None]:
# Show the first ten high-risk tenders: identifying/value columns plus at
# most five of the `flag_*` indicator columns. Skipped when nothing was
# flagged as high-risk.
if len(high_risk):
    base_cols = ['tender_id', 'procurement_method', 'tender_value',
                 'price_change_pct', 'rule_risk_score']
    flag_cols = [col for col in high_risk.columns if col.startswith('flag_')]
    display_cols = base_cols + flag_cols[:5]
    print("\nExample high-risk tenders:")
    preview = high_risk[display_cols].head(10)
    print(preview.to_string())

In [None]:
# Count high-risk tenders per procurement method. Skipped when nothing was
# flagged as high-risk.
if len(high_risk):
    high_risk_by_method = high_risk.groupby('procurement_method').size()
    print("\nHigh-risk by procurement method:", high_risk_by_method, sep="\n")

## 5. Contract Splitting Detection

In [None]:
# Look for potential contract splitting with a 50,000 threshold and a
# 7-day window.
# NOTE(review): per the original author, 50K UAH is a common threshold for
# simplified procurement -- confirm against current regulation.
splitting = detect_contract_splitting(
    tenders,
    threshold=50000,
    time_window_days=7,
)

print(f"Potential contract splitting cases: {len(splitting):,}")
if len(splitting):
    top_cases = splitting.head(10)
    print("\nTop cases:")
    print(top_cases.to_string())

## 6. Buyer-Supplier Pairs (Favoritism)

In [None]:
# Detect suspicious buyer-supplier relationships. Per the original notes the
# cut-offs mean: at least 10 contracts for the pair, and the supplier wins
# 80%+ of the buyer's contracts.
suspicious_pairs = detect_buyer_supplier_pairs(tenders, min_contracts=10, min_share=0.8)

print(f"Suspicious buyer-supplier pairs: {len(suspicious_pairs):,}")
if len(suspicious_pairs):
    pair_cols = ['buyer_id', 'supplier_id', 'pair_contracts',
                 'contract_share', 'pair_value']
    print("\nTop pairs by contract count:")
    top_pairs = suspicious_pairs[pair_cols].head(10)
    print(top_pairs.to_string())

## 7. Open Tenders Analysis (Bid Rigging Focus)

In [None]:
# Restrict the detection results to tenders whose procurement method is
# 'open' (the original notes call these the most interesting for fraud
# detection).
is_open = results['procurement_method'] == 'open'
open_tenders = results[is_open]
print(f"Open tenders: {len(open_tenders):,}")

# Share of open tenders with a single bidder, as a percentage (the original
# note expects this to be low).
single_bidder_share = open_tenders['is_single_bidder'].mean()
single_bidder_open = single_bidder_share * 100
print(f"Single bidder rate in Open: {single_bidder_open:.1f}%")

# How many open tenders fall into each rule-based risk level.
open_risk = open_tenders['rule_risk_level'].value_counts()
print("\nRisk levels in Open tenders:", open_risk, sep="\n")

## 8. Save Results

In [None]:
# Save the high-risk tenders and the flag summary as CSV files for further
# analysis.
from pathlib import Path  # local import keeps this cell self-contained

output_path = '../results/anomalies/rule_based_2023.csv'
summary_path = '../results/anomalies/rule_based_summary_2023.csv'

# DataFrame.to_csv does not create missing parent directories and raises
# OSError instead, so make sure the output folders exist first (e.g. on a
# fresh checkout where results/ has not been created yet).
for target in (output_path, summary_path):
    Path(target).parent.mkdir(parents=True, exist_ok=True)

high_risk.to_csv(output_path, index=False)
print(f"Saved {len(high_risk):,} high-risk tenders to {output_path}")

# Save summary
summary.to_csv(summary_path, index=False)
print("Saved summary")

## Summary

### Baseline Results (2023)

| Metric | Value |
|--------|-------|
| Total tenders | - |
| High-risk (score >= 4) | - |
| Critical flags | - |

### Next Steps

1. **Statistical Screens** - коефіцієнт варіації (CV) та RDNOR для Open-тендерів, що мають дані про bids
2. **Isolation Forest** - ML baseline
3. **LOF** - локальні аномалії по CPV/регіону
4. **Ensemble** - об'єднання всіх методів