# 09. Aggregated HDBSCAN Clustering

HDBSCAN clustering at three aggregation levels:
1. **Buyers** - cluster buyers by procurement behavior patterns
2. **Suppliers** - cluster suppliers by winning patterns (detect cartels)
3. **Buyer-Supplier Pairs** - cluster relationships (detect collusion)

In [None]:
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

from src.data_loader import load_tenders, load_buyers, load_suppliers
from src.detectors import AggregatedHDBSCAN

# Notebook-wide defaults: show up to 50 DataFrame columns and use the
# seaborn whitegrid look for every matplotlib figure.
pd.options.display.max_columns = 50
plt.style.use('seaborn-v0_8-whitegrid')

# Stamp the start of the run; the last cell prints the matching completion time.
print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

## 1. Load Data

In [None]:
# Load the inputs: tenders restricted to 2023 with sample_frac=0.1
# (presumably a 10% sample -- confirm in src.data_loader), plus the full
# buyer and supplier tables.
tenders = load_tenders(years=[2023], sample_frac=0.1)
buyers = load_buyers()
suppliers = load_suppliers()

# Report how many rows each loaded table holds.
for label, frame in (("Tenders", tenders), ("Buyers", buyers), ("Suppliers", suppliers)):
    print(f"{label}: {len(frame):,}")

## 2. Initialize Detector

In [None]:
# Clustering parameters for the single detector instance that the buyer,
# supplier and pair cells below all reuse.
hdbscan_params = {
    "min_cluster_size": 10,
    "min_samples": 5,
    "metric": "euclidean",
}
detector = AggregatedHDBSCAN(**hdbscan_params)

## 3. Cluster Buyers

Find groups of buyers with similar procurement patterns. Outliers are treated as suspicious buyers.

In [None]:
# Run buyer-level clustering on the loaded tenders and buyer table. The result
# is kept in `buyer_results`; the cells below read its 'cluster', 'is_noise',
# 'is_anomaly' and 'outlier_score' columns.
buyer_results = detector.cluster_buyers(tenders, buyers)

In [None]:
# Summarize the buyer clustering: population size, number of clusters, and the
# totals of the 'is_noise' and 'is_anomaly' flag columns.
#
# Noise points carry the label -1 (see the 'Cluster (-1 = noise)' axis label in
# the plots below). Counting distinct non-noise labels is correct whether or
# not any buyer was labelled noise; `nunique() - 1` undercounts by one when
# there is no noise at all.
n_buyer_clusters = buyer_results.loc[buyer_results['cluster'] != -1, 'cluster'].nunique()

print("\nBuyer clustering results:")
print(f"  Total buyers: {len(buyer_results):,}")
print(f"  Clusters: {n_buyer_clusters}")
print(f"  Noise (outliers): {buyer_results['is_noise'].sum():,}")
print(f"  Anomalies: {buyer_results['is_anomaly'].sum():,}")

In [None]:
# Buyers whose outlier score reaches the 0.5 threshold, as returned by the detector.
suspicious_buyers = detector.get_suspicious_buyers(min_score=0.5)
n_suspicious_buyers = len(suspicious_buyers)
print(f"\nSuspicious buyers (score >= 0.5): {n_suspicious_buyers:,}")

# Only show the ranking when there is something to rank.
if n_suspicious_buyers > 0:
    print("\nTop 10 by outlier score:")
    top_buyers = suspicious_buyers.nlargest(10, 'outlier_score')
    display(top_buyers)

In [None]:
# Contrast the mean of a few behavioural metrics between suspicious buyers and
# the rest (outlier score below 0.5). Skipped when nothing was flagged or when
# the single-bidder metric is not available.
if len(suspicious_buyers) > 0 and 'single_bidder_rate' in buyer_results.columns:
    below_threshold = buyer_results['outlier_score'] < 0.5
    normal_buyers = buyer_results[below_threshold]

    print("Comparison: Suspicious vs Normal Buyers")
    print("=" * 50)

    # Only report metrics that the clustering output actually carries.
    available_metrics = [
        metric
        for metric in ('single_bidder_rate', 'competitive_rate', 'total_value')
        if metric in buyer_results.columns
    ]
    for col in available_metrics:
        suspicious_avg = suspicious_buyers[col].mean()
        normal_avg = normal_buyers[col].mean()
        print(f"{col}:")
        print(f"  Suspicious: {suspicious_avg:.3f}")
        print(f"  Normal:     {normal_avg:.3f}")

## 4. Cluster Suppliers

Find groups of suppliers with similar winning patterns. Outliers are treated as potential cartel members or otherwise anomalous suppliers.

In [None]:
# Run supplier-level clustering on the loaded tenders and supplier table. The
# result is kept in `supplier_results`; the cells below read its 'cluster',
# 'is_noise', 'is_anomaly' and per-supplier metric columns.
supplier_results = detector.cluster_suppliers(tenders, suppliers)

In [None]:
# Summarize the supplier clustering: population size, number of clusters, and
# the totals of the 'is_noise' and 'is_anomaly' flag columns.
#
# Noise points carry the label -1 (see the 'Cluster (-1 = noise)' axis label in
# the plots below). Counting distinct non-noise labels is correct whether or
# not any supplier was labelled noise; `nunique() - 1` undercounts by one when
# there is no noise at all.
n_supplier_clusters = supplier_results.loc[supplier_results['cluster'] != -1, 'cluster'].nunique()

print("\nSupplier clustering results:")
print(f"  Total suppliers: {len(supplier_results):,}")
print(f"  Clusters: {n_supplier_clusters}")
print(f"  Noise (outliers): {supplier_results['is_noise'].sum():,}")
print(f"  Anomalies: {supplier_results['is_anomaly'].sum():,}")

In [None]:
# Suppliers whose outlier score reaches the 0.5 threshold, as returned by the detector.
suspicious_suppliers = detector.get_suspicious_suppliers(min_score=0.5)
n_suspicious_suppliers = len(suspicious_suppliers)
print(f"\nSuspicious suppliers (score >= 0.5): {n_suspicious_suppliers:,}")

# Only show the ranking when there is something to rank.
if n_suspicious_suppliers > 0:
    print("\nTop 10 by outlier score:")
    top_suppliers = suspicious_suppliers.nlargest(10, 'outlier_score')
    display(top_suppliers)

In [None]:
# Per-cluster profile of the suppliers: member count plus the mean number of
# awards, mean total value and mean single-bidder rate, rounded to two decimals
# and ordered from the largest cluster to the smallest.
print("\nSupplier Cluster Analysis:")
print("=" * 50)

cluster_stats = (
    supplier_results
    .groupby('cluster')
    .agg(
        count=('supplier_id', 'count'),
        avg_awards=('total_awards', 'mean'),
        avg_value=('total_value', 'mean'),
        single_bidder_rate=('single_bidder_rate', 'mean'),
    )
    .round(2)
    .sort_values('count', ascending=False)
)

# Show only the ten largest clusters.
display(cluster_stats.head(10))

## 5. Cluster Buyer-Supplier Pairs

Find groups of relationships with similar patterns. Outliers are treated as suspicious, potentially collusive relationships.

In [None]:
# Run buyer-supplier pair clustering on the loaded tenders. The result is kept
# in `pair_results` and may be empty, which is why the cells below guard on
# its length.
# NOTE(review): min_contracts=3 presumably drops pairs with fewer than three
# contracts -- confirm against AggregatedHDBSCAN.cluster_pairs.
pair_results = detector.cluster_pairs(tenders, min_contracts=3)

In [None]:
# Summarize the pair clustering (skipped when no pairs were produced):
# population size, number of clusters, and the totals of the 'is_noise' and
# 'is_anomaly' flag columns.
if len(pair_results) > 0:
    # Noise points carry the label -1 (see the 'Cluster (-1 = noise)' axis
    # label in the plots below). Counting distinct non-noise labels is correct
    # whether or not any pair was labelled noise; `nunique() - 1` undercounts
    # by one when there is no noise at all.
    n_pair_clusters = pair_results.loc[pair_results['cluster'] != -1, 'cluster'].nunique()

    print("\nBuyer-Supplier pair clustering results:")
    print(f"  Total pairs: {len(pair_results):,}")
    print(f"  Clusters: {n_pair_clusters}")
    print(f"  Noise (outliers): {pair_results['is_noise'].sum():,}")
    print(f"  Anomalies: {pair_results['is_anomaly'].sum():,}")

In [None]:
# Buyer-supplier pairs whose outlier score reaches the 0.5 threshold
# (potential collusion). Skipped entirely when no pairs were clustered.
if len(pair_results) > 0:
    suspicious_pairs = detector.get_suspicious_pairs(min_score=0.5)
    print(f"\nSuspicious pairs (score >= 0.5): {len(suspicious_pairs):,}")

    if len(suspicious_pairs) > 0:
        print("\nTop 10 suspicious pairs by outlier score:")
        # Columns shown in the ranking, in display order.
        pair_report_cols = [
            'buyer_id', 'supplier_id', 'outlier_score',
            'contracts_count', 'total_value', 'single_bidder_rate',
            'exclusivity_buyer', 'exclusivity_supplier',
        ]
        top_pairs = suspicious_pairs.nlargest(10, 'outlier_score')
        display(top_pairs[pair_report_cols])

In [None]:
# Relationships where either side's exclusivity exceeds 0.5 (potential red
# flags). Skipped entirely when no pairs were clustered.
if len(pair_results) > 0:
    buyer_exclusive = pair_results['exclusivity_buyer'] > 0.5
    supplier_exclusive = pair_results['exclusivity_supplier'] > 0.5
    exclusive_pairs = pair_results[buyer_exclusive | supplier_exclusive]
    print(f"\nHighly exclusive relationships (>50% exclusivity): {len(exclusive_pairs):,}")

    if len(exclusive_pairs) > 0:
        print("\nTop 10 by buyer exclusivity:")
        # Columns shown in the ranking, in display order.
        exclusivity_report_cols = [
            'buyer_id', 'supplier_id', 'contracts_count', 'total_value',
            'exclusivity_buyer', 'exclusivity_supplier', 'single_bidder_rate',
        ]
        most_exclusive = exclusive_pairs.nlargest(10, 'exclusivity_buyer')
        display(most_exclusive[exclusivity_report_cols])

## 6. Summary

In [None]:
# detector.summary() is iterated as a mapping of level name -> summary table;
# print a banner for each level and render its table.
summaries = detector.summary()

rule = "=" * 40
for level, summary_df in summaries.items():
    banner = f"\n{level.upper()} Summary:"
    print(banner)
    print(rule)
    display(summary_df)

## 7. Visualizations

In [None]:
# Outlier score distributions: one histogram per aggregation level, with the
# 0.5 anomaly threshold marked. A level with no results gets a "No data" panel.
from pathlib import Path

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for ax, (name, results) in zip(axes, [
    ('Buyers', detector.buyer_results_),
    ('Suppliers', detector.supplier_results_),
    ('Pairs', detector.pair_results_),
]):
    if results is not None and len(results) > 0:
        results['outlier_score'].hist(bins=30, ax=ax, color='steelblue', edgecolor='white')
        ax.axvline(x=0.5, color='red', linestyle='--', label='Anomaly threshold')
        ax.set_title(f'{name} Outlier Scores')
        ax.set_xlabel('Score')
        ax.set_ylabel('Count')
        ax.legend()
    else:
        # Keep the panel (and its title) so the three-column layout stays aligned.
        ax.text(0.5, 0.5, 'No data', ha='center', va='center')
        ax.set_title(f'{name} Outlier Scores')

plt.tight_layout()
# savefig does not create missing directories; make sure the output folder
# exists so a fresh checkout does not fail here.
Path('../results').mkdir(parents=True, exist_ok=True)
plt.savefig('../results/aggregated_hdbscan_scores.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Cluster sizes: one bar chart per aggregation level showing how many members
# each cluster label has (label -1 is noise). A level with no results gets a
# "No data" panel.
from pathlib import Path

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for ax, (name, results) in zip(axes, [
    ('Buyers', detector.buyer_results_),
    ('Suppliers', detector.supplier_results_),
    ('Pairs', detector.pair_results_),
]):
    if results is not None and len(results) > 0:
        # sort_index orders the bars by cluster label rather than by size.
        cluster_sizes = results['cluster'].value_counts().sort_index()
        cluster_sizes.plot(kind='bar', ax=ax, color='coral')
        ax.set_title(f'{name} Cluster Sizes')
        ax.set_xlabel('Cluster (-1 = noise)')
        ax.set_ylabel('Count')
    else:
        # Keep the panel (and its title) so the three-column layout stays aligned.
        ax.text(0.5, 0.5, 'No data', ha='center', va='center')
        ax.set_title(f'{name} Cluster Sizes')

plt.tight_layout()
# savefig does not create missing directories; make sure the output folder
# exists so a fresh checkout does not fail here.
Path('../results').mkdir(parents=True, exist_ok=True)
plt.savefig('../results/aggregated_hdbscan_clusters.png', dpi=150, bbox_inches='tight')
plt.show()

## 8. Save Results

In [None]:
# Save anomalies from each level: for every level that has results, write the
# entries scoring at least 0.5 to a CSV under ../results/.
from pathlib import Path

# to_csv does not create missing directories; make sure the output folder
# exists so a fresh checkout does not fail on the first write.
Path('../results').mkdir(parents=True, exist_ok=True)

if detector.buyer_results_ is not None:
    suspicious_buyers = detector.get_suspicious_buyers(0.5)
    suspicious_buyers.to_csv('../results/hdbscan_suspicious_buyers.csv', index=False)
    print(f"Saved {len(suspicious_buyers)} suspicious buyers")

if detector.supplier_results_ is not None:
    suspicious_suppliers = detector.get_suspicious_suppliers(0.5)
    suspicious_suppliers.to_csv('../results/hdbscan_suspicious_suppliers.csv', index=False)
    print(f"Saved {len(suspicious_suppliers)} suspicious suppliers")

# Pair results are additionally checked for emptiness before saving.
if detector.pair_results_ is not None and len(detector.pair_results_) > 0:
    suspicious_pairs = detector.get_suspicious_pairs(0.5)
    suspicious_pairs.to_csv('../results/hdbscan_suspicious_pairs.csv', index=False)
    print(f"Saved {len(suspicious_pairs)} suspicious pairs")

# Matches the "Started:" stamp printed by the setup cell.
print(f"\nCompleted: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")