In [2]:
#!/usr/bin/env python
# coding: utf-8

import json
import csv
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score
import os
from collections import defaultdict
import pandas as pd


def load_ground_truth(json_file):
    """Load the ground-truth floor label of every access point from a JSON file.

    The file must hold a single JSON object. Its keys are access-point IDs
    written as integer strings; its values are the floor labels, returned
    unchanged.

    Args:
        json_file: Path of the ground-truth JSON file.

    Returns:
        dict mapping each access-point ID (converted to ``int``) to its
        floor label.

    Raises:
        ValueError: If a key cannot be converted to ``int``.
    """
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(json_file, 'r', encoding='utf-8') as f:
        gt_data = json.load(f)
    return {int(k): v for k, v in gt_data.items()}


def load_clustering_result(csv_file):
    """Load clustering results from a CSV file that lists one cluster per row.

    Every cell that consists only of decimal digits (after stripping
    whitespace) is read as an access-point ID; all other cells are ignored.
    Rows that contain no ID at all are skipped, so cluster IDs are the
    0-based positions among the non-empty rows.

    Args:
        csv_file: Path of the CSV file.

    Returns:
        A tuple ``(ap_to_cluster, clusters)``: ``clusters`` is a list of
        lists of access-point IDs, and ``ap_to_cluster`` maps each ID to the
        index of its cluster. If an ID occurs in several rows, the last
        occurrence wins.
    """
    clusters = []
    # The csv module expects files opened with newline='' so that it can
    # handle line endings (including ones inside quoted cells) itself.
    with open(csv_file, 'r', newline='') as f:
        for row in csv.reader(f):
            cells = (item.strip() for item in row)
            # isdecimal() (unlike isdigit()) is true only for characters that
            # int() can parse, so cells such as "²" are skipped, not fatal.
            cluster = [int(cell) for cell in cells if cell.isdecimal()]
            if cluster:
                clusters.append(cluster)

    ap_to_cluster = {
        ap: cluster_id
        for cluster_id, aps in enumerate(clusters)
        for ap in aps
    }
    return ap_to_cluster, clusters


def map_clusters_to_floors(ap_to_floor, ap_to_cluster):
    """Label every cluster with the floor that most of its access points are on.

    Only access points present in both mappings are counted. Clusters with
    no such access point do not appear in the result.

    Args:
        ap_to_floor: dict mapping access-point ID to ground-truth floor.
        ap_to_cluster: dict mapping access-point ID to cluster ID.

    Returns:
        dict mapping cluster ID to its majority floor.
    """
    tallies = {}
    shared = set(ap_to_floor.keys()) & set(ap_to_cluster.keys())
    for ap_id in shared:
        per_floor = tallies.setdefault(ap_to_cluster[ap_id], {})
        floor_label = ap_to_floor[ap_id]
        per_floor[floor_label] = per_floor.get(floor_label, 0) + 1

    # max() keeps the first floor encountered when several share the top count.
    return {
        cluster_id: max(per_floor, key=per_floor.get)
        for cluster_id, per_floor in tallies.items()
    }


def create_true_pred_arrays(ap_to_floor, ap_to_cluster, cluster_to_floor):
    """Build aligned label arrays for the access points known to both mappings.

    Args:
        ap_to_floor: dict mapping access-point ID to ground-truth floor.
        ap_to_cluster: dict mapping access-point ID to cluster ID.
        cluster_to_floor: dict mapping cluster ID to its assigned floor.

    Returns:
        A tuple ``(common_aps, y_true, y_pred_raw, y_pred_mapped)``:
        the sorted list of shared access-point IDs, followed by NumPy arrays
        (in that same order) of the true floors, the raw cluster IDs and the
        floors assigned to those clusters.
    """
    shared_ids = sorted(set(ap_to_floor.keys()) & set(ap_to_cluster.keys()))

    true_floors, cluster_ids, mapped_floors = [], [], []
    for ap_id in shared_ids:
        cluster_id = ap_to_cluster[ap_id]
        true_floors.append(ap_to_floor[ap_id])
        cluster_ids.append(cluster_id)
        mapped_floors.append(cluster_to_floor[cluster_id])

    return (
        shared_ids,
        np.array(true_floors),
        np.array(cluster_ids),
        np.array(mapped_floors),
    )


def calculate_cluster_quality_metrics(clusters, ap_to_floor):
    """Compute size and purity statistics that describe a clustering.

    Args:
        clusters: List of clusters, each a list of access-point IDs.
        ap_to_floor: dict mapping access-point ID to ground-truth floor.

    Returns:
        dict with the keys

        * ``num_clusters``: number of clusters.
        * ``num_unique_floors``: number of distinct floors in ``ap_to_floor``.
        * ``total_aps``: sum of all cluster sizes (an ID listed in several
          clusters is counted each time).
        * ``avg_cluster_size`` / ``std_cluster_size``: mean and population
          standard deviation of the cluster sizes (0.0 without clusters).
        * ``min_cluster_size`` / ``max_cluster_size``: 0 without clusters.
        * ``singleton_clusters`` / ``singleton_ratio``: count and share of
          clusters that contain exactly one access point.
        * ``avg_cluster_purity``: mean, over clusters with at least one
          access point found in ``ap_to_floor``, of the share of those
          access points that lie on the cluster's most frequent floor.
        * ``cluster_floor_ratio``: ``num_clusters / num_unique_floors``
          (``inf`` when there is no floor).
    """
    cluster_sizes = [len(cluster) for cluster in clusters]
    num_clusters = len(cluster_sizes)
    total_aps = sum(cluster_sizes)
    num_unique_floors = len(set(ap_to_floor.values()))

    # np.mean / np.std of an empty list yield nan plus a RuntimeWarning;
    # report 0.0 instead, in line with the other empty-input guards below.
    avg_cluster_size = float(np.mean(cluster_sizes)) if cluster_sizes else 0.0
    std_cluster_size = float(np.std(cluster_sizes)) if cluster_sizes else 0.0

    singleton_clusters = cluster_sizes.count(1)
    singleton_ratio = singleton_clusters / num_clusters if num_clusters > 0 else 0

    cluster_purities = []
    for cluster in clusters:
        floor_counts = defaultdict(int)
        for ap in cluster:
            if ap in ap_to_floor:
                floor_counts[ap_to_floor[ap]] += 1
        # Clusters without any access point in the ground truth have no
        # defined purity and are left out of the average.
        if floor_counts:
            valid_count = sum(floor_counts.values())
            cluster_purities.append(max(floor_counts.values()) / valid_count)

    avg_purity = float(np.mean(cluster_purities)) if cluster_purities else 0
    cluster_floor_ratio = num_clusters / num_unique_floors if num_unique_floors > 0 else float('inf')

    return {
        'num_clusters': num_clusters,
        'num_unique_floors': num_unique_floors,
        'total_aps': total_aps,
        'avg_cluster_size': avg_cluster_size,
        'std_cluster_size': std_cluster_size,
        'singleton_clusters': singleton_clusters,
        'singleton_ratio': singleton_ratio,
        'avg_cluster_purity': avg_purity,
        'cluster_floor_ratio': cluster_floor_ratio,
        'min_cluster_size': min(cluster_sizes) if cluster_sizes else 0,
        'max_cluster_size': max(cluster_sizes) if cluster_sizes else 0
    }


def evaluate_clustering(y_true, y_pred, prefix=""):
    """Score predicted labels against the true labels.

    ARI, NMI, homogeneity, completeness and V-measure are always computed.
    Accuracy and weighted precision / recall / F1 are added only when
    ``prefix`` is exactly ``"mapped_"``.

    Args:
        y_true: True labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        prefix: String prepended to every key of the result.

    Returns:
        dict mapping ``prefix + metric name`` to the metric value.
    """
    clustering_scorers = (
        ("ari", adjusted_rand_score),
        ("nmi", normalized_mutual_info_score),
        ("homogeneity", homogeneity_score),
        ("completeness", completeness_score),
        ("v_measure", v_measure_score),
    )
    scores = {
        f"{prefix}{name}": scorer(y_true, y_pred)
        for name, scorer in clustering_scorers
    }
    if prefix != "mapped_":
        return scores

    scores[f"{prefix}accuracy"] = accuracy_score(y_true, y_pred)
    # The fourth element of the returned tuple (support) is not reported.
    weighted = precision_recall_fscore_support(
        y_true, y_pred, average='weighted', zero_division=0
    )
    for name, value in zip(("precision", "recall", "f1"), weighted):
        scores[f"{prefix}{name}"] = value
    return scores


def calculate_adjusted_score(mapped_f1, singleton_ratio, cluster_floor_ratio):
    """Scale the mapped F1 score by penalties for over-clustering.

    Two multiplicative factors are applied:

    * Singleton penalty: 1.0 while at most half of the clusters are
      singletons, otherwise ``0.5 * (1 - singleton_ratio)``.
    * Over-clustering penalty, based on clusters per floor: 1.0 up to a
      ratio of 2; ``2 / ratio`` (never below 0.5) up to a ratio of 5;
      ``2.5 / ratio`` (never below 0.1) beyond that.

    Args:
        mapped_f1: F1 score of the floor-mapped clustering.
        singleton_ratio: Share of clusters that contain a single element.
        cluster_floor_ratio: Number of clusters divided by number of floors.

    Returns:
        The penalised score ``mapped_f1 * singleton_penalty *
        overclustering_penalty``.
    """
    if singleton_ratio > 0.5:
        singleton_penalty = 0.5 * (1 - singleton_ratio)
    else:
        singleton_penalty = 1.0

    if cluster_floor_ratio > 5.0:
        # Continue from the 0.5 reached at ratio 5 so the penalty never
        # becomes milder as over-clustering grows (the former 5 / ratio
        # jumped back to ~1.0 just above 5).
        overclustering_penalty = max(0.1, 0.5 / (cluster_floor_ratio / 5.0))
    elif cluster_floor_ratio > 2.0:
        overclustering_penalty = max(0.5, 1.0 / (cluster_floor_ratio / 2.0))
    else:
        overclustering_penalty = 1.0

    adjusted_score = mapped_f1 * singleton_penalty * overclustering_penalty
    return adjusted_score


def generate_confusion_matrix(y_true, y_pred, output_file, title):
    """Plot the confusion matrix of the given labels and save it to a file.

    Rows are the true labels and columns the predicted labels. Both axes are
    ticked with the actual label values rather than positional indices.

    Args:
        y_true: True labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        output_file: Path the figure is written to.
        title: Title of the plot.
    """
    # Sorted union of all labels: the set confusion_matrix uses by default.
    # It is passed explicitly so the tick labels are guaranteed to match the
    # matrix rows and columns.
    labels = np.unique(np.concatenate([y_true, y_pred]))
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    fig = plt.figure(figsize=(12, 10))
    try:
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=labels, yticklabels=labels)
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.title(title)
        plt.savefig(output_file)
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close(fig)


def analyze_clusters(clusters, ap_to_floor, output_dir):
    """Summarise the floor distribution of every cluster and store it as JSON.

    Clusters without a single access point in ``ap_to_floor`` are left out.
    The summary is written to ``cluster_analysis.json`` inside
    ``output_dir`` and also returned.

    Args:
        clusters: List of clusters, each a list of access-point IDs.
        ap_to_floor: dict mapping access-point ID to ground-truth floor.
        output_dir: Existing directory that receives the JSON file.

    Returns:
        List with one dict per reported cluster: cluster ID, total and valid
        access-point counts, per-floor counts and percentages, and the
        dominant floor together with its percentage.
    """
    report = []
    for idx, members in enumerate(clusters):
        known_floors = [ap_to_floor[ap] for ap in members if ap in ap_to_floor]
        if not known_floors:
            continue

        n_known = len(known_floors)
        tally = {}
        for floor in known_floors:
            tally[floor] = tally.get(floor, 0) + 1
        shares = {floor: (n / n_known) * 100 for floor, n in tally.items()}

        # First floor with the highest share wins a tie.
        top_floor = max(shares, key=shares.get)
        report.append({
            'cluster_id': idx,
            'total_aps': len(members),
            'valid_aps': n_known,
            'floor_counts': tally,
            'floor_percentages': shares,
            'dominant_floor': top_floor,
            'dominant_percentage': shares[top_floor]
        })

    target = os.path.join(output_dir, 'cluster_analysis.json')
    with open(target, 'w', encoding='utf-8') as f:
        json.dump(report, f, ensure_ascii=False, indent=4)
    return report


def find_the_way(path, file_format, con=""):
    """Recursively collect the files under ``path`` whose names match.

    A file is selected when its name contains both ``file_format`` and
    ``con`` as substrings.

    Args:
        path: Root directory of the search.
        file_format: Substring the file name must contain.
        con: Additional substring the file name must contain; the default
            empty string matches every name.

    Returns:
        List of matching file paths, in the order ``os.walk`` visits them.
    """
    return [
        os.path.join(root, name)
        for root, _subdirs, names in os.walk(path)
        for name in names
        if file_format in name and con in name
    ]


# ====== MAIN EVALUATION LOOP ======
# For every dataset directory, evaluate each "*_communities.csv" clustering
# found under "<dataset>/community_results" against "<dataset>/data_GT.json"
# and store the metrics, a confusion-matrix plot and a per-cluster analysis
# under "./results/<dataset>/".
paths = ["./ValidationGeo", "./ValidationWDE", "./TrainingGeo", "./TrainingWDE"]

# One metrics dict per (dataset, algorithm) pair, across all datasets.
all_metrics_list = []

for p in paths:
    path = f"{p}/community_results"
    files_add = find_the_way(path, '_communities.csv')
    print(f"\n{'='*80}")
    print(f"İşleniyor: {p}")
    print(f"{'='*80}")
    print(f"Bulunan dosyalar: {len(files_add)}")

    # Flat output layout: everything for a dataset goes directly into
    # results/<dataset_name>/.
    dataset_name = p.replace('./', '').replace('/', '_')
    main_output_dir = f'./results/{dataset_name}'
    os.makedirs(main_output_dir, exist_ok=True)

    # The ground truth belongs to the dataset, not to an algorithm, so it is
    # read once per dataset. It is only read when there is something to
    # evaluate, so a dataset without result files needs no ground-truth file.
    gt_file = f'{p}/data_GT.json'
    if files_add:
        ap_to_floor = load_ground_truth(gt_file)

    for file in files_add:
        # The algorithm name is the file name without its fixed suffix.
        algo_name = os.path.basename(file).replace('_communities.csv', '')

        # 1. Load the clustering and label each cluster with its majority floor
        result_file = file
        ap_to_cluster, clusters = load_clustering_result(result_file)
        cluster_to_floor = map_clusters_to_floors(ap_to_floor, ap_to_cluster)

        common_aps, y_true, y_pred_raw, y_pred_mapped = create_true_pred_arrays(
            ap_to_floor, ap_to_cluster, cluster_to_floor
        )

        # 2. Evaluate the raw cluster IDs and the floor-mapped labels
        raw_results = evaluate_clustering(y_true, y_pred_raw, prefix="raw_")
        mapped_results = evaluate_clustering(y_true, y_pred_mapped, prefix="mapped_")

        # 3. Cluster quality metrics
        quality_metrics = calculate_cluster_quality_metrics(clusters, ap_to_floor)

        # 4. F1 score penalised for over-clustering
        adjusted_f1 = calculate_adjusted_score(
            mapped_results['mapped_f1'],
            quality_metrics['singleton_ratio'],
            quality_metrics['cluster_floor_ratio']
        )

        # Merge everything into one flat record
        all_results = {
            'Dataset': dataset_name,
            'Algorithm': algo_name,
            **raw_results,
            **mapped_results,
            **quality_metrics,
            'adjusted_f1_score': adjusted_f1
        }

        # Keep the record for the combined report
        all_metrics_list.append(all_results)

        # 5. Print results
        print(f"\n--- {algo_name} ---")
        print(f"F1-Score: {mapped_results['mapped_f1']:.4f} | Adjusted F1: {adjusted_f1:.4f}")
        print(f"Küme Sayısı: {quality_metrics['num_clusters']} | Kat Sayısı: {quality_metrics['num_unique_floors']}")

        # Warnings
        if quality_metrics['singleton_ratio'] > 0.3:
            print(f"⚠️ Singleton oranı yüksek: {quality_metrics['singleton_ratio']:.1%}")
        if quality_metrics['cluster_floor_ratio'] > 3:
            print(f"⚠️ Over-clustering: {quality_metrics['cluster_floor_ratio']:.1f}x fazla küme")

        # 6. Save the files directly under main_output_dir, prefixed with
        #    the algorithm name

        # Metrics as JSON
        json_file = os.path.join(main_output_dir, f'{algo_name}_metrics.json')
        with open(json_file, 'w', encoding='utf-8') as f:
            json.dump(all_results, f, ensure_ascii=False, indent=4)

        # Confusion matrix
        cm_file = os.path.join(main_output_dir, f'{algo_name}_confusion_matrix.pdf')
        generate_confusion_matrix(y_true, y_pred_mapped, cm_file,
                                 f'Confusion Matrix - {algo_name}')

        # Cluster analysis
        analysis_file = os.path.join(main_output_dir, f'{algo_name}_cluster_analysis.json')
        cluster_analysis = analyze_clusters(clusters, ap_to_floor, main_output_dir)
        # analyze_clusters always writes 'cluster_analysis.json'; give it its
        # per-algorithm name. os.replace overwrites a file left by an earlier
        # run on every platform, whereas os.rename fails on Windows when the
        # target already exists.
        os.replace(os.path.join(main_output_dir, 'cluster_analysis.json'), analysis_file)

    print(f"\n✅ {dataset_name} sonuçları kaydedildi: {main_output_dir}")

# ====== COMBINED EXCEL REPORT ======
# Write every collected metrics record to one spreadsheet, print the ten
# best rows by adjusted F1 score, then print the layout of the output folder.
print("\n" + "=" * 80)
print("Toplu Excel raporu oluşturuluyor...")
print("=" * 80)

if not all_metrics_list:
    print("❌ Hiçbir sonuç bulunamadı!")
else:
    df_all = pd.DataFrame(all_metrics_list)

    # Put the key columns first; the rest keep their existing order
    important_cols = ['Dataset', 'Algorithm', 'mapped_f1', 'adjusted_f1_score',
                      'mapped_accuracy', 'num_clusters', 'num_unique_floors',
                      'singleton_ratio', 'cluster_floor_ratio']
    other_cols = []
    for c in df_all.columns:
        if c not in important_cols:
            other_cols.append(c)
    df_all = df_all[important_cols + other_cols]

    # Save to Excel
    excel_file = './results/TUM_SONUCLAR.xlsx'
    df_all.to_excel(excel_file, index=False)
    print(f"\n✅ Toplu sonuçlar kaydedildi: {excel_file}")

    # Print the summary table
    print("\n" + "=" * 80)
    print("ÖZET TABLO (En İyi 10 Algoritma - Adjusted F1'e göre)")
    print("=" * 80)
    summary = (
        df_all[['Dataset', 'Algorithm', 'mapped_f1', 'adjusted_f1_score',
                'num_clusters', 'singleton_ratio']]
        .sort_values('adjusted_f1_score', ascending=False)
        .head(10)
    )
    print(summary.to_string(index=False))

for closing_line in (
    "\n" + "=" * 80,
    "İŞLEM TAMAMLANDI!",
    "=" * 80,
    "\nKLASÖR YAPISI:",
    "./results/",
    "  ├── ValidationGeo/",
    "  │   ├── algorithm1_metrics.json",
    "  │   ├── algorithm1_confusion_matrix.pdf",
    "  │   ├── algorithm1_cluster_analysis.json",
    "  │   └── ...",
    "  ├── ValidationWDE/",
    "  ├── TrainingGeo/",
    "  ├── TrainingWDE/",
    "  └── TUM_SONUCLAR.xlsx  ← Tüm sonuçlar burada!",
    "=" * 80,
):
    print(closing_line)



İşleniyor: ./ValidationGeo
Bulunan dosyalar: 7

--- edge_betweenness ---
F1-Score: 0.4670 | Adjusted F1: 0.0047
Küme Sayısı: 36 | Kat Sayısı: 13
⚠️ Singleton oranı yüksek: 97.2%

--- fast_greedy ---
F1-Score: 0.0997 | Adjusted F1: 0.0997
Küme Sayısı: 2 | Kat Sayısı: 13

--- infomap ---
F1-Score: 0.0551 | Adjusted F1: 0.0551
Küme Sayısı: 1 | Kat Sayısı: 13

--- label_propagation ---
F1-Score: 0.0551 | Adjusted F1: 0.0551
Küme Sayısı: 1 | Kat Sayısı: 13

--- leiden ---
F1-Score: 0.1113 | Adjusted F1: 0.1113
Küme Sayısı: 3 | Kat Sayısı: 13

--- louvain ---
F1-Score: 0.0953 | Adjusted F1: 0.0953
Küme Sayısı: 2 | Kat Sayısı: 13

--- Node2Vec ---
F1-Score: 0.6162 | Adjusted F1: 0.6162
Küme Sayısı: 20 | Kat Sayısı: 13

✅ ValidationGeo sonuçları kaydedildi: ./results/ValidationGeo

İşleniyor: ./ValidationWDE
Bulunan dosyalar: 7

--- edge_betweenness ---
F1-Score: 0.4670 | Adjusted F1: 0.0047
Küme Sayısı: 36 | Kat Sayısı: 13
⚠️ Singleton oranı yüksek: 97.2%

--- fast_greedy ---
F1-Score: 0.099