In [4]:
#!/usr/bin/env python
# coding: utf-8

import json
import csv
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
from sklearn.metrics import confusion_matrix, silhouette_score
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score
import argparse
import os
from collections import defaultdict


def load_ground_truth(json_file):
    """Load the ground-truth floor label of every access point (AP).

    Args:
        json_file: Path to a JSON file holding a single object whose keys
            are AP identifiers (integers serialised as strings) and whose
            values are the floor labels.

    Returns:
        dict: AP id (``int``) -> floor label, with values exactly as stored
        in the file.

    Raises:
        ValueError: If a key of the JSON object cannot be converted to int.
    """
    # JSON text is UTF-8; do not rely on the platform's locale encoding
    # (which is not UTF-8 on a default Windows setup).
    with open(json_file, 'r', encoding='utf-8') as f:
        gt_data = json.load(f)
    return {int(ap_id): floor for ap_id, floor in gt_data.items()}


def load_clustering_result(csv_file):
    """Load clustering results from a CSV file with one cluster per row.

    Every row lists the AP ids belonging to one cluster. Cells that are
    empty or are not plain non-negative integers (for example header text)
    are skipped, and rows without any usable id are dropped. Cluster ids are
    assigned in order of the remaining rows, starting at 0.

    Args:
        csv_file: Path to the CSV file.

    Returns:
        tuple: ``(ap_to_cluster, clusters)`` where ``clusters`` is a list of
        lists of AP ids and ``ap_to_cluster`` maps AP id -> cluster index.
        If an AP id occurs in several rows, the last row wins.
    """
    clusters = []
    # newline='' is the mode the csv module documentation asks for, so that
    # the reader itself handles line endings.
    with open(csv_file, 'r', newline='') as f:
        for row in csv.reader(f):
            tokens = (item.strip() for item in row)
            # isdecimal() is true only for characters int() can parse;
            # isdigit() also accepts e.g. superscript digits, which would
            # make int() raise ValueError.
            cluster = [int(token) for token in tokens if token.isdecimal()]
            if cluster:
                clusters.append(cluster)
    ap_to_cluster = {
        ap: cluster_id
        for cluster_id, aps in enumerate(clusters)
        for ap in aps
    }
    return ap_to_cluster, clusters


def map_clusters_to_floors(ap_to_floor, ap_to_cluster):
    """Label each cluster with the floor that most of its known APs are on.

    Only APs present in both mappings take part in the vote.

    Args:
        ap_to_floor: Mapping AP id -> ground-truth floor.
        ap_to_cluster: Mapping AP id -> cluster id.

    Returns:
        dict: cluster id -> majority floor. Clusters without any AP that
        has a ground-truth floor are absent from the result.
    """
    votes = defaultdict(lambda: defaultdict(int))
    shared = set(ap_to_floor.keys()) & set(ap_to_cluster.keys())
    for ap_id in shared:
        votes[ap_to_cluster[ap_id]][ap_to_floor[ap_id]] += 1
    # max() keeps the first floor it meets among those tied for the top count.
    return {
        cluster_id: max(tally, key=tally.get)
        for cluster_id, tally in votes.items()
    }


def create_true_pred_arrays(ap_to_floor, ap_to_cluster, cluster_to_floor):
    """Build aligned label arrays for the APs known to both mappings.

    Args:
        ap_to_floor: Mapping AP id -> ground-truth floor.
        ap_to_cluster: Mapping AP id -> cluster id.
        cluster_to_floor: Mapping cluster id -> floor assigned to the cluster.

    Returns:
        tuple: ``(common_aps, y_true, y_pred_raw, y_pred_mapped)`` where
        ``common_aps`` is the sorted list of shared AP ids and the three
        NumPy arrays hold, in that same order, the true floor, the raw
        cluster id and the floor assigned to that cluster.
    """
    shared_aps = sorted(ap_to_floor.keys() & ap_to_cluster.keys())
    true_floors = []
    cluster_ids = []
    mapped_floors = []
    for ap_id in shared_aps:
        cluster_id = ap_to_cluster[ap_id]
        true_floors.append(ap_to_floor[ap_id])
        cluster_ids.append(cluster_id)
        mapped_floors.append(cluster_to_floor[cluster_id])
    return (
        shared_aps,
        np.array(true_floors),
        np.array(cluster_ids),
        np.array(mapped_floors),
    )


def calculate_cluster_quality_metrics(clusters, ap_to_floor):
    """Summarise size and purity characteristics of a clustering.

    Args:
        clusters: List of clusters, each a list of AP ids.
        ap_to_floor: Mapping AP id -> ground-truth floor.

    Returns:
        dict: Cluster quality metrics with the keys
        ``num_clusters``, ``num_unique_floors``, ``total_aps``,
        ``avg_cluster_size``, ``std_cluster_size``, ``singleton_clusters``,
        ``singleton_ratio``, ``avg_cluster_purity``, ``cluster_floor_ratio``,
        ``min_cluster_size`` and ``max_cluster_size``.
        With no clusters every size statistic is 0; with no floors
        ``cluster_floor_ratio`` is ``inf``.
    """
    # Cluster sizes
    cluster_sizes = [len(cluster) for cluster in clusters]
    num_clusters = len(cluster_sizes)
    total_aps = sum(cluster_sizes)
    num_unique_floors = len(set(ap_to_floor.values()))

    # np.mean / np.std of an empty list give NaN plus a RuntimeWarning;
    # report 0 instead, like the min/max/purity fields do for empty input.
    if cluster_sizes:
        avg_cluster_size = float(np.mean(cluster_sizes))
        std_cluster_size = float(np.std(cluster_sizes))
    else:
        avg_cluster_size = 0.0
        std_cluster_size = 0.0

    # Number of singleton (one-element) clusters
    singleton_clusters = sum(1 for size in cluster_sizes if size == 1)
    singleton_ratio = singleton_clusters / num_clusters if num_clusters > 0 else 0

    # Within-cluster homogeneity: share of the dominant floor among the APs
    # of the cluster that have a ground-truth floor.
    cluster_purities = []
    for cluster in clusters:
        floor_counts = defaultdict(int)
        for ap in cluster:
            if ap in ap_to_floor:
                floor_counts[ap_to_floor[ap]] += 1
        if floor_counts:
            valid_count = sum(floor_counts.values())
            cluster_purities.append(max(floor_counts.values()) / valid_count)

    avg_purity = float(np.mean(cluster_purities)) if cluster_purities else 0

    # Cluster/floor ratio: how far the cluster count is from the floor count
    cluster_floor_ratio = num_clusters / num_unique_floors if num_unique_floors > 0 else float('inf')

    return {
        'num_clusters': num_clusters,
        'num_unique_floors': num_unique_floors,
        'total_aps': total_aps,
        'avg_cluster_size': avg_cluster_size,
        'std_cluster_size': std_cluster_size,
        'singleton_clusters': singleton_clusters,
        'singleton_ratio': singleton_ratio,
        'avg_cluster_purity': avg_purity,
        'cluster_floor_ratio': cluster_floor_ratio,
        'min_cluster_size': min(cluster_sizes) if cluster_sizes else 0,
        'max_cluster_size': max(cluster_sizes) if cluster_sizes else 0
    }


def evaluate_clustering(y_true, y_pred, prefix=""):
    """Score predicted labels against the true labels.

    ARI, NMI, homogeneity, completeness and V-measure are always computed.
    Accuracy and weighted precision/recall/F1 are added only when ``prefix``
    is exactly ``"mapped_"``.

    Args:
        y_true: True labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        prefix: String prepended to every key of the returned dict.

    Returns:
        dict: ``f"{prefix}<metric>"`` -> score.
    """
    label_free_scorers = (
        ("ari", adjusted_rand_score),
        ("nmi", normalized_mutual_info_score),
        ("homogeneity", homogeneity_score),
        ("completeness", completeness_score),
        ("v_measure", v_measure_score),
    )
    results = {
        f"{prefix}{name}": scorer(y_true, y_pred)
        for name, scorer in label_free_scorers
    }
    if prefix != "mapped_":
        return results

    # Classification-style scores are only produced for the mapped labels.
    results[f"{prefix}accuracy"] = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='weighted', zero_division=0
    )
    results.update({
        f"{prefix}precision": precision,
        f"{prefix}recall": recall,
        f"{prefix}f1": f1,
    })
    return results


def calculate_adjusted_score(mapped_f1, singleton_ratio, cluster_floor_ratio):
    """Return the F1 score scaled down by over-clustering penalties.

    Args:
        mapped_f1: Original F1 score.
        singleton_ratio: Fraction of clusters that contain a single element.
        cluster_floor_ratio: Number of clusters divided by number of floors.

    Returns:
        float: ``mapped_f1 * singleton_penalty * overclustering_penalty``.
        Both penalties are 1.0 when nothing is penalised, and the
        over-clustering penalty never grows as the ratio grows.
    """
    # Singleton penalty: heavy penalty once more than 50% of the clusters
    # are singletons.
    if singleton_ratio > 0.5:
        singleton_penalty = 0.5 * (1 - singleton_ratio)
    else:
        singleton_penalty = 1.0

    # Over-clustering penalty: ratios up to 2.0 are not penalised.
    if cluster_floor_ratio > 5.0:
        # Continue from the 0.5 reached at ratio 5.0 and decay to the 0.1
        # floor. Restarting at 1.0 here would punish a ratio of 6 less than
        # a ratio of 4.
        overclustering_penalty = max(0.1, 0.5 * 5.0 / cluster_floor_ratio)
    elif cluster_floor_ratio > 2.0:
        overclustering_penalty = max(0.5, 1.0 / (cluster_floor_ratio / 2.0))
    else:
        overclustering_penalty = 1.0

    # Final score
    return mapped_f1 * singleton_penalty * overclustering_penalty


def generate_confusion_matrix(y_true, y_pred, output_file, title):
    """Plot the confusion matrix as a heatmap and save it to a file.

    Args:
        y_true: True labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        output_file: Path the figure is written to.
        title: Title placed above the heatmap.
    """
    # Sorted union of the labels in either array. Passing it explicitly lets
    # the axis ticks show the real labels instead of matrix indices.
    labels = np.unique(np.concatenate([np.asarray(y_true), np.asarray(y_pred)]))
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    plt.figure(figsize=(12, 10))
    try:
        sns.heatmap(
            cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=labels, yticklabels=labels,
        )
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.title(title)
        plt.savefig(output_file)
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close()


def analyze_clusters(clusters, ap_to_floor, output_dir):
    """Compute the floor distribution of every cluster and save it as JSON.

    Clusters in which no AP has a ground-truth floor are skipped. The result
    is written to ``cluster_analysis.json`` inside ``output_dir``, which is
    created if it does not exist yet.

    Args:
        clusters: List of clusters, each a list of AP ids.
        ap_to_floor: Mapping AP id -> ground-truth floor.
        output_dir: Directory that receives ``cluster_analysis.json``.

    Returns:
        list: One dict per analysed cluster with the keys ``cluster_id``,
        ``total_aps``, ``valid_aps``, ``floor_counts``,
        ``floor_percentages``, ``dominant_floor`` and
        ``dominant_percentage``.
    """
    results = []
    for cluster_id, aps in enumerate(clusters):
        floor_counts = defaultdict(int)
        for ap in aps:
            if ap in ap_to_floor:
                floor_counts[ap_to_floor[ap]] += 1
        valid_aps = sum(floor_counts.values())
        if not valid_aps:
            continue
        percentages = {
            floor: (count / valid_aps) * 100
            for floor, count in floor_counts.items()
        }
        # percentages is never empty here because valid_aps > 0.
        dominant_floor, dominant_percentage = max(
            percentages.items(), key=lambda x: x[1]
        )
        results.append({
            'cluster_id': cluster_id,
            'total_aps': len(aps),
            'valid_aps': valid_aps,
            'floor_counts': dict(floor_counts),
            'floor_percentages': percentages,
            'dominant_floor': dominant_floor,
            'dominant_percentage': dominant_percentage
        })

    os.makedirs(output_dir, exist_ok=True)
    with open(os.path.join(output_dir, 'cluster_analysis.json'), 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=4)
    return results


def find_the_way(path, file_format, con=""):
    """Recursively collect files whose names contain the given substrings.

    Args:
        path: Directory to walk.
        file_format: Substring the file name must contain (e.g. a suffix).
        con: Additional substring the file name must contain. The default
            empty string matches every name.

    Returns:
        list: Paths of all matching files, each joined with the directory
        in which it was found.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(path)
        for name in names
        if file_format in name and con in name
    ]


# ====== MAIN EVALUATION LOOP ======
# For every base directory in `paths`, evaluate each '*_communities.csv' file
# found under '<base>/community_results' against '<base>/data_GT.json' and
# write the metrics, a confusion-matrix plot and a per-cluster analysis to
# '<base>/results/<folder_name>/'.
paths = ['./']

for p in paths:
    path = f"{p}/community_results"
    files_add = find_the_way(path, '_communities.csv')
    print(files_add)

    for file in files_add:
        # 1. Setup file paths
        gt_file = f'{p}/data_GT.json'
        result_file = file
        
        # Drop the first two and the last four characters of the path. For
        # the paths this cell prints, that removes a leading './' and the
        # '.csv' extension, so folder_name still contains the
        # 'community_results' directory part.
        folder_name = file[2:-4]
        output_dir = f'{p}/results/{folder_name}/'
        os.makedirs(output_dir, exist_ok=True)
        
        # 2. Load data
        # The ground truth is re-read for every result file.
        ap_to_floor = load_ground_truth(gt_file)
        ap_to_cluster, clusters = load_clustering_result(result_file)
        cluster_to_floor = map_clusters_to_floors(ap_to_floor, ap_to_cluster)
        
        # common_aps is returned but not used further in this loop.
        common_aps, y_true, y_pred_raw, y_pred_mapped = create_true_pred_arrays(
            ap_to_floor, ap_to_cluster, cluster_to_floor
        )
        
        # 3. Evaluate clustering
        # "raw_" scores compare cluster ids with floors; "mapped_" scores
        # compare the majority floor of each cluster with the true floor.
        raw_results = evaluate_clustering(y_true, y_pred_raw, prefix="raw_")
        mapped_results = evaluate_clustering(y_true, y_pred_mapped, prefix="mapped_")
        
        # 4. Cluster quality metrics
        quality_metrics = calculate_cluster_quality_metrics(clusters, ap_to_floor)
        
        # 5. Compute the adjusted score
        adjusted_f1 = calculate_adjusted_score(
            mapped_results['mapped_f1'],
            quality_metrics['singleton_ratio'],
            quality_metrics['cluster_floor_ratio']
        )
        
        # Merge all results into the single dict that is saved in step 7
        all_results = {
            **raw_results,
            **mapped_results,
            **quality_metrics,
            'adjusted_f1_score': adjusted_f1
        }
        
        # 6. Print Results
        print(f"\n{'='*60}")
        print(f"ALGORITMA: {folder_name}")
        print(f"{'='*60}")
        
        print("\n--- Kümeleme İstatistikleri ---")
        print(f"Toplam AP Sayısı: {quality_metrics['total_aps']}")
        print(f"Küme Sayısı: {quality_metrics['num_clusters']}")
        print(f"Gerçek Kat Sayısı: {quality_metrics['num_unique_floors']}")
        print(f"Küme/Kat Oranı: {quality_metrics['cluster_floor_ratio']:.2f}")
        print(f"Singleton (Tek Elemanlı) Küme Sayısı: {quality_metrics['singleton_clusters']}")
        print(f"Singleton Oranı: {quality_metrics['singleton_ratio']:.2%}")
        print(f"Ortalama Küme Boyutu: {quality_metrics['avg_cluster_size']:.2f}")
        print(f"Ortalama Küme Saflığı: {quality_metrics['avg_cluster_purity']:.2%}")
        
        print("\n--- Raw Cluster Evaluation ---")
        print(f"Adjusted Rand Index: {raw_results['raw_ari']:.4f}")
        print(f"Normalized Mutual Information: {raw_results['raw_nmi']:.4f}")
        
        print("\n--- Mapped Evaluation ---")
        print(f"Accuracy: {mapped_results['mapped_accuracy']:.4f}")
        print(f"F1-Score (Orijinal): {mapped_results['mapped_f1']:.4f}")
        # The warning sign on this line is printed unconditionally.
        print(f"F1-Score (Adjusted): {adjusted_f1:.4f} ⚠️")
        
        # Warning system: flag results that look over-clustered
        warnings = []
        if quality_metrics['singleton_ratio'] > 0.3:
            warnings.append(f"⚠️  UYARI: Singleton küme oranı çok yüksek ({quality_metrics['singleton_ratio']:.1%})!")
        if quality_metrics['cluster_floor_ratio'] > 3:
            warnings.append(f"⚠️  UYARI: Küme sayısı kat sayısının {quality_metrics['cluster_floor_ratio']:.1f}x fazla!")
        if quality_metrics['avg_cluster_size'] < 2:
            warnings.append(f"⚠️  UYARI: Ortalama küme boyutu çok küçük ({quality_metrics['avg_cluster_size']:.1f})!")
        
        if warnings:
            print("\n" + "="*60)
            for warning in warnings:
                print(warning)
            print("="*60)
        
        # 7. Save results
        with open(os.path.join(output_dir, 'performance_metrics.json'), 'w', encoding='utf-8') as f:
            json.dump(all_results, f, ensure_ascii=False, indent=4)
        
        # 8. Generate plots
        generate_confusion_matrix(
            y_true, y_pred_mapped,
            os.path.join(output_dir, 'confusion_matrix_mapped.pdf'),
            'Confusion Matrix for Mapped Clusters'
        )
        
        # Writes cluster_analysis.json into output_dir; the returned list is
        # not used further in this loop.
        cluster_analysis = analyze_clusters(clusters, ap_to_floor, output_dir)
        
        print(f"\nTüm sonuçlar kaydedildi: {output_dir}")


['.//community_results\\fast_greedy_communities.csv', './/community_results\\gat_gnn_communities.csv', './/community_results\\gcn_gnn_communities.csv', './/community_results\\infomap_communities.csv', './/community_results\\label_propagation_communities.csv', './/community_results\\leiden_communities.csv', './/community_results\\louvain_communities.csv', './/community_results\\Node2Vec_communities.csv']

ALGORITMA: /community_results\fast_greedy_communities

--- Kümeleme İstatistikleri ---
Toplam AP Sayısı: 8371
Küme Sayısı: 1071
Gerçek Kat Sayısı: 9
Küme/Kat Oranı: 119.00
Singleton (Tek Elemanlı) Küme Sayısı: 1019
Singleton Oranı: 95.14%
Ortalama Küme Boyutu: 7.82
Ortalama Küme Saflığı: 94.19%

--- Raw Cluster Evaluation ---
Adjusted Rand Index: -0.0020
Normalized Mutual Information: 0.1382

--- Mapped Evaluation ---
Accuracy: 0.4448
F1-Score (Orijinal): 0.3555
F1-Score (Adjusted): 0.0009 ⚠️

⚠️  UYARI: Singleton küme oranı çok yüksek (95.1%)!
⚠️  UYARI: Küme sayısı kat sayısının 119.

In [5]:
import os
import json
import pandas as pd

def compile_metrics_to_table(root_dir):
    """Collect every performance_metrics.json below a directory into a table.

    Walks ``root_dir`` and all of its sub-directories. Each metrics file
    becomes one row; the name of the directory containing it is stored in
    the ``Algorithm`` column. Files that cannot be read are reported on
    stdout and skipped.

    Args:
        root_dir: Directory to search.

    Returns:
        pandas.DataFrame: One row per metrics file, ``Algorithm`` as the
        first column, rows sorted by ``Algorithm``. ``None`` (after printing
        a message) when no metrics file was loaded.
    """
    target = 'performance_metrics.json'
    records = []

    # Visit every sub-directory and pick up the metrics file if present.
    for folder, _subdirs, filenames in os.walk(root_dir):
        if target not in filenames:
            continue
        file_path = os.path.join(folder, target)

        # Directory name, e.g. Node2Vec_communities
        folder_name = os.path.basename(folder)

        try:
            with open(file_path, 'r', encoding='utf-8') as handle:
                metrics = json.load(handle)
                # Tag the record with the directory it came from.
                metrics['Algorithm'] = folder_name
                records.append(metrics)
        except Exception as e:
            print(f"Hata: {file_path} dosyası okunamadı. Sebebi: {e}")

    if not records:
        print("Hiçbir performance_metrics.json dosyası bulunamadı.")
        return None

    table = pd.DataFrame(records)
    # Move 'Algorithm' to the front, then order the rows by it.
    ordered = ['Algorithm'] + [c for c in table.columns if c != 'Algorithm']
    return table[ordered].sort_values(by='Algorithm').reset_index(drop=True)

# --- USAGE ---
# Build one summary table per base directory. `paths` is not defined in this
# cell; it has to exist already (the evaluation cell assigns it).
for p in paths:
    # Root directory to scan for performance_metrics.json files. It is built
    # as an f-string from the base directory: '<p>/results/' followed by p
    # without its first two characters and then 'community_results'.
    search_path = f"{p}/results/{p[2:]}community_results" 
    # Example of a directory containing a metrics file:
    #C:\Users\kahra\Desktop\Cluster\UJIndoorLoc\output\10percent\results\output\10percent\community_results\fast_greedy_communities
    # Run the function
    df_results = compile_metrics_to_table(search_path)
    
    if df_results is not None:
        # Print the table
        print("Tüm Sonuçlar Tablosu:")
        print(df_results)
    
        # The results can be saved as Excel or CSV; CSV alternative:
        # df_results.to_csv("tum_sonuclar.csv", index=False)
        # NOTE(review): the file name does not depend on p, so with several
        # entries in `paths` each iteration overwrites the previous file.
        df_results.to_excel(f"sonuc.xlsx", index=False)

Tüm Sonuçlar Tablosu:
                       Algorithm   raw_ari   raw_nmi  raw_homogeneity  \
0           Node2Vec_communities  0.443271  0.572218         0.600857   
1        fast_greedy_communities -0.001974  0.138191         0.156520   
2            gat_gnn_communities  0.115348  0.303545         0.302598   
3            gcn_gnn_communities  0.028238  0.127976         0.150296   
4            infomap_communities -0.000900  0.170046         0.261381   
5  label_propagation_communities -0.004267  0.158697         0.223265   
6             leiden_communities -0.000631  0.131013         0.170299   
7            louvain_communities -0.001812  0.130332         0.169195   

   raw_completeness  raw_v_measure  mapped_ari  mapped_nmi  \
0          0.546185       0.572218    0.581278    0.600319   
1          0.123704       0.138191    0.054220    0.165127   
2          0.304499       0.303545    0.153272    0.281730   
3          0.111428       0.127976    0.087177    0.103190   
4         