In [1]:
# Dataset folders processed by the pipeline; `folders` is an alias of the same list.
paths = folders = [
    "./ValidationGeo",
    "./ValidationWDE",
    "./TrainingGeo",
    "./TrainingWDE",
]

In [2]:
## 2.2  Setting Random Seeds
# One shared seed is applied to the hash seed environment variable, the
# stdlib `random` module and NumPy's global generator.
seed_value=0
import os
# NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so this assignment
# is only seen by child processes launched afterwards, not by this kernel.
os.environ['PYTHONHASHSEED']=str(seed_value)
import random
# Seed the stdlib pseudo-random generator.
random.seed(seed_value)
import numpy as np
# Seed NumPy's legacy global RandomState.
np.random.seed(seed_value)
from sklearn.metrics import calinski_harabasz_score

# 2.5  Best result
from sklearn import cluster
import time


def cluster_embeddings_optimized(embeddings, min_clusters=3, max_clusters=20):
    """Cluster embeddings with KMeans, choosing k by Calinski-Harabasz score.

    Every k from ``min_clusters`` to ``max_clusters`` (inclusive) that is
    smaller than the number of samples is fitted with
    ``KMeans(random_state=42, n_init=10)``; the k with the highest
    Calinski-Harabasz score wins and its labels are returned.

    Args:
        embeddings: feature matrix accepted by ``KMeans.fit_predict`` (one row
            per sample), or a torch tensor, which is detached and copied to a
            CPU NumPy array first. Pass features only - no id column.
        min_clusters: smallest cluster count to try.
        max_clusters: largest cluster count to try.

    Returns:
        dict mapping each cluster label (int) to the list of row indices
        assigned to it. Empty input yields an empty dict.
    """
    # Tensor check: scikit-learn needs a CPU NumPy array.
    if torch.is_tensor(embeddings):
        embeddings_np = embeddings.detach().cpu().numpy()
    else:
        embeddings_np = embeddings

    n_samples = len(embeddings_np)
    if n_samples == 0:
        # Nothing to cluster; KMeans would raise on an empty matrix.
        return {}

    best_score = -1
    best_k = min_clusters  # default when no candidate k can be scored
    best_labels = None

    print(f"En iyi küme sayısı aranıyor ({min_clusters}-{max_clusters} arası)...")

    # The Calinski-Harabasz score needs 2 <= k < n_samples, so clamp the range.
    search_range = range(max(min_clusters, 2), min(max_clusters + 1, n_samples))

    for k in search_range:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = kmeans.fit_predict(embeddings_np)

        # Calinski-Harabasz is used here (it tends to favour larger k).
        score = calinski_harabasz_score(embeddings_np, labels)

        if score > best_score:
            # Keep the labels too: refitting with the same random_state later
            # would only reproduce them at extra cost.
            best_score, best_k, best_labels = score, k, labels

    if best_labels is None:
        # No k could be scored (too few samples): fall back to a single fit
        # with as many clusters as the data allows.
        best_k = min(best_k, n_samples)

    print(f"Seçilen küme sayısı: {best_k} (Score: {best_score:.4f})")

    if best_labels is None:
        kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=10)
        best_labels = kmeans.fit_predict(embeddings_np)

    # Group row indices by their cluster label.
    communities = {}
    for label in np.unique(best_labels):
        communities[int(label)] = np.where(best_labels == label)[0].tolist()

    return communities

import torch
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


import networkx as nx
import leidenalg
import igraph as ig
import time
import concurrent.futures
import sys
import csv
import os

def run_algorithm(G_ig, algorithm_name):
    """
    Run one community detection algorithm on an igraph graph.

    Returns a ``(communities, seconds)`` tuple where ``communities`` maps a
    running community index to the list of its members as yielded by the
    partition object. ``(None, None)`` is returned when the algorithm name is
    unknown, when the user declines the slow edge-betweenness run, or when the
    algorithm raises.
    """
    started = time.time()

    # Name -> zero-argument callable producing the partition. The callables are
    # only evaluated for the requested algorithm.
    detectors = {
        "leiden": lambda: leidenalg.find_partition(G_ig, leidenalg.ModularityVertexPartition),
        "louvain": lambda: G_ig.community_multilevel(
            weights='weight' if 'weight' in G_ig.edge_attributes() else None
        ),
        "label_propagation": lambda: G_ig.community_label_propagation(),
        "fast_greedy": lambda: G_ig.community_fastgreedy().as_clustering(),
        "infomap": lambda: G_ig.community_infomap(),
        "edge_betweenness": lambda: G_ig.community_edge_betweenness().as_clustering(),
    }

    try:
        detect = detectors.get(algorithm_name)
        if detect is None:
            print(f"Unknown algorithm: {algorithm_name}")
            return None, None

        # Edge betweenness is very slow on large graphs: ask before running it.
        if algorithm_name == "edge_betweenness" and G_ig.vcount() > 1000:
            print(f"WARNING: Edge Betweenness algorithm is very slow for graphs with {G_ig.vcount()} nodes.")
            print("Press 'e' to skip, or any other key to continue:")
            if input().lower() == 'e':
                return None, None

        partition = detect()
        seconds = time.time() - started

        # Flatten the partition into a plain {index: [members]} dictionary.
        by_index = {idx: list(members) for idx, members in enumerate(partition)}
        return by_index, seconds

    except Exception as e:
        print(f"An error occurred while running {algorithm_name}: {str(e)}")
        return None, None

def save_communities_to_csv(communities, algorithm_name, base_dir=None):
    """
    Save community results to ``<base_dir>/community_results/<algorithm>_communities.csv``.

    Each community becomes one CSV row listing its members; community ids are
    not written.

    Args:
        communities: mapping of community id -> iterable of members.
        algorithm_name: used (lower-cased) as the file name prefix.
        base_dir: dataset folder to write under. Defaults to the notebook-level
            global ``p`` (the loop variable of the driver cell), which keeps
            existing two-argument calls working.

    Returns:
        True when the file was written, False when ``communities`` is empty or
        creating the directory / writing the file failed.
    """
    if not communities:
        return False

    if base_dir is None:
        base_dir = p

    # os.path.join instead of f"./{p}/...": the "./" prefix silently turned an
    # absolute dataset path into one relative to the working directory.
    output_dir = os.path.join(base_dir, "community_results")
    filename = os.path.join(output_dir, f"{algorithm_name.lower()}_communities.csv")

    try:
        # exist_ok avoids the check-then-create race; being inside the try
        # makes a failure here return False like any other write error.
        os.makedirs(output_dir, exist_ok=True)

        with open(filename, 'w', newline='') as csvfile:
            # One row per community.
            csv.writer(csvfile).writerows(communities.values())

        return True
    except Exception as e:
        print(f"Error while saving CSV file: {str(e)}")
        return False

def print_communities(communities, algorithm_name, elapsed_time):
    """
    Print a short summary of a community detection result and persist it.

    The summary shows the run time, the number of communities, the sizes of the
    first five communities and the largest community. Afterwards the result is
    handed to ``save_communities_to_csv`` and the outcome is reported.
    Nothing is saved when ``communities`` is empty.
    """
    if not communities:
        print(f"\n{algorithm_name} results could not be obtained.")
        return

    print(f"\n{algorithm_name} Results (Time: {elapsed_time:.2f} seconds):")
    print(f"Total of {len(communities)} communities detected.")

    # Preview at most five communities; an ellipsis marks the cut-off.
    for position, (comm_id, members) in enumerate(communities.items()):
        if position >= 5:
            print("...")
            break
        print(f"Community {comm_id}: {len(members)} members")

    # Largest community by member count (first one wins on ties).
    largest_id, largest_members = max(communities.items(), key=lambda item: len(item[1]))
    print(f"Largest community: Community {largest_id} ({len(largest_members)} members)")

    # Persist and report.
    saved = save_communities_to_csv(communities, algorithm_name)
    if not saved:
        print(f"WARNING: {algorithm_name} results could not be saved to CSV.")
    else:
        print(f"Community results saved to {algorithm_name.lower()}_communities.csv")

def main():
    """
    Run the graph-based community detection algorithms for the current dataset.

    Reads ``my.adjlist`` from the dataset folder held in the notebook-level
    global ``p``, converts the graph to igraph, runs each algorithm through
    ``run_algorithm`` and reports/saves the result via ``print_communities``.
    Any exception is caught and printed so that one failing dataset does not
    abort the surrounding loop.
    """
    # Check/create output directory. os.path.join avoids the "./" prefix that
    # redirected absolute dataset paths under the working directory.
    output_dir = os.path.join(p, "community_results")
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir, exist_ok=True)
        print(f"'{output_dir}' directory created.")

    # Load the graph
    print("Loading graph...")
    start_time = time.time()

    try:
        G_nx = nx.read_adjlist(os.path.join(p, "my.adjlist"))

        # Print graph stats
        print("Graph properties:")
        print(f"Number of nodes: {G_nx.number_of_nodes()}")
        print(f"Number of edges: {G_nx.number_of_edges()}")

        # Convert to undirected if needed
        if nx.is_directed(G_nx):
            G_nx = G_nx.to_undirected()

        # Convert NetworkX graph to iGraph
        G_ig = ig.Graph.from_networkx(G_nx)

        # igraph addresses vertices by position; the original node labels are
        # kept in the "_nx_name" vertex attribute by from_networkx.
        if "_nx_name" in G_ig.vertex_attributes():
            vertex_names = G_ig.vs["_nx_name"]
        else:
            # Fallback assumes the conversion preserved node order - TODO confirm
            # for the installed igraph version.
            vertex_names = list(G_nx.nodes())

        elapsed_time = time.time() - start_time
        print(f"Graph loaded (Time: {elapsed_time:.2f} seconds)")

        # Choose algorithms to run
        algorithms = ["leiden", "louvain", "label_propagation", "fast_greedy", "infomap"]

        # Skip edge_betweenness for large graphs
        if G_ig.vcount() <= 1000:
            algorithms.append("edge_betweenness")
        else:
            print("\nNOTE: Edge Betweenness algorithm will not be run due to large graph size.")

        print("\nRunning community detection algorithms...")

        # NOTE: the algorithms run sequentially; they are independent and could
        # be submitted to a concurrent.futures executor instead.
        for algorithm in algorithms:
            print(f"\nRunning {algorithm.capitalize()} algorithm...")
            communities, elapsed_time = run_algorithm(G_ig, algorithm)
            if communities:
                # run_algorithm yields igraph vertex indices; translate them to
                # the node labels of the adjacency list before reporting/saving
                # so the CSV holds real node ids.
                named_communities = {
                    comm_id: [vertex_names[index] for index in members]
                    for comm_id, members in communities.items()
                }
                print_communities(named_communities, algorithm.capitalize(), elapsed_time)

        print("\nAll tasks completed.")

    except Exception as e:
        print(f"An error occurred: {str(e)}")








In [3]:
for p in paths:
    ## 2.3 Loading the data  
    import os
    import json
    import csv
    import networkx as nx
    from tqdm import tqdm
    
    path_to_data = p
    
    with open(os.path.join(path_to_data,f"data_distances.csv")) as f:
        wifi = []
        reader = csv.DictReader(f)
        for line in tqdm(reader):
            wifi.append([line['id1'],line['id2'],float(line['estimated_distance'])])
            
    with open(os.path.join(path_to_data,f"data_elevations.csv")) as f:
        elevs = []
        reader = csv.DictReader(f)
        for line in tqdm(reader):
            elevs.append([line['id1'],line['id2']])        
    
    with open(os.path.join(path_to_data,f"data_steps.csv")) as f:
        steps = []
        reader = csv.DictReader(f)
        for line in tqdm(reader):
            steps.append([line['id1'],line['id2'],float(line['displacement'])]) 
            
    fp_lookup_path = os.path.join(path_to_data,f"data_lookup.json")
    
    with open(fp_lookup_path) as f:
        fp_lookup = json.load(f)


    ## 2.3 Generating the Trajectory  graph. 
    B = nx.Graph()
    
    # Get all the trajectory ids from the lookup
    valid_nodes = set(fp_lookup.values())
    
    for node in valid_nodes:
        B.add_node(node)
    
    # Either add an edge or append the distance to the edge data
    for id1,id2,dist in tqdm(wifi):
        if not B.has_edge(fp_lookup[str(id1)], fp_lookup[str(id2)]):
            
            B.add_edge(fp_lookup[str(id1)], 
                       fp_lookup[str(id2)], 
                       ty = "w", weight=[dist])
        else:
            B[fp_lookup[str(id1)]][fp_lookup[str(id2)]]['weight'].append(dist)
            
    # Compute the mean edge weight
    for edge in B.edges(data=True):
        B[edge[0]][edge[1]]['weight'] = sum(B[edge[0]][edge[1]]['weight'])/len(B[edge[0]][edge[1]]['weight'])
            
    # If you have made a wifi connection between trajectories with an elev, delete the edge
    for id1,id2 in tqdm(elevs):
        if B.has_edge(fp_lookup[str(id1)], fp_lookup[str(id2)]):
            B.remove_edge(fp_lookup[str(id1)], 
                          fp_lookup[str(id2)])
    
    nx.write_adjlist(B, f"{p}/my.adjlist")


    # 2.4 Converting nodes to vectors
    # A folder named tmp is created. This folder is essential for the node2vec model to use less RAM.
    try:
        if not os.path.exists("tmp"):
            os.makedirs("tmp")
    except OSError:
        print ("The folder could not be created!\n Please manually create the \"tmp\" folder in the directory")
    
    
    node=f"""
 
# importing related modules

from node2vec import Node2Vec
import networkx as nx

#importing  adjacency list file as B
B = nx.read_adjlist("{p}/my.adjlist")

seed_value=0


# Specifying the input and hyperparameters of the node2vec model
node2vec = Node2Vec(B, dimensions=32, walk_length=15, num_walks=100, workers=1,seed=seed_value,temp_folder = './tmp')  

#Assigning/specifying random seeds

import os
os.environ['PYTHONHASHSEED']=str(seed_value)

import random
random.seed(seed_value)

import numpy as np
np.random.seed(seed_value)


# creation of the model

model = node2vec.fit(window=10, min_count=1, batch_words=4,seed=seed_value)   


# saving the output vector

model.wv.save_word2vec_format("{p}/vectors.emb")

# save the model
model.save("{p}/vectorMODEL")

    """
    f = open("node.py", "w")
    f.write(node)
    f.close()
    !python node.py
    embeddings = np.loadtxt(f"{p}/vectors.emb", skiprows=1)
    a=cluster_embeddings_optimized(embeddings)




    
    # 2.4 Reshaping data
    vec = np.loadtxt(f"{p}/vectors.emb", skiprows=1)
    print("shape of vector file: ",vec.shape)
    print(vec)
    vec=vec[vec[:,0].argsort()]; 
    vec=vec[0:vec.shape[0],1:vec.shape[1]]
    
    

    
    ML_results = []
    k_clusters =len(a)
    algorithms = {}
    algorithms['KMeans'] =cluster.KMeans(n_clusters=k_clusters,random_state=10)
    second=time.time()
    for model in algorithms.values():
        model.fit(vec)
        ML_results=list(model.labels_)
        print(model,time.time()-second)
    


    ## 2.6 RESULTS 
    result={}
    for ii,i in enumerate(set(fp_lookup.values())):
        result[i]=ML_results[ii]
            
        
    ters={}
    for i in result:
        if result[i] not in ters:
            ters[result[i]]=[]
            ters[result[i]].append(i)
        else:
            ters[result[i]].append(i)
            
            
            
    final_results=[]
    for i in ters:
        final_results.append(ters[i])

        
    output_dir = f"./{p}/community_results/"
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        
    name=f"./{p}/community_results/Node2Vec_communities.csv"    
    with open(name, "w", newline='') as f:
        csv_writer = csv.writer(f)
        csv_writer.writerows(final_results)
    print(name, "file is ready!")

    main()
    

174416it [00:01, 159205.12it/s]
459it [00:00, 159224.67it/s]
988it [00:00, 145178.40it/s]
100%|██████████████████████████████████████████████████████████████████████| 174416/174416 [00:00<00:00, 374803.06it/s]
100%|████████████████████████████████████████████████████████████████████████████| 459/459 [00:00<00:00, 230671.64it/s]

Computing transition probabilities:   0%|          | 0/123 [00:00<?, ?it/s]
Computing transition probabilities:   7%|6         | 8/123 [00:00<00:01, 75.01it/s]
Computing transition probabilities:  14%|#3        | 17/123 [00:00<00:01, 80.25it/s]
Computing transition probabilities:  21%|##1       | 26/123 [00:00<00:01, 76.22it/s]
Computing transition probabilities:  28%|##7       | 34/123 [00:00<00:01, 73.98it/s]
Computing transition probabilities:  34%|###4      | 42/123 [00:00<00:01, 68.17it/s]
Computing transition probabilities:  41%|####      | 50/123 [00:00<00:01, 69.46it/s]
Computing transition probabilities:  48%|####7     | 59/123 [00:00<00:00, 74.78it/s]

En iyi küme sayısı aranıyor (3-20 arası)...
Seçilen küme sayısı: 20 (Score: 1786.7479)
shape of vector file:  (123, 33)
[[ 1.0400000e+02  2.2637676e-01  6.3999510e-03 ... -1.3807258e-01
   2.2428058e-01 -3.5540980e-01]
 [ 1.0300000e+02  3.4622487e-01  8.9824370e-02 ...  2.7656011e-02
  -7.7293360e-02 -9.2037074e-02]
 [ 7.6000000e+01  4.0894390e-01 -5.9875645e-02 ...  1.2728234e-01
  -1.1477421e-01 -1.3443314e-01]
 ...
 [ 2.6000000e+01  4.5844954e-01  1.3734044e-01 ... -2.9272008e-01
   4.0598315e-01 -1.5071204e-01]
 [ 2.7000000e+01  4.8750423e-02  1.8444265e-01 ... -5.1432200e-01
   6.9531040e-01 -1.1705779e-01]
 [ 4.3000000e+01  2.1000692e-01 -1.8139179e-01 ... -7.9801030e-01
   5.3324187e-01  2.4091307e-02]]
KMeans(n_clusters=20, random_state=10) 0.010541915893554688
././ValidationGeo/community_results/Node2Vec_communities.csv file is ready!
Loading graph...
Graph properties:
Number of nodes: 123
Number of edges: 3694
Graph loaded (Time: 0.01 seconds)

Running community detection alg

174416it [00:00, 195591.93it/s]
459it [00:00, 457506.07it/s]
988it [00:00, 247076.82it/s]
100%|██████████████████████████████████████████████████████████████████████| 174416/174416 [00:00<00:00, 394708.82it/s]
100%|████████████████████████████████████████████████████████████████████████████| 459/459 [00:00<00:00, 230726.93it/s]

Computing transition probabilities:   0%|          | 0/123 [00:00<?, ?it/s]
Computing transition probabilities:   7%|6         | 8/123 [00:00<00:01, 71.58it/s]
Computing transition probabilities:  15%|#4        | 18/123 [00:00<00:01, 83.33it/s]
Computing transition probabilities:  22%|##1       | 27/123 [00:00<00:01, 77.51it/s]
Computing transition probabilities:  29%|##9       | 36/123 [00:00<00:01, 78.21it/s]
Computing transition probabilities:  36%|###5      | 44/123 [00:00<00:01, 74.00it/s]
Computing transition probabilities:  43%|####3     | 53/123 [00:00<00:00, 77.76it/s]
Computing transition probabilities:  50%|#####     | 62/123 [00:00<00:00, 79.86it/s]

En iyi küme sayısı aranıyor (3-20 arası)...
Seçilen küme sayısı: 20 (Score: 1786.7479)
shape of vector file:  (123, 33)
[[ 1.0400000e+02  2.2637676e-01  6.3999510e-03 ... -1.3807258e-01
   2.2428058e-01 -3.5540980e-01]
 [ 1.0300000e+02  3.4622487e-01  8.9824370e-02 ...  2.7656011e-02
  -7.7293360e-02 -9.2037074e-02]
 [ 7.6000000e+01  4.0894390e-01 -5.9875645e-02 ...  1.2728234e-01
  -1.1477421e-01 -1.3443314e-01]
 ...
 [ 2.6000000e+01  4.5844954e-01  1.3734044e-01 ... -2.9272008e-01
   4.0598315e-01 -1.5071204e-01]
 [ 2.7000000e+01  4.8750423e-02  1.8444265e-01 ... -5.1432200e-01
   6.9531040e-01 -1.1705779e-01]
 [ 4.3000000e+01  2.1000692e-01 -1.8139179e-01 ... -7.9801030e-01
   5.3324187e-01  2.4091307e-02]]
KMeans(n_clusters=20, random_state=10) 0.012368917465209961
././ValidationWDE/community_results/Node2Vec_communities.csv file is ready!
Loading graph...
Graph properties:
Number of nodes: 123
Number of edges: 3694
Graph loaded (Time: 0.01 seconds)

Running community detection alg

67588605it [06:25, 175289.53it/s]
389it [00:00, ?it/s]
19878it [00:00, 181946.76it/s]
100%|██████████████████████████████████████████████████████████████████| 67588605/67588605 [03:06<00:00, 362289.84it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████| 389/389 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/59 [00:00<?, ?it/s]
Computing transition probabilities:  32%|###2      | 19/59 [00:00<00:00, 183.67it/s]
Computing transition probabilities:  64%|######4   | 38/59 [00:00<00:00, 182.69it/s]
Computing transition probabilities:  97%|#########6| 57/59 [00:00<00:00, 180.01it/s]
Computing transition probabilities: 100%|##########| 59/59 [00:00<00:00, 181.52it/s]

Generating walks (CPU: 1):   0%|          | 0/100 [00:00<?, ?it/s]
Generating walks (CPU: 1):  14%|#4        | 14/100 [00:00<00:00, 130.72it/s]
Generating walks (CPU: 1):  28%|##8       | 28/100 [00:00<00:00, 121.79it/s]
Generating walks (CPU: 1):  41%|####

En iyi küme sayısı aranıyor (3-20 arası)...
Seçilen küme sayısı: 20 (Score: 693.4006)
shape of vector file:  (59, 33)
[[ 5.80000000e+01 -2.61382980e-02  2.26462870e-01 ...  2.90218800e-01
   1.70179620e-02  1.13002160e-01]
 [ 5.70000000e+01 -6.31274660e-02  2.42057980e-01 ...  3.66515430e-01
  -2.03696820e-02  1.07730190e-01]
 [ 5.00000000e+01  1.07711870e-01  2.79857430e-01 ...  2.16392860e-01
   1.44559685e-02  4.11496530e-02]
 ...
 [ 3.30000000e+01  1.35819450e-01  4.03025500e-02 ...  4.37330750e-01
  -9.59635100e-02  4.06501470e-01]
 [ 3.60000000e+01  1.05246970e-01  9.58451260e-02 ...  4.83163680e-01
  -1.08766690e-01  3.48558340e-01]
 [ 4.00000000e+00 -2.12246400e-01  1.49387630e-01 ...  4.01545700e-01
  -5.18066060e-02 -9.40422200e-03]]
KMeans(n_clusters=20, random_state=10) 0.013186454772949219
././TrainingGeo/community_results/Node2Vec_communities.csv file is ready!
Loading graph...
Graph properties:
Number of nodes: 59
Number of edges: 1132
Graph loaded (Time: 0.01 seconds)



67588605it [06:16, 179417.72it/s]
389it [00:00, 390237.80it/s]
19878it [00:00, 204545.48it/s]
100%|██████████████████████████████████████████████████████████████████| 67588605/67588605 [02:59<00:00, 375877.38it/s]
100%|████████████████████████████████████████████████████████████████████████████| 389/389 [00:00<00:00, 431407.79it/s]

Computing transition probabilities:   0%|          | 0/59 [00:00<?, ?it/s]
Computing transition probabilities:  29%|##8       | 17/59 [00:00<00:00, 167.12it/s]
Computing transition probabilities:  66%|######6   | 39/59 [00:00<00:00, 194.44it/s]
Computing transition probabilities: 100%|##########| 59/59 [00:00<00:00, 198.68it/s]

Generating walks (CPU: 1):   0%|          | 0/100 [00:00<?, ?it/s]
Generating walks (CPU: 1):  13%|#3        | 13/100 [00:00<00:00, 126.07it/s]
Generating walks (CPU: 1):  26%|##6       | 26/100 [00:00<00:00, 122.36it/s]
Generating walks (CPU: 1):  39%|###9      | 39/100 [00:00<00:00, 117.98it/s]
Generating walks (CPU: 1):  52%|####

En iyi küme sayısı aranıyor (3-20 arası)...
Seçilen küme sayısı: 20 (Score: 693.4006)
shape of vector file:  (59, 33)
[[ 5.80000000e+01 -2.61382980e-02  2.26462870e-01 ...  2.90218800e-01
   1.70179620e-02  1.13002160e-01]
 [ 5.70000000e+01 -6.31274660e-02  2.42057980e-01 ...  3.66515430e-01
  -2.03696820e-02  1.07730190e-01]
 [ 5.00000000e+01  1.07711870e-01  2.79857430e-01 ...  2.16392860e-01
   1.44559685e-02  4.11496530e-02]
 ...
 [ 3.30000000e+01  1.35819450e-01  4.03025500e-02 ...  4.37330750e-01
  -9.59635100e-02  4.06501470e-01]
 [ 3.60000000e+01  1.05246970e-01  9.58451260e-02 ...  4.83163680e-01
  -1.08766690e-01  3.48558340e-01]
 [ 4.00000000e+00 -2.12246400e-01  1.49387630e-01 ...  4.01545700e-01
  -5.18066060e-02 -9.40422200e-03]]
KMeans(n_clusters=20, random_state=10) 0.012492895126342773
././TrainingWDE/community_results/Node2Vec_communities.csv file is ready!
Loading graph...
Graph properties:
Number of nodes: 59
Number of edges: 1132
Graph loaded (Time: 0.01 seconds)

