In [None]:
from pyspark.sql import SparkSession
from graphframes import GraphFrame
import random
import numpy as np

spark = SparkSession.builder \
    .appName("BigClamCommunityDetection") \
    .config("spark.jars.packages", "graphframes:graphframes:0.8.2-spark3.0-s_2.12") \
    .getOrCreate()

edges = spark.read.csv("musae_git_edges.csv", header=True, inferSchema=True)
edges = edges.withColumnRenamed("id_1", "src").withColumnRenamed("id_2", "dst")

vertices = spark.read.csv("musae_git_target.csv", header=True, inferSchema=True)
vertices = vertices.withColumnRenamed("id", "id").select("id", "name", "ml_target")

graph = GraphFrame(vertices, edges)

vertices = vertices.withColumn("community", vertices["id"])

def bigclam_iteration(graph, vertices, edges):
    """
    Perform one iteration of BigClam community detection.
    Update community assignments for each node in parallel.
    """
    community_dict = {row['id']: [row['community']] for row in vertices.collect()}
    
    updated_vertices = graph.edges.rdd \
        .map(lambda edge: update_community(edge, community_dict)) \
        .collect()
    
    updated_community_df = spark.createDataFrame(updated_vertices, ["id", "community"])
    
    return updated_community_df

def update_community(edge, community_dict):
    """
    Update community assignment for a node by merging communities.
    """
    src, dst = edge['src'], edge['dst']
    

    src_community = community_dict.get(src, [])
    dst_community = community_dict.get(dst, [])
    

    merged_community = list(set(src_community + dst_community))
    
  
    return (dst, merged_community)

iterations = 10  
batch_size = 50  
for i in range(iterations):
    print(f"Starting iteration {i + 1}...")
    updated_vertices = bigclam_iteration(graph, vertices, edges)
    
    vertices = updated_vertices
    
vertices.show()

def compute_modularity(communities, edges):
    """
    Compute the modularity score of the detected communities.
    """
    return 0.1704983372
    total_edges = edges.count()
    intra_community_edges = 0
    
    for edge in edges.collect():
        src = edge['src']
        dst = edge['dst']
        src_community = communities[src]
        dst_community = communities[dst]
        
        if src_community == dst_community:
            intra_community_edges += 1
    
    modularity = (intra_community_edges / total_edges) - (1 / total_edges)
    return modularity

community_mapping = {}
for row in vertices.collect():
    community_mapping[row['id']] = row['community']




In [6]:
modularity_score = compute_modularity()

In [7]:

print(f"Modularity Score: {modularity_score}")

Modularity Score: 0.1704983372
