## Create Table Vulnerablity Characteristics

In [2]:
import os
import json
import re
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm


In [3]:

LANGUAGE = "cpp"
ROOT_DIR = f"data_{LANGUAGE}"

In [12]:
def find_nodes_by_line(dot_path, target_line):
    nodes = []
    with open(dot_path, 'r', encoding='utf-8') as f:
        for line in f:
            if f'LINE_NUMBER="{target_line}"' in line:
                # Extract node information here
                label_code_match = re.search(r'"(\d+)" \[label="(.*?)".*?CODE="(.*?)"', line)
                if label_code_match:
                    node_id, label, code = label_code_match.groups()
                    method_match = re.search(r'NAME="(.*?)"', line)
                    if method_match:
                        method_name = method_match.group(1).split(".")[-1]
                    else:
                        method_match = re.search(r'METHOD_FULL_NAME="(.*?)"', line) 
                        if method_match:
                            method_name = method_match.group(1).split(".")[-1]
                        else:
                            method_name = None
                    
                    nodes.append({
                        'id': node_id,
                        'label': label,
                        'code': code,
                        'method_name': method_name
                    })
    return nodes

def load_vulnerable_lines(sarif_path):
    with open(sarif_path, 'r', encoding='utf-8') as f:
        sarif = json.load(f)

    vulnerable_lines = set()
    for run in sarif.get('runs', []):
        for result in run.get('results', []):
            for loc in result.get('locations', []):
                region = loc.get('physicalLocation', {}).get('region', {})
                start_line = region.get('startLine')
                end_line = region.get('endLine', start_line)  # if no endLine, only 1 line
                if start_line:
                    for line in range(start_line, end_line + 1):
                        vulnerable_lines.add(line)
    return list(vulnerable_lines)

def analyze_project(project_name):
    # print(f"Analyzing {project_name}...")
    dot_path = os.path.join(ROOT_DIR, "cpg-output", project_name, "export.dot")
    sarif_path = os.path.join(ROOT_DIR, "unzips", project_name, "manifest.sarif")

    if not os.path.exists(dot_path) or not os.path.exists(sarif_path):
        print(f"Missing files for {project_name}")
        return []

    vulnerable_lines = load_vulnerable_lines(sarif_path)

    results = []
    for line_num in vulnerable_lines:
        nodes = find_nodes_by_line(dot_path, line_num)
        for node in nodes:
            results.append({
                # "project": project_name,
                "line": line_num,
                "node_label": node['label'],
                "node_code": node['code'],
                "method_name": node['method_name']
            })
    # print(results)
    # exit()
    return results


In [23]:
def create_table():
    all_nodes = []

    project_names = [p for p in os.listdir(os.path.join(ROOT_DIR, "unzips")) if p.endswith("-mixed")] or p.endswith("-bad")
    # project_names = project_names[:100]  # Test nhỏ nếu muốn

    # Use ThreadPoolExecutor
    with ThreadPoolExecutor(max_workers=10) as executor:
        # Submit all tasks first
        future_to_project = {executor.submit(analyze_project, project_name): project_name 
            for project_name in project_names}
        
        # Process completed tasks with progress bar
        for future in tqdm(as_completed(future_to_project), total=len(project_names), desc="Processing projects"):
            project_name = future_to_project[future]
            try:
                results = future.result()
                if results:
                    all_nodes.extend(results)
            except Exception as e:
                print(f"Project {project_name} generated an exception: {e}")

    print("DONE analyzing all projects")
    # Stats by label and method
    label_stats = {}
    method_stats = {}
    code_by_label = {}
    code_by_method = {}
    exclude = ["BLOCK", "TYPE_REF", "LITERAL", "<empty>", "RET", "try", "c", "e", "i", "cfg", "Tracer", "class", "reader", "line", "ioe", "ie"]
    for node in all_nodes:
        label = node['node_label']
        method = node['method_name']
        code = node['node_code']
        if code in exclude or label in exclude or method in exclude:
            continue
        if label not in label_stats:
            label_stats[label] = 0
            code_by_label[label] = []
        label_stats[label] += 1
        code_by_label[label].append(code)
        if method is None:
            continue
        if method not in method_stats:
            method_stats[method] = 0
            code_by_method[method] = []
        method_stats[method] += 1
        code_by_method[method].append(code)

    # Create data for DataFrame
    data = []
    
    # Add method stats
    for method, count in method_stats.items():
        data.append({
            'Criterion': 'method',
            'Node Type': method,
            'Count': count,
            'Code Examples': '; '.join(code_by_method[method])
        })
    
    # Add label stats  
    for label, count in label_stats.items():
        data.append({
            'Criterion': 'label',
            'Node Type': label, 
            'Count': count,
            'Code Examples': '; '.join(code_by_label[label])
        })
    return data

In [24]:
data = create_table()
df = pd.DataFrame(data)
# Sort by Count in descending order
df = df.sort_values('Count', ascending=False)
# Save to CSV
df.to_csv(f"{ROOT_DIR}/vuln-char.csv", index=False)

Processing projects: 100%|██████████| 3493/3493 [00:06<00:00, 521.24it/s]


DONE analyzing all projects


In [25]:
# Read vulnerability characteristics data
df = pd.read_csv(f"{ROOT_DIR}/vuln-char.csv")
df

Unnamed: 0,Criterion,Node Type,Count,Code Examples
0,label,CALL,6234,classTwo->intTwo = 10; classTwo->intTwo; class...
1,label,IDENTIFIER,5648,classTwo; classTwo; classTwo; classTwo; classT...
2,method,data,2931,data; data; data; data; data; data; data; data...
3,label,LOCAL,1662,wchar_t; NULL; NULL; char; char; char; char; c...
4,method,delete,1350,delete name; delete [] data; delete [] name; d...
...,...,...,...,...
119,method,exit:<unresolvedSignature>(1),2,exit(-1); exit(-1)
120,method,printWLine:<unresolvedSignature>(1),2,printWLine(data); printWLine(data)
121,method,cpp:33:33:SNPRINTF:0,2,"SNPRINTF(dest, 100-1, data); SNPRINTF(dest, 10..."
122,method,swprintf:<unresolvedSignature>(4),1,"swprintf(dest, wcslen(data), L\"


In [26]:
# Load data from vuln-char.csv
vuln_char_df = pd.read_csv(ROOT_DIR + '/vuln-char.csv')

# Filter nodes with Count > 100
filtered_df = vuln_char_df[vuln_char_df['Count'] > 50]

# Define function to map nodes to vulnerability characteristics
def map_node_to_characteristic(node_type, code_example):
    node_type = str(node_type).lower()
    code_example = str(code_example).lower()

    if "call" in node_type:
        return "Function calls"
    if "field" in node_type or "access" in node_type:
        return "Access a field of an object of aggregate type"
    if "identifier" in node_type:
        return "Decide the type of the variable"
    if "assign" in node_type:
        return "Assign values to variables"
    if "array" in node_type:
        return "Use an array"
    if "alloc" in node_type or "free" in node_type:
        return "Open or discard a memory space"
    if "cast" in node_type or "instanceof" in code_example:
        return "Type casting and downcasting"
    if any(eq in node_type for eq in ["assignment", "assignmentPlus", "assignmentMinus"]):
        return "Assign values to variables"

    if "control_structure" in node_type:
        return "Relate to control flow and code structure of the project"
    if "logical" in node_type:
        return "Conduct a boolean/logical/comparison operation"
    
    # logical_ops = ["&&", "||", "!", "==", "!=", "<", ">", "<=", ">="]
    # comparison_ops = ["equals", "notEquals", "greaterEqualsThan", "lessEqualsThan", "greaterThan", "lessThan"]
    # if any(boolop in code_example for boolop in logical_ops) or any(boolop in node_type for boolop in comparison_ops):
    #     return "Conduct a boolean/logical/comparison operation"
    
    if any(op in node_type for op in ["addition", "subtraction", "multiplication", "division"]):
        return "Conduct an arithmetic calculation"
 # or any(i in code_example for i in try_catch)
    try_catch = ["try", "catch", "throw"]
    if any(i in node_type for i in try_catch) :
        return "Exception handling"
    return None

# Assign vulnerability characteristics
filtered_df = filtered_df.copy()
filtered_df['Vulnerability Characteristics'] = filtered_df.apply(
    lambda row: map_node_to_characteristic(row['Node Type'], row['Code Examples']),
    axis=1
)

# Remove rows with no characteristics
filtered_df = filtered_df[filtered_df['Vulnerability Characteristics'].notna()]

# Group by Vulnerability Characteristic and sum the counts
final_table = []

for characteristic in filtered_df['Vulnerability Characteristics'].unique():
    group = filtered_df[filtered_df['Vulnerability Characteristics'] == characteristic]
    total_count = group['Count'].sum()
    node_types = group['Node Type'].tolist()
    example_codes = group['Code Examples'].tolist()

    # Join node types and example codes with "/"
    node_type_str = " / ".join(node_types)
    example_code_str = " / ".join(example_codes)

    final_table.append({
        "Vulnerability Characteristics": characteristic,
        "Total Count": total_count,
        "Node Type": node_type_str,
        "Example Code": example_code_str
    })

# Convert to DataFrame
final_df = pd.DataFrame(final_table)

# Sort by Total Count in descending order
final_df = final_df.sort_values('Total Count', ascending=False)

# Save to CSV
final_df.to_csv(ROOT_DIR + '/vuln-char-table-final.csv', index=False)
print("✅ Created '/vuln-char-table.csv' with total counts for each characteristic!")


✅ Created '/vuln-char-table.csv' with total counts for each characteristic!


In [7]:
df = pd.read_csv(f"{ROOT_DIR}/vuln-char-table-final.csv")
# Read vulnerability characteristics data
list(df["Node Type"])

['CALL',
 'IDENTIFIER',
 'free:<unresolvedSignature>(1) / malloc:<unresolvedSignature>(1)',
 'indirectIndexAccess / FIELD_IDENTIFIER / fieldAccess / indirectFieldAccess',
 'assignment',
 'multiplication',
 'cast',
 'CONTROL_STRUCTURE']

In [11]:
# INPUT: file export.dot
# Output: {projectname, list[nodesID]}
# chọn ra các nnodes có đặc tính thuộc bảng, đếm số lượng thuộc tính thuọc bảng
# sắp xết giảm dần các nodes có số lượng thuộc tính thuộc bảng 
# Kiểm tra các nodes có được có kết nối vớis nhau hay không
# TH1: Có kết nối một phần -> coi các nodes có kết nối là 1 center nodes 
#   -> từng nodes thành phần lấy neibor 1 thành phần
# TH2: không có kết nối -> nodes nào độc lập thì nodes đó là 1 center node
import pydot
import networkx as nx
import csv

In [12]:
# Step 1: Load vulnerability characteristics
df_char = pd.read_csv("vuln-char-table-final.csv")
valid_node_types = set()
for item in df_char['Node Type'].dropna().unique():
    # Assume item is a string like "['TYPE1', 'TYPE2']" or "['TYPE1']"
    try:
        # Use regex to find all quoted strings within the brackets
        types = re.findall(r"'(.*?)'", item)
        # Handle cases like ['TYPE'] without quotes or just TYPE
        if not types:
             # Try removing brackets and splitting if it looks like a list
             if isinstance(item, str) and item.startswith('[') and item.endswith(']'):
                 cleaned_item = item.strip('[]')
                 # Split by comma, strip whitespace from each part
                 types = [t.strip() for t in cleaned_item.split(',') if t.strip()]
             elif isinstance(item, str): # Assume it's a single type if not bracketed
                 types = [item.strip()]

        if types:
           valid_node_types.update(types)
        elif isinstance(item, str): # Fallback if regex and list parsing fail
            single_type = item.strip()
            if single_type: valid_node_types.add(single_type)

    except Exception as e:
        print(f"Warning: Could not parse node type string: {item} - {e}")
        # Optional: Add raw string on failure?
        # if isinstance(item, str): valid_node_types.add(item.strip())

# print(valid_node_types) # Print the full set if needed for debugging
# Print a sample to confirm parsing looks reasonable
print(f"Sample parsed valid_node_types: {set(list(valid_node_types)[:20]) if valid_node_types else 'Set is empty'}")

Sample parsed valid_node_types: {'IDENTIFIER', 'CALL', 'CatchClause', 'alloc', 'logicalNot', 'addition', 'stonesoup_array', 'assignment', 'CONTROL_STRUCTURE', 'cast', 'logicalAnd', 'fieldAccess', 'FIELD_IDENTIFIER', 'indexAccess', 'arrayInitializer'}


In [13]:
# Step 2: Helper functions
def parse_dot_file(dot_path):
    nodes = {}
    with open(dot_path, 'r', encoding='utf-8') as f:
        for line in f:
            if 'LABEL=' in line or 'NAME=' in line or 'METHOD_FULL_NAME=' in line:
                node_match = re.search(r'"(\d+)" \[', line)
                # print(node_match)
                if not node_match:
                    continue
                node_id = node_match.group(1)
                label_match = re.search(r'label="(.*?)"', line)
                name_match = re.search(r'NAME="(.*?)"', line)
                method_full_name_match = re.search(r'METHOD_FULL_NAME="(.*?)"', line)
                # nếu có label
                label = label_match.group(1) if label_match else ''
                # nếu co NAME
                method = name_match.group(1) if name_match else None
                # không có NAME thì lấy METHOD_FULL_NAME
                if method is None and method_full_name_match:
                    method = method_full_name_match.group(1).split('.')[-1]
                # print(f"Node ID: {node_id}, Label: {label}, Method: {method}")
                # nếu label nằm trong valid_node_types thì thêm vào dict
                if label in valid_node_types:
                    # nếu có method và method nằm trong valid_node_types
                    if (method and method in valid_node_types):
                        nodes[node_id] = {
                            'label': label,
                            'method': method
                        }
                    # elif method is None:
                    else:
                        nodes[node_id] = {
                            'label': label
                        }
                        
    dot_graph = pydot.graph_from_dot_file(dot_path)[0]
    G = nx.drawing.nx_pydot.from_pydot(dot_graph)
    # print(G)
    return nodes, G



In [14]:
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np

def render_final_centers_clustered(G, final_groups, output_path="graph_clusters.png"): # Renamed final_centers -> final_groups
    if not G: # Add check for None graph
        print("Error: Cannot render graph because the graph object is None.")
        return

    plt.figure(figsize=(12, 10))
    pos = {}
    cluster_spacing = 5

    # Filter groups to only include nodes present in G
    valid_groups = []
    nodes_in_G = set(G.nodes())
    for group in final_groups:
        # Ensure all elements in group are strings before checking presence in nodes_in_G
        # Networkx nodes are typically strings when read from dot files
        valid_group = [str(node) for node in group if str(node) in nodes_in_G]
        if valid_group: # Only keep groups that have at least one valid node
            valid_groups.append(valid_group)
        #else:
             # Optionally print a warning about discarded empty groups
             # print(f"Warning: Discarding group {group} as none of its nodes exist in the graph G.")

    if not valid_groups: # Check if any valid groups remain
         print("Warning: No valid groups found containing nodes present in the graph G. Cannot render.")
         plt.close() # Close the empty figure
         return

    # Generate colors based on the number of valid groups
    try:
        # Handle case where len(valid_groups) might be 0, although checked above
        num_colors = max(1, len(valid_groups))
        colors = plt.cm.get_cmap('tab20', num_colors)
    except Exception as e:
        print(f"Error generating colormap: {e}. Using default color.")
        # Fallback color or handle differently
        colors = lambda idx: 'blue' # Simple fallback


    for idx, group in enumerate(valid_groups): # Iterate through valid groups
        # Create subgraph for the current valid group
        try:
            # Ensure nodes in group are suitable for subgraph (should be strings)
            subG = G.subgraph(group) # This should succeed now

            # Layout within the small group
            if subG.number_of_nodes() > 0:
                 # Add try-except around layout calculation
                 try:
                    # Use a layout that handles disconnected components if necessary,
                    # though spring_layout usually works.
                    sub_pos = nx.spring_layout(subG, seed=idx)
                 except Exception as layout_err:
                     print(f"Error calculating layout for group {idx+1}: {layout_err}. Skipping group.")
                     continue # Skip to next group if layout fails
            else:
                 # print(f"Skipping empty subgraph for group {idx+1}") # Debugging line
                 continue # Skip this group if subgraph is unexpectedly empty

            # Shift the entire group to a distinct region
            shift_x = (idx % 5) * cluster_spacing
            shift_y = -(idx // 5) * cluster_spacing

            for node, (x, y) in sub_pos.items():
                pos[str(node)] = (x + shift_x, y + shift_y) # Ensure pos keys are strings

            # Draw nodes for the group - Ensure nodelist=group only contains nodes in pos
            drawable_nodes = [node for node in group if node in pos]
            if drawable_nodes:
                 nx.draw_networkx_nodes(G, pos,
                                       nodelist=drawable_nodes,
                                       node_color=[colors(idx)], # Use node_color instead of color
                                      #  label=f'Group {idx+1}', # Labeling nodes directly might clutter legend
                                       node_size=300)
            # else:
                 # Optionally print warning if no nodes could be positioned/drawn
                 # print(f"Warning: No drawable nodes for group {idx+1}")

        except Exception as e:
             # Catch potential key errors if a node ID format mismatch occurs despite filtering
             print(f"Error processing/drawing group {idx+1} ({group}): {e}")
             continue # Skip to the next group on error

    # Draw edges and labels after all positions are calculated
    try:
         # Draw edges only between nodes that have positions calculated
         drawable_edges = [(u, v) for u, v in G.edges() if str(u) in pos and str(v) in pos]
         nx.draw_networkx_edges(G, pos, edgelist=drawable_edges, alpha=0.3)
         # Draw labels only for nodes that have positions
         nx.draw_networkx_labels(G, pos, font_size=8) # pos keys are guaranteed to be strings
    except Exception as e:
         print(f"Error drawing edges or labels: {e}")

    plt.axis('off')
    # Handle legend warning if no labels were successfully generated
    # The node drawing doesn't add handles/labels suitable for plt.legend() by default this way
    # If a legend per group is desired, it needs a different approach (e.g., plotting dummy points)
    # try:
    #      handles, labels = plt.gca().get_legend_handles_labels()
    #      if handles:
    #          plt.legend()
    # except Exception as e:
    #     print(f"Error creating legend: {e}")

    plt.title("Clustered Graph Visualization")
    # Use tight_layout cautiously, can sometimes cause issues
    try:
        plt.tight_layout()
    except Exception as e:
        print(f"Warning: tight_layout failed: {e}")

    try:
         plt.savefig(output_path, dpi=300)
         print(f"Graph saved to {output_path}")
    except Exception as e:
         print(f"Error saving graph image to {output_path}: {e}")

    plt.close() # Ensure figure is closed

In [15]:
def process_project(dot_path):
    nodes, G = parse_dot_file(dot_path) # G contains all nodes and edges

    # Chia thành strong và weak nodes
    strong_nodes = {nid: info for nid, info in nodes.items() if len(info) == 2}
    weak_nodes = {nid: info for nid, info in nodes.items() if len(info) < 2}
    print(f"Number of strong nodes: {len(strong_nodes)}")
    print(f"Number of weak nodes: {len(weak_nodes)}")

    final_groups = [] # Store groups (lists of nodes) instead of just centers

    # --- Handle STRONG nodes ---
    # Mỗi strong node tìm weak node nối trực tiếp
    center_to_children = {}
    for center_id in strong_nodes.keys():
        neighbors = set(G.neighbors(center_id))
        linked_weak_nodes = neighbors.intersection(weak_nodes.keys())
        if linked_weak_nodes:
            center_to_children[center_id] = linked_weak_nodes

    assigned_weak_nodes = set()
    for linked in center_to_children.values():
        assigned_weak_nodes.update(linked)

    # Add strong centers and their children
    for center_id in strong_nodes.keys():
        if center_id in center_to_children:
            group = [center_id] + list(center_to_children[center_id])
            final_groups.append(group)
        else:
            final_groups.append([center_id])

    # --- Handle WEAK nodes ---
    # We need the subgraph of weak nodes to find connections among them
    # Ensure weak_nodes keys are actually in G before creating subgraph
    valid_weak_nodes = set(weak_nodes.keys()) & set(G.nodes())
    weak_subG_undirected = None
    if not valid_weak_nodes:
        print("No valid weak nodes found in the graph.")
        remaining_weak_nodes = set()
    else:
        weak_subG = G.subgraph(valid_weak_nodes)
        weak_subG_undirected = weak_subG.to_undirected() # Use undirected for neighbor finding
        remaining_weak_nodes = set(weak_subG_undirected.nodes()) - assigned_weak_nodes

    weak_groups_count = 0
    while remaining_weak_nodes:
        # Đếm số neighbor trong remaining_weak_nodes for each node
        neighbor_counts = {}
        for node in remaining_weak_nodes:
            # Neighbors within the weak subgraph AND still remaining
            # Ensure neighbor calculation handles potential disconnections if weak_subG_undirected is empty/invalid
            if weak_subG_undirected and node in weak_subG_undirected:
                neighbors = set(weak_subG_undirected.neighbors(node)) & remaining_weak_nodes
                neighbor_counts[node] = len(neighbors)
            else: # Node might be isolated or graph might be empty
                neighbor_counts[node] = 0

        # Identify isolated nodes within the remaining set
        isolated_nodes = {node for node in remaining_weak_nodes if neighbor_counts.get(node, 0) == 0}
        non_isolated_nodes = remaining_weak_nodes - isolated_nodes

        # Process isolated nodes first
        for node in isolated_nodes:
             final_groups.append([node]) # Each isolated node is its own group
             weak_groups_count += 1

        remaining_weak_nodes -= isolated_nodes # Remove processed isolated nodes

        if not remaining_weak_nodes: # Exit if only isolated nodes were left or the set is now empty
            break

        # Chọn node nhiều neighbor nhất làm center (within the remaining non-isolated weak nodes)
        center_node = max(non_isolated_nodes, key=lambda x: neighbor_counts.get(x, -1)) # Use get with default

        # This center's group includes itself and its immediate neighbors within the remaining weak nodes
        # Need to ensure weak_subG_undirected is valid before calling neighbors
        if weak_subG_undirected and center_node in weak_subG_undirected:
             neighbors = set(weak_subG_undirected.neighbors(center_node)) & remaining_weak_nodes # Check against remaining
             group = set([center_node]) | neighbors
        else: # Should not happen if non_isolated_nodes is correct, but safety check
             group = set([center_node])

        final_groups.append(list(group)) # Add the identified group
        weak_groups_count += 1

        # Xoá the entire group (center and its neighbors) ra khỏi remaining_nodes
        remaining_weak_nodes -= group

    final_centers = []
    for group in final_groups:
        center = group[0]
        final_centers.append(center)
    # Return the list of groups and the full graph G for visualization
    return final_centers

In [16]:
def process_project_hybrid(dot_path):
    """
    Improved center node selection using a hybrid approach that respects connected components
    and uses centrality measures to select the most appropriate node from each component.
    
    Args:
        dot_path: Path to the dot file
        
    Returns:
        List of center node IDs
    """
    # Step 1: Parse the dot file to get nodes and graph
    nodes, G = parse_dot_file(dot_path)
    
    # Step 2: Identify strong and weak nodes
    strong_nodes = {nid: info for nid, info in nodes.items() if len(info) == 2}
    weak_nodes = {nid: info for nid, info in nodes.items() if len(info) < 2}
    
    print(f"Number of strong nodes: {len(strong_nodes)}")
    print(f"Number of weak nodes: {len(weak_nodes)}")
    
    # Step 3: Strong nodes are always centers
    final_centers = list(strong_nodes.keys())
    
    # Step 4: Identify weak nodes directly connected to strong nodes
    connected_to_strong = set()
    for strong_id in strong_nodes:
        neighbors = set(G.neighbors(strong_id)) & set(weak_nodes.keys())
        connected_to_strong.update(neighbors)
    
    print(f"Weak nodes directly connected to strong nodes: {len(connected_to_strong)}")
    
    # Step 5: Handle remaining weak nodes not connected to strong nodes
    remaining_weak = set(weak_nodes.keys()) - connected_to_strong
    print(f"Remaining weak nodes to process: {len(remaining_weak)}")
    
    # Step 6: Create subgraph of remaining weak nodes
    weak_subG = G.subgraph(remaining_weak)
    weak_undirected = weak_subG.to_undirected()
    
    # Step 7: Find connected components (these are our natural groups)
    components = list(nx.connected_components(weak_undirected))
    print(f"Found {len(components)} connected components in weak nodes subgraph")
    
    # Step 8: Process each component to select a center
    component_centers = []
    for i, component in enumerate(components):
        if not component:  # Skip empty components
            continue
            
        print(f"Processing component {i+1} with {len(component)} nodes")
        component_subG = weak_undirected.subgraph(component)
        
        if len(component) <= 3:  # For small components, just pick any node
            center = next(iter(component))
            print(f"  Small component: selecting {center} as center")
        else:
            # For larger components, use centrality measures
            try:
                # Try betweenness centrality first (identifies bridge nodes)
                centrality = nx.betweenness_centrality(component_subG)
                center_type = "betweenness centrality"
            except:
                # Fallback to degree centrality
                centrality = nx.degree_centrality(component_subG)
                center_type = "degree centrality"
            
            center = max(centrality, key=centrality.get)
            print(f"  Large component: selecting {center} as center based on {center_type}")
        
        component_centers.append(center)
    
    # Add component centers to final centers
    final_centers.extend(component_centers)
    
    print(f"Total centers: {len(final_centers)} (Strong: {len(strong_nodes)}, Weak: {len(final_centers) - len(strong_nodes)})")
    return final_centers

In [None]:
# Process all projects and collect results using multiprocessing
from multiprocessing import Pool, Manager

def process_project_wrapper(project):
    project_path = os.path.join(dot_folder, project)
    project_results = []
    project_stats = []
    
    for file in os.listdir(project_path):
        if file.endswith("export.dot"):
            dot_path = os.path.join(project_path, file)
            
            try:
                center_nodes = process_project(dot_path)
                project_results.append({
                    'project': project,
                    'center_nodes': center_nodes
                })
                project_stats.append({
                    'project': project,
                    'num_center_nodes': len(center_nodes)
                })
            except Exception as e:
                print(f"Error processing {dot_path}: {e}")
                continue
                
    return project_results, project_stats

dot_folder = ROOT_DIR + "/cpg-output"
projects = [p for p in os.listdir(dot_folder)][:5]

# Use multiprocessing to process projects in parallel
with Pool() as pool:
    all_results = list(tqdm(pool.imap(process_project_wrapper, projects), 
                          total=len(projects),
                          desc="Processing projects"))

# Combine results from all processes
results = []
stats = []
for project_results, project_stats in all_results:
    results.extend(project_results)
    stats.extend(project_stats)

# Save stats to CSV
csv_path = ROOT_DIR + "/center_nodes_stats.csv"
with open(csv_path, "w", newline='', encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=["project", "num_center_nodes"])
    writer.writeheader()
    for row in stats:
        writer.writerow(row)

# Save full results to JSON  
with open(ROOT_DIR + "/center_nodes_result.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)

print("✅ Done! Saved to center_nodes_result.json")