## Create Table Vulnerablity Characteristics

In [76]:
import os
import json
import re
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

In [77]:

LANGUAGE = "java"
ROOT_DIR = f"data_{LANGUAGE}"

In [None]:

def find_nodes_by_line(dot_path, target_line):
    nodes = []
    with open(dot_path, 'r', encoding='utf-8') as f:
        for line in f:
            if f'LINE_NUMBER="{target_line}"' in line:
                # Extract node information here
                label_code_match = re.search(r'"(\d+)" \[label="(.*?)".*?CODE="(.*?)"', line)
                if label_code_match:
                    node_id, label, code = label_code_match.groups()
                    method_match = re.search(r'NAME="(.*?)"', line)
                    if method_match:
                        method_name = method_match.group(1).split(".")[-1]
                    else:
                        method_match = re.search(r'METHOD_FULL_NAME="(.*?)"', line) 
                        if method_match:
                            method_name = method_match.group(1).split(".")[-1]
                        else:
                            method_name = None
                    
                    nodes.append({
                        'id': node_id,
                        'label': label,
                        'code': code,
                        'method_name': method_name
                    })
    return nodes

def load_vulnerable_lines(sarif_path):
    with open(sarif_path, 'r', encoding='utf-8') as f:
        sarif = json.load(f)

    vulnerable_lines = set()
    for run in sarif.get('runs', []):
        for result in run.get('results', []):
            for loc in result.get('locations', []):
                region = loc.get('physicalLocation', {}).get('region', {})
                start_line = region.get('startLine')
                end_line = region.get('endLine', start_line)  # if no endLine, only 1 line
                if start_line:
                    for line in range(start_line, end_line + 1):
                        vulnerable_lines.add(line)
    # print(vulnerable_lines)
    # exit()
    return list(vulnerable_lines)

def analyze_project(project_name):
    # print(f"Analyzing {project_name}...")
    dot_path = os.path.join(ROOT_DIR, "cpg-output", project_name, "export.dot")
    sarif_path = os.path.join(ROOT_DIR, "unzips", project_name, "manifest.sarif")

    if not os.path.exists(dot_path) or not os.path.exists(sarif_path):
        print(f"Missing files for {project_name}")
        return []

    vulnerable_lines = load_vulnerable_lines(sarif_path)

    results = []
    for line_num in vulnerable_lines:
        nodes = find_nodes_by_line(dot_path, line_num)
        for node in nodes:
            results.append({
                # "project": project_name,
                "line": line_num,
                "node_label": node['label'],
                "node_code": node['code'],
                "method_name": node['method_name']
            })
    # print(results)
    # exit()
    return results


In [3]:

def create_table():
        all_nodes = []

    project_names = [p for p in os.listdir(os.path.join(ROOT_DIR, "unzips")) if p.endswith("-bad")]
    # project_names = project_names[:100]  # Test nhỏ nếu muốn

    # Use ThreadPoolExecutor
    with ThreadPoolExecutor(max_workers=10) as executor:
        # Submit all tasks first
        future_to_project = {executor.submit(analyze_project, project_name): project_name 
            for project_name in project_names}
        
        # Process completed tasks with progress bar
        for future in tqdm(as_completed(future_to_project), total=len(project_names), desc="Processing projects"):
            project_name = future_to_project[future]
            try:
                results = future.result()
                if results:
                    all_nodes.extend(results)
            except Exception as e:
                print(f"Project {project_name} generated an exception: {e}")

    print("DONE analyzing all projects")
    # Stats by label and method
    label_stats = {}
    method_stats = {}
    code_by_label = {}
    code_by_method = {}
    exclude = ["BLOCK", "TYPE_REF", "LITERAL", "<empty>", "RET", "try", "c", "e", "i", "cfg", "Tracer", "class", "reader", "line", "ioe", "ie"]
    for node in all_nodes:
        label = node['node_label']
        method = node['method_name']
        code = node['node_code']
        if code in exclude or label in exclude or method in exclude:
            continue
        if label not in label_stats:
            label_stats[label] = 0
            code_by_label[label] = []
        label_stats[label] += 1
        code_by_label[label].append(code)
        if method is None:
            continue
        if method not in method_stats:
            method_stats[method] = 0
            code_by_method[method] = []
        method_stats[method] += 1
        code_by_method[method].append(code)

    # Create data for DataFrame
    data = []
    
    # Add method stats
    for method, count in method_stats.items():
        data.append({
            'Criterion': 'method',
            'Node Type': method,
            'Count': count,
            'Code Examples': '; '.join(code_by_method[method])
        })
    
    # Add label stats  
    for label, count in label_stats.items():
        data.append({
            'Criterion': 'label',
            'Node Type': label, 
            'Count': count,
            'Code Examples': '; '.join(code_by_label[label])
        })

In [None]:
create_table()

In [None]:

    df = pd.DataFrame(data)
    # Sort by Count in descending order
    df = df.sort_values('Count', ascending=False)
    # Save to CSV
    df.to_csv(f"{ROOT_DIR}/vuln-char.csv", index=False)


In [3]:
# Read vulnerability characteristics data
df = pd.read_csv(f"{ROOT_DIR}/vuln-char.csv")
df

Unnamed: 0,Criterion,Node Type,Count,Code Examples
0,label,CALL,46035,int factor = (1 << 31) % random; (1 << 31) % r...
1,label,IDENTIFIER,39122,factor; random; Tracer; factor; counter; count...
2,label,FIELD_IDENTIFIER,6465,vowlessInferentialist; length; splurgeZoograft...
3,method,fieldAccess,6465,GenericController.vowlessInferentialist; stone...
4,method,assignment,5559,int factor = (1 << 31) % random; char counter ...
...,...,...,...,...
2619,method,overpersuasionEellike,1,overpersuasionEellike
2620,method,galaginaeTribracteolate,1,galaginaeTribracteolate
2621,method,hyperalgesisAnticreeper,1,hyperalgesisAnticreeper
2622,method,untrillIdleness,1,untrillIdleness


In [None]:
# Load data from vuln-char.csv
vuln_char_df = pd.read_csv(ROOT_DIR + '/vuln-char.csv')

# Filter nodes with Count > 100
filtered_df = vuln_char_df[vuln_char_df['Count'] > 100]

# Define function to map nodes to vulnerability characteristics
def map_node_to_characteristic(node_type, code_example):
    node_type = str(node_type).lower()
    code_example = str(code_example).lower()

    if "call" in node_type:
        return "Function calls"
    if "field" in node_type or "access" in node_type:
        return "Access a field of an object of aggregate type"
    if "identifier" in node_type:
        return "Decide the type of the variable"
    if "assign" in node_type:
        return "Assign values to variables"
    if "array" in node_type:
        return "Use an array"
    if "alloc" in node_type or "free" in node_type:
        return "Open or discard a memory space"
    if "cast" in node_type or "instanceof" in code_example:
        return "Type casting and downcasting"
    if any(eq in node_type for eq in ["assignment", "assignmentPlus", "assignmentMinus"]):
        return "Assign values to variables"

    if "control_structure" in node_type:
        return "Relate to control flow and code structure of the project"
    if "logical" in node_type:
        return "Conduct a boolean/logical/comparison operation"
    
    # logical_ops = ["&&", "||", "!", "==", "!=", "<", ">", "<=", ">="]
    # comparison_ops = ["equals", "notEquals", "greaterEqualsThan", "lessEqualsThan", "greaterThan", "lessThan"]
    # if any(boolop in code_example for boolop in logical_ops) or any(boolop in node_type for boolop in comparison_ops):
    #     return "Conduct a boolean/logical/comparison operation"
    
    if any(op in node_type for op in ["addition", "subtraction", "multiplication", "division"]):
        return "Conduct an arithmetic calculation"
 # or any(i in code_example for i in try_catch)
    try_catch = ["try", "catch", "throw"]
    if any(i in node_type for i in try_catch) :
        return "Exception handling"
    return None

# Assign vulnerability characteristics
filtered_df = filtered_df.copy()
filtered_df['Vulnerability Characteristics'] = filtered_df.apply(
    lambda row: map_node_to_characteristic(row['Node Type'], row['Code Examples']),
    axis=1
)

# Remove rows with no characteristics
filtered_df = filtered_df[filtered_df['Vulnerability Characteristics'].notna()]

# Group by Vulnerability Characteristic and sum the counts
final_table = []

for characteristic in filtered_df['Vulnerability Characteristics'].unique():
    group = filtered_df[filtered_df['Vulnerability Characteristics'] == characteristic]
    total_count = group['Count'].sum()
    node_types = group['Node Type'].tolist()
    example_codes = group['Code Examples'].tolist()

    # Join node types and example codes with "/"
    node_type_str = " / ".join(node_types)
    example_code_str = " / ".join(example_codes)

    final_table.append({
        "Vulnerability Characteristics": characteristic,
        "Total Count": total_count,
        "Node Type": node_type_str,
        "Example Code": example_code_str
    })

# Convert to DataFrame
final_df = pd.DataFrame(final_table)

# Sort by Total Count in descending order
final_df = final_df.sort_values('Total Count', ascending=False)

# Save to CSV
final_df.to_csv(ROOT_DIR + '/formatted_vulnerability_characteristics.csv', index=False)
print("✅ Created 'formatted_vulnerability_characteristics.csv' with total counts for each characteristic!")


✅ Created 'formatted_vulnerability_characteristics.csv' with total counts for each characteristic!


In [91]:
df = pd.read_csv(f"{ROOT_DIR}/formatted_vulnerability_characteristics.csv")
# Read vulnerability characteristics data
df

Unnamed: 0,Vulnerability Characteristics,Total Count,Node Type,Example Code
0,Function calls,46035,CALL,int factor = (1 << 31) % random; (1 << 31) % r...
1,Decide the type of the variable,39122,IDENTIFIER,factor; random; Tracer; factor; counter; count...
2,Access a field of an object of aggregate type,13827,FIELD_IDENTIFIER / fieldAccess / indexAccess,vowlessInferentialist; length; splurgeZoograft...
3,Assign values to variables,5559,assignment,int factor = (1 << 31) % random; char counter ...
4,Conduct an arithmetic calculation,2606,addition,e.getClass().getName() + \; e.getClass().getNa...
5,Open or discard a memory space,1307,alloc,new int[size]; new String[stonesoup_value]; ne...
6,Exception handling,1112,CatchClause / getCountry:<unresolvedSignature>...,catch; catch; catch; catch; catch; catch; catc...
7,Relate to control flow and code structure of t...,1055,CONTROL_STRUCTURE,else; catch; else; catch; else; catch; else; c...
8,Use an array,1037,arrayInitializer / stonesoup_array,<operator>.arrayInitializer; <operator>.arrayI...
9,Conduct a boolean/logical/comparison operation,631,logicalAnd / logicalNot,(stonesoup_counter + stonesoup_offset > 0) && ...


## Select Center nodes T function

In [78]:
# INPUT: file export.dot
# Output: {projectname, list[nodesID]}
# chọn ra các nnodes có đặc tính thuộc bảng, đếm số lượng thuộc tính thuọc bảng
# sắp xết giảm dần các nodes có số lượng thuộc tính thuộc bảng 
# Kiểm tra các nodes có được có kết nối vớis nhau hay không
# TH1: Có kết nối một phần -> coi các nodes có kết nối là 1 center nodes 
#   -> từng nodes thành phần lấy neibor 1 thành phần
# TH2: không có kết nối -> nodes nào độc lập thì nodes đó là 1 center node
import pydot
import networkx as nx
import csv

In [79]:
# Step 1: Load vulnerability characteristics
df_char = pd.read_csv(ROOT_DIR + "/vuln-char-table-final.csv")
valid_node_types = set(df_char['Node Type'].dropna().unique())
print(valid_node_types)

{'IDENTIFIER', 'addition', 'CALL', 'FIELD_IDENTIFIER / fieldAccess / indexAccess', 'CatchClause', 'alloc', 'cast', 'CONTROL_STRUCTURE', 'arrayInitializer / stonesoup_array', 'assignment', 'logicalAnd / logicalNot'}


In [86]:
# Step 2: Helper functions
def parse_dot_file(dot_path):
    nodes = {}
    with open(dot_path, 'r', encoding='utf-8') as f:
        for line in f:
            if 'LABEL=' in line or 'NAME=' in line or 'METHOD_FULL_NAME=' in line:
                node_match = re.search(r'"(\d+)" \[', line)
                # print(node_match)
                if not node_match:
                    continue
                node_id = node_match.group(1)
                label_match = re.search(r'label="(.*?)"', line)
                name_match = re.search(r'NAME="(.*?)"', line)
                method_full_name_match = re.search(r'METHOD_FULL_NAME="(.*?)"', line)
                # nếu có label
                label = label_match.group(1) if label_match else ''
                # nếu co NAME
                method = name_match.group(1) if name_match else None
                # không có NAME thì lấy METHOD_FULL_NAME
                if method is None and method_full_name_match:
                    method = method_full_name_match.group(1).split('.')[-1]
                # print(f"Node ID: {node_id}, Label: {label}, Method: {method}")
                # nếu label nằm trong valid_node_types thì thêm vào dict
                if label in valid_node_types:
                    # nếu có method và method nằm trong valid_node_types
                    if (method and method in valid_node_types):
                        nodes[node_id] = {
                            'label': label,
                            'method': method
                        }
                    # elif method is None:
                    else:
                        nodes[node_id] = {
                            'label': label
                        }
                        
    dot_graph = pydot.graph_from_dot_file(dot_path)[0]
    G = nx.drawing.nx_pydot.from_pydot(dot_graph)
    # print(G)
    return nodes, G



In [127]:
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np

def render_final_centers_clustered(G, final_centers, output_path="graph_clusters.png"):
    plt.figure(figsize=(12, 10))

    # Khởi tạo pos trống
    pos = {}

    # Khoảng cách giữa các cluster
    cluster_spacing = 5

    # Tạo color map
    colors = plt.cm.get_cmap('tab20', len(final_centers))

    for idx, group in enumerate(final_centers):
        # Tạo 1 subgraph riêng cho group
        subG = G.subgraph(group)

        # Layout riêng trong group nhỏ (giữ cluster gọn)
        sub_pos = nx.spring_layout(subG, seed=idx)

        # Dịch chuyển toàn bộ group sang vùng riêng biệt
        shift_x = (idx % 5) * cluster_spacing  # mỗi 5 group xuống 1 hàng
        shift_y = -(idx // 5) * cluster_spacing

        for node, (x, y) in sub_pos.items():
            pos[node] = (x + shift_x, y + shift_y)

        # Vẽ node cho group
        nx.draw_networkx_nodes(G, pos,
                               nodelist=group,
                               node_color=[colors(idx)],
                               label=f'Group {idx+1}',
                               node_size=300)

    # Vẽ edges
    nx.draw_networkx_edges(G, pos, alpha=0.3)
    nx.draw_networkx_labels(G, pos, font_size=8)

    plt.axis('off')
    plt.legend()
    plt.title("Clustered Graph Visualization")
    plt.tight_layout()
    plt.savefig(output_path, dpi=300)
    print(f"Graph saved to {output_path}")
    plt.close()


In [130]:
def process_project(dot_path):
    nodes, G = parse_dot_file(dot_path)

    # Chia thành strong và weak nodes
    strong_nodes = {nid: info for nid, info in nodes.items() if len(info) == 2}
    weak_nodes = {nid: info for nid, info in nodes.items() if len(info) < 2}
    print(f"Number of strong nodes: {len(strong_nodes)}")
    print(f"Number of weak nodes: {len(weak_nodes)}")

    # Mỗi strong node tìm weak node nối trực tiếp
    center_to_children = {}
    for center_id in strong_nodes.keys():
        neighbors = set(G.neighbors(center_id))
        linked_weak_nodes = neighbors.intersection(weak_nodes.keys())
        if linked_weak_nodes:
            center_to_children[center_id] = linked_weak_nodes

    # Các weak nodes đã được gán cho strong
    assigned_weak_nodes = set()
    for linked in center_to_children.values():
        assigned_weak_nodes.update(linked)

    remaining_weak_nodes = set(weak_nodes.keys()) - assigned_weak_nodes
    subG = G.subgraph(remaining_weak_nodes)
    subG_undirected = subG.to_undirected()

    final_centers = []

    # ✅ 1. Handle WEAK nodes: chọn node nhiều neighbor nhất làm center
    remaining_nodes = set(subG_undirected.nodes())

    while remaining_nodes:
        # Đếm số neighbor trong remaining_nodes
        neighbor_counts = {}
        for node in remaining_nodes:
            neighbors = set(subG_undirected.neighbors(node))
            valid_neighbors = neighbors & remaining_nodes
            neighbor_counts[node] = len(valid_neighbors)

        # Chọn node nhiều neighbor nhất làm center
        center_node = max(neighbor_counts, key=lambda x: neighbor_counts[x])

        # Center này gom neighbor, nhưng chỉ lưu lại center
        final_centers.append(center_node)

        # Xoá center và neighbor ra khỏi remaining_nodes
        neighbors = set(subG_undirected.neighbors(center_node)) & remaining_nodes
        group = set([center_node]) | neighbors
        remaining_nodes -= group

    print(f"Number of weak centers: {len(final_centers)}")

    # ✅ 2. Handle STRONG nodes: strong node luôn là center
    for center_id in strong_nodes.keys():
        final_centers.append(center_id)

    print(f"Total number of centers (weak + strong): {len(final_centers)}")
    
    # Nếu muốn render nhóm thì phải có group -> không cần render group nữa
    return final_centers


In [131]:
results = []
stats = []
dot_folder = ROOT_DIR + "/cpg-output"
project_name = "1553-v1.0.0-good"  
project_path = os.path.join(dot_folder, project_name)
for file in tqdm(os.listdir(project_path), desc=f"Processing project {project_name}"):
    if file.endswith("export.dot"):
        dot_path = os.path.join(project_path, file)
        center_nodes = process_project(dot_path)
        results.append({
            'project': project_name,
            'center_nodes': center_nodes
        })

# projects = [p for p in os.listdir(dot_folder)]
# for project in tqdm(projects, desc="Processing projects"):
#     # print(os.path.join(dot_folder, project))
#     for file in os.listdir(os.path.join(dot_folder, project_name)):
#         if file.endswith("export.dot"):
#             dot_path = os.path.join(dot_folder, project_name, file)
#             center_nodes = process_project(dot_path)
#             results.append({
#                 'project': project_name,
#                 'center_nodes': center_nodes
#             })
#             stats.append({
#             'project': project_name,
#             'num_center_nodes': len(center_nodes)
#             })

# csv_path = ROOT_DIR + "/center_nodes_stats.csv"
# with open(csv_path, "w", newline='', encoding="utf-8") as csvfile:
#     writer = csv.DictWriter(csvfile, fieldnames=["project", "num_center_nodes"])
#     writer.writeheader()
#     for row in stats:
#         writer.writerow(row)
with open(ROOT_DIR + "/center_nodes_result.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)
print("✅ Done! Saved to center_nodes_result.json")

Processing project 1553-v1.0.0-good: 100%|██████████| 1/1 [00:07<00:00,  7.34s/it]

Number of strong nodes: 1
Number of weak nodes: 82
Number of weak centers: 17
Total number of centers (weak + strong): 18
✅ Done! Saved to center_nodes_result.json



