In [1]:
import csv
import networkx as nx
import matplotlib.pyplot as plt
import math
import random
import igraph as ig
import os

In [None]:
import csv
import igraph as ig
from concurrent.futures import ThreadPoolExecutor, as_completed

# Build the directed dependency graph: every CSV row contributes one edge
# up_name -> down_name.
G = ig.Graph(directed=True)

vertices = set()      # vertex names seen so far (membership test only)
vertex_order = []     # the same names, in first-seen order
edges = set()         # (up_name, down_name) pairs already recorded
edges_to_add = []     # the same pairs, in first-seen order

with open('data/mini_graph_supply_chain.csv', mode='r', newline='') as file:
    dependents_data = csv.DictReader(file)
    for row in dependents_data:
        up_name = row['up_name']
        down_name = row['down_name']

        # Record both endpoints. Keeping first-seen order (instead of relying
        # on set iteration order, which varies between interpreter runs for
        # strings) makes the vertex indices reproducible across runs.
        for name in (up_name, down_name):
            if name not in vertices:
                vertices.add(name)
                vertex_order.append(name)

        # Record the edge once, even if the CSV repeats the row.
        if (up_name, down_name) not in edges:
            edges.add((up_name, down_name))
            edges_to_add.append((up_name, down_name))

# Add all vertices and edges in one batch each; string vertex ids become the
# 'name' vertex attribute, which the edge list then refers to.
G.add_vertices(vertex_order)
G.add_edges(edges_to_add)
print(f"已添加 {len(edges_to_add)} 条初级依赖边。")

# Name -> vertex index lookup table used by the later cells.
name_to_index = {v['name']: v.index for v in G.vs}

# Second-level dependency check for a single primary edge.
def check_second_level_dependencies(edge):
    """Return candidate edges (source, dep) for one primary edge.

    Args:
        edge: a ``(up_name, down_name)`` pair of vertex names; both must be
            keys of the module-level ``name_to_index``.

    Returns:
        A list of ``(source_index, dep_index)`` vertex-index tuples, one for
        each vertex ``dep`` that is an out-neighbor of both endpoints and
        whose ``(source_name, dep_name)`` pair is not yet in the module-level
        ``edges`` set.

    Raises:
        KeyError: if either endpoint name is missing from ``name_to_index``.

    NOTE(review): every candidate is taken from ``source``'s own
    out-neighbors, so while ``edges`` holds every edge of ``G`` this returns
    an empty list. If the intent is to add transitive edges
    source -> target -> dep, the candidate set should probably be
    ``target_neighbors - source_neighbors`` -- confirm with the author.
    """
    source_name = edge[0]
    source = name_to_index[source_name]
    target = name_to_index[edge[1]]
    source_neighbors = set(G.neighbors(source, mode='out'))
    target_neighbors = set(G.neighbors(target, mode='out'))

    common_dependents = source_neighbors & target_neighbors
    new_edges = []
    for dep in common_dependents:
        # `edges` stores (name, name) pairs, so the membership test has to be
        # done on names; testing the integer index pair never matches and
        # lets every already-existing edge through as "new".
        if (source_name, G.vs[dep]['name']) not in edges:
            print(f'append {source}, {dep}')
            new_edges.append((source, dep))
    return new_edges

# Run the second-level dependency check for every primary edge on a thread
# pool and collect the reported (source_index, dep_index) pairs.
new_edges_to_add = []
print(f"Total edges to process: {len(edges_to_add)}")
with ThreadPoolExecutor(max_workers=4) as executor:
    futures = {executor.submit(check_second_level_dependencies, edge): edge for edge in edges_to_add}
    for i, future in enumerate(as_completed(futures), 1):
        new_edges = future.result()
        new_edges_to_add.extend(new_edges)
        print(f"处理进度：{i}/{len(edges_to_add)} (完成了 {(i/len(edges_to_add) * 100):.2f}%)")

# Several primary edges can report the same pair, and as_completed() yields
# results in completion order. Deduplicating and sorting keeps the graph free
# of repeated insertions and makes the edge order reproducible.
new_edges_to_add = sorted(set(new_edges_to_add))

# Mutate the graph from the main thread only, in a single batch call rather
# than one add_edge() call per edge.
if new_edges_to_add:
    G.add_edges(new_edges_to_add)
    # `edges` is keyed by vertex names, so translate the index pairs back to
    # names before recording them.
    vertex_names = G.vs['name']
    edges.update((vertex_names[src], vertex_names[dst]) for src, dst in new_edges_to_add)

print(f"完成二级依赖边添加，共 {len(new_edges_to_add)} 条。")


In [None]:
# Report the size of the graph: vertex count first, then edge count.
vertex_count = G.vcount()
edge_count = G.ecount()
print(f"顶点数: {vertex_count}")
print(f"边数: {edge_count}")

In [None]:
# Dictionary intended to hold per-node attributes.
# NOTE(review): nothing in this cell ever fills it, so the loop at the bottom
# of the cell currently does nothing -- confirm whether it should be
# populated or removed.
node_attributes = {}

# Give every vertex its own empty list before appending to it: reading a
# vertex attribute that was never set raises instead of returning a default.
# Resetting it here also keeps the cell safe to re-run.
G.vs['cwe_ids'] = [[] for _ in range(G.vcount())]

# Read (package name, CWE id) rows and attach each CWE id to its vertex.
with open('new_outdate_osv_cwe_ids.csv', mode='r', newline='') as file:
    cwe_data = csv.DictReader(file)
    for row in cwe_data:
        package_name = row['name']
        cwe_id = row['cwe_id']

        # Some packages had a vulnerability event but have no dependent
        # packages, so they are not vertices of G; report and skip those.
        try:
            node = G.vs.find(name=package_name)
        except ValueError:
            print(f" {package_name} does not exist in G")
            continue
        # find() either returned a vertex or raised, so no further check is
        # needed before appending.
        node['cwe_ids'].append(cwe_id)

for node, attributes in node_attributes.items():
    node_index = G.vs.find(name=node).index  # index of the vertex with this name
    if 'package_name' in attributes:
        G.vs[node_index]['attributes'] = attributes
    else:
        print(f"Warning: Node '{node}' does not have 'package_name' attribute.")

In [None]:
import pickle

# Serialise the graph G to 'graph.pkl' in the working directory.
with open('graph.pkl', 'wb') as pickle_file:
    pickle.dump(G, pickle_file)


In [None]:
# Community detection with the Louvain (multilevel) algorithm on the
# undirected view of G.
# The algorithm is randomised. python-igraph draws its randomness from
# Python's `random` module by default, so seeding it here makes the partition
# reproducible for a given graph.
LOUVAIN_SEED = 42
random.seed(LOUVAIN_SEED)

undirected_G = G.as_undirected()
# A larger resolution yields more (smaller) communities.
best_partition = undirected_G.community_multilevel(resolution=0.2)
com_num = len(best_partition)
print(f"共划分为{com_num}个社区")

In [None]:
# Print the number of members of every community, numbering them from 1,
# with a blank line after each entry.
for community_number, members in enumerate(best_partition, start=1):
    member_count = len(members)
    print(f"Community {community_number}:")
    print(f"数量：{member_count}")
    print()

In [None]:
# Map each community id (its 0-based position in best_partition) to the list
# of its members. The dictionary is built here so the cell does not depend on
# a variable left over from an earlier kernel session.
# NOTE(review): members are written exactly as best_partition yields them --
# presumably vertex indices rather than package names; confirm that this is
# what consumers of the CSV expect.
community_nodes = {i: list(community) for i, community in enumerate(best_partition)}

# Write one (Community_ID, Node) row per community member.
with open('communities.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Community_ID', 'Node'])
    for community_id, nodes in community_nodes.items():
        for node in nodes:
            writer.writerow([community_id, node])

print("社区数据已保存到 'communities.csv' 文件中。")
