# Co-author Network and Centrality Analysis

This script will generate the following files:
- `network_nodes.csv` and `network_edges.csv`: nodes and edges (weights) of the co-author network
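- `centrality.csv`: centrality measures (degree, strength, betweenness, eigenvector) of the top-`TOP_K` authors (`TOP_K` is 100 by default; only the first 50 rows are displayed in the notebook)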

In [1]:
import os
import re
import itertools
import math
import random
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

# Seed both Python's and NumPy's global random number generators with the
# same fixed value.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Input CSV of papers; only its "author" column is used by this notebook.
CSV_PATH = "../data/paper.csv"   
# Location under which the exported CSV files are written.
EXPORT_PREFIX = "../data/network/"
MIN_EDGE_WEIGHT = 2         # keep only edges whose co-authorship count is >= this threshold (reduces noise)
# NOTE: the degree-based filter that would read MIN_DEGREE is currently
# commented out further down, so changing this value has no effect.
MIN_DEGREE = 0              # keep only nodes whose degree is >= this threshold (further sparsifies the network)
TOP_K = 100                 # number of top-ranked authors kept in the centrality table

In [2]:
# Load the paper table and keep only the "author" column (as a one-column
# DataFrame, so further columns can be added to it later).
df = pd.read_csv(CSV_PATH)[["author"]]

# Parse a delimited author string into a list of author names.
def split_authors(s: str, delim: str = ","):
    """Split a delimited author string into a list of author names.

    Args:
        s: Raw author string, e.g. ``"Ann Lee, Bob Ray"``.
        delim: Separator between author names. Defaults to ``","``.

    Returns:
        The author names in their original order, each stripped of
        surrounding whitespace. Empty or whitespace-only fragments (for
        example from a trailing delimiter) are dropped, so an empty input
        yields an empty list.
    """
    # Honour the caller-supplied delimiter instead of a hard-coded comma.
    stripped = (part.strip() for part in s.split(delim))
    return [name for name in stripped if name]

# Build the per-paper author list. A missing author cell (NaN/None) becomes
# an empty list; passing it through str() would instead create a phantom
# author literally named "nan" with a team size of 1.
df["author_list"] = df["author"].apply(
    lambda x: [] if pd.isna(x) else split_authors(str(x), ",")
)
# Team size = number of authors parsed for the paper.
df["team_size"] = df["author_list"].apply(len)
display(df)

# Team-size distribution: paper counts for the ten smallest team sizes.
print(df["team_size"].value_counts().sort_index().head(10))

Unnamed: 0,author,author_list,team_size
0,"Yunlong Sheng, Xuye Zhuang, Jiancheng Yin, Bow...","[Yunlong Sheng, Xuye Zhuang, Jiancheng Yin, Bo...",5
1,"Eduard Baranov, Axel Legay","[Eduard Baranov, Axel Legay]",2
2,"Andrea Bombarda, Angelo Gargantini","[Andrea Bombarda, Angelo Gargantini]",2
3,"Pierre Martou, Benoît Duhoux, Kim Mens, Axel L...","[Pierre Martou, Benoît Duhoux, Kim Mens, Axel ...",4
4,"Kambiz Nezami Balouchi, Julien Mercier, Robert...","[Kambiz Nezami Balouchi, Julien Mercier, Rober...",3
...,...,...,...
1095,"Charles J. Colbourn, Paul C. van Oorschot","[Charles J. Colbourn, Paul C. van Oorschot]",2
1096,"Gadiel Seroussi, Nader H. Bshouty","[Gadiel Seroussi, Nader H. Bshouty]",2
1097,Keizo Tatsumi,[Keizo Tatsumi],1
1098,"Keizo Tatsumi, S. Watanabe, Y. Takeuchi, H. Sh...","[Keizo Tatsumi, S. Watanabe, Y. Takeuchi, H. S...",4


team_size
1     100
2     258
3     286
4     219
5     151
6      58
7      16
8       7
9       3
12      1
Name: count, dtype: int64


In [3]:
# Undirected co-authorship graph: one node per author, and each edge's
# "weight" counts the papers on which the two authors appear together.
G = nx.Graph()

for paper_authors in df["author_list"]:
    # De-duplicate names within a paper so a pair is counted once per paper.
    # Fewer than two distinct names produces no pairs, so such papers add
    # nothing to the graph.
    distinct_authors = sorted(set(paper_authors))
    for left, right in itertools.combinations(distinct_authors, 2):
        seen = G.get_edge_data(left, right, default={"weight": 0})["weight"]
        # add_edge creates the edge or updates the weight of an existing one.
        G.add_edge(left, right, weight=seen + 1)

print("[原始网络] 节点数 =", G.number_of_nodes(), "边数 =", G.number_of_edges())

# List the 30 heaviest collaborations (largest co-authorship counts first).
edges = list(G.edges(data="weight"))
top_edges = sorted(edges, key=lambda edge: edge[2], reverse=True)[:30]
for author_a, author_b, count in top_edges:
    print(f"{author_a} - {author_b}, weight = {count}")

[原始网络] 节点数 = 1379 边数 = 3332
D. Richard Kuhn - Raghu N. Kacker, weight = 80
Yu Lei - Raghu N. Kacker, weight = 62
Yu Lei - D. Richard Kuhn, weight = 59
Dimitris E. Simos - Bernhard Garn, weight = 23
Dimitris E. Simos - Ludwig Kampel, weight = 22
Dimitris E. Simos - Manuel Leithner, weight = 19
Dimitris E. Simos - Raghu N. Kacker, weight = 18
Changhai Nie - Huayao Wu, weight = 17
Himer Avila-George - Jose Torres-Jimenez, weight = 17
Kamal Z. Zamli - Bestoun S. Ahmed, weight = 16
Dimitris E. Simos - Michael Wagner, weight = 14
Jinfu Chen - Rubing Huang, weight = 14
Dimitris E. Simos - D. Richard Kuhn, weight = 13
Jose Torres-Jimenez - Idelfonso Izquierdo-Marquez, weight = 13
Changhai Nie - Xintao Niu, weight = 12
Jinfu Chen - Dave Towey, weight = 12
Andrea Bombarda - Angelo Gargantini, weight = 11
Changhai Nie - Hareton Leung, weight = 11
Kamal Z. Zamli - Mohammed I. Younis, weight = 11
Jose Torres-Jimenez - Nelson Rangel-Valdez, weight = 11
Dave Towey - Rubing Huang, weight = 11
Horst Li

In [4]:
# Filter by edge weight: drop edges below the co-authorship threshold, then
# drop the authors that are left without any edge.
if MIN_EDGE_WEIGHT > 1:
    G.remove_edges_from(
        [(a, b) for a, b, count in G.edges(data="weight") if count < MIN_EDGE_WEIGHT]
    )
    G.remove_nodes_from(list(nx.isolates(G)))

# Degree-based filter (disabled): repeatedly remove nodes below MIN_DEGREE
# until every remaining node satisfies the threshold.
#if MIN_DEGREE > 0:
#    nodes_to_remove = [n for n, d in dict(G.degree()).items() if d < MIN_DEGREE]
#    while nodes_to_remove:
#        G.remove_nodes_from(nodes_to_remove)
#        nodes_to_remove = [n for n, d in dict(G.degree()).items() if d < MIN_DEGREE]

print("[稀疏化后网络] 节点数 =", G.number_of_nodes(), "边数 =", G.number_of_edges())

# Alternative (disabled): keep only the largest connected component.
#largest_cc_nodes = max(nx.connected_components(G), key=len)
#G = G.subgraph(largest_cc_nodes).copy()

# Walk the connected components and keep every one with more than 5 nodes.
big_components = []
kept_nodes = set()
for component in nx.connected_components(G):
    if len(component) > 5:
        big_components.append(component)
        kept_nodes.update(component)
# .copy() turns the subgraph view into an independent graph.
G = G.subgraph(kept_nodes).copy()
print("[连通分量过滤] 节点数 =", G.number_of_nodes(), "边数 =", G.number_of_edges())

# List the 30 heaviest collaborations that survived the filtering.
edges = list(G.edges(data="weight"))
top_edges = sorted(edges, key=lambda edge: edge[2], reverse=True)[:30]
for author_a, author_b, count in top_edges:
    print(f"{author_a} - {author_b}, weight = {count}")

[稀疏化后网络] 节点数 = 479 边数 = 893
[连通分量过滤] 节点数 = 348 边数 = 763
D. Richard Kuhn - Raghu N. Kacker, weight = 80
Yu Lei - Raghu N. Kacker, weight = 62
Yu Lei - D. Richard Kuhn, weight = 59
Dimitris E. Simos - Bernhard Garn, weight = 23
Dimitris E. Simos - Ludwig Kampel, weight = 22
Dimitris E. Simos - Manuel Leithner, weight = 19
Dimitris E. Simos - Raghu N. Kacker, weight = 18
Changhai Nie - Huayao Wu, weight = 17
Himer Avila-George - Jose Torres-Jimenez, weight = 17
Kamal Z. Zamli - Bestoun S. Ahmed, weight = 16
Dimitris E. Simos - Michael Wagner, weight = 14
Jinfu Chen - Rubing Huang, weight = 14
Dimitris E. Simos - D. Richard Kuhn, weight = 13
Jose Torres-Jimenez - Idelfonso Izquierdo-Marquez, weight = 13
Changhai Nie - Xintao Niu, weight = 12
Jinfu Chen - Dave Towey, weight = 12
Andrea Bombarda - Angelo Gargantini, weight = 11
Changhai Nie - Hareton Leung, weight = 11
Kamal Z. Zamli - Mohammed I. Younis, weight = 11
Jose Torres-Jimenez - Nelson Rangel-Valdez, weight = 11
Dave Towey - Rubing

In [5]:
# Degree centrality (unweighted).
deg_cent = nx.degree_centrality(G)

# Betweenness centrality (weighted). A larger co-authorship weight means a
# closer relationship, whereas betweenness_centrality interprets the edge
# attribute named by `weight` as a distance, so the distance is set to
# 1 / weight on a temporary copy of the graph.
H = G.copy()
for _, _, attrs in H.edges(data=True):
    # Clamp the weight to avoid division by zero.
    attrs["distance"] = 1.0 / max(attrs.get("weight", 1.0), 1e-9)

bet_cent = nx.betweenness_centrality(H, weight="distance", normalized=True)

# Eigenvector centrality (weighted by co-authorship count).
# NOTE(review): the filtering step keeps every component with more than 5
# nodes, so G may be disconnected; scores outside the dominant component can
# then be close to zero -- confirm this is the intended interpretation.
eig_cent = nx.eigenvector_centrality(G, max_iter=1000, weight="weight")

# Weighted degree (strength): sum of the weights of a node's incident edges.
strength = {n: sum(d.get("weight", 0) for _, _, d in G.edges(n, data=True)) for n in G.nodes()}

# Align every metric explicitly to the node list and derive the "Author"
# column from that index. Mixing a positional list of names with
# index-aligned Series would attach names to the wrong rows as soon as the
# Series indexes differed (pandas sorts the union of differing indexes).
authors = list(G.nodes())
centrality_df = pd.DataFrame(
    {
        "Degree": pd.Series(deg_cent),
        "Strength": pd.Series(strength),
        "Betweenness": pd.Series(bet_cent),
        "Eigenvector": pd.Series(eig_cent),
    },
    index=authors,
).fillna(0.0)  # a metric missing for some node is reported as 0
centrality_df.insert(0, "Author", centrality_df.index)

# Rank by Degree first, then by Strength (both descending).
centrality_sorted = centrality_df.sort_values(["Degree", "Strength"], ascending=False)

# Keep the TOP_K best-ranked authors; only the first 50 are displayed.
top_authors = centrality_sorted.head(TOP_K).reset_index(drop=True)
display(top_authors.head(50))

Unnamed: 0,Author,Degree,Strength,Betweenness,Eigenvector
0,Raghu N. Kacker,0.106628,278,0.052381,0.566843
1,D. Richard Kuhn,0.103746,262,0.085563,0.553818
2,Yu Lei,0.100865,266,0.065551,0.5064969
3,Charles J. Colbourn,0.083573,77,0.151094,0.005967827
4,Dimitris E. Simos,0.074928,190,0.063725,0.1918535
5,Changhai Nie,0.051873,104,0.084333,0.03465866
6,Myra B. Cohen,0.051873,61,0.077266,0.0002696326
7,Kamal Z. Zamli,0.048991,75,0.005089,3.175415e-08
8,Bernhard Garn,0.04611,82,3.3e-05,0.08758703
9,Jinfu Chen,0.040346,67,0.000862,7.608991e-07


In [6]:
# Export the result files.
# Create the output directory first: to_csv fails if it does not exist.
if EXPORT_PREFIX:
    os.makedirs(EXPORT_PREFIX, exist_ok=True)

# os.path.join inserts a separator only when needed, so a prefix that already
# ends with "/" no longer yields paths containing "//".
nodes_csv = os.path.join(EXPORT_PREFIX, "network_nodes.csv")
edges_csv = os.path.join(EXPORT_PREFIX, "network_edges.csv")
centrality_csv = os.path.join(EXPORT_PREFIX, "centrality.csv")

# Top-centrality table.
top_authors.to_csv(centrality_csv, index=False)

# Edge list (source, target, weight); an edge without a weight counts as 1.
edges_out = pd.DataFrame(
    list(G.edges(data="weight", default=1)),
    columns=["source", "target", "weight"],
)
edges_out.to_csv(edges_csv, index=False)

# Node list.
nodes_out = pd.DataFrame({"id": list(G.nodes())})
nodes_out.to_csv(nodes_csv, index=False)

print("已导出：")
print(" -", centrality_csv)
print(" -", edges_csv)
print(" -", nodes_csv)

已导出：
 - ../data/network//centrality.csv
 - ../data/network//network_edges.csv
 - ../data/network//network_nodes.csv
