Research Tree PoC 20250318

# Setup

In [None]:
research_topic = "llm literature review"
seed_dois = ['10.48550/arXiv.2406.10252',  # AutoSurvey: Large Language Models Can Automatically Write Surveys
             '10.48550/arXiv.2412.10415',  # Generative Adversarial Reviews: When LLMs Become the Critic
             '10.48550/arXiv.2402.12928',  # A Literature Review of Literature Reviews in Pattern Analysis and Machine Intelligence 
             ]
seed_titles = ['PaperRobot: Incremental Draft Generation of Scientific Ideas',
               'From Hypothesis to Publication: A Comprehensive Survey of AI-Driven Research Support Systems'
               ]

# Paper Exploration

In [None]:
import sys
import os

parent_dir = os.path.dirname(os.getcwd())
print(parent_dir)
sys.path.append(parent_dir)

In [None]:
import toml

config_file = "config.toml"
try:
    with open(config_file, 'r', encoding='utf-8') as toml_file:
        config_param = toml.load(toml_file)
except FileNotFoundError:
    print(f"Config file '{config_file}' not found. Please ensure it exists.")
    config_param = {} 

In [None]:
# Pull the LLM and embedding-model settings out of the parsed config.
# Each level falls back to an empty dict, so a missing section yields None
# for the affected values instead of raising KeyError.
_models_cfg = config_param.get('models', {})
_llm_cfg = _models_cfg.get('llm', {})
_embed_cfg = _models_cfg.get('embed', {})

llm_api_key, llm_model_name = _llm_cfg.get('api_key'), _llm_cfg.get('model_name')
embed_api_key, embed_model_name = _embed_cfg.get('api_key'), _embed_cfg.get('model_name')

In [None]:
from graph.paper_trace import PaperExploration
# paperbot = PaperExploration(
#     seed_paper_dois=seed_dois[0],
#     llm_api_key = llm_api_key,
#     llm_model_name = llm_model_name,
#     embed_api_key = embed_api_key,
#     embed_model_name = embed_model_name
#     )
paperbot = PaperExploration(
    research_topic=research_topic, 
    seed_paper_dois=seed_dois, 
    seed_paper_titles=seed_titles,
    llm_api_key = llm_api_key,
    llm_model_name = llm_model_name,
    embed_api_key = embed_api_key,
    embed_model_name = embed_model_name    
    )

### Get initial papers

In [None]:
paperbot.initial_paper_query(limit=50, from_dt='2023-01-01', to_dt='2025-03-24')

In [None]:
init_paper = [x for x in paperbot.nodes_json if x['labels'] == ["Paper"] and 'Seed' in x['properties']['source']]
init_paper_dois = [x['id'] for x in init_paper]

In [None]:
init_paper_dois

In [None]:
print(len(paperbot.nodes_json), len(paperbot.edges_json))

### Search Citation Information

In [None]:
import time
for paper_doi in init_paper_dois:
    paperbot.get_cited_papers(paper_doi) 
    time.sleep(5)
    paperbot.get_citing_papers(paper_doi) 
    time.sleep(5)

In [None]:
print(len(paperbot.nodes_json), len(paperbot.edges_json))

### Get Recommended Papers

In [None]:
paperbot.get_recommend_papers(paper_dois=init_paper_dois, from_dt='2022-01-01', to_dt='2025-03-13')

In [None]:
print(len(paperbot.nodes_json), len(paperbot.edges_json))

### Get Papers from Related Topics

In [None]:
from collections import Counter

# Build one "<paper>" text snippet per seed paper and gather every field of
# study listed on the seed papers.
domains, init_paper_info = [], []
for item in init_paper:
    props = item.get('properties', {})
    title = props.get('title')
    abstract = props.get('abstract')
    info = f"<paper> TITLE: {title}\nABSTRACT: {abstract} </paper>"
    init_paper_info.append(info)
    # 'fieldsOfStudy' may be absent or None for a paper; treat that as "no fields"
    # instead of letting list.extend(None) raise TypeError.
    domains.extend(props.get('fieldsOfStudy') or [])

# The most frequent field across the seed papers becomes the search domain.
if not domains:
    raise ValueError("No 'fieldsOfStudy' found on any seed paper; cannot choose a domain.")
domain = Counter(domains).most_common(1)[0][0]

In [None]:
paperbot.get_related_papers(domain, input_text="\n".join(init_paper_info), from_dt='2022-01-01', to_dt='2025-03-13')

In [None]:
paper_nodes_json = [x for x in paperbot.nodes_json if x['labels'] == ["Paper"] ]
await paperbot.add_semantic_relationship(paper_nodes_json)

In [None]:
print(len(paperbot.nodes_json), len(paperbot.edges_json))

In [None]:
import json

filename = "paper_nodes_json.jsonl"

# One JSON document per line. ensure_ascii=False emits raw non-ASCII characters,
# so the file encoding is pinned to UTF-8 rather than left to the platform default
# (which can fail to encode some characters).
with open(filename, 'w', encoding='utf-8') as f:
    for item in paperbot.nodes_json:
        json.dump(item, f, ensure_ascii=False)
        f.write('\n')

In [None]:
import json

filename = "paper_edges_json.jsonl"

# One JSON document per line. ensure_ascii=False emits raw non-ASCII characters,
# so the file encoding is pinned to UTF-8 rather than left to the platform default
# (which can fail to encode some characters).
with open(filename, 'w', encoding='utf-8') as f:
    for item in paperbot.edges_json:
        json.dump(item, f, ensure_ascii=False)
        f.write('\n')

# Import Data to Graph

In [1]:
import json

filename = "paper_nodes_json.jsonl"

# Load the node records, one JSON document per line. The encoding is stated
# explicitly (JSON text is UTF-8) so the result does not depend on the platform
# default; blank lines are skipped instead of raising JSONDecodeError.
nodes_json = []
with open(filename, 'r', encoding='utf-8') as f:
    for line in f:
        if line.strip():
            nodes_json.append(json.loads(line))

In [2]:
print(len(nodes_json))

4257


In [3]:
nodes_json[0].keys()

dict_keys(['type', 'id', 'labels', 'properties', 'source', 'sourceDesc'])

In [4]:
def filter_and_reorder_dict(input_dict, keys_to_keep):
    """Return a new dict holding only ``keys_to_keep``, in that order.

    Keys listed in ``keys_to_keep`` that are absent from ``input_dict`` are
    skipped; ``input_dict`` itself is left untouched.
    """
    selected = {}
    for wanted in keys_to_keep:
        if wanted in input_dict:
            selected[wanted] = input_dict[wanted]
    return selected

nodes_json_rvsd = [filter_and_reorder_dict(x, ['type', 'id', 'labels', 'properties']) for x in nodes_json]

In [5]:
import json

filename = "paper_edges_json.jsonl"

# Load the relationship records, one JSON document per line. The encoding is
# stated explicitly (JSON text is UTF-8) so the result does not depend on the
# platform default; blank lines are skipped instead of raising JSONDecodeError.
edges_json = []
with open(filename, 'r', encoding='utf-8') as f:
    for line in f:
        if line.strip():
            edges_json.append(json.loads(line))

In [8]:
# Keep every non-similarity edge as-is; keep a SIMILAR_TO edge only when its
# weight is above 0.7.
edges_json_rvsd = [
    edge
    for edge in edges_json
    if edge['relationshipType'] != 'SIMILAR_TO' or edge['properties']['weight'] > 0.7
]
print(len(edges_json_rvsd))

8371


In [6]:
import networkx as nx
from typing import List, Dict

class PaperGraph:
    """Thin wrapper that loads node/relationship JSON records into a ``networkx.MultiDiGraph``.

    The underlying graph is exposed as ``self.graph``. Because it is a multigraph,
    several parallel edges may exist between the same ordered pair of nodes.
    """

    def __init__(self, name):
        """Create an empty multi-directed graph carrying ``name`` as its graph attribute."""
        self.graph = nx.MultiDiGraph(name=name)

    def add_graph_nodes(self, nodes_json: List[Dict]|Dict):
        """Add nodes to the graph.

        Args:
            nodes_json (List[Dict] or Dict): node record(s) in the format:
                dct = {
                    "type": "node",
                    "id": ,
                    "labels": ["Paper"],
                    "properties": {"key":value}
                    }

        Each node is stored under its ``id`` with ``properties`` as attributes,
        plus a ``nodeType`` attribute taken from the first label.
        """
        if isinstance(nodes_json, dict):
            nodes_json = [nodes_json]

        nx_nodes_info = []
        for item in nodes_json:
            # Work on a shallow copy so the caller's record does not silently
            # gain a 'nodeType' key.
            properties = dict(item['properties'])
            properties['nodeType'] = item['labels'][0]
            # networkx expects nodes with attributes as (node_id, attr_dict) pairs
            nx_nodes_info.append((item['id'], properties))

        self.graph.add_nodes_from(nx_nodes_info)


    def add_graph_edges(self, edges_json: List[Dict]|Dict):
        """Add relationships (edges) to the graph.

        Args:
            edges_json (List[Dict] or Dict): relationship record(s) in the format:
                dct = {
                        "type": "relationship",
                        "relationshipType": "WRITES",
                        "startNodeId": ,
                        "endNodeId": ,
                        "properties": {'authorOrder': author_order, 'coauthors': coauthors}
                        }

        Each edge runs from ``startNodeId`` to ``endNodeId`` and carries
        ``properties`` as attributes, plus a ``relationshipType`` attribute.
        """
        if isinstance(edges_json, dict):
            edges_json = [edges_json]

        nx_edges_info = []
        for item in edges_json:
            # Shallow copy: do not inject 'relationshipType' into the caller's record.
            properties = dict(item['properties'])
            properties['relationshipType'] = item['relationshipType']
            # networkx expects edges with attributes as (source, target, attr_dict) triples
            nx_edges_info.append((item['startNodeId'], item['endNodeId'], properties))

        self.graph.add_edges_from(nx_edges_info)


    def update_node_property(self, node_id, kv_dict):
        """Update (or add) attributes on an existing node.

        Args:
            node_id: unique node identifier; must already be in the graph
            kv_dict: attribute names and values to set
        """
        assert node_id in self.graph.nodes, f"node {node_id!r} is not in the graph"

        self.graph.nodes[node_id].update(kv_dict)


    def update_edge_property(self, source_id, target_id, kv_dict, key=None):
        """Update (or add) attributes on existing edge(s) from ``source_id`` to ``target_id``.

        Args:
            source_id, target_id: endpoints of the edge; at least one such edge must exist
            kv_dict: attribute names and values to set
            key: optional multigraph edge key. When omitted, every parallel edge
                between the two nodes is updated.
        """
        # The edge view of a multigraph is indexed by (u, v, key); a two-element
        # index such as edges[u, v] does not work there, so edges are addressed
        # by explicit key below.
        assert self.graph.has_edge(source_id, target_id), \
            f"no edge from {source_id!r} to {target_id!r} in the graph"

        if key is not None:
            assert self.graph.has_edge(source_id, target_id, key), \
                f"no edge from {source_id!r} to {target_id!r} with key {key!r}"
            edge_keys = [key]
        else:
            edge_keys = list(self.graph[source_id][target_id])

        for edge_key in edge_keys:
            self.graph.edges[source_id, target_id, edge_key].update(kv_dict)

In [9]:
name = 'Paper Citation Graph Test 001'

pg = PaperGraph(name)
pg.add_graph_nodes(nodes_json_rvsd)
pg.add_graph_edges(edges_json_rvsd)

In [10]:
paper_ids = ['9e57dda195973c4b6c81386b1cc44595ecfd4697',
 '9f3ae8055e227edb413c54417c9c216f1f554f52',
 '69b53faee7ce5c007e4d3e3ea532818ed8d0645d',
 'a6aed0c4e0f39a55edb407f492e41f178a62907f',
 'cdb34c0092a767848ca1de6fa7e3a6b822585fa4']

# Ids of the Paper nodes whose 's2PaperId' property is one of the
# hand-picked paper ids listed above.
seed_dois = [
    node['id']
    for node in nodes_json_rvsd
    if node['labels'] == ['Paper'] and node['properties']['s2PaperId'] in paper_ids
]

print(seed_dois)

['10.48550/arXiv.2406.10252', '10.48550/arXiv.2412.10415', '10.48550/arXiv.2402.12928', '10.48550/arXiv.2503.01424', '10.48550/arXiv.1905.07870']


## Analyze the Key Points

In [31]:
# 计算入度中心性
in_degree_centrality = nx.in_degree_centrality(pg.graph)
print("入度中心性:", in_degree_centrality)

入度中心性: {'10.48550/arXiv.2406.10252': 0.014936336924583743, '2108024279': 0.0, '2273779175': 0.0, '2286328804': 0.0, '2116271777': 0.0, '2262020955': 0.0, '2293356300': 0.0, '2289004972': 0.0, '2257010530': 0.0, '2259709647': 0.0, '2307012818': 0.0, 'ArXiv': 0.0, 'd9720b90-d60b-48bc-9df8-87a30b9a60dd': 0.002448579823702253, '10.48550/arXiv.2412.10415': 0.012242899118511263, '2335566763': 0.0, '2335569348': 0.0, '1901e811-ee72-4b20-8f7e-de08cd395a10': 0.0, '10.48550/arXiv.2402.12928': 0.006366307541625857, '2284827556': 0.0, '2268132119': 0.0, '2275569993': 0.0, '2284825678': 0.0, '2284824283': 0.0, '10.48550/arXiv.2503.01424': 0.01248775710088149, '2328342585': 0.0, '2674998': 0.0, '2265930173': 0.0, '51056442': 0.0, '2349068478': 0.0, '2337225259': 0.0, '2216503559': 0.0, '2265878959': 0.0, '2112678409': 0.0, '2118640235': 0.0, '10.1007/978-981-96-0348-0_3': 0.0009794319294809011, '31727676': 0.0, '2303557186': 0.0, '2282414081': 0.0, '2282412592': 0.0, 'b76366f5-0af9-45f3-8fe3-78fdb01

In [32]:
sorted_items = sorted(in_degree_centrality.items(), key=lambda item: item[1], reverse=True)

In [33]:
sorted_items

[('10.48550/arXiv.2409.04600', 0.02277179236043095),
 ('10.1109/FLLM63129.2024.10852447', 0.021057786483839373),
 ('10.48550/arXiv.2412.15249', 0.019833496571988248),
 ('10.1186/s13643-024-02575-4', 0.015915768854064642),
 ('10.48550/arXiv.2403.08399', 0.015915768854064642),
 ('10.3346/jkms.2025.40.e92', 0.015670910871694418),
 ('10.2196/47049', 0.015670910871694418),
 ('10.48550/arXiv.2412.13612', 0.015426052889324193),
 ('10.1145/3637371', 0.015426052889324193),
 ('10.48550/arXiv.2406.10252', 0.014936336924583743),
 ('10.48550/arXiv.2308.10620', 0.014936336924583743),
 ('10.3390/fi16050167', 0.014936336924583743),
 ('10.2196/56537', 0.014936336924583743),
 ('10.1093/jamia/ocaf030', 0.013956904995102841),
 ('10.48550/arXiv.2503.08569', 0.013712047012732615),
 ('10.32604/cmc.2025.061263', 0.01346718903036239),
 ('10.1145/3627673.3679677', 0.01346718903036239),
 ('10.48550/arXiv.2403.07183', 0.013222331047992164),
 ('10.1186/s12911-025-02954-4', 0.01297747306562194),
 ('10.48550/arXiv.2

In [None]:
# 计算出度中心性
out_degree_centrality = nx.out_degree_centrality(pg.graph)
print("出度中心性:", out_degree_centrality)

In [None]:
sorted_items_2 = sorted(out_degree_centrality.items(), key=lambda item: item[1], reverse=True)

In [None]:
sorted_items_2

In [None]:
# 计算介数中心性
betweenness_centrality = nx.betweenness_centrality(pg.graph)
print("介数中心性:", betweenness_centrality)

In [None]:
sorted_items_3 = sorted(betweenness_centrality.items(), key=lambda item: item[1], reverse=True)

In [None]:
sorted_items_3

In [None]:
# 计算紧密中心性
closeness_centrality = nx.closeness_centrality(pg.graph)
print("紧密中心性:", closeness_centrality)

In [None]:
sorted_items_4 = sorted(closeness_centrality.items(), key=lambda item: item[1], reverse=True)

In [None]:
sorted_items_4

In [None]:
# Node ids that appear in the top 100 of all four centrality rankings.
_rankings = (sorted_items, sorted_items_2, sorted_items_3, sorted_items_4)
new_set = set.intersection(
    *[{entry[0] for entry in ranking[0:100]} for ranking in _rankings]
)

In [None]:
new_set

## Analyze Key Path

In [14]:
import networkx as nx

def find_paths_connecting_all_nodes(graph, nodes):
    """Build an undirected subgraph from the pairwise shortest paths between ``nodes``.

    For every unordered pair of the given nodes one shortest path is computed;
    the edges of all those paths are merged into a new ``nx.Graph``.

    Args:
        graph: networkx graph to search in.
        nodes: sequence of node ids that should be connected.

    Returns:
        nx.Graph containing the merged path edges, or None when fewer than two
        nodes are given, a node is not in ``graph``, some pair has no path, or
        the merged edges do not touch every requested node.
    """
    if not nodes or len(nodes) < 2:
        return None

    all_paths = []
    for i in range(len(nodes)):
        for j in range(i + 1, len(nodes)):
            try:
                shortest_path = nx.shortest_path(graph, source=nodes[i], target=nodes[j])
            except nx.NetworkXNoPath:
                print(f"节点 {nodes[i]} 和 {nodes[j]} 之间没有路径。")
                return None
            except nx.NodeNotFound as e:
                # Same handling as the other path helpers in this notebook:
                # report the unknown node instead of letting the exception escape.
                print(f"节点 {e} 不在图中。")
                return None
            all_paths.append(shortest_path)

    # An undirected nx.Graph already treats (u, v) and (v, u) as the same edge,
    # so consecutive path nodes can be added directly without ordering them
    # (ordering would fail for node ids of mixed, non-comparable types).
    connecting_subgraph = nx.Graph()
    for path in all_paths:
        connecting_subgraph.add_edges_from(zip(path, path[1:]))

    nodes_in_subgraph = set(connecting_subgraph.nodes())
    if all(node in nodes_in_subgraph for node in nodes):
        return connecting_subgraph

    print("无法找到直接连接所有指定节点的路径组合。")
    return None

G = pg.graph.to_undirected()
# Nodes supplied by the user
user_nodes = seed_dois

# Find the paths connecting these nodes
connecting_subgraph = find_paths_connecting_all_nodes(G, user_nodes)

if connecting_subgraph:
    print("连接指定节点的子图的边:", connecting_subgraph.edges())

连接指定节点的子图的边: [('10.48550/arXiv.1905.07870', '10.48550/arXiv.2406.10252'), ('10.48550/arXiv.1905.07870', '10.48550/arXiv.2412.10415'), ('10.48550/arXiv.2406.10252', '10.48550/arXiv.2402.12928'), ('10.48550/arXiv.2406.10252', '10.48550/arXiv.2412.10415'), ('10.48550/arXiv.2406.10252', '10.48550/arXiv.2503.01424')]


In [15]:
len(connecting_subgraph.edges())

5

In [16]:
seed_dois

['10.48550/arXiv.2406.10252',
 '10.48550/arXiv.2412.10415',
 '10.48550/arXiv.2402.12928',
 '10.48550/arXiv.2503.01424',
 '10.48550/arXiv.1905.07870']

In [None]:
import networkx as nx

def find_common_paths_between_pairs(graph, nodes, cutoff=None):
    """Find all simple paths between every pair of the given nodes.

    Args:
        graph: networkx graph to search in.
        nodes: sequence of node ids; every unordered pair is examined.
        cutoff: optional maximum path length passed to ``nx.all_simple_paths``.
            The default (None) enumerates paths of any length, which can be
            extremely slow on anything but a small graph.

    Returns:
        dict mapping ``(source, target)`` to the list of simple paths between
        them (pairs without a path are omitted); ``{}`` when fewer than two
        nodes are given; None when a node is not in ``graph``.
    """
    if not nodes or len(nodes) < 2:
        return {}

    all_paths = {}
    for i in range(len(nodes)):
        for j in range(i + 1, len(nodes)):
            source_node = nodes[i]
            target_node = nodes[j]
            try:
                paths = list(
                    nx.all_simple_paths(graph, source=source_node, target=target_node, cutoff=cutoff)
                )
            except nx.NodeNotFound as e:
                print(f"节点 {e} 不在图中。")
                return None
            if paths:
                all_paths[(source_node, target_node)] = paths
            else:
                print(f"节点 {source_node} 和 {target_node} 之间没有简单路径。")
    return all_paths

# Undirected view of the paper graph
G = pg.graph.to_undirected()

# Nodes to connect: the seed papers (node ids in this graph are DOI strings,
# so integer placeholders such as 1 and 4 would never be found).
user_nodes = seed_dois

# Find the simple paths between each pair of these nodes. The path length is
# capped because unbounded enumeration grows explosively with graph size.
common_paths = find_common_paths_between_pairs(G, user_nodes, cutoff=3)

if common_paths:
    for (u, v), paths in common_paths.items():
        print(f"节点 {u} 和 {v} 之间的简单路径:")
        for path in paths:
            print(path)

In [None]:
import networkx as nx

def find_shortest_paths_between_pairs(graph, nodes):
    """Find one shortest path between every pair of the given nodes.

    Args:
        graph: networkx graph to search in.
        nodes: sequence of node ids; every unordered pair is examined.

    Returns:
        dict mapping ``(source, target)`` to a shortest path (list of node ids);
        pairs without a path are reported and omitted. ``{}`` when fewer than
        two nodes are given; None when a node is not in ``graph``.
    """
    if not nodes or len(nodes) < 2:
        return {}

    shortest_paths = {}
    for i in range(len(nodes)):
        for j in range(i + 1, len(nodes)):
            source_node = nodes[i]
            target_node = nodes[j]
            try:
                path = nx.shortest_path(graph, source=source_node, target=target_node)
            except nx.NetworkXNoPath:
                print(f"节点 {source_node} 和 {target_node} 之间没有路径。")
            except nx.NodeNotFound as e:
                print(f"节点 {e} 不在图中。")
                return None
            else:
                shortest_paths[(source_node, target_node)] = path
    return shortest_paths

# Undirected view of the paper graph
G = pg.graph.to_undirected()

# Nodes to connect: the seed papers (node ids in this graph are DOI strings,
# so integer placeholders such as 1 and 4 would never be found).
user_nodes = seed_dois

# Find the shortest path between each pair of these nodes
shortest_paths = find_shortest_paths_between_pairs(G, user_nodes)

if shortest_paths:
    for (u, v), path in shortest_paths.items():
        print(f"节点 {u} 和 {v} 之间的最短路径: {path}")

## Analyze Community

In [17]:
import networkx as nx
import community as community_louvain
import matplotlib.pyplot as plt

# Detect communities with the Louvain algorithm
G = pg.graph.to_undirected()
# Louvain visits nodes in random order; fixing the seed keeps the community
# ids stable between runs so the analysis below can be reproduced.
partition = community_louvain.best_partition(G, random_state=42)

# partition is a dict: key = node id, value = community id
print("社群划分结果:", partition)

社群划分结果: {'10.48550/arXiv.2406.10252': 27, '2108024279': 27, '2273779175': 27, '2286328804': 27, '2116271777': 27, '2262020955': 27, '2293356300': 27, '2289004972': 27, '2257010530': 27, '2259709647': 27, '2307012818': 27, 'ArXiv': 1, 'd9720b90-d60b-48bc-9df8-87a30b9a60dd': 27, '10.48550/arXiv.2412.10415': 2, '2335566763': 2, '2335569348': 2, '1901e811-ee72-4b20-8f7e-de08cd395a10': 3, '10.48550/arXiv.2402.12928': 4, '2284827556': 4, '2268132119': 4, '2275569993': 4, '2284825678': 4, '2284824283': 4, '10.48550/arXiv.2503.01424': 5, '2328342585': 5, '2674998': 5, '2265930173': 5, '51056442': 5, '2349068478': 5, '2337225259': 5, '2216503559': 5, '2265878959': 5, '2112678409': 5, '2118640235': 5, '10.1007/978-981-96-0348-0_3': 6, '31727676': 6, '2303557186': 6, '2282414081': 6, '2282412592': 6, 'b76366f5-0af9-45f3-8fe3-78fdb0114f67': 6, '10.48550/arXiv.2403.03699': 7, '2290069986': 7, '51902554': 7, '1784556': 7, '10.48550/arXiv.2407.16148': 2, '23608432': 2, '2203427167': 2, '2052201732': 

In [None]:
paper_ids = ['9e57dda195973c4b6c81386b1cc44595ecfd4697',
 '9f3ae8055e227edb413c54417c9c216f1f554f52',
 '69b53faee7ce5c007e4d3e3ea532818ed8d0645d',
 'a6aed0c4e0f39a55edb407f492e41f178a62907f',
 'cdb34c0092a767848ca1de6fa7e3a6b822585fa4']

# Ids of the Paper nodes whose 's2PaperId' property is one of the
# hand-picked paper ids listed above.
seed_dois = [
    node['id']
    for node in nodes_json_rvsd
    if node['labels'] == ['Paper'] and node['properties']['s2PaperId'] in paper_ids
]

print(seed_dois)

In [24]:
# Report the Louvain community of each seed node.
# partition is keyed by node id, so this is a direct dictionary lookup; a
# string `in` test against each key would be a substring match and could
# pick up a different node whose id merely contains the seed id.
specific_nodes = seed_dois
for node in specific_nodes:
    if node in partition:
        print(f"节点 {node} 属于Louvain发现的社群 {partition[node]}: {node}")
    else:
        print(f"节点 {node} 不在任何已发现的社群中")

节点 10.48550/arXiv.2406.10252 属于Louvain发现的社群 27: 10.48550/arXiv.2406.10252
节点 10.48550/arXiv.2412.10415 属于Louvain发现的社群 2: 10.48550/arXiv.2412.10415
节点 10.48550/arXiv.2402.12928 属于Louvain发现的社群 4: 10.48550/arXiv.2402.12928
节点 10.48550/arXiv.2503.01424 属于Louvain发现的社群 5: 10.48550/arXiv.2503.01424
节点 10.48550/arXiv.1905.07870 属于Louvain发现的社群 43: 10.48550/arXiv.1905.07870


In [22]:
len(partition)

4085

In [23]:
partition

{'10.48550/arXiv.2406.10252': 27,
 '2108024279': 27,
 '2273779175': 27,
 '2286328804': 27,
 '2116271777': 27,
 '2262020955': 27,
 '2293356300': 27,
 '2289004972': 27,
 '2257010530': 27,
 '2259709647': 27,
 '2307012818': 27,
 'ArXiv': 1,
 'd9720b90-d60b-48bc-9df8-87a30b9a60dd': 27,
 '10.48550/arXiv.2412.10415': 2,
 '2335566763': 2,
 '2335569348': 2,
 '1901e811-ee72-4b20-8f7e-de08cd395a10': 3,
 '10.48550/arXiv.2402.12928': 4,
 '2284827556': 4,
 '2268132119': 4,
 '2275569993': 4,
 '2284825678': 4,
 '2284824283': 4,
 '10.48550/arXiv.2503.01424': 5,
 '2328342585': 5,
 '2674998': 5,
 '2265930173': 5,
 '51056442': 5,
 '2349068478': 5,
 '2337225259': 5,
 '2216503559': 5,
 '2265878959': 5,
 '2112678409': 5,
 '2118640235': 5,
 '10.1007/978-981-96-0348-0_3': 6,
 '31727676': 6,
 '2303557186': 6,
 '2282414081': 6,
 '2282412592': 6,
 'b76366f5-0af9-45f3-8fe3-78fdb0114f67': 6,
 '10.48550/arXiv.2403.03699': 7,
 '2290069986': 7,
 '51902554': 7,
 '1784556': 7,
 '10.48550/arXiv.2407.16148': 2,
 '23608432

In [28]:
# Collect the members of every Louvain community that contains a seed node.
specific_nodes = seed_dois

# Community id of each seed (exact key lookup, not substring matching), with
# duplicates removed so a community shared by several seeds is listed once.
community_index = list(
    dict.fromkeys(partition[node] for node in specific_nodes if node in partition)
)

# Group all nodes by community in a single pass over the partition.
members_by_community = {}
for member, community_id in partition.items():
    members_by_community.setdefault(community_id, []).append(member)

# community_nodes[k] holds the members of community community_index[k]
community_nodes = [members_by_community[i] for i in community_index]

In [37]:
# For each seed community, print up to ten nodes whose id starts with '10.'
# in descending order of closeness centrality within that community's subgraph.
for community_id, members in zip(community_index, community_nodes):
    print(f"Community:{community_id}")
    sub_G = pg.graph.subgraph(members)
    closeness_centrality = nx.closeness_centrality(sub_G)
    sorted_items = sorted(closeness_centrality.items(), key=lambda item: item[1], reverse=True)
    shown = 0
    for entry in sorted_items:
        if shown >= 10:
            break
        node_id = entry[0]
        if node_id.startswith('10.'):
            print(entry, sub_G.nodes[node_id]['title'])
            shown += 1
    print("-"*40)

Community:27
('10.48550/arXiv.2406.10252', 0.15247642287146343) AutoSurvey: Large Language Models Can Automatically Write Surveys
('10.48550/arXiv.2310.07521', 0.12326927255130458) Survey on Factuality in Large Language Models: Knowledge, Retrieval and Domain-Specificity
('10.48550/arXiv.2303.08774', 0.12326927255130458) GPT-4 Technical Report
('10.48550/arXiv.2304.02643', 0.12326927255130458) Segment Anything
('10.48550/arXiv.2203.11147', 0.12326927255130458) Teaching language models to support answers with verified quotes
('10.48550/arXiv.2306.05087', 0.12015645265888457) PandaLM: An Automatic Evaluation Benchmark for LLM Instruction Tuning Optimization
('10.48550/arXiv.2307.03109', 0.12015645265888457) A Survey on Evaluation of Large Language Models
('10.48550/arXiv.2305.13304', 0.11830789184874788) RecurrentGPT: Interactive Generation of (Arbitrarily) Long Text
('10.48550/arXiv.2109.10862', 0.11584901119178467) Recursively Summarizing Books with Human Feedback
('10.48550/arXiv.2306

In [None]:
# from networkx.algorithms.community import girvan_newman

# # 使用 Girvan-Newman 算法发现社群
# communities_generator = girvan_newman(G)

# # 获取前几步的社群划分结果
# top_level_communities = next(communities_generator)
# next_level_communities = next(communities_generator)

# sorted_communities = sorted(map(sorted, top_level_communities))
# print("Girvan-Newman 算法发现的社群 (第一层):", sorted_communities)

# # 查找特定节点所属的社群 (需要遍历社群)
# specific_nodes = seed_dois
# for node in specific_nodes:
#     found = False
#     for i, community in enumerate(sorted_communities):
#         if node in community:
#             print(f"节点 {node} 属于 Girvan-Newman 算法发现的社群 {i}: {community}")
#             found = True
#             break
#     if not found:
#         print(f"节点 {node} 不在任何已发现的社群中")

In [None]:
from networkx.algorithms.community import label_propagation_communities

# Detect communities with the label propagation algorithm
communities = list(label_propagation_communities(G))
print("标签传播算法发现的社群:", communities)

# For each seed node, report the first community that contains it
specific_nodes = seed_dois
for node in specific_nodes:
    for i, community in enumerate(communities):
        if node in community:
            print(f"节点 {node} 属于标签传播算法发现的社群 {i}: {community}")
            break
    else:
        # the inner loop finished without finding the node
        print(f"节点 {node} 不在任何已发现的社群中")

Stats

In [None]:
paperbot.nodes_json[0].keys()

In [None]:
paperbot.nodes_json[0]['source']

In [None]:
paperbot.nodes_json[0]['properties'].keys()

In [None]:
paperbot.nodes_json[0]['properties']['source']

In [None]:
import copy 

def remove_key_values(input_dict, keys_to_delete):
    """Return a deep copy of ``input_dict`` without the keys in ``keys_to_delete``.

    Keys that are not present are ignored; ``input_dict`` is never modified.
    """
    pruned = copy.deepcopy(input_dict)
    for unwanted in keys_to_delete:
        # pop with a default tolerates keys that do not exist
        pruned.pop(unwanted, None)
    return pruned

In [None]:
# Fold the top-level 'source' / 'sourceDesc' lists of each node into its
# properties, then drop the top-level keys.
for item in paperbot.nodes_json:
    source = item.get('source')
    source_desc = item.get('sourceDesc')
    if (isinstance(source, list) and len(source) > 0) or (isinstance(source_desc, list) and len(source_desc) > 0):
        properties = item['properties']
        for prop_key, extra in (('source', source), ('sourceDesc', source_desc)):
            if not isinstance(extra, list):
                # Nothing to merge for this key; keep the existing property
                # rather than extending with / overwriting by a non-list.
                continue
            if isinstance(properties.get(prop_key), list):
                properties[prop_key].extend(extra)
            else:
                properties[prop_key] = extra
    # Remove the keys on the stored record itself. Rebinding the loop variable
    # to a stripped copy would leave paperbot.nodes_json unchanged; removing
    # them in place also makes re-running this cell a no-op.
    item.pop('source', None)
    item.pop('sourceDesc', None)

In [None]:
set([x['labels'][0] for x in paperbot.nodes_json])

In [None]:
len([x['id'] for x in paperbot.nodes_json]), len(set([x['id'] for x in paperbot.nodes_json]))

In [None]:
for item in paperbot.nodes_json:
    if item['labels'] == ['Venue']:
        print(item)

In [None]:
for item in paperbot.nodes_json:
    item['ref_cnt'] = len(set(item['properties']['source']))

In [None]:
sorted_data_lambda = sorted(paperbot.nodes_json, key=lambda item: item['ref_cnt'], reverse=True)

In [None]:
i = 0
for item in sorted_data_lambda:
    if i < 10:
        if item['labels'] == ['Paper']:
            print(item)
            print(item['properties']['title'], item['ref_cnt'])
            i += 1
    else:
        break

In [None]:
set([x['relationshipType'] for x in paperbot.edges_json])

In [None]:

for item in paperbot.edges_json:
    if item['relationshipType'] == 'SIMILAR_TO':
        print(item)
        break

In [None]:
# Keep every non-similarity edge as-is; keep a SIMILAR_TO edge only when its
# weight is above 0.7.
tmp = [
    edge
    for edge in paperbot.edges_json
    if edge['relationshipType'] != 'SIMILAR_TO' or edge['properties']['weight'] > 0.7
]




In [None]:
len(tmp), len(paperbot.edges_json)

In [None]:
from collections import defaultdict

end_to_paper_dcts = [x for x in tmp if 'arXiv' in x['endNodeId']]

target_key = 'endNodeId'
value_counts = defaultdict(int)

for item in end_to_paper_dcts:
    if target_key in item:
        value = item[target_key]
        value_counts[value] += 1

# 按照出现次数从高到低排序
sorted_counts = sorted(value_counts.items(), key=lambda item: item[1], reverse=True)

In [None]:
print(f"'{target_key}' 对应的取值统计 (从高到低排序):")

node_dois = [x['id'] for x in paperbot.nodes_json]

# Index node records by id once (first occurrence wins) instead of scanning
# the id list for every reported value.
node_by_id = {}
for node_record in paperbot.nodes_json:
    node_by_id.setdefault(node_record['id'], node_record)

# Report every end node referenced more than 30 times and remember it for
# further expansion.
next_dois = []
for value, count in sorted_counts:
    if count > 30:
        print(f"{value}: {count}")
        node_record = node_by_id.get(value)
        # An edge endpoint may have no node record; print None for its title
        # instead of aborting the whole report.
        print(node_record['properties'].get('title') if node_record is not None else None)
        next_dois.append(value)


In [None]:
from collections import defaultdict

start_from_paper_dcts = [x for x in tmp if 'arXiv' in x['startNodeId']]

target_key = 'startNodeId'
value_counts = defaultdict(int)

for item in start_from_paper_dcts:
    if target_key in item:
        value = item[target_key]
        value_counts[value] += 1

# 按照出现次数从高到低排序
sorted_counts = sorted(value_counts.items(), key=lambda item: item[1], reverse=True)

print(f"'{target_key}' 对应的取值统计 (从高到低排序):")

node_dois = [x['id'] for x in paperbot.nodes_json]

# Index node records by id once (first occurrence wins) instead of scanning
# the id list for every reported value.
node_by_id = {}
for node_record in paperbot.nodes_json:
    node_by_id.setdefault(node_record['id'], node_record)

# Report every start node referenced more than 30 times.
for value, count in sorted_counts:
    if count > 30:
        print(f"{value}: {count}")
        node_record = node_by_id.get(value)
        # An edge endpoint may have no node record; print None for its title
        # instead of aborting the whole report.
        print(node_record['properties'].get('title') if node_record is not None else None)

# Progressive Analysis

## Basic Search Results

In [12]:
# Nodes whose 'source' attribute names at least one of the basic search
# stages (seed papers, papers they cite, papers citing them).
_BASIC_SOURCES = set(['Seed', 'CitedPaper', 'CitingPaper'])

basic_nodes, basic_nodes_id = [], []
for node_id, node in pg.graph.nodes(data=True):
    if _BASIC_SOURCES & set(node['source']):
        basic_nodes.append(node)
        basic_nodes_id.append(node_id)

# Subset of those nodes that are papers
basic_paper_nodes = [attrs for attrs in basic_nodes if attrs['nodeType'] == 'Paper']
print(len(basic_nodes), len(basic_paper_nodes))

1528 213


In [13]:
basic_graph = pg.graph.subgraph(basic_nodes_id)

In [15]:
len(basic_graph.edges)

1844

In [None]:
def pagerank_multidigraph(graph, damping_factor=0.85, max_iter=100, tol=1e-6):
    """Compute PageRank scores by power iteration.

    Only ``number_of_nodes()``, iteration over nodes, ``out_degree(node)`` and
    ``out_edges(node)`` are used, so parallel edges of a multigraph each carry
    their own share of a node's rank.

    Note: nodes without outgoing edges pass no rank onward, so the returned
    values are not guaranteed to sum to 1.

    Args:
        graph (nx.MultiDiGraph): input multi-directed graph.
        damping_factor (float): damping factor between 0 and 1, usually 0.85.
        max_iter (int): maximum number of iterations.
        tol (float): convergence tolerance on the summed absolute change.

    Returns:
        dict: node -> PageRank value. When the iteration does not converge
        within ``max_iter`` rounds, the scores of the last completed round are
        returned (for ``max_iter`` of 0, the uniform starting scores).
    """
    n_nodes = graph.number_of_nodes()
    if not n_nodes:
        return {}

    pagerank = {node: 1.0 / n_nodes for node in graph}
    # Rank every node receives regardless of its in-links
    teleport = (1 - damping_factor) / n_nodes

    for _ in range(max_iter):
        new_pagerank = dict.fromkeys(graph, teleport)
        for node in graph:
            out_degree = graph.out_degree(node)
            if out_degree > 0:
                # Each outgoing edge forwards an equal share of the node's rank
                share = damping_factor * pagerank[node] / out_degree
                for _source, neighbor in graph.out_edges(node):
                    new_pagerank[neighbor] += share

        # Check for convergence
        diff = sum(abs(new_pagerank[node] - pagerank[node]) for node in graph)
        pagerank = new_pagerank
        if diff < tol:
            break

    # Always return the latest scores, also when max_iter was exhausted
    return pagerank

In [50]:
def _rank_desc(scores):
    """Return the ``(node, score)`` pairs of ``scores`` sorted by descending score."""
    return sorted(scores.items(), key=lambda pair: pair[1], reverse=True)


# In-degree centrality
in_degree_centrality = nx.in_degree_centrality(basic_graph)
sorted_items = _rank_desc(in_degree_centrality)
print("入度中心性:", sorted_items[0:10])

# Out-degree centrality
out_degree_centrality = nx.out_degree_centrality(basic_graph)
sorted_items_2 = _rank_desc(out_degree_centrality)
print("出度中心性:", sorted_items_2[0:10])

# Betweenness centrality
betweenness_centrality = nx.betweenness_centrality(basic_graph)
sorted_items_3 = _rank_desc(betweenness_centrality)
print("介数中心性:", sorted_items_3[0:10])

# Closeness centrality
closeness_centrality = nx.closeness_centrality(basic_graph)
sorted_items_4 = _rank_desc(closeness_centrality)
print("紧密中心性:", sorted_items_4[0:10])

# Nodes that are in the top 100 of all four rankings
new_set = set.intersection(
    *[
        {pair[0] for pair in ranking[0:100]}
        for ranking in (sorted_items, sorted_items_2, sorted_items_3, sorted_items_4)
    ]
)
new_set

入度中心性: [('10.48550/arXiv.2406.10252', 0.02881466928618206), ('10.48550/arXiv.2503.01424', 0.019646365422396856), ('10.48550/arXiv.2303.18223', 0.01440733464309103), ('10.48550/arXiv.2312.10997', 0.01440733464309103), ('10.48550/arXiv.2302.13971', 0.01440733464309103), ('10.48550/arXiv.2502.17086', 0.011787819253438114), ('10.48550/arXiv.2402.12928', 0.009823182711198428), ('10.48550/arXiv.2101.01169', 0.0091683038637852), ('10.48550/arXiv.2411.00816', 0.0091683038637852), ('25248f80-fe99-48e5-9b8e-9baef3b8e23b', 0.008513425016371971)]
出度中心性: [('10.48550/arXiv.2402.12928', 0.06614276358873608), ('10.48550/arXiv.2412.10415', 0.03798297314996726), ('10.48550/arXiv.2406.10252', 0.03470857891290111), ('2108024279', 0.0045841519318926), ('10.48550/arXiv.2101.01169', 0.0026195153896529143), ('2273553706', 0.0026195153896529143), ('2286328804', 0.0026195153896529143), ('10.18653/V1/2021.NAACL-MAIN.341', 0.0026195153896529143), ('10.48550/arXiv.1805.08660', 0.0026195153896529143), ('10.3390/SU1

{'10.1016/j.xinn.2021.100179',
 '10.1038/s41586-024-07487-w',
 '10.18653/V1/2021.NAACL-MAIN.341',
 '10.48550/arXiv.1805.08660',
 '10.48550/arXiv.2101.01169',
 '10.48550/arXiv.2305.03514',
 '10.48550/arXiv.2307.03172',
 '10.48550/arXiv.2309.00770',
 '10.48550/arXiv.2406.10252'}

# What's Next

## Further Expansion (Optional)

In [None]:
len(next_dois)

In [None]:
papers_info = []
for item in paperbot.nodes_json:
    if item['labels'] == ['Paper']:
        papers_info.append(item)


In [None]:
len(papers_info)

In [None]:
next_author_ids = []
for item in papers_info:
    if item['id'] in next_dois:
        author_ids = [x['authorId'] for x in item['properties']['authors']][0:5]
        for author_id in author_ids:
            if author_id not in next_author_ids:
                next_author_ids.append(author_id)

In [None]:
# NOTE(review): `data` is not assigned in any cell visible in this notebook, so
# on a fresh kernel this cell would raise NameError unless it is defined
# elsewhere. Its items are expected to carry a 'score' key. Confirm what this
# was meant to rank, or remove the cell.
sorted_data_lambda = sorted(data, key=lambda item: item['score'], reverse=True)
print(sorted_data_lambda)

In [None]:
from apis.s2_api import SemanticScholarKit

s2 = SemanticScholarKit()
authors = s2.search_author_by_ids(author_ids=next_author_ids[0:100])

In [None]:
# NOTE(review): `affiliations` is not assigned in any cell visible in this
# notebook; on a fresh kernel this cell would raise NameError unless it is
# defined elsewhere. Confirm its origin or remove the cell.
affiliations

In [None]:
authors[0].keys()

In [None]:
find_authors_id = [x['authorId'] for x in authors]

In [None]:
# Score every paper by the summed h-index of its first five authors.
# Papers that are not in next_dois get a score of 0.

# h-index per fetched author id (first occurrence wins). A missing or None
# 'hIndex' counts as 0 so the sum below cannot fail on it.
hindex_by_author = {}
for fetched_id, author in zip(find_authors_id, authors):
    hindex_by_author.setdefault(fetched_id, author.get('hIndex') or 0)

# next_author_ids is deliberately left untouched here: this cell only reads
# author data, and clearing that list would break a re-run of the author query.
for item in papers_info:
    hindex = 0
    if item['id'] in next_dois:
        author_ids = [x['authorId'] for x in item['properties']['authors']][0:5]
        # authors that were not fetched contribute 0
        hindex = sum(hindex_by_author.get(author_id, 0) for author_id in author_ids)
    item['authors_hindex'] = hindex

In [None]:
sorted_data_lambda = sorted(papers_info, key=lambda item: item['authors_hindex'], reverse=True)

i = 0
for item in sorted_data_lambda:
    if i < 20:
        print(item)
        print(item['properties']['title'])
        i += 1

### Expand References for Highly Correlated Papers

In [None]:
# Papers that are strongly, but not near-identically, similar to a seed paper
# (similarity weight strictly between 0.75 and 0.9).
next_dois = []
for edge in paperbot.edges_json:
    if edge['relationshipType'] == 'SIMILAR_TO' and edge['startNodeId'] in init_paper_dois:
        if 0.75 < edge['properties']['weight'] < 0.9:
            # The edge starts at a seed paper, so the paper to expand is the one
            # at the other end; collecting startNodeId would only return the
            # seed papers themselves.
            similar_id = edge['endNodeId']
            if similar_id not in next_dois and similar_id not in init_paper_dois:
                next_dois.append(similar_id)


In [None]:
import time
for paper_doi in next_dois:
    paperbot.get_cited_papers(paper_doi) 
    time.sleep(5)

### Expand Key Cited Papers

Identify key papers from the seed papers' reference lists.

In [None]:
# NOTE(review): this cell looks unfinished. `cited_paper_dois` is created but
# never filled, and the loop below only appends `startNodeId` values that were
# already required to be in `init_paper_dois`, so `next_dois` can contain seed
# paper ids only. Confirm the intended selection of key cited papers.
cited_paper_dois = []

next_dois = []
for edge in paperbot.edges_json:
    # SIMILAR_TO edges that start at a seed paper
    if edge['relationshipType'] == 'SIMILAR_TO' and edge['startNodeId'] in init_paper_dois:
        # similarity weight strictly between 0.75 and 0.9
        if edge['properties']['weight'] > 0.75 and edge['properties']['weight'] < 0.9:
            id = edge['startNodeId']
            if id not in next_dois:
                next_dois.append(edge['startNodeId'])

## Paper Filtering

In [None]:
# Accumulators for the filtered graph; only filtered_dois is filled in this cell.
filtered_dois, filtered_nodes, filtered_relationships = [], [], []

# seed paper
filtered_dois.extend(init_paper_dois)


for node in paperbot.nodes_json:
    # reference for seed paper
    # NOTE(review): the condition requires the node id to be in init_paper_dois,
    # all of which were already added above, so this can only append duplicates
    # of seed ids. Presumably it was meant to select cited (non-seed) papers;
    # confirm the intended filter.
    if node['id'] in init_paper_dois and 'CitedPaper' in node['properties']['source']:
        filtered_dois.append(node['id'])


In [None]:
node['properties']

In [None]:
for node in paperbot.nodes_json:
    # reference for seed paper
    if node['labels'] == ['Paper'] and 'RecommendedPaper' in node['properties']['source']:
        print(node['properties']['title'])

In [None]:
for node in paperbot.nodes_json:
    # reference for seed paper
    if node['labels'] == ['Paper'] and node['id'] in init_paper_dois :
        print(node['properties']['title'], '\n', node['properties']['abstract'])
        print('-'*40)