# Paper Collection

## Setup

In [1]:
import os
import json

from paper_collect import PaperCollector

In [7]:
'abc'.startswith('a')

True

In [2]:
llm_api_key = os.getenv('GEMINI_API_KEY_3')
llm_model_name="gemini-2.0-flash"
embed_api_key = os.getenv('GEMINI_API_KEY_3')
embed_model_name="models/text-embedding-004"

research_topic = "llm literature review"
seed_dois = ['10.48550/arXiv.2406.10252',  # AutoSurvey: Large Language Models Can Automatically Write Surveys
            '10.48550/arXiv.2412.10415',  # Generative Adversarial Reviews: When LLMs Become the Critic
            '10.48550/arXiv.2402.12928',  # A Literature Review of Literature Reviews in Pattern Analysis and Machine Intelligence 
            ]
seed_titles = ['PaperRobot: Incremental Draft Generation of Scientific Ideas',
            'From Hypothesis to Publication: A Comprehensive Survey of AI-Driven Research Support Systems'
            ]

In [3]:
ps = PaperCollector(   
    research_topic = research_topic,   
    seed_paper_titles = seed_titles, 
    seed_paper_dois = seed_dois,
    llm_api_key = llm_api_key,
    llm_model_name = llm_model_name,
    embed_api_key = embed_api_key,
    embed_model_name = embed_model_name,
    from_dt = '2020-01-01',
    to_dt = '2025-04-30',
    fields_of_study = ['Computer Science'])

## Inital Search
- initial search for seed paper metadata
- basic search takes about 30-60 seconds 

### Paper Search

In [4]:
# --- INITIAL QUERY on SEED ---
# initial query for seed papers basic information
print("--- Running Initial Query for Seed Papers Information ---")
await ps.init_search(
    ps.research_topic,
    ps.seed_paper_titles,
    ps.seed_paper_dois,
    ps.search_limit,
    ps.from_dt,
    ps.to_dt
)
# get seed DOIs
seed_paper_dois = [node['id'] for node in ps.nodes_json if node['labels'] == ['Paper'] and node['properties'].get('from_seed')==True]
seed_author_ids = []
for node in ps.nodes_json:
    if node['labels'] == ['Paper'] and node['properties'].get('from_seed')==True and isinstance(node['properties'].get('authors'), list):
        authors_id = [x['authorId'] for x in node['properties']['authors'] if x['authorId'] is not None] 
        seed_author_ids.extend(authors_id)
seed_paper_json = [node for node in ps.nodes_json if node['labels'] == ['Paper'] and node['properties'].get('from_seed')==True]
ps.explored_nodes['seed'].extend(seed_paper_dois) 

2025-04-15 09:14:51,604 - INFO - SemanticScholarKit initialized with max_concurrency=10, sleep_interval=3.0s
2025-04-15 09:14:51,605 - INFO - Fetching papers by 3 DOIs...
2025-04-15 09:14:51,606 - INFO - Fetching papers by title: 'PaperRobot: Incremental Draft Generation of Scientific Ideas...'
2025-04-15 09:14:51,606 - INFO - Fetching papers by title: 'From Hypothesis to Publication: A Comprehensive Survey of AI-Driven Research Support Systems...'
2025-04-15 09:14:51,606 - INFO - Fetching papers by topic: 'llm literature review...'
2025-04-15 09:14:51,606 - INFO - Running 4 initial query tasks concurrently...
2025-04-15 09:14:51,606 - INFO - async_search_paper_by_ids: Creating 1 tasks for 3 IDs.
2025-04-15 09:14:51,607 - INFO - async_search_paper_by_ids: Gathering 1 tasks...
2025-04-15 09:14:51,607 - INFO - async_search_paper_by_keywords: Searching papers by keyword: 'PaperRobot: Incremental Draft Generation of Scient...' with effective limit 50.
2025-04-15 09:14:51,607 - INFO - _sync

--- Running Initial Query for Seed Papers Information ---


2025-04-15 09:14:52,708 - INFO - HTTP Request: POST https://api.semanticscholar.org/graph/v1/paper/batch?fields=abstract%2Cauthors%2CcitationCount%2CcitationStyles%2CcorpusId%2CexternalIds%2CfieldsOfStudy%2CinfluentialCitationCount%2CisOpenAccess%2Cjournal%2CopenAccessPdf%2CpaperId%2CpublicationDate%2CpublicationTypes%2CpublicationVenue%2CreferenceCount%2Cs2FieldsOfStudy%2Ctitle%2Curl%2Cvenue%2Cyear "HTTP/1.1 200 OK"
2025-04-15 09:14:52,715 - INFO - _sync_get_papers: API call successful for batch (first 5: ['10.48550/arXiv.2406.10252', '10.48550/arXiv.2412.10415', '10.48550/arXiv.2402.12928']...), returning 3 items.
2025-04-15 09:14:52,886 - INFO - HTTP Request: GET https://api.semanticscholar.org/graph/v1/paper/search?query=PaperRobot%3A+Incremental+Draft+Generation+of+Scientific+Ideas&fields=abstract%2Cauthors%2CcitationCount%2CcitationStyles%2CcorpusId%2CexternalIds%2CfieldsOfStudy%2CinfluentialCitationCount%2CisOpenAccess%2Cjournal%2CopenAccessPdf%2CpaperId%2CpublicationDate%2Cpubl

### Graph Stats

In [5]:
import copy
from collections import Counter

g = copy.deepcopy(ps.pg.graph)
print(len(ps.nodes_json), len(ps.edges_json), len(g.nodes), len(g.edges))

# check node types
node_types = [g.nodes[nid].get('nodeType') for nid in g.nodes]
node_types_cnt = Counter(node_types)
# 按计数降序排序
sorted_node_counts = node_types_cnt.most_common()
print(sorted_node_counts)

# check node types
edge_types = [d.get('relationshipType') for u, v, d in g.edges(data=True)]
edge_types_cnt = Counter(edge_types)
# 按计数降序排序
sorted_egdes_counts = edge_types_cnt.most_common()
print(sorted_egdes_counts)

373 321 373 321
[('Author', 258), ('Paper', 56), ('Journal', 31), ('Venue', 28)]
[('WRITES', 262), ('PRINTS_ON', 30), ('RELEASES_IN', 29)]


### Graph Viz

In [6]:
import sys
import os

parent_dir = os.path.dirname(os.getcwd())
sys.path.append(parent_dir)

from graph.graph_viz import GraphViz
viz = GraphViz(g, 'test')
viz.preprocessing()
viz.visulization()

Running preprocessing...
Preprocessing complete.
Calculating layout...
Layout calculated using kamada_kawai_layout.
Preparing data sources...
Creating plot...
Creating search widgets...
Creating legends...
Arranging layout...
Displaying plot.


## Basic Search
- basic search for the following information based on user input:
    - search_citation: enable search along citation chain
        - 'reference': papers cited by seeds
        - 'citing': papers cites seeds
        - 'both': all of the above
    - search_author: search on seed papers authors
        - would get authors information and other publications from the authors
    - find_recommend: get recommended papers based on seed papers
    

### Paper Search

In [8]:
search_citation = "both"
search_author = True
find_recommend = True

In [None]:
# --- MORE INFORMATION on SEED ---
print("--- Getting More Information Related to Seed Papers ---")
# basic search for seed papers
# may include seed paper authors, seed paper citation chain, recommendations based on seed papers 
await ps.collect(
    seed_paper_dois=seed_paper_dois,
    seed_author_ids=seed_author_ids,
    search_citation = search_citation,
    search_author = search_author,
    find_recommend = find_recommend,
    recommend_limit = ps.recommend_limit,
    citation_limit = ps.citation_limit,
    from_dt = ps.from_dt,
    to_dt = ps.to_dt,
    fields_of_study = ps.fields_of_study,
    )

if search_citation in ['reference', 'both']:
    ps.explored_nodes['reference'].extend(seed_paper_dois) 
if search_citation in ['citing', 'both']:
    ps.explored_nodes['citing'].extend(seed_paper_dois) 
if search_author:
    ps.explored_nodes['author'].extend(seed_author_ids) 

2025-04-15 09:16:42,883 - INFO - Preparing author info task for 27 authors.
2025-04-15 09:16:42,883 - INFO - SemanticScholarKit initialized with max_concurrency=10, sleep_interval=3.0s
2025-04-15 09:16:42,884 - INFO - Preparing reference/citation tasks for 4 seed papers.
2025-04-15 09:16:42,884 - INFO - SemanticScholarKit initialized with max_concurrency=10, sleep_interval=3.0s
2025-04-15 09:16:42,884 - INFO - Preparing recommendations task.
2025-04-15 09:16:42,884 - INFO - SemanticScholarKit initialized with max_concurrency=10, sleep_interval=3.0s
2025-04-15 09:16:42,885 - INFO - Running 10 main data collection tasks concurrently...
2025-04-15 09:16:42,885 - INFO - Fetching info for 27 authors...
2025-04-15 09:16:42,885 - INFO - async_search_author_by_ids: Fetching 27 authors by ID in 1 batches.
2025-04-15 09:16:42,885 - INFO - async_search_author_by_ids: Gathering 1 tasks...
2025-04-15 09:16:42,886 - INFO - Fetching papers cited by 10.48550/arXiv.2406.10252...
2025-04-15 09:16:42,886

--- Getting More Information Related to Seed Papers ---


2025-04-15 09:16:43,952 - INFO - HTTP Request: GET https://api.semanticscholar.org/graph/v1/paper/10.48550/arXiv.2412.10415/citations?fields=contexts%2Cintents%2CcontextsWithIntent%2CisInfluential%2Cabstract%2Cauthors%2CcitationCount%2CcitationStyles%2CcorpusId%2CexternalIds%2CfieldsOfStudy%2CinfluentialCitationCount%2CisOpenAccess%2Cjournal%2CopenAccessPdf%2CpaperId%2CpublicationDate%2CpublicationTypes%2CpublicationVenue%2CreferenceCount%2Cs2FieldsOfStudy%2Ctitle%2Curl%2Cvenue%2Cyear&offset=0&limit=100 "HTTP/1.1 200 OK"
2025-04-15 09:16:43,953 - INFO - HTTP Request: GET https://api.semanticscholar.org/graph/v1/paper/10.48550/arXiv.2402.12928/citations?fields=contexts%2Cintents%2CcontextsWithIntent%2CisInfluential%2Cabstract%2Cauthors%2CcitationCount%2CcitationStyles%2CcorpusId%2CexternalIds%2CfieldsOfStudy%2CinfluentialCitationCount%2CisOpenAccess%2Cjournal%2CopenAccessPdf%2CpaperId%2CpublicationDate%2CpublicationTypes%2CpublicationVenue%2CreferenceCount%2Cs2FieldsOfStudy%2Ctitle%2Cur

### Graph Stat

In [10]:
import copy
from collections import Counter

g = copy.deepcopy(ps.pg.graph)
print(len(ps.nodes_json), len(ps.edges_json), len(g.nodes), len(g.edges))

# check node types
node_types = [g.nodes[nid].get('nodeType') for nid in g.nodes]
node_types_cnt = Counter(node_types)
# 按计数降序排序
sorted_node_counts = node_types_cnt.most_common()
print(sorted_node_counts)

# check node types
edge_types = [d.get('relationshipType') for u, v, d in g.edges(data=True)]
edge_types_cnt = Counter(edge_types)
# 按计数降序排序
sorted_egdes_counts = edge_types_cnt.most_common()
print(sorted_egdes_counts)

3986 4794 3717 4794
[('Author', 2821), ('Paper', 652), ('Journal', 123), ('Venue', 118), ('Affiliation', 3)]
[('WRITES', 3882), ('CITES', 377), ('RELEASES_IN', 349), ('PRINTS_ON', 183), ('WORKS_IN', 3)]


### Graph Viz

In [11]:
import sys
import os

parent_dir = os.path.dirname(os.getcwd())
sys.path.append(parent_dir)

# from graph.graph_viz import GraphViz
viz = GraphViz(g, 'paper graph after basic search')
viz.preprocessing()
viz.visulization()

Running preprocessing...
Preprocessing complete.
Calculating layout...
Graph is large, using spring_layout (may take time)...
Layout calculated using spring_layout.
Preparing data sources...
Creating plot...
Creating search widgets...
Creating legends...
Arranging layout...
Displaying plot.


### Filter & Ranking

In [12]:
# seed papers
seed_paper_dois = [nid for nid in g.nodes 
                   if g.nodes[nid].get('nodeType')=='Paper' and
                      g.nodes[nid].get('from_seed') == True]
seed_paper_nodes = [g.nodes[nid] for nid in g.nodes 
                    if g.nodes[nid].get('nodeType')=='Paper' and
                        g.nodes[nid].get('from_seed') == True]

seed_author_ids, seed_author_nodes = [], []
for doi in seed_paper_dois:
    pre_ids = g.predecessors(doi)
    for id in pre_ids:
        if g.nodes[id].get('nodeType')=='Author':
            seed_author_ids.append(id)
            seed_author_nodes.append(g.nodes[id])

Paper Ranking  
- most cited papers


The purpose of most cited papers is to build a common citation chain for seed papers.  
A filtering condition is:  
- paper cited more than min(# of seed papers, 4)  
- rank order by cited in descending, get more than 10 of the papers  
- not in seed papers  
- may add significance citeria in the future

In [13]:
paper_stat = []
for n in g.nodes:
    if g.nodes[n].get('nodeType') == 'Paper':
        in_edges_info = g.in_edges(n, data=True)
        cite_cnt = sum([1 for u, v, data in in_edges_info if data.get('relationshipType') == 'CITES'])
        paper_stat.append((n, cite_cnt))

In [15]:
tot_seed_paper_cnt = len(seed_paper_dois)
tot_paper_cnt = len([nid for nid in g.nodes if g.nodes[nid].get('nodeType')=='Paper'])
print(tot_seed_paper_cnt, tot_paper_cnt)

4 652


In [14]:
import numpy as np
sorted_by_cite = sorted(paper_stat, key=lambda item: item[1], reverse=True)
print(sorted_by_cite[0:20])

for item in sorted_by_cite[0:20]:
    n = item[0]
    cite_cnt = item[1]
    # paper infos
    title = g.nodes[n].get('title')
    in_seed = True if item[0] in seed_paper_dois else False
    overall_cite_cnt = g.nodes[n].get('citationCount')
    influential_cite_cnt = g.nodes[n].get('influentialCitationCount')
    # author infors
    hindex_lst = []
    for u in g.predecessors(n):
        if g.nodes[u].get('nodeType') == 'Author':
            hIndex = g.nodes[u].get('hIndex')
            if hIndex:
                hindex_lst.append(hIndex)
            paperCount = g.nodes[u].get('paperCount')
            citationCount = g.nodes[u].get('citationCount')
    h_index = np.average(hindex_lst)

    paper_info = {"doi":n, "title":title, "if_seed": in_seed,
                  "local_refs":cite_cnt, "global_refs":overall_cite_cnt, "inf_cite_cnt":influential_cite_cnt,
                  "h_index": h_index}
    print(paper_info)

[('10.48550/arXiv.2406.10252', 14), ('10.48550/arXiv.2312.10997', 2), ('10.48550/arXiv.2305.14627', 2), ('10.48550/arXiv.2303.18223', 2), ('10.48550/arXiv.2302.13971', 2), ('10.1145/3505244', 2), ('10.48550/arXiv.2408.07884', 2), ('10.48550/arXiv.2410.09403', 2), ('10.48550/arXiv.2408.06292', 2), ('10.48550/arXiv.2406.05688', 2), ('10.48550/arXiv.2310.07984', 2), ('10.48550/arXiv.2310.03302', 2), ('10.48550/arXiv.2310.01783', 2), ('10.48550/arXiv.2307.05492', 2), ('10.48550/arXiv.2306.00622', 2), ('10.18653/v1/2024.acl-long.18', 2), ('10.48550/arXiv.2305.16960', 2), ('10.2139/ssrn.4526071', 2), ('10.1007/978-981-97-9443-0_35', 2), ('10.48550/arXiv.2412.10415', 1)]
{'doi': '10.48550/arXiv.2406.10252', 'title': 'AutoSurvey: Large Language Models Can Automatically Write Surveys', 'if_seed': True, 'local_refs': 14, 'global_refs': 14, 'inf_cite_cnt': 2, 'h_index': 17.0}
{'doi': '10.48550/arXiv.2312.10997', 'title': 'Retrieval-Augmented Generation for Large Language Models: A Survey', 'if_se

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


Author Ranking  
- most cited authors

The purpose of most cited authors is to get key authors info, which may lead to further investigation.  
A filtering condition is:  
- author cited more than min(0.1 * total papers, 10)  
- rank order by cited in descending, get more than 10 of the authors  
- not in seed authors  
- may add significance citeria in the future

In [16]:
author_stat = []
for n in g.nodes:
    if g.nodes[n].get('nodeType') == 'Author':
        out_edges_info = g.out_edges(n, data=True)
        write_cnt = sum([1 for u, v, data in out_edges_info if data.get('relationshipType') == 'WRITES'])
        author_stat.append((n, write_cnt))

In [19]:
sorted_by_writes = sorted(author_stat, key=lambda item: item[1], reverse=True)
for item in sorted_by_writes[0:20]:
    aid = item[0]
    a_name = g.nodes[aid].get('name')
    hIndex = g.nodes[aid].get('hIndex')
    in_seed = True if aid in seed_author_ids else False
    global_paper_cnt = g.nodes[aid].get('paperCount')
    global_citation_cnt = g.nodes[aid].get('citationCount')
    print({"author_id":aid, "author_name":a_name, "write_cnt":item[1], "is_seed":in_seed,
           "hIndex":hIndex, "global_paper_cnt":global_paper_cnt, "global_citation_cnt":global_citation_cnt, })


{'author_id': '2674998', 'author_name': 'Xiaocheng Feng', 'write_cnt': 60, 'is_seed': True, 'hIndex': None, 'global_paper_cnt': None, 'global_citation_cnt': None}
{'author_id': '2108024279', 'author_name': 'Yidong Wang', 'write_cnt': 39, 'is_seed': True, 'hIndex': 17, 'global_paper_cnt': 42, 'global_citation_cnt': 3711}
{'author_id': '152277111', 'author_name': 'Bing Qin', 'write_cnt': 31, 'is_seed': False, 'hIndex': None, 'global_paper_cnt': None, 'global_citation_cnt': None}
{'author_id': '2257010530', 'author_name': 'Xinyu Dai', 'write_cnt': 25, 'is_seed': True, 'hIndex': None, 'global_paper_cnt': None, 'global_citation_cnt': None}
{'author_id': '2259709647', 'author_name': 'Min Zhang', 'write_cnt': 24, 'is_seed': True, 'hIndex': None, 'global_paper_cnt': None, 'global_citation_cnt': None}
{'author_id': '51056442', 'author_name': 'Xiachong Feng', 'write_cnt': 20, 'is_seed': True, 'hIndex': None, 'global_paper_cnt': None, 'global_citation_cnt': None}
{'author_id': '2118640235', 'auth

### Similarity Check
- check similar papers to seed, see if they are worth exploration

In [20]:
similarity_threshold = 0.7
expanded_k_papers = 20
non_seed_paper_dois = [node['id'] for node in ps.nodes_json if node['labels'] == ['Paper'] and node['id'] not in seed_paper_dois]
paper_nodes_json = [node for node in ps.nodes_json 
                    if node['labels'] == ['Paper'] and 
                    node['properties'].get('title') is not None and node['properties'].get('abstract') is not None]

In [21]:
# calculate paper nodes similarity
semantic_similar_pool = await ps.cal_embed_and_similarity(
    paper_nodes_json=paper_nodes_json,
    paper_dois_1=seed_paper_dois, 
    paper_dois_2=non_seed_paper_dois,
    similarity_threshold=similarity_threshold,
    )

if len(semantic_similar_pool) > 0:
    candit_items = []
    for item in semantic_similar_pool:
        wt = item.get('properties', {}).get('weight')
        if (wt > 0.7 and wt < 0.95) or (wt > 0.7 and wt < 0.95):
            if item['startNodeId'] in seed_paper_dois and item['endNodeId'] not in seed_paper_dois:
                candit_items.append((item['endNodeId'], wt))
            elif item['startNodeId'] not in seed_paper_dois and item['endNodeId'] in seed_paper_dois:
                candit_items.append((item['startNodeId'], wt))
    sorted_items = sorted(candit_items, key=lambda item: item[1], reverse=True)

    # filter top k similarities
    expanded_paper_dois = [x[0] for x in sorted_items[0:expanded_k_papers]]

2025-04-15 09:51:44,678 - INFO - Generating embeddings for 586 papers...
2025-04-15 09:51:53,770 - INFO - Shape of embeds_1: (4, 768)
2025-04-15 09:51:53,770 - INFO - Shape of embeds_2: (582, 768)
2025-04-15 09:51:53,771 - INFO - Calculating similarity matrix...
2025-04-15 09:51:53,785 - INFO - Processing similarity matrix to create relationships...


In [24]:
for doi in expanded_paper_dois:
    # paper infos
    title = g.nodes[doi].get('title')
    in_seed = True if doi in seed_paper_dois else False
    overall_cite_cnt = g.nodes[doi].get('citationCount')
    influential_cite_cnt = g.nodes[doi].get('influentialCitationCount')
    # author infors
    hindex_lst = []
    for u in g.predecessors(doi):
        if g.nodes[u].get('nodeType') == 'Author':
            hIndex = g.nodes[u].get('hIndex')
            if hIndex:
                hindex_lst.append(hIndex)
            paperCount = g.nodes[u].get('paperCount')
            citationCount = g.nodes[u].get('citationCount')
    h_index = np.average(hindex_lst)

    paper_info = {"doi":doi, "title":title, "if_seed": in_seed,
                  "global_refs":overall_cite_cnt, "inf_cite_cnt":influential_cite_cnt,
                  "h_index": h_index}
    print(paper_info)

{'doi': '10.48550/arXiv.2401.04259', 'title': 'MARG: Multi-Agent Review Generation for Scientific Papers', 'if_seed': False, 'global_refs': 25, 'inf_cite_cnt': 0, 'h_index': nan}
{'doi': '10.48550/arXiv.2408.10365', 'title': 'AI-Driven Review Systems: Evaluating LLMs in Scalable and Bias-Aware Academic Reviews', 'if_seed': False, 'global_refs': 11, 'inf_cite_cnt': 0, 'h_index': nan}
{'doi': '10.48550/arXiv.2503.08506', 'title': 'ReviewAgents: Bridging the Gap Between Human and AI-Generated Paper Reviews', 'if_seed': False, 'global_refs': 1, 'inf_cite_cnt': 0, 'h_index': nan}
{'doi': '10.48550/arXiv.2502.14776', 'title': 'SurveyX: Academic Survey Automation via Large Language Models', 'if_seed': False, 'global_refs': 0, 'inf_cite_cnt': 0, 'h_index': nan}
{'doi': '10.48550/arXiv.2502.17086', 'title': 'Automatically Evaluating the Paper Reviewing Capability of Large Language Models', 'if_seed': False, 'global_refs': 0, 'inf_cite_cnt': 0, 'h_index': nan}
{'doi': '10.48550/arXiv.2502.11736'

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


## Expanded Search － Related Topics

In [26]:
if_related_topic = True

In [27]:
# --- EXPAND to RELATED TOPICS over SEED ---
# get related topics based on abstracts of seed papers
# search for related topics for more papers
if if_related_topic:
    await ps.expand(
        seed_paper_json=seed_paper_json, 
        llm_api_key=ps.llm_api_key, 
        llm_model_name=ps.llm_model_name,
        search_limit=ps.search_limit,
        from_dt=ps.from_dt,
        to_dt=ps.to_dt,
        fields_of_study = ps.fields_of_study,
        )


2025-04-15 10:00:45,502 - INFO - SemanticScholarKit initialized with max_concurrency=10, sleep_interval=3.0s
2025-04-15 10:00:45,502 - INFO - Waiting for expanded search task to complete...
2025-04-15 10:00:45,502 - INFO - Starting expanded search sub-task...
2025-04-15 10:00:45,503 - INFO - Generating related topics for 4 seed papers...
2025-04-15 10:00:45,503 - INFO - Calling LLM to generate topics...
2025-04-15 10:00:45,504 - INFO - AFC is enabled with max remote calls: 10.
2025-04-15 10:00:48,079 - INFO - LLM generated topics: {"field_of_study": ["Artificial Intelligence", "Natural Language Processing", "Information Retrieval", "Meta-Science & Science of Science"], "keywords_and_topics": ["large language models (LLMs)", "automated literature review generation", "AI-driven research support systems", "peer review process", "generative adversarial reviewers"], "tags": ["AutoSurvey", "Generative Agent Reviewers (GAR)", "PRISMA standards", "AI-generated reviews", "hypothesis formulation

{'field_of_study': ['Artificial Intelligence', 'Natural Language Processing', 'Information Retrieval', 'Meta-Science & Science of Science'], 'keywords_and_topics': ['large language models (LLMs)', 'automated literature review generation', 'AI-driven research support systems', 'peer review process', 'generative adversarial reviewers'], 'tags': ['AutoSurvey', 'Generative Agent Reviewers (GAR)', 'PRISMA standards', 'AI-generated reviews', 'hypothesis formulation'], 'queries': ['automated literature review LLM', 'AI peer review system', 'LLM meta-reviewer', 'AI research support system survey']}


2025-04-15 10:00:49,986 - INFO - HTTP Request: GET https://api.semanticscholar.org/graph/v1/paper/search?query=AI+peer+review+system&fieldsOfStudy=Computer+Science&fields=abstract%2Cauthors%2CcitationCount%2CcitationStyles%2CcorpusId%2CexternalIds%2CfieldsOfStudy%2CinfluentialCitationCount%2CisOpenAccess%2Cjournal%2CopenAccessPdf%2CpaperId%2CpublicationDate%2CpublicationTypes%2CpublicationVenue%2CreferenceCount%2Cs2FieldsOfStudy%2Ctitle%2Curl%2Cvenue%2Cyear&offset=0&limit=50 "HTTP/1.1 200 OK"
2025-04-15 10:00:50,081 - INFO - HTTP Request: GET https://api.semanticscholar.org/graph/v1/paper/search?query=automated+literature+review+LLM&fieldsOfStudy=Computer+Science&fields=abstract%2Cauthors%2CcitationCount%2CcitationStyles%2CcorpusId%2CexternalIds%2CfieldsOfStudy%2CinfluentialCitationCount%2CisOpenAccess%2Cjournal%2CopenAccessPdf%2CpaperId%2CpublicationDate%2CpublicationTypes%2CpublicationVenue%2CreferenceCount%2Cs2FieldsOfStudy%2Ctitle%2Curl%2Cvenue%2Cyear&offset=0&limit=50 "HTTP/1.1 20

### Graph Stats

In [28]:
import copy
from collections import Counter

g2 = copy.deepcopy(ps.pg.graph)
print(len(ps.nodes_json), len(ps.edges_json), len(g2.nodes), len(g2.edges))

# check node types
node_types = [g2.nodes[nid].get('nodeType') for nid in g2.nodes]
node_types_cnt = Counter(node_types)
# 按计数降序排序
sorted_node_counts = node_types_cnt.most_common()
print(sorted_node_counts)

# check node types
edge_types = [d.get('relationshipType') for u, v, d in g2.edges(data=True)]
edge_types_cnt = Counter(edge_types)
# 按计数降序排序
sorted_egdes_counts = edge_types_cnt.most_common()
print(sorted_egdes_counts)

4982 5782 4586 5782
[('Author', 3417), ('Paper', 799), ('Journal', 196), ('Venue', 167), ('Topic', 4), ('Affiliation', 3)]
[('WRITES', 4516), ('RELEASES_IN', 425), ('CITES', 377), ('PRINTS_ON', 275), ('DISCUSS', 186), ('WORKS_IN', 3)]


### Filter & Ranking

Expanded topic would not affect paper citation information.  
It may impact author stats.
Since only seed author explored, the top authors cited would also tend to be seed authors.

In [34]:
author_stat2 = []
for n in g2.nodes:
    if g2.nodes[n].get('nodeType') == 'Author':
        out_edges_info = g2.out_edges(n, data=True)
        write_cnt = sum([1 for u, v, data in out_edges_info if data.get('relationshipType') == 'WRITES'])
        author_stat2.append((n, write_cnt))

In [35]:
sorted_by_writes2 = sorted(author_stat2, key=lambda item: item[1], reverse=True)
for item in sorted_by_writes2[0:20]:
    aid = item[0]
    a_name = g2.nodes[aid].get('name')
    hIndex = g2.nodes[aid].get('hIndex')
    in_seed = True if aid in seed_author_ids else False
    global_paper_cnt = g2.nodes[aid].get('paperCount')
    global_citation_cnt = g2.nodes[aid].get('citationCount')
    print({"author_id":aid, "author_name":a_name, "write_cnt":item[1], "is_seed":in_seed,
           "hIndex":hIndex, "global_paper_cnt":global_paper_cnt, "global_citation_cnt":global_citation_cnt, })

{'author_id': '2674998', 'author_name': 'Xiaocheng Feng', 'write_cnt': 60, 'is_seed': True, 'hIndex': None, 'global_paper_cnt': None, 'global_citation_cnt': None}
{'author_id': '2108024279', 'author_name': 'Yidong Wang', 'write_cnt': 39, 'is_seed': True, 'hIndex': 17, 'global_paper_cnt': 42, 'global_citation_cnt': 3711}
{'author_id': '152277111', 'author_name': 'Bing Qin', 'write_cnt': 31, 'is_seed': False, 'hIndex': None, 'global_paper_cnt': None, 'global_citation_cnt': None}
{'author_id': '2257010530', 'author_name': 'Xinyu Dai', 'write_cnt': 25, 'is_seed': True, 'hIndex': None, 'global_paper_cnt': None, 'global_citation_cnt': None}
{'author_id': '2259709647', 'author_name': 'Min Zhang', 'write_cnt': 24, 'is_seed': True, 'hIndex': None, 'global_paper_cnt': None, 'global_citation_cnt': None}
{'author_id': '51056442', 'author_name': 'Xiachong Feng', 'write_cnt': 20, 'is_seed': True, 'hIndex': None, 'global_paper_cnt': None, 'global_citation_cnt': None}
{'author_id': '2118640235', 'auth

### Similarity Check
- data reveals that after topic expansion, the top similar papers does not change much from original ones.
- Hence, the topic expansion is relatively isolated from author expansion and citation expansion.

In [36]:
similarity_threshold = 0.7
expanded_k_papers = 20
non_seed_paper_dois = [node['id'] for node in ps.nodes_json if node['labels'] == ['Paper'] and node['id'] not in seed_paper_dois]
paper_nodes_json = [node for node in ps.nodes_json 
                    if node['labels'] == ['Paper'] and 
                    node['properties'].get('title') is not None and node['properties'].get('abstract') is not None]

In [37]:
# calculate paper nodes similarity
semantic_similar_pool = await ps.cal_embed_and_similarity(
    paper_nodes_json=paper_nodes_json,
    paper_dois_1=seed_paper_dois, 
    paper_dois_2=non_seed_paper_dois,
    similarity_threshold=similarity_threshold,
    )

if len(semantic_similar_pool) > 0:
    candit_items = []
    for item in semantic_similar_pool:
        wt = item.get('properties', {}).get('weight')
        if (wt > 0.7 and wt < 0.95) or (wt > 0.7 and wt < 0.95):
            if item['startNodeId'] in seed_paper_dois and item['endNodeId'] not in seed_paper_dois:
                candit_items.append((item['endNodeId'], wt))
            elif item['startNodeId'] not in seed_paper_dois and item['endNodeId'] in seed_paper_dois:
                candit_items.append((item['startNodeId'], wt))
    sorted_items = sorted(candit_items, key=lambda item: item[1], reverse=True)

    # filter top k similarities
    expanded_paper_dois = [x[0] for x in sorted_items[0:expanded_k_papers]]

2025-04-15 10:30:20,074 - INFO - Generating embeddings for 140 papers...
2025-04-15 10:30:28,949 - INFO - Shape of embeds_1: (4, 768)
2025-04-15 10:30:28,950 - INFO - Shape of embeds_2: (722, 768)
2025-04-15 10:30:28,950 - INFO - Calculating similarity matrix...
2025-04-15 10:30:28,952 - INFO - Processing similarity matrix to create relationships...


In [38]:
for doi in expanded_paper_dois:
    # paper infos
    title = g2.nodes[doi].get('title')
    in_seed = True if doi in seed_paper_dois else False
    overall_cite_cnt = g2.nodes[doi].get('citationCount')
    influential_cite_cnt = g2.nodes[doi].get('influentialCitationCount')
    # author infors
    hindex_lst = []
    for u in g2.predecessors(doi):
        if g2.nodes[u].get('nodeType') == 'Author':
            hIndex = g2.nodes[u].get('hIndex')
            if hIndex:
                hindex_lst.append(hIndex)
            paperCount = g2.nodes[u].get('paperCount')
            citationCount = g2.nodes[u].get('citationCount')
    h_index = np.average(hindex_lst)

    paper_info = {"doi":doi, "title":title, "if_seed": in_seed,
                  "global_refs":overall_cite_cnt, "inf_cite_cnt":influential_cite_cnt,
                  "h_index": h_index}
    print(paper_info)

{'doi': '10.48550/arXiv.2401.04259', 'title': 'MARG: Multi-Agent Review Generation for Scientific Papers', 'if_seed': False, 'global_refs': 25, 'inf_cite_cnt': 0, 'h_index': nan}
{'doi': '10.48550/arXiv.2408.10365', 'title': 'AI-Driven Review Systems: Evaluating LLMs in Scalable and Bias-Aware Academic Reviews', 'if_seed': False, 'global_refs': 11, 'inf_cite_cnt': 0, 'h_index': nan}
{'doi': '10.48550/arXiv.2503.08506', 'title': 'ReviewAgents: Bridging the Gap Between Human and AI-Generated Paper Reviews', 'if_seed': False, 'global_refs': 1, 'inf_cite_cnt': 0, 'h_index': nan}
{'doi': '10.48550/arXiv.2502.14776', 'title': 'SurveyX: Academic Survey Automation via Large Language Models', 'if_seed': False, 'global_refs': 0, 'inf_cite_cnt': 0, 'h_index': nan}
{'doi': '10.48550/arXiv.2502.17086', 'title': 'Automatically Evaluating the Paper Reviewing Capability of Large Language Models', 'if_seed': False, 'global_refs': 0, 'inf_cite_cnt': 0, 'h_index': nan}
{'doi': '10.48550/arXiv.2502.11736'

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


### Grpah Viz

In [39]:
import sys
import os

parent_dir = os.path.dirname(os.getcwd())
sys.path.append(parent_dir)

# from graph.graph_viz import GraphViz
viz2 = GraphViz(g2, 'paper graph after topic expansion')
viz2.preprocessing()
viz2.visulization()

Running preprocessing...
Preprocessing complete.
Calculating layout...
Graph is large, using spring_layout (may take time)...
Layout calculated using spring_layout.
Preparing data sources...
Creating plot...
Creating search widgets...
Creating legends...
Arranging layout...
Displaying plot.


## Expanded Search - Similar Papers Citation Search

In [42]:
if_expanded_citations = 'reference' # by default search only reference for similar papers
print(expanded_paper_dois)

['10.48550/arXiv.2401.04259', '10.48550/arXiv.2408.10365', '10.48550/arXiv.2503.08506', '10.48550/arXiv.2502.14776', '10.48550/arXiv.2502.17086', '10.48550/arXiv.2502.11736', '10.48550/arXiv.2409.16813', '10.48550/arXiv.2503.04629', '10.48550/arXiv.2410.03019', '10.48550/arXiv.2502.12510', '10.48550/arXiv.2306.00622', '10.48550/arXiv.2404.07738', '10.48550/arXiv.2406.16253', '10.48550/arXiv.2404.17605', '10.48550/arXiv.2405.02150', '10.48550/arXiv.2502.19614', '10.48550/arXiv.2410.22394', '10.48550/arXiv.2503.08569', '10.48550/arXiv.2501.04306', '10.48550/arXiv.2307.05492']


In [43]:
# --- CHECK CROSSREF PAPERS ---
# get most similar papers to seed papers
# track citation chain of these papers
# retrieve citation for top k similar papers
await ps.collect(
    seed_paper_dois=expanded_paper_dois,
    search_citation = if_expanded_citations,
    citation_limit = ps.citation_limit,
    from_dt=ps.from_dt,
    to_dt=ps.to_dt,
    fields_of_study = ps.fields_of_study,
    )
if if_expanded_citations in ['reference', 'both']:
    ps.explored_nodes['reference'].extend(expanded_paper_dois) 
if if_expanded_citations in ['citing', 'both']:
    ps.explored_nodes['citing'].extend(expanded_paper_dois) 

2025-04-15 10:43:38,470 - INFO - Preparing reference/citation tasks for 20 seed papers.
2025-04-15 10:43:38,472 - INFO - SemanticScholarKit initialized with max_concurrency=10, sleep_interval=3.0s
2025-04-15 10:43:38,472 - INFO - Running 20 main data collection tasks concurrently...
2025-04-15 10:43:38,472 - INFO - Fetching papers cited by 10.48550/arXiv.2401.04259...
2025-04-15 10:43:38,473 - INFO - async_get_s2_cited_papers: Fetching references for paper 10.48550/arXiv.2401.04259 with effective limit 100.
2025-04-15 10:43:38,473 - INFO - Fetching papers cited by 10.48550/arXiv.2408.10365...
2025-04-15 10:43:38,473 - INFO - async_get_s2_cited_papers: Fetching references for paper 10.48550/arXiv.2408.10365 with effective limit 100.
2025-04-15 10:43:38,473 - INFO - _sync_get_paper_references: Thread started for paper 10.48550/arXiv.2401.04259 with limit 100.
2025-04-15 10:43:38,473 - INFO - Fetching papers cited by 10.48550/arXiv.2503.08506...
2025-04-15 10:43:38,473 - INFO - _sync_get_

### Graph Stats

In [44]:
import copy
from collections import Counter

g3 = copy.deepcopy(ps.pg.graph)
print(len(ps.nodes_json), len(ps.edges_json), len(g3.nodes), len(g3.edges))

# check node types
node_types = [g3.nodes[nid].get('nodeType') for nid in g3.nodes]
node_types_cnt = Counter(node_types)
# 按计数降序排序
sorted_node_counts = node_types_cnt.most_common()
print(sorted_node_counts)

# check node types
edge_types = [d.get('relationshipType') for u, v, d in g3.edges(data=True)]
edge_types_cnt = Counter(edge_types)
# 按计数降序排序
sorted_egdes_counts = edge_types_cnt.most_common()
print(sorted_egdes_counts)

7564 7580 5844 7580
[('Author', 4394), ('Paper', 1010), ('Journal', 236), ('Venue', 197), ('Topic', 4), ('Affiliation', 3)]
[('WRITES', 5728), ('CITES', 771), ('RELEASES_IN', 557), ('PRINTS_ON', 335), ('DISCUSS', 186), ('WORKS_IN', 3)]


### Filtering and Ranking

Author  
- not impacted much

In [45]:
author_stat3 = []
for n in g3.nodes:
    if g3.nodes[n].get('nodeType') == 'Author':
        out_edges_info = g3.out_edges(n, data=True)
        write_cnt = sum([1 for u, v, data in out_edges_info if data.get('relationshipType') == 'WRITES'])
        author_stat3.append((n, write_cnt))

In [46]:
sorted_by_writes3 = sorted(author_stat3, key=lambda item: item[1], reverse=True)
for item in sorted_by_writes3[0:20]:
    aid = item[0]
    a_name = g3.nodes[aid].get('name')
    hIndex = g3.nodes[aid].get('hIndex')
    in_seed = True if aid in seed_author_ids else False
    global_paper_cnt = g3.nodes[aid].get('paperCount')
    global_citation_cnt = g3.nodes[aid].get('citationCount')
    print({"author_id":aid, "author_name":a_name, "write_cnt":item[1], "is_seed":in_seed,
           "hIndex":hIndex, "global_paper_cnt":global_paper_cnt, "global_citation_cnt":global_citation_cnt, })

{'author_id': '2674998', 'author_name': 'Xiaocheng Feng', 'write_cnt': 60, 'is_seed': True, 'hIndex': None, 'global_paper_cnt': None, 'global_citation_cnt': None}
{'author_id': '2108024279', 'author_name': 'Yidong Wang', 'write_cnt': 39, 'is_seed': True, 'hIndex': 17, 'global_paper_cnt': 42, 'global_citation_cnt': 3711}
{'author_id': '152277111', 'author_name': 'Bing Qin', 'write_cnt': 31, 'is_seed': False, 'hIndex': None, 'global_paper_cnt': None, 'global_citation_cnt': None}
{'author_id': '2257010530', 'author_name': 'Xinyu Dai', 'write_cnt': 25, 'is_seed': True, 'hIndex': None, 'global_paper_cnt': None, 'global_citation_cnt': None}
{'author_id': '2259709647', 'author_name': 'Min Zhang', 'write_cnt': 24, 'is_seed': True, 'hIndex': None, 'global_paper_cnt': None, 'global_citation_cnt': None}
{'author_id': '51056442', 'author_name': 'Xiachong Feng', 'write_cnt': 20, 'is_seed': True, 'hIndex': None, 'global_paper_cnt': None, 'global_citation_cnt': None}
{'author_id': '2118640235', 'auth

Paper  
- most cited papers changed moderately
- more reliable to serve as cross ref

In [48]:
paper_stat = []
for n in g3.nodes:
    if g3.nodes[n].get('nodeType') == 'Paper':
        in_edges_info = g3.in_edges(n, data=True)
        cite_cnt = sum([1 for u, v, data in in_edges_info if data.get('relationshipType') == 'CITES'])
        paper_stat.append((n, cite_cnt))

In [49]:
tot_seed_paper_cnt = len(seed_paper_dois)
tot_paper_cnt = len([nid for nid in g3.nodes if g3.nodes[nid].get('nodeType')=='Paper'])
print(tot_seed_paper_cnt, tot_paper_cnt)

4 1010


In [50]:
import numpy as np
sorted_by_cite = sorted(paper_stat, key=lambda item: item[1], reverse=True)
print(sorted_by_cite[0:30])

for item in sorted_by_cite[0:30]:
    n = item[0]
    cite_cnt = item[1]
    # paper infos
    title = g3.nodes[n].get('title')
    in_seed = True if item[0] in seed_paper_dois else False
    overall_cite_cnt = g3.nodes[n].get('citationCount')
    influential_cite_cnt = g3.nodes[n].get('influentialCitationCount')
    # author infors
    hindex_lst = []
    for u in g3.predecessors(n):
        if g3.nodes[u].get('nodeType') == 'Author':
            hIndex = g3.nodes[u].get('hIndex')
            if hIndex:
                hindex_lst.append(hIndex)
            paperCount = g3.nodes[u].get('paperCount')
            citationCount = g3.nodes[u].get('citationCount')
    h_index = np.average(hindex_lst)

    paper_info = {"doi":n, "title":title, "if_seed": in_seed,
                  "local_refs":cite_cnt, "global_refs":overall_cite_cnt, "inf_cite_cnt":influential_cite_cnt,
                  "h_index": h_index}
    print(paper_info)

[('10.48550/arXiv.2406.10252', 14), ('10.48550/arXiv.2408.06292', 11), ('10.48550/arXiv.2310.01783', 11), ('10.48550/arXiv.2306.00622', 8), ('10.48550/arXiv.2303.08774', 7), ('10.48550/arXiv.2307.05492', 7), ('7c2b3c4ab6d701de7bd9df91d7448f3c06a1e9d7', 7), ('10.48550/arXiv.2306.05685', 5), ('10.48550/arXiv.2302.13971', 5), ('10.48550/arXiv.2406.05688', 5), ('10.48550/arXiv.2403.07183', 5), ('10.48550/arXiv.2303.17651', 5), ('10.48550/arXiv.2405.02150', 5), ('10.1162/tacl_a_00638', 4), ('10.48550/arXiv.2410.09403', 4), ('10.48550/arXiv.2408.10365', 4), ('10.48550/arXiv.2310.03302', 4), ('10.48550/arXiv.2401.04259', 4), ('10.18653/v1/2023.findings-emnlp.472', 4), ('10.48550/arXiv.2309.16609', 4), ('10.48550/arXiv.2312.10997', 3), ('10.1145/3571730', 3), ('10.18653/v1/2024.acl-long.18', 3), ('10.48550/arXiv.2201.11903', 3), ('10.18653/v1/2020.acl-main.207', 3), ('10.48550/arXiv.2410.14255', 3), ('10.48550/arXiv.2410.13185', 3), ('10.48550/arXiv.2409.05556', 3), ('10.48550/arXiv.2409.04109

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


### Graph Viz

In [54]:
# from graph.graph_viz import GraphViz
viz3 = GraphViz(g3, 'paper graph after topic expansion')
viz3.preprocessing()
viz3.visulization()

Running preprocessing...
Preprocessing complete.
Calculating layout...
Graph is large, using spring_layout (may take time)...
Layout calculated using spring_layout.
Preparing data sources...
Creating plot...
Creating search widgets...
Creating legends...
Arranging layout...
Displaying plot.


## Expanded Search - High Impact Authors

In [59]:
if_expanded_authors = True
expanded_l_authors = 50


In [60]:
i = 0
expanded_author_ids = []
for item in sorted_by_writes3:
    if i < expanded_l_authors:
        aid = item[0]
        if aid not in seed_author_ids:
            a_name = g3.nodes[aid].get('name')
            hIndex = g3.nodes[aid].get('hIndex')
            global_paper_cnt = g3.nodes[aid].get('paperCount')
            global_citation_cnt = g3.nodes[aid].get('citationCount')
            print({"author_id":aid, "author_name":a_name, "write_cnt":item[1],
                "hIndex":hIndex, "global_paper_cnt":global_paper_cnt, "global_citation_cnt":global_citation_cnt, })
            expanded_author_ids.append(aid)
            i += 1

{'author_id': '152277111', 'author_name': 'Bing Qin', 'write_cnt': 31, 'hIndex': None, 'global_paper_cnt': None, 'global_citation_cnt': None}
{'author_id': '2257004102', 'author_name': 'Bing Qin', 'write_cnt': 20, 'hIndex': None, 'global_paper_cnt': None, 'global_citation_cnt': None}
{'author_id': '2250121772', 'author_name': 'Zhen Wu', 'write_cnt': 19, 'hIndex': None, 'global_paper_cnt': None, 'global_citation_cnt': None}
{'author_id': '2118208508', 'author_name': 'Dayong Wu', 'write_cnt': 19, 'hIndex': None, 'global_paper_cnt': None, 'global_citation_cnt': None}
{'author_id': '1519290245', 'author_name': 'Jindong Wang', 'write_cnt': 17, 'hIndex': None, 'global_paper_cnt': None, 'global_citation_cnt': None}
{'author_id': '2257796182', 'author_name': 'Meishan Zhang', 'write_cnt': 16, 'hIndex': None, 'global_paper_cnt': None, 'global_citation_cnt': None}
{'author_id': '2145500840', 'author_name': 'Linyi Yang', 'write_cnt': 15, 'hIndex': None, 'global_paper_cnt': None, 'global_citation_c

In [62]:
# --- CHECK SIGNIFICANT AUTHORS ---
# filter most refered papers from graph
# then search for author information
if if_expanded_authors:
    print(f"\n--- Get most cited authors: ---")

    # retrieve citation for top l most significant authors
    await ps.collect(
        seed_author_ids=expanded_author_ids,
        search_author = if_expanded_authors,
        from_dt=ps.from_dt,
        to_dt=ps.to_dt,
        fields_of_study = ps.fields_of_study,
        )
    ps.explored_nodes['author'].extend(expanded_author_ids) 

2025-04-15 11:15:06,257 - INFO - Preparing author info task for 50 authors.
2025-04-15 11:15:06,257 - INFO - SemanticScholarKit initialized with max_concurrency=10, sleep_interval=3.0s
2025-04-15 11:15:06,257 - INFO - Running 1 main data collection tasks concurrently...
2025-04-15 11:15:06,258 - INFO - Fetching info for 50 authors...
2025-04-15 11:15:06,258 - INFO - async_search_author_by_ids: Fetching 50 authors by ID in 1 batches.
2025-04-15 11:15:06,258 - INFO - async_search_author_by_ids: Gathering 1 tasks...
2025-04-15 11:15:06,258 - INFO - _sync_get_authors: Thread started for batch (50 IDs, first 5: ['152277111', '2257004102', '2250121772', '2118208508', '1519290245']...).



--- Get most cited authors: ---


2025-04-15 11:15:09,218 - INFO - HTTP Request: POST https://api.semanticscholar.org/graph/v1/author/batch?fields=affiliations%2CauthorId%2CcitationCount%2CexternalIds%2ChIndex%2Chomepage%2Cname%2CpaperCount%2Cpapers%2Cpapers.abstract%2Cpapers.authors%2Cpapers.citationCount%2Cpapers.corpusId%2Cpapers.externalIds%2Cpapers.fieldsOfStudy%2Cpapers.influentialCitationCount%2Cpapers.isOpenAccess%2Cpapers.journal%2Cpapers.openAccessPdf%2Cpapers.paperId%2Cpapers.publicationDate%2Cpapers.publicationTypes%2Cpapers.publicationVenue%2Cpapers.referenceCount%2Cpapers.s2FieldsOfStudy%2Cpapers.title%2Cpapers.url%2Cpapers.venue%2Cpapers.year%2Curl "HTTP/1.1 200 OK"
2025-04-15 11:15:11,650 - INFO - _sync_get_authors: API call successful for batch (first 5: ['152277111', '2257004102', '2250121772', '2118208508', '1519290245']...), returning 50 items.
2025-04-15 11:15:14,657 - INFO - async_search_author_by_ids: Gather complete. Processing results.
2025-04-15 11:15:14,659 - INFO - Fetching abstracts for 123

### Graph Stats

In [63]:
import copy
from collections import Counter

g4 = copy.deepcopy(ps.pg.graph)
print(len(ps.nodes_json), len(ps.edges_json), len(g4.nodes), len(g4.edges))

# check node types
node_types = [g4.nodes[nid].get('nodeType') for nid in g4.nodes]
node_types_cnt = Counter(node_types)
# 按计数降序排序
sorted_node_counts = node_types_cnt.most_common()
print(sorted_node_counts)

# check node types
edge_types = [d.get('relationshipType') for u, v, d in g4.edges(data=True)]
edge_types_cnt = Counter(edge_types)
# 按计数降序排序
sorted_egdes_counts = edge_types_cnt.most_common()
print(sorted_egdes_counts)

10246 11645 7898 11645
[('Author', 5696), ('Paper', 1615), ('Journal', 318), ('Venue', 258), ('Affiliation', 7), ('Topic', 4)]
[('WRITES', 9205), ('RELEASES_IN', 971), ('CITES', 771), ('PRINTS_ON', 501), ('DISCUSS', 186), ('WORKS_IN', 11)]


### Filtering & Ranking

Author

In [64]:
author_stat4 = []
for n in g4.nodes:
    if g4.nodes[n].get('nodeType') == 'Author':
        out_edges_info = g4.out_edges(n, data=True)
        write_cnt = sum([1 for u, v, data in out_edges_info if data.get('relationshipType') == 'WRITES'])
        author_stat4.append((n, write_cnt))

In [65]:
sorted_by_writes4 = sorted(author_stat4, key=lambda item: item[1], reverse=True)
for item in sorted_by_writes4[0:40]:
    aid = item[0]
    a_name = g4.nodes[aid].get('name')
    hIndex = g4.nodes[aid].get('hIndex')
    in_seed = True if aid in seed_author_ids else False
    global_paper_cnt = g4.nodes[aid].get('paperCount')
    global_citation_cnt = g4.nodes[aid].get('citationCount')
    print({"author_id":aid, "author_name":a_name, "write_cnt":item[1], "is_seed":in_seed,
           "hIndex":hIndex, "global_paper_cnt":global_paper_cnt, "global_citation_cnt":global_citation_cnt, })

{'author_id': '2256319', 'author_name': 'Wanxiang Che', 'write_cnt': 104, 'is_seed': False, 'hIndex': None, 'global_paper_cnt': None, 'global_citation_cnt': None}
{'author_id': '152277111', 'author_name': 'Bing Qin', 'write_cnt': 100, 'is_seed': False, 'hIndex': 45, 'global_paper_cnt': 234, 'global_citation_cnt': 14744}
{'author_id': '40282288', 'author_name': 'Ting Liu', 'write_cnt': 96, 'is_seed': False, 'hIndex': None, 'global_paper_cnt': None, 'global_citation_cnt': None}
{'author_id': '2527954', 'author_name': 'Arman Cohan', 'write_cnt': 63, 'is_seed': False, 'hIndex': 36, 'global_paper_cnt': 106, 'global_citation_cnt': 12447}
{'author_id': '1519290245', 'author_name': 'Jindong Wang', 'write_cnt': 61, 'is_seed': False, 'hIndex': 39, 'global_paper_cnt': 84, 'global_citation_cnt': 11185}
{'author_id': '2674998', 'author_name': 'Xiaocheng Feng', 'write_cnt': 60, 'is_seed': True, 'hIndex': None, 'global_paper_cnt': None, 'global_citation_cnt': None}
{'author_id': '1737249', 'author_na

In [68]:
# why there is duplicated names in S2 authors?
for nid, data in g4.nodes(data=True):
    if data.get('nodeType')=='Author' and data.get('name') == 'Wanxiang Che':
        print(nid, data)

2059027765 {'authorId': '2059027765', 'name': 'Wanxiang Che', 'nodeType': 'Author'}
2283920108 {'authorId': '2283920108', 'name': 'Wanxiang Che', 'nodeType': 'Author'}
2256319 {'authorId': '2256319', 'name': 'Wanxiang Che', 'nodeType': 'Author', 'is_complete': True}


In [69]:
from apis.s2_api import SemanticScholarKit
tmp_author_ids = ['2059027765', '2283920108', '2256319']
s2 = SemanticScholarKit()
authors_info = await s2.async_search_author_by_ids(author_ids=tmp_author_ids, with_abstract=False)

2025-04-15 11:26:53,147 - INFO - SemanticScholarKit initialized with max_concurrency=10, sleep_interval=3.0s
2025-04-15 11:26:53,147 - INFO - async_search_author_by_ids: Fetching 3 authors by ID in 1 batches.
2025-04-15 11:26:53,148 - INFO - async_search_author_by_ids: Gathering 1 tasks...
2025-04-15 11:26:53,148 - INFO - _sync_get_authors: Thread started for batch (3 IDs, first 5: ['2059027765', '2283920108', '2256319']...).
2025-04-15 11:26:54,242 - INFO - HTTP Request: POST https://api.semanticscholar.org/graph/v1/author/batch?fields=affiliations%2CauthorId%2CcitationCount%2CexternalIds%2ChIndex%2Chomepage%2Cname%2CpaperCount%2Cpapers%2Cpapers.abstract%2Cpapers.authors%2Cpapers.citationCount%2Cpapers.corpusId%2Cpapers.externalIds%2Cpapers.fieldsOfStudy%2Cpapers.influentialCitationCount%2Cpapers.isOpenAccess%2Cpapers.journal%2Cpapers.openAccessPdf%2Cpapers.paperId%2Cpapers.publicationDate%2Cpapers.publicationTypes%2Cpapers.publicationVenue%2Cpapers.referenceCount%2Cpapers.s2FieldsOfS

Papers

In [74]:
paper_stat4 = []
for n in g4.nodes:
    if g4.nodes[n].get('nodeType') == 'Paper':
        in_edges_info = g4.in_edges(n, data=True)
        cite_cnt = sum([1 for u, v, data in in_edges_info if data.get('relationshipType') == 'CITES'])
        paper_stat4.append((n, cite_cnt))

In [75]:
tot_seed_paper_cnt = len(seed_paper_dois)
tot_paper_cnt = len([nid for nid in g4.nodes if g4.nodes[nid].get('nodeType')=='Paper'])
print(tot_seed_paper_cnt, tot_paper_cnt)

4 1615


In [76]:
import numpy as np
sorted_by_cite4 = sorted(paper_stat4, key=lambda item: item[1], reverse=True)
print(sorted_by_cite4[0:30])

for item in sorted_by_cite4[0:30]:
    n = item[0]
    cite_cnt = item[1]
    # paper infos
    title = g4.nodes[n].get('title')
    in_seed = True if item[0] in seed_paper_dois else False
    overall_cite_cnt = g4.nodes[n].get('citationCount')
    influential_cite_cnt = g4.nodes[n].get('influentialCitationCount')
    # author infors
    hindex_lst = []
    for u in g4.predecessors(n):
        if g4.nodes[u].get('nodeType') == 'Author':
            hIndex = g4.nodes[u].get('hIndex')
            if hIndex:
                hindex_lst.append(hIndex)
            paperCount = g4.nodes[u].get('paperCount')
            citationCount = g4.nodes[u].get('citationCount')
    h_index = np.average(hindex_lst)

    paper_info = {"doi":n, "title":title, "if_seed": in_seed,
                  "local_refs":cite_cnt, "global_refs":overall_cite_cnt, "inf_cite_cnt":influential_cite_cnt,
                  "h_index": h_index}
    print(paper_info)

[('10.48550/arXiv.2406.10252', 14), ('10.48550/arXiv.2408.06292', 11), ('10.48550/arXiv.2310.01783', 11), ('10.48550/arXiv.2306.00622', 8), ('10.48550/arXiv.2303.08774', 7), ('10.48550/arXiv.2307.05492', 7), ('7c2b3c4ab6d701de7bd9df91d7448f3c06a1e9d7', 7), ('10.48550/arXiv.2306.05685', 5), ('10.48550/arXiv.2302.13971', 5), ('10.48550/arXiv.2406.05688', 5), ('10.48550/arXiv.2403.07183', 5), ('10.48550/arXiv.2303.17651', 5), ('10.48550/arXiv.2405.02150', 5), ('10.1162/tacl_a_00638', 4), ('10.48550/arXiv.2410.09403', 4), ('10.48550/arXiv.2408.10365', 4), ('10.48550/arXiv.2310.03302', 4), ('10.48550/arXiv.2401.04259', 4), ('10.18653/v1/2023.findings-emnlp.472', 4), ('10.48550/arXiv.2309.16609', 4), ('10.48550/arXiv.2312.10997', 3), ('10.1145/3571730', 3), ('10.18653/v1/2024.acl-long.18', 3), ('10.48550/arXiv.2201.11903', 3), ('10.18653/v1/2020.acl-main.207', 3), ('10.48550/arXiv.2410.14255', 3), ('10.48550/arXiv.2410.13185', 3), ('10.48550/arXiv.2409.05556', 3), ('10.48550/arXiv.2409.04109

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


### Graph Viz

In [77]:
# from graph.graph_viz import GraphViz
viz4 = GraphViz(g4, 'paper graph after author expansion')
viz4.preprocessing()
viz4.visulization()

Running preprocessing...
Preprocessing complete.
Calculating layout...
Graph is large, using spring_layout (may take time)...
Layout calculated using spring_layout.
Preparing data sources...
Creating plot...
Creating search widgets...
Creating legends...
Arranging layout...
Displaying plot.


### Similarity Calculation

In [78]:
similarity_threshold = 0.7
expanded_k_papers = 20
all_paper_dois = [node['id'] for node in ps.nodes_json if node['labels'] == ['Paper']]
non_seed_paper_dois = [node['id'] for node in ps.nodes_json if node['labels'] == ['Paper'] and node['id'] not in seed_paper_dois]
paper_nodes_json = [node for node in ps.nodes_json 
                    if node['labels'] == ['Paper'] and 
                    node['properties'].get('title') is not None and node['properties'].get('abstract') is not None]

In [79]:
# calculate paper nodes similarity
semantic_similar_pool = await ps.cal_embed_and_similarity(
    paper_nodes_json=paper_nodes_json,
    paper_dois_1=all_paper_dois, 
    paper_dois_2=all_paper_dois,
    similarity_threshold=similarity_threshold,
    )

2025-04-15 11:41:31,707 - INFO - Generating embeddings for 717 papers...
2025-04-15 11:41:40,978 - INFO - Shape of embeds_1: (1443, 768)
2025-04-15 11:41:40,979 - INFO - Shape of embeds_2: (1443, 768)
2025-04-15 11:41:40,979 - INFO - Calculating similarity matrix...
2025-04-15 11:41:41,001 - INFO - Processing similarity matrix to create relationships...


In [83]:
top_k_similar_papers = 20

In [84]:
if len(semantic_similar_pool) > 0:
    candit_items = []
    for item in semantic_similar_pool:
        wt = item.get('properties', {}).get('weight')
        if (wt > 0.7 and wt < 0.95) or (wt > 0.7 and wt < 0.95):
            if item['startNodeId'] in seed_paper_dois and item['endNodeId'] not in seed_paper_dois:
                candit_items.append((item['endNodeId'], wt))
            elif item['startNodeId'] not in seed_paper_dois and item['endNodeId'] in seed_paper_dois:
                candit_items.append((item['startNodeId'], wt))
                
sorted_items = sorted(candit_items, key=lambda item: item[1], reverse=True)

# filter top k similarities
similar_paper_dois = [x[0] for x in sorted_items[0:top_k_similar_papers]]

In [86]:
for doi in similar_paper_dois:
    # paper infos
    title = g4.nodes[doi].get('title')
    in_seed = True if doi in seed_paper_dois else False
    overall_cite_cnt = g4.nodes[doi].get('citationCount')
    influential_cite_cnt = g4.nodes[doi].get('influentialCitationCount')
    # author infors
    hindex_lst = []
    for u in g4.predecessors(doi):
        if g4.nodes[u].get('nodeType') == 'Author':
            hIndex = g4.nodes[u].get('hIndex')
            if hIndex:
                hindex_lst.append(hIndex)
            paperCount = g4.nodes[u].get('paperCount')
            citationCount = g4.nodes[u].get('citationCount')
    h_index = np.average(hindex_lst)

    paper_info = {"doi":doi, "title":title, "if_seed": in_seed,
                  "global_refs":overall_cite_cnt, "inf_cite_cnt":influential_cite_cnt,
                  "h_index": h_index}
    print(paper_info)

{'doi': '10.48550/arXiv.2401.04259', 'title': 'MARG: Multi-Agent Review Generation for Scientific Papers', 'if_seed': False, 'global_refs': 25, 'inf_cite_cnt': 0, 'h_index': nan}
{'doi': '10.48550/arXiv.2408.10365', 'title': 'AI-Driven Review Systems: Evaluating LLMs in Scalable and Bias-Aware Academic Reviews', 'if_seed': False, 'global_refs': 11, 'inf_cite_cnt': 0, 'h_index': nan}
{'doi': '10.48550/arXiv.2503.08506', 'title': 'ReviewAgents: Bridging the Gap Between Human and AI-Generated Paper Reviews', 'if_seed': False, 'global_refs': 1, 'inf_cite_cnt': 0, 'h_index': nan}
{'doi': '10.48550/arXiv.2502.14776', 'title': 'SurveyX: Academic Survey Automation via Large Language Models', 'if_seed': False, 'global_refs': 0, 'inf_cite_cnt': 0, 'h_index': nan}
{'doi': '10.48550/arXiv.2502.17086', 'title': 'Automatically Evaluating the Paper Reviewing Capability of Large Language Models', 'if_seed': False, 'global_refs': 0, 'inf_cite_cnt': 0, 'h_index': nan}
{'doi': '10.48550/arXiv.2406.12708'

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


## Reduction

### Add Similar to Graph

In [89]:
len(semantic_similar_pool)

22084

In [90]:
ps.pg.add_graph_edges(semantic_similar_pool)

In [91]:
ps._add_items_to_graph(semantic_similar_pool)

### Graph Stats

In [99]:
import copy
from collections import Counter

g5 = copy.deepcopy(ps.pg.graph)
print(len(ps.nodes_json), len(ps.edges_json), len(g5.nodes), len(g5.edges))

# check node types
node_types = [g5.nodes[nid].get('nodeType') for nid in g5.nodes]
node_types_cnt = Counter(node_types)
# 按计数降序排序
sorted_node_counts = node_types_cnt.most_common()
print(sorted_node_counts)

# check node types
edge_types = [d.get('relationshipType') for u, v, d in g5.edges(data=True)]
edge_types_cnt = Counter(edge_types)
# 按计数降序排序
sorted_egdes_counts = edge_types_cnt.most_common()
print(sorted_egdes_counts)

10246 33729 7898 55813
[('Author', 5696), ('Paper', 1615), ('Journal', 318), ('Venue', 258), ('Affiliation', 7), ('Topic', 4)]
[('SIMILAR_TO', 44168), ('WRITES', 9205), ('RELEASES_IN', 971), ('CITES', 771), ('PRINTS_ON', 501), ('DISCUSS', 186), ('WORKS_IN', 11)]


Remove Isolated Nodes and Edges

In [100]:
import networkx as nx
from typing import List, Dict, Union, List, Set, Tuple, Hashable, Literal, Optional

NodeType = Hashable # 节点类型通常是可哈希的

def find_wcc_subgraphs(
    graph,
    target_nodes: Union[NodeType, List[NodeType], Set[NodeType], Tuple[NodeType]]
) -> List[nx.MultiDiGraph]:
    """查找包含一个或多个指定节点的弱连通分量对应的子图。
    Args:
        graph: NetworkX MultiDiGraph 图对象。
        target_nodes: 一个节点 ID，或一个包含节点 ID 的列表、集合或元组。
    Returns:
        一个包含所有找到的弱连通分量子图 (作为独立的 MultiDiGraph 副本) 的列表。
        如果目标节点不在图中或找不到对应的连通分量，则返回空列表。
        注意：如果多个目标节点在同一个连通分量中，该分量的子图只会被返回一次。
    """
    # 1. 标准化输入为集合
    if isinstance(target_nodes, (list, set, tuple)):
        target_nodes_set = set(target_nodes)
    else:
        # 假设是单个节点 ID
        target_nodes_set = {target_nodes}

    # 2. 检查所有目标节点是否存在于图中
    missing_nodes = target_nodes_set - set(graph.nodes())
    if missing_nodes:
        print(f"警告：以下目标节点不在图中，将被忽略: {missing_nodes}")
        target_nodes_set -= missing_nodes # 移除不存在的节点

    if not target_nodes_set:
        print("错误：没有有效的目标节点可供查找。")
        return []

    # 3. 查找并收集包含任何目标节点的弱连通分量
    found_subgraphs = []
    found_components_nodes = set() # 用于跟踪已添加的分量的节点集，避免重复

    for component_nodes in nx.weakly_connected_components(graph):
        component_set = set(component_nodes)
        # 4. 检查当前分量是否包含任何目标节点 (使用集合交集)
        if not target_nodes_set.isdisjoint(component_set): # 如果交集非空
            # 检查这个分量是否已经添加过 (基于其节点集合)
            # frozenset 是可哈希的，可以放入集合中
            component_frozenset = frozenset(component_set)
            if component_frozenset not in found_components_nodes:
                # 5. 提取子图并添加到结果列表
                subgraph = graph.subgraph(component_nodes).copy()
                found_subgraphs.append(subgraph)
                found_components_nodes.add(component_frozenset)

                # Optional: 如果我们确定一个目标节点只能属于一个WCC,
                # 可以在这里从 target_nodes_set 中移除 component_set 里的目标节点
                # 以可能稍微提高后续迭代的效率，但这通常不是必需的
                # target_nodes_set -= component_set

    return found_subgraphs

In [106]:
g6 = find_wcc_subgraphs(graph=g5, target_nodes=seed_paper_dois)[0]

In [107]:
import copy
from collections import Counter

print(len(ps.nodes_json), len(ps.edges_json), len(g6.nodes), len(g6.edges))

# check node types
node_types = [g6.nodes[nid].get('nodeType') for nid in g6.nodes]
node_types_cnt = Counter(node_types)
# 按计数降序排序
sorted_node_counts = node_types_cnt.most_common()
print(sorted_node_counts)

# check node types
edge_types = [d.get('relationshipType') for u, v, d in g6.edges(data=True)]
edge_types_cnt = Counter(edge_types)
# 按计数降序排序
sorted_egdes_counts = edge_types_cnt.most_common()
print(sorted_egdes_counts)

10246 33729 7818 55741
[('Author', 5631), ('Paper', 1607), ('Journal', 315), ('Venue', 254), ('Affiliation', 7), ('Topic', 4)]
[('SIMILAR_TO', 44168), ('WRITES', 9138), ('RELEASES_IN', 968), ('CITES', 771), ('PRINTS_ON', 499), ('DISCUSS', 186), ('WORKS_IN', 11)]


Seed Paper Similar To

In [171]:
pre_paper_ids, after_paper_ids = [], []
pre_sim_paper_ids, after_sim_paper_ids = [], []
for nid in seed_paper_dois:
    pre_nodes_ids = g6.predecessors(nid)
    pre_paper_ids.extend([x for x in pre_nodes_ids if g6.nodes[x].get('nodeType') == 'Paper' and x not in seed_paper_dois]) 
    for pre_id in pre_paper_ids:
        print(pre_id, nid)
        edge_type = g6[pre_id][nid].get('relationshipType')
        if edge_type == 'SIMILAR_TO':
            pre_sim_paper_ids.append(pre_id)

    after_nodes_ids = g6.successors(nid)
    after_paper_ids.extend([x for x in after_nodes_ids if g6.nodes[x].get('nodeType') == 'Paper' and x not in seed_paper_dois]) 
    for after_id in after_paper_ids:
        edge_type = g6[nid][after_id].get('relationshipType')
        if edge_type == 'SIMILAR_TO':
            after_sim_paper_ids.append(pre_id)

print(len(pre_paper_ids), len(pre_sim_paper_ids), len(after_paper_ids), len(after_sim_paper_ids))

10.48550/arXiv.2402.01788 10.48550/arXiv.2406.10252
10.48550/arXiv.2403.02574 10.48550/arXiv.2406.10252
10.3390/fi16050167 10.48550/arXiv.2406.10252
10.48550/arXiv.2403.02901 10.48550/arXiv.2406.10252
10.48550/arXiv.2408.16498 10.48550/arXiv.2406.10252
10.48550/arXiv.2310.07521 10.48550/arXiv.2406.10252
10.48550/arXiv.2305.12421 10.48550/arXiv.2406.10252
10.1145/3641289 10.48550/arXiv.2406.10252
10.48550/arXiv.2311.05876 10.48550/arXiv.2406.10252
10.48550/arXiv.2305.14627 10.48550/arXiv.2406.10252
10.48550/arXiv.2303.18223 10.48550/arXiv.2406.10252
10.48550/arXiv.2504.05732 10.48550/arXiv.2406.10252
10.48550/arXiv.2503.19065 10.48550/arXiv.2406.10252
10.48550/arXiv.2502.18209 10.48550/arXiv.2406.10252
10.48550/arXiv.2502.12568 10.48550/arXiv.2406.10252
10.48550/arXiv.2501.09751 10.48550/arXiv.2406.10252
10.48550/arXiv.2411.00816 10.48550/arXiv.2406.10252
10.48550/arXiv.2410.09510 10.48550/arXiv.2406.10252
10.48550/arXiv.2410.07009 10.48550/arXiv.2406.10252
10.1007/978-3-031-72567-8_12 

KeyError: '10.48550/arXiv.2412.10415'

In [175]:
for item in g6.predecessors('10.48550/arXiv.2412.10415'):
    print(item)

2335566763
2335569348
10.1109/ICISET62123.2024.10939517
10.48550/arXiv.2402.01788
10.48550/arXiv.2409.04600
10.48550/arXiv.2403.02574
10.3390/fi16050167
10.1109/FLLM63129.2024.10852447
10.1145/3641289
10.1109/ICASSP48485.2024.10448015
10.48550/arXiv.2304.03512
10.48550/arXiv.2312.10997
10.48550/arXiv.2305.14627
10.48550/arXiv.2305.06983
10.48550/arXiv.2411.00816
10.48550/arXiv.2410.09403
10.48550/arXiv.2410.03019
10.48550/arXiv.2408.10365
10.48550/arXiv.2408.06292
10.48550/arXiv.2406.05688
10.48550/arXiv.2404.16130
10.48550/arXiv.2404.07738
10.48550/arXiv.2310.07984
10.48550/arXiv.2310.01783
10.48550/arXiv.2309.07864
10.48550/arXiv.2307.05492
10.48550/arXiv.2306.00622
10.48550/arXiv.2305.01937
10.48550/arXiv.2502.17086
10.48550/arXiv.2408.15769
10.1007/s11704-024-40231-1
10.48550/arXiv.2411.14199
10.48550/arXiv.2411.09763
10.48550/arXiv.2411.02429
10.48550/arXiv.2410.22394
10.48550/arXiv.2410.13185
10.48550/arXiv.2409.16813
10.48550/arXiv.2409.13740
10.48550/arXiv.2409.05556
10.48550/a

In [173]:
g6['10.48550/arXiv.2403.02901']['10.48550/arXiv.2412.10415']

KeyError: '10.48550/arXiv.2412.10415'

### Still Graph Stats

Get k-hop neighbor graph

In [110]:
import networkx as nx
from typing import Iterable, Set, Hashable # For type hinting

def get_k_hop_neighbors(graph: nx.MultiDiGraph,
                        start_nodes: Iterable[Hashable],
                        k: int) -> Set[Hashable]:
    """
    查找 MultiDiGraph 中一个或多个起始节点的 k-hop 邻居（忽略边的方向）。
    通过为每个起始节点计算 ego_graph 并合并结果实现。

    Args:
        graph: NetworkX MultiDiGraph 对象。
        start_nodes: 一个包含一个或多个起始节点的可迭代对象 (如 list, set)。
        k: 跳数 (hops)。

    Returns:
        一个包含所有距离任一<0xE8><0xB5><0xB7>始节点 k 跳内的节点的集合（包括起始节点）。
    """
    # 1. 创建一个无向图视图/副本 (只做一次)
    #    对于仅查找节点，nx.Graph 通常足够且更快
    undirected_graph = nx.Graph(graph)
    # 或者 undirected_graph = graph.to_undirected() # 转为 MultiGraph

    all_k_hop_neighbors = set()
    valid_start_nodes_found = False

    for start_node in start_nodes:
        if start_node not in undirected_graph:
            print(f"警告: 起始节点 {start_node} 不在图中，将被忽略。")
            continue # 跳过不在图中的起始节点

        valid_start_nodes_found = True
        # 2. 为当前有效的起始节点计算 ego_graph
        #    ego_graph 包含中心节点和 k-hop 内的所有节点
        ego = nx.ego_graph(undirected_graph, start_node, radius=k)

        # 3. 将当前 ego_graph 的节点合并到总集合中
        all_k_hop_neighbors.update(ego.nodes())

    if not valid_start_nodes_found and start_nodes:
         print("警告: 所有提供的起始节点都不在图中。")

    return all_k_hop_neighbors



In [118]:
start_nodes = seed_paper_dois

k = 1
hop_1_neighbors = get_k_hop_neighbors(g6, start_nodes, k)

In [120]:
k = 2
hop_2_neighbors = get_k_hop_neighbors(g6, start_nodes, k)

In [121]:
k = 3
hop_3_neighbors = get_k_hop_neighbors(g6, start_nodes, k)

In [122]:
k = 4
hop_4_neighbors = get_k_hop_neighbors(g6, start_nodes, k)

In [123]:
k = 5
hop_5_neighbors = get_k_hop_neighbors(g6, start_nodes, k)

In [126]:
g_view = nx.Graph(g6)

In [149]:
hop_1_paper_ids

['10.1038/s42256-024-00832-8']

In [154]:
[x for x in g.predecessors('10.1038/s42256-024-00832-8') if x in seed_paper_dois]

['10.48550/arXiv.2503.01424']

In [155]:
g6['10.48550/arXiv.2503.01424']['10.1038/s42256-024-00832-8']

AtlasView({0: {'isInfluential': False, 'contexts': [], 'intents': [], 'contextsWithIntent': [], 'relationshipType': 'CITES'}})

In [None]:
import networkx as nx
from typing import List, Dict, Any, Optional

def get_edge_data_for_paths(
    graph: nx.MultiDiGraph,
    paths: List[List[Any]]
) -> List[List[Optional[Dict[str, Any]]]]:
    """
    为给定的节点路径列表，从原始 MultiDiGraph 中提取对应的边数据。

    对于路径中的每一步 (u, v)，它会尝试查找 u->v 或 v->u 的边，
    并返回找到的第一个边的属性字典。

    Args:
        graph: 原始的 NetworkX MultiDiGraph 对象，包含边的属性数据。
        paths: 一个列表，其中每个元素是表示一条路径的节点列表
               (例如由 nx.all_simple_paths 找到)。

    Returns:
        一个列表，结构与 paths 对应。每个内部列表包含对应路径上
        每条边的属性数据字典。如果在原始图中找不到对应步骤的边
        (理论上不应发生，如果路径有效)，则该步骤的数据为 None。
        如果 u 和 v 之间有多条边，默认返回找到的第一条边的数据。
    """
    paths_info = []
    if not graph.is_multigraph():
        # 提示：虽然输入类型是 MultiDiGraph，做一个检查或转换可能更健壮
        print("警告：输入图不是 MultiDiGraph，行为可能与预期不同。")

    for path in paths:
        current_path_edge_data = []
        # 遍历路径中的每对连续节点 (u, v)
        for i in range(len(path) - 1):
            u = path[i]
            v = path[i+1]
            edge_data = None

            # 尝试查找 u -> v 的边
            # graph.get_edge_data(u, v) 返回一个字典，key 是边的 key，value 是属性字典
            # 如果不存在 u -> v 的边，返回 None
            forward_edges_data = graph.get_edge_data(u, v)
            if forward_edges_data:
                # 获取找到的第一个前向边的数据 (任意 key)
                # list(forward_edges_data.values())[0] 获取第一个key对应的值（属性字典）
                edge_data = list(forward_edges_data.values())[0]
                current_path_edge_data.append((u, v, edge_data))
            else:
                # 如果没有 u -> v，尝试查找 v -> u 的边
                backward_edges_data = graph.get_edge_data(v, u)
                if backward_edges_data:
                    # 获取找到的第一个反向边的数据 (任意 key)
                    edge_data = list(backward_edges_data.values())[0]
                    current_path_edge_data.append((v, u, edge_data))
                else:
                    current_path_edge_data.append((None, None, None))

            # 如果 edge_data 为 None，可能表示路径与原始图不一致或存在问题
            if edge_data is None:
                 print(f"警告：在路径 {path} 中，未能在原始 MultiDiGraph 中找到节点对 ({u}, {v}) 或 ({v}, {u}) 之间的任何边。")

        # 将当前路径的所有边数据列表添加到总结果中
        paths_info.append(current_path_edge_data)

    return paths_info

In [157]:
k = 1
hop_1_paper_ids = []
hop_1_paper_paths = []
hop_1_paper_paths_info = []
for id in hop_1_neighbors:
    node_data = g6.nodes[id]
    if node_data.get('nodeType') == 'Paper':
        paths = []
        for seed_doi in seed_paper_dois:
            paths_generator = nx.all_simple_paths(
                g_view,
                source=id,
                target=seed_doi,
                cutoff=k
            )
            for path in paths_generator:
                paths.append(path)
        paths_info = get_edge_data_for_paths(g6, paths)

        hop_1_paper_ids.append(id)
        hop_1_paper_paths.append(paths)
        hop_1_paper_paths_info.append(paths_info)

In [159]:
hop_1_paper_paths_info[0]

[[{'isInfluential': False,
   'contexts': [],
   'intents': [],
   'contextsWithIntent': [],
   'relationshipType': 'CITES'}]]

In [161]:
hop_1_paper_paths[0]

[['10.1038/s42256-024-00832-8', '10.48550/arXiv.2503.01424']]

In [166]:
k = 3
hop_3_paper_ids = []
hop_3_paper_paths = []
hop_3_paper_paths_info = []
for id in hop_3_neighbors:
    node_data = g6.nodes[id]
    if node_data.get('nodeType') == 'Paper':
        paths = []
        for seed_doi in seed_paper_dois:
            paths_generator = nx.all_simple_paths(
                g_view,
                source=id,
                target=seed_doi,
                cutoff=k
            )
            for path in paths_generator:
                paths.append(path)
        paths_info = get_edge_data_for_paths(g6, paths)

        hop_3_paper_ids.append(id)
        hop_3_paper_paths.append(paths)
        hop_3_paper_paths_info.append(paths_info)

KeyboardInterrupt: 

In [None]:
hop_3_paper_paths_info[0]

[[{'source': 'semantic similarity',
   'weight': 0.7084,
   'relationshipType': 'SIMILAR_TO'},
  {'isInfluential': False,
   'contexts': [],
   'intents': [],
   'contextsWithIntent': [],
   'relationshipType': 'CITES'}],
 [{'source': 'semantic similarity',
   'weight': 0.7111,
   'relationshipType': 'SIMILAR_TO'},
  {'source': 'semantic similarity',
   'weight': 0.7364,
   'relationshipType': 'SIMILAR_TO'}],
 [{'source': 'semantic similarity',
   'weight': 0.706,
   'relationshipType': 'SIMILAR_TO'},
  {'isInfluential': False,
   'contexts': [],
   'intents': [],
   'contextsWithIntent': [],
   'relationshipType': 'CITES'}],
 [{'source': 'semantic similarity',
   'weight': 0.7837,
   'relationshipType': 'SIMILAR_TO'},
  {'source': 'semantic similarity',
   'weight': 0.7294,
   'relationshipType': 'SIMILAR_TO'}],
 [{'source': 'semantic similarity',
   'weight': 0.7116,
   'relationshipType': 'SIMILAR_TO'},
  {'source': 'semantic similarity',
   'weight': 0.7004,
   'relationshipType': 

In [164]:
len(hop_2_paper_paths_info)

1345

In [None]:
# 包含无效起始节点的示例
start_nodes_with_invalid = [0, 99] # 99 不在图中
k = 2
k_hop_neighbors_invalid = get_k_hop_neighbors(graph: nx.MultiDiGraph,
(G, start_nodes_with_invalid, k)
print(f"起始节点 {start_nodes_with_invalid} 的 {k}-hop 邻居 (Ego方法): {k_hop_neighbors_invalid}")

# 如果希望结果不包含起始节点本身
k_hop_neighbors_excluding_starts = k_hop_neighbors - set(start_nodes)
print(f"起始节点 {start_nodes} 的 {k}-hop 邻居 (Ego方法, 不含自身): {k_hop_neighbors_excluding_starts}")

In [None]:
    undirected_view = nx.Graph(graph)

    # 3. 使用 networkx.all_simple_paths 查找无向视图中的所有简单路径。
    #    这个函数保证了路径中没有重复的节点（即无环）。
    #    它返回一个生成器，这对于可能存在大量路径的情况更节省内存。
    paths_generator = nx.all_simple_paths(
        undirected_view,
        source=source,
        target=target,
        cutoff=cutoff
    )

In [None]:
find_all_undirected_simple_paths

In [92]:
seed_paper_dois

['10.48550/arXiv.2406.10252',
 '10.48550/arXiv.2412.10415',
 '10.48550/arXiv.2402.12928',
 '10.48550/arXiv.2503.01424']

In [98]:
g5 = ps.pg.graph
start_node = '10.48550/arXiv.2302.00093'
end_node = seed_paper_dois[0]

print(f"查找从节点 '{start_node}' 到节点 '{end_node}' 的所有简单路径 (忽略方向):")

# 3. 调用函数查找路径
try:
    all_paths_gen = find_all_undirected_simple_paths(g5, start_node, end_node, 5)

    # 4. 遍历生成器并打印路径
    found_paths_list = list(all_paths_gen) # 将生成器转换为列表以便计数和打印

    if found_paths_list:
        print(f"共找到 {len(found_paths_list)} 条路径:")
        for i, path in enumerate(found_paths_list):
            print(f"  路径 {i+1}: {path}")
    else:
        print("未找到任何路径。")

except nx.NodeNotFound as e:
    print(f"错误: {e}")

查找从节点 '10.48550/arXiv.2302.00093' 到节点 '10.48550/arXiv.2406.10252' 的所有简单路径 (忽略方向):


KeyboardInterrupt: 

In [None]:
await ps.construct_paper_graph(
    search_citation = 'both',  # 'both',
    search_author = True,
    find_recommend = True,
    if_related_topic = True,
    if_expanded_citations  = 'reference',  #  'reference',
    if_expanded_authors = True,
    if_add_similarity = True,
    similarity_threshold = 0.7,
    expanded_k_papers = 20,
    expanded_l_authors = 100,
)

In [None]:
len(ps.nodes_json)

In [None]:
len(ps.edges_json)

In [None]:
g = ps.pg.graph

In [None]:
len(g.nodes)

In [None]:
len(g.edges)

In [None]:
# check node types
set([g.nodes[nid].get('nodeType') for nid in g.nodes])

In [None]:
# stats of node types
node_types = [g.nodes[nid].get('nodeType') for nid in g.nodes]

from collections import Counter
counts = Counter(node_types)

# 按计数降序排序
sorted_counts = counts.most_common()
sorted_counts

In [None]:
# stats of edge types
edge_types = [d.get('relationshipType') for u, v, d in g.edges(data=True)]
print(set(edge_types))

from collections import Counter
counts = Counter(edge_types)

# 按计数降序排序
sorted_counts = counts.most_common()
print(sorted_counts)

In [None]:
# seed papers
seed_paper_dois = [nid for nid in g.nodes 
                        if g.nodes[nid].get('nodeType')=='Paper' and
                           g.nodes[nid].get('from_seed') == True]
seed_paper_nodes = [g.nodes[nid] for nid in g.nodes 
                        if g.nodes[nid].get('nodeType')=='Paper' and
                           g.nodes[nid].get('from_seed') == True]

In [None]:
print(seed_paper_dois)

In [None]:
# expanded papers with citation chain
# paper with cites but no citing, paper not in seed dois
paper_w_ref_dois = []
paper_w_ref_nodes = []

for nid in g.nodes:
    node = g.nodes[nid]
    if node.get('nodeType')=='Paper' and nid not in seed_paper_dois:
        out_edges_info = g.out_edges(nid, data=True)
        cnt = 0
        for u, v, data in out_edges_info:
            if data.get('relationshipType') == 'CITES':
               cnt += 1
        if cnt > 0:
            paper_w_ref_dois.append(nid)
            paper_w_ref_nodes.append(node)

print(paper_w_ref_dois)

In [None]:
expanded_paper_w_ref_dois = [x for x in paper_w_ref_dois if x not in seed_paper_dois]

In [None]:
filtered_dois = []
for doi in expanded_paper_w_ref_dois:
    out_edges_info = g.out_edges(doi, data=True)
    ref_cnt = sum([1 for u, v, data in out_edges_info if data.get('relationshipType') == 'CITES'])
    print(doi, ref_cnt)
    if ref_cnt > 2:
        filtered_dois.append(doi)

In [None]:
g.out_edges('10.48550/arXiv.2408.16498', data=True)

In [None]:
n = '10.48550/arXiv.2408.16498'
for v in g.successors(n):
    data = g[n][v]
    if data.get('relationshipType') == 'CITES':
        print(g.nodes[v])

In [None]:
len(filtered_dois)

Check cross refs  
- most refered to
- precessor of seed dois

In [None]:
paper_stat = []
for n in g.nodes:
    if g.nodes[n].get('nodeType') == 'Paper':
        in_edges_info = g.in_edges(n, data=True)
        cite_cnt = sum([1 for u, v, data in in_edges_info if data.get('relationshipType') == 'CITES'])
        sim_cnt = sum([1 for u, v, data in in_edges_info if data.get('relationshipType') == 'SIMILAR_TO'])
        paper_stat.append((n, cite_cnt, sim_cnt))

In [None]:
import numpy as np

In [None]:
sorted_by_cite = sorted(paper_stat, key=lambda item: item[1], reverse=True)
print(sorted_by_cite[0:20])

for item in sorted_by_cite[0:20]:
    n = item[0]
    cite_cnt = item[1]
    # paper infos
    title = g.nodes[n].get('title')
    overall_cite_cnt = g.nodes[n].get('citationCount')
    influential_cite_cnt = g.nodes[n].get('influentialCitationCount')
    # author infors
    hindex_lst = []
    for u in g.predecessors(n):
        if g.nodes[u].get('nodeType') == 'Author':
            hIndex = g.nodes[u].get('hIndex')
            if hIndex:
                hindex_lst.append(hIndex)
            paperCount = g.nodes[u].get('paperCount')
            citationCount = g.nodes[u].get('citationCount')
    h_index = np.average(hindex_lst)

    paper_info = {"doi":n, "title":title, 
                  "local_refs":cite_cnt, "global_refs":overall_cite_cnt, "inf_cite_cnt":influential_cite_cnt,
                  "h_index": h_index}
    print(paper_info)

In [None]:
sorted_by_sim = sorted(paper_stat, key=lambda item: item[2], reverse=True)
print(sorted_by_sim[0:20])

for item in sorted_by_sim[0:20]:
    n = item[0]
    sim_cnt = item[2]
    title = g.nodes[n].get('title')
    overall_cite_cnt = g.nodes[n].get('citationCount')
    influential_cite_cnt = g.nodes[n].get('influentialCitationCount')
    # author infors
    hindex_lst = []
    for u in g.predecessors(n):
        if g.nodes[u].get('nodeType') == 'Author':
            hIndex = g.nodes[u].get('hIndex')
            if hIndex:
                hindex_lst.append(hIndex)
            paperCount = g.nodes[u].get('paperCount')
            citationCount = g.nodes[u].get('citationCount')
    h_index = np.average(hindex_lst)

    paper_info = {"doi":n, "title":title, 
                  "local_sims":sim_cnt, "global_refs":overall_cite_cnt, "inf_cite_cnt":influential_cite_cnt,
                  "h_index": h_index}
    print(paper_info)

In [None]:
# check key authors in graph
author_stat = []
for n in g.nodes:
    if g.nodes[n].get('nodeType') == 'Author':
        out_edges_info = g.out_edges(n, data=True)
        writes_cnt = sum([1 for u, v, data in out_edges_info if data.get('relationshipType') == 'WRITES'])
        author_stat.append((n, writes_cnt))

In [None]:
sorted_by_writes = sorted(author_stat, key=lambda item: item[1], reverse=True)
print(sorted_by_writes[0:20])

for item in sorted_by_writes[0:20]:
    n = item[0]
    print(g.nodes[n])

In [None]:
ps.explored_nodes

In [None]:
for id in graph[0].graph.nodes:
    item = graph[0].graph.nodes[id]
    if item.get('nodeType') is None:
        print(id, item)

In [None]:
set(nodes_types)

In [None]:
set([d['relationshipType'] for u, v, d in graph[0].graph.edges(data=True)])

In [None]:
# node types and edges types to keep
filtered_node_labels = ['Paper', 'Topic', 'Author']
filtered_edges_labels = ['CITES', 'DISCUSS', 'WRITES']

In [None]:
G = graph[0].graph

In [None]:
# Create a list of node IDs to iterate over
node_ids_to_check = list(G.nodes) # <--- Create a static list here

# filter node types
for id in node_ids_to_check: # <-- Iterate over the list
    # Check if the node still exists (important if edges might remove nodes indirectly, though less likely here)
    if id in G:
        item = G.nodes[id]
        node_type = item.get('nodeType')
        if node_type not in filtered_node_labels:
            G.remove_node(id) # Modify the original graph G

In [None]:
# Create a list of edge tuples (u, v, data) to iterate over
edge_list_copy = list(G.edges(data=True)) # <--- Create a static list here

# filter edge types
for u, v, d in edge_list_copy: # <-- Iterate over the copy
    edge_type = d.get('relationshipType') # Use .get() for safety if attr might be missing
    if edge_type not in filtered_edges_labels:
         # Check if edge still exists (might have been removed if graph allows parallel edges and one was removed)
         if G.has_edge(u, v):
            G.remove_edge(u, v) # Modify the original graph G

In [6]:
len(set([1,2,3]) - set([1, 2,3,4, 5,6]))

0

In [None]:

G.remove_edge(1, 3)

In [None]:
set([graph[0].graph.nodes[x]['nodeType'] for x in graph[0].graph.nodes])

In [None]:
a = False
if a:
    print(111)

In [None]:
import networkx as nx
G = nx.Graph()

In [None]:
G.add_nodes_from([(4, {"color": "red"}), (5, {"color": "green"})])

In [None]:
G.nodes[4]

In [None]:
G.add_nodes_from([(4, {"color": "blue"})])

In [None]:
G.nodes[4]

In [None]:
G.add_nodes_from([(4, {"name": "No.4"})])

In [None]:
G.nodes[4]