# Setup

In [1]:
import copy
from typing import List, Dict, Optional, Union, Tuple, Literal

In [2]:
import os
import json

import sys
import os

parent_dir = os.path.dirname(os.getcwd())
sys.path.append(parent_dir)

In [3]:
from graph.paper_graph import PaperGraph
from graph.graph_viz import GraphViz
from app.paper_data_collect import PaperSearch
from graph.graph_stats import get_graph_stats, get_author_stats, get_paper_stats

In [4]:
params = {
    # params for paper collection
    'search_limit': 100,
    'recommend_limit': 50,
    'citation_limit': 100,
    # params for expanded search
    'similarity_threshold': 0.7,
    'top_k_similar_papers': 20,
    'top_l_key_authors': 20,
    # paprams for stopping criteria
    'min_paper_cnt': 50,
    'min_author_cnt': 50,
    'min_corssref_papers': 20,
    'min_key_authors': 10,
}

In [5]:
# driving examples
llm_api_key = os.getenv('GEMINI_API_KEY_3')
llm_model_name="gemini-2.0-flash"
embed_api_key = os.getenv('GEMINI_API_KEY_3')
embed_model_name="models/text-embedding-004"

In [6]:
# from user input
research_topics = ["llm literature review"]
seed_dois = ['10.48550/arXiv.2406.10252',  # AutoSurvey: Large Language Models Can Automatically Write Surveys
            '10.48550/arXiv.2412.10415',  # Generative Adversarial Reviews: When LLMs Become the Critic
            '10.48550/arXiv.2402.12928',  # A Literature Review of Literature Reviews in Pattern Analysis and Machine Intelligence 
            ]
seed_titles = ['PaperRobot: Incremental Draft Generation of Scientific Ideas',
            'From Hypothesis to Publication: A Comprehensive Survey of AI-Driven Research Support Systems'
            ]
# constraints
from_dt = '2020-01-01'
to_dt = '2025-04-30'
fields_of_study = ['Computer Science']

In [7]:
ps = PaperSearch(   
    seed_research_topics = research_topics,   
    seed_paper_titles = seed_titles, 
    seed_paper_dois = seed_dois,
    from_dt = from_dt,
    to_dt = to_dt,
    fields_of_study = fields_of_study,
    )

2025-04-24 09:18:48,657 - INFO - SemanticScholarKit initialized with max_concurrency=20, sleep_interval=3.0s
2025-04-24 09:18:48,658 - INFO - SemanticScholarKit initialized with max_concurrency=20, sleep_interval=3.0s
2025-04-24 09:18:48,658 - INFO - SemanticScholarKit initialized with max_concurrency=20, sleep_interval=3.0s
2025-04-24 09:18:48,659 - INFO - SemanticScholarKit initialized with max_concurrency=20, sleep_interval=3.0s
2025-04-24 09:18:48,659 - INFO - SemanticScholarKit initialized with max_concurrency=20, sleep_interval=3.0s


# Round 1

In [8]:
iteration = 1
citation_limit = 100
recommend_limit = 100
search_limit = 50

from_dt = '2020-01-01'
to_dt = '2025-04-30'
fields_of_study = ['Computer Science']

## Initial Search

### Collection

In [40]:
all_candidates = {}

In [9]:
# candidates for next round
candidates = {
    'candit_dois': set(),
    'candit_author_ids': set(),
    'candit_ref_dois': set(),
    'candit_citing_dois': set(),
    'candit_topics': set()
}


In [10]:
# post-pone topic search since it takes too much time 
if len(ps.seed_paper_titles) > 0 or len(ps.seed_paper_dois) > 0:
    candidates['candit_topics'].update(ps.research_topics)
    topics = []
else:
    topics = ps.research_topics

In [11]:
if len(seed_dois) > 2:
    pos_paper_dois = seed_dois
else:
    pos_paper_dois = []

In [12]:
# --- ROUND 1 QUERY ---
# initial query for seed papers basic information
print(f"--- Running Round {iteration} Query for Papers Information ---")
await ps.consolidated_search(
    topics=topics,
    paper_titles=ps.seed_paper_titles,
    paper_dois=ps.seed_paper_dois,
    ref_paper_dois=ps.seed_paper_dois,
    citing_paper_dois=ps.seed_paper_dois,
    pos_paper_dois=pos_paper_dois,
    from_dt=ps.from_dt,
    to_dt=ps.to_dt,
    fields_of_study=ps.fields_of_study
)

2025-04-24 09:18:53,839 - INFO - Search 2 paper titles and 3 for paper information.
2025-04-24 09:18:53,841 - INFO - Fetching papers by 3 DOIs...
2025-04-24 09:18:53,842 - INFO - Fetching papers by title: 'PaperRobot: Incremental Draft Generation of Scientific Ideas...'
2025-04-24 09:18:53,843 - INFO - Fetching papers by title: 'From Hypothesis to Publication: A Comprehensive Survey of AI-Driven Research Support Systems...'
2025-04-24 09:18:53,843 - INFO - Running 3 initial query tasks concurrently...
2025-04-24 09:18:53,844 - INFO - Preparing reference for 3 papers and citing for 3 papers.
2025-04-24 09:18:53,844 - INFO - Running 6 citation collection tasks concurrently...
2025-04-24 09:18:53,845 - INFO - Recommend papers based on 3 positive papers and 0 papers.
2025-04-24 09:18:53,846 - INFO - Fetching recommendations based on 3 papers...
2025-04-24 09:18:53,847 - INFO - async_get_s2_recommended_papers: Fetching recommendations based on 3 positive IDs with effective limit 100.
2025-0

--- Running Round 1 Query for Papers Information ---


2025-04-24 09:18:54,952 - INFO - HTTP Request: POST https://api.semanticscholar.org/graph/v1/paper/batch?fields=abstract,authors,citationCount,citationStyles,corpusId,externalIds,fieldsOfStudy,influentialCitationCount,isOpenAccess,journal,openAccessPdf,paperId,publicationDate,publicationTypes,publicationVenue,referenceCount,s2FieldsOfStudy,title,url,venue,year "HTTP/1.1 200 OK"
2025-04-24 09:18:54,957 - INFO - _sync_get_papers: API call successful for batch (first 5: ['10.48550/arXiv.2406.10252', '10.48550/arXiv.2412.10415', '10.48550/arXiv.2402.12928']...), returning 3 items.
2025-04-24 09:18:54,966 - INFO - HTTP Request: GET https://api.semanticscholar.org/graph/v1/paper/10.48550/arXiv.2402.12928/citations?fields=contexts,intents,contextsWithIntent,isInfluential,abstract,authors,citationCount,citationStyles,corpusId,externalIds,fieldsOfStudy,influentialCitationCount,isOpenAccess,journal,openAccessPdf,paperId,publicationDate,publicationTypes,publicationVenue,referenceCount,s2FieldsOfS

In [13]:
print(len(ps.nodes_json), len(ps.edges_json))

2141 2092


### Routing

In [14]:
# core papers and core authors nodes
core_paper_dois = set()
for node in ps.nodes_json:
    if node['labels'] == ['Paper']:
        if node['properties'].get('from_seed', False) == True:
            core_paper_dois.add(node['id'])
        elif node['properties'].get('from_title_search', False) == True:
            core_paper_dois.add(node['id'])

core_author_ids = set()
for item in ps.nodes_json:
    if item['id'] in core_paper_dois and isinstance(item['properties'].get('authors'), list):
        authors_id = [x['authorId'] for x in item['properties']['authors'] if x['authorId'] is not None] 
        core_author_ids.update(authors_id)

print(len(core_paper_dois), len(core_author_ids))

4 27


In [15]:
from app.paper_similarity_calculation import PaperSim

sim = PaperSim(
    embed_api_key = embed_api_key,
    embed_model_name = embed_model_name
)

  from .autonotebook import tqdm as notebook_tqdm
2025-04-24 09:19:18,104 - INFO - PyTorch version 2.3.0 available.


In [16]:
# generate paper graph from nodes / edges json
G_pre = PaperGraph(name='Paper Graph Init Search')
G_pre.add_graph_nodes(ps.nodes_json)
G_pre.add_graph_edges(ps.edges_json)

In [17]:
# check core paper completeness
# paper complete
candidates['candit_dois'].update([doi for doi in core_paper_dois if doi not in ps.explored_nodes['paper']]) 
# citation complete
candidates['candit_ref_dois'].update([doi for doi in core_paper_dois if doi not in ps.explored_nodes['reference']])
candidates['candit_citing_dois'].update([doi for doi in core_paper_dois if doi not in ps.explored_nodes['citing']])
# author complete
candidates['candit_author_ids'].update([aid for aid in core_author_ids if aid not in ps.explored_nodes['author']]) 

In [18]:
# --- Graph Stat ---
g_stat = get_graph_stats(G_pre)   # graph stats

# valid paper with abstracts
complete_paper_json = [node for node in ps.nodes_json 
                        if node['labels'] == ['Paper'] 
                        and node['properties'].get('title') is not None and node['properties'].get('abstract') is not None]
complete_paper_dois = [node['id'] for node in complete_paper_json]

Graph has 2020 nodes and 2092 edges.
There are 4 node types in this graph, they are:
[('Author', 1617), ('Paper', 293), ('Journal', 58), ('Venue', 52)]
There are 4 edge types in this graph, they are:
[('WRITES', 1685), ('CITES', 200), ('RELEASES_IN', 112), ('PRINTS_ON', 95)]


In [19]:
# --- SIMILARITY CALCULATION ---
# check if similarity with edge type
edge_types = [x[0] for x in g_stat['edge_type']]

if 'SIMILAR_TO' not in edge_types:
    # calculate paper nodes similarity
    semantic_similar_pool = await sim.cal_embed_and_similarity(
        paper_nodes_json = complete_paper_json,
        paper_dois_1 = complete_paper_dois, 
        paper_dois_2 = complete_paper_dois,
        similarity_threshold=params['similarity_threshold'],
        )

    # add similarity edges to graph
    G_pre.add_graph_edges(semantic_similar_pool)  

2025-04-24 09:19:27,168 - INFO - Generating embeddings for 260 papers...
2025-04-24 09:19:35,624 - INFO - Shape of embeds_1: (260, 768)
2025-04-24 09:19:35,625 - INFO - Shape of embeds_2: (260, 768)
2025-04-24 09:19:35,625 - INFO - Calculating similarity matrix...
2025-04-24 09:19:35,649 - INFO - Processing similarity matrix to create relationships...


In [20]:
# --- PRUNNING ---
# pruning by connectivity
sub_graphs = G_pre.find_wcc_subgraphs(target_nodes=core_paper_dois)
if sub_graphs is not None and len(sub_graphs) > 0:
    G_post  = sub_graphs[0]
    # get stats after prunning
    g_stat = get_graph_stats(G_post)
else:
    G_post = G_pre

Graph has 1980 nodes and 4890 edges.
There are 4 node types in this graph, they are:
[('Author', 1585), ('Paper', 288), ('Journal', 56), ('Venue', 51)]
There are 5 edge types in this graph, they are:
[('SIMILAR_TO', 2831), ('WRITES', 1653), ('CITES', 200), ('RELEASES_IN', 112), ('PRINTS_ON', 94)]


In [21]:
# --- GET KEY STATS ---
# check paper count and author count
paper_cnt, author_cnt = 0, 0
for item in g_stat['node_type']:
    if item[0] == 'Paper':
        paper_cnt = item[1]
    elif item[0] == 'Author':
        author_cnt = item[1]

In [22]:
paper_stats = get_paper_stats(G_post, core_paper_dois)  # paper stats on graph
author_stats = get_author_stats(G_post, core_author_ids)  # author stats on graph

# check crossref
crossref_stats = []
for x in paper_stats:
    if (x['if_seed'] == False  # exclude seed papers 
        and x['local_citation_cnt'] > min(len(core_paper_dois),  5)):  # select most refered papers in graph
        crossref_stats.append(x)

# check key authors
key_authors_stats = []
for x in author_stats:
    if (x['if_seed'] == False  # exclude seed authors 
        and x['local_paper_cnt'] > min(len(core_paper_dois), 5)):  # select most refered papers in graph
        key_authors_stats.append(x)

# check paper similarity
sorted_paper_similarity = sorted(paper_stats, key=lambda x:x['max_sim_to_seed'], reverse=True)

# if cross ref insufficient, further expand similar papers on citation chain
if len(crossref_stats) < params['min_corssref_papers']:
    # filter top similar papers (to help build crossref)
    i = 0
    for item in sorted_paper_similarity:
        if i < params['top_k_similar_papers']:
            if item['if_seed'] == False and item['doi'] not in ps.explored_nodes['reference']:
                candidates['candit_ref_dois'].add(item['doi'])
                i += 1
        else:
            break

# if key authors not have complete information
if len(key_authors_stats) > params['min_key_authors']:
    sorted_key_authors = sorted(key_authors_stats, key=lambda x:x['local_paper_cnt'], reverse=True)
    # filter key authors (to amplify information)
    i = 0
    for item in sorted_key_authors:
        if i < params['top_l_key_authors']:
            if item['if_seed'] == False and item['author_id'] not in ps.explored_nodes['author']:
                candidates['candit_author_ids'].add(item['author_id'])
                i += 1
        else:
            break

In [41]:
# suggested candidates
candidates
all_candidates[f'round_{iteration}'] = candidates

In [30]:
# key status
sorted_paper_citation = sorted(paper_stats, key=lambda x:x['local_citation_cnt'], reverse=True)
sorted_top20_paper_citation = [x for x in sorted_paper_citation if x.get('if_seed', False) == False][0:20]
for item in sorted_top20_paper_citation:
    print(item)

{'doi': '10.48550/arXiv.2312.10997', 'title': 'Retrieval-Augmented Generation for Large Language Models: A Survey', 'if_seed': False, 'local_citation_cnt': 2, 'local_reference_cnt': 0, 'local_similarity_cnt': 35, 'max_sim_to_seed': 0.7133, 'global_citaion_cnt': 1337, 'influencial_citation_cnt': 72, 'global_refence_cnt': 72, 'author_cnt': 10, 'avg_h_index': None, 'weighted_h_index': None}
{'doi': '10.48550/arXiv.2303.18223', 'title': 'A Survey of Large Language Models', 'if_seed': False, 'local_citation_cnt': 2, 'local_reference_cnt': 0, 'local_similarity_cnt': 82, 'max_sim_to_seed': 0.7294, 'global_citaion_cnt': 2388, 'influencial_citation_cnt': 121, 'global_refence_cnt': 121, 'author_cnt': 10, 'avg_h_index': None, 'weighted_h_index': None}
{'doi': '10.48550/arXiv.2302.13971', 'title': 'LLaMA: Open and Efficient Foundation Language Models', 'if_seed': False, 'local_citation_cnt': 2, 'local_reference_cnt': 0, 'local_similarity_cnt': 6, 'max_sim_to_seed': -1, 'global_citaion_cnt': 11567,

In [33]:
sorted_paper_sim_to_seed = sorted(paper_stats, key=lambda x:x['max_sim_to_seed'], reverse=False)
for item in sorted_paper_sim_to_seed:
    if item.get('if_seed') == False:
        print(item)

{'doi': '10.48550/arXiv.2404.02060', 'title': 'Long-context LLMs Struggle with Long In-context Learning', 'if_seed': False, 'local_citation_cnt': 1, 'local_reference_cnt': 0, 'local_similarity_cnt': 50, 'max_sim_to_seed': -1, 'global_citaion_cnt': 141, 'influencial_citation_cnt': 6, 'global_refence_cnt': 6, 'author_cnt': 5, 'avg_h_index': None, 'weighted_h_index': None}
{'doi': '10.48550/arXiv.2403.01106', 'title': 'Distilling Text Style Transfer With Self-Explanation From LLMs', 'if_seed': False, 'local_citation_cnt': 1, 'local_reference_cnt': 0, 'local_similarity_cnt': 4, 'max_sim_to_seed': -1, 'global_citaion_cnt': 8, 'influencial_citation_cnt': 1, 'global_refence_cnt': 1, 'author_cnt': 6, 'avg_h_index': None, 'weighted_h_index': None}
{'doi': '10.48550/arXiv.2402.15043', 'title': 'KIEval: A Knowledge-grounded Interactive Evaluation Framework for Large Language Models', 'if_seed': False, 'local_citation_cnt': 1, 'local_reference_cnt': 0, 'local_similarity_cnt': 39, 'max_sim_to_seed'

In [34]:
topic_ids = set([nid for nid, node_data in G_post.nodes(data=True) if node_data.get('node_type') == 'Topic'])
topic_stats = []
for tid in topic_ids:
    i = 0
    topic = G_post.nodes(tid).get('topic_name')
    for u, v, edge_data in G_post.in_edges(tid, data=True):
        if edge_data.get('relationshipType') == 'DISCUSS':
            i += 1
    topic_stats.append({'topic_id':tid, 'topic_name':topic, 'related_papers':i})

In [38]:
core_paper_json = [x for x in complete_paper_json if x['id'] in core_paper_dois]
if len(topic_stats) < 4:
    keywords_topics_json = await ps.topic_generation(
        seed_paper_json=core_paper_json,
        llm_api_key=llm_api_key,
        llm_model_name=llm_model_name,
        )
    topic_queries = keywords_topics_json.get('queries', [])
    candidates['candit_topics'].update(topic_queries)

2025-04-24 09:32:47,579 - INFO - Use LLM to identify key related topics.
2025-04-24 09:32:47,580 - INFO - Generating related topics for 4 seed papers...
2025-04-24 09:32:47,581 - INFO - Calling LLM to generate topics...
2025-04-24 09:32:47,582 - INFO - AFC is enabled with max remote calls: 10.
2025-04-24 09:32:49,866 - INFO - LLM generated topics: {"field_of_study": ["Artificial Intelligence", "Natural Language Processing", "Meta-Research", "Information Science"], "keywords_and_topics": ["large language models", "literature reviews", "peer review process", "AI-driven research support", "generative agent reviewers"], "tags": ["AutoSurvey", "generative adversarial networks", "PRISMA standards", "bibliometric analysis", "AI in academia"], "queries": ["LLM-based literature review automation", "AI-assisted peer review", "analysis of literature reviews in PAMI", "AI tools for research lifecycle"]}


In [51]:
all_candidates

{'round_1': {'candit_dois': {'10.48550/arXiv.2503.01424'},
  'candit_author_ids': {'2108024279',
   '2112678409',
   '2116271777',
   '2118640235',
   '2216503559',
   '2257010530',
   '2259709647',
   '2262020955',
   '2265878959',
   '2265930173',
   '2268132119',
   '2273779175',
   '2275569993',
   '2284824283',
   '2284825678',
   '2284827556',
   '2286328804',
   '2289004972',
   '2293356300',
   '2307012818',
   '2328342585',
   '2335566763',
   '2335569348',
   '2337225259',
   '2349068478',
   '2674998',
   '51056442'},
  'candit_ref_dois': {'10.1007/s00607-023-01181-x',
   '10.48550/arXiv.2306.00622',
   '10.48550/arXiv.2307.05492',
   '10.48550/arXiv.2310.01783',
   '10.48550/arXiv.2404.07738',
   '10.48550/arXiv.2406.05688',
   '10.48550/arXiv.2408.07884',
   '10.48550/arXiv.2408.10365',
   '10.48550/arXiv.2410.03019',
   '10.48550/arXiv.2411.00816',
   '10.48550/arXiv.2502.17086',
   '10.48550/arXiv.2502.19614',
   '10.48550/arXiv.2503.01424',
   '10.48550/arXiv.2503.04629

# Round 2

## Collection

In [52]:
if len(all_candidates['round_1']['candit_dois']) > 2:
    dois = list(all_candidates['round_1']['candit_dois'])
else:
    dois = []

if len(all_candidates['round_1']['candit_author_ids']) > 2:
    author_ids = list(all_candidates['round_1']['candit_author_ids']) 
else:
    author_ids = []

if len(all_candidates['round_1']['candit_ref_dois']) > 2:
    ref_dois = list(all_candidates['round_1']['candit_ref_dois']) 
else:
    ref_dois = []

if len(all_candidates['round_1']['candit_citing_dois']) > 2:
    citing_dois = list(all_candidates['round_1']['candit_citing_dois'])   
else:
    citing_dois = []

if len(all_candidates['round_1']['candit_topics']) > 2:
    topics = list(all_candidates['round_1']['candit_topics'])  
else:
    topics = [] 

In [45]:
iteration += 1
# candidates for next round
candidates = {
    'candit_dois': set(),
    'candit_author_ids': set(),
    'candit_ref_dois': set(),
    'candit_citing_dois': set(),
    'candit_topics': set()
}

In [54]:
# --- ROUND 2 QUERY ---
# initial query for seed papers basic information
print(f"--- Running Round {iteration} Query for Papers Information ---")
await ps.consolidated_search(
    topics=topics,
    # paper_titles=ps.seed_paper_titles,
    paper_dois=dois,
    author_ids=author_ids,
    ref_paper_dois=ref_dois,
    citing_paper_dois=citing_dois,
    # pos_paper_dois=pos_paper_dois,
    from_dt=ps.from_dt,
    to_dt=ps.to_dt,
    fields_of_study=ps.fields_of_study
)

2025-04-24 09:45:50,427 - INFO - Search 5 topics for paper information.
2025-04-24 09:45:50,428 - INFO - Fetching related papers for query: 'AI-assisted peer review...'
2025-04-24 09:45:50,429 - INFO - async_search_paper_by_keywords: Searching papers by keyword: 'AI-assisted peer review...' with effective limit 50.
2025-04-24 09:45:50,430 - INFO - Search 27 author information.
2025-04-24 09:45:50,430 - INFO - _sync_search_paper_by_keywords: Thread started for query 'AI-assisted peer review...' with limit 50.
2025-04-24 09:45:50,430 - INFO - Fetching info for 27 authors...
2025-04-24 09:45:50,432 - INFO - async_search_author_by_ids: Fetching 27 authors by ID in 1 batches.
2025-04-24 09:45:50,432 - INFO - async_search_author_by_ids: Gathering 1 tasks...
2025-04-24 09:45:50,433 - INFO - Preparing reference for 21 papers and citing for 0 papers.
2025-04-24 09:45:50,433 - INFO - Running 21 citation collection tasks concurrently...
2025-04-24 09:45:50,434 - INFO - Fetching papers cited by 10

--- Running Round 2 Query for Papers Information ---


2025-04-24 09:45:51,515 - INFO - HTTP Request: GET https://api.semanticscholar.org/graph/v1/paper/search?query=AI-assisted%20peer%20review&fieldsOfStudy=Computer%20Science&fields=abstract,authors,citationCount,citationStyles,corpusId,externalIds,fieldsOfStudy,influentialCitationCount,isOpenAccess,journal,openAccessPdf,paperId,publicationDate,publicationTypes,publicationVenue,referenceCount,s2FieldsOfStudy,title,url,venue,year&offset=0&limit=50 "HTTP/1.1 429 "
2025-04-24 09:45:51,723 - INFO - HTTP Request: GET https://api.semanticscholar.org/graph/v1/paper/10.48550/arXiv.2408.10365/references?fields=contexts,intents,contextsWithIntent,isInfluential,abstract,authors,citationCount,citationStyles,corpusId,externalIds,fieldsOfStudy,influentialCitationCount,isOpenAccess,journal,openAccessPdf,paperId,publicationDate,publicationTypes,publicationVenue,referenceCount,s2FieldsOfStudy,title,url,venue,year&offset=0&limit=100 "HTTP/1.1 200 OK"
2025-04-24 09:45:51,776 - INFO - HTTP Request: GET https

In [55]:
print(len(ps.nodes_json), len(ps.edges_json))

7978 7947


## Rounting

In [56]:
# core papers and core authors nodes
core_paper_dois = set()
for node in ps.nodes_json:
    if node['labels'] == ['Paper']:
        if node['properties'].get('from_seed', False) == True:
            core_paper_dois.add(node['id'])
        elif node['properties'].get('from_title_search', False) == True:
            core_paper_dois.add(node['id'])

core_author_ids = set()
for item in ps.nodes_json:
    if item['id'] in core_paper_dois and isinstance(item['properties'].get('authors'), list):
        authors_id = [x['authorId'] for x in item['properties']['authors'] if x['authorId'] is not None] 
        core_author_ids.update(authors_id)

print(len(core_paper_dois), len(core_author_ids))

4 27


In [57]:
# generate paper graph from nodes / edges json
G2_pre = PaperGraph(name='Paper Graph Init Search')
G2_pre.add_graph_nodes(ps.nodes_json)
G2_pre.add_graph_edges(ps.edges_json)

In [58]:
# check core paper completeness
# paper complete
candidates['candit_dois'].update([doi for doi in core_paper_dois if doi not in ps.explored_nodes['paper']]) 
# citation complete
candidates['candit_ref_dois'].update([doi for doi in core_paper_dois if doi not in ps.explored_nodes['reference']])
candidates['candit_citing_dois'].update([doi for doi in core_paper_dois if doi not in ps.explored_nodes['citing']])
# author complete
candidates['candit_author_ids'].update([aid for aid in core_author_ids if aid not in ps.explored_nodes['author']]) 

In [59]:
candidates

{'candit_dois': {'10.48550/arXiv.2503.01424'},
 'candit_author_ids': set(),
 'candit_ref_dois': set(),
 'candit_citing_dois': {'10.48550/arXiv.2503.01424'},
 'candit_topics': set()}

In [61]:
# --- Graph Stat ---
g_stat = get_graph_stats(G2_pre)   # graph stats

# valid paper with abstracts
complete_paper_json = [node for node in ps.nodes_json 
                        if node['labels'] == ['Paper'] 
                        and node['properties'].get('title') is not None and node['properties'].get('abstract') is not None]
complete_paper_dois = [node['id'] for node in complete_paper_json]

Graph has 6032 nodes and 7947 edges.
There are 6 node types in this graph, they are:
[('Author', 4718), ('Paper', 1027), ('Journal', 144), ('Venue', 136), ('Topic', 4), ('Affiliation', 3)]
There are 6 edge types in this graph, they are:
[('WRITES', 6210), ('CITES', 958), ('RELEASES_IN', 531), ('PRINTS_ON', 229), ('DISCUSS', 16), ('WORKS_IN', 3)]


In [63]:
# --- SIMILARITY CALCULATION ---
# check if similarity with edge type
edge_types = [x[0] for x in g_stat['edge_type']]

if 'SIMILAR_TO' not in edge_types:
    # calculate paper nodes similarity
    semantic_similar_pool = await sim.cal_embed_and_similarity(
        paper_nodes_json = complete_paper_json,
        paper_dois_1 = complete_paper_dois, 
        paper_dois_2 = complete_paper_dois,
        similarity_threshold=params['similarity_threshold'],
        )

    # add similarity edges to graph
    G2_pre.add_graph_edges(semantic_similar_pool)  

2025-04-24 10:05:51,035 - INFO - Generating embeddings for 687 papers...
2025-04-24 10:05:59,671 - INFO - Shape of embeds_1: (947, 768)
2025-04-24 10:05:59,672 - INFO - Shape of embeds_2: (947, 768)
2025-04-24 10:05:59,672 - INFO - Calculating similarity matrix...
2025-04-24 10:05:59,688 - INFO - Processing similarity matrix to create relationships...


In [64]:
# --- PRUNNING ---
# pruning by connectivity
sub_graphs = G2_pre.find_wcc_subgraphs(target_nodes=core_paper_dois)
if sub_graphs is not None and len(sub_graphs) > 0:
    G2_post  = sub_graphs[0]
    # get stats after prunning
    g_stat = get_graph_stats(G2_post)
else:
    G2_post = G2_pre

Graph has 6015 nodes and 23829 edges.
There are 6 node types in this graph, they are:
[('Author', 4705), ('Paper', 1025), ('Journal', 143), ('Venue', 135), ('Topic', 4), ('Affiliation', 3)]
There are 7 edge types in this graph, they are:
[('SIMILAR_TO', 15895), ('WRITES', 6197), ('CITES', 958), ('RELEASES_IN', 531), ('PRINTS_ON', 229), ('DISCUSS', 16), ('WORKS_IN', 3)]


In [65]:
# --- GET KEY STATS ---
# check paper count and author count
paper_cnt, author_cnt = 0, 0
for item in g_stat['node_type']:
    if item[0] == 'Paper':
        paper_cnt = item[1]
    elif item[0] == 'Author':
        author_cnt = item[1]

In [71]:
paper_stats = get_paper_stats(G2_post, core_paper_dois)  # paper stats on graph
author_stats = get_author_stats(G2_post, core_author_ids)  # author stats on graph

# check crossref
crossref_stats = []
for x in paper_stats:
    if (x['if_seed'] == False  # exclude seed papers 
        and x['local_citation_cnt'] > min(len(core_paper_dois),  5)):  # select most refered papers in graph
        crossref_stats.append(x)

# check key authors
key_authors_stats = []
for x in author_stats:
    if (x['if_seed'] == False  # exclude seed authors 
        and x['local_paper_cnt'] > min(len(core_paper_dois), 5)):  # select most refered papers in graph
        key_authors_stats.append(x)

# check paper similarity
sorted_paper_similarity = sorted(paper_stats, key=lambda x:x['max_sim_to_seed'], reverse=True)

# if cross ref insufficient, further expand similar papers on citation chain
if len(crossref_stats) < params['min_corssref_papers']:
    # filter top similar papers (to help build crossref)
    i = 0
    for item in sorted_paper_similarity:
        if i < params['top_k_similar_papers']:
            if item['if_seed'] == False and item['doi'] not in ps.explored_nodes['reference']:
                candidates['candit_ref_dois'].add(item['doi'])
                i += 1
        else:
            break

# if key authors not have complete information
if len(key_authors_stats) > params['min_key_authors']:
    sorted_key_authors = sorted(key_authors_stats, key=lambda x:x['local_paper_cnt'], reverse=True)
    # filter key authors (to amplify information)
    i = 0
    for item in sorted_key_authors:
        if i < params['top_l_key_authors']:
            if item['if_seed'] == False and item['author_id'] not in ps.explored_nodes['author']:
                candidates['candit_author_ids'].add(item['author_id'])
                i += 1
        else:
            break

In [73]:
sorted_paper_cited = sorted(paper_stats, key=lambda x:x['local_citation_cnt'], reverse=True)
i = 0
for item in sorted_paper_cited:
    if i < 20:
        if item.get('if_seed') == False:
            print(item)
            i += 1
    else:
        break

{'doi': '10.48550/arXiv.2408.06292', 'title': 'The AI Scientist: Towards Fully Automated Open-Ended Scientific Discovery', 'if_seed': False, 'local_citation_cnt': 11, 'local_reference_cnt': 0, 'local_similarity_cnt': 77, 'max_sim_to_seed': 0.7518, 'global_citaion_cnt': 126, 'influencial_citation_cnt': 19, 'global_refence_cnt': 19, 'author_cnt': 6, 'avg_h_index': None, 'weighted_h_index': None}
{'doi': '10.48550/arXiv.2303.08774', 'title': 'GPT-4 Technical Report', 'if_seed': False, 'local_citation_cnt': 10, 'local_reference_cnt': 0, 'local_similarity_cnt': 12, 'max_sim_to_seed': -1, 'global_citaion_cnt': 12023, 'influencial_citation_cnt': 1757, 'global_refence_cnt': 1757, 'author_cnt': 10, 'avg_h_index': None, 'weighted_h_index': None}
{'doi': '10.48550/arXiv.2310.01783', 'title': 'Can large language models provide useful feedback on research papers? A large-scale empirical analysis', 'if_seed': False, 'local_citation_cnt': 9, 'local_reference_cnt': 16, 'local_similarity_cnt': 95, 'max

In [75]:
len(crossref_stats)

19

In [76]:
key_authors_stats

[{'author_id': '2164113313',
  'author_name': 'Zhuohao Yu',
  'if_seed': False,
  'if_complete': False,
  'h_index': None,
  'global_paper_cnt': None,
  'global_citation_cnt': None,
  'local_paper_cnt': 7,
  'top_coauthors': [('2164113313', 7),
   ('2108024279', 4),
   ('145235149', 4),
   ('2273553706', 4),
   ('2286328804', 3)],
  'weighted_coauthor_h_index': 4.45},
 {'author_id': '145235149',
  'author_name': 'Wei Ye',
  'if_seed': False,
  'if_complete': False,
  'h_index': None,
  'global_paper_cnt': None,
  'global_citation_cnt': None,
  'local_paper_cnt': 7,
  'top_coauthors': [('145235149', 7),
   ('2108024279', 5),
   ('2164113313', 4),
   ('2286328804', 3),
   ('1705434', 3)],
  'weighted_coauthor_h_index': 4.5},
 {'author_id': '2273553706',
  'author_name': 'Jindong Wang',
  'if_seed': False,
  'if_complete': False,
  'h_index': None,
  'global_paper_cnt': None,
  'global_citation_cnt': None,
  'local_paper_cnt': 10,
  'top_coauthors': [('2273553706', 10),
   ('2108024279', 

In [78]:
candidates

{'candit_dois': {'10.48550/arXiv.2503.01424'},
 'candit_author_ids': {'1519290245',
  '152277111',
  '1737249',
  '2051536212',
  '2107949588',
  '2111728470',
  '2118138548',
  '2118208508',
  '2145500840',
  '2208739098',
  '2250121772',
  '2257004102',
  '2257348206',
  '2257796182',
  '2273553706',
  '3194553',
  '32216189',
  '40282288',
  '51056532',
  '8427191'},
 'candit_ref_dois': {'10.1002/leap.1570',
  '10.1007/s11704-024-40231-1',
  '10.1057/s41599-020-00703-8',
  '10.1109/ACCESS.2025.3554504',
  '10.1145/3637371',
  '10.1145/3641289',
  '10.1613/jair.1.12862',
  '10.17705/1jais.00867',
  '10.18653/v1/2020.inlg-1.44',
  '10.48550/arXiv.2303.18223',
  '10.48550/arXiv.2305.01937',
  '10.48550/arXiv.2305.02499',
  '10.48550/arXiv.2401.04259',
  '10.48550/arXiv.2402.15589',
  '10.48550/arXiv.2404.17605',
  '10.48550/arXiv.2405.02150',
  '10.48550/arXiv.2406.12708',
  '10.48550/arXiv.2406.16253',
  '10.48550/arXiv.2407.12857',
  '10.48550/arXiv.2408.06292',
  '10.48550/arXiv.240

In [79]:
author_stats

[{'author_id': '2108024279',
  'author_name': 'Yidong Wang',
  'if_seed': True,
  'if_complete': True,
  'h_index': 17,
  'global_paper_cnt': 42,
  'global_citation_cnt': 3711,
  'local_paper_cnt': 39,
  'top_coauthors': [('2108024279', 39),
   ('1519290245', 16),
   ('2273553706', 9),
   ('2145500840', 9),
   ('2051536212', 8)],
  'weighted_coauthor_h_index': 17.0},
 {'author_id': '2273779175',
  'author_name': 'Qi Guo',
  'if_seed': True,
  'if_complete': True,
  'h_index': 1,
  'global_paper_cnt': 2,
  'global_citation_cnt': 12,
  'local_paper_cnt': 2,
  'top_coauthors': [('2273779175', 2),
   ('2257010530', 2),
   ('2286328804', 1),
   ('2116271777', 1),
   ('2262020955', 1)],
  'weighted_coauthor_h_index': 1.3533333333333333},
 {'author_id': '2286328804',
  'author_name': 'Wenjin Yao',
  'if_seed': True,
  'if_complete': True,
  'h_index': 2,
  'global_paper_cnt': 4,
  'global_citation_cnt': 31,
  'local_paper_cnt': 4,
  'top_coauthors': [('2108024279', 4),
   ('2286328804', 4),
 

In [84]:
sorted_cross_ref = sorted(crossref_stats, key=lambda x:x['local_citation_cnt'], reverse=True)
top_crossref_doi = [x['doi'] for x in sorted_cross_ref[0:params['min_corssref_papers']]]
core_paper_dois.update(top_crossref_doi)

In [85]:
core_paper_dois

{'10.18653/v1/2024.acl-long.18',
 '10.48550/arXiv.2201.11903',
 '10.48550/arXiv.2302.13971',
 '10.48550/arXiv.2303.08774',
 '10.48550/arXiv.2306.00622',
 '10.48550/arXiv.2306.05685',
 '10.48550/arXiv.2307.05492',
 '10.48550/arXiv.2309.02726',
 '10.48550/arXiv.2309.16609',
 '10.48550/arXiv.2310.01783',
 '10.48550/arXiv.2310.03302',
 '10.48550/arXiv.2401.04259',
 '10.48550/arXiv.2402.12928',
 '10.48550/arXiv.2406.05688',
 '10.48550/arXiv.2406.10252',
 '10.48550/arXiv.2408.06292',
 '10.48550/arXiv.2408.10365',
 '10.48550/arXiv.2409.04109',
 '10.48550/arXiv.2410.09403',
 '10.48550/arXiv.2410.13185',
 '10.48550/arXiv.2412.10415',
 '10.48550/arXiv.2503.01424',
 '7c2b3c4ab6d701de7bd9df91d7448f3c06a1e9d7'}