# Setup

In [1]:
import copy
from typing import List, Dict, Optional, Union, Tuple, Literal

In [2]:
import os
import json

import sys
import os

# Put the parent of the current working directory on the import path
# (presumably so the project-local `graph` / `app` packages imported below resolve).
parent_dir = os.path.dirname(os.getcwd())
# Guard the append so re-running this cell does not pile up duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [3]:
from graph.paper_graph import PaperGraph
from graph.graph_viz import GraphViz
from app.paper_data_collect import PaperSearch
from graph.graph_stats import get_graph_stats, get_author_stats, get_paper_stats

In [4]:
# Tunable settings for the whole notebook, grouped by the stage that reads them.
params = dict(
    # params for paper collection
    search_limit=100,
    recommend_limit=50,
    citation_limit=100,
    # params for expanded search
    similarity_threshold=0.7,
    top_k_similar_papers=20,
    top_l_key_authors=20,
    # params for stopping criteria
    min_paper_cnt=50,
    min_author_cnt=50,
    # NOTE: the key is spelled "corssref" on purpose here; the routing cell
    # looks it up under exactly this name, so rename both places together.
    min_corssref_papers=20,
    min_key_authors=10,
)

In [5]:
# driving examples
# Model identifiers for the chat model and the embedding model.
llm_model_name = "gemini-2.0-flash"
embed_model_name = "models/text-embedding-004"
# Both keys are read from the same environment variable; each is None when it is unset.
llm_api_key = os.environ.get('GEMINI_API_KEY_3')
embed_api_key = os.environ.get('GEMINI_API_KEY_3')

In [6]:
# from user input
research_topics = ["llm literature review"]
seed_dois = [
    '10.48550/arXiv.2406.10252',  # AutoSurvey: Large Language Models Can Automatically Write Surveys
    '10.48550/arXiv.2412.10415',  # Generative Adversarial Reviews: When LLMs Become the Critic
    '10.48550/arXiv.2402.12928',  # A Literature Review of Literature Reviews in Pattern Analysis and Machine Intelligence
]
seed_titles = [
    'PaperRobot: Incremental Draft Generation of Scientific Ideas',
    'From Hypothesis to Publication: A Comprehensive Survey of AI-Driven Research Support Systems',
]
# constraints: publication date window and field-of-study filter
fields_of_study = ['Computer Science']
from_dt, to_dt = '2020-01-01', '2025-04-30'

In [7]:
# Build the PaperSearch object from the user-supplied seeds and constraints.
ps = PaperSearch(
    seed_research_topics=research_topics,
    seed_paper_dois=seed_dois,
    seed_paper_titles=seed_titles,
    fields_of_study=fields_of_study,
    from_dt=from_dt,
    to_dt=to_dt,
)

2025-04-23 21:39:39,035 - INFO - SemanticScholarKit initialized with max_concurrency=20, sleep_interval=3.0s
2025-04-23 21:39:39,037 - INFO - SemanticScholarKit initialized with max_concurrency=20, sleep_interval=3.0s
2025-04-23 21:39:39,038 - INFO - SemanticScholarKit initialized with max_concurrency=20, sleep_interval=3.0s
2025-04-23 21:39:39,038 - INFO - SemanticScholarKit initialized with max_concurrency=20, sleep_interval=3.0s
2025-04-23 21:39:39,038 - INFO - SemanticScholarKit initialized with max_concurrency=20, sleep_interval=3.0s


# Round 1

In [8]:
# Round counter; it is interpolated into the progress message printed before the query.
iteration = 1
# NOTE(review): these limits differ from the values stored in `params`
# (search 100 / recommend 50 there) and are not passed to any call in the
# cells visible here. Confirm which set is authoritative.
search_limit, recommend_limit, citation_limit = 50, 100, 100

# Same date window and field filter as the user-input cell, restated for this round.
from_dt, to_dt = '2020-01-01', '2025-04-30'
fields_of_study = ['Computer Science']

## Initial Search

### Collection

In [9]:
# candidates for next round
# One independent, initially empty set per bucket.
candidates = {
    bucket: set()
    for bucket in (
        'candit_dois',
        'candit_author_ids',
        'candit_ref_dois',
        'candit_citing_dois',
        'candit_topics',
    )
}


In [10]:
# Postpone the topic search since it takes too much time.
# Without any seed title or DOI the topics are searched right away; otherwise
# they are parked in the candidate pool and this round's topic list stays empty.
if len(ps.seed_paper_titles) == 0 and len(ps.seed_paper_dois) == 0:
    topics = ps.research_topics
else:
    candidates['candit_topics'].update(ps.research_topics)
    topics = []

In [11]:
# The seed DOIs are used as positive examples only when there are more than two of them.
# NOTE(review): when taken, `pos_paper_dois` is the very same list object as
# `seed_dois` (no copy); the reason for the threshold of 2 is not visible here.
pos_paper_dois = seed_dois if len(seed_dois) > 2 else []

In [12]:
# --- ROUND 1 QUERY ---
# initial query for seed papers basic information
# The call is awaited at top level, so this cell needs a kernel that supports
# top-level `await`. The same seed DOI list is passed for `paper_dois`,
# `ref_paper_dois` and `citing_paper_dois`.
print(f"--- Running Round {iteration} Query for Papers Information ---")
await ps.consolidated_search(
    topics=topics,  # empty list when seed papers exist (topic search postponed)
    paper_titles=ps.seed_paper_titles,
    paper_dois=ps.seed_paper_dois,
    ref_paper_dois=ps.seed_paper_dois,
    citing_paper_dois=ps.seed_paper_dois,
    pos_paper_dois=pos_paper_dois,  # presumably the positive examples for recommendations — confirm in PaperSearch
    from_dt=ps.from_dt,
    to_dt=ps.to_dt,
    fields_of_study=ps.fields_of_study
)

2025-04-23 21:39:43,439 - INFO - Search 2 paper titles and 3 for paper information.
2025-04-23 21:39:43,440 - INFO - Fetching papers by 3 DOIs...
2025-04-23 21:39:43,440 - INFO - Fetching papers by title: 'PaperRobot: Incremental Draft Generation of Scientific Ideas...'
2025-04-23 21:39:43,440 - INFO - Fetching papers by title: 'From Hypothesis to Publication: A Comprehensive Survey of AI-Driven Research Support Systems...'
2025-04-23 21:39:43,441 - INFO - Running 3 initial query tasks concurrently...
2025-04-23 21:39:43,441 - INFO - Preparing reference for 3 papers and citing for 3 papers.
2025-04-23 21:39:43,441 - INFO - Running 6 citation collection tasks concurrently...
2025-04-23 21:39:43,442 - INFO - Recommend papers based on 3 positive papers and 0 papers.
2025-04-23 21:39:43,442 - INFO - Fetching recommendations based on 3 papers...
2025-04-23 21:39:43,442 - INFO - async_get_s2_recommended_papers: Fetching recommendations based on 3 positive IDs with effective limit 100.
2025-0

--- Running Round 1 Query for Papers Information ---


2025-04-23 21:39:43,852 - INFO - HTTP Request: GET https://api.semanticscholar.org/graph/v1/paper/10.48550/arXiv.2402.12928/citations?fields=contexts%2Cintents%2CcontextsWithIntent%2CisInfluential%2Cabstract%2Cauthors%2CcitationCount%2CcitationStyles%2CcorpusId%2CexternalIds%2CfieldsOfStudy%2CinfluentialCitationCount%2CisOpenAccess%2Cjournal%2CopenAccessPdf%2CpaperId%2CpublicationDate%2CpublicationTypes%2CpublicationVenue%2CreferenceCount%2Cs2FieldsOfStudy%2Ctitle%2Curl%2Cvenue%2Cyear&offset=0&limit=100 "HTTP/1.1 200 OK"
2025-04-23 21:39:43,854 - INFO - _sync_get_paper_citations: API call successful for paper 10.48550/arXiv.2402.12928, returning 0 items.
2025-04-23 21:39:44,017 - INFO - HTTP Request: GET https://api.semanticscholar.org/graph/v1/paper/10.48550/arXiv.2406.10252/citations?fields=contexts%2Cintents%2CcontextsWithIntent%2CisInfluential%2Cabstract%2Cauthors%2CcitationCount%2CcitationStyles%2CcorpusId%2CexternalIds%2CfieldsOfStudy%2CinfluentialCitationCount%2CisOpenAccess%2Cj

In [13]:
# Sanity check: lengths of the node and edge record collections held on `ps`.
print(len(ps.nodes_json), len(ps.edges_json))

2140 2091


### Routing

In [15]:
# core papers and core authors nodes
# A paper is "core" when its node is flagged as coming from a seed or from a title search.
core_paper_dois = {
    node['id']
    for node in ps.nodes_json
    if node['labels'] == ['Paper']
    and (
        node['properties'].get('from_seed', False) == True
        or node['properties'].get('from_title_search', False) == True
    )
}

# Core authors: every non-null authorId listed on a core paper's `authors` property.
core_author_ids = set()
for item in ps.nodes_json:
    if item['id'] not in core_paper_dois:
        continue
    if not isinstance(item['properties'].get('authors'), list):
        continue
    core_author_ids.update(
        author['authorId']
        for author in item['properties']['authors']
        if author['authorId'] is not None
    )

print(len(core_paper_dois), len(core_author_ids))

4 27


In [16]:
from app.paper_similarity_calculation import PaperSim

# Similarity helper, configured with the embedding model name and key set earlier in the notebook.
sim = PaperSim(embed_api_key=embed_api_key, embed_model_name=embed_model_name)

In [17]:
# generate paper graph from nodes / edges json
# Nodes are loaded before edges; keep this order
# (presumably edges refer to node ids that must already exist — confirm in PaperGraph).
G_pre = PaperGraph(name='Paper Graph Init Search')
G_pre.add_graph_nodes(ps.nodes_json)
G_pre.add_graph_edges(ps.edges_json)

In [18]:
# check core paper completeness
# Core papers not yet recorded under an explored category are queued in the matching
# candidate bucket: paper details, references, citing papers.
for bucket, explored_kind in (
    ('candit_dois', 'paper'),
    ('candit_ref_dois', 'reference'),
    ('candit_citing_dois', 'citing'),
):
    candidates[bucket].update(
        doi for doi in core_paper_dois if doi not in ps.explored_nodes[explored_kind]
    )

# author complete: same idea for the authors of the core papers
candidates['candit_author_ids'].update(
    aid for aid in core_author_ids if aid not in ps.explored_nodes['author']
)

In [20]:
# --- Graph Stat ---
g_stat = get_graph_stats(G_pre)   # graph stats

# valid paper with abstracts: keep Paper nodes that carry both a title and an abstract
complete_paper_json = []
for node in ps.nodes_json:
    if node['labels'] != ['Paper']:
        continue
    if node['properties'].get('title') is None or node['properties'].get('abstract') is None:
        continue
    complete_paper_json.append(node)

# Ids of those papers, in the same order as the records.
complete_paper_dois = [paper['id'] for paper in complete_paper_json]

Graph has 2019 nodes and 2091 edges.
There are 4 node types in this graph, they are:
[('Author', 1616), ('Paper', 293), ('Journal', 58), ('Venue', 52)]
There are 4 edge types in this graph, they are:
[('WRITES', 1684), ('CITES', 200), ('RELEASES_IN', 112), ('PRINTS_ON', 95)]


In [22]:
# --- SIMILARITY CALCULATION ---
# Check whether the graph statistics already list a SIMILAR_TO edge type.
# `g_stat['edge_type']` is indexed as a sequence whose first element is the type name.
edge_types = [x[0] for x in g_stat['edge_type']]

if 'SIMILAR_TO' not in edge_types:
    # calculate paper nodes similarity
    # The same DOI list is passed as both sides of the comparison.
    # NOTE(review): whether self-pairs are excluded is decided inside PaperSim — confirm.
    semantic_similar_pool = await sim.cal_embed_and_similarity(
        paper_nodes_json = complete_paper_json,
        paper_dois_1 = complete_paper_dois, 
        paper_dois_2 = complete_paper_dois,
        similarity_threshold=params['similarity_threshold'],
        )

    # add similarity edges to graph
    # Note: `g_stat` is not recomputed here, so it does not include these new edges.
    G_pre.add_graph_edges(semantic_similar_pool)  

2025-04-23 21:47:25,208 - INFO - Generating embeddings for 260 papers...
2025-04-23 21:47:33,266 - INFO - Shape of embeds_1: (260, 768)
2025-04-23 21:47:33,267 - INFO - Shape of embeds_2: (260, 768)
2025-04-23 21:47:33,268 - INFO - Calculating similarity matrix...
2025-04-23 21:47:33,273 - INFO - Processing similarity matrix to create relationships...


In [23]:
# --- PRUNING ---
# pruning by connectivity: ask the graph for the weakly connected subgraphs
# around the core papers and keep the first one returned
# (presumably the most relevant / largest — confirm in PaperGraph.find_wcc_subgraphs).
sub_graphs = G_pre.find_wcc_subgraphs(target_nodes=core_paper_dois)
if sub_graphs is not None and len(sub_graphs) > 0:
    G_post = sub_graphs[0]
else:
    # Nothing to prune to: fall back to the full graph.
    G_post = G_pre

# Refresh the stats on BOTH paths. Previously the fallback path kept the
# `g_stat` computed before the similarity edges were added to G_pre, so later
# cells read stale statistics.
g_stat = get_graph_stats(G_post)

Graph has 1979 nodes and 4889 edges.
There are 4 node types in this graph, they are:
[('Author', 1584), ('Paper', 288), ('Journal', 56), ('Venue', 51)]
There are 5 edge types in this graph, they are:
[('SIMILAR_TO', 2831), ('WRITES', 1652), ('CITES', 200), ('RELEASES_IN', 112), ('PRINTS_ON', 94)]


In [24]:
# --- GET KEY STATS ---
# Pull the Paper and Author totals out of the (type, count) entries in g_stat;
# a type that is absent keeps its count at 0.
paper_cnt = author_cnt = 0
for item in g_stat['node_type']:
    node_label = item[0]
    if node_label == 'Author':
        author_cnt = item[1]
    elif node_label == 'Paper':
        paper_cnt = item[1]

In [25]:
# Per-paper and per-author statistics computed on the pruned graph.
paper_stats = get_paper_stats(G_post, core_paper_dois)  # paper stats on graph
author_stats = get_author_stats(G_post, core_author_ids)  # author stats on graph

# check crossref: non-seed papers whose in-graph citation count exceeds
# min(number of core papers, 5)
crossref_stats = [
    x for x in paper_stats
    if x['if_seed'] == False
    and x['local_citation_cnt'] > min(len(core_paper_dois), 5)
]

# check key authors: non-seed authors whose in-graph paper count exceeds the same bound
key_authors_stats = [
    x for x in author_stats
    if x['if_seed'] == False
    and x['local_paper_cnt'] > min(len(core_paper_dois), 5)
]

# check paper similarity: papers ranked by their best similarity to a seed, highest first
sorted_paper_similarity = list(paper_stats)
sorted_paper_similarity.sort(key=lambda x: x['max_sim_to_seed'], reverse=True)

# if cross ref insufficient, further expand similar papers on citation chain
if len(crossref_stats) < params['min_corssref_papers']:
    # queue the first top-k eligible papers (non-seed, references not yet explored)
    i = 0
    for item in sorted_paper_similarity:
        if i >= params['top_k_similar_papers']:
            break
        if item['if_seed'] == False and item['doi'] not in ps.explored_nodes['reference']:
            candidates['candit_ref_dois'].add(item['doi'])
            i += 1

# queue key authors whose information has not been collected yet
# NOTE(review): this guard fires when there are MORE key authors than
# `min_key_authors`, whereas the crossref guard above fires on "fewer than";
# confirm the direction is intended.
if len(key_authors_stats) > params['min_key_authors']:
    sorted_key_authors = list(key_authors_stats)
    sorted_key_authors.sort(key=lambda x: x['local_paper_cnt'], reverse=True)
    # queue the first top-l eligible authors (non-seed, not yet explored)
    i = 0
    for item in sorted_key_authors:
        if i >= params['top_l_key_authors']:
            break
        if item['if_seed'] == False and item['author_id'] not in ps.explored_nodes['author']:
            candidates['candit_author_ids'].add(item['author_id'])
            i += 1

In [26]:
# Display the candidate buckets accumulated so far.
candidates

{'candit_dois': {'10.48550/arXiv.2503.01424'},
 'candit_author_ids': {'2108024279',
  '2112678409',
  '2116271777',
  '2118640235',
  '2216503559',
  '2257010530',
  '2259709647',
  '2262020955',
  '2265878959',
  '2265930173',
  '2268132119',
  '2273779175',
  '2275569993',
  '2284824283',
  '2284825678',
  '2284827556',
  '2286328804',
  '2289004972',
  '2293356300',
  '2307012818',
  '2328342585',
  '2335566763',
  '2335569348',
  '2337225259',
  '2349068478',
  '2674998',
  '51056442'},
 'candit_ref_dois': {'10.1007/s00607-023-01181-x',
  '10.48550/arXiv.2306.00622',
  '10.48550/arXiv.2307.05492',
  '10.48550/arXiv.2310.01783',
  '10.48550/arXiv.2404.07738',
  '10.48550/arXiv.2406.05688',
  '10.48550/arXiv.2408.07884',
  '10.48550/arXiv.2408.10365',
  '10.48550/arXiv.2410.03019',
  '10.48550/arXiv.2411.00816',
  '10.48550/arXiv.2502.17086',
  '10.48550/arXiv.2502.19614',
  '10.48550/arXiv.2503.01424',
  '10.48550/arXiv.2503.04629',
  '10.48550/arXiv.2503.08506',
  '10.48550/arXiv.2

In [None]:
# Scratch check: number of distinct author ids attached to the core papers.
len(core_author_ids)

In [None]:
# Scratch check: papers queued for reference expansion.
# The bare name `candit_ref_dois` is not defined before this cell (NameError on a
# fresh kernel); the bucket lives in the `candidates` dict.
candidates['candit_ref_dois']

In [None]:
# Scratch check: paper stats ordered by `max_sim_to_seed`, highest first.
# NOTE(review): this displays the whole list; consider slicing before sharing the notebook.
sorted_paper_similarity

In [None]:
# For every Topic node in the pruned graph, count the incoming DISCUSS edges.
# Assumes G_post exposes networkx-style node/edge views (it is already used
# that way via `nodes(data=True)` and `in_edges(..., data=True)`) — confirm.
topic_ids = {
    nid
    for nid, node_data in G_post.nodes(data=True)
    if node_data.get('node_type') == 'Topic'
}
topic_stats = []
for tid in topic_ids:
    # Index the node view to get this node's attribute dict. Calling it as
    # `G_post.nodes(tid)` would treat `tid` as the `data` argument instead of
    # looking up the node.
    topic = G_post.nodes[tid].get('topic_name')
    i = sum(
        1
        for u, v, edge_data in G_post.in_edges(tid, data=True)
        if edge_data.get('relationshipType') == 'DISCUSS'
    )
    topic_stats.append({'topic_id':tid, 'topic_name':topic, 'related_papers':i})

In [None]:
core_paper_json = [x for x in complete_paper_json if x['id'] in core_paper_dois]
if len(topic_stats) < 4:
    keywords_topics_json = await ps.topic_generation(
        seed_paper_json=core_paper_json,
        llm_api_key=llm_api_key,
        llm_model_name=llm_model_name,
        round=1
        )
    topic_queries = keywords_topics_json.get('queries', [])
    candit_topics.update(topic_queries)

In [None]:
# Scratch check: topics queued for a later search round.
# Read from the `candidates` dict; no standalone `candit_topics` name is defined.
candidates['candit_topics']

In [None]:
# Scratch check: non-seed authors that passed the local paper-count filter.
key_authors_stats

In [None]:
# Scratch check: attributes stored on one author node.
# Indexing (`nodes[...]`) returns that node's attribute dict; calling
# `nodes(...)` would treat the id as the `data` argument instead
# (assumes a networkx-style node view — confirm).
G_post.nodes['2112678409']

In [None]:
# Scratch check: authors queued for a later search round.
# Read from the `candidates` dict; no standalone `candit_author_ids` name is defined.
candidates['candit_author_ids']

In [None]:
# Scratch cell. The first two lines only look the buckets up (nothing is
# displayed, since they are not the cell's last expression). They now read from
# `candidates`, because the bare names `candit_dois` / `candit_author_ids` are
# never defined and raised NameError on a fresh kernel.
candidates['candit_dois']
candidates['candit_author_ids']
# NOTE(review): this creates standalone empty lists and leaves the
# corresponding `candidates` buckets untouched — confirm the intent.
candit_ref_dois, candit_citing_dois = [], []

In [None]:
# Scratch check: shows whatever the most recently executed `for item in ...` loop left in `item`.
# NOTE(review): depends on leftover loop state — likely debugging residue.
item

In [None]:
# Print the title of each record in `complete_paper_json`, one per line.
for title in (entry.get('properties', {}).get('title') for entry in complete_paper_json):
    print(title)

In [None]:
# Scratch check of the standalone `candit_citing_dois` name.
# NOTE(review): the only visible assignment of this name sets it to an empty list;
# the collected values live in candidates['candit_citing_dois'].
candit_citing_dois

In [None]:
# Scratch check of the standalone `candit_ref_dois` name.
# NOTE(review): the only visible assignment of this name sets it to an empty list;
# the collected values live in candidates['candit_ref_dois'].
candit_ref_dois