# Setup

In [1]:
import copy
from typing import List, Dict, Optional, Union, Tuple, Literal

In [2]:
import os
import json

import sys
import os

# Make the project root (the parent of the notebook's working directory)
# importable so the `graph` / `app` packages used below resolve.
parent_dir = os.path.dirname(os.getcwd())
# Guard the append so re-running this cell does not stack duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [3]:
from app.paper_data_collect import PaperSearch
from app.paper_similarity_calculation import PaperSim
from graph.graph_stats import get_graph_stats, get_author_stats, get_paper_stats
from graph.graph_viz import GraphViz
from graph.paper_graph import PaperGraph

In [4]:
# Tunable settings for the pipeline, grouped by the stage that reads them.
params = dict(
    # params for paper collection
    search_limit=100,
    recommend_limit=50,
    citation_limit=100,
    # params for expanded search
    similarity_threshold=0.7,
    top_k_similar_papers=20,
    top_l_key_authors=20,
    # params for stopping criteria
    min_paper_cnt=50,
    min_author_cnt=50,
    min_corssref_papers=20,  # key spelling kept as-is: the routing cell reads this exact key
    min_key_authors=10,
)

In [5]:
# Model configuration for the driving example.
# Both keys are read from the same environment variable; os.getenv yields
# None when the variable is not set.
llm_model_name, embed_model_name = "gemini-2.0-flash", "models/text-embedding-004"
llm_api_key = os.getenv('GEMINI_API_KEY_3')
embed_api_key = os.getenv('GEMINI_API_KEY_3')

In [6]:
# --- user input: topics and seed papers ---
research_topics = ["llm literature review"]

# seed papers identified by DOI
seed_dois = [
    # AutoSurvey: Large Language Models Can Automatically Write Surveys
    '10.48550/arXiv.2406.10252',
    # Generative Adversarial Reviews: When LLMs Become the Critic
    '10.48550/arXiv.2412.10415',
    # A Literature Review of Literature Reviews in Pattern Analysis and Machine Intelligence
    '10.48550/arXiv.2402.12928',
]

# seed papers identified by title only
seed_titles = [
    'PaperRobot: Incremental Draft Generation of Scientific Ideas',
    'From Hypothesis to Publication: A Comprehensive Survey of AI-Driven Research Support Systems',
]

# --- search constraints: publication date window and fields of study ---
from_dt, to_dt = '2020-01-01', '2025-04-30'
fields_of_study = ['Computer Science']

In [7]:
# Create the paper-search client from the user inputs: first the search
# constraints, then the seeds (topics, DOIs, titles).
ps = PaperSearch(
    from_dt=from_dt,
    to_dt=to_dt,
    fields_of_study=fields_of_study,
    seed_research_topics=research_topics,
    seed_paper_dois=seed_dois,
    seed_paper_titles=seed_titles,
)

2025-04-23 17:15:03,852 - INFO - SemanticScholarKit initialized with max_concurrency=20, sleep_interval=3.0s
2025-04-23 17:15:03,853 - INFO - SemanticScholarKit initialized with max_concurrency=20, sleep_interval=3.0s
2025-04-23 17:15:03,854 - INFO - SemanticScholarKit initialized with max_concurrency=20, sleep_interval=3.0s
2025-04-23 17:15:03,855 - INFO - SemanticScholarKit initialized with max_concurrency=20, sleep_interval=3.0s


# Round 1

In [8]:
# Round-1 settings.
# NOTE(review): these limits differ from the values in `params`
# (recommend_limit 50, search_limit 100) and no visible cell reads them —
# confirm which source of truth is intended.
iteration = 1
citation_limit = recommend_limit = 100
search_limit = 50

# Same date window and field constraint as the user-input cell.
from_dt, to_dt = '2020-01-01', '2025-04-30'
fields_of_study = ['Computer Science']

## Initial Search

### Collection

In [None]:
# --- ROUND 1 QUERY ---
# initial query for seed papers basic information
print(f"--- Running Round {iteration} Query for Papers Information ---")
await ps.consolidated_search(
    topics=ps.research_topics,
    paper_titles=ps.seed_paper_titles,
    paper_dois=ps.seed_paper_dois,
    # ref_paper_dois=ps.seed_paper_dois,
    # citing_paper_dois=ps.seed_paper_dois,
    # pos_paper_dois=ps.seed_paper_dois,
    from_dt=ps.from_dt,
    to_dt=ps.to_dt,
    fields_of_study=ps.fields_of_study
)

2025-04-23 17:15:08,391 - INFO - Search 1 topics, 2 paper titles and 3 for paper information.
2025-04-23 17:15:08,392 - INFO - Fetching papers by 3 DOIs...
2025-04-23 17:15:08,393 - INFO - Fetching papers by title: 'PaperRobot: Incremental Draft Generation of Scientific Ideas...'
2025-04-23 17:15:08,394 - INFO - Fetching papers by title: 'From Hypothesis to Publication: A Comprehensive Survey of AI-Driven Research Support Systems...'
2025-04-23 17:15:08,394 - INFO - Fetching papers by topic: 'llm literature review...'
2025-04-23 17:15:08,395 - INFO - Running 4 initial query tasks concurrently...
2025-04-23 17:15:08,397 - INFO - async_search_paper_by_ids: Creating 1 tasks for 3 IDs.
2025-04-23 17:15:08,398 - INFO - async_search_paper_by_ids: Gathering 1 tasks...
2025-04-23 17:15:08,398 - INFO - async_search_paper_by_keywords: Searching papers by keyword: 'PaperRobot: Incremental Draft Generation of Scient...' with effective limit 50.
2025-04-23 17:15:08,399 - INFO - _sync_search_paper_b

--- Running Round 1 Query for Papers Information ---


2025-04-23 17:15:09,168 - INFO - HTTP Request: GET https://api.semanticscholar.org/graph/v1/paper/search?query=llm%20literature%20review&fieldsOfStudy=Computer%20Science&fields=abstract,authors,citationCount,citationStyles,corpusId,externalIds,fieldsOfStudy,influentialCitationCount,isOpenAccess,journal,openAccessPdf,paperId,publicationDate,publicationTypes,publicationVenue,referenceCount,s2FieldsOfStudy,title,url,venue,year&offset=0&limit=50 "HTTP/1.1 429 "
2025-04-23 17:15:09,326 - INFO - HTTP Request: POST https://api.semanticscholar.org/graph/v1/paper/batch?fields=abstract,authors,citationCount,citationStyles,corpusId,externalIds,fieldsOfStudy,influentialCitationCount,isOpenAccess,journal,openAccessPdf,paperId,publicationDate,publicationTypes,publicationVenue,referenceCount,s2FieldsOfStudy,title,url,venue,year "HTTP/1.1 200 OK"
2025-04-23 17:15:09,331 - INFO - _sync_get_papers: API call successful for batch (first 5: ['10.48550/arXiv.2406.10252', '10.48550/arXiv.2412.10415', '10.485

In [None]:
# Number of node records and edge records collected so far.
print(*(len(records) for records in (ps.nodes_json, ps.edges_json)))

In [None]:
# NOTE(review): neither `results` nor `Paper` is assigned or imported in any
# visible cell, so this check presumably raises NameError on a fresh kernel —
# confirm, then restore its setup or drop the cell.
isinstance(results, Paper.Paper)

In [None]:
# Print every node labelled exactly ['Author'] whose properties carry a
# truthy `is_complete` flag.
for item in ps.nodes_json:
    if item.get('labels') != ['Author']:
        continue
    author_props = item.get('properties', {})
    if author_props.get('is_complete'):
        print(item)

### Routing

In [None]:
# Empty candidate pools that the routing cells fill in.
candit_dois = []          # paper DOIs
candit_author_ids = []    # author ids
candit_ref_dois = []      # DOIs queued for reference exploration
candit_citing_dois = []   # DOIs queued for citing-paper exploration
candit_topics = []        # topic queries


In [None]:
# Core papers: Paper nodes whose id is one of the user-supplied seed DOIs.
seed_doi_lookup = set(seed_dois)
core_paper_dois = [node['id'] for node in ps.nodes_json
                   if node['labels'] == ['Paper'] and node['id'] in seed_doi_lookup]

# Core authors: every author id listed on a core paper.
core_paper_lookup = set(core_paper_dois)
core_author_ids = []
for item in ps.nodes_json:
    if item['id'] in core_paper_lookup and isinstance(item['properties'].get('authors'), list):
        # .get() tolerates author entries without an 'authorId' key
        authors_id = [x.get('authorId') for x in item['properties']['authors']
                      if x.get('authorId') is not None]
        core_author_ids.extend(authors_id)
# Order-preserving de-duplication: list(set(...)) would give an ordering that
# can change between kernel sessions, making downstream candidate lists
# non-reproducible.
core_author_ids = list(dict.fromkeys(core_author_ids))

In [None]:
# Similarity helper configured with the embedding credentials from the setup
# cells. PaperSim itself is imported in the notebook's top import cell, so all
# dependencies are declared in one place.
sim = PaperSim(
    embed_api_key=embed_api_key,
    embed_model_name=embed_model_name,
)

In [None]:
# generate paper graph from nodes / edges json
G_pre = PaperGraph(name='Paper Graph Init Search')
G_pre.add_graph_nodes(ps.nodes_json)
G_pre.add_graph_edges(ps.edges_json)

# check core paper completeness
# paper complete
candit_dois.extend([doi for doi in core_paper_dois if doi not in G_pre.nodes()]) 
# citation complete
candit_ref_dois.extend([doi for doi in core_paper_dois if doi not in ps.explored_nodes['reference']])
candit_citing_dois.extend([doi for doi in core_paper_dois if doi not in ps.explored_nodes['citing']])
# author complete
candit_author_ids.extend([aid for aid in core_author_ids if aid not in ps.explored_nodes['author']]) 

# --- Graph Stat ---
g_stat = get_graph_stats(G_pre)   # graph stats

# valid paper with abstracts
complete_paper_json = [node for node in ps.nodes_json 
                        if node['labels'] == ['Paper'] 
                        and node['properties'].get('title') is not None and node['properties'].get('abstract') is not None]
complete_paper_dois = [node['id'] for node in complete_paper_json]

# --- SIMILARITY CALCULATION ---
# check if similarity with edge type
edge_types = [x[0] for x in g_stat['edge_type']]

if 'SIMILAR_TO' not in edge_types:
    # calculate paper nodes similarity
    semantic_similar_pool = await sim.cal_embed_and_similarity(
        paper_nodes_json = complete_paper_json,
        paper_dois_1 = complete_paper_dois, 
        paper_dois_2 = complete_paper_dois,
        similarity_threshold=params['similarity_threshold'],
        )

    # add similarity edges to graph
    G_pre.add_graph_edges(semantic_similar_pool)  

# --- PRUNNING ---
# pruning by connectivity
sub_graphs = G_pre.find_wcc_subgraphs(target_nodes=core_paper_dois)
if sub_graphs is not None and len(sub_graphs) > 0:
    G_post  = sub_graphs[0]
    # get stats after prunning
    g_stat = get_graph_stats(G_post)
else:
    G_post = G_pre

# --- GET KEY STATS ---
# check paper count and author count
paper_cnt, author_cnt = 0, 0
for item in g_stat['node_type']:
    if item[0] == 'Paper':
        paper_cnt = item[1]
    elif item[0] == 'Author':
        author_cnt = item[1]

paper_stats = get_paper_stats(G_post, core_paper_dois)  # paper stats on graph
author_stats = get_author_stats(G_post, core_author_ids)  # author stats on graph

# check crossref
crossref_stats = []
for x in paper_stats:
    if (x['if_seed'] == False  # exclude seed papers 
        and x['local_citation_cnt'] > min(len(core_paper_dois),  5)):  # select most refered papers in graph
        crossref_stats.append(x)

# check key authors
key_authors_stats = []
for x in author_stats:
    if (x['if_seed'] == False  # exclude seed authors 
        and x['local_paper_cnt'] > min(len(core_paper_dois), 5)):  # select most refered papers in graph
        key_authors_stats.append(x)

# check paper similarity
sorted_paper_similarity = sorted(paper_stats, key=lambda x:x['max_sim_to_seed'], reverse=True)

# if cross ref insufficient, further expand similar papers on citation chain
if len(crossref_stats) < params['min_corssref_papers']:
    # filter top similar papers (to help build crossref)
    i = 0
    for item in sorted_paper_similarity:
        if i < params['top_k_similar_papers']:
            if item['if_seed'] == False and item['doi'] not in ps.explored_nodes['reference']:
                candit_ref_dois.append(item['doi'])
                i += 1
        else:
            break

# if key authors not have complete information
if len(key_authors_stats) > params['min_key_authors']:
    sorted_key_authors = sorted(key_authors_stats, key=lambda x:x['local_paper_cnt'], reverse=True)
    # filter key authors (to amplify information)
    i = 0
    for item in sorted_key_authors:
        if i < params['top_l_key_authors']:
            if item['if_seed'] == False and item['author_id'] not in ps.explored_nodes['author']:
                candit_author_ids.append(item['author_id'])
                i += 1
        else:
            break

In [None]:
# Inspect the key-author stats selected in the routing cell.
key_authors_stats

In [None]:
# Inspect the candidate paper DOIs (core papers missing from the graph).
candit_dois

In [None]:
# Number of distinct authors found on the core papers.
len(core_author_ids)

In [None]:
# Number of distinct candidate author ids (set() ignores repeated entries).
len(set(candit_author_ids))

In [None]:
# Inspect the DOIs queued for reference exploration.
candit_ref_dois

In [None]:
# Number of DOIs queued for reference exploration.
len(candit_ref_dois)

In [None]:
# Number of DOIs queued for citing-paper exploration.
len(candit_citing_dois)

In [None]:
# Reset every candidate pool to an empty list.
# NOTE(review): on a top-to-bottom run this discards the candidates the
# routing cell above just collected — confirm that is intended.
candit_dois, candit_author_ids = [], []
candit_ref_dois, candit_citing_dois = [], []
candit_topics = []

In [None]:
# For every Topic node in the pruned graph, count the incoming DISCUSS edges.
# The topic name is read from the attribute dict that nodes(data=True) already
# yields. The previous `G_post.nodes(tid).get(...)` passed `tid` as the `data`
# argument of the node view (assuming PaperGraph follows the networkx
# node-view API, as the `nodes(data=True)` / `in_edges(..., data=True)` calls
# suggest), which does not return that node's attributes.
topic_ids = []
topic_stats = []
for tid, node_data in G_post.nodes(data=True):
    if node_data.get('node_type') != 'Topic':
        continue
    topic_ids.append(tid)
    related_papers = sum(
        1
        for _src, _dst, edge_data in G_post.in_edges(tid, data=True)
        if edge_data.get('relationshipType') == 'DISCUSS'
    )
    topic_stats.append({
        'topic_id': tid,
        'topic_name': node_data.get('topic_name'),
        'related_papers': related_papers,
    })

In [None]:
core_paper_json = [x for x in complete_paper_json if x['id'] in core_paper_dois]
if len(topic_stats) < 4:
    keywords_topics_json = await ps.topic_generation(
        seed_paper_json=core_paper_json,
        llm_api_key=ps.llm_api_key,
        llm_model_name=ps.llm_model_name,
        round=1
        )
    candit_topics.extend(keywords_topics_json.get('queries', []))

In [None]:
# Inspect the topic queries collected by the topic-generation cell.
candit_topics

In [None]:
# Inspect the key-author stats again (same variable as shown earlier).
key_authors_stats

In [None]:
# Look up a hard-coded node id in the pruned graph.
# NOTE(review): if PaperGraph follows the networkx API, nodes(x) treats x as
# the `data` attribute key rather than a node id; `G_post.nodes['2112678409']`
# is presumably what was intended — confirm.
G_post.nodes('2112678409')

In [None]:
# Inspect the candidate author ids.
candit_author_ids

In [None]:
# NOTE(review): under IPython's default display settings only a trailing
# expression is shown, so the two bare names below display nothing here.
candit_dois
candit_author_ids
# Clears the reference / citing candidate pools (previous contents are dropped).
candit_ref_dois, candit_citing_dois = [], []

In [None]:
# NOTE(review): `item` is only a leftover loop variable from earlier cells;
# what it shows depends on which loop ran last.
item

In [None]:
# Print the title of every paper that has both a title and an abstract.
for item in complete_paper_json:
    paper_props = item.get('properties', {})
    print(paper_props.get('title'))

In [None]:
# Inspect the DOIs queued for citing-paper exploration.
candit_citing_dois

In [None]:
# Inspect the DOIs queued for reference exploration.
candit_ref_dois