# Setup

In [1]:
import copy
from typing import List, Dict, Optional, Union, Tuple, Literal

In [2]:
import os
import json

import sys
import os

# Put the parent of the current working directory on the import path
# (presumably where the `graph` and `collect` packages live — confirm).
parent_dir = os.path.dirname(os.getcwd())
# Guard the append so re-running this cell does not add duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [3]:
from graph.paper_graph import PaperGraph
from graph.graph_viz import GraphViz

In [4]:
# --- Driving example: credentials, model names and seed inputs ---
# The LLM and the embedding model both read the same environment variable.
llm_api_key = os.getenv('GEMINI_API_KEY_3')
llm_model_name = "gemini-2.0-flash"
embed_api_key = os.getenv('GEMINI_API_KEY_3')
embed_model_name = "models/text-embedding-004"

# Seed inputs: one free-text topic, three DOIs and two paper titles.
research_topics = ["llm literature review"]
seed_dois = [
    '10.48550/arXiv.2406.10252',  # AutoSurvey: Large Language Models Can Automatically Write Surveys
    '10.48550/arXiv.2412.10415',  # Generative Adversarial Reviews: When LLMs Become the Critic
    '10.48550/arXiv.2402.12928',  # A Literature Review of Literature Reviews in Pattern Analysis and Machine Intelligence
]
seed_titles = [
    'PaperRobot: Incremental Draft Generation of Scientific Ideas',
    'From Hypothesis to Publication: A Comprehensive Survey of AI-Driven Research Support Systems',
]

In [5]:
# Fixed per-request limits.
citation_limit = 100
author_paper_limit = 10

# With fewer than 10 seed DOIs or fewer than 10 seed titles, use the larger
# search / recommendation limits (100); otherwise use 50 for both.
search_limit = recommend_limit = (
    100 if (len(seed_dois) < 10 or len(seed_titles) < 10) else 50
)

In [6]:
from collect.paper_data_collect import PaperCollector

# Build the collector from the seed inputs and limits configured in the cells above.
# The date strings and the field-of-study list are hard-coded here; the four
# *_limit arguments forward the notebook-level variables of the same name.
ps = PaperCollector(   
    seed_research_topics = research_topics,   
    seed_paper_titles = seed_titles, 
    seed_paper_ids = seed_dois,
    from_dt = '2020-01-01',
    to_dt = '2025-04-30',
    fields_of_study = ['Computer Science'],
    author_paper_limit = author_paper_limit,
    search_limit = search_limit,
    recommend_limit = recommend_limit,
    citation_limit = citation_limit
    )

2025-04-29 10:08:04,985 - SemanticScholarKit - INFO - SemanticScholarKit initialized with: max_concurrency=20, max_retry=20, sleep_interval=3.0s
2025-04-29 10:08:04,985 - INFO - SemanticScholarKit initialized with: max_concurrency=20, max_retry=20, sleep_interval=3.0s
2025-04-29 10:08:04,985 - SemanticScholarKit - INFO - SemanticScholarKit initialized with: max_concurrency=20, max_retry=20, sleep_interval=3.0s
2025-04-29 10:08:04,985 - INFO - SemanticScholarKit initialized with: max_concurrency=20, max_retry=20, sleep_interval=3.0s


# Data Collection

## Initial Search

### Data Generation

Get paper data

In [7]:
iteration = 1

In [8]:
# Initial search: only the seed titles and seed DOIs are passed; every other
# consolidated_search argument is left at its default.
await ps.consolidated_search(
    paper_titles = seed_titles,
    paper_ids = seed_dois
)

2025-04-29 10:08:06,770 - SemanticScholarKit - INFO - consolidated_search: Starting...
2025-04-29 10:08:06,770 - INFO - consolidated_search: Starting...
2025-04-29 10:08:06,772 - SemanticScholarKit - INFO - consolidated_search: Running 1 sub-tasks concurrently...
2025-04-29 10:08:06,772 - INFO - consolidated_search: Running 1 sub-tasks concurrently...
2025-04-29 10:08:06,773 - INFO - Search 2 paper titles and 3 for paper information.
2025-04-29 10:08:06,774 - INFO - paper_search: Creating task for 3 IDs...
2025-04-29 10:08:06,774 - INFO - paper_search: Creating 2 tasks for titles...
2025-04-29 10:08:06,775 - INFO - paper_search: Running 3 query tasks concurrently...
2025-04-29 10:08:06,776 - SemanticScholarKit - INFO - get_papers: Creating 1 tasks for 1 IDs.
2025-04-29 10:08:06,776 - INFO - get_papers: Creating 1 tasks for 1 IDs.
2025-04-29 10:08:06,776 - SemanticScholarKit - INFO - get_papers: Gathering 1 tasks...
2025-04-29 10:08:06,776 - INFO - get_papers: Gathering 1 tasks...
2025-

### Post-Processing

Paper post process

In [9]:
# Turn the raw search results into node/edge records (read later via
# ps.nodes_json / ps.edges_json). With if_supplement_abstract=True the run log
# shows an extra fetch for paper nodes that are missing abstracts.
await ps.post_process(if_supplement_abstract=True)

2025-04-29 10:08:08,397 - SemanticScholarKit - INFO - post_process: Starting data processing...
2025-04-29 10:08:08,397 - INFO - post_process: Starting data processing...
2025-04-29 10:08:08,398 - SemanticScholarKit - INFO - Processing 5 raw paper entries...
2025-04-29 10:08:08,398 - INFO - Processing 5 raw paper entries...
2025-04-29 10:08:08,400 - SemanticScholarKit - INFO - Generated 75 nodes/edges from papers.
2025-04-29 10:08:08,400 - INFO - Generated 75 nodes/edges from papers.
2025-04-29 10:08:08,401 - SemanticScholarKit - INFO - Total items after paper processing: 75
2025-04-29 10:08:08,401 - INFO - Total items after paper processing: 75
2025-04-29 10:08:08,402 - SemanticScholarKit - INFO - No author data in pool to process.
2025-04-29 10:08:08,402 - INFO - No author data in pool to process.
2025-04-29 10:08:08,403 - SemanticScholarKit - INFO - No topic data in pool to process.
2025-04-29 10:08:08,403 - INFO - No topic data in pool to process.
2025-04-29 10:08:08,405 - Semantic

### Routing

Plan next move

In [10]:
# On the first iteration, every Paper node and every Author node collected so
# far becomes part of the "core" id sets.
if iteration == 1:
    core_paper_ids = {n['id'] for n in ps.nodes_json if n['labels'] == ['Paper']}
    core_author_ids = {n['id'] for n in ps.nodes_json if n['labels'] == ['Author']}
    print(len(core_paper_ids), len(core_author_ids))

5 33


In [11]:
# Record the core paper ids under the 'paper' key of the collector's explored-node registry.
ps.explored_nodes['paper'].update(core_paper_ids)

In [12]:
# Core authors that are not yet registered under explored_nodes['author'].
author_ids = [
    aid
    for aid in core_author_ids
    if aid not in ps.explored_nodes['author']
]

In [13]:
# Core papers whose references / citing papers have not been fetched yet.
ref_ids = [
    paper_id
    for paper_id in core_paper_ids
    if paper_id not in ps.explored_nodes['reference']
]
cit_ids = [
    paper_id
    for paper_id in core_paper_ids
    if paper_id not in ps.explored_nodes['citing']
]

In [14]:
# recommendation
# Always define both inputs: the expanded consolidated_search call reads them
# unconditionally, so leaving them unassigned would raise NameError (or reuse
# stale values from an earlier run) when the conditions below are not met.
# NOTE(review): None mirrors the other skipped inputs passed to
# consolidated_search (e.g. paper_titles = None) — confirm it is accepted here.
pos_paper_ids = None
neg_paper_ids = None

# Request recommendations only when none have been explored yet and there are
# more than 3 core papers to use as positive examples.
if len(ps.explored_nodes['recommendation']) == 0 and len(core_paper_ids) > 3:
    pos_paper_ids = list(core_paper_ids)
    neg_paper_ids = []

In [15]:
# topics generation
core_paper_json = [x for x in ps.nodes_json if x['id'] in core_paper_ids]
if len(ps.explored_nodes['topic']) < 4:  # explored topic less than 4, generate new topics
    await ps.topic_generation(
        paper_json = core_paper_json,
        llm_api_key = llm_api_key,
        llm_model_name = llm_model_name,
        )

2025-04-29 10:08:14,309 - INFO - Use LLM to identify key related topics.
2025-04-29 10:08:14,311 - SemanticScholarKit - INFO - Generating related topics for 5 seed papers...
2025-04-29 10:08:14,311 - INFO - Generating related topics for 5 seed papers...
2025-04-29 10:08:14,313 - SemanticScholarKit - INFO - Calling LLM to generate topics...
2025-04-29 10:08:14,313 - INFO - Calling LLM to generate topics...
2025-04-29 10:08:14,315 - INFO - AFC is enabled with max remote calls: 10.
2025-04-29 10:08:16,785 - SemanticScholarKit - INFO - LLM generated topics: {"field_of_study": ["Natural Language Processing", "Artificial Intelligence", "Information Retrieval", "Meta-Research"], "keywords_and_topics": ["large language models", "literature review automation", "peer review process", "AI-driven research support systems", "knowledge graph"], "tags": ["generative adversarial networks", "AutoSurvey", "PaperRobot", "LLM agents", "scientific knowledge synthesis"], "queries": ["automated literature re

In [16]:
# identify unexplored topics
# Convert the topic records to key-value form: {topic: [paperId, ...]}.
topic_pids = {}
for record in ps.data_pool['topic']:
    topic_name, paper_id = record['topic'], record['paperId']
    topic_pids.setdefault(topic_name, []).append(paper_id)

# Topics backed by fewer than 10 papers are treated as insufficiently covered.
topics = [name for name, paper_ids in topic_pids.items() if len(paper_ids) < 10]
print(topics)
    

['LLM for peer review', 'AI research assistant', 'automated literature review generation', 'knowledge graph scientific discovery']


## Expanded Search

### Data Generation

In [17]:
# Expanded search: under-covered topics, unexplored core authors, references
# and citations of the core papers, and recommendation seeds are all requested
# in one consolidated call. Limits, date range and fields of study are read
# back from the collector instance.
# NOTE(review): pos_paper_ids / neg_paper_ids are only assigned conditionally
# in the recommendation cell — make sure they exist before running this cell.
await ps.consolidated_search(
    topics = topics,
    paper_titles = None,
    paper_ids = None,
    author_ids = author_ids,
    author_paper_ids = None,
    ref_paper_ids = ref_ids,
    cit_paper_ids = cit_ids,
    pos_paper_ids = pos_paper_ids,
    neg_paper_ids = neg_paper_ids,
    author_limit = 10,  # NOTE(review): hard-coded; possibly meant to follow author_paper_limit — confirm
    search_limit = ps.search_limit,
    citation_limit = ps.citation_limit,
    recommend_limit = ps.recommend_limit,
    from_dt = ps.from_dt,
    to_dt = ps.to_dt,
    fields_of_study = ps.fields_of_study
)

2025-04-29 10:11:03,787 - SemanticScholarKit - INFO - consolidated_search: Starting...
2025-04-29 10:11:03,787 - INFO - consolidated_search: Starting...
2025-04-29 10:11:03,788 - SemanticScholarKit - INFO - consolidated_search: Running 5 sub-tasks concurrently...
2025-04-29 10:11:03,788 - INFO - consolidated_search: Running 5 sub-tasks concurrently...
2025-04-29 10:11:03,790 - INFO - topic_search: Searching 4 topics.
2025-04-29 10:11:03,790 - INFO - topic_search: Running 4 topic search tasks concurrently...
2025-04-29 10:11:03,792 - INFO - authors_search: Searching 33 authors.
2025-04-29 10:11:03,793 - SemanticScholarKit - INFO - get_authors: Creating 1 tasks for 1 IDs.
2025-04-29 10:11:03,793 - INFO - get_authors: Creating 1 tasks for 1 IDs.
2025-04-29 10:11:03,795 - SemanticScholarKit - INFO - get_authors: Gathering 1 tasks...
2025-04-29 10:11:03,795 - INFO - get_authors: Gathering 1 tasks...
2025-04-29 10:11:03,796 - INFO - reference_search: Fetching references for 5 papers (limit p

Paper post-processing

In [18]:
# Re-run post-processing on the expanded pool. The run log shows that with
# if_supplement_abstract=True the collector fetches abstracts for paper nodes
# that lack one (1150 in the recorded run).
await ps.post_process(if_supplement_abstract=True)

2025-04-29 10:23:30,224 - SemanticScholarKit - INFO - post_process: Starting data processing...
2025-04-29 10:23:30,224 - INFO - post_process: Starting data processing...
2025-04-29 10:23:30,226 - SemanticScholarKit - INFO - Processing 1862 raw paper entries...
2025-04-29 10:23:30,226 - INFO - Processing 1862 raw paper entries...
2025-04-29 10:23:30,293 - SemanticScholarKit - INFO - Generated 18884 nodes/edges from papers.
2025-04-29 10:23:30,293 - INFO - Generated 18884 nodes/edges from papers.
2025-04-29 10:23:30,296 - SemanticScholarKit - INFO - Found 1150 paper nodes missing abstracts. Attempting to supplement...
2025-04-29 10:23:30,296 - INFO - Found 1150 paper nodes missing abstracts. Attempting to supplement...
2025-04-29 10:23:30,297 - SemanticScholarKit - INFO - supplement_abstract: Fetching abstracts for 1150 papers...
2025-04-29 10:23:30,297 - INFO - supplement_abstract: Fetching abstracts for 1150 papers...
2025-04-29 10:23:30,297 - SemanticScholarKit - INFO - get_papers: C

In [None]:
print(len(ps.data_pool['paper']), len(ps.data_pool['author']))

In [23]:
# generate paper graph from nodes / edges json
G_pre = PaperGraph(name='Paper Graph 1')
G_pre.add_graph_nodes(ps.nodes_json)
G_pre.add_graph_edges(ps.edges_json)

In [31]:
import numpy as np
np.array(list({'a':1, 'b':2}.values()))

array([1, 2])

In [28]:
G_pre.nodes['2335569348']

{'authorId': '2335569348',
 'name': 'Narimasa Watanabe',
 'nodeType': 'Author',
 'externalIds': {'DBLP': ['Narimasa Watanabe']},
 'url': 'https://www.semanticscholar.org/author/2335569348',
 'paperCount': 3,
 'citationCount': 1,
 'hIndex': 1}

In [26]:
ps.nodes_json[1]

{'type': 'node',
 'id': '2108024279',
 'labels': ['Author'],
 'properties': {'authorId': '2108024279',
  'name': 'Yidong Wang',
  'nodeType': 'Author'}}

In [27]:
author_ids

['2335569348',
 '51056442',
 '2118640235',
 '152971314',
 '2337225259',
 '2293356300',
 '2216503559',
 '2674998',
 '2265930173',
 '2257010530',
 '2284827556',
 '2349068478',
 '34170717',
 '2112678409',
 '2335566763',
 '145081697',
 '144016781',
 '2307012818',
 '2328342585',
 '2273779175',
 '2268132119',
 '2286328804',
 '143977268',
 '2116271777',
 '2265878959',
 '1786863',
 '2259709647',
 '2108024279',
 '2275569993',
 '2262020955',
 '2284825678',
 '2289004972',
 '2284824283']

In [21]:
from datetime import datetime
from dateutil.relativedelta import relativedelta

def calculate_months_since(date_str, today=None):
    """
    Return the number of whole months between a 'yyyy-mm-dd' date and today.

    Args:
        date_str (str): Date string in 'yyyy-mm-dd' format.
        today (datetime.date, optional): Reference date to measure up to.
            Defaults to the current local date; pass a fixed date to get a
            reproducible result.

    Returns:
        int | None: ``years * 12 + months`` of the relativedelta between the
        reference date and ``date_str``, or None when ``date_str`` is not a
        string in 'yyyy-mm-dd' format.
    """
    try:
        given_date = datetime.strptime(date_str, '%Y-%m-%d').date()
    except (ValueError, TypeError):
        # ValueError: malformed date string.
        # TypeError: non-string input such as None (e.g. a missing date field).
        return None

    if today is None:
        today = datetime.now().date()
    difference = relativedelta(today, given_date)
    return difference.years * 12 + difference.months

# Example usage: one well-formed date, then one in an unsupported format.
date_to_calculate = '2024-01-15'
months = calculate_months_since(date_to_calculate)
print(
    f"日期 {date_to_calculate} 距今有 {months} 个月。"
    if months is not None
    else "输入的日期格式不正确，请使用 'yyyy-mm-dd' 格式。"
)

date_to_calculate_invalid = '2024/01/15'
months_invalid = calculate_months_since(date_to_calculate_invalid)
print(
    f"日期 {date_to_calculate_invalid} 距今有 {months_invalid} 个月。"
    if months_invalid is not None
    else f"输入的日期格式 '{date_to_calculate_invalid}' 不正确，请使用 'yyyy-mm-dd' 格式。"
)

日期 2024-01-15 距今有 15 个月。
输入的日期格式 '2024/01/15' 不正确，请使用 'yyyy-mm-dd' 格式。


In [19]:
ps.edges_json[0]

{'type': 'relationship',
 'relationshipType': 'WRITES',
 'startNodeId': '2108024279',
 'endNodeId': '9e57dda195973c4b6c81386b1cc44595ecfd4697',
 'properties': {'authorOrder': 1}}

In [None]:
# --- Graph Stat ---
from graph.graph_stats import get_graph_stats, get_author_stats, get_paper_stats
g_stat = get_graph_stats(G_pre)   # graph stats

In [None]:
paper_stats = get_paper_stats(G_pre, core_paper_ids)  # paper stats on graph
author_stats = get_author_stats(G_pre, core_author_ids)  # author stats on graph

In [None]:
# check crossref
# Keep non-seed papers whose in-graph citation count exceeds
# min(number of core papers, 5), i.e. the most referred-to papers in the graph.
crossref_stats = [
    stat
    for stat in paper_stats
    if stat['if_seed'] == False
    and stat['local_citation_cnt'] > min(len(core_paper_ids), 5)
]

In [None]:
# calculate similarity
from collect.paper_similarity_calculation import PaperSim

# Similarity helper configured with the embedding credentials from the config cell.
sim = PaperSim(
    embed_api_key = embed_api_key,
    embed_model_name = embed_model_name
)

# --- SIMILARITY CALCULATION ---
# Collect the edge-type names already present in the graph stats
# (first element of each g_stat['edge_type'] entry).
edge_types = [x[0] for x in g_stat['edge_type']]

# Only Paper nodes that have both a title and an abstract are compared.
complete_paper_json = [node for node in ps.nodes_json 
                        if node['labels'] == ['Paper'] 
                        and node['properties'].get('title') is not None and node['properties'].get('abstract') is not None]
# NOTE(review): these are node ids; the name suggests DOIs — confirm they coincide.
complete_paper_dois = [node['id'] for node in complete_paper_json]

# Skip the computation when the stats already list SIMILAR_TO edges.
if 'SIMILAR_TO' not in edge_types:
    # Compare the complete papers against themselves (same id list on both
    # sides), using a similarity threshold of 0.7.
    semantic_similar_pool = await sim.cal_embed_and_similarity(
        paper_nodes_json = complete_paper_json,
        paper_dois_1 = complete_paper_dois, 
        paper_dois_2 = complete_paper_dois,
        similarity_threshold = 0.7,
        )

    # Add the returned similarity edges to the pre-pruning graph.
    G_pre.add_graph_edges(semantic_similar_pool)  

In [None]:
# --- PRUNING ---
# Prune by connectivity: keep the first subgraph returned for the core papers
# (presumably weakly connected components, per the method name — confirm);
# fall back to the unpruned graph when nothing is returned.
sub_graphs = G_pre.find_wcc_subgraphs(target_nodes=core_paper_ids)
if sub_graphs is not None and len(sub_graphs) > 0:
    G_post = sub_graphs[0]
else:
    G_post = G_pre

# Refresh the stats in both branches so g_stat always describes G_post; the
# previous g_stat was computed before the similarity edges were added to G_pre.
g_stat = get_graph_stats(G_post)

In [None]:
# Recompute paper and author stats on the pruned graph.
paper_stats = get_paper_stats(G_post, core_paper_ids)
author_stats = get_author_stats(G_post, core_author_ids)

# check crossref: non-seed papers whose in-graph citation count exceeds
# min(number of core papers, 5)
crossref_stats = [
    stat
    for stat in paper_stats
    if stat['if_seed'] == False
    and stat['local_citation_cnt'] > min(len(core_paper_ids), 5)
]

# check key authors: non-seed authors whose in-graph paper count exceeds
# min(number of core papers, 5)
key_authors_stats = [
    stat
    for stat in author_stats
    if stat['if_seed'] == False
    and stat['local_paper_cnt'] > min(len(core_paper_ids), 5)
]

In [None]:
# check paper similarity
sorted_paper_similarity = sorted(paper_stats, key=lambda x:x['max_sim_to_seed'], reverse=True)

In [None]:
# If cross-references are insufficient (< 20), expand highly similar papers
# along the citation chain to help build them.
ref_ids = []
if len(crossref_stats) < 20:
    # sorted_paper_similarity is ordered by max_sim_to_seed, highest first, so
    # the scan can stop at the first paper that is not above the threshold.
    for item in sorted_paper_similarity:
        if not (item['max_sim_to_seed'] > 0.7):
            break
        # Seed papers and papers whose references were already fetched are
        # skipped rather than ending the scan, so later candidates are still seen.
        # NOTE(review): 'global_citaion_cnt' is kept as spelled — confirm it
        # matches the key produced by get_paper_stats.
        if (item['if_seed'] == False
                and item['doi'] not in ps.explored_nodes['reference']
                and item['global_citaion_cnt'] > 10):
            ref_ids.append(item['doi'])

In [None]:
ref_ids = ref_ids[0:20]

In [None]:
# Pick up to 50 not-yet-explored key authors, ranked by their in-graph paper count.
# NOTE(review): the expansion only runs when there are MORE than 20 key authors — confirm this is intended.
author_ids = []
if len(key_authors_stats) > 20:
    sorted_key_authors = sorted(
        key_authors_stats,
        key=lambda stat: stat['local_paper_cnt'],
        reverse=True,
    )
    # filter key authors (to amplify information)
    author_ids = [
        stat['author_id']
        for stat in sorted_key_authors
        if stat['if_seed'] == False
        and stat['author_id'] not in ps.explored_nodes['author']
    ]

author_ids = author_ids[:50]

In [None]:
for item in ps.nodes_json:
    if item['labels'] == ['Paper']:
        print(item['properties']['title'])
        # print(item.get('title'))

In [None]:
# Non-core endpoints of SIMILAR_TO edges (weight above 0.7) that touch a core paper.
hop_1_sim_paper_ids = []
for u, v, edge_data in G_post.edges(data=True):
    if edge_data.get('relationshipType') != 'SIMILAR_TO' or not (edge_data.get('weight') > 0.7):
        continue
    # Exactly one endpoint is a core paper: record the other endpoint.
    if (u in core_paper_ids) != (v in core_paper_ids):
        hop_1_sim_paper_ids.append(u if v in core_paper_ids else v)

In [None]:
len(hop_1_sim_paper_ids)

In [None]:
# Non-core endpoints of CITES edges that touch a core paper (either direction).
hop_1_citation_paper_ids = []
for u, v, edge_data in G_post.edges(data=True):
    if edge_data.get('relationshipType') != 'CITES':
        continue
    # Exactly one endpoint is a core paper: record the other endpoint.
    if (u in core_paper_ids) != (v in core_paper_ids):
        hop_1_citation_paper_ids.append(u if v in core_paper_ids else v)

In [None]:
# Print non-core papers that sit on a DISCUSS edge, have more than 10 citations
# and are also 1-hop similar to a core paper. Collecting them into
# hop_1_topic_paper_ids is currently disabled (see the commented append).
hop_1_topic_paper_ids = []
hop_1_sim_lookup = set(hop_1_sim_paper_ids)  # set gives O(1) membership inside the edge loop
for u, v, edge_data in G_post.edges(data=True):
    if edge_data.get('relationshipType') != 'DISCUSS' or u in core_paper_ids:
        continue
    topic = G_post.nodes[v].get('name')
    title = G_post.nodes[u].get('title')
    # A node may lack citationCount; treat a missing/None value as 0 instead of
    # failing on `None > 10`.
    global_citation = G_post.nodes[u].get('citationCount') or 0
    if global_citation > 10 and u in hop_1_sim_lookup:
        print(topic, title, global_citation)
        # hop_1_topic_paper_ids.append(u)

In [None]:
# recommendation, author papers
# Union of the core papers with their 1-hop similar and 1-hop citation neighbours.
hop_1_expand_papers = set(core_paper_ids).union(hop_1_sim_paper_ids, hop_1_citation_paper_ids)

In [None]:
# Papers outside the 1-hop set that are joined to it by a SIMILAR_TO edge with weight above 0.7.
hop_2_sim_paper_ids = []
for u, v, edge_data in G_post.edges(data=True):
    if edge_data.get('relationshipType') != 'SIMILAR_TO' or not (edge_data.get('weight') > 0.7):
        continue
    # Exactly one endpoint is inside the 1-hop set: record the outside endpoint.
    if (u in hop_1_expand_papers) != (v in hop_1_expand_papers):
        hop_2_sim_paper_ids.append(u if v in hop_1_expand_papers else v)

In [None]:
len(hop_1_expand_papers)

In [None]:
len(set(hop_2_sim_paper_ids))

In [None]:
# TODO: unfinished cell. The original statement `for id not in hop_2_sim_paper_ids`
# is not valid Python (a loop needs `for <name> in <iterable>:` plus a body), so it
# is parked as a comment to keep "Run All" working until the intended filter over
# hop_2_sim_paper_ids is written.

In [None]:
# Graph stats pasted from an earlier run (plain text, kept for reference):
# There are 6 node types in this graph, they are:
# [('Author', 5560), ('Paper', 1933), ('Journal', 513), ('Venue', 380), ('Institution', 5), ('Topic', 4)]
# There are 7 edge types in this graph, they are:
# [('SIMILAR_TO', 20872), ('WRITES', 9366), ('RELEASES_IN', 1244), ('PRINTS_ON', 702), ('DISCUSS', 419), ('CITES', 288), ('WORKS_IN', 5)]

In [39]:
x = {'a':2, 'b':1, 'c':3}

In [40]:
# (key, value) pairs ordered by value, largest first.
x_sorted = sorted(x.items(), key=lambda pair: pair[1], reverse=True)

In [42]:
# Map each key to its 0-based position in the value-sorted order.
ranked_keys = {key: rank for rank, (key, _value) in enumerate(x_sorted)}

In [43]:
ranked_keys

{'c': 0, 'a': 1, 'b': 2}