# Research Tree PoC 20250318

## Setup

In [1]:
# Free-text topic passed to PaperExploration as `research_topic`.
research_topic = "llm literature review"
# Seed papers identified by DOI (paper title in the trailing comment).
seed_dois = ['10.48550/arXiv.2406.10252',  # AutoSurvey: Large Language Models Can Automatically Write Surveys
             '10.48550/arXiv.2412.10415',  # Generative Adversarial Reviews: When LLMs Become the Critic
             '10.48550/arXiv.2402.12928',  # A Literature Review of Literature Reviews in Pattern Analysis and Machine Intelligence
             ]
# Seed papers identified by title only (no DOI is supplied for these).
seed_titles = ['PaperRobot: Incremental Draft Generation of Scientific Ideas',
               'From Hypothesis to Publication: A Comprehensive Survey of AI-Driven Research Support Systems'
               ]

## Paper Exploration

In [2]:
import sys
import os

# The parent of the notebook's working directory is put on sys.path,
# presumably so that the project packages imported by later cells resolve.
parent_dir = os.path.dirname(os.getcwd())
print(parent_dir)
# Guarded so that re-running this cell does not stack duplicate entries on sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

/home/jiezi/Code/GitHub/ResearchTree


In [3]:
import toml

# Relative path: resolved against the notebook's current working directory.
config_file = "config.toml"
try:
    with open(config_file, 'r', encoding='utf-8') as toml_file:
        config_param = toml.load(toml_file)
except FileNotFoundError:
    # Best-effort fallback: report the problem and continue with an empty config.
    # Any setting later read from it with .get() will then come out as None.
    print(f"Config file '{config_file}' not found. Please ensure it exists.")
    config_param = {} 

In [4]:
# Resolve the nested [models.llm] / [models.embed] sections once; a missing
# section falls back to {} so the individual settings come out as None.
models_cfg = config_param.get('models', {})
llm_cfg = models_cfg.get('llm', {})
embed_cfg = models_cfg.get('embed', {})

llm_api_key = llm_cfg.get('api_key')
llm_model_name = llm_cfg.get('model_name')
embed_api_key = embed_cfg.get('api_key')
embed_model_name = embed_cfg.get('model_name')

In [5]:
from graph.paper_trace import PaperExploration
# Earlier variant kept for reference: seeds the exploration from a single DOI only.
# paperbot = PaperExploration(
#     seed_paper_dois=seed_dois[0],
#     llm_api_key = llm_api_key,
#     llm_model_name = llm_model_name,
#     embed_api_key = embed_api_key,
#     embed_model_name = embed_model_name
#     )
# Active variant: seeds the exploration with the topic, all seed DOIs and the
# title-only seeds, plus the LLM / embedding credentials read from the config.
paperbot = PaperExploration(
    research_topic=research_topic, 
    seed_paper_dois=seed_dois, 
    seed_paper_titles=seed_titles,
    llm_api_key = llm_api_key,
    llm_model_name = llm_model_name,
    embed_api_key = embed_api_key,
    embed_model_name = embed_model_name    
    )

### Get initial papers

In [7]:
paperbot.initial_paper_query(limit=50, from_dt='2023-01-01', to_dt='2025-03-24')

2025-03-24 11:31:59,741 - INFO - HTTP Request: POST https://api.semanticscholar.org/graph/v1/paper/batch?fields=abstract%2Cauthors%2CcitationCount%2CcitationStyles%2CcorpusId%2CexternalIds%2CfieldsOfStudy%2CinfluentialCitationCount%2CisOpenAccess%2Cjournal%2CopenAccessPdf%2CpaperId%2CpublicationDate%2CpublicationTypes%2CpublicationVenue%2CreferenceCount%2Cs2FieldsOfStudy%2Ctitle%2Curl%2Cvenue%2Cyear "HTTP/1.1 429 "
2025-03-24 11:32:31,146 - INFO - HTTP Request: POST https://api.semanticscholar.org/graph/v1/paper/batch?fields=abstract%2Cauthors%2CcitationCount%2CcitationStyles%2CcorpusId%2CexternalIds%2CfieldsOfStudy%2CinfluentialCitationCount%2CisOpenAccess%2Cjournal%2CopenAccessPdf%2CpaperId%2CpublicationDate%2CpublicationTypes%2CpublicationVenue%2CreferenceCount%2Cs2FieldsOfStudy%2Ctitle%2Curl%2Cvenue%2Cyear "HTTP/1.1 429 "
2025-03-24 11:33:02,709 - INFO - HTTP Request: POST https://api.semanticscholar.org/graph/v1/paper/batch?fields=abstract%2Cauthors%2CcitationCount%2CcitationStyle

In [8]:
# Seed papers: nodes labelled exactly ["Paper"] whose properties['source'] contains 'Seed'.
init_paper = [
    node
    for node in paperbot.nodes_json
    if node['labels'] == ["Paper"] and 'Seed' in node['properties']['source']
]
# Their node ids, in the same order.
init_paper_dois = [paper['id'] for paper in init_paper]

In [9]:
init_paper_dois

['10.48550/arXiv.2406.10252',
 '10.48550/arXiv.2412.10415',
 '10.48550/arXiv.2402.12928',
 '10.48550/arXiv.2503.01424']

In [10]:
print(len(paperbot.nodes_json), len(paperbot.edges_json))

370 311


### Search Citation Information

In [11]:
import time
# For every seed paper, fetch both its cited papers and its citing papers.
for paper_doi in init_paper_dois:
    paperbot.get_cited_papers(paper_doi) 
    # Fixed 5 s pause between calls — presumably to stay under the API rate limit
    # (earlier requests in this notebook were answered with HTTP 429); TODO confirm.
    time.sleep(5)
    paperbot.get_citing_papers(paper_doi) 
    time.sleep(5)

2025-03-24 11:37:24,931 - INFO - HTTP Request: GET https://api.semanticscholar.org/graph/v1/paper/10.48550/arXiv.2406.10252/references?fields=contexts%2Cintents%2CcontextsWithIntent%2CisInfluential%2Cabstract%2Cauthors%2CcitationCount%2CcitationStyles%2CcorpusId%2CexternalIds%2CfieldsOfStudy%2CinfluentialCitationCount%2CisOpenAccess%2Cjournal%2CopenAccessPdf%2CpaperId%2CpublicationDate%2CpublicationTypes%2CpublicationVenue%2CreferenceCount%2Cs2FieldsOfStudy%2Ctitle%2Curl%2Cvenue%2Cyear&offset=0&limit=100 "HTTP/1.1 200 OK"
2025-03-24 11:37:37,196 - INFO - HTTP Request: GET https://api.semanticscholar.org/graph/v1/paper/10.48550/arXiv.2406.10252/citations?fields=contexts%2Cintents%2CcontextsWithIntent%2CisInfluential%2Cabstract%2Cauthors%2CcitationCount%2CcitationStyles%2CcorpusId%2CexternalIds%2CfieldsOfStudy%2CinfluentialCitationCount%2CisOpenAccess%2Cjournal%2CopenAccessPdf%2CpaperId%2CpublicationDate%2CpublicationTypes%2CpublicationVenue%2CreferenceCount%2Cs2FieldsOfStudy%2Ctitle%2Cu

In [12]:
print(len(paperbot.nodes_json), len(paperbot.edges_json))

1971 2070


### Get Recommended Papers

In [13]:
paperbot.get_recommend_papers(paper_dois=init_paper_dois, from_dt='2022-01-01', to_dt='2025-03-13')

2025-03-24 11:39:03,324 - INFO - HTTP Request: POST https://api.semanticscholar.org/recommendations/v1/papers/?fields=abstract%2Cauthors%2CcitationCount%2CcitationStyles%2CcorpusId%2CexternalIds%2CfieldsOfStudy%2CinfluentialCitationCount%2CisOpenAccess%2Cjournal%2CopenAccessPdf%2CpaperId%2CpublicationDate%2CpublicationTypes%2CpublicationVenue%2CreferenceCount%2Cs2FieldsOfStudy%2Ctitle%2Curl%2Cvenue%2Cyear&limit=100 "HTTP/1.1 200 OK"


In [14]:
print(len(paperbot.nodes_json), len(paperbot.edges_json))

2513 2563


### Get Papers from Related Topics

In [15]:
from collections import Counter

# For every seed paper collect a prompt snippet (title + abstract) and its
# fields of study; the snippets are joined later as input text for the LLM.
domains, init_paper_info = [], []
for item in init_paper:
    properties = item.get('properties', {})
    title = properties.get('title')
    abstract = properties.get('abstract')
    info = f"<paper> TITLE: {title}\nABSTRACT: {abstract} </paper>"
    init_paper_info.append(info)
    # 'fieldsOfStudy' can be absent or None for a paper; extending a list with
    # None raises TypeError, so treat both cases as "no fields".
    domains.extend(properties.get('fieldsOfStudy') or [])

# Fail with an explicit message instead of an opaque IndexError when no seed
# paper carries any field of study.
if not domains:
    raise ValueError("No 'fieldsOfStudy' found on any seed paper; cannot determine the dominant domain.")

# Most frequent field of study across all seed papers.
domain = Counter(domains).most_common(1)[0][0]

In [16]:
paperbot.get_related_papers(domain, input_text="\n".join(init_paper_info), from_dt='2022-01-01', to_dt='2025-03-13')

2025-03-24 11:39:09,277 - INFO - AFC is enabled with max remote calls: 10.
2025-03-24 11:39:11,634 - INFO - AFC remote call 1 is done.


{'field_of_study': ['Artificial Intelligence', 'Natural Language Processing', 'Information Science', 'Meta-Research'], 'keywords_and_topics': ['large language models', 'literature reviews', 'peer review', 'AI-driven research support systems', 'generative AI'], 'tags': ['AutoSurvey', 'Generative Agent Reviewers (GAR)', 'PRISMA standards', 'bibliometric indicators', 'hypothesis formulation', 'manuscript publication'], 'queries': ['LLM automated literature review', 'AI peer review simulation', 'AI research support systems', 'evaluation of AI-generated reviews']}


2025-03-24 11:39:12,800 - INFO - HTTP Request: GET https://api.semanticscholar.org/graph/v1/paper/search?query=LLM+automated+literature+review&fields=abstract%2Cauthors%2CcitationCount%2CcitationStyles%2CcorpusId%2CexternalIds%2CfieldsOfStudy%2CinfluentialCitationCount%2CisOpenAccess%2Cjournal%2CopenAccessPdf%2CpaperId%2CpublicationDate%2CpublicationTypes%2CpublicationVenue%2CreferenceCount%2Cs2FieldsOfStudy%2Ctitle%2Curl%2Cvenue%2Cyear&offset=0&limit=100 "HTTP/1.1 429 "
2025-03-24 11:39:48,284 - INFO - HTTP Request: GET https://api.semanticscholar.org/graph/v1/paper/search?query=LLM+automated+literature+review&fields=abstract%2Cauthors%2CcitationCount%2CcitationStyles%2CcorpusId%2CexternalIds%2CfieldsOfStudy%2CinfluentialCitationCount%2CisOpenAccess%2Cjournal%2CopenAccessPdf%2CpaperId%2CpublicationDate%2CpublicationTypes%2CpublicationVenue%2CreferenceCount%2Cs2FieldsOfStudy%2Ctitle%2Curl%2Cvenue%2Cyear&offset=0&limit=100 "HTTP/1.1 200 OK"
2025-03-24 11:39:56,851 - INFO - HTTP Request:

In [17]:
# Restrict to nodes labelled exactly ["Paper"] before computing semantic relationships.
paper_nodes_json = [x for x in paperbot.nodes_json if x['labels'] == ["Paper"] ]
# Top-level await: valid in an IPython/Jupyter cell, not in a plain Python script.
# Presumably this call is what adds the SIMILAR_TO edges seen later — TODO confirm.
await paperbot.add_semantic_relationship(paper_nodes_json)

In [18]:
print(len(paperbot.nodes_json), len(paperbot.edges_json))

4257 100721


In [28]:
import json

# Persist every node as one JSON object per line (JSON Lines).
filename = "paper_nodes_json.jsonl"

# ensure_ascii=False emits non-ASCII characters verbatim, so the file is opened
# as UTF-8 explicitly; the platform's default encoding may be unable to encode
# them and would raise UnicodeEncodeError part-way through the dump.
with open(filename, 'w', encoding='utf-8') as f:
    for item in paperbot.nodes_json:
        json.dump(item, f, ensure_ascii=False)
        f.write('\n')

In [29]:
import json

# Persist every edge as one JSON object per line (JSON Lines).
filename = "paper_edges_json.jsonl"

# ensure_ascii=False emits non-ASCII characters verbatim, so the file is opened
# as UTF-8 explicitly; the platform's default encoding may be unable to encode
# them and would raise UnicodeEncodeError part-way through the dump.
with open(filename, 'w', encoding='utf-8') as f:
    for item in paperbot.edges_json:
        json.dump(item, f, ensure_ascii=False)
        f.write('\n')

Stats

In [22]:
paperbot.nodes_json[0].keys()

dict_keys(['type', 'id', 'labels', 'properties', 'source', 'sourceDesc'])

In [24]:
paperbot.nodes_json[0]['source']

['CitedPaper', 'CitedPaper']

In [None]:
paperbot.nodes_json[0]['properties'].keys()

dict_keys(['s2PaperId', 'externalIds', 'corpusId', 'publicationVenue', 's2Url', 'title', 'abstract', 'venue', 'year', 'referenceCount', 'citationCount', 'influentialCitationCount', 'isOpenAccess', 'openAccessPdf', 'fieldsOfStudy', 's2FieldsOfStudy', 'publicationTypes', 'publicationDate', 'journal', 'citationStyles', 'authors', 'version', 'arxivUrl', 'arxivId', 'DOI', 'id', 'source', 'sourceDesc'])

In [25]:
paperbot.nodes_json[0]['properties']['source']

['Seed']

In [26]:
import copy 

def remove_key_values(input_dict, keys_to_delete):
    """Return a deep copy of ``input_dict`` with the given keys removed.

    The input dictionary is never modified. Keys in ``keys_to_delete`` that
    are not present in the dictionary are ignored.
    """
    trimmed = copy.deepcopy(input_dict)
    for unwanted in keys_to_delete:
        # pop() with a default never raises KeyError for an absent key
        trimmed.pop(unwanted, None)
    # The trimmed copy is returned so the call can be chained
    return trimmed

In [27]:
# Fold the node-level provenance lists ('source' / 'sourceDesc') into
# node['properties'] and remove the node-level copies from the node itself.
for item in paperbot.nodes_json:
    properties = item.setdefault('properties', {})
    for key in ('source', 'sourceDesc'):
        # pop() mutates the node that lives in paperbot.nodes_json. Assigning a
        # trimmed copy to the loop variable would leave the list entry unchanged,
        # so the key would survive and a re-run would append the same values again.
        node_level = item.pop(key, None)
        if not (isinstance(node_level, list) and node_level):
            continue
        # Each key is merged on its own, so a missing or None counterpart of the
        # other key can never be passed to extend().
        current = properties.get(key)
        if isinstance(current, list):
            current.extend(node_level)
        else:
            properties[key] = node_level

In [34]:
set([x['labels'][0] for x in paperbot.nodes_json])

{'Author', 'Journal', 'Paper', 'Venue'}

In [37]:
len([x['id'] for x in paperbot.nodes_json]), len(set([x['id'] for x in paperbot.nodes_json]))

(4257, 4085)

In [None]:
for item in paperbot.nodes_json:
    if item['labels'] == ['Venue']:
        print(item)

In [38]:
# ref_cnt = number of distinct labels in properties['source'];
# set() collapses labels that were recorded more than once.
for item in paperbot.nodes_json:
    item['ref_cnt'] = len(set(item['properties']['source']))

In [39]:
sorted_data_lambda = sorted(paperbot.nodes_json, key=lambda item: item['ref_cnt'], reverse=True)

In [43]:
# Print the first 10 Paper nodes in ranking order; other node kinds are skipped
# and do not count towards the limit.
i = 0
for item in sorted_data_lambda:
    if i >= 10:
        break
    if item['labels'] != ['Paper']:
        continue
    print(item)
    print(item['properties']['title'], item['ref_cnt'])
    i += 1

{'type': 'node', 'id': '10.48550/arXiv.2402.12928', 'labels': ['Paper'], 'properties': {'s2PaperId': '69b53faee7ce5c007e4d3e3ea532818ed8d0645d', 'externalIds': {'DBLP': 'journals/corr/abs-2402-12928', 'ArXiv': '2402.12928', 'DOI': '10.48550/arXiv.2402.12928', 'CorpusId': 267760070}, 'corpusId': 267760070, 'publicationVenue': {'id': '1901e811-ee72-4b20-8f7e-de08cd395a10', 'name': 'arXiv.org', 'alternate_names': ['ArXiv'], 'issn': '2331-8422', 'url': 'https://arxiv.org', 'source': ['Seed'], 'sourceDesc': ['Original seed papers']}, 's2Url': 'https://www.semanticscholar.org/paper/69b53faee7ce5c007e4d3e3ea532818ed8d0645d', 'title': 'A Literature Review of Literature Reviews in Pattern Analysis and Machine Intelligence', 'abstract': 'The rapid advancements in Pattern Analysis and Machine Intelligence (PAMI) have led to an overwhelming expansion of scientific knowledge, spawning numerous literature reviews aimed at collecting and synthesizing fragmented information. This paper presents a thor

In [45]:
set([x['relationshipType'] for x in paperbot.edges_json])

{'CITES', 'PRINTS_ON', 'RELEASES_IN', 'SIMILAR_TO', 'WRITES'}

In [46]:

for item in paperbot.edges_json:
    if item['relationshipType'] == 'SIMILAR_TO':
        print(item)
        break

{'type': 'relationship', 'relationshipType': 'SIMILAR_TO', 'startNodeId': '10.48550/arXiv.2406.10252', 'endNodeId': '10.48550/arXiv.2412.10415', 'properties': {'source': 'semantic similarity', 'weight': 0.699}}


In [49]:
# Keep every non-similarity edge as is; keep a SIMILAR_TO edge only when its
# weight is strictly greater than 0.7.
tmp = [
    edge
    for edge in paperbot.edges_json
    if edge['relationshipType'] != 'SIMILAR_TO' or edge['properties']['weight'] > 0.7
]




In [51]:
len(tmp), len(paperbot.edges_json)

(8371, 100721)

In [56]:
from collections import defaultdict

target_key = 'endNodeId'
# Retained edges whose end node id contains 'arXiv'.
end_to_paper_dcts = [edge for edge in tmp if 'arXiv' in edge['endNodeId']]

# Tally how often each of those ids appears as the end of an edge.
value_counts = defaultdict(int)
for edge in end_to_paper_dcts:
    if target_key not in edge:
        continue
    value_counts[edge[target_key]] += 1

# Sort by occurrence count, highest first
sorted_counts = sorted(value_counts.items(), key=lambda pair: pair[1], reverse=True)

In [57]:
print(f"'{target_key}' 对应的取值统计 (从高到低排序):")

node_dois = [node['id'] for node in paperbot.nodes_json]

# Report every id counted more than 30 times, together with the title of the
# first node carrying that id, and remember it for the optional expansion step.
next_dois = []
for doi, hits in sorted_counts:
    if hits <= 30:
        continue
    print(f"{doi}: {hits}")
    print(paperbot.nodes_json[node_dois.index(doi)]['properties']['title'])
    next_dois.append(doi)


'endNodeId' 对应的取值统计 (从高到低排序):
10.48550/arXiv.2409.04600: 93
The emergence of Large Language Models (LLM) as a tool in literature reviews: an LLM automated systematic review
10.48550/arXiv.2412.15249: 81
LLMs for Literature Review: Are we there yet?
10.48550/arXiv.2403.08399: 65
System for systematic literature review using multiple AI agents: Concept and an empirical evaluation
10.48550/arXiv.2412.13612: 63
Are LLMs Good Literature Review Writers? Evaluating the Literature Review Writing Ability of Large Language Models
10.48550/arXiv.2406.10252: 61
AutoSurvey: Large Language Models Can Automatically Write Surveys
10.48550/arXiv.2308.10620: 61
Large Language Models for Software Engineering: A Systematic Literature Review
10.48550/arXiv.2503.08569: 56
DeepReview: Improving LLM-based Paper Review with Human-like Deep Thinking Process
10.48550/arXiv.2403.07183: 54
Monitoring AI-Modified Content at Scale: A Case Study on the Impact of ChatGPT on AI Conference Peer Reviews
10.48550/arXiv.24

In [55]:
from collections import defaultdict

target_key = 'startNodeId'
# Retained edges whose start node id contains 'arXiv'.
start_from_paper_dcts = [edge for edge in tmp if 'arXiv' in edge['startNodeId']]

# Tally how often each of those ids appears as the start of an edge.
value_counts = defaultdict(int)
for edge in start_from_paper_dcts:
    if target_key not in edge:
        continue
    value_counts[edge[target_key]] += 1

# Sort by occurrence count, highest first
sorted_counts = sorted(value_counts.items(), key=lambda pair: pair[1], reverse=True)

print(f"'{target_key}' 对应的取值统计 (从高到低排序):")

node_dois = [node['id'] for node in paperbot.nodes_json]

# Report every id counted more than 30 times, with the title of the first node
# carrying that id.
for doi, hits in sorted_counts:
    if hits <= 30:
        continue
    print(f"{doi}: {hits}")
    print(paperbot.nodes_json[node_dois.index(doi)]['properties']['title'])

'startNodeId' 对应的取值统计 (从高到低排序):
10.48550/arXiv.2402.12928: 112
A Literature Review of Literature Reviews in Pattern Analysis and Machine Intelligence
10.48550/arXiv.2412.10415: 101
Generative Adversarial Reviews: When LLMs Become the Critic
10.48550/arXiv.2409.04600: 83
The emergence of Large Language Models (LLM) as a tool in literature reviews: an LLM automated systematic review
10.48550/arXiv.2412.15249: 73
LLMs for Literature Review: Are we there yet?
10.48550/arXiv.2406.10252: 70
AutoSurvey: Large Language Models Can Automatically Write Surveys
10.48550/arXiv.2412.13612: 60
Are LLMs Good Literature Review Writers? Evaluating the Literature Review Writing Ability of Large Language Models
10.48550/arXiv.2308.10620: 47
Large Language Models for Software Engineering: A Systematic Literature Review
10.48550/arXiv.2403.08399: 47
System for systematic literature review using multiple AI agents: Concept and an empirical evaluation
10.48550/arXiv.2403.07183: 45
Monitoring AI-Modified Conte

## Further Expansion (Optional)

In [59]:
len(next_dois)

36

In [69]:
# All nodes labelled exactly ['Paper'], in their original order.
papers_info = [node for node in paperbot.nodes_json if node['labels'] == ['Paper']]


In [70]:
len(papers_info)

603

In [71]:
# Gather, in first-seen order and without duplicates, the ids of the first
# five listed authors of every paper selected in next_dois.
next_author_ids = []
for paper in papers_info:
    if paper['id'] not in next_dois:
        continue
    leading_ids = [author['authorId'] for author in paper['properties']['authors']][0:5]
    for candidate in leading_ids:
        if candidate in next_author_ids:
            continue
        next_author_ids.append(candidate)

In [None]:
# NOTE(review): `data` is not assigned in any visible cell, and no visible cell
# produces items with a 'score' key, so this cell raises NameError on a fresh
# kernel. It also rebinds `sorted_data_lambda`. TODO confirm intent or remove.
sorted_data_lambda = sorted(data, key=lambda item: item['score'], reverse=True)
print(sorted_data_lambda)

156

In [66]:
from apis.s2_api import SemanticScholarKit

s2 = SemanticScholarKit()
# Only the first 100 collected author ids are looked up; ids beyond that are
# never queried (presumably a batch-size limit of the API — TODO confirm).
authors = s2.search_author_by_ids(author_ids=next_author_ids[0:100])

2025-03-24 15:28:44,284 - INFO - HTTP Request: POST https://api.semanticscholar.org/graph/v1/author/batch?fields=affiliations%2CauthorId%2CcitationCount%2CexternalIds%2ChIndex%2Chomepage%2Cname%2CpaperCount%2Cpapers%2Cpapers.abstract%2Cpapers.authors%2Cpapers.citationCount%2Cpapers.corpusId%2Cpapers.externalIds%2Cpapers.fieldsOfStudy%2Cpapers.influentialCitationCount%2Cpapers.isOpenAccess%2Cpapers.journal%2Cpapers.openAccessPdf%2Cpapers.paperId%2Cpapers.publicationDate%2Cpapers.publicationTypes%2Cpapers.publicationVenue%2Cpapers.referenceCount%2Cpapers.s2FieldsOfStudy%2Cpapers.title%2Cpapers.url%2Cpapers.venue%2Cpapers.year%2Curl "HTTP/1.1 200 OK"


In [None]:
# NOTE(review): `affiliations` is not assigned in any visible cell; evaluating it
# raises NameError on a fresh kernel. TODO confirm intent or remove.
affiliations

In [78]:
authors[0].keys()

dict_keys(['authorId', 'externalIds', 'url', 'name', 'affiliations', 'homepage', 'paperCount', 'citationCount', 'hIndex', 'papers'])

In [68]:
find_authors_id = [x['authorId'] for x in authors]

In [74]:
# Score every paper by the summed h-index of its first five listed authors.
# Papers outside next_dois, and authors that were not looked up, contribute 0.
# next_author_ids is deliberately left untouched here: this cell does not use
# it, and clearing it would discard the ids gathered for the author lookup.

# One-off lookup table instead of a list.index() scan per author.
hindex_by_author = {}
for known_id, author in zip(find_authors_id, authors):
    # setdefault keeps the first record per id (same match list.index() would
    # give); a missing or null hIndex counts as 0 so the sum cannot hit None.
    hindex_by_author.setdefault(known_id, author.get('hIndex') or 0)

for item in papers_info:
    hindex = 0
    if item['id'] in next_dois:
        author_ids = [x['authorId'] for x in item['properties']['authors']][0:5]
        hindex = sum(hindex_by_author.get(author_id, 0) for author_id in author_ids)
    item['authors_hindex'] = hindex

In [None]:
# Rank papers by the summed author h-index, highest first.
sorted_data_lambda = sorted(papers_info, key=lambda item: item['authors_hindex'], reverse=True)

# Slice rather than count by hand: iteration stops after the 20 entries that
# are printed instead of walking the remainder of the list doing nothing.
for item in sorted_data_lambda[:20]:
    print(item)
    print(item['properties']['title'])

{'type': 'node', 'id': '10.48550/arXiv.2402.01788', 'labels': ['Paper'], 'properties': {'s2PaperId': 'fd30d3189b3bc3295ddad05ac1f683ce41f5e9cb', 'externalIds': {'ArXiv': '2402.01788', 'DBLP': 'journals/corr/abs-2402-01788', 'DOI': '10.48550/arXiv.2402.01788', 'CorpusId': 267412619}, 'corpusId': 267412619, 'publicationVenue': {'id': '1901e811-ee72-4b20-8f7e-de08cd395a10', 'name': 'arXiv.org', 'alternate_names': ['ArXiv'], 'issn': '2331-8422', 'url': 'https://arxiv.org'}, 's2Url': 'https://www.semanticscholar.org/paper/fd30d3189b3bc3295ddad05ac1f683ce41f5e9cb', 'title': 'LitLLM: A Toolkit for Scientific Literature Review', 'abstract': 'Conducting literature reviews for scientific papers is essential for understanding research, its limitations, and building on existing work. It is a tedious task which makes an automatic literature review generator appealing. Unfortunately, many existing works that generate such reviews using Large Language Models (LLMs) have significant limitations. They 

### Expand References for Highly Correlated Papers

In [None]:
# Select papers that a seed paper is strongly similar to: SIMILAR_TO edges that
# start at a seed and whose weight lies strictly between 0.75 and 0.9.
next_dois = []
for edge in paperbot.edges_json:
    if edge['relationshipType'] != 'SIMILAR_TO' or edge['startNodeId'] not in init_paper_dois:
        continue
    if not 0.75 < edge['properties']['weight'] < 0.9:
        continue
    # Collect the paper at the other end of the edge. The start node is a seed
    # by construction, so collecting it could never yield a new paper.
    neighbour_doi = edge['endNodeId']
    # Skip seeds reached from another seed, and keep the list free of duplicates.
    if neighbour_doi not in init_paper_dois and neighbour_doi not in next_dois:
        next_dois.append(neighbour_doi)


In [None]:
import time
# Fetch the cited papers of every selected DOI (citing papers are not fetched here).
for paper_doi in next_dois:
    paperbot.get_cited_papers(paper_doi) 
    # Fixed 5 s pause between requests — presumably for API rate limiting; TODO confirm.
    time.sleep(5)

### Expand Key Cited Papers

Identify key papers from the seed papers' reference lists.

In [None]:
# NOTE(review): this cell looks unfinished. `cited_paper_dois` is created but
# never filled or read in the visible code.
cited_paper_dois = []

# NOTE(review): the loop only considers SIMILAR_TO edges that start at a seed and
# then collects that same start id, so next_dois can only ever contain seed DOIs.
# It does not inspect the seeds' reference lists. The local name `id` also
# shadows the builtin. TODO confirm the intended selection rule.
next_dois = []
for edge in paperbot.edges_json:
    if edge['relationshipType'] == 'SIMILAR_TO' and edge['startNodeId'] in init_paper_dois:
        if edge['properties']['weight'] > 0.75 and edge['properties']['weight'] < 0.9:
            id = edge['startNodeId']
            if id not in next_dois:
                next_dois.append(edge['startNodeId'])

## Paper Filtering

In [None]:
# Accumulators for the filtered sub-graph. Only filtered_dois is filled in the
# visible code; filtered_nodes and filtered_relationships stay empty.
filtered_dois, filtered_nodes, filtered_relationships = [], [], []

# seed paper
filtered_dois.extend(init_paper_dois)


for node in paperbot.nodes_json:
    # NOTE(review): the comment below says "reference for seed paper", but the
    # condition requires the node id to be a seed DOI, so this can only append
    # DOIs that were already added above (duplicates). TODO confirm the
    # intended condition.
    # reference for seed paper
    if node['id'] in init_paper_dois and 'CitedPaper' in node['properties']['source']:
        filtered_dois.append(node['id'])


In [None]:
# Relies on `node` left over from a previously executed loop: this shows the
# properties of whichever node that loop visited last.
node['properties']

In [None]:
for node in paperbot.nodes_json:
    # print the title of every Paper node whose source list contains 'RecommendedPaper'
    if node['labels'] == ['Paper'] and 'RecommendedPaper' in node['properties']['source']:
        print(node['properties']['title'])

In [None]:
for node in paperbot.nodes_json:
    # print title and abstract of every seed paper, followed by a separator line
    if node['labels'] == ['Paper'] and node['id'] in init_paper_dois :
        print(node['properties']['title'], '\n', node['properties']['abstract'])
        print('-'*40)