# Research Tree Interactive

## Setup

seed dois

In [1]:
# DOIs of the seed papers that anchor the citation tree.
seed_dois = [
    # AutoSurvey: Large Language Models Can Automatically Write Surveys
    '10.48550/arXiv.2406.10252',
    # Generative Adversarial Reviews: When LLMs Become the Critic
    '10.48550/arXiv.2412.10415',
    # A Literature Review of Literature Reviews in Pattern Analysis and Machine Intelligence
    '10.48550/arXiv.2402.12928',
    # PaperRobot: Incremental Draft Generation of Scientific Ideas
    '10.48550/arXiv.1905.07870',
    # From Hypothesis to Publication: A Comprehensive Survey of AI-Driven Research Support Systems
    '10.48550/arXiv.2503.01424',
]

load data

In [2]:
import json

filename = "paper_nodes_json.jsonl"

# Parse the JSON Lines file: one JSON document per non-empty line.
# Blank lines (e.g. a trailing newline at the end of the file) are skipped
# because json.loads raises on whitespace-only input, and UTF-8 is requested
# explicitly so decoding does not depend on the platform default encoding.
with open(filename, 'r', encoding='utf-8') as f:
    nodes_json = [json.loads(line) for line in f if line.strip()]

In [3]:
import json

filename = "paper_edges_json.jsonl"

# Parse the JSON Lines file: one JSON document per non-empty line.
# Blank lines (e.g. a trailing newline at the end of the file) are skipped
# because json.loads raises on whitespace-only input, and UTF-8 is requested
# explicitly so decoding does not depend on the platform default encoding.
with open(filename, 'r', encoding='utf-8') as f:
    edges_json = [json.loads(line) for line in f if line.strip()]

load working path

In [4]:
import sys
import os

# Make the parent of the current working directory importable so that the
# project packages used by later cells (e.g. `apis`, `models`) can be found.
parent_dir = os.path.dirname(os.getcwd())
print(parent_dir)
# Guard the append so that re-running this cell does not keep adding the
# same entry to sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

/home/jiezi/Code/GitHub/ResearchTree/src


load params

In [5]:
import toml

config_file = "config.toml"

# Open first, parse second: only a missing file is tolerated (the notebook
# then continues with an empty configuration); any parse error propagates.
try:
    toml_file = open(config_file, 'r', encoding='utf-8')
except FileNotFoundError:
    print(f"Config file '{config_file}' not found. Please ensure it exists.")
    config_param = {}
else:
    with toml_file:
        config_param = toml.load(toml_file)

In [6]:
# Pull the LLM and embedding settings out of the parsed config. Every lookup
# falls back to None when the section or key is absent.
_models_cfg = config_param.get('models', {})
_llm_cfg = _models_cfg.get('llm', {})
_embed_cfg = _models_cfg.get('embed', {})

llm_api_key = _llm_cfg.get('api_key')
llm_model_name = _llm_cfg.get('model_name')
embed_api_key = _embed_cfg.get('api_key')
embed_model_name = _embed_cfg.get('model_name')

## Citation Tree - Hop 1

Initially, we focus only on the `CITES` relationship.

### Get Hop 1 Citation Papers

Get hop 1 citation paper dois

In [7]:
# Relationship types that count as a citation link between two papers.
relationship_types = ['CITES']

# Sets give O(1) membership tests; hop_1_dois itself stays a list so that the
# first-seen order of the neighbours is preserved.
seed_doi_set = set(seed_dois)
allowed_relationships = set(relationship_types)

hop_1_dois = []
seen_hop_1 = set()
for edge in edges_json:
    if edge['relationshipType'] not in allowed_relationships:
        continue

    if edge['startNodeId'] in seed_doi_set:
        # seed paper cites another paper
        doi = edge['endNodeId']
    elif edge['endNodeId'] in seed_doi_set:
        # seed paper is cited by another paper
        doi = edge['startNodeId']
    else:
        continue

    # NOTE: a seed paper cited by another seed paper is still recorded here,
    # so hop_1_dois may overlap with seed_dois.
    if doi not in seen_hop_1:
        seen_hop_1.add(doi)
        hop_1_dois.append(doi)

In [10]:
# Sanity check: number of seed DOIs vs. number of distinct hop-1 DOIs.
print(len(seed_dois), len(hop_1_dois))

5 210


In [43]:
# DOIs that are both a seed paper and a hop-1 neighbour of some seed paper.
set(seed_dois) & set(hop_1_dois)

{'10.48550/arXiv.1905.07870', '10.48550/arXiv.2406.10252'}

Get hop 1 citation paper metadata

In [11]:
node_ids = [x['id'] for x in nodes_json]

# Index the nodes by id once instead of scanning node_ids for every DOI.
# setdefault keeps the first node when an id is duplicated, which matches
# what list.index() would have selected.
node_by_id = {}
for node in nodes_json:
    node_by_id.setdefault(node['id'], node)

# Resolve each DOI to its node record; a DOI that is absent from nodes_json
# raises KeyError so that missing data is noticed immediately.
seed_paper_metadata = [node_by_id[doi] for doi in seed_dois]
hop_1_paper_metadata = [node_by_id[doi] for doi in hop_1_dois]

In [12]:
# Sanity check: one metadata record per seed DOI and per hop-1 DOI.
print(len(seed_paper_metadata), len(hop_1_paper_metadata))

5 210


Find missing abstracts

In [17]:
# Collect the Semantic Scholar ids of every seed / hop-1 paper that has no
# abstract, in first-seen order and without duplicates.
paperids_abs_missing = []
_seen_missing_ids = set()
for item in (seed_paper_metadata + hop_1_paper_metadata):
    props = item['properties']
    paper_id = props.get('s2PaperId')
    # Skip papers that already have an abstract, and papers without an
    # s2PaperId (they cannot be looked up by id).
    if props.get('abstract') is not None or paper_id is None:
        continue
    if paper_id not in _seen_missing_ids:
        _seen_missing_ids.add(paper_id)
        paperids_abs_missing.append(paper_id)

In [18]:
# Number of distinct papers whose abstract is missing.
print(len(paperids_abs_missing))

181


Search again for papers with missing abstracts

In [19]:
from apis.s2_api import SemanticScholarKit 
# Query Semantic Scholar again, by paper id, for the papers whose abstract is
# missing in the local node data.
s2 = SemanticScholarKit()
tmp_paper_metadata = s2.search_paper_by_ids(id_list=paperids_abs_missing)

2025-03-28 13:07:03,757 - INFO - HTTP Request: POST https://api.semanticscholar.org/graph/v1/paper/batch?fields=abstract%2Cauthors%2CcitationCount%2CcitationStyles%2CcorpusId%2CexternalIds%2CfieldsOfStudy%2CinfluentialCitationCount%2CisOpenAccess%2Cjournal%2CopenAccessPdf%2CpaperId%2CpublicationDate%2CpublicationTypes%2CpublicationVenue%2CreferenceCount%2Cs2FieldsOfStudy%2Ctitle%2Curl%2Cvenue%2Cyear "HTTP/1.1 429 "
2025-03-28 13:07:34,891 - INFO - HTTP Request: POST https://api.semanticscholar.org/graph/v1/paper/batch?fields=abstract%2Cauthors%2CcitationCount%2CcitationStyles%2CcorpusId%2CexternalIds%2CfieldsOfStudy%2CinfluentialCitationCount%2CisOpenAccess%2Cjournal%2CopenAccessPdf%2CpaperId%2CpublicationDate%2CpublicationTypes%2CpublicationVenue%2CreferenceCount%2Cs2FieldsOfStudy%2Ctitle%2Curl%2Cvenue%2Cyear "HTTP/1.1 429 "
2025-03-28 13:08:06,742 - INFO - HTTP Request: POST https://api.semanticscholar.org/graph/v1/paper/batch?fields=abstract%2Cauthors%2CcitationCount%2CcitationStyle

In [20]:
# Number of records returned by the re-query.
len(tmp_paper_metadata)

181

Update papers with abstracts

In [29]:
# Map paperId -> abstract for every re-fetched record that actually carries
# an abstract. Falsy entries are skipped defensively
# (NOTE(review): the batch lookup may return empty placeholders for unknown
# ids - confirm against SemanticScholarKit.search_paper_by_ids).
tmp_paper_dict = {
    item['paperId']: item['abstract']
    for item in tmp_paper_metadata
    if item and item.get('paperId') is not None and item.get('abstract') is not None
}

# Back-fill abstracts on BOTH seed and hop-1 papers: the missing ids were
# collected from both lists, so both must be updated. Records shared by the
# two lists are simply skipped the second time because their abstract is set.
for item in (seed_paper_metadata + hop_1_paper_metadata):
    props = item.get('properties')
    if not props or 's2PaperId' not in props or props.get('abstract') is not None:
        continue
    abstract = tmp_paper_dict.get(props['s2PaperId'])
    if abstract is not None:
        props['abstract'] = abstract

### Calculate Similarity of Hop-1 Paper

In [45]:
def _title_and_abstract_by_id(papers):
    """Map each paper id to its title and abstract joined by a newline.

    Papers whose title or abstract is None (or absent) are left out.
    """
    texts = {}
    for paper in papers:
        props = paper.get('properties', {})
        title, abstract = props.get('title'), props.get('abstract')
        if title is None or abstract is None:
            continue
        texts[paper['id']] = f"{title}\n{abstract}"
    return texts


seed_paper_info = _title_and_abstract_by_id(seed_paper_metadata)
hop_1_paper_info = _title_and_abstract_by_id(hop_1_paper_metadata)

In [54]:
# Texts to embed, in the same (insertion) order as the keys of each dict.
seed_texts = list(seed_paper_info.values())
hop_1_texts = list(hop_1_paper_info.values())

In [56]:
import numpy as np
from models.embedding_models import gemini_embedding_async

# Embed the seed texts and the hop-1 texts in a single call, seed texts first.
# The top-level `await` relies on the notebook's running event loop.
# NOTE(review): the meaning of the trailing positional argument 10 is not
# visible here - presumably a batch size or concurrency limit; confirm
# against gemini_embedding_async.
embeds = await gemini_embedding_async(embed_api_key, embed_model_name, seed_texts+hop_1_texts, 10)

In [57]:
# One embedding must come back for every input text.
assert len(embeds) == len(seed_texts) + len(hop_1_texts)

In [58]:
import numpy as np
from models.embedding_models import semantic_similarity_matrix

# Split the embeddings back into the seed part (first len(seed_texts)
# entries) and the hop-1 part (the remainder), then compare the two groups.
# Presumably rows correspond to seed papers and columns to hop-1 papers;
# verify against semantic_similarity_matrix.
n_seed = len(seed_texts)
seed_text_embeds, hop_1_text_embeds = embeds[:n_seed], embeds[n_seed:]
sim_matrix = np.array(semantic_similarity_matrix(seed_text_embeds, hop_1_text_embeds))

In [62]:
# Unpack the matrix dimensions; this requires sim_matrix to be 2-D.
row_cnt, col_cnt = sim_matrix.shape

In [65]:
# Per-column summary of the similarity matrix: for each column, the maximum,
# minimum and mean over all rows. Each result is a list with one entry per
# column, so it can be indexed by column position.
# sim_col_avg holds the column means themselves (one number per column).
sim_col_max = list(sim_matrix.max(axis=0))
sim_col_min = list(sim_matrix.min(axis=0))
sim_col_avg = list(sim_matrix.mean(axis=0))

In [87]:
# Hop-1 papers whose column maximum exceeds 0.7, kept as a list of
# single-entry {paper id: score} dicts.
tmp_dict = [
    {key: sim_col_max[idx]}
    for idx, key in enumerate(hop_1_paper_info)
    if sim_col_max[idx] > 0.7
]

In [92]:
# Keep hop-1 papers whose column maximum lies strictly between 0.7 and 0.9.
# NOTE(review): the 0.9 upper bound presumably filters out seed papers that
# also appear among the hop-1 papers (near-identical text) - confirm.
tmp_dict = {
    key: sim_col_max[idx]
    for idx, key in enumerate(hop_1_paper_info)
    if 0.7 < sim_col_max[idx] < 0.9
}

In [96]:
# (paper id, score) pairs ordered by score, highest first.
tmp_sorted_items = sorted(tmp_dict.items(), key=lambda item: item[1], reverse=True)

OK! These items can be regarded as closely related research.

In [98]:
# Look-up table from paper id to title for every hop-1 paper that has a title.
tmp_titles = dict(
    (paper['id'], paper['properties']['title'])
    for paper in hop_1_paper_metadata
    if paper.get('properties', {}).get('title') is not None
)

# Print each retained paper's title (None when unknown) next to its score.
for doi, sim_score in tmp_sorted_items:
    print(tmp_titles.get(doi), sim_score)

AI-Driven Review Systems: Evaluating LLMs in Scalable and Bias-Aware Academic Reviews 0.8214457401590604
Automatically Evaluating the Paper Reviewing Capability of Large Language Models 0.8033233477497779
ResearchAgent: Iterative Research Idea Generation over Scientific Literature with Large Language Models 0.7848394476347019
Is Your Paper Being Reviewed by an LLM? Investigating AI Text Detectability in Peer Review 0.7806782233064347
ReviewerGPT? An Exploratory Study on Using Large Language Models for Paper Reviewing 0.778227406526185
Peer Review as A Multi-Turn and Long-Context Dialogue with Role-Based Interactions 0.762231407990481
Instruct Large Language Models to Generate Scientific Literature Survey Step by Step 0.7609768344004895
Can large language models provide useful feedback on research papers? A large-scale empirical analysis 0.7570811958501278
CycleResearcher: Improving Automated Research via Automated Review 0.7552594032048701
Artificial intelligence to automate the system

## Citation Tree - Hop 2

### Get Hop 2 Citation Papers

Get top-k similar papers

In [100]:
# Take the k highest-scoring hop-1 papers and translate their ids into
# Semantic Scholar paper ids (None when a paper has no s2PaperId).
k = 10
top_k_dois = [doi for doi, _score in tmp_sorted_items[:k]]

tmp_ref = {}
for paper in hop_1_paper_metadata:
    s2_id = paper.get('properties', {}).get('s2PaperId')
    if s2_id is not None:
        tmp_ref[paper['id']] = s2_id

top_k_paperids = list(map(tmp_ref.get, top_k_dois))
print(len(top_k_dois), len(top_k_paperids))

10 10


In [101]:
# Display the Semantic Scholar ids selected for the hop-2 expansion.
top_k_paperids

['924956d6c788c9ea67ecdc80b63742d74350549e',
 '987d0cbe751780b9b993ebf8e670fb0d18fdaabe',
 '51b7b3ad7645a69e3c1c80cae69473b8bd472f67',
 '94fb5a19f86d81a746bb5502a5debf2659814e8e',
 '62729cff7dda7614f648a84e8967076d8878a5ff',
 '2424b7935cee3551deeea4a98b1a07abddf93649',
 '374d1e5fd7385353a4a0add1fadce23667662265',
 'f2209eb5ac6747319a29b87dedabb97770be3243',
 '92c82a51ad13c361d052987694cf93d6a72d5789',
 'f2d23bd0a60f46a5d99475977c2ad507b103eca7']

Retrieve paper metadata

In [102]:
import time
import threading
from apis.s2_api import SemanticScholarKit

class ParallelSemanticScholar:
    """Fetch the papers cited by several papers concurrently, one thread per paper.

    Args:
        s2: Client object exposing ``get_s2_cited_papers(paper_id=...)``.
            Defaults to a new ``SemanticScholarKit()``.
        max_concurrent_requests: Maximum number of fetches in flight at once.
            Adjust to the rate limits of the Semantic Scholar API.
        delay_seconds: Pause after each successful fetch while the concurrency
            slot is still held (kept from the original code as a crude rate
            limit). Use 0 to disable.

    Attributes:
        ref_infos: Results of the most recent ``get_cited_papers_parallel``
            call (plus anything appended by direct ``fetch_ref_info`` calls).
    """

    def __init__(self, s2=None, max_concurrent_requests=10, delay_seconds=5):
        self.s2 = s2 if s2 is not None else SemanticScholarKit()
        self.ref_infos = []
        self.lock = threading.Lock()
        self.max_concurrent_requests = max_concurrent_requests
        self.delay_seconds = delay_seconds
        # paper_id -> fetched result; lets results be returned in input order.
        self._results_by_id = {}

    def fetch_ref_info(self, paper_id):
        """Fetch the cited papers of ``paper_id`` and record the result.

        Any exception is reported with ``print`` and swallowed, so a single
        failing paper does not abort the whole batch; such a paper simply has
        no entry in the results.
        """
        try:
            ref_paper_info = self.s2.get_s2_cited_papers(paper_id=paper_id)
            with self.lock:
                self.ref_infos.append(ref_paper_info)
                self._results_by_id[paper_id] = ref_paper_info
            if self.delay_seconds:
                time.sleep(self.delay_seconds)
        except Exception as e:
            print(f"Error fetching info for {paper_id}: {e}")

    def worker(self, paper_id, semaphore):
        """Thread target: run one fetch while holding a semaphore slot."""
        # The context manager releases the slot only if it was acquired.
        with semaphore:
            self.fetch_ref_info(paper_id)

    def get_cited_papers_parallel(self, top_k_paperids):
        """Fetch cited papers for every id in ``top_k_paperids``.

        Returns:
            A list with one result per successfully fetched paper, ordered
            like ``top_k_paperids`` (not by thread completion), so repeated
            runs produce the same ordering. Failed fetches are omitted.
        """
        # Start from a clean slate so results of earlier calls do not leak in.
        with self.lock:
            self.ref_infos = []
            self._results_by_id = {}

        semaphore = threading.Semaphore(self.max_concurrent_requests)
        threads = [
            threading.Thread(target=self.worker, args=(paper_id, semaphore))
            for paper_id in top_k_paperids
        ]
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()

        self.ref_infos = [
            self._results_by_id[paper_id]
            for paper_id in top_k_paperids
            if paper_id in self._results_by_id
        ]
        return self.ref_infos

In [103]:
# Fetch the reference lists of the top-k papers and report the wall-clock time.
parallel_s2 = ParallelSemanticScholar()

start_time = time.time()
ref_infos = parallel_s2.get_cited_papers_parallel(top_k_paperids)
end_time = time.time()

elapsed_seconds = end_time - start_time
print(f"Retrieved information for {len(ref_infos)} papers in {elapsed_seconds:.2f} seconds.")

2025-03-28 14:32:03,811 - INFO - HTTP Request: GET https://api.semanticscholar.org/graph/v1/paper/987d0cbe751780b9b993ebf8e670fb0d18fdaabe/references?fields=contexts%2Cintents%2CcontextsWithIntent%2CisInfluential%2Cabstract%2Cauthors%2CcitationCount%2CcitationStyles%2CcorpusId%2CexternalIds%2CfieldsOfStudy%2CinfluentialCitationCount%2CisOpenAccess%2Cjournal%2CopenAccessPdf%2CpaperId%2CpublicationDate%2CpublicationTypes%2CpublicationVenue%2CreferenceCount%2Cs2FieldsOfStudy%2Ctitle%2Curl%2Cvenue%2Cyear&offset=0&limit=100 "HTTP/1.1 200 OK"
2025-03-28 14:32:03,818 - INFO - HTTP Request: GET https://api.semanticscholar.org/graph/v1/paper/94fb5a19f86d81a746bb5502a5debf2659814e8e/references?fields=contexts%2Cintents%2CcontextsWithIntent%2CisInfluential%2Cabstract%2Cauthors%2CcitationCount%2CcitationStyles%2CcorpusId%2CexternalIds%2CfieldsOfStudy%2CinfluentialCitationCount%2CisOpenAccess%2Cjournal%2CopenAccessPdf%2CpaperId%2CpublicationDate%2CpublicationTypes%2CpublicationVenue%2CreferenceCoun

Retrieved information for 10 papers in 12.99 seconds.


Extract Paper Information

In [108]:
# Flatten the per-paper reference lists into one list of cited-paper records.
hop_2_paper_metadata = []
for refs in ref_infos:
    # `refs or []` tolerates an empty / None result for a paper.
    for item in (refs or []):
        paper = item.get('citedPaper')
        # Entries without a 'citedPaper' record are dropped: every element of
        # hop_2_paper_metadata is expected to be a dict (it is queried with
        # .get(...) and `in`), so a None element would raise.
        if paper is not None:
            hop_2_paper_metadata.append(paper)

In [109]:
# Total number of cited-paper records collected (duplicates included).
len(hop_2_paper_metadata)

538

### Find Common Citations

In [114]:
# Look-up table from paperId to title for every hop-2 record with a title.
tmp_titles = dict(
    (paper['paperId'], paper['title'])
    for paper in hop_2_paper_metadata
    if paper.get('title') is not None
)

In [117]:
from collections import Counter, defaultdict

target_key = 'paperId'

# Count how often each value of target_key occurs across
# hop_2_paper_metadata; records lacking the key are ignored.
value_counts = Counter(
    item[target_key] for item in hop_2_paper_metadata if target_key in item
)

# (value, count) pairs sorted by number of occurrences, highest first;
# values with equal counts keep their first-encountered order.
sorted_counts = value_counts.most_common()

In [123]:
# Semantic Scholar paper ids of all hop-1 papers (may contain None values).
# Raises KeyError if a record lacks 'properties' or 's2PaperId'.
hop_1_paper_ids = [item['properties']['s2PaperId'] for item in hop_1_paper_metadata]

In [125]:
# Report every counted paper with its title, its number of occurrences and
# whether it is also one of the hop-1 papers. Entries whose id is None are
# skipped.
print(f"'{target_key}' 对应的取值统计 (从高到低排序):")
for paper_id, count in sorted_counts:
    if paper_id is None:
        continue
    in_seed_citation = 'YES' if paper_id in hop_1_paper_ids else 'NO'
    print(f"{tmp_titles.get(paper_id)}: {count}, In citation: {in_seed_citation}")

'paperId' 对应的取值统计 (从高到低排序):
GPT-4 Technical Report: 6, In citation: YES
The AI Scientist: Towards Fully Automated Open-Ended Scientific Discovery: 5, In citation: YES
Can large language models provide useful feedback on research papers? A large-scale empirical analysis: 5, In citation: YES
GPT4 is Slightly Helpful for Peer-Review Assistance: A Pilot Study: 5, In citation: YES
ReviewerGPT? An Exploratory Study on Using Large Language Models for Paper Reviewing: 4, In citation: YES
Judging LLM-as-a-judge with MT-Bench and Chatbot Arena: 3, In citation: YES
Qwen Technical Report: 3, In citation: NO
RoBERTa: A Robustly Optimized BERT Pretraining Approach: 3, In citation: NO
Training language models to follow instructions with human feedback: 3, In citation: NO
Fighting reviewer fatigue or amplifying bias? Considerations and recommendations for use of ChatGPT and other large language models in scholarly peer review: 3, In citation: NO
Investigating Fairness Disparities in Peer Review: A Lan