In [1]:
# Focal paper whose reference list is analysed throughout this notebook.
doi = "10.48550/arXiv.2412.10415"
# s2_paper_id = "9e57dda195973c4b6c81386b1cc44595ecfd4697"
title = "Generative Adversarial Reviews: When LLMs Become the Critic"
abstract = "The peer review process is fundamental to scientific progress, determining which papers meet the quality standards for publication. Yet, the rapid growth of scholarly production and increasing specialization in knowledge areas strain traditional scientific feedback mechanisms. In light of this, we introduce Generative Agent Reviewers (GAR), leveraging LLM-empowered agents to simulate faithful peer reviewers. To enable generative reviewers, we design an architecture that extends a large language model with memory capabilities and equips agents with reviewer personas derived from historical data. Central to this approach is a graph-based representation of manuscripts, condensing content and logically organizing information - linking ideas with evidence and technical details. GAR's review process leverages external knowledge to evaluate paper novelty, followed by detailed assessment using the graph representation and multi-round assessment. Finally, a meta-reviewer aggregates individual reviews to predict the acceptance decision. Our experiments demonstrate that GAR performs comparably to human reviewers in providing detailed feedback and predicting paper outcomes. Beyond mere performance comparison, we conduct insightful experiments, such as evaluating the impact of reviewer expertise and examining fairness in reviews. By offering early expert-level feedback, typically restricted to a limited group of researchers, GAR democratizes access to transparent and in-depth evaluation."
# Same TITLE/ABSTRACT layout as the strings built for the cited papers, so the texts are comparable.
text = "TITLE: {}\nABSTRACT: {}".format(title, abstract)
paper = dict(title=title, abstract=abstract)

In [2]:
# Inspect the combined TITLE/ABSTRACT string built for the focal paper.
text

"TITLE: Generative Adversarial Reviews: When LLMs Become the Critic\nABSTRACT: The peer review process is fundamental to scientific progress, determining which papers meet the quality standards for publication. Yet, the rapid growth of scholarly production and increasing specialization in knowledge areas strain traditional scientific feedback mechanisms. In light of this, we introduce Generative Agent Reviewers (GAR), leveraging LLM-empowered agents to simulate faithful peer reviewers. To enable generative reviewers, we design an architecture that extends a large language model with memory capabilities and equips agents with reviewer personas derived from historical data. Central to this approach is a graph-based representation of manuscripts, condensing content and logically organizing information - linking ideas with evidence and technical details. GAR's review process leverages external knowledge to evaluate paper novelty, followed by detailed assessment using the graph representati

In [3]:
import json

filename = "paper_nodes_json.jsonl"

# Graph nodes, one JSON object per line. Read as UTF-8 explicitly (the platform
# default encoding may differ) and skip blank lines so that a trailing newline
# in the file does not make json.loads raise.
with open(filename, 'r', encoding='utf-8') as f:
    nodes_json = [json.loads(line) for line in f if line.strip()]

In [4]:
def filter_and_reorder_dict(input_dict, keys_to_keep):
    """Return a new dict holding only ``keys_to_keep``, in that order.

    Keys listed in ``keys_to_keep`` that are absent from ``input_dict`` are
    skipped; ``input_dict`` itself is not modified.
    """
    selected = {}
    for wanted in keys_to_keep:
        if wanted in input_dict:
            selected[wanted] = input_dict[wanted]
    return selected

# Reduce every node to its core graph fields, always in the same key order.
nodes_json_rvsd = list(map(
    lambda record: filter_and_reorder_dict(record, ['type', 'id', 'labels', 'properties']),
    nodes_json,
))

In [5]:
import json

filename = "paper_edges_json.jsonl"

# Graph edges, one JSON object per line. Read as UTF-8 explicitly (the platform
# default encoding may differ) and skip blank lines so that a trailing newline
# in the file does not make json.loads raise.
with open(filename, 'r', encoding='utf-8') as f:
    edges_json = [json.loads(line) for line in f if line.strip()]

## Citation Check

### Based on title and abstract through semantic similarity

In [6]:
import sys
import os

# Put the parent of the working directory on sys.path so that the `apis` and
# `models` packages imported in later cells can be resolved.
parent_dir = os.path.dirname(os.getcwd())
print(parent_dir)
# Guard keeps the cell idempotent: re-running it must not append duplicates.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

/home/jiezi/Code/GitHub/ResearchTree/src


In [7]:
# DOIs referenced by the focal paper: targets of its outgoing CITES edges, in edge order.
cited_dois = [
    edge['endNodeId']
    for edge in edges_json
    if edge['startNodeId'] == doi and edge['relationshipType'] == 'CITES'
]

In [8]:
# Collect the Semantic Scholar ids of cited papers whose graph node carries no
# abstract, so that the abstracts can be fetched from the API.
cited_doi_lookup = set(cited_dois)  # O(1) membership instead of scanning the list per node
cited_abs_missing_ids = []
for node in nodes_json:
    if node['id'] not in cited_doi_lookup:
        continue
    # Node-local name on purpose: assigning to `abstract` would overwrite the
    # focal paper's abstract defined at the top of the notebook.
    node_abstract = node['properties'].get('abstract')
    if node_abstract is None:
        cited_abs_missing_ids.append(node['properties']['s2PaperId'])
print(len(cited_dois), len(cited_abs_missing_ids))


57 47


In [9]:
from apis.s2_api import SemanticScholarKit

# Fetch metadata for the cited papers that have no abstract in the local graph.
# Later cells read 'paperId' and 'abstract' from each returned record.
# NOTE(review): how search_paper_by_ids represents ids the API cannot resolve is not visible here — confirm.
s2 = SemanticScholarKit()
cited_papers_metadata_missing_abs = s2.search_paper_by_ids(id_list=cited_abs_missing_ids)

2025-03-28 10:58:08,006 - INFO - HTTP Request: POST https://api.semanticscholar.org/graph/v1/paper/batch?fields=abstract%2Cauthors%2CcitationCount%2CcitationStyles%2CcorpusId%2CexternalIds%2CfieldsOfStudy%2CinfluentialCitationCount%2CisOpenAccess%2Cjournal%2CopenAccessPdf%2CpaperId%2CpublicationDate%2CpublicationTypes%2CpublicationVenue%2CreferenceCount%2Cs2FieldsOfStudy%2Ctitle%2Curl%2Cvenue%2Cyear "HTTP/1.1 200 OK"


In [10]:
# Number of metadata records returned for the papers that were missing abstracts.
len(cited_papers_metadata_missing_abs)

47

In [11]:
# Paper ids of the fetched records, in the same order as the records themselves.
missing_ids = list(map(lambda record: record['paperId'], cited_papers_metadata_missing_abs))

In [12]:
# Spot-check the abstract of the first fetched record.
cited_papers_metadata_missing_abs[0]['abstract']

'The rapid advancement of scientific progress requires innovative tools that can accelerate knowledge discovery. Although recent AI methods, particularly large language models (LLMs), have shown promise in tasks such as hypothesis generation and experimental design, they fall short of replicating the collaborative nature of real-world scientific practices, where diverse experts work together in teams to tackle complex problems. To address the limitations, we propose an LLM-based multi-agent system, i.e., Virtual Scientists (VirSci), designed to mimic the teamwork inherent in scientific research. VirSci organizes a team of agents to collaboratively generate, evaluate, and refine research ideas. Through comprehensive experiments, we demonstrate that this multi-agent approach outperforms the state-of-the-art method in producing novel scientific ideas. We further investigate the collaboration mechanisms that contribute to its tendency to produce ideas with higher novelty, offering valuable

In [13]:
# Build three parallel lists (texts / titles / abstracts) with one entry per
# cited paper, in nodes_json order.
cited_doi_lookup = set(cited_dois)
# First-match lookup from S2 paper id to its fetched record. Replaces
# list.index(), which is O(n) per call and raises ValueError when no record
# was returned for an id.
fetched_meta_by_id = {}
for fetched_id, fetched_meta in zip(missing_ids, cited_papers_metadata_missing_abs):
    fetched_meta_by_id.setdefault(fetched_id, fetched_meta)

texts = []
titles, abstracts = [], []
for node in nodes_json:
    if node['id'] not in cited_doi_lookup:
        continue
    props = node['properties']
    # Node-local names: `title` / `abstract` hold the focal paper's values and
    # must not be overwritten by this loop.
    node_title = props['title']
    node_abstract = props.get('abstract')
    if node_abstract is None:
        fetched_meta = fetched_meta_by_id.get(props['s2PaperId'])
        if fetched_meta is not None:
            node_abstract = fetched_meta['abstract']
            # Same in-place update of the node as before: store the fetched abstract on it.
            props['abstract'] = node_abstract

    # The abstract may still be None here; it is formatted as-is.
    texts.append(f"TITLE: {node_title}\nABSTRACT: {node_abstract}")
    titles.append(node_title)
    abstracts.append(node_abstract)

In [14]:
import toml

# Load settings from the TOML file; a missing file is reported and replaced by
# an empty configuration so the notebook can continue.
config_file = "config.toml"
try:
    toml_file = open(config_file, mode='r', encoding='utf-8')
    with toml_file:
        config_param = toml.load(toml_file)
except FileNotFoundError:
    config_param = {}
    print(f"Config file '{config_file}' not found. Please ensure it exists.")

In [15]:
# Pull API keys and model names out of the nested config. Every level falls
# back to an empty dict, so a missing section yields None rather than KeyError.
models_cfg = config_param.get('models', {})
llm_cfg = models_cfg.get('llm', {})
embed_cfg = models_cfg.get('embed', {})

llm_api_key = llm_cfg.get('api_key')
llm_model_name = llm_cfg.get('model_name')
embed_api_key = embed_cfg.get('api_key')
embed_model_name = embed_cfg.get('model_name')

In [16]:
# papers = [{'title': 'BERT', 'abstract': 'We introduce a new language representation model called BERT'},
#           {'title': 'Attention is all you need', 'abstract': ' The dominant sequence transduction models are based on complex recurrent or convolutional neural networks'}]

assert len(titles) == len(abstracts)
# One {'title', 'abstract'} record per cited paper; a falsy abstract
# (None or empty string) is replaced by the placeholder 'NAN'.
papers = [
    {'title': paper_title, 'abstract': paper_abstract or 'NAN'}
    for paper_title, paper_abstract in zip(titles, abstracts)
]

In [18]:
import numpy as np
from models.embedding_models import gemini_embedding_async

# Embed the focal paper's text first, followed by every cited paper's text; the
# next cell treats embeds[0] as the query and embeds[1:] as the candidates.
# NOTE(review): the trailing 10 is passed positionally; its meaning is not visible here — confirm in gemini_embedding_async.
embeds = await gemini_embedding_async(embed_api_key, embed_model_name, [text]+texts, 10) # gemini_embedding_async is awaited at top level, which works inside the notebook

In [19]:
import numpy as np
from models.embedding_models import semantic_similarity_matrix

# Similarity of the focal paper (embeds[0]) to every cited paper (embeds[1:]),
# converted to a NumPy array in the same step; later cells read row 0.
sim_matrix = np.array(semantic_similarity_matrix(embeds[0], embeds[1:]))

In [23]:
# `titles` was filled by walking nodes_json, whereas cited_dois follows
# edges_json order, so cited_dois[idx] is not guaranteed to describe the same
# paper as titles[idx]. Rebuild the DOI list in node order, using the same
# filter as the loop that filled `titles`.
cited_doi_lookup = set(cited_dois)
dois_in_title_order = [node['id'] for node in nodes_json if node['id'] in cited_doi_lookup]
assert len(dois_in_title_order) == len(titles) == len(sim_matrix[0]), \
    "DOIs, titles and similarity scores must have the same length"

# One record per cited paper: DOI, title and similarity rounded to 4 decimals.
opt = []
for idx, sim in enumerate(sim_matrix[0]):
    opt.append({'doi': dois_in_title_order[idx], 'title': titles[idx], 'score': round(sim, 4)})

In [24]:
# Rank by similarity, highest first; `opt` itself is left untouched.
sorted_opt = list(opt)
sorted_opt.sort(key=lambda entry: entry['score'], reverse=True)

In [25]:
# Display the ranked list (highest similarity first).
sorted_opt

[{'doi': '10.48550/arXiv.2408.10365',
  'title': 'AI-Driven Review Systems: Evaluating LLMs in Scalable and Bias-Aware Academic Reviews',
  'score': 0.8349},
 {'doi': '10.48550/arXiv.2410.03019',
  'title': 'Is Your Paper Being Reviewed by an LLM? Investigating AI Text Detectability in Peer Review',
  'score': 0.8037},
 {'doi': '10.48550/arXiv.2307.05492',
  'title': 'ReviewerGPT? An Exploratory Study on Using Large Language Models for Paper Reviewing',
  'score': 0.7953},
 {'doi': '10.48550/arXiv.2404.16130',
  'title': 'ResearchAgent: Iterative Research Idea Generation over Scientific Literature with Large Language Models',
  'score': 0.7949},
 {'doi': '10.48550/arXiv.2307.14984',
  'title': 'GPT4 is Slightly Helpful for Peer-Review Assistance: A Pilot Study',
  'score': 0.7798},
 {'doi': '10.48550/arXiv.2406.10252',
  'title': 'Peer Review as A Multi-Turn and Long-Context Dialogue with Role-Based Interactions',
  'score': 0.7768},
 {'doi': '10.48550/arXiv.2310.05984',
  'title': 'Ca

## Top 10 Most Similar Papers

In [27]:
k = 10
threshold = 0.7

# Keep at most k of the best-ranked papers, and only those scoring above the
# threshold. The parameters above are used here instead of repeating the
# literals 10 and 0.7, so changing k / threshold actually changes the selection.
top_k_dois = [x['doi'] for x in sorted_opt[:k] if x['score'] > threshold]
print(len(top_k_dois))

10


In [29]:
import time
from apis.s2_api import SemanticScholarKit

s2 = SemanticScholarKit()

# Fetch the reference list of each top-k paper (second hop from the focal paper).
cited_papers_info = []
# The loop variable must not be called `doi`: that name holds the focal paper's
# DOI, which later cells still compare edges against.
for top_doi in top_k_dois:
    results = s2.get_s2_cited_papers(paper_id=top_doi)
    cited_papers_info.append(results)
    time.sleep(5)  # fixed pause between requests (presumably to respect API rate limits)

2025-03-28 11:07:47,750 - INFO - HTTP Request: GET https://api.semanticscholar.org/graph/v1/paper/10.48550/arXiv.2408.10365/references?fields=contexts%2Cintents%2CcontextsWithIntent%2CisInfluential%2Cabstract%2Cauthors%2CcitationCount%2CcitationStyles%2CcorpusId%2CexternalIds%2CfieldsOfStudy%2CinfluentialCitationCount%2CisOpenAccess%2Cjournal%2CopenAccessPdf%2CpaperId%2CpublicationDate%2CpublicationTypes%2CpublicationVenue%2CreferenceCount%2Cs2FieldsOfStudy%2Ctitle%2Curl%2Cvenue%2Cyear&offset=0&limit=100 "HTTP/1.1 200 OK"
2025-03-28 11:07:54,718 - INFO - HTTP Request: GET https://api.semanticscholar.org/graph/v1/paper/10.48550/arXiv.2410.03019/references?fields=contexts%2Cintents%2CcontextsWithIntent%2CisInfluential%2Cabstract%2Cauthors%2CcitationCount%2CcitationStyles%2CcorpusId%2CexternalIds%2CfieldsOfStudy%2CinfluentialCitationCount%2CisOpenAccess%2Cjournal%2CopenAccessPdf%2CpaperId%2CpublicationDate%2CpublicationTypes%2CpublicationVenue%2CreferenceCount%2Cs2FieldsOfStudy%2Ctitle%2C

Find overlaps between the focal paper's references and the references of its top-k most similar cited papers.

In [32]:
# One reference list was collected per top-k paper.
len(cited_papers_info)

10

In [34]:
# Inspect the fields of a single reference record.
cited_papers_info[0][0].keys()

dict_keys(['contexts', 'contextsWithIntent', 'isInfluential', 'intents', 'citedPaper'])

In [36]:
# Flatten the per-paper reference lists into one list of cited-paper records.
hop_2_papers = [
    reference['citedPaper']
    for reference_list in cited_papers_info
    for reference in reference_list
]

In [37]:
# Total number of second-hop reference records collected.
len(hop_2_papers)

316

In [31]:
# Number of metadata records fetched for cited papers that lacked an abstract.
len(cited_papers_metadata_missing_abs)

47

In [39]:
# Candidate pool: fetched first-hop records followed by all second-hop records.
# NOTE(review): only cited papers that lacked an abstract are included from the first hop; cited papers whose abstract was already in the graph are not — confirm this is intended.
candit_papers = cited_papers_metadata_missing_abs + hop_2_papers

In [44]:
# Parallel id lists: `ids` lines up with candit_papers, `ids_2` with the fetched first-hop records.
ids = [candidate['paperId'] for candidate in candit_papers]
ids_2 = [reference['paperId'] for reference in cited_papers_metadata_missing_abs]

In [45]:
from collections import defaultdict

target_key = 'paperId'
value_counts = defaultdict(int)

# Tally how often each paper id occurs among the candidate papers.
for candidate in candit_papers:
    if target_key not in candidate:
        continue
    value_counts[candidate[target_key]] += 1

# Sort by number of occurrences, from highest to lowest.
sorted_counts = sorted(value_counts.items(), key=lambda pair: pair[1], reverse=True)

print(f"'{target_key}' 对应的取值统计 (从高到低排序):")
for paper_id, count in sorted_counts:
    # Report only ids that also appear in ids_2 (the fetched first-hop records).
    if paper_id not in ids_2:
        continue
    # The title is taken from the first candidate carrying this id.
    first_pos = ids.index(paper_id)
    first_title = candit_papers[first_pos]['title']
    print(f"{first_title}: {count}")

'paperId' 对应的取值统计 (从高到低排序):
The AI Scientist: Towards Fully Automated Open-Ended Scientific Discovery: 3
Peer Review as A Multi-Turn and Long-Context Dialogue with Role-Based Interactions: 3
Can large language models provide useful feedback on research papers? A large-scale empirical analysis: 3
ReviewerGPT? An Exploratory Study on Using Large Language Models for Paper Reviewing: 3
Generative Agents: Interactive Simulacra of Human Behavior: 3
GPT4 is Slightly Helpful for Peer-Review Assistance: A Pilot Study: 2
Out of One, Many: Using Language Models to Simulate Human Samples: 2
Longformer: The Long-Document Transformer: 2
From Louvain to Leiden: guaranteeing well-connected communities: 2
Many Heads Are Better Than One: Improved Scientific Idea Generation by A LLM-Based Multi-Agent System: 1
Is Your Paper Being Reviewed by an LLM? Investigating AI Text Detectability in Peer Review: 1
LLM-as-a-Judge & Reward Model: What They Can and Cannot Do: 1
AI-Driven Review Systems: Evaluating LLMs

In [None]:
# Print each cited paper's title next to its similarity to the focal paper.
similarity_row = sim_matrix.tolist()[0]
for position in range(len(similarity_row)):
    print(titles[position], similarity_row[position])


In [None]:
# Locate the cited paper whose similarity, rounded to 4 decimals, equals 0.9483.
# Rounding first matters: the raw floats will practically never equal a
# 4-decimal literal exactly, so list.index() on the unrounded row raises
# ValueError. This matches the rounding used by the neighbouring lookup cells.
pos = np.round(sim_matrix, 4).tolist()[0].index(0.9483)
texts[pos]

In [None]:
# Locate the cited paper whose similarity, rounded to 4 decimals, equals 0.7278,
# then show its text. list.index() raises ValueError if no score rounds to that value.
pos = np.round(sim_matrix, 4).tolist()[0].index(0.7278)
texts[pos]

In [None]:
# Locate the cited paper whose similarity, rounded to 4 decimals, equals 0.7037,
# then show its text. list.index() raises ValueError if no score rounds to that value.
pos = np.round(sim_matrix, 4).tolist()[0].index(0.7037)
texts[pos]

In [None]:
# Compare the number of similarity scores with the number of cited-paper texts.
len(sim_matrix.tolist()[0]), len(texts)

In [None]:
# Hand-pasted relevance scores (one per title), compared against the embedding
# similarities in the next cell.
# NOTE(review): none of these titles appear in the visible outputs for the focal paper above; presumably carried over from an analysis of a different paper — confirm.
llm_scores = [
  {"title": "Long-context LLMs Struggle with Long In-context Learning", "score": 0.4},
  {"title": "Distilling Text Style Transfer With Self-Explanation From LLMs", "score": 0.1},
  {"title": "KIEval: A Knowledge-grounded Interactive Evaluation Framework for Large Language Models", "score": 0.5},
  {"title": "Assisting in Writing Wikipedia-like Articles From Scratch with Large Language Models", "score": 0.7},
  {"title": "Nomic Embed: Training a Reproducible Long Context Text Embedder", "score": 0.3},
  {"title": "With Greater Text Comes Greater Necessity: Inference-Time Training Helps Long Text Generation", "score": 0.4},
  {"title": "Retrieval-Augmented Generation for Large Language Models: A Survey", "score": 0.6},
  {"title": "LooGLE: Can Long-Context Language Models Understand Long Contexts?", "score": 0.5},
  {"title": "Survey on Factuality in Large Language Models: Knowledge, Retrieval and Domain-Specificity", "score": 0.6},
  {"title": "Automatic Sensor-free Affect Detection: A Systematic Literature Review", "score": 0.05},
  {"title": "BooookScore: A systematic exploration of book-length summarization in the era of LLMs", "score": 0.45},
  {"title": "BAMBOO: A Comprehensive Benchmark for Evaluating Long Text Modeling Capacities of Large Language Models", "score": 0.5},
  {"title": "LongBench: A Bilingual, Multitask Benchmark for Long Context Understanding", "score": 0.5},
  {"title": "Challenges and Applications of Large Language Models", "score": 0.6},
  {"title": "A Survey on Evaluation of Large Language Models", "score": 0.7},
  {"title": "Lost in the Middle: How Language Models Use Long Contexts", "score": 0.6},
  {"title": "Extending Context Window of Large Language Models via Positional Interpolation", "score": 0.7},
  {"title": "Augmenting Language Models with Long-Term Memory", "score": 0.7},
  {"title": "Judging LLM-as-a-judge with MT-Bench and Chatbot Arena", "score": 0.4},
  {"title": "PandaLM: An Automatic Evaluation Benchmark for LLM Instruction Tuning Optimization", "score": 0.45},
  {"title": "Enabling Large Language Models to Generate Text with Citations", "score": 0.55}
]

In [None]:
# For every LLM-scored title, find the first cited-paper text containing that
# title and print the LLM score next to the embedding similarity.
for scored in llm_scores:
    wanted_title = scored.get('title')
    llm_score = scored.get('score')
    match_pos = next(
        (pos for pos, candidate_text in enumerate(texts) if wanted_title in candidate_text),
        None,
    )
    if match_pos is not None:
        embed_score = sim_matrix.tolist()[0][match_pos]
        print(wanted_title, llm_score, embed_score)
            

In [None]:
# Show SIMILAR_TO edges that link the focal paper to papers it also cites.
for edge in edges_json:
    if edge['startNodeId'] != doi:
        continue
    if edge['relationshipType'] != 'SIMILAR_TO':
        continue
    if edge['endNodeId'] not in cited_dois:
        continue
    print(edge)

### Based on citation-context information

In [None]:
# Citation contexts attached to each edge from the focal paper to one of its cited papers.
# NOTE(review): the filter does not check relationshipType, so non-CITES edges to the same targets are collected too — confirm this is intended.
cited_info = []
for edge in edges_json:
    if edge['startNodeId'] == doi and edge['endNodeId'] in cited_dois:
        # .get() returns None for edges without a 'contextsWithIntent' property;
        # normalise to [] so the len() checks in later cells do not raise TypeError.
        info = edge['properties'].get('contextsWithIntent') or []
        cited_info.append(info)


In [None]:
# Number of collected edges, then how many of them carry at least one context.
print(len(cited_info))
print(sum(1 for contexts in cited_info if len(contexts) > 0))

In [None]:
# Pair every non-empty set of citation contexts with the title of the paper it refers to.
# cited_info follows edges_json order while `titles` follows nodes_json order, so
# titles[idx] may name a different paper; the title is looked up by DOI instead.
title_by_doi = {node['id']: node['properties'].get('title') for node in nodes_json}
# Re-derive the cited DOI behind each cited_info entry with the same edge
# filter that was used to build cited_info.
context_dois = [
    edge['endNodeId']
    for edge in edges_json
    if edge['startNodeId'] == doi and edge['endNodeId'] in cited_dois
]
assert len(context_dois) == len(cited_info), "cited_info is out of sync with edges_json"

filtered_contexts, filtered_idx = [], []
citation_info = []
for idx, item in enumerate(cited_info):
    if len(item) > 0:
        # All contexts of one citation are merged into a single '...'-separated string.
        context = '...'.join([x.get('context') for x in item])
        filtered_contexts.append(context)
        filtered_idx.append(idx)
        title = title_by_doi.get(context_dois[idx])
        citation_info.append({'title': title, 'context': context})

In [None]:
# Display the title/context pairs built above.
citation_info

In [None]:
# Hand-pasted LLM judgement per cited paper: title, free-text reason and a relevance score.
# NOTE(review): the reasons below refer to "AutoSurvey", not to the focal paper defined at the top of this notebook; this looks like output from an analysis of a different paper — confirm before using it here.
llm_output = [
  {
    "title": "Long-context LLMs Struggle with Long In-context Learning",
    "reason": "Cited in the context of 'window limitations', this paper directly addresses one of the core challenges that AutoSurvey aims to solve, providing insights into the inherent restrictions of long-context LLMs.",
    "score": 0.8
  },
  {
    "title": "KIEval: A Knowledge-grounded Interactive Evaluation Framework for Large Language Models",
    "reason": "Cited in the context of 'Muti-LLM-as-judge evaluation', this paper introduces an evaluation framework relevant to the evaluation strategy employed by AutoSurvey for assessing the quality of generated surveys.",
    "score": 0.7
  },
  {
    "title": "Assisting in Writing Wikipedia-like Articles From Scratch with Large Language Models",
    "reason": "Cited in the context of 'parametric knowledge constraints', this paper explores methods for using LLMs to generate comprehensive and accurate content, which is relevant to overcoming the limitations of LLM's internal knowledge in survey creation.",
    "score": 0.7
  },
  {
    "title": "Retrieval-Augmented Generation for Large Language Models: A Survey",
    "reason": "Cited in the context of 'Real-time knowledge update', this paper provides a comprehensive overview of Retrieval-Augmented Generation (RAG), the core technique used by AutoSurvey for incorporating up-to-date information into the generated surveys.",
    "score": 0.9
  },
  {
    "title": "LooGLE: Can Long-Context Language Models Understand Long Contexts?",
    "reason": "Cited in the context of 'Long-form Text Generation' and related to 'window limitations', this paper investigates the fundamental ability of long-context LLMs to process and understand long texts, a crucial aspect for generating comprehensive surveys.",
    "score": 0.8
  },
  {
    "title": "Survey on Factuality in Large Language Models: Knowledge, Retrieval and Domain-Specificity",
    "reason": "Cited in the context of 'parametric knowledge constraints', this survey discusses the critical issue of factuality in LLMs, which is paramount for the reliability and accuracy of automatically generated literature surveys.",
    "score": 0.8
  },
  {
    "title": "Lost in the Middle: How Language Models Use Long Contexts",
    "reason": "Cited in the context of 'window limitations', this paper delves into how language models utilize long context, directly addressing the challenges associated with processing long input sequences in LLMs.",
    "score": 0.8
  },
  {
    "title": "Extending Context Window of Large Language Models via Positional Interpolation",
    "reason": "Cited in the context of long-context scenarios, this paper presents a specific technique for extending the context window of LLMs, which is a key area of research relevant to overcoming the limitations faced by AutoSurvey.",
    "score": 0.7
  },
  {
    "title": "Augmenting Language Models with Long-Term Memory",
    "reason": "Cited in the context of long-context scenarios, this paper explores methods for equipping LLMs with long-term memory capabilities, which is relevant to the challenge of maintaining coherence and incorporating extensive information in automatically generated surveys.",
    "score": 0.7
  },
  {
    "title": "Judging LLM-as-a-judge with MT-Bench and Chatbot Arena",
    "reason": "Cited in the context of 'Muti-LLM-as-judge evaluation', this paper evaluates the 'LLM-as-judge' method, providing insights into the reliability and effectiveness of using LLMs to evaluate text, a strategy employed by AutoSurvey.",
    "score": 0.8
  },
  {
    "title": "PandaLM: An Automatic Evaluation Benchmark for LLM Instruction Tuning Optimization",
    "reason": "Cited in the context of 'Muti-LLM-as-judge evaluation', this paper introduces an automatic evaluation benchmark for LLMs, which is relevant to the need for robust evaluation methods for automatically generated content like surveys.",
    "score": 0.7
  },
  {
    "title": "Enabling Large Language Models to Generate Text with Citations",
    "reason": "Cited in the context of Retrieval-Augmented Generation techniques, this paper specifically addresses the challenge of generating text with proper citations, which is crucial for the academic rigor of literature surveys.",
    "score": 0.8
  },
  {
    "title": "Active Retrieval Augmented Generation",
    "reason": "Cited in the context of 'Real-time knowledge update', this paper focuses on active RAG, a more sophisticated approach to retrieval-augmented generation that could enhance the real-time knowledge update mechanism in AutoSurvey.",
    "score": 0.9
  },
  {
    "title": "Teaching language models to support answers with verified quotes",
    "reason": "Cited in the context of Retrieval-Augmented Generation techniques, this paper explores how to train language models to provide answers supported by verified quotes, directly relevant to ensuring the factuality and credibility of automatically generated surveys.",
    "score": 0.8
  },
  {
    "title": "Progressive Generation of Long Text with Pretrained Language Models",
    "reason": "Cited in the context of 'Long-form Text Generation', this paper presents a method for progressively generating long text, which is highly relevant to the task of automatically creating comprehensive literature surveys.",
    "score": 0.7
  },
  {
    "title": "Self-Attention with Structural Position Representations",
    "reason": "Cited in the context of extending the context window, this paper proposes a technique to enhance the self-attention mechanism for better handling of long sequences, directly addressing the context window limitations of LLMs.",
    "score": 0.7
  },
  {
    "title": "Towards Coherent and Cohesive Long-form Text Generation",
    "reason": "Cited in the context of focusing on aspects beyond linguistic coherence in LLM-based writing, this paper addresses the crucial aspects of coherence and cohesion in long-form text generation, essential for the quality of automatically generated surveys.",
    "score": 0.7
  },
  {
    "title": "Self-Attention with Relative Position Representations",
    "reason": "Cited in the context of extending the context window, this paper introduces a method for incorporating relative positional information in the self-attention mechanism, aiming to improve the processing of long sequences.",
    "score": 0.7
  },
  {
    "title": "Discourse-Aware Neural Rewards for Coherent Text Generation",
    "reason": "Cited in the context of focusing on aspects beyond linguistic coherence, this paper proposes a reward mechanism to encourage discourse coherence in generated text, which is important for the logical flow of literature surveys.",
    "score": 0.7
  }
]

In [None]:
# Number of LLM judgements pasted above.
len(llm_output)

In [None]:
import toml

# Reload settings from the TOML file; a missing file is reported and replaced
# by an empty configuration.
config_file = "config.toml"
try:
    toml_file = open(config_file, mode='r', encoding='utf-8')
    with toml_file:
        config_param = toml.load(toml_file)
except FileNotFoundError:
    config_param = {}
    print(f"Config file '{config_file}' not found. Please ensure it exists.")

In [None]:
# Read the API key and model name of both model sections; a missing section or
# field yields None rather than raising KeyError.
models_section = config_param.get('models', {})
llm_section = models_section.get('llm', {})
embed_section = models_section.get('embed', {})

llm_api_key, llm_model_name = llm_section.get('api_key'), llm_section.get('model_name')
embed_api_key, embed_model_name = embed_section.get('api_key'), embed_section.get('model_name')

In [None]:
import os
from google import genai
from google.genai import types



# Send a single prompt through the Gemini client, with no system instruction
# and temperature 0.6.
# NOTE(review): the model id below is hard-coded although llm_model_name is read from the config — confirm which one is intended.
client = genai.Client(api_key=llm_api_key)
config = types.GenerateContentConfig(
    system_instruction=None,
    temperature=0.6)
response = client.models.generate_content(
    model="gemini-2.5-pro-exp-03-25", 
    contents="which is larger, 8.11 or 8.9?",
    config=config)

In [None]:
# Display the raw response object returned by generate_content.
response