## Setup

In [7]:
# Free-text topic used for the keyword search.
research_topic = "llm literature review"

# Seed papers that are identified by DOI.
seed_dois = [
    '10.48550/arXiv.2406.10252',  # AutoSurvey: Large Language Models Can Automatically Write Surveys
    '10.48550/arXiv.2412.10415',  # Generative Adversarial Reviews: When LLMs Become the Critic
    '10.48550/arXiv.2402.12928',  # A Literature Review of Literature Reviews in Pattern Analysis and Machine Intelligence
]

# Seed papers that are identified by title only.
seed_titles = [
    'PaperRobot: Incremental Draft Generation of Scientific Ideas',
    'From Hypothesis to Publication: A Comprehensive Survey of AI-Driven Research Support Systems',
]

In [8]:
import sys
import os

# Put the parent of the current working directory on the import path
# (presumably the repository root, so that project packages resolve).
parent_dir = os.path.dirname(os.getcwd())
print(parent_dir)
# Guard the append: re-running this cell must not pile up duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

/home/jiezi/Code/GitHub/ResearchTree


In [9]:
import toml

config_file = "config.toml"

# Open first, parse second: only a missing file falls back to an empty config.
try:
    toml_file = open(config_file, 'r', encoding='utf-8')
except FileNotFoundError:
    print(f"Config file '{config_file}' not found. Please ensure it exists.")
    config_param = {}
else:
    with toml_file:
        config_param = toml.load(toml_file)

In [10]:
# Look each section up once. Missing sections degrade to empty dicts, so an
# absent setting comes back as None instead of raising.
_models_cfg = config_param.get('models', {})
_llm_cfg = _models_cfg.get('llm', {})
_embed_cfg = _models_cfg.get('embed', {})

llm_api_key = _llm_cfg.get('api_key')
llm_model_name = _llm_cfg.get('model_name')
embed_api_key = _embed_cfg.get('api_key')
embed_model_name = _embed_cfg.get('model_name')

## Paper Search

In [11]:
from apis.s2_api import SemanticScholarKit

# Shared client object; later cells call its paper, citation, author and
# recommendation helpers.
s2 = SemanticScholarKit()

### Paper Metadata

In [12]:
import time 

def init_paper_search(
        research_topic, 
        seed_paper_dois, 
        seed_paper_titles,
        limit = 100,
        fields = None,
        pause_seconds = 5,
        ):
    """Resolve the seed papers and collect an initial pool of candidate papers.

    Args:
        research_topic: Free-text topic used for a keyword search; skipped when falsy.
        seed_paper_dois: Paper ids (e.g. DOIs) looked up in one batch; skipped when falsy.
        seed_paper_titles: Titles searched one by one. The first hit of each
            search is treated as the seed paper, the remaining hits as candidates.
        limit: Maximum number of results requested per keyword search.
        fields: Passed through unchanged to the `s2` client.
        pause_seconds: Delay after every API request, to stay polite to the API.

    Returns:
        A tuple `(seed_paper_metadata, searched_paper_metadata)` of two lists.
    """
    seed_paper_metadata, searched_paper_metadata = [], []

    if seed_paper_dois:
        id_matches = s2.search_paper_by_ids(id_list=seed_paper_dois, fields=fields)
        # Tolerate a None result and drop empty entries.
        # NOTE(review): the batch endpoint presumably yields null for unknown ids - confirm.
        seed_paper_metadata.extend(paper for paper in (id_matches or []) if paper)
        time.sleep(pause_seconds)

    for title in seed_paper_titles or []:
        title_matches = s2.search_paper_by_keywords(query=title, fields=fields, limit=limit)
        if title_matches:
            seed_paper_metadata.append(title_matches[0])
            searched_paper_metadata.extend(title_matches[1:])
        # Pause after every request, including the ones that returned nothing.
        time.sleep(pause_seconds)

    if research_topic:
        topic_matches = s2.search_paper_by_keywords(query=research_topic, fields=fields, limit=limit)
        searched_paper_metadata.extend(topic_matches or [])
        time.sleep(pause_seconds)

    return seed_paper_metadata, searched_paper_metadata

In [13]:
# Resolve the seeds and gather the first batch of candidate papers.
seed_paper_metadata, searched_paper_metadata = init_paper_search(
    research_topic=research_topic,
    seed_paper_dois=seed_dois,
    seed_paper_titles=seed_titles,
)

2025-03-25 09:36:39,176 - INFO - HTTP Request: POST https://api.semanticscholar.org/graph/v1/paper/batch?fields=abstract%2Cauthors%2CcitationCount%2CcitationStyles%2CcorpusId%2CexternalIds%2CfieldsOfStudy%2CinfluentialCitationCount%2CisOpenAccess%2Cjournal%2CopenAccessPdf%2CpaperId%2CpublicationDate%2CpublicationTypes%2CpublicationVenue%2CreferenceCount%2Cs2FieldsOfStudy%2Ctitle%2Curl%2Cvenue%2Cyear "HTTP/1.1 200 OK"
2025-03-25 09:36:55,439 - INFO - HTTP Request: GET https://api.semanticscholar.org/graph/v1/paper/search?query=PaperRobot%3A+Incremental+Draft+Generation+of+Scientific+Ideas&fields=abstract%2Cauthors%2CcitationCount%2CcitationStyles%2CcorpusId%2CexternalIds%2CfieldsOfStudy%2CinfluentialCitationCount%2CisOpenAccess%2Cjournal%2CopenAccessPdf%2CpaperId%2CpublicationDate%2CpublicationTypes%2CpublicationVenue%2CreferenceCount%2Cs2FieldsOfStudy%2Ctitle%2Curl%2Cvenue%2Cyear&offset=0&limit=100 "HTTP/1.1 200 OK"
2025-03-25 09:37:01,993 - INFO - HTTP Request: GET https://api.semanti

In [14]:
# Sanity check: number of seed papers vs. number of candidate papers collected.
print(len(seed_paper_metadata), len(searched_paper_metadata))

5 102


In [15]:
# Print the title of every candidate paper for manual inspection.
for item in searched_paper_metadata:
    print(item['title'])

Rising Stars Research Projects 2016-2017: Action Research to Improve MLA’s Communities
Smartrawl: a system to eliminate discards and bycatch in fisheries
Highlighting Case Studies in LLM Literature Review of Interdisciplinary System Science
Model Parallelism on Distributed Infrastructure: A Literature Review from Theory to LLM Case-Studies
CHIME: LLM-Assisted Hierarchical Organization of Scientific Studies for Literature Review Support
A Systematic Literature Review on LLM-Based Information Retrieval: The Issue of Contents Classification
Automated Literature Review Using NLP Techniques and LLM-Based Retrieval-Augmented Generation
Quality Assurance for LLM-Generated Test Cases: A Systematic Literature Review
LLM-Based Multi-Agent Systems for Software Engineering: Literature Review, Vision and the Road Ahead
Using LLM (Large Language Model) to Improve Efficiency in Literature Review for Undergraduate Research
Enhancing literature review with LLM and NLP methods. Algorithmic trading case


### Citation Metadata

In [16]:
# Paper ids of the seeds, in the same order as seed_paper_metadata.
init_ids = [seed['paperId'] for seed in seed_paper_metadata]

# One entry per seed paper, index-aligned with init_ids.
cited_paper_info = []
citing_paper_info = []
for paper_id in init_ids:
    references = s2.get_s2_cited_papers(paper_id=paper_id)
    cited_paper_info.append(references)
    time.sleep(5)  # pause between consecutive API requests
    citations = s2.get_s2_citing_papers(paper_id=paper_id)
    citing_paper_info.append(citations)
    time.sleep(5)

2025-03-25 09:37:15,899 - INFO - HTTP Request: GET https://api.semanticscholar.org/graph/v1/paper/9e57dda195973c4b6c81386b1cc44595ecfd4697/references?fields=contexts%2Cintents%2CcontextsWithIntent%2CisInfluential%2Cabstract%2Cauthors%2CcitationCount%2CcitationStyles%2CcorpusId%2CexternalIds%2CfieldsOfStudy%2CinfluentialCitationCount%2CisOpenAccess%2Cjournal%2CopenAccessPdf%2CpaperId%2CpublicationDate%2CpublicationTypes%2CpublicationVenue%2CreferenceCount%2Cs2FieldsOfStudy%2Ctitle%2Curl%2Cvenue%2Cyear&offset=0&limit=100 "HTTP/1.1 200 OK"
2025-03-25 09:37:22,041 - INFO - HTTP Request: GET https://api.semanticscholar.org/graph/v1/paper/9e57dda195973c4b6c81386b1cc44595ecfd4697/citations?fields=contexts%2Cintents%2CcontextsWithIntent%2CisInfluential%2Cabstract%2Cauthors%2CcitationCount%2CcitationStyles%2CcorpusId%2CexternalIds%2CfieldsOfStudy%2CinfluentialCitationCount%2CisOpenAccess%2Cjournal%2CopenAccessPdf%2CpaperId%2CpublicationDate%2CpublicationTypes%2CpublicationVenue%2CreferenceCount

In [48]:
# based on influential paper information
from typing import Literal

def get_citation_stats(
        seed_paper_metadata,
        citation_paper_metadata, 
        citation_type:Literal['citedPaper', 'citingPaper']):
    """Summarise citation edges between the seed papers and their citation records.

    Args:
        seed_paper_metadata: Seed paper dicts; each must carry a 'paperId'.
        citation_paper_metadata: One list of citation records per seed,
            index-aligned with `seed_paper_metadata`. A None or empty entry is skipped.
        citation_type: Key holding the other paper in each record.
            'citedPaper' means seed -> other; 'citingPaper' means other -> seed.

    Returns:
        A tuple `(paper_influential_stat, paper_appearance_stat)`:
        - influential: one `{'category', 'from_id', 'to_id', 'is_influential'}` dict
          per distinct edge flagged `isInfluential`.
        - appearance: one `{'category', 'from_id', 'to_id', 'appearance_cnt'}` dict
          per distinct edge, where `appearance_cnt` sums the lengths of 'contexts'.
    """
    paper_influential_stat, paper_appearance_stat = [], []
    # Edge lookups so each edge is matched in O(1) instead of rescanning the lists.
    appearance_by_edge = {}
    influential_edges = set()

    for idx, papers in enumerate(citation_paper_metadata):
        if not papers:
            continue
        seed_paper_id = seed_paper_metadata[idx]['paperId']

        for info in papers:
            if not info:
                continue
            # The nested paper (or its id) can be null; such records carry no usable edge.
            citation_paper_id = (info.get(citation_type) or {}).get('paperId')
            if not citation_paper_id or not seed_paper_id:
                continue

            if citation_type == 'citedPaper':
                from_paper_id, to_paper_id = seed_paper_id, citation_paper_id
            else:
                from_paper_id, to_paper_id = citation_paper_id, seed_paper_id
            edge = (from_paper_id, to_paper_id)

            context_cnt = len(info.get('contexts') or [])
            record = appearance_by_edge.get(edge)
            if record is None:
                record = {'category': 'cites', 'from_id': from_paper_id, 'to_id': to_paper_id, 'appearance_cnt': context_cnt}
                appearance_by_edge[edge] = record
                paper_appearance_stat.append(record)
            else:
                record['appearance_cnt'] += context_cnt

            if info.get('isInfluential') is True and edge not in influential_edges:
                influential_edges.add(edge)
                paper_influential_stat.append(
                    {'category': 'cites', 'from_id': from_paper_id, 'to_id': to_paper_id, 'is_influential': True})

    return paper_influential_stat, paper_appearance_stat


In [49]:
# Edges from the seeds to the papers they cite.
paper_influential_stat_1, paper_apprearance_stat_1 = get_citation_stats(
    seed_paper_metadata, cited_paper_info, citation_type='citedPaper')
# Edges from the papers that cite the seeds.
paper_influential_stat_2, paper_apprearance_stat_2 = get_citation_stats(
    seed_paper_metadata, citing_paper_info, citation_type='citingPaper')

# Merge both directions into one list per statistic.
paper_influential_stat = [*paper_influential_stat_1, *paper_influential_stat_2]
paper_apprearance_stat = [*paper_apprearance_stat_1, *paper_apprearance_stat_2]

In [58]:
# Number of influential-citation edges and of citation edges overall.
len(paper_influential_stat), len(paper_apprearance_stat)

(35, 432)

[{'category': 'cites',
  'from_id': '9e57dda195973c4b6c81386b1cc44595ecfd4697',
  'to_id': '2717e5c7384ec12cfd6cf9c34897c6adad3230ed',
  'appearance_cnt': 1},
 {'category': 'cites',
  'from_id': '9e57dda195973c4b6c81386b1cc44595ecfd4697',
  'to_id': '1a22ad92a06422d4580effd7cc2778e38d9f7368',
  'appearance_cnt': 0},
 {'category': 'cites',
  'from_id': '9e57dda195973c4b6c81386b1cc44595ecfd4697',
  'to_id': '121aef49e5fbe8d9dc829adaa472a44aff84f4f5',
  'appearance_cnt': 2},
 {'category': 'cites',
  'from_id': '9e57dda195973c4b6c81386b1cc44595ecfd4697',
  'to_id': '2fcad63cc68ca74acdfafdbc145325ee59952f24',
  'appearance_cnt': 2},
 {'category': 'cites',
  'from_id': '9e57dda195973c4b6c81386b1cc44595ecfd4697',
  'to_id': '03bdd9cbb3b768ff3e96c97b28e106748b6e4fd0',
  'appearance_cnt': 0},
 {'category': 'cites',
  'from_id': '9e57dda195973c4b6c81386b1cc44595ecfd4697',
  'to_id': '5215a3cfd67fdc6eb0201822dd0004bd4b830f91',
  'appearance_cnt': 1},
 {'category': 'cites',
  'from_id': '9e57dda19

### Author Metadata

In [24]:
# Collect the ids of the first five listed authors of every seed paper.
author_ids = []
for item in seed_paper_metadata:
    # 'authors' may be absent or null; treat both as "no authors".
    authors = (item.get('authors') or [])[0:5]
    author_ids.extend(x['authorId'] for x in authors if x.get('authorId'))

# An author can appear on several seed papers; keep the first occurrence only
# so the same id is not requested twice.
author_ids = list(dict.fromkeys(author_ids))

In [25]:
# Number of author ids collected from the seed papers.
print(len(author_ids))

21


In [1]:
import networkx as nx
# Scratch experiment with networkx node attributes.
# NOTE(review): the execution counts of this and the next few cells do not
# follow the surrounding cells, so they presumably come from another kernel
# session and are not part of the pipeline.
G = nx.MultiDiGraph(name='test')
G.add_nodes_from([(4, {"color": "red"}), (5, {"color": "green"})])
G.nodes

NodeView((4, 5))

In [6]:
# NOTE(review): the recorded output shows color 'blue' and a 'name' attribute,
# neither of which is set by any cell visible here - presumably set by cells
# that were deleted afterwards.
G.nodes[4]

{'color': 'blue', 'name': 'sky'}

In [7]:
# Re-add existing nodes with extra attributes; the recorded output further down
# shows the new key next to the earlier ones, i.e. attributes are merged.
G.add_nodes_from([(4, {"new":"abc"}), (5, {"color": "green"})])

In [8]:
# The set of nodes is unchanged after re-adding them.
G.nodes

NodeView((4, 5))

In [9]:
# Attributes of node 4 after the second add_nodes_from call.
G.nodes[4]

{'color': 'blue', 'name': 'sky', 'new': 'abc'}

In [26]:
# Batch lookup of the collected author ids.
# NOTE(review): 'authros_info' is a misspelling of 'authors_info'; it is kept
# here because a later cell reads the variable under this name.
authros_info = s2.search_author_by_ids(author_ids=author_ids)
print(len(authros_info))

2025-03-25 09:38:57,806 - INFO - HTTP Request: POST https://api.semanticscholar.org/graph/v1/author/batch?fields=affiliations%2CauthorId%2CcitationCount%2CexternalIds%2ChIndex%2Chomepage%2Cname%2CpaperCount%2Cpapers%2Cpapers.abstract%2Cpapers.authors%2Cpapers.citationCount%2Cpapers.corpusId%2Cpapers.externalIds%2Cpapers.fieldsOfStudy%2Cpapers.influentialCitationCount%2Cpapers.isOpenAccess%2Cpapers.journal%2Cpapers.openAccessPdf%2Cpapers.paperId%2Cpapers.publicationDate%2Cpapers.publicationTypes%2Cpapers.publicationVenue%2Cpapers.referenceCount%2Cpapers.s2FieldsOfStudy%2Cpapers.title%2Cpapers.url%2Cpapers.venue%2Cpapers.year%2Curl "HTTP/1.1 200 OK"


21


In [59]:
# Inspect the first author record to see which fields were returned.
authros_info[0]

{'authorId': '2108024279',
 'externalIds': {'DBLP': ['Yidong Wang']},
 'url': 'https://www.semanticscholar.org/author/2108024279',
 'name': 'Yidong Wang',
 'affiliations': ['Peking University'],
 'homepage': 'https://qianlanwyd.github.io/',
 'paperCount': 42,
 'citationCount': 3563,
 'hIndex': 17,
 'papers': [{'paperId': '0be6c17b47256fee1e960c94ff9c13118e85576e',
   'externalIds': {'ArXiv': '2412.15118',
    'DBLP': 'journals/corr/abs-2412-15118',
    'DOI': '10.48550/arXiv.2412.15118',
    'CorpusId': 274859836},
   'corpusId': 274859836,
   'publicationVenue': {'id': '1901e811-ee72-4b20-8f7e-de08cd395a10',
    'name': 'arXiv.org',
    'alternate_names': ['ArXiv'],
    'issn': '2331-8422',
    'url': 'https://arxiv.org'},
   'url': 'https://www.semanticscholar.org/paper/0be6c17b47256fee1e960c94ff9c13118e85576e',
   'title': 'Outcome-Refining Process Supervision for Code Generation',
   'abstract': None,
   'venue': 'arXiv.org',
   'year': 2024,
   'referenceCount': 92,
   'citationCo

## Paper Recommendations

In [27]:
from models.embedding_models import gemini_embedding_async, semantic_similarity_matrix

def _papers_with_text(papers):
    """Return `(texts, kept_papers)` for the papers that have a title and an abstract.

    Each text is the title and the abstract joined by a newline; `kept_papers`
    holds the matching metadata dicts in the same order.
    """
    texts, kept_papers = [], []
    for paper in papers:
        title, abstract = paper.get('title'), paper.get('abstract')
        if title is None or abstract is None:
            continue
        texts.append(f"{title}\n{abstract}")
        kept_papers.append(paper)
    return texts, kept_papers


init_paper_texts, seed_paper_metadata_filtered = _papers_with_text(seed_paper_metadata)
searched_paper_texts, searched_paper_metadata_filtered = _papers_with_text(searched_paper_metadata)

In [28]:
import numpy as np
# then calculate semantic similarities between the texts
seeds_embeds = await gemini_embedding_async(embed_api_key, embed_model_name, init_paper_texts, 10) # Assuming texts_embed_gen is an async function for IO-bound operations
candits_embeds = await gemini_embedding_async(embed_api_key, embed_model_name, searched_paper_texts, 10) # Assuming texts_embed_gen is an async function for IO-bound operations

# calculate similarity matrix
sim_matrix = semantic_similarity_matrix(seeds_embeds, candits_embeds)
sim_matrix = np.array(sim_matrix)

In [29]:
rows, cols = sim_matrix.shape

# Largest and smallest value of every column, reduced along the row axis.
# list() keeps the elements as NumPy scalars, one per column.
sim_col_max = list(sim_matrix.max(axis=0))
sim_col_min = list(sim_matrix.min(axis=0))

In [30]:
# The three lengths should agree: one max and one min per retained candidate paper.
len(sim_col_max), len(sim_col_min), len(searched_paper_metadata_filtered)

(87, 87, 87)

In [31]:
import copy
# Annotate a deep copy so the scores are not written into
# searched_paper_metadata_filtered itself.
tmp_lst = copy.deepcopy(searched_paper_metadata_filtered)
for idx, item in enumerate(tmp_lst):
    item.update(sim_max=sim_col_max[idx], sim_min=sim_col_min[idx])

Prepare positive ids

In [32]:
# Use the seed ids as positive examples; when there are fewer than five, top up
# with the candidates that are most similar to any seed.
if len(init_ids) < 5:
    # sim max in descending order
    sorted_lst = sorted(tmp_lst, key=lambda item: item['sim_max'], reverse=True)
    cnt = 5 - len(init_ids)

    # A seed paper that also shows up in the search results would rank first
    # here, so skip ids that are already taken, as well as missing ids.
    taken_ids = set(init_ids)
    proposed_ids = []
    for x in sorted_lst:
        candidate_id = x.get('paperId')
        if not candidate_id or candidate_id in taken_ids:
            continue
        taken_ids.add(candidate_id)
        proposed_ids.append(candidate_id)
        if len(proposed_ids) == cnt:
            break

    positive_ids = init_ids + proposed_ids
    print(positive_ids)
else:
    positive_ids = init_ids

In [33]:
# Show the ids that will be sent as positive examples.
positive_ids

['9e57dda195973c4b6c81386b1cc44595ecfd4697',
 '9f3ae8055e227edb413c54417c9c216f1f554f52',
 '69b53faee7ce5c007e4d3e3ea532818ed8d0645d',
 'a6aed0c4e0f39a55edb407f492e41f178a62907f',
 'cdb34c0092a767848ca1de6fa7e3a6b822585fa4']

Prepare negative ids

In [34]:
# Pick the candidates with the lowest minimum similarity as negative examples.
sorted_lst = sorted(tmp_lst, key=lambda item: item['sim_min'], reverse=False)
cnt = 3

# A paper must not be both a positive and a negative example; also skip
# repeated and missing ids.
excluded_ids = set(positive_ids)
negative_ids = []
for x in sorted_lst:
    candidate_id = x.get('paperId')
    if not candidate_id or candidate_id in excluded_ids:
        continue
    excluded_ids.add(candidate_id)
    negative_ids.append(candidate_id)
    if len(negative_ids) == cnt:
        break
print(negative_ids)

['a2f071098bf00a8d113c87751790f3cfd47d28dd', '06ea3b3c4680cc8939b830f0edf260a0d142d76a', '9737ce15ad4645bcf943575b482cc9815a93aa65']


In [35]:
# Request recommendations steered by the positive and negative example ids.
recommendation_args = {
    'positive_paper_ids': positive_ids,
    'negative_paper_ids': negative_ids,
}
recommended_paper_metadata = s2.get_s2_recommended_papers(**recommendation_args)
print(len(recommended_paper_metadata))

2025-03-25 09:39:34,873 - INFO - HTTP Request: POST https://api.semanticscholar.org/recommendations/v1/papers/?fields=abstract%2Cauthors%2CcitationCount%2CcitationStyles%2CcorpusId%2CexternalIds%2CfieldsOfStudy%2CinfluentialCitationCount%2CisOpenAccess%2Cjournal%2CopenAccessPdf%2CpaperId%2CpublicationDate%2CpublicationTypes%2CpublicationVenue%2CreferenceCount%2Cs2FieldsOfStudy%2Ctitle%2Curl%2Cvenue%2Cyear&limit=100 "HTTP/1.1 200 OK"


100


## Topic Search

In [36]:
# Worked example of the expected output; it is inserted into the prompt
# template through its {example_json} placeholder.
keywords_topics_example = {
    "field_of_study": ["Political Science", "Social Media Studies", "Communication Studies", "Sociology, Digital Culture"],
    "keywords_and_topics": ["social media usage", "political polarization", "mixed-methods approach", "semi-structured interviews"],
    "tags": ["online behavior", "echo chambers", "survey methodology", "young adults", "political communication", "digital ethnography", "ideology"],
    "queries": ["youth political polarization", "youth social media usage", "'online behavior' AND 'ideology'"]
}

# Prompt template for the LLM. It is filled with str.format, which substitutes
# the {domain}, {example_json} and {input_text} placeholders.
keywords_topics_prompt = """You are a sophisticated academic scholar with expertise in {domain}. 
You are renowned for your ability to quickly grasp the core concepts of research papers and expertly categorize and tag information for optimal organization and retrieval.

## TASK
You will meticulously analyze the provided text from one or more research papers and conclude the following:
- field_of_study: Propose 2-4 detailed academic fields that these research would logically fall under.
- keywords_and_topics: Identify 3-5 key terms, phrases or topics that accurately capture the specific subject matter and central ideas discussed within the papers. These keywords should be highly relevant and representative within the specific research area.
- tags: Suggest 3-5 concise tags that could be used to further refine the indexing and searchability of the papers. These tags might include specific methodologies, theories, named entities, or emerging concepts mentioned within the texts. They should be specific enough to differentiate the content from the broader categories.
- queries: based on the above information, compose 2-4 queries to search from Google Scholar for more research work on related topics.

## EXAMPLE
Here is an example for demonstraction purpose only. Do not use this specific example in your response, it is solely illustrative.

Input Paragraph:  
Social media usage heighten political polarization in youth - A quantitative study 
This study employed a mixed-methods approach to investigate the impact of social media usage on political polarization among young adults in urban areas. 
Quantitative data was collected through a survey of 500 participants, while qualitative data was gathered via semi-structured interviews with a subset of 25 participants. 
The findings suggest a correlation between increased exposure to ideologically homogeneous content online and heightened political polarization.

Hypothetical Output from this Example (Again, illustrative and not to be used in the actual response):
```json
{example_json}
```

## INSTRUCTIONS
1. Be precise with keywords and topics, avoid overly broad or generic terms.
2. Prioritize terms that are most representative and distinctive for the papers.
3. Only one set of field_of_study, keywords_and_topics, tags, and queries for all the papers from input. Do not output multiple sets.

## INPUT
Now start analyzing the following texts from paper(s).
{input_text}

## OUTPUT
Make sure you output in json with double quotes.
"""

In [37]:
from collections import Counter

# Build one tagged text snippet per seed paper and pool their fields of study.
domains, seed_paper_texts = [], []
for item in seed_paper_metadata:
    title = item.get('title')
    abstract = item.get('abstract')
    info = f"<paper> TITLE: {title}\nABSTRACT: {abstract} </paper>"
    seed_paper_texts.append(info)
    # 'fieldsOfStudy' can be absent or null; treat both as "no fields".
    domains.extend(item.get('fieldsOfStudy') or [])

# The most frequent field becomes the expert domain in the prompt. Fall back to
# a generic label when no seed paper carries any field, instead of raising.
domain = Counter(domains).most_common(1)[0][0] if domains else "academic research"
print(domain)

Computer Science


In [38]:
import json
from json_repair import repair_json
from models.llms import llm_gen_w_retry

# Ask the LLM to propose fields, keywords, tags and search queries.
qa_prompt = keywords_topics_prompt.format(
        domain = domain,
        # Serialise the example as real JSON (double quotes). Formatting the dict
        # directly would insert its Python repr, with single quotes, which
        # contradicts the prompt's own output instruction.
        example_json = json.dumps(keywords_topics_example, indent=2),
        input_text = "\n\n".join(seed_paper_texts)
    )
keywords_topics_info = llm_gen_w_retry(llm_api_key, llm_model_name, qa_prompt, sys_prompt=None, temperature=0.6)

# Extract keywords, topics and queries from the (possibly malformed) LLM output.
try:
    keywords_topics_json = json.loads(repair_json(keywords_topics_info))
except (json.JSONDecodeError, TypeError) as e:
    # TypeError covers a non-string (e.g. None) LLM result reaching the parser.
    print(f"JSON Repair or Decode Error: {e}. Original LLM output: {keywords_topics_info}")
    keywords_topics_json = {}

# Later cells call .get() on the result, so anything but a JSON object is unusable.
if not isinstance(keywords_topics_json, dict):
    print(f"Unexpected LLM output structure ({type(keywords_topics_json).__name__}). Original LLM output: {keywords_topics_info}")
    keywords_topics_json = {}
print(keywords_topics_json)

2025-03-25 09:39:35,466 - INFO - AFC is enabled with max remote calls: 10.
2025-03-25 09:39:37,434 - INFO - AFC remote call 1 is done.


{'field_of_study': ['Natural Language Processing', 'Artificial Intelligence', 'Information Retrieval', 'Meta-Research'], 'keywords_and_topics': ['large language models', 'literature review automation', 'AI-driven research support systems', 'generative adversarial reviews', 'scientific paper generation'], 'tags': ['knowledge graphs', 'peer review simulation', 'AutoSurvey', 'Generative Agent Reviewers (GAR)', 'PaperRobot'], 'queries': ['LLM for literature review', 'AI assisted scientific writing', 'automated peer review', 'knowledge graph for research']}


In [39]:
queries = keywords_topics_json.get('queries')
# The LLM may return a single query string instead of a list; iterating over a
# string would issue one search per character.
if isinstance(queries, str):
    queries = [queries]

s2_paper_metadata = []
for query in queries or []:
    # Skip entries that are not usable search strings.
    if not isinstance(query, str) or not query.strip():
        continue
    # Tolerate a search that returns None rather than a list.
    s2_paper_metadata.extend(s2.search_paper_by_keywords(query, fields=None, limit=100) or [])
    time.sleep(5)  # pause between consecutive API requests
print(len(s2_paper_metadata))

2025-03-25 09:39:39,378 - INFO - HTTP Request: GET https://api.semanticscholar.org/graph/v1/paper/search?query=LLM+for+literature+review&fields=abstract%2Cauthors%2CcitationCount%2CcitationStyles%2CcorpusId%2CexternalIds%2CfieldsOfStudy%2CinfluentialCitationCount%2CisOpenAccess%2Cjournal%2CopenAccessPdf%2CpaperId%2CpublicationDate%2CpublicationTypes%2CpublicationVenue%2CreferenceCount%2Cs2FieldsOfStudy%2Ctitle%2Curl%2Cvenue%2Cyear&offset=0&limit=100 "HTTP/1.1 200 OK"
2025-03-25 09:39:47,263 - INFO - HTTP Request: GET https://api.semanticscholar.org/graph/v1/paper/search?query=AI+assisted+scientific+writing&fields=abstract%2Cauthors%2CcitationCount%2CcitationStyles%2CcorpusId%2CexternalIds%2CfieldsOfStudy%2CinfluentialCitationCount%2CisOpenAccess%2Cjournal%2CopenAccessPdf%2CpaperId%2CpublicationDate%2CpublicationTypes%2CpublicationVenue%2CreferenceCount%2Cs2FieldsOfStudy%2Ctitle%2Curl%2Cvenue%2Cyear&offset=0&limit=100 "HTTP/1.1 200 OK"
2025-03-25 09:39:55,967 - INFO - HTTP Request: GET 

400
