# Paper Metadata

In [1]:
title = "Training Large Language Models to Reason in a Continuous Latent Space"

In [2]:
import sys
import os

# 获取当前脚本所在目录的父目录 (即 my_project)
parent_dir = os.path.dirname(os.getcwd())

# 将父目录添加到 sys.path
sys.path.append(parent_dir)

In [3]:
import time
import requests

from typing import List, Dict, Optional

## Basic Metadata

In [4]:
from apis.arxiv_tool import ArxivKit
from apis.semanticscholar_tool import SemanticScholarKit

In [5]:
arxiv = ArxivKit()

# arxiv_metadata = []
# for title in titles:
#     candit_arxiv_metadata = arxiv.retrieve_metadata_by_paper(query_term=title, max_cnt=3)
#     arxiv_metadata.append(candit_arxiv_metadata)
#     time.sleep(5)

arxiv_metadata = arxiv.retrieve_metadata_by_paper(query_term=title, max_cnt=3)

2025-02-12 15:02:52,985 - INFO - Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=Training+Large+Language+Models+to+Reason+in+a+Continuous+Latent+Space&id_list=&sortBy=relevance&sortOrder=descending&start=0&max_results=100
2025-02-12 15:02:58,698 - INFO - Got first page: 100 of 2656582 total results


In [6]:
ss = SemanticScholarKit()

# ss_metadata = []
# for title in titles:
#     candit_ss_metadata = ss.search_paper_by_keywords(query=title, limit=3)
#     ss_metadata.append(candit_ss_metadata)
#     time.sleep(5)
ss_metadata = ss.search_paper_by_keywords(query=title, limit=3)

2025-02-12 15:03:00,872 - INFO - HTTP Request: GET https://api.semanticscholar.org/graph/v1/paper/search?query=Training%20Large%20Language%20Models%20to%20Reason%20in%20a%20Continuous%20Latent%20Space&fields=abstract,authors,citationCount,citationStyles,corpusId,externalIds,fieldsOfStudy,influentialCitationCount,isOpenAccess,journal,openAccessPdf,paperId,publicationDate,publicationTypes,publicationVenue,referenceCount,s2FieldsOfStudy,title,url,venue,year&offset=0&limit=3 "HTTP/1.1 200 OK"


## References and Citations

In [7]:
# paper_ss_id = ss_metadata[0][0].get('paperId')
paper_ss_id = ss_metadata[0].get('paperId')
print(paper_ss_id)

673fbdd957cada770d10dffca5e45b53da43a3c6


In [8]:
reference_metadata = ss.get_semanticscholar_references(paper_id=paper_ss_id, limit=100)
len(reference_metadata)

2025-02-12 15:03:02,282 - INFO - HTTP Request: GET https://api.semanticscholar.org/graph/v1/paper/673fbdd957cada770d10dffca5e45b53da43a3c6/references?fields=contexts,intents,contextsWithIntent,isInfluential,abstract,authors,citationCount,citationStyles,corpusId,externalIds,fieldsOfStudy,influentialCitationCount,isOpenAccess,journal,openAccessPdf,paperId,publicationDate,publicationTypes,publicationVenue,referenceCount,s2FieldsOfStudy,title,url,venue,year&offset=0&limit=100 "HTTP/1.1 429 "
2025-02-12 15:03:33,871 - INFO - HTTP Request: GET https://api.semanticscholar.org/graph/v1/paper/673fbdd957cada770d10dffca5e45b53da43a3c6/references?fields=contexts,intents,contextsWithIntent,isInfluential,abstract,authors,citationCount,citationStyles,corpusId,externalIds,fieldsOfStudy,influentialCitationCount,isOpenAccess,journal,openAccessPdf,paperId,publicationDate,publicationTypes,publicationVenue,referenceCount,s2FieldsOfStudy,title,url,venue,year&offset=0&limit=100 "HTTP/1.1 429 "
2025-02-12 15:

49

In [12]:
citedby_metadata = ss.get_semanticscholar_citedby(paper_id=paper_ss_id, limit=100)
len(citedby_metadata)

2025-02-12 15:04:52,603 - INFO - HTTP Request: GET https://api.semanticscholar.org/graph/v1/paper/673fbdd957cada770d10dffca5e45b53da43a3c6/citations?fields=contexts,intents,contextsWithIntent,isInfluential,abstract,authors,citationCount,citationStyles,corpusId,externalIds,fieldsOfStudy,influentialCitationCount,isOpenAccess,journal,openAccessPdf,paperId,publicationDate,publicationTypes,publicationVenue,referenceCount,s2FieldsOfStudy,title,url,venue,year&offset=0&limit=100 "HTTP/1.1 200 OK"


9

In [13]:
test_str = reference_metadata[2].get('citedPaper', {}).get('citationStyles').get('bibtex')

In [14]:
test_str

"@Article{Dubey2024TheL3,\n author = {Abhimanyu Dubey and Abhinav Jauhri and Abhinav Pandey and Abhishek Kadian and Ahmad Al-Dahle and Aiesha Letman and Akhil Mathur and Alan Schelten and Amy Yang and Angela Fan and Anirudh Goyal and Anthony S. Hartshorn and Aobo Yang and Archi Mitra and Archie Sravankumar and Artem Korenev and Arthur Hinsvark and Arun Rao and Aston Zhang and Aurélien Rodriguez and Austen Gregerson and Ava Spataru and Bap-tiste Roziere and Bethany Biron and Binh Tang and Bobbie Chern and C. Caucheteux and Chaya Nayak and Chloe Bi and Chris Marra and Chris McConnell and Christian Keller and Christophe Touret and Chunyang Wu and Corinne Wong and Cristian Cantón Ferrer and Cyrus Nikolaidis and Damien Allonsius and Daniel Song and Danielle Pintz and Danny Livshits and David Esiobu and Dhruv Choudhary and Dhruv Mahajan and Diego Garcia-Olano and Diego Perino and Dieuwke Hupkes and Egor Lakomkin and Ehab A. AlBadawy and Elina Lobanova and Emily Dinan and Eric Michael Smith a

In [15]:
import bibtexparser
# Parsing a bibtex string with default values
bib_database = bibtexparser.parse_string(test_str)
# Converting it back to a bibtex string, again with default values
new_bibtex_string = bibtexparser.write_string(bib_database)

In [16]:
new_bibtex_string

"@article{Dubey2024TheL3,\n\tauthor = {Abhimanyu Dubey and Abhinav Jauhri and Abhinav Pandey and Abhishek Kadian and Ahmad Al-Dahle and Aiesha Letman and Akhil Mathur and Alan Schelten and Amy Yang and Angela Fan and Anirudh Goyal and Anthony S. Hartshorn and Aobo Yang and Archi Mitra and Archie Sravankumar and Artem Korenev and Arthur Hinsvark and Arun Rao and Aston Zhang and Aurélien Rodriguez and Austen Gregerson and Ava Spataru and Bap-tiste Roziere and Bethany Biron and Binh Tang and Bobbie Chern and C. Caucheteux and Chaya Nayak and Chloe Bi and Chris Marra and Chris McConnell and Christian Keller and Christophe Touret and Chunyang Wu and Corinne Wong and Cristian Cantón Ferrer and Cyrus Nikolaidis and Damien Allonsius and Daniel Song and Danielle Pintz and Danny Livshits and David Esiobu and Dhruv Choudhary and Dhruv Mahajan and Diego Garcia-Olano and Diego Perino and Dieuwke Hupkes and Egor Lakomkin and Ehab A. AlBadawy and Elina Lobanova and Emily Dinan and Eric Michael Smith 

In [46]:
from pybtex import database

database.parse_string(test_str, bib_format='bibtex')

BibliographyData(
  entries=OrderedCaseInsensitiveDict([
    ('Dubey2024TheL3', Entry('article',
      fields=[
        ('bo
        oktitle', 'arXiv.org'), 
        ('journal', 'ArXiv'), ('title', 'The Llama 3 Herd of Models'), 
        ('volume', 'abs/2407.21783'), 
        ('year', '2024')],
      persons=OrderedCaseInsensitiveDict([('author', [Person('Dubey, Abhimanyu'), Person('Jauhri, Abhinav'), Person('Pandey, Abhinav'), Person('Kadian, Abhishek'), Person('Al-Dahle, Ahmad'), Person('Letman, Aiesha'), Person('Mathur, Akhil'), Person('Schelten, Alan'), Person('Yang, Amy'), Person('Fan, Angela'), Person('Goyal, Anirudh'), Person('Hartshorn, Anthony S.'), Person('Yang, Aobo'), Person('Mitra, Archi'), Person('Sravankumar, Archie'), Person('Korenev, Artem'), Person('Hinsvark, Arthur'), Person('Rao, Arun'), Person('Zhang, Aston'), Person('Rodriguez, Aurélien'), Person('Gregerson, Austen'), Person('Spataru, Ava'), Person('Roziere, Bap-tiste'), Person('Biron, Bethany'), Person('Tang, Bin

In [52]:
import pybtex
from pybtex.style.formatting import plain
style = plain.Style()
eng = pybtex.Engine
eng.format_from_string(bib_string=test_str)

TypeError: Engine.format_from_string() missing 1 required positional argument: 'self'

In [None]:
from pytex import 

In [45]:
from pybtex.database import parse_file
from pybtex.style.formatting import plain
from pybtex.backends import html, latex, markdown, plaintext

# Parse the .bib file
# bib_data = parse_file("my_references.bib")

# Choose a formatting style (e.g., plain, unsrt, alpha)
style = plain.Style()

# Choose a backend for output (e.g., html, latex, markdown, plaintext)
backend = html.Backend()

# Format the bibliography
formatted_bibliography = style.format_from_string(test_str)

# Render the formatted bibliography using the chosen backend
rendered_bibliography = backend.render_bibliography(formatted_bibliography)

AttributeError: 'Style' object has no attribute 'format_from_string'

# Segment Information Extraction

In [17]:
import json

processed_file_path = "pdf_processed_wip_20250212.json"

with open(processed_file_path, 'r') as file:
    processed_json = json.load(file)

## Segments Mapping

## Topic Analysis

In [33]:
import os
from google import genai
from google.genai import types

def llm_gen(api_key, model_name, qa_prompt, sys_prompt=None, temperature=0.3):
    client = genai.Client(api_key=api_key)
    config = types.GenerateContentConfig(
        system_instruction=sys_prompt,
        temperature=temperature)
    response = client.models.generate_content(
        model=model_name, 
        contents=qa_prompt,
        config=config)
    return response.text

def llm_gen_w_images(api_key, model_name, qa_prompt, pil_images, sys_prompt=None, temperature=0.3):
    """q&a with images
    Args:
        pil_images:
            import PIL.Image
            image = PIL.Image.open('/path/to/image.png')
    """

    client = genai.Client(api_key=api_key)

    config = types.GenerateContentConfig(
        system_instruction=sys_prompt,
        temperature=temperature)

    response = client.models.generate_content(
        model=model_name,  #　"gemini-2.0-flash-exp",
        contents=[qa_prompt]+pil_images,
        config=config)

    return response.text

In [34]:
topics_example_json = {
  "topics": [
    {
      "topic": "Performance Advantages of Transformer Networks over RNNs in Machine Translation Tasks",
      "description": "This topic broadly concerns the comparison of Transformer networks and Recurrent Neural Networks (RNNs) in the context of machine translation, focusing on the superior performance characteristics of Transformers.",
      "summary": "The provided text focuses on the significant performance advantages of Transformer networks over traditional Recurrent Neural Network (RNN) based models in machine translation tasks. It argues, based on presented empirical evidence, that Transformers achieve higher BLEU scores, indicating better translation quality, across multiple language pairs and datasets.  The authors specifically attribute this superior performance to the self-attention mechanism within Transformers, which allows for more effective capture of long-range dependencies in the input text compared to the sequential processing inherent in RNNs. The text cites experimental results demonstrating faster training times for Transformers due to their parallelizable architecture, contrasting this with the inherent sequential bottleneck of RNNs.  While acknowledging the potential computational cost of Transformers for extremely long sequences, the authors downplay this limitation in the context of typical machine translation scenarios. They further support their claims by comparing Transformers to convolutional models, arguing for the greater suitability of attention mechanisms for natural language processing. The paper concludes that the shift from recurrent to attention-based models, exemplified by Transformers, represents a major advancement in the field of machine translation. The authors mention, but do not extensively analyze, the limitations imposed by dataset size on the Transformer performance."
    },
    {
        "topic": "Role of Multi-Headed Scaled Dot-Product Self-Attention in Enhancing Contextual Understanding within Transformer Networks",
        "description": "This topic encompasses the specific type of self-attention (scaled dot-product) and its multi-headed variant used in Transformer networks, and how these mechanisms contribute to the model's ability to understand context within input sequences.",
        "summary": "The provided paragraphs delve into the critical role of multi-headed scaled dot-product self-attention in enhancing contextual understanding within Transformer networks. It explains that self-attention allows each word in a sentence to attend to all other words, including itself, to derive a context-aware representation. The scaled dot-product mechanism is presented as a computationally efficient way to calculate attention weights, preventing issues that can arise with large dot products. The text emphasizes the significance of the 'multi-headed' aspect, where multiple self-attention operations are performed in parallel, each learning different aspects of the relationships between words.  This allows the model to capture diverse contextual nuances, such as syntactic and semantic dependencies, simultaneously. The authors argue that this multi-headed approach is crucial for capturing the richness of human language. They contrast this with simpler attention mechanisms, highlighting the ability of multi-headed attention to learn multiple 'representation subspaces'.  The text provides a brief mathematical overview of the scaled dot-product calculation, reinforcing its efficiency and effectiveness. The authors posit that without multi-headed attention, the Transformer's ability to model complex language structures would be significantly diminished. They conclude by highlighting the importance for future works, such as model interpretability and analysis."
    }
  ]
}

topics_prompt = """You are a sophisticated academic scholar with expertise in {domain}. 
You are renowned for your ability to grasp the key topics and ideas of research papers which are significant and insightful.

## TASK
You are provided with a section extracted from an academic paper.
Analyze the provided information and identify key academic topics discussed.  
For each topic, generate a JSON object containing the following:

*   `topic`: A precise and information-rich name for the topic. This should be as specific as possible, potentially combining multiple concepts to accurately reflect the nuanced discussion in the text.  (e.g., 'Application of Transformer Networks to Machine Translation', 'Impact of Multi-Headed Self-Attention on Long-Range Dependency Capture in Transformers').
*   `description`: A concise, general definition of the topic (1-2 sentences). Imagine you are explaining it to a colleague *unfamiliar* with the specific paper, but familiar with AI/NLP in general.  Keep the definition broad enough to encompass the general concept, even if the topic name is very specific.
*   `summary`: A detailed summary (7-10 sentences) of the topic's treatment *within the provided text*. This should include:
    *   The specific arguments made about the topic.
    *   Any evidence or examples the authors use related to the topic.
    *   The authors' conclusions or claims regarding the topic.
    *   Any limitations or critiques of the topic presented by the authors.
    *   Any comparisons to other related concepts or methods.

Output your entire response as a single, valid JSON object. The highest level should be a list called 'topics'.


## EXAMPLE
Example (using a hypothetical excerpt about Transformer Networks):

```json
{example_json}
```

## INPUT
Here are text of the paper section in markdown format:
```markdown
{markdown_text}
```

{further_information}

## OUTPUT
Now get started!

"""

In [None]:
import json
import time
import PIL.Image

api_key = os.getenv('GEMINI_API_KEY_1')
temperature = 0.7
domain = "Artificial Intelligence and LLMs"
tmp_path = "/home/jiezi/Code/Temp/tmp/2412.06769v2"



responses = []
for section in processed_json[3:4]:
    title = section.get('title')
    md_text = section.get('refined_text')
    images = section.get('images')
    if title not in ["References", "Acknowledgments"] and len(md_text) > 200:
        imgs_prompt = ""
        pil_images = []
        if len(images) > 0:
            img_info = ""
            for img in images:
                img_title = img.get('title')
                img_url = os.path.join(tmp_path, img.get('img_path'))
                pil_images.append(PIL.Image.open(img_url))
                img_info += f"- title: {img_title}  attached image: {os.path.basename(img_url)} \n"
            imgs_prompt = f"Here are images mentioned in markdown text:\n{img_info}"
        
            qa_prompt = topics_prompt.format(
                domain = domain,
                example_json = json.dumps(topics_example_json, ensure_ascii=False), 
                markdown_text = md_text,
                further_information = imgs_prompt)

            res = llm_gen_w_images(
                api_key=api_key, model_name='gemini-2.0-flash-thinking-exp', 
                qa_prompt=qa_prompt, pil_images=pil_images, sys_prompt=None, temperature=0.6)

        else:
            qa_prompt = topics_prompt.format(
                domain = domain,
                example_json = json.dumps(topics_example_json, ensure_ascii=False), 
                markdown_text = md_text,
                further_information = "")

            res = llm_gen(
                api_key=api_key, model_name='gemini-2.0-flash-thinking-exp', 
                qa_prompt=qa_prompt, sys_prompt=None, temperature=0.6)
        responses.append(res)
        time.sleep(5)

2025-02-12 15:58:22,514 - INFO - AFC is enabled with max remote calls: 10.


## Research Comprehension

In [None]:
sys_prompt = "You are a researcher in the field of '{subject}' who is good at summarizing papers using concise statements."

summary_prompt = """ ## INSTRUCTION
Given abstraction and introduction paragraph from the paper, you are asked to:                   
1. identify the keywords of this article;
2. summarize according to the following four points
- (1): What is the research background of this article? What problem is this paper trying to solve? 
- (2): What are the relevant studies? What are the past methods? What are the issues with them? Is the approach well motivated?
- (3): How does the paper solve this problem? What is the research methodology proposed in this paper?
- (4): What experiments were done in the paper? On what task and what performance is achieved by the methods in this paper? Can the performance support their goals?
- (5): Are there unsolved issues with the paper? What gaps can be explored further? Any suggestions?

## CONTEXT
Here are abstraction from the paper:
<abstraction>
{abstraction}
</abstraction>

Here are introduction from the paper:
<introduction>
{introduction}
</introduction>

## OUTPUT
Follow the format of the output that follows: 
```text                            
1. Keywords: xxx\n\n     
2. Summary: \n\n
- (1):xxx;\n 
- (2):xxx;\n 
- (3):xxx;\n  
- (4):xxx.\n\n     
- (5):xxx.\n\n  
```

Be sure to use {lang} answers (proper nouns need to be marked in English), statements as concise and academic as possible.
Do not have too much repetitive information, numerical values using the original numbers.
Be sure to strictly follow the format, the corresponding content output to xxx, in accordance with \n line feed.                 
"""

method_prompt = """## INSTRUCTION
Given method paragraph and a summary of a paper, you are asked to describe in detail the methodological idea of this article. 
- (1):...
- (2):...
- (3):...
- .......

## CONTEXT
Here are method paragraph:
<method>
{method}
</method>

Here are summary of the paper fyi:
<summary>
{summary}
</summary>

## OUTPUT
Follow the format of the output that follows: 
```text
3. Methods: \n\n
- (1):xxx;\n 
- (2):xxx;\n 
- (3):xxx;\n  
....... \n\n     
```
Be sure to use {lang} answers (proper nouns need to be marked in English), statements as concise and academic as possible.
Do not repeat the content of the previous <summary>, the value of the use of the original numbers.
Be sure to strictly follow the format, the corresponding content output to xxx, in accordance with \n line feed, ....... means fill in according to the actual requirements.                 
"""
 
conclusion_prompt = """## INSTRUCTION
Given conclusion paragraph and a summary of a paper, you are asked to: 
4. Make the following summary:
- (1):What is the significance of this piece of work?
- (2):Summarize the strengths and weaknesses of this article in three dimensions: innovation point, performance, and workload.                   
.......

    "contribution": "What is the contribution of this paper?",
    "novelty": "What is the novelty of this paper?",
    "strength": "What are the strengths of this paper?",
    "drawback": "What are the drawbacks of this paper?",
    "improvement": "What might be the improvements of this paper?",


## CONTEXT
Here are conclusion paragraph:
<conclusion>
{conclusion}
</conclusion>

Here are summary of the paper fyi:
<summary>
{summary}
</summary>

## OUTPUT
Follow the format of the output later: 
```text
4. Conclusion: \n\n
- (1):xxx;\n                     
- (2):Innovation point: xxx; Performance: xxx; Workload: xxx;\n    
- (3):
    contribution: What is the contribution of this paper?,
    novelty: What is the novelty of this paper?,
    strength": What are the strengths of this paper?,
    drawback: What are the drawbacks of this paper?,
    improvement": What might be the improvements of this paper?
```

Be sure to use {lang} answers (proper nouns need to be marked in English), statements as concise and academic as possible.
Do not repeat the content of the previous <summary>, the value of the use of the original numbers.
Be sure to strictly follow the format, the corresponding content output to xxx, in accordance with \n line feed, ....... means fill in according to the actual requirements.                 
"""

# Highlight and Investigation

# Conclusion