# Paper Metadata

In [1]:
# Exact title of the paper under study; the arXiv and Semantic Scholar lookups below use it as the search query.
title = "Training Large Language Models to Reason in a Continuous Latent Space"

In [2]:
import sys
import os

# Parent of the kernel's current working directory. Note that this is derived from
# os.getcwd(), not from the notebook file's location, so it only points at the
# project root when the kernel is started from the notebook's own folder.
parent_dir = os.path.dirname(os.getcwd())

# Make that directory importable. The membership check keeps the cell idempotent:
# re-running it no longer appends the same entry to sys.path again and again.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [3]:
import time
import requests

from typing import List, Dict, Optional

## Basic Metadata

In [4]:
from apis.arxiv_tool import ArxivKit
from apis.semanticscholar_tool import SemanticScholarKit

In [5]:
# Client for arXiv lookups (project-local wrapper imported from apis.arxiv_tool).
arxiv = ArxivKit()

# Batch variant for a list of titles, kept for reference only (not executed):
# arxiv_metadata = []
# for title in titles:
#     candit_arxiv_metadata = arxiv.retrieve_metadata_by_paper(query_term=title, max_cnt=3)
#     arxiv_metadata.append(candit_arxiv_metadata)
#     time.sleep(5)

# Look the paper up on arXiv using its title as the query term.
# NOTE(review): max_cnt=3 presumably caps the number of candidates kept — confirm in ArxivKit.
arxiv_metadata = arxiv.retrieve_metadata_by_paper(query_term=title, max_cnt=3)

2025-02-13 09:31:37,768 - INFO - Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=Training+Large+Language+Models+to+Reason+in+a+Continuous+Latent+Space&id_list=&sortBy=relevance&sortOrder=descending&start=0&max_results=100
2025-02-13 09:31:42,240 - INFO - Got first page: 100 of 2656582 total results


In [6]:
# Client for Semantic Scholar lookups (project-local wrapper imported from apis.semanticscholar_tool).
ss = SemanticScholarKit()

# Batch variant for a list of titles, kept for reference only (not executed):
# ss_metadata = []
# for title in titles:
#     candit_ss_metadata = ss.search_paper_by_keywords(query=title, limit=3)
#     ss_metadata.append(candit_ss_metadata)
#     time.sleep(5)
# Keyword search using the paper title as the query; later cells index the result
# with [0] and call .get(...) on the entry, i.e. they treat it as a sequence of dict-like records.
ss_metadata = ss.search_paper_by_keywords(query=title, limit=3)

2025-02-13 09:31:43,715 - INFO - HTTP Request: GET https://api.semanticscholar.org/graph/v1/paper/search?query=Training%20Large%20Language%20Models%20to%20Reason%20in%20a%20Continuous%20Latent%20Space&fields=abstract,authors,citationCount,citationStyles,corpusId,externalIds,fieldsOfStudy,influentialCitationCount,isOpenAccess,journal,openAccessPdf,paperId,publicationDate,publicationTypes,publicationVenue,referenceCount,s2FieldsOfStudy,title,url,venue,year&offset=0&limit=3 "HTTP/1.1 200 OK"


## References and Citations

In [7]:
# Earlier variant, from when the search result was nested one level deeper:
# paper_ss_id = ss_metadata[0][0].get('paperId')

# The first search hit is taken to be the target paper. Fail with an explicit
# message when the search came back empty instead of an opaque IndexError.
if not ss_metadata:
    raise ValueError("Semantic Scholar search returned no candidates; cannot resolve a paperId.")
paper_ss_id = ss_metadata[0].get('paperId')
print(paper_ss_id)

673fbdd957cada770d10dffca5e45b53da43a3c6


In [8]:
# Fetch the works this paper cites from Semantic Scholar.
# NOTE(review): limit=100 looks like a single-page cap — papers with more references
# would presumably be truncated; confirm against get_semanticscholar_references.
reference_metadata = ss.get_semanticscholar_references(paper_id=paper_ss_id, limit=100)
# Display how many reference records came back.
len(reference_metadata)

2025-02-13 09:31:48,942 - INFO - HTTP Request: GET https://api.semanticscholar.org/graph/v1/paper/673fbdd957cada770d10dffca5e45b53da43a3c6/references?fields=contexts,intents,contextsWithIntent,isInfluential,abstract,authors,citationCount,citationStyles,corpusId,externalIds,fieldsOfStudy,influentialCitationCount,isOpenAccess,journal,openAccessPdf,paperId,publicationDate,publicationTypes,publicationVenue,referenceCount,s2FieldsOfStudy,title,url,venue,year&offset=0&limit=100 "HTTP/1.1 200 OK"


49

In [9]:
# Fetch the works that cite this paper from Semantic Scholar.
# NOTE(review): limit=100 looks like a single-page cap — a highly cited paper
# would presumably be truncated; confirm against get_semanticscholar_citedby.
citedby_metadata = ss.get_semanticscholar_citedby(paper_id=paper_ss_id, limit=100)
# Display how many citing records came back.
len(citedby_metadata)

2025-02-13 09:31:52,430 - INFO - HTTP Request: GET https://api.semanticscholar.org/graph/v1/paper/673fbdd957cada770d10dffca5e45b53da43a3c6/citations?fields=contexts,intents,contextsWithIntent,isInfluential,abstract,authors,citationCount,citationStyles,corpusId,externalIds,fieldsOfStudy,influentialCitationCount,isOpenAccess,journal,openAccessPdf,paperId,publicationDate,publicationTypes,publicationVenue,referenceCount,s2FieldsOfStudy,title,url,venue,year&offset=0&limit=100 "HTTP/1.1 200 OK"


9

# Segment Information Extraction

In [10]:
import json

# Pre-processed paper content. Later cells iterate over it and read the keys
# 'level', 'title', 'refined_text', 'lines' and 'images' from each entry.
processed_file_path = "pdf_processed_wip_20250212.json"

# Read as UTF-8 explicitly: without it open() falls back to the platform's
# default encoding, which can mis-decode non-ASCII paper text on some systems.
with open(processed_file_path, 'r', encoding='utf-8') as file:
    processed_json = json.load(file)

In [38]:
import os
from google import genai
from google.genai import types

def llm_gen(api_key, model_name, qa_prompt, sys_prompt=None, temperature=0.3):
    """Send a text-only prompt through the google-genai client and return the reply text.

    Args:
        api_key: API key handed to ``genai.Client``.
        model_name: Model identifier passed as ``model``.
        qa_prompt: Prompt passed as ``contents``.
        sys_prompt: Optional system instruction for the generation config.
        temperature: Sampling temperature for the generation config.

    Returns:
        The ``text`` attribute of the response object.
    """
    # A new client is created on every call.
    gemini_client = genai.Client(api_key=api_key)
    generation_config = types.GenerateContentConfig(
        system_instruction=sys_prompt,
        temperature=temperature,
    )
    reply = gemini_client.models.generate_content(
        model=model_name,
        contents=qa_prompt,
        config=generation_config,
    )
    return reply.text

def llm_image_gen(api_key, model_name, qa_prompt, pil_images, sys_prompt=None, temperature=0.3):
    """Send a prompt together with images through the google-genai client and return the reply text.

    Args:
        api_key: API key handed to ``genai.Client``.
        model_name: Model identifier passed as ``model`` (e.g. "gemini-2.0-flash-exp").
        qa_prompt: Text prompt; it is placed first in ``contents``.
        pil_images: List of images appended after the prompt. It is concatenated
            with ``+`` to a list, so it must itself be a list. Images can be
            created with ``PIL.Image.open('/path/to/image.png')``.
        sys_prompt: Optional system instruction for the generation config.
        temperature: Sampling temperature for the generation config.

    Returns:
        The ``text`` attribute of the response object.
    """
    gemini_client = genai.Client(api_key=api_key)
    generation_config = types.GenerateContentConfig(
        system_instruction=sys_prompt,
        temperature=temperature,
    )
    # Prompt first, then the images in the order given.
    multimodal_contents = [qa_prompt] + pil_images
    reply = gemini_client.models.generate_content(
        model=model_name,
        contents=multimodal_contents,
        config=generation_config,
    )
    return reply.text

def llm_gen_w_retry(api_key, model_name, qa_prompt, sys_prompt=None, temperature=0.3, max_retries=3, initial_delay=1):
    """
    Call llm_gen and retry with exponential backoff when the call is rate limited.

    An exception counts as a rate-limit error when it carries ``code == 429``.
    Any other exception is reported and ends the attempt immediately. Errors are
    never re-raised: the function prints a message and returns None instead.

    Args:
        api_key: API key for the LLM service.
        model_name: Name of the LLM model to use.
        qa_prompt: Question and answer prompt for the LLM.
        sys_prompt: Optional system prompt for the LLM.
        temperature: Temperature for LLM response generation.
        max_retries: Maximum number of retries after the first attempt.
        initial_delay: Delay in seconds before the first retry; doubled after each retry.

    Returns:
        str: The text response from the LLM, or None if the call failed with a
        non-rate-limit error or was still rate limited after max_retries retries.
    """
    delay = initial_delay

    # One initial attempt plus up to max_retries retries.
    for attempt in range(max_retries + 1):
        try:
            return llm_gen(api_key, model_name, qa_prompt, sys_prompt, temperature)
        except Exception as e:
            # Only some exceptions expose .code / .message; plain ones (ValueError,
            # connection errors, ...) do not, so read both defensively rather than
            # letting an AttributeError escape from the error handler itself.
            code = getattr(e, "code", None)
            message = getattr(e, "message", str(e))

            if code != 429:
                print(f"Error Code: {code} Error Message: {message}")
                return None

            if attempt >= max_retries:
                print("Max retries reached. Returning None.")
                return None

            print(f"Rate limit exceeded. Retrying in {delay} seconds (Retry {attempt + 1}/{max_retries})...")
            time.sleep(delay)
            delay *= 2  # Exponential backoff for delay

    return None  # Only reached when max_retries is negative and no attempt was made

def llm_image_gen_w_retry(api_key, model_name, qa_prompt, pil_images, sys_prompt=None, temperature=0.3, max_retries=3, initial_delay=1):
    """
    Call llm_image_gen and retry with exponential backoff when the call is rate limited.

    An exception counts as a rate-limit error when it carries ``code == 429``.
    Any other exception is reported and ends the attempt immediately. Errors are
    never re-raised: the function prints a message and returns None instead.

    Args:
        api_key: API key for the LLM service.
        model_name: Name of the LLM model to use.
        qa_prompt: Question and answer prompt for the LLM.
        pil_images: List of PIL.Image objects.
        sys_prompt: Optional system prompt for the LLM.
        temperature: Temperature for LLM response generation.
        max_retries: Maximum number of retries after the first attempt.
        initial_delay: Delay in seconds before the first retry; doubled after each retry.

    Returns:
        str: The text response from the LLM, or None if the call failed with a
        non-rate-limit error or was still rate limited after max_retries retries.
    """
    delay = initial_delay

    # One initial attempt plus up to max_retries retries.
    for attempt in range(max_retries + 1):
        try:
            return llm_image_gen(api_key, model_name, qa_prompt, pil_images, sys_prompt, temperature)
        except Exception as e:
            # Only some exceptions expose .code / .message; plain ones (ValueError,
            # connection errors, ...) do not, so read both defensively rather than
            # letting an AttributeError escape from the error handler itself.
            code = getattr(e, "code", None)
            message = getattr(e, "message", str(e))

            if code != 429:
                print(f"Error Code: {code} Error Message: {message}")
                return None

            if attempt >= max_retries:
                print("Max retries reached. Returning None.")
                return None

            print(f"Rate limit exceeded. Retrying in {delay} seconds (Retry {attempt + 1}/{max_retries})...")
            time.sleep(delay)
            delay *= 2  # Exponential backoff for delay

    return None  # Only reached when max_retries is negative and no attempt was made


## Topic Analysis

In [39]:
# Worked example of the expected topic-analysis output. It is serialised with
# json.dumps and substituted for {example_json} in `topics_prompt`.
# Each topic carries: topic, description, summary and line_ids.
topics_example_json = {
  "topics": [
    {
      "topic": "Performance Advantages of Transformer Networks over RNNs in Machine Translation Tasks",
      "description": "This topic broadly concerns the comparison of Transformer networks and Recurrent Neural Networks (RNNs) in the context of machine translation, focusing on the superior performance characteristics of Transformers.",
      "summary": "The provided text focuses on the significant performance advantages of Transformer networks over traditional Recurrent Neural Network (RNN) based models in machine translation tasks. It argues, based on presented empirical evidence, that Transformers achieve higher BLEU scores, indicating better translation quality, across multiple language pairs and datasets.  The authors specifically attribute this superior performance to the self-attention mechanism within Transformers, which allows for more effective capture of long-range dependencies in the input text compared to the sequential processing inherent in RNNs. The text cites experimental results demonstrating faster training times for Transformers due to their parallelizable architecture, contrasting this with the inherent sequential bottleneck of RNNs.  While acknowledging the potential computational cost of Transformers for extremely long sequences, the authors downplay this limitation in the context of typical machine translation scenarios. They further support their claims by comparing Transformers to convolutional models, arguing for the greater suitability of attention mechanisms for natural language processing. The paper concludes that the shift from recurrent to attention-based models, exemplified by Transformers, represents a major advancement in the field of machine translation. The authors mention, but do not extensively analyze, the limitations imposed by dataset size on the Transformer performance.",
      "line_ids": [1, 2, 3]
    },
    {
      "topic": "Role of Multi-Headed Scaled Dot-Product Self-Attention in Enhancing Contextual Understanding within Transformer Networks",
      "description": "This topic encompasses the specific type of self-attention (scaled dot-product) and its multi-headed variant used in Transformer networks, and how these mechanisms contribute to the model's ability to understand context within input sequences.",
      "summary": "The provided paragraphs delve into the critical role of multi-headed scaled dot-product self-attention in enhancing contextual understanding within Transformer networks. It explains that self-attention allows each word in a sentence to attend to all other words, including itself, to derive a context-aware representation. The scaled dot-product mechanism is presented as a computationally efficient way to calculate attention weights, preventing issues that can arise with large dot products. The text emphasizes the significance of the 'multi-headed' aspect, where multiple self-attention operations are performed in parallel, each learning different aspects of the relationships between words.  This allows the model to capture diverse contextual nuances, such as syntactic and semantic dependencies, simultaneously. The authors argue that this multi-headed approach is crucial for capturing the richness of human language. They contrast this with simpler attention mechanisms, highlighting the ability of multi-headed attention to learn multiple 'representation subspaces'.  The text provides a brief mathematical overview of the scaled dot-product calculation, reinforcing its efficiency and effectiveness. The authors posit that without multi-headed attention, the Transformer's ability to model complex language structures would be significantly diminished. They conclude by highlighting the importance for future works, such as model interpretability and analysis.",
      "line_ids": [7, 8, 9, 10]
    }
  ]
}

# Prompt template for per-section topic analysis, filled in with str.format.
# Placeholders: {domain}, {example_json}, {markdown_text}, {further_information}.
# The template contains no other braces, so no brace escaping is needed.
topics_prompt = """You are a sophisticated academic scholar with expertise in {domain}. 
You are renowned for your ability to grasp the key topics and ideas of research papers which are significant and insightful.

## TASK
You are provided with a section of lines extracted from an academic paper.
Analyze the provided information and identify key academic topics discussed.  
For each topic, generate a JSON object containing the following:

*   `topic`: A precise and information-rich name for the topic. This should be as specific as possible, potentially combining multiple concepts to accurately reflect the nuanced discussion in the text.  (e.g., 'Application of Transformer Networks to Machine Translation', 'Impact of Multi-Headed Self-Attention on Long-Range Dependency Capture in Transformers').
*   `description`: A concise, general definition of the topic (1-2 sentences). Imagine you are explaining it to a colleague *unfamiliar* with the specific paper, but familiar with AI/NLP in general.  Keep the definition broad enough to encompass the general concept, even if the topic name is very specific.
*   `summary`: A detailed summary of the topic's treatment *within the provided text*. This should include:
    *   The specific arguments made about the topic.
    *   Any evidence or examples the authors use related to the topic.
    *   The authors' conclusions or claims regarding the topic.
    *   Any limitations or critiques of the topic presented by the authors.
    *   Any comparisons to other related concepts or methods.
*   `line_ids`: Categorize section lines based on its closeness to topics. Put only line ids here. Make sure the line ids exist in input and do not fake. Ideally each line only correspond to one topic. 
Output your entire response as a single, valid JSON object. The highest level should be a list called 'topics'.


## EXAMPLE
Example (using a hypothetical excerpt about Transformer Networks):

```json
{example_json}
```

## INPUT
Here are the section of lines for the paper in markdown format:
```markdown
{markdown_text}
```

{further_information}

## OUTPUT
Now get started!

"""

In [40]:
import json
import time
import PIL.Image

# --- Run configuration for the topic-analysis pass ---
api_key = os.getenv('GEMINI_API_KEY_1')
# NOTE(review): `temperature` is never passed on — the calls below hard-code 0.6. Confirm which value is intended.
temperature = 0.7
domain = "Artificial Intelligence and LLMs"
# NOTE(review): machine-specific absolute path; each image's 'img_path' is resolved against it.
tmp_path = "/home/jiezi/Code/Temp/tmp/2412.06769v2"

# Sections that are never analysed (matched case-insensitively on the bare title).
skipped_section_titles = ("references", "acknowledgments")
# The example is identical for every section, so serialise it once.
topics_example_text = json.dumps(topics_example_json, ensure_ascii=False)

responses = []
for section in processed_json:
    # Loop-specific names: reusing `title` here would overwrite the paper title
    # that the metadata cells at the top of the notebook use as their query.
    section_title = section.get('title') or ""
    section_heading = "#" * section.get('level') + " " + section_title
    md_text = section.get('refined_text') or ""

    # Skip back matter and sections too short to carry a topic. The comparison must
    # use the bare title: the "#"-prefixed heading can never equal "References".
    if section_title.strip().lower() in skipped_section_titles or len(md_text) <= 200:
        continue

    # One tagged entry per line so the model can refer back to line ids.
    # Closing tags use "/" ("\l" is an invalid escape sequence in a string literal).
    md_lines = "\n".join(
        f"<line_id> {x.get('id')} </line_id>  <line_text> {x.get('line')} </line_text>"
        for x in section.get('lines') or [])
    input_text = section_heading + "\n" + md_lines

    pil_images = []
    try:
        img_info = ""
        for img in section.get('images') or []:
            img_title = img.get('title')
            img_url = os.path.join(tmp_path, img.get('img_path'))
            pil_images.append(PIL.Image.open(img_url))
            img_info += f"- image title: {img_title}  attached image: {os.path.basename(img_url)} \n"
        imgs_prompt = f"Here are images mentioned in markdown text:\n{img_info}" if pil_images else ""

        qa_prompt = topics_prompt.format(
            domain=domain,
            example_json=topics_example_text,
            markdown_text=input_text,
            further_information=imgs_prompt)

        # Sections with figures go through the multimodal call, the rest through the text-only one.
        if pil_images:
            res = llm_image_gen_w_retry(
                api_key=api_key, model_name='gemini-2.0-flash-thinking-exp',
                qa_prompt=qa_prompt, pil_images=pil_images, sys_prompt=None, temperature=0.6)
        else:
            res = llm_gen_w_retry(
                api_key=api_key, model_name='gemini-2.0-flash-thinking-exp',
                qa_prompt=qa_prompt, sys_prompt=None, temperature=0.6)
    finally:
        # Release the image file handles once the request is done (or has failed).
        for pil_image in pil_images:
            pil_image.close()

    # `res` is None when the call ultimately failed; it is still recorded here.
    responses.append(res)
    time.sleep(5)  # fixed pause between requests

2025-02-13 11:29:05,849 - INFO - AFC is enabled with max remote calls: 10.
2025-02-13 11:29:28,850 - INFO - AFC is enabled with max remote calls: 10.


Rate limit exceeded. Retrying in 1 seconds (Retry 1/3)...


2025-02-13 11:29:31,406 - INFO - AFC is enabled with max remote calls: 10.
2025-02-13 11:29:52,404 - INFO - AFC is enabled with max remote calls: 10.
2025-02-13 11:30:08,788 - INFO - AFC is enabled with max remote calls: 10.


Rate limit exceeded. Retrying in 1 seconds (Retry 1/3)...


2025-02-13 11:30:11,750 - INFO - AFC is enabled with max remote calls: 10.
2025-02-13 11:30:34,086 - INFO - AFC is enabled with max remote calls: 10.
2025-02-13 11:31:03,679 - INFO - AFC is enabled with max remote calls: 10.
2025-02-13 11:31:30,718 - INFO - AFC is enabled with max remote calls: 10.
2025-02-13 11:31:45,718 - INFO - AFC is enabled with max remote calls: 10.
2025-02-13 11:32:40,865 - INFO - AFC is enabled with max remote calls: 10.
2025-02-13 11:33:01,437 - INFO - AFC is enabled with max remote calls: 10.
2025-02-13 11:33:16,255 - INFO - AFC is enabled with max remote calls: 10.


Rate limit exceeded. Retrying in 1 seconds (Retry 1/3)...


2025-02-13 11:33:18,427 - INFO - AFC is enabled with max remote calls: 10.


In [51]:
# Persist the collected topic-analysis responses (the `responses` list) as indented JSON.
topic_wip_json_path = "topic_analysis_wip_20250213.json"

with open(topic_wip_json_path, "w") as file:
    file.write(json.dumps(responses, indent=4))

## Keywords Extraction

In [49]:
# Worked example embedded in `additional_info_prompt` via {example_json}.
# Its keys mirror the four fields that prompt asks for (field_of_study, keywords,
# tags, section_class), and the section_class values are taken from the prompt's
# candidate class list so the example does not contradict the instructions.
example_json = {
    "field_of_study": ["Political Science", "Social Media Studies", "Communication Studies", "Sociology, Digital Culture"],
    "keywords": ["social media usage", "political polarization", "mixed-methods approach", "semi-structured interviews"],
    "tags": ["online behavior", "echo chambers", "survey methodology", "young adults", "political communication", "digital ethnography", "ideology"],
    "section_class": ["Methodology and Approach", "Analysis and Findings"]
}

# Prompt template for per-section keyword / tag / classification extraction,
# filled in with str.format. Placeholders: {domain}, {example_json}, {markdown_text}.
# Instruction 1 asks for a JSON object so that it agrees with the
# "output in json" requirement in the TASK section.
additional_info_prompt = """You are a sophisticated academic scholar with expertise in {domain}. 
You are renowned for your ability to quickly grasp the core concepts of research papers and expertly categorize and tag information for optimal organization and retrieval.

## TASK
When presented with some texts extracted from a research paper, you will meticulously analyze its content and provide the following:
- field_of_study: Propose 2-4 detailed academic categories that this research paragraph would logically fall under. These categories should help situate the research within the fields of study. Consider the interdisciplinary nature of the paragraph as well.
- keywords: Identify 3-5 key terms or phrases that accurately capture the specific subject matter and central ideas discussed within the paragraph. These keywords should be highly relevant and commonly used within the specific research area.
- tags: Suggest 4-6 concise tags that could be used to further refine the indexing and searchability of the paragraph. These tags might include specific methodologies, theories, named entities, or emerging concepts mentioned within the text. They should be specific enough to differentiate the content from the broader categories.
- section_class: Classify given excerpts based on their content and typical function within a research paper. **Remember to output ONLY the class names, ordered by descending closeness if more than one.** Candidate classes are:
    * "Abstract"
    * "Introduction, Background, and Motivation"
    * "Related Work, and Literature Review"
    * "Methodology and Approach"
    * "Experiment"
    * "Analysis and Findings"
    * "Conclusion"
    * "References"
    * "Acknowledgments"
    * "FAQ"
    * "Code and Examples"

Make sure you output in json with double quotes.

## EXAMPLE
Here is an example for demonstration purpose only. Do not use this specific example in your response, it is solely illustrative.

Input Paragraph:  
 ```
"This study employed a mixed-methods approach to investigate the impact of social media usage on political polarization among young adults in urban areas. 
Quantitative data was collected through a survey of 500 participants, while qualitative data was gathered via semi-structured interviews with a subset of 25 participants. 
The findings suggest a correlation between increased exposure to ideologically homogeneous content online and heightened political polarization."
 ```

Hypothetical Output from this Example (Again, illustrative and not to be used in the actual response):
```json
{example_json}
```

## INSTRUCTIONS
1. Your response should be a single JSON object with the keys field_of_study, keywords, tags and section_class, and no text outside the JSON.
2. Be precise and avoid overly broad or generic terms.
3. Prioritize terms that are commonly used within the relevant academic field.
4. Focus on accurate representation of the content provided.
5. Ensure that categories, keywords, and tags are directly relevant to the specific area of expertise you are embodying.
6. Please analyze the following paragraph and provide your expert recommendations:

## INPUT
Here is the section text in markdown format.
```markdown
{markdown_text}
```
"""

In [50]:
import json
import time

# --- Run configuration for the keyword-extraction pass ---
api_key = os.getenv('GEMINI_API_KEY_1')
# NOTE(review): `temperature` is never passed on — the call below hard-codes 0.6. Confirm which value is intended.
temperature = 0.7
domain = "Artificial Intelligence and LLMs"
# NOTE(review): not used in this cell; kept so the name stays defined for other cells.
tmp_path = "/home/jiezi/Code/Temp/tmp/2412.06769v2"

# Sections that are never analysed (matched case-insensitively on the bare title).
skipped_section_titles = ("references", "acknowledgments")
# The example is identical for every section, so serialise it once.
keyinfo_example_text = json.dumps(example_json, ensure_ascii=False)

addtional_infos = []
for section in processed_json:
    # Loop-specific names: reusing `title` here would overwrite the paper title
    # that the metadata cells at the top of the notebook use as their query.
    section_title = section.get('title') or ""
    section_heading = "#" * section.get('level') + " " + section_title
    md_text = section.get('refined_text') or ""

    # Skip back matter and sections too short to classify. The comparison must use
    # the bare title: the "#"-prefixed heading can never equal "References".
    if section_title.strip().lower() in skipped_section_titles or len(md_text) <= 200:
        continue

    input_text = section_heading + "\n" + md_text
    qa_prompt = additional_info_prompt.format(
        domain=domain,
        example_json=keyinfo_example_text,
        markdown_text=input_text)

    res = llm_gen_w_retry(
        api_key=api_key, model_name='gemini-2.0-flash-thinking-exp',
        qa_prompt=qa_prompt, sys_prompt=None, temperature=0.6)
    # `res` is None when the call ultimately failed; it is still recorded here.
    addtional_infos.append(res)
    time.sleep(5)  # fixed pause between requests

2025-02-13 12:02:13,656 - INFO - AFC is enabled with max remote calls: 10.
2025-02-13 12:02:24,075 - INFO - AFC is enabled with max remote calls: 10.


Rate limit exceeded. Retrying in 1 seconds (Retry 1/3)...


2025-02-13 12:02:25,931 - INFO - AFC is enabled with max remote calls: 10.
2025-02-13 12:02:36,219 - INFO - AFC is enabled with max remote calls: 10.
2025-02-13 12:02:49,238 - INFO - AFC is enabled with max remote calls: 10.
2025-02-13 12:03:08,182 - INFO - AFC is enabled with max remote calls: 10.
2025-02-13 12:03:17,301 - INFO - AFC is enabled with max remote calls: 10.
2025-02-13 12:03:28,662 - INFO - AFC is enabled with max remote calls: 10.


Rate limit exceeded. Retrying in 1 seconds (Retry 1/3)...


2025-02-13 12:03:30,702 - INFO - AFC is enabled with max remote calls: 10.
2025-02-13 12:03:45,140 - INFO - AFC is enabled with max remote calls: 10.
2025-02-13 12:03:57,540 - INFO - AFC is enabled with max remote calls: 10.
2025-02-13 12:04:07,781 - INFO - AFC is enabled with max remote calls: 10.
2025-02-13 12:04:18,580 - INFO - AFC is enabled with max remote calls: 10.


In [52]:
# Persist the collected keyword-extraction responses (the `addtional_infos` list) as indented JSON.
keyinfo_wip_json_path = "keyinfo_analysis_wip_20250213.json"

with open(keyinfo_wip_json_path, "w") as file:
    file.write(json.dumps(addtional_infos, indent=4))

# Overall Research Comprehension

In [None]:
# System-prompt template; the {subject} placeholder must be filled (e.g. with str.format) before use.
sys_prompt = "You are a researcher in the field of '{subject}' who is good at summarizing papers using concise statements."

# Prompt template for the paper-level summary, filled in with str.format.
# Placeholders: {abstraction}, {introduction}, {lang}.
# NOTE(review): this is not a raw string, so every "\n" below becomes a real line
# break in the prompt rather than the two characters "\n" — confirm that is intended.
summary_prompt = """ ## INSTRUCTION
Given abstraction and introduction paragraph from the paper, you are asked to:                   
1. identify the keywords of this article;
2. summarize according to the following five points
- (1): What is the research background of this article? What problem is this paper trying to solve? 
- (2): What are the relevant studies? What are the past methods? What are the issues with them? Is the approach well motivated?
- (3): How does the paper solve this problem? What is the research methodology proposed in this paper?
- (4): What experiments were done in the paper? On what task and what performance is achieved by the methods in this paper? Can the performance support their goals?
- (5): Are there unsolved issues with the paper? What gaps can be explored further? Any suggestions?

## CONTEXT
Here are abstraction from the paper:
<abstraction>
{abstraction}
</abstraction>

Here are introduction from the paper:
<introduction>
{introduction}
</introduction>

## OUTPUT
Follow the format of the output that follows: 
```text                            
1. Keywords: xxx\n\n     
2. Summary: \n\n
- (1):xxx;\n 
- (2):xxx;\n 
- (3):xxx;\n  
- (4):xxx.\n\n     
- (5):xxx.\n\n  
```

Be sure to use {lang} answers (proper nouns need to be marked in English), statements as concise and academic as possible.
Do not have too much repetitive information, numerical values using the original numbers.
Be sure to strictly follow the format, the corresponding content output to xxx, in accordance with \n line feed.                 
"""

# Prompt template for describing the paper's methodology, filled in with str.format.
# Placeholders: {method}, {summary}, {lang}.
# NOTE(review): this is not a raw string, so every "\n" below becomes a real line
# break in the prompt rather than the two characters "\n" — confirm that is intended.
method_prompt = """## INSTRUCTION
Given method paragraph and a summary of a paper, you are asked to describe in detail the methodological idea of this article. 
- (1):...
- (2):...
- (3):...
- .......

## CONTEXT
Here are method paragraph:
<method>
{method}
</method>

Here are summary of the paper fyi:
<summary>
{summary}
</summary>

## OUTPUT
Follow the format of the output that follows: 
```text
3. Methods: \n\n
- (1):xxx;\n 
- (2):xxx;\n 
- (3):xxx;\n  
....... \n\n     
```
Be sure to use {lang} answers (proper nouns need to be marked in English), statements as concise and academic as possible.
Do not repeat the content of the previous <summary>, the value of the use of the original numbers.
Be sure to strictly follow the format, the corresponding content output to xxx, in accordance with \n line feed, ....... means fill in according to the actual requirements.                 
"""
 
# Prompt template for the conclusion-level assessment, filled in with str.format.
# Placeholders: {conclusion}, {summary}, {lang}.
# The five review questions are listed as item (3), and the OUTPUT block names the
# same five keys without stray quotes, so instruction and format agree.
# NOTE(review): this is not a raw string, so every "\n" below becomes a real line
# break in the prompt rather than the two characters "\n" — confirm that is intended.
conclusion_prompt = """## INSTRUCTION
Given conclusion paragraph and a summary of a paper, you are asked to: 
4. Make the following summary:
- (1):What is the significance of this piece of work?
- (2):Summarize the strengths and weaknesses of this article in three dimensions: innovation point, performance, and workload.                   
- (3):Answer each of the following questions:
    contribution: What is the contribution of this paper?
    novelty: What is the novelty of this paper?
    strength: What are the strengths of this paper?
    drawback: What are the drawbacks of this paper?
    improvement: What might be the improvements of this paper?


## CONTEXT
Here are conclusion paragraph:
<conclusion>
{conclusion}
</conclusion>

Here are summary of the paper fyi:
<summary>
{summary}
</summary>

## OUTPUT
Follow the format of the output later: 
```text
4. Conclusion: \n\n
- (1):xxx;\n                     
- (2):Innovation point: xxx; Performance: xxx; Workload: xxx;\n    
- (3):
    contribution: xxx;
    novelty: xxx;
    strength: xxx;
    drawback: xxx;
    improvement: xxx.
```

Be sure to use {lang} answers (proper nouns need to be marked in English), statements as concise and academic as possible.
Do not repeat the content of the previous <summary>, the value of the use of the original numbers.
Be sure to strictly follow the format, the corresponding content output to xxx, in accordance with \n line feed.                 
"""

In [55]:
# Display the 'abstract' field of the first Semantic Scholar search hit.
ss_metadata[0].get('abstract')

'Large language models (LLMs) are restricted to reason in the"language space", where they typically express the reasoning process with a chain-of-thought (CoT) to solve a complex reasoning problem. However, we argue that language space may not always be optimal for reasoning. For example, most word tokens are primarily for textual coherence and not essential for reasoning, while some critical tokens require complex planning and pose huge challenges to LLMs. To explore the potential of LLM reasoning in an unrestricted latent space instead of using natural language, we introduce a new paradigm Coconut (Chain of Continuous Thought). We utilize the last hidden state of the LLM as a representation of the reasoning state (termed"continuous thought"). Rather than decoding this into a word token, we feed it back to the LLM as the subsequent input embedding directly in the continuous space. Experiments show that Coconut can effectively augment the LLM on several reasoning tasks. This novel late

In [58]:
from json_repair import repair_json  # https://github.com/mangiucugna/json_repair/

# Repair each raw LLM reply into a parseable JSON string. `responses` holds None
# (or an empty string) for calls that failed, because the retry wrappers return
# None instead of raising; those entries carry nothing to repair and are skipped.
topics_infos = [repair_json(x) for x in responses if x]

In [None]:
# Turn every repaired topic-analysis reply into a short markdown block:
# one "## <topic>" heading followed by the topic description.
topics_text_lst = []
for item in topics_infos:
    item_json = json.loads(item)

    # The prompt asks for {"topics": [...]}, but a repaired reply can still have a
    # different shape. Accept a bare list of topics and skip anything else rather
    # than crash on a missing .get().
    if isinstance(item_json, dict):
        item_topics = item_json.get('topics') or []
    elif isinstance(item_json, list):
        item_topics = item_json
    else:
        continue

    item_topics_text = "\n".join(
        f"## {x.get('topic')}  \n{x.get('description')}  \n"
        for x in item_topics if isinstance(x, dict))
    topics_text_lst.append(item_topics_text)

topics_text_lst

In [81]:
# Abstract comes from the first Semantic Scholar search hit.
abs_md_text = ss_metadata[0].get('abstract')

# Pick the introduction / method / conclusion text by exact (case-insensitive)
# section title. When several sections match the same group, the last one wins.
# NOTE(review): titles with numbering or extra words (e.g. "1 Introduction") will
# not match these lists — confirm how titles look in processed_json.
intro_md_text, met_text, con_md_text = "", "", ""
for item in processed_json:
    # Loop-specific names: reusing `title` here would overwrite the paper title
    # that the metadata cells at the top of the notebook use as their query.
    section_title = (item.get('title') or "").strip().lower()
    section_text = item.get('refined_text') or ""
    if section_title in ['introduction', 'overview']:
        intro_md_text = section_text
    elif section_title in ['method', 'methodology', 'approach', 'framework']:
        met_text = section_text
    elif section_title in ['conclusion', 'summary']:
        con_md_text = section_text


# Markdown digest of all extracted topics (heading plus description per topic).
sum_md_text = """# Key Information  
{md_text}
""".format(md_text="\n".join(topics_text_lst))


In [82]:
# Bundle the text pieces gathered above into a single record and save it as indented JSON.
tmp_info = [dict(
    abstract_text=abs_md_text,
    introduction_text=intro_md_text,
    method_text=met_text,
    conclusion_text=con_md_text,
    summary_text=sum_md_text,
)]

temptext_wip_json_path = "temptext_analysis_wip_20250213.json"

with open(temptext_wip_json_path, "w") as file:
    file.write(json.dumps(tmp_info, indent=4))