In [32]:
# Paper titles used as search terms for the arXiv / Semantic Scholar lookups below.
titles = ["Training Large Language Models to Reason in a Continuous Latent Space", 
          "Towards System 2 Reasoning in LLMs: Learning How to Think With Meta Chain-of-Thought",
          "s1: Simple test-time scaling",
          "From Few to Many: Self-Improving Many-Shot Reasoners Through Iterative Optimization and Generation",
          "Satori: Reinforcement Learning with Chain-of-Action-Thought Enhances LLM Reasoning via Autoregressive Search"]


# Local PDF copies submitted to MinerU further down.
# NOTE(review): presumably listed in the same order as `titles` -- confirm before relying on index alignment.
pdf_pathes = ["./data/2412.06769v2.pdf",
              "./data/2501.04682v1.pdf",
              "./data/2501.19393v2.pdf",
              "./data/2502.00330v1.pdf",
              "./data/2502.02508v1.pdf"]

In [2]:
import sys
import os

# Parent of the current working directory (expected to be the project root, e.g. my_project).
# Note: this is derived from os.getcwd(), not from the notebook's own location.
parent_dir = os.path.dirname(os.getcwd())

# Append the parent directory to sys.path
# (presumably so the project-local `apis` package imported below can be found).
sys.path.append(parent_dir)

In [55]:
import time
import requests

In [56]:
def download_file(url, filename, timeout=60):
    """Download ``url`` and write the response body to ``filename``.

    Args:
        url: Address of the file to fetch.
        filename: Local path the payload is written to (binary mode). The
            parent directory must already exist.
        timeout: Seconds handed to ``requests.get`` so a stalled server cannot
            block the notebook forever (requests has no timeout by default).

    Returns:
        True when the file was downloaded and written, False when the request
        failed. Request errors are printed rather than raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for bad status codes

        with open(filename, 'wb') as f:
            f.write(response.content)

        print(f"Successfully downloaded: {filename}")
        return True
    except requests.exceptions.RequestException as e:
        print(f"Error downloading: {e}")
        return False

In [59]:
import zipfile

def unzip_file(original_zip_file, destination_folder):
    """Extract every member of a ``.zip`` archive into ``destination_folder``.

    The path must end with the exact suffix ``.zip``; any other extension
    trips the assertion before the archive is opened.
    """
    _, extension = os.path.splitext(original_zip_file)
    assert extension == '.zip'
    with zipfile.ZipFile(original_zip_file, mode='r') as archive:
        archive.extractall(destination_folder)

# Paper Metadata

### Basic Metadata

In [4]:
from apis.arxiv_tool import ArxivKit
from apis.semanticscholar_tool import SemanticScholarKit

In [6]:
# Look up each paper title on arXiv through the project-local ArxivKit wrapper.
arxiv = ArxivKit()

# One entry per title, in the same order as `titles`.
# NOTE(review): each entry is presumably a list of up to `max_cnt` candidate records -- confirm in apis.arxiv_tool.
arxiv_metadata = []
for title in titles:
    candit_arxiv_metadata = arxiv.retrieve_metadata_by_paper(query_term=title, max_cnt=3)
    arxiv_metadata.append(candit_arxiv_metadata)
    time.sleep(5)  # pause 5 seconds between consecutive queries

2025-02-07 10:41:03,925 - INFO - Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=Training+Large+Language+Models+to+Reason+in+a+Continuous+Latent+Space&id_list=&sortBy=relevance&sortOrder=descending&start=0&max_results=100
2025-02-07 10:41:11,305 - INFO - Got first page: 100 of 2652267 total results
2025-02-07 10:41:16,311 - INFO - Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=Towards+System+2+Reasoning+in+LLMs%3A+Learning+How+to+Think+With+Meta+Chain-of-Thought&id_list=&sortBy=relevance&sortOrder=descending&start=0&max_results=100
2025-02-07 10:41:18,723 - INFO - Got first page: 100 of 2641739 total results
2025-02-07 10:41:23,730 - INFO - Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=s1%3A+Simple+test-time+scaling&id_list=&sortBy=relevance&sortOrder=descending&start=0&max_results=100
2025-02-07 10:41:26,283 - INFO - Got first page: 100 of 297166 total results
2025-0

In [5]:
# Search Semantic Scholar for each paper title through the project-local SemanticScholarKit wrapper.
ss = SemanticScholarKit()

# One entry per title, in the same order as `titles`; each search is issued with limit=3.
ss_metadata = []
for title in titles:
    candit_ss_metadata = ss.search_paper_by_keywords(query=title, limit=3)
    ss_metadata.append(candit_ss_metadata)
    time.sleep(5)  # pause 5 seconds between consecutive queries

2025-02-07 10:40:25,144 - INFO - HTTP Request: GET https://api.semanticscholar.org/graph/v1/paper/search?query=Training%20Large%20Language%20Models%20to%20Reason%20in%20a%20Continuous%20Latent%20Space&fields=abstract,authors,citationCount,citationStyles,corpusId,externalIds,fieldsOfStudy,influentialCitationCount,isOpenAccess,journal,openAccessPdf,paperId,publicationDate,publicationTypes,publicationVenue,referenceCount,s2FieldsOfStudy,title,url,venue,year&offset=0&limit=3 "HTTP/1.1 200 OK"
2025-02-07 10:40:31,719 - INFO - HTTP Request: GET https://api.semanticscholar.org/graph/v1/paper/search?query=Towards%20System%202%20Reasoning%20in%20LLMs:%20Learning%20How%20to%20Think%20With%20Meta%20Chain-of-Thought&fields=abstract,authors,citationCount,citationStyles,corpusId,externalIds,fieldsOfStudy,influentialCitationCount,isOpenAccess,journal,openAccessPdf,paperId,publicationDate,publicationTypes,publicationVenue,referenceCount,s2FieldsOfStudy,title,url,venue,year&offset=0&limit=3 "HTTP/1.1 2

### Reference and Citedby Data

In [9]:
# Take the first search hit for the first title and read its Semantic Scholar paper id.
# NOTE(review): assumes the first hit is the intended paper and that the search returned at least one result.
paper_ss_id = ss_metadata[0][0].get('paperId')
print(paper_ss_id)

673fbdd957cada770d10dffca5e45b53da43a3c6


In [10]:
# Fetch the papers referenced by `paper_ss_id` (limit=100).
# The captured log below shows HTTP 429 responses for this request before it went through.
reference_metadata = ss.get_semanticscholar_references(paper_id=paper_ss_id, limit=100)

2025-02-07 10:46:15,478 - INFO - HTTP Request: GET https://api.semanticscholar.org/graph/v1/paper/673fbdd957cada770d10dffca5e45b53da43a3c6/references?fields=contexts,intents,contextsWithIntent,isInfluential,abstract,authors,citationCount,citationStyles,corpusId,externalIds,fieldsOfStudy,influentialCitationCount,isOpenAccess,journal,openAccessPdf,paperId,publicationDate,publicationTypes,publicationVenue,referenceCount,s2FieldsOfStudy,title,url,venue,year&offset=0&limit=100 "HTTP/1.1 429 "
2025-02-07 10:46:46,765 - INFO - HTTP Request: GET https://api.semanticscholar.org/graph/v1/paper/673fbdd957cada770d10dffca5e45b53da43a3c6/references?fields=contexts,intents,contextsWithIntent,isInfluential,abstract,authors,citationCount,citationStyles,corpusId,externalIds,fieldsOfStudy,influentialCitationCount,isOpenAccess,journal,openAccessPdf,paperId,publicationDate,publicationTypes,publicationVenue,referenceCount,s2FieldsOfStudy,title,url,venue,year&offset=0&limit=100 "HTTP/1.1 429 "
2025-02-07 10:

In [12]:
len(reference_metadata)

49

In [11]:
# Fetch the papers citing `paper_ss_id` (limit=100) from the Semantic Scholar citations endpoint.
citedby_metadata = ss.get_semanticscholar_citedby(paper_id=paper_ss_id, limit=100)

2025-02-07 10:47:52,816 - INFO - HTTP Request: GET https://api.semanticscholar.org/graph/v1/paper/673fbdd957cada770d10dffca5e45b53da43a3c6/citations?fields=contexts,intents,contextsWithIntent,isInfluential,abstract,authors,citationCount,citationStyles,corpusId,externalIds,fieldsOfStudy,influentialCitationCount,isOpenAccess,journal,openAccessPdf,paperId,publicationDate,publicationTypes,publicationVenue,referenceCount,s2FieldsOfStudy,title,url,venue,year&offset=0&limit=100 "HTTP/1.1 200 OK"


In [13]:
len(citedby_metadata)

8

# PDF Process

## Leverage MinerU for PDF Processing

In [33]:
# minerU API from https://mineru.net/apiManage/docs
# Note: monitor_batch_status need to be further tested
import os
import uuid
import copy
import requests
import threading

from typing import List, Dict, Optional

# MinerU v4 endpoints used by MinerUKit below.
TASK_URL = "https://mineru.net/api/v4/extract/task"  # single-PDF extraction task
BATCH_URL = "https://mineru.net/api/v4/file-urls/batch"  # batch submission
BATCH_STATUS_URL = "https://mineru.net/api/v4/extract-results/batch"  # batch status, queried as <url>/<batch_id>

def detect_lang(string):
    """
    Guess the language of a string from the characters it contains.

    :param string: text to inspect
    :return: 'zh' if any character lies in the range U+4E00..U+9FFF, otherwise 'en'
    """
    contains_cjk = any('\u4e00' <= char <= '\u9fff' for char in string)
    return 'zh' if contains_cjk else 'en'

class MinerUKit:
    """Small client for the MinerU PDF-extraction HTTP API.

    Wraps the endpoints configured in ``TASK_URL``, ``BATCH_URL`` and
    ``BATCH_STATUS_URL``: single-URL extraction, batch submission (local files
    or remote URLs) and batch status polling with download of finished results.
    """

    def __init__(self, api_key, timeout=60):
        """Store credentials, endpoints and the default extraction options.

        Args:
            api_key: MinerU API token, sent as a Bearer token on every request.
            timeout: Seconds passed to every ``requests`` call so a stalled
                connection cannot hang the notebook.
        """
        self.api_key = api_key
        self.timeout = timeout
        self.task_url = TASK_URL
        self.batch_url = BATCH_URL
        self.batch_status_url = BATCH_STATUS_URL
        self.header = {
                    'Content-Type':'application/json',
                    "Authorization":f"Bearer {self.api_key}"
                 }
        # Default request options; each call works on a deep copy so these are never mutated.
        self.config = {
            "enable_formula": True,
            "language": "en",
            "layout_model":"doclayout_yolo",
            "enable_table": True
        }

    def single_process_request(self, pdf_url, if_ocr, lang):
        """Submit one PDF URL for extraction.

        Args:
            pdf_url: URL of the PDF to process.
            if_ocr: Value sent as the ``is_ocr`` option.
            lang: Value sent as the ``language`` option.

        Returns:
            The ``requests`` response of the task submission (its status code is printed).
        """
        data = copy.deepcopy(self.config)
        data['url'] = pdf_url
        data['is_ocr'] = if_ocr
        data['language'] = lang
        response = requests.post(url=self.task_url, headers=self.header, json=data, timeout=self.timeout)
        print(response.status_code)
        return response

    def batch_process_files(self, pdf_files:List[str], if_ocr:Optional[bool]=False, lang:Optional[str]='en'):
        """Request upload URLs for local PDFs and upload each file.

        Args:
            pdf_files: Paths of the local PDF files.
            if_ocr: Value sent as the ``is_ocr`` option.
            lang: Value sent as the ``language`` option.

        Returns:
            The response of the batch request (which carries the ``batch_id``),
            or None if any step raised an exception (the error is printed).
        """
        files = []
        for file in pdf_files:
            files.append({"name": os.path.basename(file),
                          "data_id": str(uuid.uuid1())})
        data = copy.deepcopy(self.config)
        data['is_ocr'] = if_ocr
        data['language'] = lang
        data['files'] = files

        try:
            response = requests.post(url=self.batch_url, headers=self.header, json=data, timeout=self.timeout)
            if response.status_code == 200:
                result = response.json()
                print('response success. result:{}'.format(result))
                if result["code"] == 0:
                    batch_id = result["data"]["batch_id"]
                    urls = result["data"]["file_urls"]
                    print('batch_id:{},urls:{}'.format(batch_id, urls))

                    # The i-th upload URL is used for the i-th submitted file.
                    for idx, file_path in enumerate(pdf_files):
                        with open(file_path, 'rb') as f:
                            res_upload = requests.put(urls[idx], data=f, timeout=self.timeout)
                        if res_upload.status_code == 200:
                            print("upload success")
                        else:
                            print("upload failed")
                else:
                    # `result` is a dict: attribute access (result.msg) raised AttributeError,
                    # which the handler below swallowed, hiding the server's reason.
                    print('apply upload url failed,reason:{}'.format(result.get('msg')))
            else:
                print('response not success. status:{} ,result:{}'.format(response.status_code, response))
            return response
        except Exception as err:
            print(err)

        return None

    def batch_process_urls(self, pdf_urls:List[str], if_ocr:Optional[bool]=False, lang:Optional[str]='en'):
        """Submit several PDF URLs as one batch.

        NOTE(review): this posts to the same endpoint as the file-upload batch
        (``self.batch_url``); confirm against the MinerU docs that URL batches
        are accepted there.

        Args:
            pdf_urls: URLs of the PDFs to process.
            if_ocr: Value sent as the ``is_ocr`` option.
            lang: Value sent as the ``language`` option.

        Returns:
            The response of the batch request, or None if an exception was
            raised (the error is printed).
        """
        files = []
        for pdf_url in pdf_urls:
            files.append({"url": pdf_url,
                          "data_id": str(uuid.uuid1())})
        data = copy.deepcopy(self.config)
        data['is_ocr'] = if_ocr
        data['language'] = lang
        data['files'] = files

        try:
            response = requests.post(url=self.batch_url, headers=self.header, json=data, timeout=self.timeout)
            if response.status_code == 200:
                result = response.json()
                print('response success. result:{}'.format(result))
                if result["code"] == 0:
                    batch_id = result["data"]["batch_id"]
                    print('batch_id:{}'.format(batch_id))
                else:
                    # dict lookup instead of attribute access (see batch_process_files)
                    print('submit task failed,reason:{}'.format(result.get('msg')))
            else:
                print('response not success. status:{} ,result:{}'.format(response.status_code, response))
            return response
        except Exception as err:
            print(err)

        return None

    def batch_status_check(self, batch_id):
        """Query the status of a batch and return the raw response (its status code is printed)."""
        url = f'{self.batch_status_url}/{batch_id}'
        res = requests.get(url=url, headers=self.header, timeout=self.timeout)
        print(res.status_code)
        return res

    def download_and_unzip(self, zip_url, download_file_name, unzip_folder_name):
        """Download one result archive and extract it.

        Relies on the notebook-level helpers ``download_file`` and ``unzip_file``.
        """
        download_file(zip_url, download_file_name)
        unzip_file(download_file_name, unzip_folder_name)

    def monitor_batch_status(self, batch_id, save_path, interval=10, max_retries=10):
        """Poll a batch and download every finished file exactly once.

        Args:
            batch_id: Id of the batch to watch.
            save_path: Directory receiving ``<name>.zip`` and the extracted
                ``<name>/`` folder for each file (name = original file name
                without its suffix). Created if missing.
            interval: Seconds to wait between two status checks.
            max_retries: Maximum number of status checks.

        Returns:
            None. The call returns only after all started downloads finished,
            so the extracted files can be used right away.
        """
        import time  # local import: the cell defining this class does not import time itself

        os.makedirs(save_path, exist_ok=True)
        downloaded_files = set()  # file names whose download was already started
        workers = []              # download threads, joined before returning
        finished = False

        for _ in range(max_retries):
            running_res = self.batch_status_check(batch_id)
            payload = running_res.json()  # parse the body once per poll
            if payload.get('msg') == 'ok':
                results = (payload.get('data') or {}).get('extract_result') or []
                for item in results:
                    if item.get('state') != 'done':
                        continue
                    file_name = item.get('file_name')
                    if not file_name or file_name in downloaded_files:
                        continue
                    file_name_nosuffix = file_name.rsplit('.', 1)[0]
                    zip_url = item.get('full_zip_url')
                    download_file_name = os.path.join(save_path, file_name_nosuffix + ".zip")
                    unzip_folder_name = os.path.join(save_path, file_name_nosuffix)

                    # Download and extract in a thread so polling is not blocked.
                    thread = threading.Thread(
                        target=self.download_and_unzip,
                        args=(zip_url, download_file_name, unzip_folder_name)
                    )
                    thread.start()
                    workers.append(thread)
                    downloaded_files.add(file_name)

                # An empty result list is "nothing reported yet", not "everything done"
                # (all([]) is True). 'failed' is treated as terminal as well so a broken
                # file does not keep the loop polling until the retries run out.
                # NOTE(review): 'failed' follows the MinerU API docs -- confirm the state name.
                states = [item.get('state') for item in results]
                if states and all(state in ('done', 'failed') for state in states):
                    finished = True
                    break

            print(f"Batch {batch_id} running, recheck in next {interval} seconds...")
            time.sleep(interval)

        # Wait for running downloads so "complete" really means the files are on disk.
        for worker in workers:
            worker.join()

        if not finished:
            print(f"Exit as batch {batch_id} reached max retries.")
            return

        failed_items = [item for item in results if item.get('state') == 'failed']
        for item in failed_items:
            print(f"File {item.get('file_name')} failed: {item.get('err_msg')}")
        if failed_items:
            print(f"Batch {batch_id} finished with {len(failed_items)} failed file(s)")
        else:
            print(f"Batch {batch_id} complte")

In [34]:
# Read the MinerU token from the environment (None when MINERU_API_KEY_1 is not set).
mineru_api_key = os.getenv('MINERU_API_KEY_1')
mineru = MinerUKit(api_key=mineru_api_key)
# Submit the local PDFs as one batch; batch_process_files returns None when the submission raised.
upload_res = mineru.batch_process_files(pdf_files=pdf_pathes, if_ocr=False, lang='en')

response success. result:{'code': 0, 'msg': 'ok', 'trace_id': '70e0b490acbd3e669f7417132a6c0aaf', 'data': {'batch_id': '1c431751-178f-45f0-b69b-b35faca8bcde', 'file_urls': ['https://mineru.oss-cn-shanghai.aliyuncs.com/api-upload/1c431751-178f-45f0-b69b-b35faca8bcde/443db9c4-b324-4c27-83c1-7ae5cccd0d94.pdf?Expires=1738983651&OSSAccessKeyId=LTAI5t9nGwatk85zetzojXbn&Signature=grLBcVRj4acEWaGHNFevsVRmOHE%3D', 'https://mineru.oss-cn-shanghai.aliyuncs.com/api-upload/1c431751-178f-45f0-b69b-b35faca8bcde/15163ff9-fa5f-418e-90a0-fb37003d5bcc.pdf?Expires=1738983651&OSSAccessKeyId=LTAI5t9nGwatk85zetzojXbn&Signature=XFlNA4fsl8%2FsOnTpAxnwV8NMkF8%3D', 'https://mineru.oss-cn-shanghai.aliyuncs.com/api-upload/1c431751-178f-45f0-b69b-b35faca8bcde/28265148-7c3e-4fe5-937d-6b833754709e.pdf?Expires=1738983651&OSSAccessKeyId=LTAI5t9nGwatk85zetzojXbn&Signature=kiAClxMx31U8Nf1G5hBSZ%2FXHMwY%3D', 'https://mineru.oss-cn-shanghai.aliyuncs.com/api-upload/1c431751-178f-45f0-b69b-b35faca8bcde/9c7daf9c-d07b-4fa2-a22

In [49]:
# Pull the batch id out of the upload response and query the batch status once.
# NOTE(review): upload_res is None when batch_process_files hit an exception -- this line would then fail.
batch_id = upload_res.json().get('data', {}).get('batch_id')
running_res = mineru.batch_status_check(batch_id=batch_id)

200
{'code': 0, 'msg': 'ok', 'trace_id': 'bc7f2824f223f0fbcfc47c060988cca5', 'data': {'batch_id': '1c431751-178f-45f0-b69b-b35faca8bcde', 'extract_result': [{'data_id': 'bc80f48e-e4ff-11ef-9430-7413ea7e2ff2', 'file_name': '2412.06769v2.pdf', 'state': 'done', 'err_msg': '', 'full_zip_url': 'https://cdn-mineru.openxlab.org.cn/pdf/14aa1800-9750-4ad2-9941-afdbfd1a32d0.zip'}, {'data_id': 'bc80f51a-e4ff-11ef-9430-7413ea7e2ff2', 'file_name': '2501.04682v1.pdf', 'state': 'done', 'err_msg': '', 'full_zip_url': 'https://cdn-mineru.openxlab.org.cn/pdf/bd6b47a3-b65a-4670-9bac-158aaf8d32bf.zip'}, {'data_id': 'bc80f542-e4ff-11ef-9430-7413ea7e2ff2', 'file_name': '2501.19393v2.pdf', 'state': 'done', 'err_msg': '', 'full_zip_url': 'https://cdn-mineru.openxlab.org.cn/pdf/b3fbc814-da53-4ba7-b19a-854fd32c8e7e.zip'}, {'data_id': 'bc80f556-e4ff-11ef-9430-7413ea7e2ff2', 'file_name': '2502.00330v1.pdf', 'state': 'done', 'err_msg': '', 'full_zip_url': 'https://cdn-mineru.openxlab.org.cn/pdf/039a6a8b-5f27-4f88-

In [65]:
temp_path = "./tmp"
# download_file opens the target with open(..., 'wb'), which does not create missing directories.
os.makedirs(temp_path, exist_ok=True)

status_payload = running_res.json()  # parse the response body once
if status_payload.get('msg') == 'ok':
    results = status_payload.get('data', {}).get('extract_result', [])
    for item in results:
        if item.get('state') == 'done':
            # "<name>.pdf" -> "<name>.zip" archive and "<name>/" extraction folder
            file_name_nosuffix = item.get('file_name').rsplit('.', 1)[0]
            zip_url = item.get('full_zip_url')
            download_file_name = os.path.join(temp_path, file_name_nosuffix+".zip")
            unzip_folder_name = os.path.join(temp_path, file_name_nosuffix)
            download_file(zip_url, download_file_name)
            # download_file only prints request errors, so make sure a readable
            # archive is really there before trying to extract it.
            if zipfile.is_zipfile(download_file_name):
                unzip_file(download_file_name, unzip_folder_name)
            else:
                print(f"Skip unzip, no valid archive at: {download_file_name}")


Successfully downloaded: ./tmp/2412.06769v2.zip
Successfully downloaded: ./tmp/2501.04682v1.zip
Successfully downloaded: ./tmp/2501.19393v2.zip
Successfully downloaded: ./tmp/2502.00330v1.zip
Successfully downloaded: ./tmp/2502.02508v1.zip


### Outline Detection

In [95]:
import fitz

def pdf_outline_detection(pdf_path, excpert_len:Optional[int]=300):
    """Read a PDF's embedded outline (table of contents) with PyMuPDF.

    Args:
        pdf_path: Path of the PDF file.
        excpert_len: Stop collecting excerpt text once it holds at least this
            many characters.

    Returns:
        A list with one dict per outline entry that has a page number, with the
        keys ``level``, ``title``, ``page`` (as reported by ``get_toc``),
        ``position``, ``nameddest``, ``if_collapse`` and ``excerpt`` (text
        blocks of the target page followed by "...").
    """
    doc = fitz.open(pdf_path)
    try:
        toc_infos = doc.get_toc(simple=False) or []

        pdf_toc = []
        for item in toc_infos:
            lvl = item[0] if len(item) > 0 else None
            title = item[1] if len(item) > 1 else None
            start_page = item[2] if len(item) > 2 else None
            # 4th element (only with simple=False): dict describing the link destination
            dest = item[3] if len(item) > 3 and item[3] else None
            end_pos = dest.get('to') if dest else None
            nameddest = dest.get('nameddest') if dest else None
            if_collapse = dest.get('collapse', False) if dest else None

            if start_page is None:
                continue

            lines = ""
            # get_toc page numbers are 1-based; entries without a resolvable target
            # carry a non-positive page, which would silently index from the end of
            # the document. Those entries keep an empty excerpt instead.
            if 1 <= start_page <= len(doc):
                page = doc[start_page-1]
                for block in page.get_text("blocks"):
                    x0, y0, x1, y1, text, _, _ = block
                    if len(lines) >= excpert_len:
                        break
                    # NOTE(review): this compares the block's x coordinate with the
                    # destination point; if the intent is "text below the heading",
                    # a y comparison is probably needed -- confirm the coordinate
                    # system of `to` before changing it.
                    if end_pos and x0 >= end_pos[0]:
                        lines += text

            pdf_toc.append({
                "level": lvl,
                "title": title,
                "page": start_page,
                "position": end_pos,
                "nameddest": nameddest,
                'if_collapse': if_collapse,
                "excerpt": lines + "..."
            })
        return pdf_toc
    finally:
        doc.close()  # release the file handle even when parsing fails

In [96]:
pdf_toc = pdf_outline_detection(pdf_path=pdf_pathes[0])

In [97]:
pdf_toc

[{'level': 1,
  'title': 'Introduction',
  'page': 1,
  'position': Point(70.866, 378.949),
  'nameddest': 'section.1',
  'if_collapse': False,
  'excerpt': 'Training Large Language Models to Reason in a\nContinuous Latent Space\nShibo Hao1,2,∗, Sainbayar Sukhbaatar1, DiJia Su1, Xian Li1, Zhiting Hu2, Jason Weston1, Yuandong Tian1\n1FAIR at Meta, 2UC San Diego\n∗Work done at Meta\nLarge language models (LLMs) are restricted to reason in the “language space”, where they typically\nexpress the reasoning process with a chain-of-thought (CoT) to solve a complex reasoning problem.\nHowever, we argue that language space may not always be optimal for reasoning. For example, most\nword tokens are primarily for textual coherence and not essential for reasoning, while some critical\ntokens require complex planning and pose huge challenges to LLMs. To explore the potential of\nLLM reasoning in an unrestricted latent space instead of using natural language, we introduce a new\nparadigm Coconut (Ch

## Structural Analysis

The following cells use 2412.06769v2 as the running example.

In [None]:
file_name_nosuffix = "2412.06769v2"
file_path = os.path.join(temp_path, file_name_nosuffix)

from pathlib import Path

# Tidy the extracted MinerU folder: drop the copy of the source PDF and give the
# content list a fixed name. The glob is materialised first because the loop
# deletes/renames entries of the directory it is iterating over.
for file in list(Path(file_path).glob('*')):
    file_nm = os.path.basename(file)
    if "_origin.pdf" in file_nm:
        os.remove(file)
    elif "_content_list.json" in file_nm:
        # os.replace overwrites an existing target on every platform, so the cell
        # can be re-run after the archive was extracted again.
        os.replace(file, os.path.join(file_path, "content_list.json"))

In [74]:
# Paths of the three files inside the extracted folder that the following cells refer to.
md_file = os.path.join(file_path, "full.md")
content_json_file = os.path.join(file_path, "content_list.json")
layout_json_file = os.path.join(file_path, "layout.json")

### Markdown Modification

- update markdown title
- update markdown table and image names

In [193]:
import re

def restore_md_toc(md_content, pdf_toc):
    """
    Align markdown headings with the PDF table of contents (generated from fitz).

    Every line that starts with ``#`` is treated as a heading and rewritten:

    1. If a TOC title occurs in the line, the heading becomes
       ``"#" * level + " " + title + "  "`` (first matching TOC entry wins).
    2. Otherwise, headings containing 'Acknowledgement', 'Reference' or
       'Appendix' are kept unchanged.
    3. Otherwise the heading is nested one level below the previously emitted
       heading; when there is no previous heading it is kept unchanged.

    Args:
        md_content: Markdown text.
        pdf_toc: TOC entries (dicts with 'title' and 'level'), e.g. the result
            of pdf_outline_detection.

    Returns:
        The markdown text with rewritten heading lines, joined with "\\n".
        When ``pdf_toc`` is empty the input is returned unchanged.
    """
    if not pdf_toc:
        # Nothing to align against (previously this path hit an unbound local).
        return md_content

    title_pattern = r"^#{1,}\s*.*$"  # pattern of a markdown heading line
    modified_lines = []
    md_titles = []  # headings emitted so far, in document order

    for line in md_content.splitlines():
        if not re.search(title_pattern, line):
            modified_lines.append(line)
            continue

        sec_title = line
        flag = 0

        for x in pdf_toc:  # refine the heading from the first TOC title it contains
            toc_title = x['title']
            toc_level = int(x['level'])
            # an empty TOC title is a substring of every line, so skip it
            if toc_title and toc_title in line:
                sec_title = "#"*toc_level + " " + toc_title + "  "
                flag = 1
                break

        if flag == 0:  # heading not found in the TOC: keep well-known back-matter headings
            for item in ['Acknowledgement', 'Reference', 'Appendix']:
                if item in line:
                    sec_title = line
                    flag = 1

        if flag == 0:
            if len(md_titles) > 0:
                previous = re.match('^#{1,}', md_titles[-1])
                if previous:
                    # one level deeper than the previous heading
                    pre_level = previous.group(0) + "#"
                    sec_title = re.sub('^#{1,}', pre_level, line)
                else:
                    sec_title = "#" + line

        modified_lines.append(sec_title)
        md_titles.append(sec_title)

    return "\n".join(modified_lines)

In [13]:
def get_first_lines(text, sentence_length):
    """Return the leading sentences of ``text`` up to a minimum length.

    Sentences are appended one by one (joined with single spaces) until the
    collected text is at least ``sentence_length`` characters long; the whole
    text is returned if it never gets that long. Empty input gives "".
    """
    if not text:
        return ""

    # Split on whitespace that follows '.', '?', ';' or '!', except after
    # abbreviation-like patterns excluded by the two negative lookbehinds.
    boundary = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|;|!)\s'

    collected = []
    for piece in re.split(boundary, text):
        piece = piece.strip()
        if not piece:
            continue
        collected.append(piece)
        if len(" ".join(collected)) >= sentence_length:
            break

    return " ".join(collected)

In [14]:
def _collect_label_ids(desc, pattern):
    """Return every substring of ``desc`` matched by ``pattern`` (case-insensitive), in order."""
    return [found.group(0) for found in re.finditer(pattern, desc or "", re.IGNORECASE)]


def restore_charts_and_tables(content_json):
    """Assign ids (and titles) to images/charts, tables and equations.

    Each item of ``content_json`` whose 'type' is 'image', 'table' or
    'equation' is updated IN PLACE:

    - 'id': the first label found in its caption/footnote (or equation text),
      e.g. "Figure 1"; when no label is found a running placeholder
      ("Image_Number_n", "Table_Number_n", "Equation_Number_n") is used.
    - 'related_ids': any further labels found in the same text.
    - 'img_title' / 'table_title': leading sentence(s) of the caption text,
      produced by get_first_lines (images and tables only).

    Args:
        content_json: list of content dicts (MinerU content list).

    Returns:
        (img_lst, tbl_lst, formula_lst): the updated image, table and equation items.
    """
    # "picture" is listed before "pic": with the shorter alternative first,
    # "Picture 3" matched as "pic" + "ture" and lost its number.
    img_ptrn = r"(picture|pic|img|image|chart|figure|fig)\s*([0-9]+(?:\.[0-9]+)?|[0-9]+|[IVXLCDM]+|[a-zA-Z]+)"
    tbl_ptrn = r"(tbl|table|chart|figure|fig)\s*([0-9]+(?:\.[0-9]+)?|[0-9]+|[IVXLCDM]+|[a-zA-Z]+)"
    eq_ptrn = r"(formula|equation|notation|syntax)\s*([0-9]+(?:\.[0-9]+)?|[0-9]+|[IVXLCDM]+|[a-zA-Z]+)"

    img_lst, tbl_lst, formula_lst = [], [], []
    i, j, k = 1, 1, 1  # counters for the placeholder ids of images, tables, equations
    for x in content_json:
        kind = x.get('type')
        if kind == 'image':
            desc = "\n".join(x.get('img_caption') or []) + "\n".join(x.get('img_footnote') or [])
            img_ids = _collect_label_ids(desc, img_ptrn)
            if len(img_ids) == 0:
                img_ids = [f"Image_Number_{i}"]
                i += 1
            x['id'] = img_ids[0]
            x['related_ids'] = img_ids[1:]
            x['img_title'] = get_first_lines(desc, 10)
            img_lst.append(x)

        elif kind == 'table':
            desc = "\n".join(x.get('table_caption') or []) + "\n".join(x.get('table_footnote') or [])
            tbl_ids = _collect_label_ids(desc, tbl_ptrn)
            if len(tbl_ids) == 0:
                tbl_ids = [f"Table_Number_{j}"]
                j += 1
            x['id'] = tbl_ids[0]
            x['related_ids'] = tbl_ids[1:]
            x['table_title'] = get_first_lines(desc, 10)
            tbl_lst.append(x)

        elif kind == 'equation':
            # 'text' may be absent; the helper treats None as an empty string
            # instead of letting re.finditer raise TypeError.
            equation_ids = _collect_label_ids(x.get('text'), eq_ptrn)
            if len(equation_ids) == 0:
                equation_ids = [f"Equation_Number_{k}"]
                k += 1
            x['id'] = equation_ids[0]
            x['related_ids'] = equation_ids[1:]
            formula_lst.append(x)
    return img_lst, tbl_lst, formula_lst

In [1]:
import json
# Read the content list with an explicit encoding, like the markdown reads in this
# notebook: the captions contain non-ASCII characters, and the platform default
# encoding is not UTF-8 everywhere.
# NOTE(review): assumes MinerU writes this file as UTF-8 -- confirm.
with open("./tmp/2412.06769v2/content_list.json", encoding="utf-8") as json_data:
    content_json = json.load(json_data)

In [15]:
img_lst, tbl_lst, formula_lst = restore_charts_and_tables(content_json)

In [11]:
# Load the markdown produced by MinerU for the example paper as one string.
md_file = "./tmp/2412.06769v2/full.md"
with open(md_file, 'r', encoding='utf-8') as f:
    markdown_content = f.read()

In [16]:
img_lst

[{'type': 'image',
  'img_path': 'images/3b0f18697b44445c12ab2b41e0ff7a5fa498867fbcd33644600f98041b2a9f6a.jpg',
  'img_caption': ['Figure 1 A comparison of Chain of Continuous Thought (Coconut) with Chain-of-Thought (CoT). In CoT, the model generates the reasoning process as a word token sequence (e.g., $[x_{i},x_{i+1},...,x_{i+j}]$ in the fgure). Coconut regards the last hidden state as a representation of the reasoning state (termed “continuous thought”), and directly uses it as the next input embedding. This allows the LLM to reason in an unrestricted latent space instead of a language space. '],
  'img_footnote': [],
  'page_idx': 1,
  'id': 'Figure 1',
  'related_ids': [],
  'img_title': 'Figure 1 A comparison of Chain of Continuous Thought (Coconut) with Chain-of-Thought (CoT).'},
 {'type': 'image',
  'img_path': 'images/e72083f8a262261062393a5c15691de83822ae7b995a8d74e27b22ad37bcb993.jpg',
  'img_caption': ['Figure 2 Training procedure of Chain of Continuous Thought (Coconut). G

In [17]:
line_1 = """![](images/e72083f8a262261062393a5c15691de83822ae7b995a8d74e27b22ad37bcb993.jpg)  
Figure 2 Training procedure of Chain of Continuous Thought (Coconut). Given training data with language reasoning steps, at each training stage we integrate $c$ additional continuous thoughts ( $c=1$ in this example), and remove one language reasoning step. The cross-entropy loss is then used on the remaining tokens after continuous thoughts.  
"""

line_2 = "![](images/4692454aa0746096ebd571ecdc3a93b136498d5be71a2a1a90c85d5df37c9864.jpg)  "

# Markdown image syntax: ![alt](url "optional title")
#   group 1 = alt text, group 2 = url, group 3 = quoted title, group 4 = title without quotes
ptrn = r"!\[(.*?)\]\((.*?)\s*(\"(.*?)\")?\)"
match = re.match(ptrn, line_1)

if match:
    # match[0] is the whole matched text, so the capture groups start at index 1.
    alt_text = match[1]
    image_url = match[2]
    title = match[4] if match[4] else None



In [19]:
re.match(ptrn, "sdfasd")

In [27]:
match

('',
 'images/f07b61cbc539d327d8e037a04ff8fcae61d745c2cad9f031e1c0883c8546b38e.jpg',
 '',
 '')

In [43]:
import re

def update_image_info(line, img_lst):
    """Fill in alt text and title of markdown image links from image metadata.

    Every ``![alt](url "title")`` in ``line`` whose url equals the 'img_path'
    of an entry in ``img_lst`` is rewritten as ``![alt](url 'title')``:

    - an empty alt text is replaced by the entry's caption and footnote text;
    - an empty title is replaced by the entry's 'img_title';
    - the entry's 'id' is prefixed to the title ("<id>: <title>") unless the
      title already contains it.

    Images without a matching entry are left untouched.

    Args:
        line: markdown text to process.
        img_lst: image metadata dicts (as returned by restore_charts_and_tables).

    Returns:
        The processed text with leading/trailing whitespace stripped.
    """
    ptrn = r"!\[(.*?)\]\((.*?)\s*(\"(.*?)\")?\)"
    matches = list(re.finditer(ptrn, line))
    if not matches:
        return line.strip()

    new_line = line
    # Replace from right to left so the spans of earlier matches stay valid.
    for match in reversed(matches):
        alt_text = match.group(1).strip()
        image_url = match.group(2)
        title = match.group(4).strip() if match.group(4) else None

        item = next((entry for entry in img_lst if entry.get('img_path') == image_url), None)
        if item is None:
            continue

        if not alt_text:
            alt_text = "\n".join(item.get('img_caption') or []) + "\n".join(item.get('img_footnote') or [])
        if not title:
            title = item.get('img_title') or ""
        # A missing id used to raise TypeError in the `in` check; such entries
        # now simply keep their title without a prefix.
        img_id = item.get('id')
        if img_id and img_id not in title:
            title = f"{img_id}: {title}"
        img_md = f"![{alt_text.strip()}]({image_url.strip()} '{title.strip()}')"

        start, end = match.span()
        new_line = new_line[:start] + img_md + new_line[end:]
    return new_line.strip()

In [None]:
def update_image_info(line, img_lst):
    """Fill in alt text and title of markdown image links from image metadata.

    Every ``![alt](url "title")`` in ``line`` whose url equals the 'img_path'
    of an entry in ``img_lst`` is rewritten as ``![alt](url 'title')``:

    - an empty alt text is replaced by the entry's caption and footnote text;
    - an empty title is replaced by the entry's 'img_title';
    - the entry's 'id' is prefixed to the title ("<id>: <title>") unless the
      title already contains it.

    Images without a matching entry are left untouched.

    Args:
        line: markdown text to process.
        img_lst: image metadata dicts (as returned by restore_charts_and_tables).

    Returns:
        The processed text with leading/trailing whitespace stripped.
    """
    ptrn = r"!\[(.*?)\]\((.*?)\s*(\"(.*?)\")?\)"
    matches = list(re.finditer(ptrn, line))
    if not matches:
        return line.strip()

    new_line = line
    # Replace from right to left so the spans of earlier matches stay valid.
    for match in reversed(matches):
        alt_text = match.group(1).strip()
        image_url = match.group(2)
        title = match.group(4).strip() if match.group(4) else None

        item = next((entry for entry in img_lst if entry.get('img_path') == image_url), None)
        if item is None:
            continue

        if not alt_text:
            alt_text = "\n".join(item.get('img_caption') or []) + "\n".join(item.get('img_footnote') or [])
        if not title:
            title = item.get('img_title') or ""
        # A missing id used to raise TypeError in the `in` check; such entries
        # now simply keep their title without a prefix.
        img_id = item.get('id')
        if img_id and img_id not in title:
            title = f"{img_id}: {title}"
        img_md = f"![{alt_text.strip()}]({image_url.strip()} '{title.strip()}')"

        start, end = match.span()
        new_line = new_line[:start] + img_md + new_line[end:]
    return new_line.strip()

In [None]:
import re

def markdown_table_to_html(markdown_text):
    """
    Convert the Markdown tables inside a Markdown text to HTML tables.

    A table is a run of consecutive lines whose first non-blank character is
    '|'. Each run is replaced by the HTML produced by
    _convert_table_lines_to_html; all other lines are kept as they are.

    Args:
        markdown_text: Markdown text that may contain Markdown tables.

    Returns:
        The Markdown text with every table replaced by an HTML table.
    """
    rendered = []
    pending_rows = []  # rows of the table currently being collected

    for raw_line in markdown_text.splitlines():
        if raw_line.strip().startswith('|'):
            pending_rows.append(raw_line)
            continue

        if pending_rows:
            # the table ended on the previous line: emit it before this line
            rendered.append(_convert_table_lines_to_html(pending_rows))
            pending_rows = []
        rendered.append(raw_line)

    # a table may run up to the very end of the text
    if pending_rows:
        rendered.append(_convert_table_lines_to_html(pending_rows))

    return "\n".join(rendered)


def _convert_table_lines_to_html(table_lines):
    """
    将 Markdown 表格行转换为 HTML 表格。

    Args:
        table_lines: Markdown 表格行的列表。

    Returns:
        HTML 表格字符串。
    """
    html_lines = ["<table>", "  <thead>", "    <tr>"]
    header_cells = [cell.strip() for cell in table_lines[0].strip('|').split('|')]
    for header in header_cells:
        html_lines.append(f"      <th>{header}</th>")
    html_lines.append("    </tr>")
    html_lines.append("  </thead>")
    html_lines.append("  <tbody>")

    if len(table_lines) > 1 and re.match(r'^\|[-:| ]+\|[-:| ]*$', table_lines[1].strip()):
        # 存在分隔行，跳过分隔行，从第三行开始是数据行
        data_start_index = 2
    else:
        data_start_index = 1 # 没有分隔行，从第二行开始是数据行

    for i in range(data_start_index, len(table_lines)):
        html_lines.append("    <tr>")
        data_cells = [cell.strip() for cell in table_lines[i].strip('|').split('|')]
        for cell in data_cells:
            html_lines.append(f"      <td>{cell}</td>")
        html_lines.append("    </tr>")

    html_lines.append("  </tbody>")
    html_lines.append("</table>")
    return "\n".join(html_lines)

In [7]:
import re 
def reorg_section_content(md_content, json_content, level):
    """
    Split Markdown text into sections at headings of the given level.

    Whitespace-only lines are skipped. Every other non-heading line is added
    to the current section's text and also recorded as one entry of that
    section's ``paragraphs`` list. Non-blank text that appears before the
    first heading becomes a section with an empty title. A heading with no
    body lines still yields a section (with empty ``text`` and no paragraphs)
    so that its title is not lost.

    Args:
        md_content: Markdown text to split.
        json_content: Not used by this function; kept so the signature stays
            compatible with existing callers.
        level: Heading level to split on (1 for ``#``, 2 for ``##``, ...).

    Returns:
        A list of dicts, in document order, each with the keys ``level``,
        ``section_num`` (1-based), ``title``, ``text`` (body lines joined by
        newlines, stripped) and ``paragraphs`` (a list of
        ``{'paragraph_id': 'paragraph_<n>', 'md_content': <line>}`` dicts,
        numbered from 1 within each section).
    """
    title_pattern = re.compile(rf"^#{{{level}}}\s+(.+)$", re.MULTILINE)

    sections = []

    def append_section(title, lines, paras):
        # section_num is 1-based and only advances when a section is stored.
        sections.append({
            'level': level,
            'section_num': len(sections) + 1,
            'title': title,
            'text': "\n".join(lines).strip(),  # Remove leading/trailing whitespace
            'paragraphs': paras
        })

    current_title = ""
    body_lines = []
    paragraphs = []
    heading_seen = False  # becomes True once the first matching heading is met

    for line in md_content.splitlines():
        if not line.strip():
            continue  # ignore blank / whitespace-only lines

        match = title_pattern.match(line)
        if match:
            # Store the section that just ended. Once a heading has been seen
            # the section is stored even when its body is empty; the untitled
            # preamble is stored only when it actually has content.
            if heading_seen or body_lines:
                append_section(current_title, body_lines, paragraphs)

            # Start the next section (the heading line itself is not part of
            # the text) with fresh lists and paragraph numbering.
            current_title = match.group(1).strip()
            body_lines = []
            paragraphs = []
            heading_seen = True
        else:
            body_lines.append(line)
            paragraphs.append({
                'paragraph_id': f"paragraph_{len(paragraphs) + 1}",
                'md_content': line
            })

    # Store the last section.
    if heading_seen or body_lines:
        append_section(current_title, body_lines, paragraphs)

    return sections

In [8]:
# Read the converted markdown of one paper, then split it into sections at
# top-level (`#`) headings.
md_file = "./tmp/2412.06769v2/full.md"
with open(md_file, mode='r', encoding='utf-8') as md_handle:
    markdown_content = md_handle.read()

sections = reorg_section_content(markdown_content, None, 1)

In [9]:
sections

[{'level': 1,
  'section_num': 1,
  'title': 'Training Large Language Models to Reason in a Continuous Latent Space',
  'text': 'Shibo Hao $^{1,2,*}$ , Sainbayar Sukhbaatar1, DiJia $\\mathtt{s u}^{1}$ , Xian Li1, Zhiting ${\\mathsf{H}}{\\mathsf{u}}^{2}$ , Jason Weston1, Yuandong Tian1   \n1FAIR at Meta, $^2$ UC San Diego   \n∗Work done at Meta  \nLarge language models (LLMs) are restricted to reason in the “language space”, where they typically express the reasoning process with a chain-of-thought (CoT) to solve a complex reasoning problem. However, we argue that language space may not always be optimal for reasoning. For example, most word tokens are primarily for textual coherence and not essential for reasoning, while some critical tokens require complex planning and pose huge challenges to LLMs. To explore the potential of LLM reasoning in an unrestricted latent space instead of using natural language, we introduce a new paradigm Coconut (Chain of Continuous Thought). We utilize th

In [196]:
import json  

# Read the content list explicitly as UTF-8 rather than the platform's default
# encoding: the entries contain non-ASCII characters, and the markdown file in
# this notebook is read as UTF-8 as well.
with open(content_json_file, encoding='utf-8') as f:
    content_lst = json.load(f)

In [197]:
# Distinct values of the 'type' field across all content entries.
{entry['type'] for entry in content_lst}

{'equation', 'image', 'table', 'text'}

In [None]:
# extract images
img_lst = [x for x in content_lst if x['type'] == 'image']

# Build one description per image: caption lines followed by footnote lines,
# all joined by newlines. Joining the two lists together keeps the last
# caption line and the first footnote line on separate lines. The results are
# collected in img_descs because a bare expression inside a loop is not
# displayed and would otherwise be thrown away.
img_descs = []
for img in img_lst:
    desc = "\n".join([*img.get('img_caption', []), *img.get('img_footnote', [])])
    img_descs.append(desc)

# extract tables
tbl_lst = [x for x in content_lst if x['type'] == 'table']

# extract formula
formula_lst = [x for x in content_lst if x['type'] == 'equation']

In [205]:
img_lst

[{'type': 'image',
  'img_path': 'images/3b0f18697b44445c12ab2b41e0ff7a5fa498867fbcd33644600f98041b2a9f6a.jpg',
  'img_caption': ['Figure 1 A comparison of Chain of Continuous Thought (Coconut) with Chain-of-Thought (CoT). In CoT, the model generates the reasoning process as a word token sequence (e.g., $[x_{i},x_{i+1},...,x_{i+j}]$ in the fgure). Coconut regards the last hidden state as a representation of the reasoning state (termed “continuous thought”), and directly uses it as the next input embedding. This allows the LLM to reason in an unrestricted latent space instead of a language space. '],
  'img_footnote': [],
  'page_idx': 1},
 {'type': 'image',
  'img_path': 'images/e72083f8a262261062393a5c15691de83822ae7b995a8d74e27b22ad37bcb993.jpg',
  'img_caption': ['Figure 2 Training procedure of Chain of Continuous Thought (Coconut). Given training data with language reasoning steps, at each training stage we integrate $c$ additional continuous thoughts ( $c=1$ in this example), and 

In [204]:
formula_lst

[{'type': 'equation',
  'text': '$$\n\\begin{array}{c}{H_{t}=\\operatorname{Transformer}(E_{t})}\\\\ {\\mathcal{M}(x_{t+1}\\mid x_{\\le t})=\\operatorname{softmax}(W h_{t})}\\end{array}\n$$',
  'text_format': 'latex',
  'page_idx': 2}]

In [203]:
tbl_lst

[{'type': 'table',
  'img_path': 'images/c81e5cdf7aa362b6915254737e207d052d121cdf144405aa782f3632384b8feb.jpg',
  'table_caption': ['Table 1 Results on three datasets: GSM8l, ProntoQA and ProsQA. Higher accuracy indicates stronger reasoning ability, while generating fewer tokens indicates better efciency. ∗The result is from Deng et al. (2024). '],
  'table_footnote': [],
  'table_body': '\n\n<html><body><table><tr><td rowspan="2">Method</td><td colspan="2">GSM8k</td><td colspan="2">ProntoQA</td><td colspan="2">ProsQA</td></tr><tr><td>Acc. . (%)</td><td># Tokens</td><td>Acc. (%)</td><td># Tokens</td><td>Acc. (%)</td><td># Tokens</td></tr><tr><td>CoT</td><td>42.9 ±0.2</td><td>25.0</td><td>98.8 ±0.8</td><td>92.5</td><td>77.5 ±1.9</td><td>49.4</td></tr><tr><td>No-CoT</td><td>16.5 ±0.5</td><td>2.2</td><td>93.8 ±0.7</td><td>3.0</td><td>76.7 ±1.0</td><td>8.2</td></tr><tr><td>iCoT</td><td>30.0*</td><td>2.2</td><td>99.8 ±0.3</td><td>3.0</td><td>98.2 ±0.3</td><td>8.2</td></tr><tr><td>Pause Toke

In [202]:
img_lst

[{'type': 'image',
  'img_path': 'images/3b0f18697b44445c12ab2b41e0ff7a5fa498867fbcd33644600f98041b2a9f6a.jpg',
  'img_caption': ['Figure 1 A comparison of Chain of Continuous Thought (Coconut) with Chain-of-Thought (CoT). In CoT, the model generates the reasoning process as a word token sequence (e.g., $[x_{i},x_{i+1},...,x_{i+j}]$ in the fgure). Coconut regards the last hidden state as a representation of the reasoning state (termed “continuous thought”), and directly uses it as the next input embedding. This allows the LLM to reason in an unrestricted latent space instead of a language space. '],
  'img_footnote': [],
  'page_idx': 1},
 {'type': 'image',
  'img_path': 'images/e72083f8a262261062393a5c15691de83822ae7b995a8d74e27b22ad37bcb993.jpg',
  'img_caption': ['Figure 2 Training procedure of Chain of Continuous Thought (Coconut). Given training data with language reasoning steps, at each training stage we integrate $c$ additional continuous thoughts ( $c=1$ in this example), and 

In [194]:
# NOTE(review): restore_md_toc and pdf_toc are not defined anywhere in this
# part of the notebook; presumably earlier cells provide them — confirm that
# this cell still works after Restart Kernel -> Run All.
md_content = restore_md_toc(md_file, pdf_toc)

### Markdown Process

In [78]:
import re

def split_markdown_into_dicts(filepath, level=2):
    """
    Splits a markdown file into a list of dictionaries based on a specified title level.

    Lines between two headings of the given level (blank lines included) form
    the text of a section. A heading with no body still produces a section
    with empty text, so its title is not lost. Content before the first
    heading becomes a section with an empty title, but only when it contains
    something other than whitespace.

    Args:
        filepath: Path to the markdown file.
        level: The heading level to split by (e.g., 2 for ## headings).

    Returns:
        A list of dictionaries, where each dictionary represents a section
        with 'level', 'section_num' (1-based), 'title', and 'text' keys.
        Returns an empty list if the file doesn't exist.
        Returns None if the file exists but reading it fails (the error is
        printed).
    """

    if not os.path.exists(filepath):
        return []

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            markdown_content = f.read()
    except Exception as e:
        print(f"Error reading file: {e}")
        return None

    title_pattern = re.compile(rf"^#{{{level}}}\s+(.+)$", re.MULTILINE)

    sections = []

    def append_section(title, lines, titled):
        # Titled sections are always stored; the untitled preamble is stored
        # only when it holds non-whitespace text, which avoids an empty
        # phantom section when the file merely starts with blank lines.
        text = "\n".join(lines).strip()  # Remove leading/trailing whitespace
        if titled or text:
            sections.append({
                'level': level,
                'section_num': len(sections) + 1,
                'title': title,
                'text': text
            })

    current_title = ""
    body_lines = []
    heading_seen = False  # becomes True once the first matching heading is met

    for line in markdown_content.splitlines():
        match = title_pattern.match(line)
        if match:
            # A new heading closes the section collected so far.
            append_section(current_title, body_lines, heading_seen)
            current_title = match.group(1).strip()
            body_lines = []  # Start a new section (no title line)
            heading_seen = True
        else:
            body_lines.append(line)

    # Store the last section.
    append_section(current_title, body_lines, heading_seen)

    return sections

In [79]:
# NOTE: this rebinds `sections`, which an earlier cell assigned from
# reorg_section_content. The dicts built by split_markdown_into_dicts have
# only 'level', 'section_num', 'title' and 'text' (no 'paragraphs' key).
sections = split_markdown_into_dicts(md_file, level=1)

In [None]:
# Unfinished cell: walks every section and, for each one, every entry of
# pdf_toc. The inner loop had no body, which made the whole cell a syntax
# error; an explicit placeholder keeps the notebook runnable until the
# per-entry logic is written.
for sec in sections:
    sec_level = sec.get('level')
    sec_num = sec.get('section_num')
    sec_title = sec.get('title')
    sec_text = sec.get('text')
    for item in pdf_toc:
        pass  # TODO: decide what to do with each (section, TOC entry) pair

In [80]:
sections

[{'level': 1,
  'section_num': 1,
  'title': 'Training Large Language Models to Reason in a Continuous Latent Space',
  'text': 'Shibo Hao $^{1,2,*}$ , Sainbayar Sukhbaatar1, DiJia $\\mathtt{s u}^{1}$ , Xian Li1, Zhiting ${\\mathsf{H}}{\\mathsf{u}}^{2}$ , Jason Weston1, Yuandong Tian1   \n1FAIR at Meta, $^2$ UC San Diego   \n∗Work done at Meta  \n\nLarge language models (LLMs) are restricted to reason in the “language space”, where they typically express the reasoning process with a chain-of-thought (CoT) to solve a complex reasoning problem. However, we argue that language space may not always be optimal for reasoning. For example, most word tokens are primarily for textual coherence and not essential for reasoning, while some critical tokens require complex planning and pose huge challenges to LLMs. To explore the potential of LLM reasoning in an unrestricted latent space instead of using natural language, we introduce a new paradigm Coconut (Chain of Continuous Thought). We utilize 

### Restore Images and Tables