## Running Test for Paper Reaqd Through

- time: 2025-02-20
- first trial: on pdf processing

In [1]:
import re
import fitz
import toml
import copy
from collections import Counter
from typing import List, Dict, Optional

In [2]:
title = "Towards System 2 Reasoning in LLMs: Learning How to Think With Meta Chain-of-Though"
pdf_path = "/home/jiezi/Code/Temp/data/2501.04682v1.pdf"
data_path = "../data"

## PDF Metadata

In [3]:
from apis.semanticscholar_tool import SemanticScholarKit

ss = SemanticScholarKit()
ss_metadata = ss.search_paper_by_keywords(query=title, limit=3)

paper_ss_id = ss_metadata[0].get('paperId')
print(paper_ss_id)

reference_metadata = ss.get_semanticscholar_references(paper_id=paper_ss_id, limit=100)
len(reference_metadata)

2025-02-21 09:55:30,655 - INFO - HTTP Request: GET https://api.semanticscholar.org/graph/v1/paper/search?query=Towards+System+2+Reasoning+in+LLMs%3A+Learning+How+to+Think+With+Meta+Chain-of-Though&fields=abstract%2Cauthors%2CcitationCount%2CcitationStyles%2CcorpusId%2CexternalIds%2CfieldsOfStudy%2CinfluentialCitationCount%2CisOpenAccess%2Cjournal%2CopenAccessPdf%2CpaperId%2CpublicationDate%2CpublicationTypes%2CpublicationVenue%2CreferenceCount%2Cs2FieldsOfStudy%2Ctitle%2Curl%2Cvenue%2Cyear&offset=0&limit=3 "HTTP/1.1 200 OK"


0e63a3aebf14fc7a68c0df7a922770bde5b77360


2025-02-21 09:55:31,846 - INFO - HTTP Request: GET https://api.semanticscholar.org/graph/v1/paper/0e63a3aebf14fc7a68c0df7a922770bde5b77360/references?fields=contexts%2Cintents%2CcontextsWithIntent%2CisInfluential%2Cabstract%2Cauthors%2CcitationCount%2CcitationStyles%2CcorpusId%2CexternalIds%2CfieldsOfStudy%2CinfluentialCitationCount%2CisOpenAccess%2Cjournal%2CopenAccessPdf%2CpaperId%2CpublicationDate%2CpublicationTypes%2CpublicationVenue%2CreferenceCount%2Cs2FieldsOfStudy%2Ctitle%2Curl%2Cvenue%2Cyear&offset=0&limit=100 "HTTP/1.1 200 OK"


0

## Outline Generation

pass

In [None]:
import sys
import os

parent_dir = os.path.dirname(os.getcwd())
sys.path.append(parent_dir)

In [None]:
from pdf_process.pdf_outline_gen import PDFOutline

In [None]:
outline = PDFOutline(pdf_path=pdf_path)
toc_1 = outline.toc_extraction()
toc_2 = outline.toc_detection()

toc_1_rvsd = outline.identify_toc_appendix(toc_1)
toc_2_rvsd = outline.identify_toc_appendix(toc_2)

## PDF Process

success

In [None]:
from apis.mineru_tool import MinerUKit

In [None]:
import os
mineru = MinerUKit(api_key=os.getenv('MINERU_API_KEY_1'))

In [None]:
res = mineru.batch_process_files(pdf_files=[pdf_path], if_ocr=False, lang='en')

In [None]:
if res.status_code == 200:
    batch_id = res.json().get('data', {}).get('batch_id')
    print(batch_id)
    if batch_id:
        mineru.monitor_batch_status(batch_id=batch_id, save_path=data_path, interval=10, max_retries=10)

## PDF Post Process

In [None]:
file_name = os.path.basename(pdf_path)
file_name_nosuffix = file_name.rsplit('.', 1)[0] 
processed_file_path = os.path.join(data_path, file_name_nosuffix)

md_file = os.path.join(processed_file_path, "full.md")
content_json_file = os.path.join(processed_file_path, "content_list.json")

import json
with open(content_json_file) as json_data:
    content_json = json.load(json_data)

with open(md_file, 'r', encoding='utf-8') as f:
    markdown_content = f.read()

In [None]:
from pdf_process.pdf_post_process import PDFProcess

In [None]:
pdf = PDFProcess(pdf_path=pdf_path, pdf_toc=toc_1_rvsd,pdf_json=content_json)

In [None]:
pdf.align_md_toc()

In [None]:
pdf.align_content_json()

In [None]:
pdf.align_reference_info(reference_metadata)

In [None]:
pdf_json_rvsd_path = os.path.join(processed_file_path, "processed_content_list.json")
with open(pdf_json_rvsd_path, "w") as file:
    json.dump(pdf.pdf_json, file, indent=4)

## PDF Segmentation

In [55]:
import json

pdf_json_rvsd_path = "/home/jiezi/Code/GitHub/PaperReadThrough/data/2501.04682v1/processed_content_list.json"
with open(pdf_json_rvsd_path, "r") as file:
    pdf_json_rvsd = json.load(file)

In [56]:
# name patterns for image / table / equation names
IMG_REGX_NAME_PTRN = r"(pic|picture|img|image|chart|figure|fig|table|tbl)\s*([0-9]+(?:\.[0-9]+)?|[0-9]+|[IVXLCDM]+|[a-zA-Z]+)"
TBL_REGX_NAME_PTRN = r"(tbl|table|chart|figure|fig)\s*([0-9]+(?:\.[0-9]+)?|[0-9]+|[IVXLCDM]+|[a-zA-Z]+)"
EQT_REGX_NAME_PTRN = r"(formula|equation|notation|syntax)\s*([0-9]+(?:\.[0-9]+)?|[0-9]+|[IVXLCDM]+|[a-zA-Z]+)"

In [None]:
class PDFSeg:
    def __init__(self, pdf_json):
        self.pdf_json = pdf_json

    def get_toc_hierachy(self):
        """generate ToC tree
        Args:
            pdf_json:
        Returns:
            tree form hierachy of sections
        """
        toc_hierachy = []
        section_stack = []

        for i, item in enumerate(self.pdf_json):
            if item['type'] == 'title':
                level = item['text_level']
                title = item['text']

                while section_stack and section_stack[-1]['level'] >= level:
                    popped_section = section_stack.pop()
                    popped_section['end_position'] = i - 1
                    if section_stack:
                        section_stack[-1]['subsection'].append(popped_section)
                    else:
                        toc_hierachy.append(popped_section)

                new_section = {'title': title, 'level': level, 'start_position': i, 'end_position': -1, 'subsection': []}
                section_stack.append(new_section)

        while section_stack:
            popped_section = section_stack.pop()
            popped_section['end_position'] = len(self.pdf_json) - 1
            if section_stack:
                section_stack[-1]['subsection'].append(popped_section)
            else:
                toc_hierachy.append(popped_section)

        return toc_hierachy
    
    def gen_seg_paras(self, toc_hierachy, seg_text_length:Optional[int]=20000):
        """segment content json based on toc hierachy"""
        pdf_texts = [item.get('text', '') for item in self.pdf_json]

        all_seg_paras = []
        for section in toc_hierachy:
            section_paras = []
            
            start_pos = section['start_position']
            end_pos = section['end_position']
            tmp_text = "\n".join(pdf_texts[start_pos:end_pos+1])
            
            if len(tmp_text) > seg_text_length and section.get('subsection', []) != []:
                # if the section is too long, then breakdown to subsection
                for subsection in section.get('subsection'):
                    sub_start_pos = subsection['start_position']
                    sub_end_pos = subsection['end_position']
                    section_paras.append(self.pdf_json[sub_start_pos:sub_end_pos+1])
                    tmp_text = "\n".join(pdf_texts[sub_start_pos:sub_end_pos+1])
                    print('subsection', subsection.get('title'), len(tmp_text))
            else:
                section_paras.append(self.pdf_json[start_pos:end_pos+1])
                print('section', section.get('title'), len(tmp_text))
                    
            all_seg_paras.extend(section_paras)
        return all_seg_paras

    def gen_md_from_json(self, content_json):
        """input json with predefined format and convert to markdown"""
        md_text = ""
        if len(content_json) > 0:
            for item in content_json:
                if item.get('type') == 'title':
                    md_text += f"{'#'*item.get('text_level')} {item.get('text')}  \n" 

                elif item.get('type') in ['image']:
                    alt_text = "\n".join(item.get('img_caption', [])) 
                    md_text += f"\n![{alt_text}]({item.get('img_path')} '{item.get('id')}')  \n"  
                    md_text += "\n".join(item.get('img_footnote'), []) 

                elif item.get('type') in ['table']:
                    alt_text = "\n".join(item.get('table_caption', [])) 
                    md_text += f"\n![{alt_text}]({item.get('img_path')} '{item.get('id')}')  \n"  
                    md_text += "\n".join(item.get('table_footnote'), []) 

                elif item.get('type') in ['equation']:
                    md_text += f"""```latex\n{item.get('text')}\n```"""

                elif item.get('type') in ['text', 'reference']:
                    md_text += f"{item.get('text')}  \n"  
        return md_text
    

    def restore_seg_elements(self, seg_paras):
        """put all elements (images, tables, equations, refs) metioned in place where the refered to"""

        img_lst = [x for x in self.pdf_json if x.get('type')=='image']
        tbl_lst = [x for x in self.pdf_json if x.get('type')=='table']
        eqt_lst = [x for x in self.pdf_json if x.get('type')=='equation']
        ref_lst = [x for x in self.pdf_json if x.get('type')=='reference']

        seg_paras_rvsd = []
        for seg in seg_paras:
            seg_img_lst = [x for x in seg if x.get('type')=='image']
            seg_tbl_lst = [x for x in seg if x.get('type')=='table']
            seg_eqt_lst = [x for x in seg if x.get('type')=='equation']
            seg_ref_lst = [x for x in seg if x.get('type')=='reference']

            for item in seg:
                if item.get('if_being_reffered') is None:
                    item_text = item.get('text', '')

                    mtch_rslts = re.finditer(IMG_REGX_NAME_PTRN, item_text, re.IGNORECASE)
                    for match in mtch_rslts:
                        img_id = match.group(0)
                        if img_id not in [x.get('id') for x in seg_img_lst]:
                            added_items = [x for x in img_lst if x.get('id')==img_id]
                            print(added_items)
                            for y in added_items:
                                y['if_being_reffered'] = True
                            seg_img_lst.extend(added_items)
                            seg.extend(added_items)

                    mtch_rslts = re.finditer(TBL_REGX_NAME_PTRN, item_text, re.IGNORECASE)
                    for match in mtch_rslts:
                        tbl_id = match.group(0)
                        if tbl_id not in [x.get('id') for x in seg_tbl_lst]:
                            added_items = [x for x in tbl_lst if x.get('id')==tbl_id]
                            for y in added_items:
                                y['if_being_reffered'] = True
                            seg_tbl_lst.extend(added_items)
                            seg.extend(added_items)

                    mtch_rslts = re.finditer(EQT_REGX_NAME_PTRN, item_text, re.IGNORECASE)
                    for match in mtch_rslts:
                        eqt_id = match.group(0)
                        if eqt_id not in [x.get('id') for x in seg_eqt_lst]:
                            added_items = [x for x in eqt_lst if x.get('id')==eqt_id]
                            for y in added_items:
                                y['if_being_reffered'] = True
                            seg_eqt_lst.extend(added_items)
                            seg.extend(added_items)
            seg_paras_rvsd.append(seg)
        
        return seg_paras_rvsd
                
                

In [58]:
seg = PDFSeg(pdf_json_rvsd)

In [59]:
toc_hierachy = seg.get_toc_hierachy()

In [60]:
toc_hierachy

[{'title': '1. Introduction ',
  'level': 1,
  'start_position': 18,
  'end_position': 55,
  'subsection': [{'title': '1.1. Motivation ',
    'level': 2,
    'start_position': 19,
    'end_position': 51,
    'subsection': []},
   {'title': '1.2. Outline ',
    'level': 2,
    'start_position': 52,
    'end_position': 55,
    'subsection': []}]},
 {'title': '2. Meta Chain-Of-Thought ',
  'level': 1,
  'start_position': 56,
  'end_position': 79,
  'subsection': [{'title': '2.1. Deriving The Meta-CoT Process ',
    'level': 2,
    'start_position': 58,
    'end_position': 73,
    'subsection': []},
   {'title': '2.2. Why Does (Classical) CoT Fail? ',
    'level': 2,
    'start_position': 74,
    'end_position': 79,
    'subsection': []}]},
 {'title': '3. Towards Deliberate Reasoning With Language Models - Search ',
  'level': 1,
  'start_position': 80,
  'end_position': 117,
  'subsection': [{'title': '3.1. Inference-Time Compute: Search ',
    'level': 2,
    'start_position': 86,
    'e

In [61]:
seg_paras = seg.gen_seg_paras(toc_hierachy)

section 1. Introduction  6335
section 2. Meta Chain-Of-Thought  10669
section 3. Towards Deliberate Reasoning With Language Models - Search  13821
subsection 4.1. Bootstrapping Meta-CoT  3760
subsection 4.2. Empirical Examples Of Internalizing Search  13436
subsection 4.3. Synthetic Meta-CoT Via Search  4532
subsection 4.4. Do Advanced Reasoning Systems Implement In-Context Search?  4534
section 5. Process Supervision  6885
subsection 6.1. Meta-RL In Small Domains  1275
subsection 6.2. Meta-RL In Language Model Reasoning  2672
subsection 6.3. Efciency Or Super-Intelligence?  3958
subsection 6.4. Can System 2 Reasoning Emerge From Pure RL?  9318
section 7. Putting It All Together - A Pipeline for System 2 Reasoning  15162
subsection 8.1. The "Big MATH" Project  10507
subsection 8.2. Infrastructure  2648
subsection 8.3. Open Research Questions  11793
section 9. Conclusion  1745
section 10. Acknowledgments  197
section References  32378
section A. Prompting  711
section B. Regret Analysis

In [62]:
seg_paras_rvsd = seg.restore_seg_elements(seg_paras)

[]
[]
[]
[]
[{'type': 'image', 'img_path': 'images/c9d400a79bd62951a363cac2aabd041f71ae1d695b6a98d1affe998739c86cb1.jpg', 'img_caption': ['Figure 1: Top: Performance of current frontier models by size on the HARP mathematics benchmark (Yue et al., 2024) by difculty level and topic. The OpenAI O1 series signifcantly out-performs prior generation models across the board. Source: Figure 3 in (Yue et al., 2024). Bottom Average number of tokens generated by each model grouped by difculty level, as well as average number of tokens in human-generated solutions (using GPT4 tokenizer). Source: Figure 4 in (Yue et al., 2024). '], 'img_footnote': [], 'page_idx': 7, 'id': 'Figure 1', 'related_ids': ['Figure 3', 'Figure 4'], 'if_aligned': True}]
[]
[{'type': 'image', 'img_path': 'images/98af3e8774697add630cde871085b466600f00539cd676efc56ec7b6e7631f9c.jpg', 'img_caption': ['Figure 13: Resulting $\\mathbf{A}^{*}$ search tree on the math problem from OpenAI (2024). This trace presents more of a best-f

In [None]:
md_text = seg.gen_md_from_json(pdf_json_rvsd)