## Running Test for Paper Read-Through

- time: 2025-02-20
- first trial: on pdf processing

In [None]:
import re
import fitz
import toml
import copy
from collections import Counter
from typing import List, Dict, Optional

In [None]:
# Paper under test. `title` is used as the Semantic Scholar keyword query, so it
# must be spelled exactly as published (the trailing "Thought" was truncated).
title = "Towards System 2 Reasoning in LLMs: Learning How to Think With Meta Chain-of-Thought"
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
pdf_path = "/home/jiezi/Code/Temp/data/2501.04682v1.pdf"
# Directory (relative to the notebook's working directory) used for processed output.
data_path = "../data"

## PDF Metadata

In [None]:
from apis.semanticscholar_tool import SemanticScholarKit

# Look the paper up by title and keep the id of the first hit.
ss = SemanticScholarKit()
ss_metadata = ss.search_paper_by_keywords(query=title, limit=3)

# Fail with a clear message instead of an IndexError/TypeError on `ss_metadata[0]`
# when the keyword search comes back empty.
if not ss_metadata:
    raise ValueError(f"Semantic Scholar returned no match for title: {title!r}")

paper_ss_id = ss_metadata[0].get('paperId')
print(paper_ss_id)

# Fetch up to 100 references of that paper; the count is shown as the cell output.
reference_metadata = ss.get_semanticscholar_references(paper_id=paper_ss_id, limit=100)
len(reference_metadata)

## Outline Generation

Status: pass

In [None]:
import sys
import os

# Make the parent of the current working directory importable.
# NOTE(review): cells above this one already import from `apis`; presumably this
# path setup has to run before them -- confirm the intended execution order.
parent_dir = os.path.dirname(os.getcwd())
# Guard keeps sys.path free of duplicate entries when the cell is re-run.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [None]:
from pdf_process.pdf_outline_gen import PDFOutline

In [None]:
# Build two candidate tables of contents for the PDF, one from each PDFOutline
# method (toc_extraction and toc_detection).
outline = PDFOutline(pdf_path=pdf_path)
toc_1 = outline.toc_extraction()
toc_2 = outline.toc_detection()

# Run identify_toc_appendix on each candidate; the "_rvsd" results are what the
# later cells consume (toc_1_rvsd is passed to PDFProcess).
toc_1_rvsd = outline.identify_toc_appendix(toc_1)
toc_2_rvsd = outline.identify_toc_appendix(toc_2)

## PDF Process

Status: success

In [None]:
from apis.mineru_tool import MinerUKit

In [None]:
import os
# Create the MinerU client; the API key is read from the MINERU_API_KEY_1
# environment variable (None is passed through when it is not set).
mineru = MinerUKit(api_key=os.environ.get('MINERU_API_KEY_1'))

In [None]:
# Submit the PDF to MinerU as a single-file batch (if_ocr=False, lang='en').
# `res` is the raw response; its status code and JSON payload are read in the next cell.
res = mineru.batch_process_files(pdf_files=[pdf_path], if_ocr=False, lang='en')

In [None]:
# On HTTP 200, read the batch id from the response payload and hand it to
# monitor_batch_status (save_path=data_path, 10 s interval, at most 10 retries).
if res.status_code == 200:
    batch_id = res.json().get('data', {}).get('batch_id')
    print(batch_id)
    if batch_id:
        mineru.monitor_batch_status(batch_id=batch_id, save_path=data_path, interval=10, max_retries=10)
    else:
        # Without a batch id there is nothing to poll; say so instead of ending silently.
        print("MinerU response did not contain a batch_id; nothing to monitor.")
else:
    # Surface failed submissions rather than skipping the cell body without a trace.
    print(f"MinerU batch submission failed with HTTP status {res.status_code}")

## PDF Post Process

In [None]:
# Processed files are expected under <data_path>/<pdf file name without extension>/
# (presumably written there by the MinerU monitoring step -- confirm).
file_name = os.path.basename(pdf_path)
file_name_nosuffix = file_name.rsplit('.', 1)[0]
processed_file_path = os.path.join(data_path, file_name_nosuffix)

md_file = os.path.join(processed_file_path, "full.md")
content_json_file = os.path.join(processed_file_path, "content_list.json")

import json
# Read both files as UTF-8 explicitly so the result does not depend on the
# platform's default encoding (the markdown file was already opened this way).
with open(content_json_file, 'r', encoding='utf-8') as json_data:
    content_json = json.load(json_data)

with open(md_file, 'r', encoding='utf-8') as f:
    markdown_content = f.read()

In [None]:
from pdf_process.pdf_post_process import PDFProcess

In [None]:
# Post-processor for this paper: takes the PDF path, the revised TOC from
# toc_extraction (toc_1_rvsd) and the content list loaded from content_list.json.
pdf = PDFProcess(pdf_path=pdf_path, pdf_toc=toc_1_rvsd,pdf_json=content_json)

In [None]:
# Run PDFProcess.align_md_toc(); its behaviour is defined in
# pdf_process.pdf_post_process and is not visible in this notebook.
pdf.align_md_toc()

In [None]:
# Run PDFProcess.align_content_json(); presumably updates pdf.pdf_json, which the
# later cells save and segment -- confirm against pdf_process.pdf_post_process.
pdf.align_content_json()

In [None]:
# Hand the Semantic Scholar reference list fetched earlier to
# PDFProcess.align_reference_info().
pdf.align_reference_info(reference_metadata)

In [None]:
# Save the post-processed content list next to the other processed files.
pdf_json_rvsd_path = os.path.join(processed_file_path, "processed_content_list.json")
with open(pdf_json_rvsd_path, mode="w") as file:
    json.dump(obj=pdf.pdf_json, fp=file, indent=4)

## PDF Segmentation

In [None]:
def get_toc_hierachy(pdf_json):
    """Build a nested section tree from the flat markdown JSON item list.

    Every item whose ``type`` is ``'title'`` opens a section. A section stays
    open until the next title with the same or a smaller ``text_level``
    appears (or the list ends); sections opened in between become its
    ``subsection`` entries.

    Args:
        pdf_json: List of markdown JSON items. Title items must provide
            ``text_level`` and ``text``.

    Returns:
        List of top-level section dicts with the keys ``title``, ``level``,
        ``start_position``, ``end_position`` (inclusive indices into
        ``pdf_json``) and ``subsection`` (list of nested section dicts).
    """
    roots = []
    open_sections = []

    def _close_innermost(last_index):
        # Finish the innermost open section and attach it to its parent, or to
        # the top-level list when no parent is open.
        finished = open_sections.pop()
        finished['end_position'] = last_index
        siblings = open_sections[-1]['subsection'] if open_sections else roots
        siblings.append(finished)

    for position, entry in enumerate(pdf_json):
        if entry['type'] != 'title':
            continue

        depth = entry['text_level']
        heading = entry['text']

        # A new heading ends every open section at the same or a deeper level.
        while open_sections and open_sections[-1]['level'] >= depth:
            _close_innermost(position - 1)

        open_sections.append({
            'title': heading,
            'level': depth,
            'start_position': position,
            'end_position': -1,
            'subsection': [],
        })

    # Whatever is still open runs to the last item of the document.
    final_index = len(pdf_json) - 1
    while open_sections:
        _close_innermost(final_index)

    return roots

In [None]:
# Build the nested section tree from the post-processed content list.
toc_hierachy = get_toc_hierachy(pdf.pdf_json)

In [None]:
# Notebook display of the section tree built in the previous cell.
toc_hierachy

In [None]:
import tiktoken

def calculate_tokens(text, encoding_name="cl100k_base"):
    """Count the tokens in ``text`` under a tiktoken encoding.

    Args:
        text: String to tokenize.
        encoding_name: Name of the tiktoken encoding to load.

    Returns:
        The number of tokens, or ``None`` (after printing an error message)
        when the encoding name is unknown or the text cannot be encoded.
    """
    try:
        encoding = tiktoken.get_encoding(encoding_name)
        return len(encoding.encode(text))
    except ValueError:
        # tiktoken exposes no ``EncodingError``; naming it in the except clause
        # raised AttributeError as soon as any exception reached the handler.
        # Unknown encoding names and disallowed special tokens both surface as
        # ValueError.
        print(f"Error: Encoding '{encoding_name}' not found or text encoding failed. Please check the encoding name and text format.")
        return None

In [None]:
# Notebook display of the section tree again (same value as shown above).
toc_hierachy

In [None]:
def gen_segmentation(self, toc_hierachy, seg_text_length:Optional[int]=20000):
    """Segment the content JSON along the TOC hierarchy.

    Each top-level section becomes one segment. A section whose joined text is
    longer than ``seg_text_length`` characters and that has subsections is
    split into one segment per subsection instead; the section's own heading
    and any lead-in items before the first subsection are kept with the first
    subsection's segment so no item of the section is lost.

    Args:
        toc_hierachy: List of section dicts carrying ``start_position``,
            ``end_position`` (inclusive indices into ``self.pdf_json``) and an
            optional ``subsection`` list of dicts with the same keys.
        seg_text_length: Maximum section length in characters before it is
            split into subsections. ``None`` disables splitting.

    Returns:
        List of segments; every segment is a slice (list) of ``self.pdf_json``
        items.
    """
    pdf_texts = [item.get('text', '') for item in self.pdf_json]

    all_seg_paras = []
    for section in toc_hierachy:
        start_pos = section['start_position']
        end_pos = section['end_position']
        subsections = section.get('subsection') or []

        # Measure the section *before* deciding how to split it. The length used
        # to be read from a variable that was only assigned further down, which
        # raised UnboundLocalError on the first section.
        section_text = "\n".join(pdf_texts[start_pos:end_pos + 1])
        too_long = seg_text_length is not None and len(section_text) > seg_text_length

        if too_long and subsections:
            # Section is too long: break it down into its subsections.
            for order, subsection in enumerate(subsections):
                sub_end_pos = subsection['end_position']
                # The first segment starts at the section itself so the section
                # heading and lead-in text are not dropped.
                seg_start = start_pos if order == 0 else subsection['start_position']
                all_seg_paras.append(self.pdf_json[seg_start:sub_end_pos + 1])
                sub_text = "\n".join(pdf_texts[seg_start:sub_end_pos + 1])
                print('subsection', subsection, len(sub_text))
        else:
            all_seg_paras.append(self.pdf_json[start_pos:end_pos + 1])
            print('section', section, len(section_text))

    return all_seg_paras

In [None]:
# Number of segments produced.
# NOTE(review): in the visible code `all_seg_paras` is only assigned inside
# gen_segmentation; presumably it was bound at notebook scope from a
# gen_segmentation(...) call in a cell that is not shown -- confirm.
len(all_seg_paras)

In [None]:
# Inspect the first segment (notebook display).
all_seg_paras[0]

In [None]:
    def restore_seg_information(self, seg_paras):
        """Restore images, tables and references inside one markdown segment.

        Images and tables follow the same two rules, applied line by line:

        * the item is cited in the line (its ``id`` or ``title`` occurs there)
          but its markdown reference ``mod_md_ref`` is missing from the
          segment -> the reference is inserted right after the citing line and
          the item is recorded;
        * the line contains the markdown reference -> the item is recorded when
          it is cited somewhere else in the segment, otherwise the reference is
          blanked out of the line.

        A reference entry is matched when one of its ``contexts`` snippets
        occurs in a segment line; the ``org_md_ref`` of every matched entry is
        appended to the end of the segment.

        NOTE(review): ``md_text``, ``img_lst``, ``tbl_lst`` and ``ref_lst`` are
        neither parameters nor attributes of ``self``; they are looked up in the
        enclosing (notebook/module) scope, and ``seg_paras`` is not used by the
        body. Presumably the signature is mid-refactor -- confirm how these
        inputs are meant to be passed in.

        Args:
            seg_paras: PDF content json organized in segments, data from
                gen_segmentation function (currently unused).

        Returns:
            Tuple ``(text, seg_images, seg_tbls, seg_refs)``: the rebuilt
            segment text, the image and table entries kept for the segment,
            and the ``citedPaper`` dicts of the references matched in it.
        """
        lines = md_text.splitlines()
        seg_images, seg_tbls, seg_refs = [], [], []

        def _mentions(text, item):
            # A missing or blank id/title never counts as a mention (a blank
            # string would otherwise be "found" in every line).
            return any(key and key in text for key in (item.get('id'), item.get('title')))

        def _restore_items(idx, items, kept):
            # Apply the insert / blank-out rules of every item to lines[idx].
            for item in items:
                md_ref = (item.get('mod_md_ref') or '').strip()
                if not md_ref:
                    # An empty reference matches every line, and
                    # str.replace('', ...) would pad every character.
                    continue

                # Re-read the line each time: an earlier item may already have
                # rewritten it, and working from a stale copy would undo that.
                line = lines[idx]
                seg_text = "\n".join(lines)

                if md_ref not in seg_text:
                    # item cited in line but its reference is not in the segment
                    if _mentions(line.strip(), item):
                        lines.insert(idx + 1, md_ref)
                        if item not in kept:
                            kept.append(item)
                    continue

                if md_ref in line:
                    # Look for a citation outside the markdown reference itself.
                    if not _mentions(seg_text.replace(md_ref, ""), item):
                        # line contains the reference but nothing in the segment cites it
                        lines[idx] = line.replace(md_ref, "  ")
                    elif item not in kept:
                        kept.append(item)

        # restore images and tables in segment; the list can grow while it is
        # walked (inserted references are visited as well), hence the index loop
        idx = 0
        while idx < len(lines):
            if lines[idx].strip():
                _restore_items(idx, img_lst, seg_images)
                _restore_items(idx, tbl_lst, seg_tbls)
            idx += 1

        # restore refs in segment: one pass over the final lines, each reference
        # entry recorded at most once
        matched_refs = []
        matched_positions = set()
        for line in lines:
            if not line.strip():
                continue
            for pos, ref in enumerate(ref_lst):
                if pos in matched_positions:
                    continue
                contexts = ref.get('contexts') or []
                if any(ctx and ctx.strip() and ctx.strip() in line for ctx in contexts):
                    matched_positions.add(pos)
                    matched_refs.append(ref)
                    # get only ref paper information, neglect isInfluential, intent, etc.
                    seg_refs.append(ref.get('citedPaper', {}))

        # append the markdown entries of the matched references to the segment
        lines.extend(ref['org_md_ref'] for ref in matched_refs if ref.get('org_md_ref'))

        return "\n".join(lines), seg_images, seg_tbls, seg_refs