# Debug

## Test on ToC Detection

In [1]:
import re
import fitz
import toml
import copy
from collections import Counter
from typing import List, Dict, Optional

from pdf_process.pdf_meta_det import extract_meta, dump_toml
from pdf_process.pdf_toc_det import gen_toc

# Section headings used as the default `titles` argument of
# PDFOutline.toc_detection, where they are escaped and joined into one
# alternation regex.
# NOTE(review): " Routing Function" starts with a space - confirm that is intended.
SECTION_TITLES = ["Abstract",
                'Introduction', 'Related Work', 'Background',
                "Introduction and Motivation", "Computation Function", " Routing Function",
                "Preliminary", "Problem Formulation",
                'Methods', 'Methodology', "Method", 'Approach', 'Approaches',
                "Materials and Methods", "Experiment Settings",
                'Experiment', "Experimental Results", "Evaluation", "Experiments",
                "Results", 'Findings', 'Data Analysis',
                "Discussion", "Results and Discussion", "Conclusion",
                'References',
                "Acknowledgments", "Appendix", "FAQ", "Frequently Asked Questions"]
# Headings that mark appendix / back-matter sections; matched
# case-insensitively by identify_toc_appendix and align_md_toc.
APPENDDIX_TITLES = ["References", "Acknowledgments", "Appendix", "FAQ", "Frequently Asked Questions"]     

def count_by_keys(lst_dct, keys):
    """Count how often each combination of values occurs in a list of dicts.

    Args:
        lst_dct: list of dict
        keys: dict keys to read from every dict, e.g. ['a', 'b', 'c'];
            a key missing from a dict contributes ``None``.
    Returns:
        list: ``(combination, count)`` pairs, where ``combination`` is the
        tuple of values for ``keys``.  The pairs are ordered by the
        combination itself, largest first (not by count).
    """
    tally = Counter(tuple(entry.get(k) for k in keys) for entry in lst_dct)
    return sorted(tally.items(), key=lambda pair: pair[0], reverse=True)


# Outline Detection
class PDFOutline:
    """Build a table of contents (outline) for a PDF opened with PyMuPDF.

    Two strategies are offered: ``toc_extraction`` reads the outline stored
    in the document, while ``toc_detection`` rebuilds one from the font size
    of text that matches a list of known section titles.
    """

    def __init__(self, pdf_path):
        """Remember ``pdf_path`` and open the document immediately.

        ``self.doc`` is ``None`` when the file could not be opened.
        """
        self.pdf_path = pdf_path
        self.doc = self.open_pdf()

    def open_pdf(self):
        """Open ``self.pdf_path`` with ``fitz.open``.

        Returns:
            The opened document, or ``None`` if opening raised; the error is
            printed instead of being propagated.
        """
        try:
            return fitz.open(self.pdf_path)
        except Exception as e:
            print(f"处理 PDF 文件时出错: {self.pdf_path}, 错误信息: {e}")
            return None  # best effort; raise here instead if callers need a hard failure

    def _collect_excerpt(self, page_num, pos, excpert_len):
        """Concatenate text blocks of one page into a short excerpt.

        Args:
            page_num: 1-based page number of the ToC entry.
            pos: position object of the entry; only blocks whose left edge
                ``x0`` is at or beyond ``pos.x`` are kept.  A falsy ``pos``
                keeps nothing.
            excpert_len: stop adding blocks once the excerpt has reached
                this many characters; ``None`` disables the limit.
        Returns:
            The collected text, or ``""`` when ``page_num`` is not a page of
            the document.
        """
        # Guard against page numbers outside the document: a non-positive
        # number would otherwise index pages from the end of the document.
        if not 1 <= page_num <= len(self.doc):
            return ""

        excerpt = ""
        for block in self.doc[page_num - 1].get_text("blocks"):
            if excpert_len is not None and len(excerpt) >= excpert_len:
                break
            x0, text = block[0], block[4]
            if pos and x0 >= pos.x:
                excerpt += text
        return excerpt

    def toc_extraction(self, excpert_len:Optional[int]=300):
        """apply pymupdf to extract outline
        Args:
            excpert_len: excerpt length of initial text (``None`` = no limit)
        Return:
            pdf_toc: pdf toc including level, title, page, position, nameddest, if_collapse, excerpt
                     if_collapse: value of the entry's 'collapse' field
                                  (False when absent, None when the entry has no detail dict)
                     excerpt: initial text
        """
        toc = self.doc.get_toc(simple=False) or []

        pdf_toc = []
        for item in toc:
            lvl = item[0] if len(item) > 0 else None
            title = item[1] if len(item) > 1 else None
            start_page = item[2] if len(item) > 2 else None
            # Fourth element is the detail dict of the entry; it may be missing or empty.
            detail = item[3] if len(item) > 3 and item[3] else None
            pos = detail.get('to') if detail else None
            nameddest = detail.get('nameddest') if detail else None
            if_collapse = detail.get('collapse', False) if detail else None

            # Entries without a page number are skipped.
            if start_page is None:
                continue

            lines = self._collect_excerpt(start_page, pos, excpert_len)
            pdf_toc.append({
                "level": lvl,
                "title": title,
                "page": start_page,
                "position": pos,
                "nameddest": nameddest,
                'if_collapse': if_collapse,
                "excerpt": lines + "..."
            })
        return pdf_toc

    def toc_detection(self, excpert_len:Optional[int]=300, titles=SECTION_TITLES):
        """identify toc based on title font, layout, etc

        Args:
            excpert_len: excerpt length of initial text (``None`` = no limit)
            titles: section titles searched for (case-insensitively) on every page
        Return:
            pdf_toc: same record layout as ``toc_extraction``; an empty list
                     when no font size is shared by more than two matches.
        """
        matched_meta_lst = []
        pattern = '|'.join(re.escape(title) for title in titles)
        for page_no in range(1, len(self.doc) + 1):
            # extract_meta returns font size (size), font style (flags), font type (char_flags)
            matched_meta_lst.extend(
                extract_meta(self.doc, pattern=pattern, page=page_no, ign_case=True)
            )

        # Title font size: the largest size that more than two matches share
        # (count_by_keys orders its result by size, largest first).
        for combination, count in count_by_keys(matched_meta_lst, ['size']):
            if count > 2:
                font_size = combination[0]
                break
        else:
            # No size is frequent enough to be trusted as the title font.
            return []

        # return to sampled_metadata to match all potential combinations
        title_meta_sample = [item for item in matched_meta_lst if item.get('size') == font_size]

        auto_level = 1
        addnl = False
        title_meta_toml = [dump_toml(m, auto_level, addnl) for m in title_meta_sample]

        # Load the TOML recipe directly from the generated string.
        recipe = toml.loads('\n'.join(title_meta_toml))
        toc = gen_toc(self.doc, recipe)

        pdf_toc = []
        for item in toc:
            start_page = item.pagenum
            # Entries without a page number are skipped.
            if start_page is None:
                continue

            lines = self._collect_excerpt(start_page, item.pos, excpert_len)
            pdf_toc.append({
                "level": item.level,
                "title": item.title,
                "page": item.pagenum,
                "position": item.pos,
                "nameddest": "section.",
                'if_collapse': None,
                "excerpt": lines + "..."
            })
        return pdf_toc

    def identify_toc_appendix(self, pdf_toc):
        """Return a deep copy of ``pdf_toc`` with an ``if_appendix`` flag on every entry.

        An entry is flagged when its title matches one of APPENDDIX_TITLES
        (case-insensitive), when its ``nameddest`` contains 'appendix', or
        when the previous entry is flagged - so every entry after the first
        appendix entry is flagged as well.  The input list is not modified.
        """
        pdf_toc_rvsd = copy.deepcopy(pdf_toc)
        pattern = '|'.join(re.escape(title) for title in APPENDDIX_TITLES)

        for idx, item in enumerate(pdf_toc_rvsd):
            # 'title' and 'nameddest' may be None (toc_extraction stores None
            # when the entry lacks them); treat that as an empty string.
            title = item.get('title') or ''
            nameddest = item.get('nameddest') or ''
            previous_flag = idx > 0 and bool(pdf_toc_rvsd[idx - 1].get('if_appendix'))

            item['if_appendix'] = bool(
                re.search(pattern, title, re.IGNORECASE)
                or 'appendix' in nameddest
                or previous_flag
            )
        return pdf_toc_rvsd

In [2]:
# Paper used for this debug session: its title and the local PDF path.
title = "Training Large Language Models to Reason in a Continuous Latent Space"
pdf_path = "/home/jiezi/Code/Temp/data/2412.06769v2.pdf"

# Build the outline twice: once from the outline stored in the PDF
# (toc_extraction) and once from the font-based detection (toc_detection).
outline = PDFOutline(pdf_path=pdf_path)
toc_1 = outline.toc_extraction()
toc_2 = outline.toc_detection()

# Add the `if_appendix` flag to every entry of both outlines.
toc_1_rvsd = outline.identify_toc_appendix(toc_1)
toc_2_rvsd = outline.identify_toc_appendix(toc_2)

## Align Markdown Titles with ToC

In [3]:
# Read the whole markdown file (UTF-8) into memory as a single string.
md_file = "/home/jiezi/Code/Temp/tmp/2412.06769v2/full.md"
with open(md_file, 'r', encoding='utf-8') as f:
    markdown_content = f.read()

In [4]:
def align_md_toc(pdf_md, pdf_toc):
    """Align markdown headings with a PDF table of contents.

    Every heading line (a line shorter than 100 characters that starts with
    '#') is rewritten by the first rule that applies:

    1. It contains the title of a ToC entry (case-insensitive): it is
       replaced by that title at the ToC level.
    2. It contains one of APPENDDIX_TITLES: it is kept unchanged and
       recorded as an appendix heading.
    3. Otherwise: it is pushed one level deeper than written and inherits
       the appendix flag of the previously recorded heading.  If no heading
       has been recorded yet, it is kept unchanged and recorded as level 1.

    All other lines are copied through unchanged.

    Args:
        pdf_md: markdown text of the document.
        pdf_toc: list of ToC dicts with 'title', 'level' and, optionally,
            'if_appendix' (treated as False when missing).
    Returns:
        tuple: ``(text, md_titles_info)`` where ``text`` is the rewritten
        markdown and ``md_titles_info`` is a de-duplicated list of dicts
        with 'title', 'level', 'if_appendix' and 'if_modified' keys.
    """
    heading_re = re.compile(r"^#{1,}\s*.*$")   # pattern of a markdown title
    leading_hashes_re = re.compile(r"^#{1,}")  # the '#' run that encodes the level
    appendix_re = None                         # built on first use

    modified_lines = []
    md_titles_info = []  # store title after modification

    for line in pdf_md.splitlines():
        is_candidate = bool(line.strip()) and len(line) < 100
        if not (is_candidate and heading_re.match(line)):
            # Not a heading: keep the line as is.  This includes short
            # non-heading lines, which must not be dropped.
            modified_lines.append(line)
            continue

        written_level = len(leading_hashes_re.match(line).group(0))
        title_info = None

        # Rule 1: refine the markdown title based on a matching ToC title.
        for toc in pdf_toc:
            toc_title = toc.get('title')
            if not toc_title:
                continue  # an empty title would match every heading
            if re.search(re.escape(toc_title), line, re.IGNORECASE):
                toc_level = int(toc['level'])
                section_title = "#" * toc_level + " " + toc_title + "  "
                title_info = {'title': section_title, 'level': toc_level,
                              'if_appendix': toc.get('if_appendix', False),
                              'if_modified': True}
                break

        # Rule 2: appendix-like headings that are not part of the ToC.
        if title_info is None:
            if appendix_re is None:
                appendix_re = re.compile(
                    '|'.join(re.escape(title) for title in APPENDDIX_TITLES),
                    re.IGNORECASE,
                )
            if appendix_re.search(line):
                section_title = line
                title_info = {'title': section_title, 'level': written_level,
                              'if_appendix': True, 'if_modified': False}

        # Rule 3: any other heading is downgraded by one more level.
        if title_info is None:
            if md_titles_info:
                level = written_level + 1
                section_title = leading_hashes_re.sub('#' * level, line)
                title_info = {'title': section_title, 'level': level,
                              'if_appendix': md_titles_info[-1].get('if_appendix'),
                              'if_modified': True}
            else:
                section_title = line
                title_info = {'title': section_title, 'level': 1,
                              'if_appendix': False, 'if_modified': False}

        modified_lines.append(section_title)
        if title_info not in md_titles_info:
            md_titles_info.append(title_info)

    return "\n".join(modified_lines), md_titles_info

In [5]:
# Align the markdown headings with the extracted outline (toc_1_rvsd);
# yields the rewritten markdown text and the list of heading records.
text_1, titles_1 = align_md_toc(markdown_content, toc_1_rvsd)

## Modify Reference List

In [6]:
from apis.arxiv_tool import ArxivKit
from apis.semanticscholar_tool import SemanticScholarKit

title = "Training Large Language Models to Reason in a Continuous Latent Space"

# Search Semantic Scholar by title, asking for at most 3 hits.
ss = SemanticScholarKit()
ss_metadata = ss.search_paper_by_keywords(query=title, limit=3)

# Take the first hit as the paper of interest.
# NOTE(review): indexing [0] presumably fails when the search returns nothing - confirm.
paper_ss_id = ss_metadata[0].get('paperId')
print(paper_ss_id)

# Fetch up to 100 references of that paper and show how many came back.
reference_metadata = ss.get_semanticscholar_references(paper_id=paper_ss_id, limit=100)
len(reference_metadata)

2025-02-14 19:03:00,252 - INFO - HTTP Request: GET https://api.semanticscholar.org/graph/v1/paper/search?query=Training%20Large%20Language%20Models%20to%20Reason%20in%20a%20Continuous%20Latent%20Space&fields=abstract,authors,citationCount,citationStyles,corpusId,externalIds,fieldsOfStudy,influentialCitationCount,isOpenAccess,journal,openAccessPdf,paperId,publicationDate,publicationTypes,publicationVenue,referenceCount,s2FieldsOfStudy,title,url,venue,year&offset=0&limit=3 "HTTP/1.1 200 OK"


673fbdd957cada770d10dffca5e45b53da43a3c6


2025-02-14 19:03:02,139 - INFO - HTTP Request: GET https://api.semanticscholar.org/graph/v1/paper/673fbdd957cada770d10dffca5e45b53da43a3c6/references?fields=contexts,intents,contextsWithIntent,isInfluential,abstract,authors,citationCount,citationStyles,corpusId,externalIds,fieldsOfStudy,influentialCitationCount,isOpenAccess,journal,openAccessPdf,paperId,publicationDate,publicationTypes,publicationVenue,referenceCount,s2FieldsOfStudy,title,url,venue,year&offset=0&limit=100 "HTTP/1.1 200 OK"


49

In [7]:
# Split the markdown text into individual lines.
lines = markdown_content.splitlines()

In [8]:
import string

def remove_non_text_chars(text):
    """Return ``text`` with everything except ASCII letters and digits removed.

    Whitespace, punctuation and non-ASCII characters are all dropped; the
    order of the remaining characters is preserved.
    """
    allowed = frozenset(string.ascii_letters + string.digits)  # every ASCII letter and digit
    return ''.join(ch for ch in text if ch in allowed)

In [9]:
import re
from thefuzz import fuzz # pip install thefuzz  https://github.com/seatgeek/thefuzz

def modify_image_info(self, reference_metadata):
    """Attach the matching markdown line to every reference record.

    For each reference with a title, the markdown lines shorter than 500
    characters are scanned in order and the first line that matches is
    stored under 'org_md_ref'.  A line matches when, tried in this order:

    1. it contains the title (case-insensitive), or
    2. ``fuzz.partial_ratio(title, line)`` is above 80, or
    3. it contains the title once both are reduced to ASCII letters and
       digits (case-insensitive).

    Args:
        self: object exposing the markdown text as ``self.pdf_md``.
        reference_metadata: list of dicts; the title is read from
            ``ref['citedPaper']['title']``.
    Returns:
        list: a deep copy of ``reference_metadata`` with 'org_md_ref' added
        where a line matched.  The input list is left untouched.
    """
    candidate_lines = [ln for ln in self.pdf_md.splitlines() if len(ln) < 500]

    # Work on the copy so that the annotations end up in the returned list
    # and the caller's data is not mutated.
    ref_lst = copy.deepcopy(reference_metadata)
    for ref in ref_lst:
        # 'citedPaper' may be missing or None.
        title = (ref.get('citedPaper') or {}).get('title')
        if not title:
            continue

        exact_re = re.compile(re.escape(title), re.IGNORECASE)
        norm_title = None  # normalized title, computed only if needed

        for line in candidate_lines:
            if exact_re.search(line):
                matched = True
            elif fuzz.partial_ratio(title, line) > 80:
                matched = True
            else:
                if norm_title is None:
                    norm_title = remove_non_text_chars(title)
                # An empty normalized title would match every line; skip it.
                matched = bool(norm_title) and re.search(
                    re.escape(norm_title), remove_non_text_chars(line), re.IGNORECASE
                ) is not None

            if matched:
                ref['org_md_ref'] = line
                break
    return ref_lst

## Test on PDF post process

In [21]:
import json
# Decode explicitly as UTF-8 (as done for the markdown file) instead of
# relying on the platform's default encoding.
with open("/home/jiezi/Code/Temp/tmp/2412.06769v2/content_list.json", encoding='utf-8') as json_data:
    content_json = json.load(json_data)

In [22]:
from pdf_process.pdf_post_process import PDFProcess

# Bundle the PDF path, the appendix-tagged outline, the markdown text and
# the parsed content-list JSON into one PDFProcess object.
pdf = PDFProcess(pdf_path, toc_1_rvsd, markdown_content, content_json)

In [23]:
# NOTE(review): method name is spelled "alighn" here - presumably that matches
# the definition in PDFProcess; confirm before renaming.
pdf.alighn_md_tables()

In [24]:
# Method counterpart of the align_md_toc function prototyped in this notebook
# (presumably the same logic - verify against PDFProcess).
md_titles_info = pdf.align_md_toc()

In [25]:
# Returns three lists, bound here as images, tables and formulas.
img_lst, tbl_lst, formula_lst  = pdf.align_content_json()

In [26]:
# Revise the image list obtained from align_content_json.
img_lst_rvsd = pdf.modify_image_info(img_lst)

In [27]:
# Revise the table list obtained from align_content_json.
tbl_lst_rvsd = pdf.modify_tables_info(tbl_lst)

  soup = BeautifulSoup(line, 'html.parser')


In [28]:
# Process the Semantic Scholar reference metadata fetched earlier.
ref_lst = pdf.modify_reference_info(reference_metadata)