In [2]:
import fitz

pdf_path = "../data/2502.00330v1.pdf"
doc = fitz.open(pdf_path)

# PDF Process

## PDF Table of Contents

### Based on PDF Outline

In [3]:
# Read the PDF's embedded outline (bookmarks); fall back to an empty list when
# get_toc() returns a falsy value.
toc_infos = doc.get_toc(simple=False) or []

# One dict per outline entry: level, title, page, destination info and a short
# text preview taken from the target page.
pdf_toc = []
for item in toc_infos:
    # Fields are read positionally and default to None when the entry is short.
    # presumably the layout is [level, title, page, dest-dict] as returned by
    # PyMuPDF's get_toc(simple=False) — TODO confirm against the PyMuPDF docs.
    lvl = item[0] if len(item) > 0 else None
    title = item[1] if len(item) > 1 else None
    start_page = item[2] if len(item) > 2 else None
    # item[3] is treated as a dict; its 'to' value is indexed below like a
    # coordinate pair.
    end_pos = item[3].get('to') if len(item) > 3 and item[3] else None
    nameddest = item[3].get('nameddest') if len(item) > 3 and item[3] else None

    if start_page is not None:
        # start_page is used as a 1-based page number; doc[...] is 0-based.
        page = doc[start_page-1]
        blocks = page.get_text("blocks")

        # Accumulate preview text; stop scanning once at least 100 characters
        # have been collected.
        lines = ""
        for block in blocks:
            x0, y0, x1, y1, text, _, _ = block
            if len(lines) < 100:
                # NOTE(review): this compares the block's x0 with the first
                # component of the destination point, and nothing is collected
                # when end_pos is missing. Confirm whether a vertical (y)
                # comparison was intended.
                if end_pos and x0 >= end_pos[0]:
                    lines += text
            else:
                break

        pdf_toc.append({
            "level": lvl,
            "title": title,
            "page": start_page,
            "position": end_pos,
            "nameddest": nameddest,
            # Preview is capped at 200 characters; "..." is always appended.
            "text": lines[:200] + "..."
        })

In [None]:
# Keep only the top-level (level 1) outline entries, ordered by page.
# Both lists are built unconditionally: with an empty outline they are simply
# empty, so the loop below becomes a no-op instead of raising NameError.
lvl_1_toc = [item for item in pdf_toc if item["level"] == 1]
sorted_lvl_1_toc = sorted(lvl_1_toc, key=lambda d: d['page'])

# Split the PDF by the positions of the level-1 entries: every section runs
# from its own (page, position) to the (page, position) of the next entry.
# The page-sorted list is used so that "next entry" really is the following
# section even if the outline is not stored in page order.
for idx, item in enumerate(sorted_lvl_1_toc):
    title = item.get('title')
    start_page = item.get('page')
    start_pos = item.get('position')
    if idx < len(sorted_lvl_1_toc) - 1:
        next_item = sorted_lvl_1_toc[idx+1]
        end_page = next_item.get('page')
        end_pos = next_item.get('position')
    else:
        # The last section has no successor, so its end is left open.
        end_page = None
        end_pos = None
    print(title, start_page, start_pos, end_page, end_pos)

In [4]:
# Level-1 outline entries, ordered by page. Built unconditionally so that the
# names exist (as empty lists) even when the PDF has no outline; the previous
# `if pdf_toc:` guard left them undefined for the cells that follow.
lvl_1_toc = [item for item in pdf_toc if item["level"] == 1]
sorted_lvl_1_toc = sorted(lvl_1_toc, key=lambda d: d['page'])

In [None]:
for item in sorted_lvl_1_toc:
    print(item.get('nameddest'), item.get('title'), item.get('page'))

### Based on Formatting

In [9]:
# Canonical section headings used to build the heading-detection regex.
# Grouped by the part of a paper they usually belong to. The spellings of
# "Mathematical Formulation" and "Pseudo Code" were corrected: the previous
# misspelled entries could never match a correctly spelled heading.
section_list = ["Abstract",
                "Introduction", "Background", "Introduction and Motivation", "Preliminary",
                "Related Work", "Literature Review", "Related Research",
                "Methods", "Methodology", "Method", "Approach", "Work Flow", "Materials and Methods", "Computation Function", "Problem Formulation", "Mathematical Formulation", "Pseudo Code",
                "Experiment", "Experiment Settings", "Experimental Results", "Evaluation", "Experiments",
                "Analysis", "Results", "Findings", "Data Analysis", "Results and Findings",
                "Conclusion", "Discussion", "Results and Discussion", "Further Discussion",
                "References",
                "Acknowledgments",
                "FAQ", "Frequently Asked Questions",
                "Implementation Code", "Examples", "Appendix"]

In [10]:
import re
from pdf_meta_det import extract_meta, dump_toml

# One alternation regex that matches any known section heading literally.
pattern = '|'.join(map(re.escape, section_list))

# Collect extract_meta results for at most the first ten pages; pages are
# passed as 1-based numbers.
mtch_rslts = []
page_limit = min(len(doc), 10)
for page_no in range(1, page_limit + 1):
    mtch_rslts.extend(extract_meta(doc, pattern=pattern, page=page_no))

In [None]:
# Pick the heading style as one (size, flags) pair that actually occurs:
# the largest font size, and among spans of that size the largest flags value.
# Taking the two maxima independently could yield a combination no span has,
# which would make a later "size == size and flags == flags" filter empty.
size, flags = 0, 0
if mtch_rslts:
    size, flags = max(
        (item.get('size') or 0, item.get('flags') or 0) for item in mtch_rslts
    )
print(size, flags)

In [12]:
rvsd_mtch_rslts = [item for item in mtch_rslts if item.get('size') == size and item.get('flags') == flags]

In [13]:
from pdf_meta_det import extract_meta, dump_toml

auto_level = 1
addnl = False
tmp_meta_ptrn = [dump_toml(m, auto_level, addnl) for m in rvsd_mtch_rslts]

In [14]:
# 将 tmp_meta_ptrn 写入 recipe.toml 文件
with open('recipe.toml', 'w', encoding='utf-8') as f:
    f.write('\n'.join(tmp_meta_ptrn))

In [15]:
import toml
from pdf_toc_gen import get_file_encoding, gen_toc

recipe_file_path = 'recipe.toml'
# Open the recipe with its detected encoding and close the handle as soon as
# it has been parsed (the previous bare open() was never closed).
with open(recipe_file_path, "r", encoding=get_file_encoding(recipe_file_path)) as recipe_file:
    recipe = toml.load(recipe_file)
# Build the table of contents from the document using the parsed recipe.
toc = gen_toc(doc, recipe)

In [None]:
toc

## PDF Paragraph Info

In [8]:
import fitz  # PyMuPDF

def extract_text_by_paragraph(pdf_path):
    """
    Extract the text blocks of a PDF as paragraphs using PyMuPDF.

    Every text block (block type 0) is flattened into one string by
    concatenating the text of all its spans; the union of the span bounding
    boxes is reported as the block position. The block text is then split on
    blank lines ('\\n\\n') and each non-empty piece becomes one paragraph.

    Args:
        pdf_path (str): The path to the PDF file.

    Returns:
        list: A list of dictionaries with the keys 'text' (stripped paragraph
              text), 'page' (1-based page number) and 'pos' (bounding
              rectangle of the whole block the paragraph came from).
        Returns None if the file cannot be opened or any other error occurs;
        the error is printed, not raised.
    """
    try:
        paragraphs = []

        # The context manager guarantees the document is closed even when
        # extraction fails half-way through.
        with fitz.open(pdf_path) as doc:
            for page_num, page in enumerate(doc, start=1):
                for b in page.get_text("dict")["blocks"]:
                    if b['type'] != 0:
                        continue  # only text blocks carry lines/spans

                    spans = [s for l in b["lines"] for s in l["spans"]]
                    if not spans:
                        continue  # nothing to extract, and no rectangle to report

                    # NOTE(review): span texts are concatenated without any
                    # separator between lines, so a '\n\n' split will rarely
                    # trigger and words at line breaks may be glued together —
                    # confirm whether a line separator is wanted.
                    block_text = "".join(s["text"] for s in spans)

                    # Union of all span rectangles = bounding box of the block.
                    block_rect = fitz.Rect(spans[0]["bbox"])
                    for s in spans[1:]:
                        block_rect |= fitz.Rect(s["bbox"])

                    for p in block_text.strip().split('\n\n'):
                        if p.strip():
                            paragraphs.append({
                                'text': p.strip(),
                                'page': page_num,  # page numbers start from 1
                                'pos': block_rect
                            })

        return paragraphs

    except FileNotFoundError:
        print(f"Error: File not found at '{pdf_path}'")
        return None
    except Exception as e:
        # Best-effort helper for interactive use: report and signal failure
        # with None instead of propagating.
        print(f"Error: An error occurred: {e}")
        return None

In [None]:
# Example Usage:
pdf_file = "../data/2502.00330v1.pdf"  # Replace with your PDF file path
paragraphs = extract_text_by_paragraph(pdf_file)

if paragraphs:
    for paragraph in paragraphs:
        print(f"Page: {paragraph['page']}")
        print(f"Position: {paragraph['pos']}")
        print(f"Text:\n{paragraph['text']}\n")

In [None]:
len(paragraphs)

In [None]:
from markitdown import MarkItDown

md = MarkItDown()
result = md.convert("../data/2502.00330v1.pdf")
print(result.text_content)

## Layout Check

测试minerU API

In [2]:
import os

# The MinerU bearer token is read from the environment so that the credential
# is never stored in the notebook. Export MINERU_API_KEY before running.
# NOTE(review): a real token was previously committed in this cell — it should
# be revoked and rotated.
mineru_api_key = os.getenv("MINERU_API_KEY", "")

In [3]:
import requests

# Step 1: ask MinerU for one pre-signed upload URL per file.
url='https://mineru.net/api/v4/file-urls/batch'
header = {
    'Content-Type':'application/json',
    "Authorization":f"Bearer {mineru_api_key}"
}
data = {
    "enable_formula": True,
    "language": "en",
    "layout_model":"doclayout_yolo",
    "enable_table": True,
    "files": [
        {"name":"2502.00330v1.pdf", "is_ocr": False, "data_id": "test-20250206-002"},
        {"name":"2502.02508v1.pdf", "is_ocr": False, "data_id": "test-20250206-003"}
    ]
}
# Local files, in the same order as data["files"]; urls[idx] is used for
# file_pathes[idx].
file_pathes = [r"../data/2502.00330v1.pdf", r"../data/2502.02508v1.pdf"]
try:
    response = requests.post(url,headers=header,json=data)
    if response.status_code == 200:
        result = response.json()
        print('response success. result:{}'.format(result))
        if result["code"] == 0:
            batch_id = result["data"]["batch_id"]
            urls = result["data"]["file_urls"]
            print('batch_id:{},urls:{}'.format(batch_id, urls))
            # Step 2: PUT each file to its pre-signed URL.
            for idx, file_path in enumerate(file_pathes):
                with open(file_path, 'rb') as f:
                    res_upload = requests.put(urls[idx], data=f)
                if res_upload.status_code == 200:
                    print("upload success")
                else:
                    print("upload failed")
        else:
            # `result` is a dict, so the message must be read by key
            # (attribute access raised AttributeError here).
            print('apply upload url failed,reason:{}'.format(result.get("msg")))
    else:
        # Show the response body; the Response object only prints its status.
        print('response not success. status:{} ,result:{}'.format(response.status_code, response.text))
except Exception as err:
    # Best-effort cell: report network/file errors without aborting the notebook.
    print(err)

response success. result:{'code': 0, 'msg': 'ok', 'trace_id': '386768fd68e9d6cff859773ad73ad5c6', 'data': {'batch_id': '79e3e16a-8a1b-4911-9722-ada284c6d7cf', 'file_urls': ['https://mineru.oss-cn-shanghai.aliyuncs.com/api-upload/79e3e16a-8a1b-4911-9722-ada284c6d7cf/b9917862-36ee-48b1-aabd-7df65948fb91.pdf?Expires=1738919408&OSSAccessKeyId=LTAI5t9nGwatk85zetzojXbn&Signature=qIincgRD%2BPs8spX0EXsPKTHoOf4%3D', 'https://mineru.oss-cn-shanghai.aliyuncs.com/api-upload/79e3e16a-8a1b-4911-9722-ada284c6d7cf/de74e957-707c-4e48-b282-7540327a802d.pdf?Expires=1738919408&OSSAccessKeyId=LTAI5t9nGwatk85zetzojXbn&Signature=nIzYUVhCEcQMPtHpBgecevHLqjo%3D']}}
batch_id:79e3e16a-8a1b-4911-9722-ada284c6d7cf,urls:['https://mineru.oss-cn-shanghai.aliyuncs.com/api-upload/79e3e16a-8a1b-4911-9722-ada284c6d7cf/b9917862-36ee-48b1-aabd-7df65948fb91.pdf?Expires=1738919408&OSSAccessKeyId=LTAI5t9nGwatk85zetzojXbn&Signature=qIincgRD%2BPs8spX0EXsPKTHoOf4%3D', 'https://mineru.oss-cn-shanghai.aliyuncs.com/api-upload/79e3e

In [4]:
result

{'code': 0,
 'msg': 'ok',
 'trace_id': '386768fd68e9d6cff859773ad73ad5c6',
 'data': {'batch_id': '79e3e16a-8a1b-4911-9722-ada284c6d7cf',
  'file_urls': ['https://mineru.oss-cn-shanghai.aliyuncs.com/api-upload/79e3e16a-8a1b-4911-9722-ada284c6d7cf/b9917862-36ee-48b1-aabd-7df65948fb91.pdf?Expires=1738919408&OSSAccessKeyId=LTAI5t9nGwatk85zetzojXbn&Signature=qIincgRD%2BPs8spX0EXsPKTHoOf4%3D',
   'https://mineru.oss-cn-shanghai.aliyuncs.com/api-upload/79e3e16a-8a1b-4911-9722-ada284c6d7cf/de74e957-707c-4e48-b282-7540327a802d.pdf?Expires=1738919408&OSSAccessKeyId=LTAI5t9nGwatk85zetzojXbn&Signature=nIzYUVhCEcQMPtHpBgecevHLqjo%3D']}}

In [3]:
import requests

batch_id = "79e3e16a-8a1b-4911-9722-ada284c6d7cf"
url = f'https://mineru.net/api/v4/extract-results/batch/{batch_id}'
header = {
    'Content-Type':'application/json',
    "Authorization":f"Bearer {mineru_api_key}"
}

res = requests.get(url, headers=header)
print(res.status_code)
print(res.json())
print(res.json()["data"])

200
{'code': 0, 'msg': 'ok', 'trace_id': '15963df2935fb9f13304412207a95e94', 'data': {'batch_id': '79e3e16a-8a1b-4911-9722-ada284c6d7cf', 'extract_result': [{'data_id': 'test-20250206-002', 'file_name': '2502.00330v1.pdf', 'state': 'done', 'err_msg': '', 'full_zip_url': 'https://cdn-mineru.openxlab.org.cn/pdf/039a6a8b-5f27-4f88-a2df-3988a66e6af9.zip'}, {'data_id': 'test-20250206-003', 'file_name': '2502.02508v1.pdf', 'state': 'done', 'err_msg': '', 'full_zip_url': 'https://cdn-mineru.openxlab.org.cn/pdf/2af9928d-81a2-4d15-a166-d0e8144c0ca9.zip'}]}}
{'batch_id': '79e3e16a-8a1b-4911-9722-ada284c6d7cf', 'extract_result': [{'data_id': 'test-20250206-002', 'file_name': '2502.00330v1.pdf', 'state': 'done', 'err_msg': '', 'full_zip_url': 'https://cdn-mineru.openxlab.org.cn/pdf/039a6a8b-5f27-4f88-a2df-3988a66e6af9.zip'}, {'data_id': 'test-20250206-003', 'file_name': '2502.02508v1.pdf', 'state': 'done', 'err_msg': '', 'full_zip_url': 'https://cdn-mineru.openxlab.org.cn/pdf/2af9928d-81a2-4d15-a1

首先按page切，暂不考虑acknowledgement, reference及以后的信息

In [None]:
# Headings that mark the trailing material (references, acknowledgments, ...).
append_section_list = ['References', "Acknowledgments", "Appendix", "FAQ", "Frequently Asked Questions"]

import re
sec_ptrn = '|'.join(re.escape(section) for section in append_section_list)

# Find the smallest page number among the TOC entries whose title starts with
# one of the headings above; default to the page count when none is found.
# (The stray `mtch_rslts = []` was removed: it was unused here and silently
# wiped the heading matches collected earlier in the notebook.)
# NOTE(review): re.match anchors at the start of the title, so numbered
# headings such as "7 References" would not match — confirm the title format.
page = len(doc)
for item in toc:
    if re.match(sec_ptrn, item.title):
        page = min(page, item.pagenum)

In [None]:
page=10

In [None]:
import fitz  # PyMuPDF

def save_pdf_pages(input_pdf_path, output_pdf_path, page_numbers):
    """Copy selected pages of a PDF into a new PDF file.

    Args:
        input_pdf_path: Path of the PDF to read pages from.
        output_pdf_path: Path the new PDF is written to.
        page_numbers: Iterable of page numbers; each one is passed to
            ``insert_pdf`` as both ``from_page`` and ``to_page``, so pages are
            copied one at a time in the given order.
            (presumably 0-based, as in PyMuPDF — TODO confirm)

    Both documents are opened as context managers so they are closed even if
    inserting or saving raises.
    """
    # Open the source PDF and create a new, empty PDF document.
    with fitz.open(input_pdf_path) as pdf_document, fitz.open() as output_pdf:
        # Append each requested page to the new document.
        for page_number in page_numbers:
            output_pdf.insert_pdf(pdf_document, from_page=page_number, to_page=page_number)

        # Write the new PDF to disk.
        output_pdf.save(output_pdf_path)

In [1]:
# Save the first n pages
# NOTE(review): in_pdf_path is an absolute, machine-specific path; consider a
# path relative to the project's data directory.
in_pdf_path = "/Users/jiezi/Documents/Local Code/Project/PaperPal/dev/tmp/2201.11903v6.pdf"
out_pdf_path = 'tmp.pdf'  # output PDF file path
# The call below is commented out, so 'tmp.pdf' is not produced by this cell
# even though a later cell passes out_pdf_path on as its input file.
# save_pdf_pages(in_pdf_path, out_pdf_path, list(range(0, page)))

In [None]:
import sys

from pdf_layout_det import PDF2MARKDOWN

import sys
sys.path.append("/home/jiezi/Packages/PDF-Extract-Kit")

from pdf_extract_kit.utils.config_loader import load_config, initialize_tasks_and_models


# Load the PDF-Extract-Kit pdf2markdown configuration and build its tasks.
# NOTE(review): config_path is an absolute, machine-specific path.
TASK_NAME = 'pdf2markdown'
config_path = "/home/jiezi/Packages/PDF-Extract-Kit/project/pdf2markdown/configs/pdf2markdown.yaml"
config = load_config(config_path)
task_instances = initialize_tasks_and_models(config)

# Input is the PDF written to out_pdf_path; results go to ./opt
# (both are set here, not read from the config).
input_data = out_pdf_path
result_path = "./opt"

# Use the layout and formula-detection models when the config provides them;
# formula recognition and OCR are disabled (their lookups are commented out).
layout_model = task_instances['layout_detection'].model if 'layout_detection' in task_instances else None
mfd_model = task_instances['formula_detection'].model if 'formula_detection' in task_instances else None
mfr_model = None
# mfr_model = task_instances['formula_recognition'].model if 'formula_recognition' in task_instances else None
ocr_model = None
# ocr_model = task_instances['ocr'].model if 'ocr' in task_instances else None

# Run the conversion. Later cells use final_blocks (indexed per page) and
# md_content (joined into one markdown string).
pdf2md = PDF2MARKDOWN(layout_model, mfd_model, mfr_model, ocr_model)
res_list, final_blocks, md_content = pdf2md.process(input_path=input_data, save_dir=result_path, visualize=True, merge2markdown=True)

In [None]:
paras = "".join(md_content).split("#")

In [None]:
filtered_paras = [item for item in paras if item is not None and item != '' and len(item) >= 100]

In [None]:
# One record per kept paragraph: its index plus a 300-character preview.
paras_dct = [
    {'id': para_no, 'lines': para_text[0:300] + "..."}
    for para_no, para_text in enumerate(filtered_paras)
]

In [None]:
str(paras_dct)

In [None]:
tmp_paras_dct = []
for idx, item in enumerate(filtered_paras):
    tmp_paras_dct.append({'para_id':idx, 'content':item[0:50]+"..."})

In [None]:
str(tmp_paras_dct)

In [None]:
match_prompt = """## INSTRUCTION
已知table_of_content记录了章节标题和对应的页面，para中抽取了各个章节的起始句子。
对于table_of_content中的每一项，根据section_title和paras中content的内容进行匹配，并将全部匹配到的para_id添加到table_of_content中。
注意以下两种情况均构成匹配：
- content直接对应section_title；
- content是section_title的二级目录下的内容。
如无匹配的项，则将置空。

## INPUT
<toc>
{toc}
</toc>

<paras>
{paras}
</paras>

## OUTPUT
Output in json with double quotes in the following format:
```json
[{{'section_title':xxx, 'page_num':xxx, 'vpos':xxx, 'para_ids':[list of all matched para_id, blank if no match]}}
, ...]
```
"""


# Flatten the TOC entries into plain dicts and render the matching prompt.
toc_lst = [
    {'section_title': entry.title, 'page_num': entry.pagenum, 'vpos': entry.vpos}
    for entry in toc
]
prompt = match_prompt.format(toc=str(toc_lst),paras=str(tmp_paras_dct))

In [None]:
print(prompt)

In [None]:
import os
from zhipuai import ZhipuAI

def zhipu_llm(sys_prompt, qa_promt):
    """Send one system + user message pair to the glm-4-flash chat model.

    Args:
        sys_prompt: System message; a generic assistant prompt is used when
            this is empty or None.
        qa_promt: User message.

    Returns:
        The content of the first choice's message.
    """
    system_text = sys_prompt if sys_prompt else "You are a helpful assistant."
    conversation = [
        {"role": "system", "content": system_text},
        {"role": "user", "content": qa_promt},
    ]

    # The API key is read from the ZHIPU_API_KEY_1 environment variable.
    client = ZhipuAI(api_key=os.getenv("ZHIPU_API_KEY_1"))
    completion = client.chat.completions.create(
        model="glm-4-flash",  # model code to call
        messages=conversation,
    )
    return completion.choices[0].message.content

In [None]:
import re
import json

def convert_quotes(json_str):
    """Rewrite every '...' span as "..." (single quotes not preceded by a backslash).

    This is a purely textual substitution: it cannot tell quote delimiters
    from apostrophes inside a string, so it must only be used as a fallback
    on text that failed to parse as JSON.
    """
    json_str = re.sub(r"(?<!\\)'(.*?)(?<!\\)'", r'"\1"', json_str)
    return json_str


def get_json(json_str):
    """Extract and parse the first ```json fenced block from a text.

    The block is parsed as strict JSON first. Only if that fails is it
    retried after converting single-quoted spans to double quotes, so valid
    JSON containing apostrophes (e.g. "it's Bob's") is no longer corrupted.

    Args:
        json_str: Text that may contain a fenced ```json ... ``` block.

    Returns:
        The decoded object, or None when no fenced block is found or the
        block cannot be decoded (a message is printed in both cases).
    """
    # Match ```json, an optional newline, the payload, then the closing ```.
    pattern = r"```json\n?(.*?)\n?```"
    matches = re.findall(pattern, json_str, re.DOTALL)

    if not matches:
        print("No JSON content found.")
        return None

    raw = matches[0].strip()

    # 1) Well-formed JSON is accepted untouched.
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        pass

    # 2) Fallback for Python-dict style output that uses single quotes.
    try:
        return json.loads(convert_quotes(raw))
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        return None

In [None]:
len(prompt)

In [None]:
outline_rslt = zhipu_llm(sys_prompt=None, qa_promt=prompt)

In [None]:
outline_rslt

In [None]:
outline_json = get_json(outline_rslt)

In [None]:
outline_json

## Double Confirm

Download source data or use html to double confirm pdf data.

In [None]:
import arxiv

paper = next(arxiv.Client().results(arxiv.Search(id_list=["1605.08386v1"])))
# Download the archive to the PWD with a default filename.
paper.download_source()
# Download the archive to the PWD with a custom filename.
paper.download_source(filename="downloaded-paper.tar.gz")
# Download the archive to a specified directory with a custom filename.
paper.download_source(dirpath="./mydir", filename="downloaded-paper.tar.gz")

Use html for information

In [None]:
import json
import requests
from bs4 import BeautifulSoup

arxiv_id = "2410.24175"
url = f"https://arxiv.org/html/{arxiv_id}"
# Fail fast on HTTP errors and hung connections; otherwise an error page
# would be parsed silently and empty JSON files written.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

figures = []
tables = []

# Pair images with the caption of the SAME figure element. Selecting all
# images and all captions separately and zipping them misaligns the pairs as
# soon as one figure lacks an image or a caption.
for figure_node in soup.select('.ltx_figure'):
    figure_caption = figure_node.find('figcaption', recursive=False)
    if figure_caption is None:
        continue
    for figure_image in figure_node.find_all('img', recursive=False):
        figures.append({
            'figure_path': f"https://arxiv.org/html/{arxiv_id}/{figure_image.get('src')}",
            'figure_caption': figure_caption.text.strip()
        })

# Same idea for tables: each tabular is paired with the caption of the
# .ltx_table element that contains it.
for table_node in soup.select('.ltx_table'):
    table_caption = table_node.find('figcaption', recursive=False)
    if table_caption is None:
        continue
    for table_content in table_node.select('table.ltx_tabular'):
        tables.append({
            'table_content': str(table_content),
            'table_caption': table_caption.text.strip()
        })

with open('figures.json', 'w') as f:
    json.dump(figures, f)

with open('tables.json', 'w') as f:
    json.dump(tables, f)

## LLM Comprehension

### 方案一：直接使用LLM针对特定章节问答

需要补充：
- 长度控制模块
- 段落切分，按段落问答并总结

In [None]:
# Prompt asking the model to label each paragraph preview with a section type.
# The example output uses the key 'section_type' — the key the notebook reads
# when aggregating the answer (it was previously misspelled in the example).
section_identify_prompt = """## TASK
You are an academic researcher in Computer Science and AI field. 
You are given section title together with initial lines of paragraphs from a paper.
Now you are asked to identify the section type. The section type can be one of the following: 
['Bio', 'Abstraction', 'Introduction',  'Related Works and Literature Review', 'Methodology', 'Experiment and Results', 'Discussion and Conclusion', 'Others']
Please identify the section type based on the given section.

## PARA
{content}

## OUTPUT
Output in json with double quotes in the following format:
```json
[{{'id':0, 'section_type':xxx}}, {{'id':1, 'section_type':xxx}}, ...]
```
"""

prompt = section_identify_prompt.format(content=str(paras_dct))

In [None]:
import os
from zhipuai import ZhipuAI

# Use the zhipu_llm helper defined earlier in this notebook instead of
# repeating the client setup inline; it sends the same model and message
# structure and returns the first choice's content.
opt_result = zhipu_llm(
    sys_prompt="You are a helpful assistant whose task is to provide users with professional, accurate, and insightful advice.",
    qa_promt=prompt,
)

In [None]:
opt_result

In [None]:
opt_json = get_json(opt_result)

In [None]:
opt_json

In [None]:
# Concatenate the paragraphs of each section type of interest.
abstract, introduction, method, conclusion = "", "", "", ""
for idx, item in enumerate(opt_json or []):  # opt_json is None when parsing failed
    # Accept the misspelled 'sectoin_type' key as well, since the model may
    # echo it back from a prompt example.
    section_type = item.get('section_type', item.get('sectoin_type'))

    # Prefer the paragraph id returned by the model (it may skip or reorder
    # entries); fall back to the position in the answer.
    para_idx = item.get('id', idx)
    if not isinstance(para_idx, int) or not 0 <= para_idx < len(filtered_paras):
        continue
    para_text = filtered_paras[para_idx]

    if section_type == 'Abstraction':
        abstract += para_text
    elif section_type == 'Introduction':
        introduction += '\n\n\n' + para_text
    elif section_type == 'Methodology':
        method += '\n\n\n' + para_text
    elif section_type == 'Discussion and Conclusion':
        conclusion += '\n\n\n' + para_text

In [None]:
# NOTE(review): this cell uses `self`, `text` and `method_prompt_token`, none
# of which are defined at cell level in the cells visible here — it looks like
# a fragment pasted from a class method. Confirm before running.
# Advance the API index; it wraps to 0 once it reaches len(chat_api_list) - 1.
self.cur_api += 1
self.cur_api = 0 if self.cur_api >= len(self.chat_api_list) - 1 else self.cur_api
# Clip `text` by characters: scale its length by the ratio of the remaining
# token budget (max_token_num - method_prompt_token) to its token count.
text_token = len(self.encoding.encode(text))
clip_text_index = int(len(text) * (self.max_token_num - method_prompt_token) / text_token)
clip_text = text[:clip_text_index]

In [None]:
sys_prompt = "You are a researcher in the field of '{subject}' who is good at summarizing papers using concise statements."

# Summary prompt. Placeholders: {abstraction}, {introduction}, {lang}.
# The instruction now says "five points", matching the five items listed and
# the five slots of the output format (it previously said "four").
summary_prompt = """ ## INSTRUCTION
Given abstraction and introduction paragraph from the paper, you are asked to:                   
1. identify the keywords of this article;
2. summarize according to the following five points
- (1): What is the research background of this article? What problem is this paper trying to solve? 
- (2): What are the relevant studies? What are the past methods? What are the issues with them? Is the approach well motivated?
- (3): How does the paper solve this problem? What is the research methodology proposed in this paper?
- (4): What experiments were done in the paper? On what task and what performance is achieved by the methods in this paper? Can the performance support their goals?
- (5): Are there unsolved issues with the paper? What gaps can be explored further? Any suggestions?

## CONTEXT
Here are abstraction from the paper:
<abstraction>
{abstraction}
</abstraction>

Here are introduction from the paper:
<introduction>
{introduction}
</introduction>

## OUTPUT
Follow the format of the output that follows: 
```text                            
1. Keywords: xxx\n\n     
2. Summary: \n\n
- (1):xxx;\n 
- (2):xxx;\n 
- (3):xxx;\n  
- (4):xxx;\n 
- (5):xxx.\n\n  
```

Be sure to use {lang} answers (proper nouns need to be marked in English), statements as concise and academic as possible.
Do not have too much repetitive information, numerical values using the original numbers.
Be sure to strictly follow the format, the corresponding content output to xxx, in accordance with \n line feed.                 
"""

method_prompt = """## INSTRUCTION
Given method paragraph and a summary of a paper, you are asked to describe in detail the methodological idea of this article. 
- (1):...
- (2):...
- (3):...
- .......

## CONTEXT
Here are method paragraph:
<method>
{method}
</method>

Here are summary of the paper fyi:
<summary>
{summary}
</summary>

## OUTPUT
Follow the format of the output that follows: 
```text
3. Methods: \n\n
- (1):xxx;\n 
- (2):xxx;\n 
- (3):xxx;\n  
....... \n\n     
```
Be sure to use {lang} answers (proper nouns need to be marked in English), statements as concise and academic as possible.
Do not repeat the content of the previous <summary>, the value of the use of the original numbers.
Be sure to strictly follow the format, the corresponding content output to xxx, in accordance with \n line feed, ....... means fill in according to the actual requirements.                 
"""
 
conclusion_prompt = """## INSTRUCTION
Given conclusion paragraph and a summary of a paper, you are asked to: 
4. Make the following summary:
- (1):What is the significance of this piece of work?
- (2):Summarize the strengths and weaknesses of this article in three dimensions: innovation point, performance, and workload.                   
.......

    "contribution": "What is the contribution of this paper?",
    "novelty": "What is the novelty of this paper?",
    "strength": "What are the strengths of this paper?",
    "drawback": "What are the drawbacks of this paper?",
    "improvement": "What might be the improvements of this paper?",


## CONTEXT
Here are conclusion paragraph:
<conclusion>
{conclusion}
</conclusion>

Here are summary of the paper fyi:
<summary>
{summary}
</summary>

## OUTPUT
Follow the format of the output later: 
```text
4. Conclusion: \n\n
- (1):xxx;\n                     
- (2):Innovation point: xxx; Performance: xxx; Workload: xxx;\n    
- (3):
    contribution: What is the contribution of this paper?,
    novelty: What is the novelty of this paper?,
    strength": What are the strengths of this paper?,
    drawback: What are the drawbacks of this paper?,
    improvement": What might be the improvements of this paper?
```

Be sure to use {lang} answers (proper nouns need to be marked in English), statements as concise and academic as possible.
Do not repeat the content of the previous <summary>, the value of the use of the original numbers.
Be sure to strictly follow the format, the corresponding content output to xxx, in accordance with \n line feed, ....... means fill in according to the actual requirements.                 
"""

In [None]:
lang = "English"
# Fill the summary prompt with the abstract and introduction text collected
# by the section-aggregation cell (`abs` is the builtin function and `intro`
# was never defined).
sum_prompt = summary_prompt.format(abstraction=abstract, introduction=introduction, lang=lang)

In [None]:
len(sum_prompt)

In [None]:
opt_result = zhipu_llm(sys_prompt, sum_prompt)

In [None]:
print(opt_result)

In [None]:
lang = "English"
# The discussion/conclusion step uses conclusion_prompt, which is the
# template that has the {conclusion} placeholder, fed with the `conclusion`
# text collected by the section-aggregation cell. (method_prompt has no
# {conclusion} field and `dis` was never defined.)
dis_prompt = conclusion_prompt.format(conclusion=conclusion, summary=opt_result, lang=lang)

In [None]:
opt_result_2 = zhipu_llm(sys_prompt, dis_prompt)

In [None]:
print(opt_result_2)

In [None]:
lang = "English"
met_prompt = method_prompt.format(method=method, summary=opt_result, lang=lang)

In [None]:
len(met_prompt)

In [None]:
opt_result_3 = zhipu_llm(sys_prompt, met_prompt)

In [None]:
print(opt_result_3)

### 方案二：使用传统RAG

to-do

### 方案三：使用GraphRAG

## Multimodal Comprehension

定位图片、表格或公式的详细位置
- 'figure', 'figure_caption',
- 'table', 'table_caption', 'table_footnote',
- 'formula', 'formula_caption'

In [None]:
# Locate figures, tables or formulas in detail
def get_bounding_box(poly):
    """Return (x_min, y_min, x_max, y_max) of a flat coordinate list.

    `poly` is read as alternating values: even positions are x coordinates,
    odd positions are y coordinates.
    """
    xs, ys = poly[::2], poly[1::2]
    left, right = min(xs), max(xs)
    top, bottom = min(ys), max(ys)
    return left, top, right, bottom

def do_boxes_overlap(box1, box2, max_distance=20):
    """Tell whether two boxes belong together on the page.

    Two boxes are considered related when their horizontal extents overlap
    AND they overlap vertically or are at most `max_distance` apart
    vertically. Both conditions are required: with a plain `or`, any two
    boxes in the same column (however far apart), or any two boxes at a
    similar height in different columns, would be reported as overlapping.

    Args:
        box1, box2: (x_min, y_min, x_max, y_max) tuples.
        max_distance: Largest vertical gap still treated as "close".

    Returns:
        bool: True when the boxes overlap horizontally and are vertically
        overlapping or close.
    """
    x1_min, y1_min, x1_max, y1_max = box1
    x2_min, y2_min, x2_max, y2_max = box2

    horizontal_overlap = not (x1_max < x2_min or x1_min > x2_max)
    vertical_overlap_or_close = not (y1_max < y2_min - max_distance or y1_min > y2_max + max_distance)

    return horizontal_overlap and vertical_overlap_or_close

def consolidate_positions(items):
    """Return the box [x_min, y_min, x_max, y_max] enclosing all items.

    Each item must carry a 'poly' coordinate list. Returns None when `items`
    is empty.
    """
    if not items:
        return None
    boxes = [get_bounding_box(entry['poly']) for entry in items]
    lefts, tops, rights, bottoms = zip(*boxes)
    return [min(lefts), min(tops), max(rights), max(bottoms)]

def find_matches(metadata, category_types):
    """Group layout items of the given categories that sit next to each other.

    Items whose 'category_type' is in `category_types` are processed in order.
    The first remaining item becomes the anchor of a group, and every other
    remaining item whose box overlaps the anchor's box (per do_boxes_overlap)
    joins that group. Grouping is relative to the anchor only, not transitive.

    Returns:
        list of dicts with 'output_category' (member categories joined by
        ' & '), 'output_poly' (enclosing box of the group) and 'output_text'
        (member texts joined by a space; missing text counts as '').
    """
    pending = [entry for entry in metadata if entry['category_type'] in category_types]
    results = []

    while pending:
        anchor, candidates = pending[0], pending[1:]
        anchor_box = get_bounding_box(anchor['poly'])

        members = [anchor]
        leftover = []
        for candidate in candidates:
            if do_boxes_overlap(anchor_box, get_bounding_box(candidate['poly'])):
                members.append(candidate)
            else:
                leftover.append(candidate)
        pending = leftover  # only unmatched items can anchor later groups

        results.append({
            'output_category': ' & '.join(member['category_type'] for member in members),
            'output_poly': consolidate_positions(members),
            'output_text': ' '.join(member.get('text', '') for member in members)
        })

    return results


将图片、表格或公式保存为图片

In [None]:
DEFAULT_DPI = 144

# Image coordinates have to be mapped back to PDF coordinates because the
# page image may have been rendered at a different scale.
def map_image_to_pdf(image_x, image_y, pix, dpi=DEFAULT_DPI):
    """Convert a point from rendered-image pixels to PDF coordinates.

    When the pixmap is at most 3000 px in both dimensions, the coordinates
    are divided by dpi / 72. For larger pixmaps they are returned unchanged.
    (presumably such pages are rendered without zoom — TODO confirm against
    the rendering code.)

    Returns:
        tuple: (pdf_x, pdf_y)
    """
    if pix.width > 3000 or pix.height > 3000:
        return image_x, image_y
    zoom = dpi / 72
    return image_x / zoom, image_y / zoom

In [None]:
category_types = ['figure', 'figure_caption']
results = find_matches(final_blocks[5], category_types)

for result in results:
    print(result)

In [None]:
# Restore the page and crop the detected region out of it
# NOTE(review): `result` is whatever the last iteration of an earlier
# `for result in results` loop left behind, and `page` rebinds a name used as
# an integer page count in an earlier cell — this cell depends on hidden state.
idx = 5
page = doc.load_page(idx)
# Render at DEFAULT_DPI only to obtain the pixmap size that map_image_to_pdf
# uses to decide whether the coordinates need rescaling.
pix = page.get_pixmap(matrix=fitz.Matrix(DEFAULT_DPI/72, DEFAULT_DPI/72))
area = result['output_poly']
# Convert both corners of the region from image pixels to PDF coordinates.
x0, y0 = map_image_to_pdf(area[0], area[1], pix)
x1, y1 = map_image_to_pdf(area[2], area[3], pix)

# Render only the clipped rectangle and save it as a PNG.
pix_map = page.get_pixmap(clip=fitz.Rect(x0, y0, x1, y1))
pix_map.save("output_new.png")

获取对应的段落信息，作为上下文辅助
- 思路一：从来源追溯，找寻最契合
- 思路二：基于向量匹配

In [None]:
# Find the sections a page belongs to
def find_titles_for_page(toc, page_idx):
    """Return the titles of the TOC entries that cover page `page_idx`.

    An entry covers the page when it starts on or before it and the next
    entry starts after it, or when the entry and its successor both start
    exactly on that page. The last entry covers every page from its own
    start onwards.
    """
    final_pos = len(toc) - 1
    found = []
    for pos, entry in enumerate(toc):
        starts_on_or_before = entry.pagenum <= page_idx
        if pos == final_pos:
            # No successor: the entry extends to the end of the document.
            covers = starts_on_or_before
        else:
            following_start = toc[pos + 1].pagenum
            spans_page = starts_on_or_before and page_idx < following_start
            shares_page = entry.pagenum == page_idx and page_idx == following_start
            covers = spans_page or shares_page
        if covers:
            found.append(entry.title)
    return found

In [None]:
toc

In [None]:
# 源头追溯
idx = 5
section_titles = find_titles_for_page(toc, idx+1)

In [None]:
section_titles

In [None]:
filtered_paras

In [None]:
# 使用模糊匹配
import difflib

def fuzzy_match(short_texts, long_texts):
    """Match every short text to its most similar long text.

    Args:
        short_texts: Sequence of query strings.
        long_texts: List of candidate strings.

    Returns:
        list of (short_index, long_index) tuples, one per short text, where
        long_index is the position in `long_texts` of the candidate with the
        highest SequenceMatcher ratio. Returns an empty list when there are
        no candidates (difflib.get_close_matches rejects n == 0 with a
        ValueError, which previously made an empty `long_texts` crash).
    """
    matches = []
    if not long_texts:
        return matches

    for i, short_text in enumerate(short_texts):
        # cutoff=0.0 and n=len(long_texts) make difflib return every candidate.
        close_matches = difflib.get_close_matches(short_text, long_texts, n=len(long_texts), cutoff=0.0)
        if close_matches:
            # Keep the candidate with the highest similarity ratio.
            best_match = max(close_matches, key=lambda x: difflib.SequenceMatcher(None, short_text, x).ratio())
            # Position of that candidate in the original list.
            best_match_index = long_texts.index(best_match)
            matches.append((i, best_match_index))
    return matches

In [None]:
test_rslts = fuzzy_match(section_titles, [item[:50] for item in filtered_paras])

In [None]:
test_rslts

In [None]:
conten = filtered_paras[test_rslts[0][1]]

In [None]:
points_extraction_prompt = """## TASK
Extract key information from context that is relevant to the clues.

## CLUES
{intro_of_figure_table_formula}

## CONTEXT
{context}

## OUTPUT
Related information are: \n

"""

In [None]:
prompt = points_extraction_prompt.format(
    intro_of_figure_table_formula=result['output_text'],
    context=conten)
len(prompt)

In [None]:
test_result = zhipu_llm(None, prompt)

In [None]:
test_result

基于向量匹配

In [None]:
import re

def split_text_into_chunks(text, chunk_size, overlap_size):
    """Split text into sentence-aligned chunks with a word overlap.

    Sentences are accumulated until adding the next one would reach
    `chunk_size` characters; the chunk is then closed and a new one is
    started with the last `overlap_size` words of the previous sentence
    followed by the current sentence.

    Fixes over the previous version:
      * no empty chunk is emitted (first sentence longer than `chunk_size`,
        or empty input);
      * `overlap_size == 0` means no overlap (a `[-0:]` slice used to copy
        the whole previous sentence);
      * the previous sentence is found by position, not by `list.index`,
        which picked the wrong neighbour for repeated sentences and wrapped
        around to the last sentence for the first one;
      * the overlap and the sentence are separated by a space.

    Args:
        text: Text to split.
        chunk_size: Soft upper bound of a chunk, in characters.
        overlap_size: Number of trailing words of the previous sentence to
            repeat at the start of the next chunk.

    Returns:
        list[str]: The chunks, stripped of surrounding whitespace.
    """
    # The overlap can neither exceed the chunk size nor be negative.
    overlap_size = max(0, min(overlap_size, chunk_size))

    # Split on whitespace that follows '.' or '?', keeping sentences intact
    # and skipping common abbreviation patterns.
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)

    chunks = []
    current_chunk = ""

    for idx, sentence in enumerate(sentences):
        if len(current_chunk) + len(sentence) < chunk_size:
            # Still room: keep filling the current chunk.
            current_chunk += sentence + " "
            continue

        # The sentence does not fit: close the current chunk (if any) ...
        if current_chunk.strip():
            chunks.append(current_chunk.strip())

        # ... and start a new one with the tail of the previous sentence.
        overlap_words = []
        if idx > 0 and overlap_size > 0:
            overlap_words = sentences[idx - 1].split()[-overlap_size:]
        prefix = " ".join(overlap_words)
        current_chunk = (prefix + " " if prefix else "") + sentence + " "

    # Flush the last chunk.
    if current_chunk.strip():
        chunks.append(current_chunk.strip())

    return chunks

多模态语义理解

In [None]:
import base64
from zhipuai import ZhipuAI

def zhipu_vllm(img_path, prompt):
    """Ask the glm-4v vision model about an image.

    The image file is read, base64-encoded and sent together with `prompt`
    as a single user message (image part first, text part second).

    Args:
        img_path: Path of the image file to send.
        prompt: Text prompt accompanying the image.

    Returns:
        The message object of the first choice (not just its content).
    """
    with open(img_path, 'rb') as img_file:
        raw_bytes = img_file.read()
    img_base = base64.b64encode(raw_bytes).decode('utf-8')

    image_part = {"type": "image_url", "image_url": {"url": img_base}}
    text_part = {"type": "text", "text": prompt}

    # The API key is read from the ZHIPU_API_KEY_1 environment variable.
    client = ZhipuAI(api_key=os.getenv("ZHIPU_API_KEY_1"))
    response = client.chat.completions.create(
        model="glm-4v",  # "glm-4v-plus",  # name of the model to call
        messages=[{"role": "user", "content": [image_part, text_part]}],
    )
    return response.choices[0].message

In [None]:
prompt = """## TASK
You are an academic scholar analyzing a image from a paper. 
Extract key information from the image that is relevant to the context.
Try to answer the following questions: 
1. What is the image showing?
2. What is the image related to?
3. What is the image trying to convey?
Be very concise and explicit in your answers. Try to show concrete results and numbers.

## CONTEXT
Here is background information of the paper for your guidance:
<background>
{background}
</background>

Here is short description of the image:
<description>
{description}
</description>

## OUTPUT
The image reveals that: \n
"""

In [None]:
img_path = 'output_new.png'
prompt = prompt.format(
    background=test_result,
    description=result['output_text'])
tmp_result = zhipu_vllm(img_path, prompt)

In [None]:
print(tmp_result.content)

In [None]:
# Example usage: list the figure / figure-caption groups of every page.
# The dangling `full_text =` assignment (a syntax error) was removed, and each
# page's own blocks (`item`) are searched — the loop used to search
# final_blocks[5] on every iteration.
category_types = ['figure', 'figure_caption']
for idx, item in enumerate(final_blocks):
    results = find_matches(item, category_types)

    for result in results:
        print(result)

In [None]:
result['output_poly']