In [8]:
from sentence_transformers import SentenceTransformer
import fitz
import os
from chromadb import PersistentClient
# 初始化嵌入模型
embed_model = SentenceTransformer("BAAI/bge-m3")




local_chunk_set = []


def read_pdf(pdf_path):
    """讀取 PDF 檔案並依據文本頁數返回其內容"""
    page_content= []
    doc = fitz.open(pdf_path)
    for page_num, page in enumerate(doc):
        text = page.get_text()
        page_content.append({'page':page_num+1,'content':text})
    doc.close()
    return page_content


def chunk_text(pages, chunk_size):
    """將文本拆分為指定大小的 chunks，允許 chunk 跨越頁數，並標記 chunk 涉及的頁數範圍"""
    chunks = []
    all_text = ""
    page_mapping = []
    
    for page in pages:
        start_idx = len(all_text)
        all_text += page['content'] + "\n"
        end_idx = len(all_text)
        page_mapping.append((start_idx, end_idx, page['page']))
    
    tokens = embed_model.tokenizer.tokenize(all_text)
    
    for i in range(0, len(tokens), chunk_size):
        chunk_tokens = tokens[i:i + chunk_size]
        chunk_text = embed_model.tokenizer.convert_tokens_to_string(chunk_tokens)
        
        chunk_pages = set()
        chunk_start = len(embed_model.tokenizer.convert_tokens_to_string(tokens[:i]))
        chunk_end = len(embed_model.tokenizer.convert_tokens_to_string(tokens[:i + chunk_size]))
        
        for start_idx, end_idx, page in page_mapping:
            if chunk_start < end_idx and chunk_end > start_idx:
                chunk_pages.add(page)
        
        chunks.append({'pages': sorted(chunk_pages), 'content': chunk_text})
    
    return chunks


In [9]:
folder_path = '../pdf'
pdf_files = [f for f in os.listdir(folder_path) if f.lower().endswith('.pdf')]
import numpy as np


原始的chunk

In [10]:
import os
import faiss
import numpy as np
import json

# 儲存 embeddings 避免重複計算
embedding_cache = {}
data = []

# FAISS 索引初始化（假設 embedding 維度為 384）
EMBEDDING_DIM = 1024  # **請根據你的 embedding 模型改變這個數值**
index = faiss.IndexFlatL2(EMBEDDING_DIM)  # L2 距離索引

# 儲存 FAISS ID 對應的 metadata
faiss_metadata = {}

# 設定批次大小，避免 OOM

chunk_sizes = [128,256,512,1024]  # 測試不同 chunk sizes

for pdf_file in pdf_files:
    full_path = os.path.join(folder_path, pdf_file)
    content = read_pdf(full_path)

    for size in chunk_sizes:
        chunks = chunk_text(content, size)

        for idx, chunk in enumerate(chunks):
            text = chunk["content"]

            # **🚀 檢查 embedding 是否已經計算過**
            if text in embedding_cache:
                continue  # 跳過已處理過的文本
            
            # **🚀 計算 embedding 並存入 cache**
            embedding = embed_model.encode(text).astype(np.float32)
            embedding_cache[text] = embedding

            # **🚀 儲存 metadata**
            data.append({
                "File_Name": pdf_file,
                "content": text,
                "Page_Num": ','.join(str(p) for p in chunk["pages"])
            })

            # **🚀 將 embedding 加入 FAISS**
            index.add(np.array([embedding]))  # 加入 FAISS
            faiss_metadata[len(faiss_metadata)] = data[-1]  # FAISS ID 對應 metadata

# **🚀 儲存 FAISS 索引**
faiss.write_index(index, "faiss_index.idx")
with open("faiss_metadata.json", "w", encoding="utf-8") as f:
    json.dump(faiss_metadata, f, ensure_ascii=False, indent=4)

print(f"已儲存 {len(faiss_metadata)} 筆資料到 FAISS")

已儲存 192 筆資料到 FAISS


summary_keyword_chunk

計算 naive chunk 的準確度

In [17]:

index = faiss.read_index("faiss_index.idx")
with open("faiss_metadata.json", "r", encoding="utf-8") as f:
    faiss_metadata = json.load(f)
def search_faiss(query_text, top_k=3):
    """搜尋最相近的前 top_k 筆資料"""
    query_embedding = embed_model.encode(query_text).astype(np.float32)
    query_embedding = np.expand_dims(query_embedding, axis=0)  # FAISS 需要 2D 陣列

    # **🚀 查詢 FAISS**
    _, indices = index.search(query_embedding, top_k)

    results = []
    for idx in indices[0]:
        if str(idx) in faiss_metadata:
            results.append(faiss_metadata[str(idx)])

    return results

with open('pdf_questions.json', 'r', encoding='utf-8') as f:
    data = json.load(f)



win = 0
loss = 0
for each in data : 


    res = search_faiss(each['question'])
    found = False
    for metadata in res:
        if metadata['File_Name'] == each['File_Name'] and str(each['Page_Num']) in  metadata['Page_Num']:
            win += 1
            found = True
            break
    if not found:
        loss += 1

print(win/(win+loss))


0.9145299145299145
