In [20]:
from sentence_transformers import SentenceTransformer
import fitz
import os

# Initialise the embedding model. chunk_text() uses its tokenizer to count
# tokens, and the indexing/search cells below use embed_model.encode() to
# produce the vectors stored in and queried from FAISS.
embed_model = SentenceTransformer("BAAI/bge-m3")




# NOTE(review): not referenced by any other visible cell — possibly a leftover.
local_chunk_set = []


def read_pdf(pdf_path):
    """Read a PDF file and return its text page by page.

    Args:
        pdf_path: Path of the PDF file; passed straight to ``fitz.open``.

    Returns:
        A list with one dict per page, in document order:
        ``{'page': <1-based page number>, 'content': <page.get_text() result>}``.
    """
    # Use the document as a context manager so it is closed even when text
    # extraction raises part-way through the file.
    with fitz.open(pdf_path) as doc:
        return [
            {'page': page_num, 'content': page.get_text()}
            for page_num, page in enumerate(doc, start=1)
        ]


def chunk_text(pages, chunk_size, tokenizer=None):
    """Split page texts into chunks of at most ``chunk_size`` tokens.

    Tokens are packed greedily across page boundaries, so one chunk may span
    several pages; each chunk records every page that contributed tokens to it.

    Args:
        pages: Iterable of ``{'page': int, 'content': str}`` dicts, as returned
            by ``read_pdf``.
        chunk_size: Maximum number of tokens per chunk. Must be positive.
        tokenizer: Object providing ``tokenize(text)`` and
            ``convert_tokens_to_string(tokens)``. Defaults to
            ``embed_model.tokenizer``.

    Returns:
        A list of ``{'pages': <sorted page numbers>, 'content': <str>}`` dicts.
        Every chunk except possibly the last holds exactly ``chunk_size``
        tokens. Pages that tokenize to nothing contribute to no chunk.

    Raises:
        ValueError: If ``chunk_size`` is not positive; the packing loop could
            never make progress with such a size.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    if tokenizer is None:
        tokenizer = embed_model.tokenizer

    chunks = []
    current_chunk = []
    current_pages = set()

    for page in pages:
        tokens = tokenizer.tokenize(page['content'])
        # Walk the page with an offset instead of re-slicing the remaining
        # tokens on every iteration.
        pos = 0
        while pos < len(tokens):
            # A full chunk is flushed immediately below, so there is always
            # at least one free slot here.
            space_left = chunk_size - len(current_chunk)
            piece = tokens[pos:pos + space_left]
            pos += len(piece)

            current_chunk.extend(piece)
            current_pages.add(page['page'])

            if len(current_chunk) >= chunk_size:
                chunks.append({
                    'pages': sorted(current_pages),
                    'content': tokenizer.convert_tokens_to_string(current_chunk)
                })
                current_chunk = []
                current_pages = set()

    # Emit the trailing, partially filled chunk (if any).
    if current_chunk:
        chunks.append({
            'pages': sorted(current_pages),
            'content': tokenizer.convert_tokens_to_string(current_chunk)
        })

    return chunks


In [21]:
# Directory holding the source PDFs (relative to the current working directory).
folder_path = '../pdf'
# os.listdir() returns entries in arbitrary order; sort them so chunks are
# indexed in the same order on every run and machine.
pdf_files = sorted(f for f in os.listdir(folder_path) if f.lower().endswith('.pdf'))


原始的chunk

In [None]:
import faiss
import numpy as np
import json
import numpy as np
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer
import faiss
from mauve import compute_mauve
from langchain_community.llms.ollama import Ollama
def calculate_rouge_score(reference, candidate):
    """Return the ROUGE-L F-measure of ``candidate`` against ``reference``.

    Args:
        reference: Ground-truth answer text.
        candidate: Generated answer text.

    Returns:
        The ``fmeasure`` field of the ``rougeL`` score.
    """
    # Only ROUGE-L is returned, so ROUGE-1 / ROUGE-2 are not requested.
    # NOTE(review): rouge_score's default tokenizer is believed to keep only
    # ASCII letters and digits. If the answers are Chinese, confirm this score
    # is meaningful or pass a custom tokenizer to RougeScorer.
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    return scorer.score(reference, candidate)['rougeL'].fmeasure
def search_faiss(query_text, top_k=3):
    """Return the metadata of the ``top_k`` indexed chunks closest to a query.

    Relies on the module-level ``embed_model``, ``index`` and
    ``faiss_metadata`` as they are bound at call time.

    Args:
        query_text: Query string to embed.
        top_k: Number of neighbours requested from the index.

    Returns:
        List of metadata entries in the order returned by ``index.search``.
        IDs without a metadata entry are skipped, so the list may hold fewer
        than ``top_k`` items.
    """
    query_embedding = embed_model.encode(query_text).astype(np.float32)
    query_embedding = np.expand_dims(query_embedding, axis=0)  # index.search is given a 2-D array

    _, indices = index.search(query_embedding, top_k)

    results = []
    for raw_id in indices[0]:
        vector_id = int(raw_id)
        # Keys are strings once the metadata has been through JSON, but ints
        # while the dict is still the in-memory one built during indexing.
        # Accept both so the lookup does not silently come back empty.
        for key in (str(vector_id), vector_id):
            if key in faiss_metadata:
                results.append(faiss_metadata[key])
                break

    return results
def calculate_mauve_score(reference, candidate, device_id=0):
    """Return the MAUVE score between reference and generated text.

    Args:
        reference: Forwarded to ``compute_mauve`` as ``p_text``.
        candidate: Forwarded to ``compute_mauve`` as ``q_text``.
        device_id: Forwarded to ``compute_mauve`` as ``device_id``. Defaults
            to 0, the value this function always used before.

    Returns:
        The ``mauve`` attribute of the ``compute_mauve`` result.

    NOTE(review): ``compute_mauve`` presumably expects *lists* of texts. The
    evaluation loops pass single strings, and the captured
    "Featurizing p: 105/105" progress output suggests each character is being
    treated as a separate sample — confirm, and consider scoring the whole
    answer set in a single call instead of per question.
    """
    return compute_mauve(
        p_text=reference,
        q_text=candidate,
        verbose=False,
        device_id=device_id,
    ).mauve
# Ollama-backed LLM used to generate an answer for every evaluation question.
llm = Ollama(model="jcai/llama-3-taiwan-8b-instruct:q4_k_m", temperature=0.2)

# Dimension of the FAISS index; it has to match the vectors that
# embed_model.encode() produces.
EMBEDDING_DIM = 1024
chunk_sizes = [64,128, 256, 512, 1024]  # chunk sizes (in tokens) to compare
files_to_delete = ['faiss_metadata.json', 'faiss_index.idx']

# The question set does not depend on the chunk size, so load it once.
with open('pdf_questions.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

result = []
for size in chunk_sizes:
    # ---- Build a fresh index for this chunk size ----
    index = faiss.IndexFlatL2(EMBEDDING_DIM)  # L2-distance index
    faiss_metadata = {}
    # Remove artefacts from the previous chunk size so a failed run cannot
    # leave stale files behind.
    for file in files_to_delete:
        if os.path.exists(file):
            os.remove(file)
    for pdf_file in pdf_files:
        full_path = os.path.join(folder_path, pdf_file)
        content = read_pdf(full_path)
        for chunk in chunk_text(content, size):
            text = chunk["content"]
            embedding = embed_model.encode(text).astype(np.float32)
            index.add(np.array([embedding]))
            # Keyed by insertion order, which search_faiss() uses as the
            # vector's ID when mapping search hits back to metadata.
            faiss_metadata[len(faiss_metadata)] = {
                "File_Name": pdf_file,
                "content": text,
                "Page_Num": ','.join(str(p) for p in chunk["pages"])
            }

    # ---- Persist, then reload ----
    faiss.write_index(index, "faiss_index.idx")
    with open("faiss_metadata.json", "w", encoding="utf-8") as f:
        json.dump(faiss_metadata, f, ensure_ascii=False, indent=4)
    print(f"已儲存 {len(faiss_metadata)} 筆資料到 FAISS")
    # The JSON round-trip turns the int keys into strings, which is the form
    # search_faiss() looks up with str(idx).
    index = faiss.read_index("faiss_index.idx")
    with open("faiss_metadata.json", "r", encoding="utf-8") as f:
        faiss_metadata = json.load(f)

    # ---- Evaluate retrieval and generation ----
    win = 0
    loss = 0
    total_rouge = 0
    total_mauve = 0
    mauve_failures = 0

    for each in data:
        res = search_faiss(each['question'])

        # Concatenate the retrieved chunks into the prompt context.
        context = " ".join([metadata['content'] for metadata in res])

        # LLM prompt
        prompt = f"""
        你是一個 AI 助理，專門根據官方文件回答資安相關問題。請根據提供的文件內容生成精確且簡潔的答案，不要加入額外資訊或推測性的內容，並確保答案與參考內容高度一致。

        **問題：** {each['question']}
        **參考內容：** {context}

        **答案（請直接從參考內容提取，確保準確性以及符合問題）：**
        """

        generated_answer = llm.invoke(prompt).strip()

        total_rouge += calculate_rouge_score(each['answer'], generated_answer)

        # MAUVE stays best-effort, but failures are reported and excluded from
        # the average instead of being silently counted as a score of 0.
        try:
            total_mauve += calculate_mauve_score(each['answer'], generated_answer)
        except Exception as exc:
            mauve_failures += 1
            print(f"MAUVE failed for question {each['question']!r}: {exc}")

        # A hit needs the right file AND the expected page(s) among the chunk's
        # pages. Compare whole page numbers: a substring test on the
        # comma-joined string would let page "1" match "10,11".
        expected_pages = {p.strip() for p in str(each['Page_Num']).split(',')}
        found = any(
            metadata['File_Name'] == each['File_Name']
            and expected_pages <= set(metadata['Page_Num'].split(','))
            for metadata in res
        )
        if found:
            win += 1
        else:
            loss += 1

    total = win + loss
    mauve_scored = total - mauve_failures
    accuracy = win / total if total > 0 else 0
    result.append({
        'chunk_size': size,
        'total_chunk': len(faiss_metadata),
        'accuracy': accuracy,
        'rougeL': total_rouge / total if total > 0 else 0,
        'mauve': total_mauve / mauve_scored if mauve_scored > 0 else 0,
        'mauve_failures': mauve_failures,
    })

for each in result:
    print(each)


已儲存 230 筆資料到 FAISS


Featurizing p: 100%|██████████| 105/105 [00:01<00:00, 70.99it/s]
Featurizing q: 100%|██████████| 155/155 [00:02<00:00, 73.06it/s]
Featurizing p: 100%|██████████| 38/38 [00:00<00:00, 63.26it/s]
Featurizing q: 100%|██████████| 36/36 [00:00<00:00, 73.93it/s]
Featurizing p: 100%|██████████| 57/57 [00:00<00:00, 74.80it/s]
Featurizing q:  60%|██████    | 79/131 [00:01<00:00, 74.91it/s]

計算 element chunk 的準確度

In [None]:

# Evaluate the pre-built "element chunk" index against the same question set.
index = faiss.read_index("faiss_element_chunk_index.idx")
with open("faiss_element_chunk_metadata.json", "r", encoding="utf-8") as f:
    faiss_metadata = json.load(f)
with open('pdf_questions.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
total_mauve = 0
total_rough = 0
win = 0
loss = 0
for each in data:
    # Retrieve first, so the prompt context belongs to THIS question rather
    # than to whatever `res` was left over from the previous iteration or cell.
    res = search_faiss(each['question'])

    # Concatenate the retrieved chunks into the prompt context.
    context = " ".join([metadata['content'] for metadata in res])
    # LLM prompt
    prompt = f"""
    你是一個 AI 助理，專門根據官方文件回答資安相關問題。請根據提供的文件內容生成精確且簡潔的答案，不要加入額外資訊或推測性的內容，並確保答案與參考內容高度一致。

    **問題：** {each['question']}
    **參考內容：** {context}

    **答案（請直接從參考內容提取，確保準確性以及符合問題）：**
    """
    generated_answer = llm.invoke(prompt).strip()

    # ROUGE-L and MAUVE of the generated answer against the reference answer.
    rough_score = calculate_rouge_score(each['answer'], generated_answer)
    total_rough += rough_score
    mauve_score = calculate_mauve_score(each['answer'], generated_answer)
    total_mauve += mauve_score

    # A hit needs the right file AND the expected page(s) among the chunk's
    # pages. Compare whole page numbers: a substring test on the comma-joined
    # string would let page "1" match "10,11".
    expected_pages = {p.strip() for p in str(each['Page_Num']).split(',')}
    found = any(
        metadata['File_Name'] == each['File_Name']
        and expected_pages <= set(metadata['Page_Num'].split(','))
        for metadata in res
    )
    if found:
        win += 1
    else:
        loss += 1

total = win + loss
accuracy = win / total if total > 0 else 0
average_rough = total_rough / total if total > 0 else 0
average_mauve = total_mauve / total if total > 0 else 0
print(f"Accuracy: {accuracy:.4f}")
print(f"Average mauve score: {average_mauve:.4f}")
print(f"Average ROUGE-L score: {average_rough:.4f}")


TypeError: compute_mauve() got an unexpected keyword argument 'device'