In [1]:
from sentence_transformers import SentenceTransformer
import fitz
import os

# Initialise the embedding model; chunk_text also uses its tokenizer to count tokens.
embed_model = SentenceTransformer("BAAI/bge-m3")




# NOTE(review): not referenced anywhere else in the visible notebook — confirm before removing.
local_chunk_set = []


def read_pdf(pdf_path):
    """Read a PDF file and return its text page by page.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        A list with one dict per page, in document order, of the form
        ``{'page': <1-based page number>, 'content': <page.get_text() result>}``.
    """
    # The context manager closes the document even when text extraction
    # raises part-way through, so the file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        return [
            {'page': page_num, 'content': page.get_text()}
            for page_num, page in enumerate(doc, start=1)
        ]


def chunk_text(pages, chunk_size, tokenizer=None):
    """Split page texts into chunks of at most ``chunk_size`` tokens.

    Tokens are packed greedily, so a chunk may span several pages. Each chunk
    records every page that contributed at least one token to it.

    Args:
        pages: Iterable of dicts with the keys ``'page'`` (page number) and
            ``'content'`` (page text), as produced by ``read_pdf``.
        chunk_size: Maximum number of tokens per chunk; must be positive.
        tokenizer: Object providing ``tokenize(text)`` and
            ``convert_tokens_to_string(tokens)``. Defaults to the tokenizer of
            the module-level ``embed_model``.

    Returns:
        A list of dicts ``{'pages': <sorted page numbers>, 'content': <text>}``.
        Every chunk except possibly the last holds exactly ``chunk_size`` tokens.

    Raises:
        ValueError: If ``chunk_size`` is not positive. Without this check the
            packing loop could never consume a token and would not terminate.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    if tokenizer is None:
        tokenizer = embed_model.tokenizer

    chunks = []
    current_chunk = []
    current_pages = set()

    def flush():
        """Store the chunk being built."""
        chunks.append({
            'pages': sorted(current_pages),
            'content': tokenizer.convert_tokens_to_string(current_chunk)
        })

    for page in pages:
        text_tokens = tokenizer.tokenize(page['content'])

        # Walk the page with an offset instead of re-slicing the remaining
        # tokens on every iteration (which copies the list each time).
        start = 0
        while start < len(text_tokens):
            space_left = chunk_size - len(current_chunk)
            tokens_to_add = text_tokens[start:start + space_left]
            start += len(tokens_to_add)

            current_chunk.extend(tokens_to_add)
            current_pages.add(page['page'])

            # A full chunk is stored immediately and a fresh one is started.
            if len(current_chunk) >= chunk_size:
                flush()
                current_chunk = []
                current_pages = set()

    # Store the trailing chunk that did not fill up completely.
    if current_chunk:
        flush()

    return chunks


  from tqdm.autonotebook import tqdm, trange





In [2]:
# Directory that holds the source PDFs, relative to the notebook's working directory.
folder_path = '../pdf'
# os.listdir returns entries in arbitrary order; sorting makes the processing
# order (and therefore the FAISS row ids assigned later) reproducible.
pdf_files = sorted(f for f in os.listdir(folder_path) if f.lower().endswith('.pdf'))


原始的chunk

In [3]:
import faiss
import numpy as np
import json

# Embeddings keyed by chunk text, so identical chunks are embedded and indexed only once.
embedding_cache = {}
data = []

# Dimensionality of the FAISS index, read from the model itself so it cannot
# drift out of sync with the embeddings (1024 for BAAI/bge-m3).
EMBEDDING_DIM = embed_model.get_sentence_embedding_dimension()
index = faiss.IndexFlatL2(EMBEDDING_DIM)  # exact L2-distance index

# FAISS row id -> metadata of the chunk stored in that row.
faiss_metadata = {}

chunk_sizes = [128]  # chunk sizes (in tokens) to index

for pdf_file in pdf_files:
    full_path = os.path.join(folder_path, pdf_file)
    content = read_pdf(full_path)

    for size in chunk_sizes:
        chunks = chunk_text(content, size)

        for chunk in chunks:
            text = chunk["content"]

            # Skip text that has already been embedded and indexed.
            if text in embedding_cache:
                continue

            # Compute the embedding and remember it.
            embedding = embed_model.encode(text).astype(np.float32)
            embedding_cache[text] = embedding

            # Pages are stored as a comma-separated string, e.g. "3,4".
            record = {
                "File_Name": pdf_file,
                "content": text,
                "Page_Num": ','.join(str(p) for p in chunk["pages"])
            }
            data.append(record)

            # Rows are appended to the index in the same order as metadata
            # entries are created, so the current metadata count is the row id.
            index.add(embedding.reshape(1, -1))
            faiss_metadata[len(faiss_metadata)] = record

# Persist the index and its metadata (JSON turns the integer keys into strings).
faiss.write_index(index, "faiss_index.idx")
with open("faiss_metadata.json", "w", encoding="utf-8") as f:
    json.dump(faiss_metadata, f, ensure_ascii=False, indent=4)

print(f"已儲存 {len(faiss_metadata)} 筆資料到 FAISS")

  attn_output = torch.nn.functional.scaled_dot_product_attention(


已儲存 119 筆資料到 FAISS


計算 naive chunk 的準確度

In [13]:
import json
import numpy as np
import faiss
from langchain.llms import Ollama
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer  # 確保安裝 `rouge-score` 套件
from sentence_transformers import SentenceTransformer  # FAISS 需要此模型

# Load the FAISS index and its metadata from disk.
index = faiss.read_index("faiss_index.idx")
with open("faiss_metadata.json", "r", encoding="utf-8") as f:
    # JSON object keys are always strings, so lookups must use str(row_id).
    faiss_metadata = json.load(f)

# Model used to embed the queries; same model name as the one used to build the index.
embed_model = SentenceTransformer("BAAI/bge-m3")

# BLEU score
def calculate_blue_score(reference, candidate):
    """Return the smoothed BLEU score of ``candidate`` against ``reference``.

    Only unigram and bigram precision contribute (weights ``(0.5, 0.5, 0, 0)``).

    Tokenisation: every CJK ideograph is a token of its own and every other
    run of non-whitespace characters is one token. Plain whitespace splitting
    would turn an unsegmented Chinese sentence into a single token, which makes
    n-gram overlap meaningless. Text without CJK characters is tokenised exactly
    as ``str.split()`` would.

    Args:
        reference: Ground-truth answer text.
        candidate: Generated answer text.

    Returns:
        The value returned by ``sentence_bleu``.
    """
    import re  # local import keeps this definition self-contained

    def tokenize(text):
        return re.findall(
            r"[\u3400-\u4dbf\u4e00-\u9fff]|[^\s\u3400-\u4dbf\u4e00-\u9fff]+", text
        )

    smoothing = SmoothingFunction().method1  # avoids a hard zero when an n-gram order has no match
    return sentence_bleu(
        [tokenize(reference)],
        tokenize(candidate),
        weights=(0.5, 0.5, 0, 0),
        smoothing_function=smoothing,
    )

# ROUGE score
def calculate_rough_score(reference, candidate):
    """Return the ROUGE-L F-measure of ``candidate`` against ``reference``.

    A custom tokenizer is passed to ``RougeScorer`` because its default
    tokenizer keeps only ``[a-z0-9]`` runs, which discards every Chinese
    character. Here text is lower-cased, each CJK ideograph becomes one token
    and each ``[a-z0-9]`` run becomes one token.

    NOTE(review): with a custom tokenizer rouge-score applies no stemming to
    Latin tokens — confirm this is acceptable for the evaluation set.

    Args:
        reference: Ground-truth answer text.
        candidate: Generated answer text.

    Returns:
        The ``fmeasure`` field of the ``rougeL`` score.
    """
    import re  # local import keeps this definition self-contained

    class _CJKAwareTokenizer:
        """Tokenizer object exposing the ``tokenize(text)`` method RougeScorer calls."""

        def tokenize(self, text):
            return re.findall(r"[a-z0-9]+|[\u3400-\u4dbf\u4e00-\u9fff]", text.lower())

    # Only ROUGE-L is returned, so only ROUGE-L is computed.
    scorer = rouge_scorer.RougeScorer(['rougeL'], tokenizer=_CJKAwareTokenizer())
    return scorer.score(reference, candidate)['rougeL'].fmeasure

# FAISS lookup
def search_faiss(query_text, top_k=3):
    """Return the metadata of the ``top_k`` indexed chunks closest to ``query_text``.

    Uses the module-level ``embed_model``, ``index`` and ``faiss_metadata``.
    Row ids that have no entry in ``faiss_metadata`` are skipped, so fewer than
    ``top_k`` results may be returned.
    """
    # index.search expects a 2-D float32 array: one row per query.
    query_vector = embed_model.encode(query_text).astype(np.float32)[np.newaxis, :]

    _, neighbour_ids = index.search(query_vector, top_k)

    # Metadata keys are strings, hence the str() conversion of each row id.
    keys = (str(row_id) for row_id in neighbour_ids[0])
    return [faiss_metadata[key] for key in keys if key in faiss_metadata]

# Load the evaluation questions.
with open('pdf_questions.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Ollama client used to generate the answers.
llm = Ollama(model="jcai/llama-3-taiwan-8b-instruct:q4_k_m")


def _chunk_pages(page_field):
    """Return the page numbers recorded for a chunk as a set of strings.

    A string is treated as a comma-separated list (e.g. ``"3,4"``); a
    list/tuple/set is converted element-wise; anything else is taken as a
    single page number.
    """
    if isinstance(page_field, str):
        return {part.strip() for part in page_field.split(',') if part.strip()}
    if isinstance(page_field, (list, tuple, set)):
        return {str(page) for page in page_field}
    return {str(page_field)}


win = 0    # questions whose expected file/page is among the retrieved chunks
loss = 0   # questions for which it is not
total_blue = 0
total_rough = 0

for each in data:
    res = search_faiss(each['question'])

    # Concatenate the retrieved chunks into the context given to the LLM.
    context = " ".join([metadata['content'] for metadata in res])

    # LLM prompt
    prompt = f"""你是一個 AI 助理，專門根據官方文件回答資安相關問題。
    請根據提供的文件內容來生成精確且簡潔的答案，不要加入額外資訊。

    **問題：** {each['question']}
    **參考內容：** {context}
    **答案：**
    """

    # Generate the answer.
    generated_answer = llm.invoke(prompt).strip()
    print(generated_answer)

    # Accumulate the BLEU and ROUGE scores; they are computed for every question.
    total_blue += calculate_blue_score(each['answer'], generated_answer)
    total_rough += calculate_rough_score(each['answer'], generated_answer)

    # Retrieval hit: some retrieved chunk comes from the expected file and
    # covers the expected page. Pages are compared as whole numbers; iterating
    # the comma-joined string directly would compare single characters.
    expected_page = str(each['Page_Num'])
    found = any(
        metadata['File_Name'] == each['File_Name']
        and expected_page in _chunk_pages(metadata['Page_Num'])
        for metadata in res
    )
    if found:
        win += 1
    else:
        loss += 1

# Final metrics. The score totals cover every question, so the averages divide
# by the number of questions rather than by the number of retrieval hits.
total_questions = win + loss
accuracy = win / total_questions if total_questions > 0 else 0
average_blue = total_blue / total_questions if total_questions > 0 else 0
average_rough = total_rough / total_questions if total_questions > 0 else 0

print(f"Accuracy: {accuracy:.4f}")
print(f"Average BLEU score: {average_blue:.4f}")
print(f"Average ROUGE-L score: {average_rough:.4f}")





1. 登入「教育機構資安通報平台」。
2. 點選左方功能列的「修改資安長資料」。
3. 輸入單位的資安長相關連絡資訊(包含姓名、公務電話、公務電子郵件)。
4. 點選「送出」以儲存資料。
資安長的聯絡資訊需要包含資安長姓名、資安長公務電話及資安長公務電子郵件。
在資安事件發生時，臺灣學術網路各級學校應先確認事件，經確認為資安事件後，須於1小時內至教育機構資安通報應變網站(https://info. cert.tanet.edu.tw)通報登錄資安事件，並遵循各單位內部備份管理辦法啟動相關應變措施。


KeyboardInterrupt: 

計算 element chunk 的準確度

In [5]:
# Rebind the module-level index/metadata that search_faiss reads to the element-chunk files.
# NOTE(review): these two files are not produced anywhere in the visible notebook — confirm their origin.
index = faiss.read_index("faiss_element_chunk_index.idx")
with open("faiss_element_chunk_metadata.json", "r", encoding="utf-8") as f:
    faiss_metadata = json.load(f)
def search_faiss(query_text, top_k=3):
    """Return the metadata of the ``top_k`` indexed chunks closest to ``query_text``.

    Uses the module-level ``embed_model``, ``index`` and ``faiss_metadata``.
    Row ids that have no entry in ``faiss_metadata`` are skipped, so fewer than
    ``top_k`` results may be returned.
    """
    # index.search expects a 2-D float32 array: one row per query.
    query_vector = embed_model.encode(query_text).astype(np.float32)[np.newaxis, :]

    _, neighbour_ids = index.search(query_vector, top_k)

    # Metadata keys are strings, hence the str() conversion of each row id.
    keys = (str(row_id) for row_id in neighbour_ids[0])
    return [faiss_metadata[key] for key in keys if key in faiss_metadata]

# Load the evaluation questions.
with open('pdf_questions.json', 'r', encoding='utf-8') as f:
    data = json.load(f)


def _chunk_pages(page_field):
    """Return the page numbers recorded for a chunk as a set of strings.

    A string is treated as a comma-separated list (e.g. ``"3,4"``); a
    list/tuple/set is converted element-wise; anything else is taken as a
    single page number.
    NOTE(review): the schema of the element-chunk metadata is not visible
    here — confirm that one of these shapes applies.
    """
    if isinstance(page_field, str):
        return {part.strip() for part in page_field.split(',') if part.strip()}
    if isinstance(page_field, (list, tuple, set)):
        return {str(page) for page in page_field}
    return {str(page_field)}


win = 0    # questions whose expected file/page is among the retrieved chunks
loss = 0   # questions for which it is not
for each in data:
    res = search_faiss(each['question'])

    # Compare whole page numbers: a plain substring test on the stored string
    # would let page "1" match a chunk stored as "12".
    expected_page = str(each['Page_Num'])
    found = any(
        metadata['File_Name'] == each['File_Name']
        and expected_page in _chunk_pages(metadata['Page_Num'])
        for metadata in res
    )
    if found:
        win += 1
    else:
        loss += 1

# Retrieval accuracy; guarded so an empty question file prints 0 instead of
# raising ZeroDivisionError.
total_questions = win + loss
print(win / total_questions if total_questions > 0 else 0)


0.917910447761194
