In [2]:
import torch

print(torch.__version__)
print(torch.cuda.is_available())

2.7.0.dev20250303+cu128
True


In [3]:
reranker_list = [
    "BAAI/bge-reranker-v2-m3",
    "BAAI/bge-reranker-large",
    "BAAI/bge-reranker-base",
]

In [4]:

from sentence_transformers import SentenceTransformer
import fitz
import numpy as np
import os
import faiss
import json
from FlagEmbedding import FlagReranker
def read_pdf(pdf_path):
    """讀取 PDF 檔案並依據文本頁數返回其內容"""
    page_content= []
    doc = fitz.open(pdf_path)
    for page_num, page in enumerate(doc):
        text = page.get_text()
        page_content.append({'page':page_num+1,'content':text})
    doc.close()
    return page_content


def chunk_text(pages, chunk_size, embed_model):
    """將文本拆分為指定大小的 chunks，允許 chunk 跨越頁數，並標記 chunk 涉及的頁數範圍"""
    chunks = []
    current_chunk = []
    current_pages = set()
    current_length = 0
    max_token_length = chunk_size # 確保 chunk_size 不超過 512

    for page in pages:
        text = page['content']
        text_tokens = embed_model.tokenizer.tokenize(text)

        while text_tokens:
            space_left = max_token_length - current_length

            # 如果當前 chunk 還有空間
            if space_left > 0:
                tokens_to_add = text_tokens[:space_left]
                text_tokens = text_tokens[space_left:]

                current_chunk.extend(tokens_to_add)
                current_pages.add(page['page'])
                current_length += len(tokens_to_add)

            # 當 chunk 滿了，就存入 chunks，並重置變數
            if current_length >= max_token_length or (current_length > 0 and len(text_tokens) > 0):
                chunks.append({
                    'pages': sorted(current_pages),
                    'content': embed_model.tokenizer.convert_tokens_to_string(current_chunk)
                })
                current_chunk = []
                current_pages = set()
                current_length = 0

    # 處理最後一個未滿的 chunk
    if current_length > 0:
        chunks.append({
            'pages': sorted(current_pages),
            'content': embed_model.tokenizer.convert_tokens_to_string(current_chunk)
        })

    return chunks


def search_faiss(query_text, embed_model,index,faiss_metadata,top_k=2):
    """搜尋最相近的前 top_k 筆資料"""
    query_embedding = embed_model.encode(query_text).astype(np.float32)
    query_embedding = np.expand_dims(query_embedding, axis=0)  # FAISS 需要 2D 陣列

    # **🚀 查詢 FAISS**
    _, indices = index.search(query_embedding, top_k)

    results = []
    for idx in indices[0]:
        if str(idx) in faiss_metadata:
            results.append(faiss_metadata[str(idx)])

    return results


  from .autonotebook import tqdm as notebook_tqdm


In [8]:
import pandas as pd
import time



model_performance = []
model_name = "BAAI/bge-m3"
folder_path = '../pdf'

pdf_files = [f for f in os.listdir(folder_path) if f.lower().endswith('.pdf')]
embed_model = SentenceTransformer(model_name ,trust_remote_code=True)
EMBEDDING_DIM = embed_model.get_sentence_embedding_dimension()
chunk_sizes = [64,128,256,512]  # 測試不同 chunk sizes
top_kk = [10,20,50,100]

for size in chunk_sizes:

    faiss_metadata = {}
    index = faiss.IndexFlatL2(EMBEDDING_DIM)  # L2 距離索引

    for pdf_file in pdf_files:
        files_to_delete = ['faiss_metadata.json', 'faiss_index.idx']
        for file in files_to_delete:
            if os.path.exists(file):
                os.remove(file)
        full_path = os.path.join(folder_path, pdf_file)
        content = read_pdf(full_path)
        chunks = chunk_text(content, size,embed_model)
        for idx, chunk in enumerate(chunks):
            text = chunk["content"]
            embedding = embed_model.encode(text).astype(np.float32)
            data = {
                "File_Name": pdf_file,
                "content": text,
                "Page_Num": ','.join(str(p) for p in chunk["pages"])
            }
            index.add(np.array([embedding]))  # 加入 FAISS
            faiss_metadata[len(faiss_metadata)] = data  # FAISS ID 對應 metadata

    # **🚀 儲存 FAISS 索引**
    faiss.write_index(index, "faiss_index.idx")
    with open("faiss_metadata.json", "w", encoding="utf-8") as f:
        json.dump(faiss_metadata, f, ensure_ascii=False, indent=4)

    print(f"已儲存 {len(faiss_metadata)} 筆資料到 FAISS")
    with open("faiss_metadata.json", "r", encoding="utf-8") as f:
        faiss_metadata = json.load(f)
    with open('pdf_questions.json', 'r', encoding='utf-8') as f:
        pdf_questions = json.load(f)
    for top_k in top_kk:
        for reranker_name in reranker_list:
            index = faiss.read_index("faiss_index.idx")
            start_time = time.time()
            win = 0
            loss = 0
            MRR = 0
            reranker_model = FlagReranker(reranker_name, use_fp16=True)
            for query in pdf_questions:
                results = search_faiss(query['question'], embed_model,index,faiss_metadata,top_k=10) 
                # Prepare pairs for reranking
                pairs = []
                for result in results:
                    pairs.append([query['question'], result['content']])
                
                # Get reranking scores
                if pairs:  # Only rerank if we have results
                    scores = reranker_model.compute_score(pairs)
                    for i, result in enumerate(results):
                        result['score'] = scores[i].item() if hasattr(scores[i], "item") else scores[i]
                    # Sort results by score in descending order
                    results = sorted(results, key=lambda x: x['score'], reverse=True)
                results = results[:3]
                find = False
                for idx, metadata in enumerate(results):
                    if metadata['File_Name'] == query['File_Name'] and str(query['Page_Num']) in  metadata['Page_Num']:
                        find = True
                        win += 1
                        MRR += 1/(idx+1)
                        break
                if not find:
                    loss += 1
            end_time = time.time() - start_time
            model_performance.append({'chenk_szie':size,'top_k':top_k,'model':reranker_name,'win':win,'loss':loss,'MRR':MRR/len(pdf_questions) , 'pre_time':end_time/len(pdf_questions)})

        
        win = 0
        loss = 0
        MRR = 0
        start_time = time.time()
        for query in pdf_questions:
            results = search_faiss(query['question'], embed_model,index,faiss_metadata,top_k=3)
            find = False
            for idx, metadata in enumerate(results):
                if metadata['File_Name'] == query['File_Name'] and str(query['Page_Num']) in  metadata['Page_Num']:
                    find = True
                    win += 1
                    MRR += 1/(idx+1)
                    break
            if not find:
                loss += 1
        end_time = time.time() - start_time
        model_performance.append({'chenk_szie':size,'model':'No_Rerank','win':win,'loss':loss,'MRR':MRR/len(pdf_questions), 'pre_time':end_time/len(pdf_questions)})


# Convert model_performance list to DataFrame for better visualization
df_performance = pd.DataFrame(model_performance)

# Display the DataFrame
print(df_performance)

已儲存 4853 筆資料到 FAISS


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with 

已儲存 2461 筆資料到 FAISS


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with 

已儲存 1265 筆資料到 FAISS


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with 

已儲存 668 筆資料到 FAISS


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with 

    chenk_szie  top_k                    model  win  loss       MRR  pre_time
0           64   10.0  BAAI/bge-reranker-v2-m3  123    11  0.827114  0.102219
1           64   10.0  BAAI/bge-reranker-large  124    10  0.802239  0.104230
2           64   10.0   BAAI/bge-reranker-base  123    11  0.805970  0.076723
3           64    NaN                No_Rerank  121    13  0.766169  0.028040
4           64   20.0  BAAI/bge-reranker-v2-m3  123    11  0.827114  0.098115
..         ...    ...                      ...  ...   ...       ...       ...
59         512    NaN                No_Rerank  110    24  0.690299  0.026513
60         512  100.0  BAAI/bge-reranker-v2-m3  125     9  0.875622  0.157992
61         512  100.0  BAAI/bge-reranker-large  123    11  0.854478  0.161433
62         512  100.0   BAAI/bge-reranker-base  125     9  0.858209  0.084926
63         512    NaN                No_Rerank  110    24  0.690299  0.026586

[64 rows x 7 columns]


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Calculate win rate for each model (win / total)
df_performance['win_rate'] = df_performance['win'] / (df_performance['win'] + df_performance['loss'])
# Print performance visualizations data
print("Model Performance: MRR vs Processing Time")
for model in df_performance['model'].unique():
    mask = df_performance['model'] == model
    print(f"Model: {model}")
    print(df_performance[mask][['pre_time', 'MRR','win_rate','top_k','chenk_szie']])



Model Performance: MRR vs Processing Time
Model: BAAI/bge-reranker-v2-m3
    pre_time       MRR  win_rate
0   0.102219  0.827114  0.917910
4   0.098115  0.827114  0.917910
8   0.097561  0.827114  0.917910
12  0.101139  0.827114  0.917910
16  0.099310  0.863184  0.947761
20  0.097893  0.863184  0.947761
24  0.098960  0.863184  0.947761
28  0.099929  0.863184  0.947761
32  0.112484  0.864428  0.910448
36  0.111528  0.864428  0.910448
40  0.111581  0.864428  0.910448
44  0.111754  0.864428  0.910448
48  0.157325  0.875622  0.932836
52  0.158638  0.875622  0.932836
56  0.159356  0.875622  0.932836
60  0.157992  0.875622  0.932836
Model: BAAI/bge-reranker-large
    pre_time       MRR  win_rate
1   0.104230  0.802239  0.925373
5   0.101680  0.802239  0.925373
9   0.100950  0.802239  0.925373
13  0.101010  0.802239  0.925373
17  0.101822  0.851990  0.940299
21  0.101033  0.851990  0.940299
25  0.101453  0.851990  0.940299
29  0.101848  0.851990  0.940299
33  0.115263  0.822139  0.902985
37  0