In [44]:
import os
import json
import pdfplumber  # 用於從PDF文件中提取文字的工具
from tqdm import tqdm

# For OCR
from PIL import Image
import pytesseract
import fitz

# Questions to answer, and the JSON file the retrieval results are written to.
question_path = './dataset/preliminary/questions_example.json'
output_path = './output/answers_scalefactor2.json'
# Reference material: the FAQ is a single JSON file, while the insurance and
# finance sources are folders that are scanned for PDF files by load_data.
source_path_faq = './reference/faq/pid_map_content.json'
source_path_insurance = './reference/insurance'
source_path_finance = './reference/finance'

# Read a single PDF file and return its text content, using OCR for pages
# that have no extractable text layer.
def read_pdf(pdf_loc, page_infos: list = None, category='default', file_name='default',
             scale_factor=2, ocr_language='chi_tra'):
    """Extract the text of a PDF, falling back to OCR on pages without text.

    Args:
        pdf_loc: Path of the PDF file to read.
        page_infos: Optional ``[start, end]`` pair used to slice the page list
            (``pages[start:end]``). When falsy, every page is read.
        category: Label used only to name the log entry and the saved page image.
        file_name: Label used only to name the log entry and the saved page image.
        scale_factor: Zoom applied on both axes when a page is rasterised for OCR.
        ocr_language: Tesseract language code passed to ``pytesseract``.

    Returns:
        The concatenated text of the selected pages, in page order.

    Side effects:
        For every page without extractable text, appends one line to
        ``None_text_page.txt`` and saves the rendered page as a PNG, both in
        the current working directory.
    """
    text_parts = []

    # Both handles are context-managed so they are released even when text
    # extraction or OCR raises part-way through the document.
    with pdfplumber.open(pdf_loc) as pdf, fitz.open(pdf_loc) as pdf_fitz:
        # Restrict to the requested page range when one is given.
        pages = pdf.pages[page_infos[0]:page_infos[1]] if page_infos else pdf.pages

        for page in pages:
            text = page.extract_text()
            if text:
                text_parts.append(text)
                continue

            # pdfplumber reports the 1-based position of the page in the whole
            # document, which stays correct when `pages` is a slice. A counter
            # over the slice would make PyMuPDF render the wrong page.
            page_number = page.page_number
            page_label = f"{category}_{file_name}_page_{page_number}"

            # Keep a record of every page that needed the OCR fallback.
            with open('None_text_page.txt', 'a', encoding='utf-8') as file:
                file.write(f"{page_label}\n")

            # Rasterise the page (PyMuPDF indexes pages from 0) at the requested zoom.
            page_fitz = pdf_fitz.load_page(page_number - 1)
            pix = page_fitz.get_pixmap(matrix=fitz.Matrix(scale_factor, scale_factor))

            # NOTE(review): 'RGB' assumes the pixmap has no alpha channel, which
            # is PyMuPDF's default for get_pixmap - confirm if options change.
            img = Image.frombytes('RGB', (pix.width, pix.height), pix.samples)

            # Save the rendered page so OCR input can be inspected afterwards.
            img.save(f'{page_label}.png')

            # Recognise the text in the rendered image.
            text_parts.append(pytesseract.image_to_string(img, lang=ocr_language))

    return ''.join(text_parts)


# Load the reference PDFs of one category into a dict keyed by document id.
def load_data(source_path, category):
    """Read every PDF in ``source_path`` and return ``{doc_id: text}``.

    Args:
        source_path: Folder containing PDFs named ``<integer id>.pdf``.
        category: Label forwarded to ``read_pdf`` for its log and image names.

    Returns:
        A dict mapping the integer file stem to the text returned by ``read_pdf``.

    Raises:
        ValueError: If a PDF file name does not have an integer stem.
    """
    # Skip anything that is not a PDF (e.g. .DS_Store or .ipynb_checkpoints),
    # which would otherwise break the integer conversion below.
    pdf_files = [name for name in os.listdir(source_path) if name.lower().endswith('.pdf')]

    corpus_dict = {}
    for file in tqdm(pdf_files):
        doc_id = int(os.path.splitext(file)[0])  # the file stem is the document id
        corpus_dict[doc_id] = read_pdf(os.path.join(source_path, file), None, category, doc_id)
    return corpus_dict

print("載入 FAQ 資料...")
with open(source_path_faq, 'rb') as f_s:
    # JSON object keys arrive as strings; re-key the FAQ mapping by integer id.
    key_to_source_dict = {int(key): value for key, value in json.load(f_s).items()}

# Insurance PDFs: text extraction with OCR fallback for every file in the folder.
print("載入保險資料...")
category = 'insurance'
corpus_dict_insurance = load_data(source_path_insurance, category)

# Finance PDFs: same pipeline, pointed at the finance folder.
print("載入財務資料...")
category = 'finance'
corpus_dict_finance = load_data(source_path_finance, category)

print("資料載入完成！")


載入 FAQ 資料...
載入保險資料...


  8%|██████▏                                                                         | 80/1035 [03:38<43:33,  2.74s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 643/643 [02:01<00:00,  5.31it/s]


載入財務資料...


100%|██████████████████████████████████████████████████████████████████████████████| 1035/1035 [18:46<00:00,  1.09s/it]

資料載入完成！





In [45]:
import json
import jieba  # 用於中文文本分詞
from rank_bm25 import BM25Okapi  # 使用BM25演算法進行文件檢索

# Retrieve the best-matching document for a query among the given sources.
def BM25_retrieve(qs, source, corpus_dict):
    """Return the id of the candidate document that BM25 ranks highest.

    Args:
        qs: The query string.
        source: Iterable of candidate document ids (each convertible with ``int``).
        corpus_dict: Mapping from integer document id to document text.

    Returns:
        The integer id of the top-ranked candidate.

    Raises:
        ValueError: If ``source`` is empty.
        KeyError: If a candidate id is missing from ``corpus_dict``.
    """
    source_ids = [int(file) for file in source]
    if not source_ids:
        raise ValueError("source must contain at least one candidate document id")

    # [TODO] Other retrieval methods can be swapped in here to improve quality.

    # Tokenise every candidate document and the query.
    tokenized_corpus = [list(jieba.cut_for_search(corpus_dict[doc_id])) for doc_id in source_ids]
    bm25 = BM25Okapi(tokenized_corpus)
    tokenized_query = list(jieba.cut_for_search(qs))

    # Rank the ids themselves (kept parallel to the tokenised corpus) rather
    # than the texts. Looking the winning text up again in corpus_dict could
    # return a document outside `source` whenever two documents share the
    # same text, e.g. both extracted as empty strings.
    return bm25.get_top_n(tokenized_query, source_ids, n=1)[0]

if __name__ == "__main__":

    answer_dict = {"answers": []}  # collects one {"qid", "retrieve"} entry per question

    with open(question_path, 'rb') as f:
        qs_ref = json.load(f)  # load the question file

    # The PDF-backed categories search a corpus that was built once up front.
    pdf_corpora = {'finance': corpus_dict_finance, 'insurance': corpus_dict_insurance}

    for q_dict in qs_ref['questions']:
        q_category = q_dict['category']
        if q_category in pdf_corpora:
            corpus = pdf_corpora[q_category]
        elif q_category == 'faq':
            # FAQ entries are structured values; stringify only the candidates.
            corpus = {key: str(value) for key, value in key_to_source_dict.items() if key in q_dict['source']}
        else:
            # Fail loudly on a category this pipeline does not know how to search.
            raise ValueError(f"Unknown category {q_category!r} for qid {q_dict['qid']}")

        # Run retrieval and record the result for this question.
        retrieved = BM25_retrieve(q_dict['query'], q_dict['source'], corpus)
        answer_dict['answers'].append({"qid": q_dict['qid'], "retrieve": retrieved})

    # Make sure the output folder exists so the finished run is not lost at
    # the final write.
    os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)

    # Save the answers as JSON, keeping non-ASCII characters readable.
    with open(output_path, 'w', encoding='utf8') as f:
        json.dump(answer_dict, f, ensure_ascii=False, indent=4)


In [50]:
def load_json(file_path):
    """Parse the JSON file at ``file_path``.

    Returns the decoded object, or ``None`` (after printing a message) when
    the file does not exist.
    """
    try:
        handle = open(file_path, 'r', encoding='utf-8')
    except FileNotFoundError:
        print(f"Error: {file_path} not found.")
        return None

    with handle:
        return json.load(handle)

def calculate_accuracy(pred_file, true_file):
    """Print overall and per-category retrieval accuracy.

    Args:
        pred_file: Path to the predictions JSON; its ``answers`` list holds
            items with ``qid`` and ``retrieve``.
        true_file: Path to the ground-truth JSON; its ``ground_truths`` list
            holds items with ``qid``, ``retrieve`` and ``category``.

    Returns:
        None. The statistics are printed. If either file cannot be loaded, a
        failure message is printed and nothing is computed.
    """
    # Load the predictions and the ground truth.
    pred_data = load_json(pred_file)
    true_data = load_json(true_file)

    if pred_data is None or true_data is None:
        print("Failed to load JSON files.")
        return

    # Index everything by qid so the two files can be compared.
    true_answers = {item['qid']: item['retrieve'] for item in true_data['ground_truths']}
    true_categories = {item['qid']: item['category'] for item in true_data['ground_truths']}
    pred_answers = {item['qid']: item['retrieve'] for item in pred_data['answers']}

    tracked = ('faq', 'finance', 'insurance')
    question_counts = {name: 0 for name in tracked}
    correct_counts = {name: 0 for name in tracked}
    total_correct = 0

    # Compare each ground truth with its prediction; a missing prediction
    # reads as None and therefore counts as wrong.
    for qid, true_retrieve in true_answers.items():
        category = true_categories.get(qid)
        if category in question_counts:
            question_counts[category] += 1
        if pred_answers.get(qid) == true_retrieve:
            total_correct += 1
            if category in correct_counts:
                correct_counts[category] += 1

    def _format_accuracy(correct, total):
        # A group with no questions has no defined accuracy; report that
        # instead of dividing by zero.
        return f"{correct / total * 100:.2f}%" if total else "N/A"

    report = [('Total', len(true_answers), total_correct)]
    report += [(name.capitalize(), question_counts[name], correct_counts[name]) for name in tracked]

    for label, asked, correct in report:
        print(f"{label} questions: {asked}")
        print(f"{label} correct predictions: {correct}")
        print(f"{label} accuracy: {_format_accuracy(correct, asked)}")

# === Main program ===
if __name__ == "__main__":
    # Set the file paths.
    # NOTE(review): this evaluates './output/answers.json', whereas the retrieval
    # cell in this notebook writes to output_path
    # ('./output/answers_scalefactor2.json'). Confirm which file is meant to be
    # scored; the alternative is kept commented out below.
    pred_file = './output/answers.json'  # predictions JSON
    #pred_file = './output/answers_scalefactor2.json'  # predictions JSON
    true_file = './dataset/preliminary/ground_truths_example.json'  # ground-truth JSON
    
    # Compute and print the accuracy.
    calculate_accuracy(pred_file, true_file)

Total questions: 150
Total correct predictions: 111
Total accuracy: 74.00%
Faq questions: 50
Faq correct predictions: 45
Faq accuracy: 90.00%
Finance questions: 50
Finance correct predictions: 26
Finance accuracy: 52.00%
Insurance questions: 50
Insurance correct predictions: 40
Insurance accuracy: 80.00%
