In [1]:
import os
import unicodedata

import torch
import pandas as pd
from tqdm import tqdm
import fitz  # PyMuPDF

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    pipeline,
    BitsAndBytesConfig,
    Gemma2ForCausalLM
)
from accelerate import Accelerator

# Langchain 관련
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain.schema.output_parser import StrOutputParser
from langchain_teddynote.retrievers import KiwiBM25Retriever
from langchain.retrievers import EnsembleRetriever, MultiQueryRetriever
from peft import PeftModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from utils.extract_module import daconCustomExtractor

def _extract_table_texts(page):
    """Return ``(raw_text, table_text)`` pairs for every table found on ``page``.

    ``raw_text`` is the plain text PyMuPDF reports inside the table's bounding
    rectangle; ``table_text`` is the same table rendered one row per line
    (``str(row)`` of each row from ``table.extract()``).
    """
    text_pairs = []
    for table in page.find_tables():
        # Ignore missing cell entries; a table without any usable cell has no
        # bounding box (it would otherwise produce a Rect made of +/-inf).
        cells = [cell for cell in table.cells if cell]
        if not cells:
            continue

        # 1) Bounding rectangle enclosing every cell of the table.
        table_rect = fitz.Rect(
            min(cell[0] for cell in cells),
            min(cell[1] for cell in cells),
            max(cell[2] for cell in cells),
            max(cell[3] for cell in cells),
        )

        # 2) Row-structured rendering of the table.
        table_text = "\n" + "".join(f"{row}\n" for row in table.extract())

        # 3) Raw text of the same region, used later as the search key.
        clipped_text = page.get_text("text", clip=table_rect)
        text_pairs.append((clipped_text, table_text))
    return text_pairs


def process_pdf(file_path, output_dir="/home/a2024712006/dacon/extract_image"):
    """Split a PDF into region-level text chunks wrapped as ``Document`` objects.

    For every page, tables are located first; then the regions returned by
    ``daconCustomExtractor.detect_svg_contours`` are read as text, and any raw
    table text found inside a region is replaced by its row-structured form.
    Each region's text is kept as a single chunk (no further splitting).

    Args:
        file_path: Path of the PDF file to open with PyMuPDF.
        output_dir: Directory handed to ``detect_svg_contours`` as ``output_dir``.
            Defaults to the location originally hard-coded in this function.

    Returns:
        list[Document]: one ``Document`` per detected region, in page order.
    """
    chunk_list = []
    # The context manager guarantees the document is closed even on errors.
    with fitz.open(file_path) as doc:
        for page_number in range(doc.page_count):
            page = doc.load_page(page_number)

            # 1. Tables on this page as (raw text, structured text) pairs.
            raw_text_list = _extract_table_texts(page)

            # 2. Region detection. Each result is unpacked below as
            #    (x0, y0, x1, y1); the extractor is given a 1-based page number.
            extractor = daconCustomExtractor(page)
            bboxes = extractor.detect_svg_contours(
                page_number + 1,
                output_dir=output_dir,
                min_svg_gap_dx=25.0,
                min_svg_gap_dy=25.0,
                min_w=2.0,
                min_h=2.0,
            )

            # 3. Text of every region, with table text swapped in.
            for bbox in bboxes:
                x0, y0, x1, y1 = bbox
                full_text = page.get_text("text", clip=fitz.Rect(x0, y0, x1, y1))
                for clipped_text, table_text in raw_text_list:
                    # An empty key must be skipped: "" is contained in every
                    # string and str.replace("", x) would insert x between
                    # every single character of the region text.
                    if clipped_text and clipped_text in full_text:
                        full_text = full_text.replace(clipped_text, table_text)
                chunk_list.append(full_text)

    chunks = [Document(page_content=t) for t in chunk_list]
    return chunks

def create_vector_db(chunks, model_path="BAAI/bge-m3", device=None):
    """Embed ``chunks`` and index them in a cosine-distance FAISS store.

    Args:
        chunks: Documents to index (passed to ``FAISS.from_documents``).
        model_path: Name or path of the embedding model given to
            ``HuggingFaceEmbeddings``.
        device: Device for the embedding model. When ``None`` (default), CUDA
            is used if ``torch`` reports it as available, otherwise the CPU, so
            the function no longer fails on hosts without a GPU.

    Returns:
        The FAISS vector store built from ``chunks``.
    """
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Embedding model configuration; embeddings are normalized on encode.
    embeddings = HuggingFaceEmbeddings(
        model_name=model_path,
        model_kwargs={'device': device},
        encode_kwargs={'normalize_embeddings': True},
    )

    # Build and return the FAISS index.
    db = FAISS.from_documents(
        chunks,
        embedding=embeddings,
        distance_strategy=DistanceStrategy.COSINE,
    )
    return db

def normalize_path(path):
    """Return ``path`` converted to Unicode normalization form NFC."""
    composed_path = unicodedata.normalize('NFC', path)
    return composed_path

def normalize_string(s):
    """Return ``s`` converted to Unicode normalization form NFC."""
    composed = unicodedata.normalize('NFC', s)
    return composed

def process_pdfs_from_dataframe(df, base_directory):
    """Build a vector DB and an ensemble retriever for every PDF listed in ``df``.

    Args:
        df: DataFrame with a ``Source_path`` column holding PDF paths.
        base_directory: Directory that relative ``Source_path`` values are
            resolved against. Absolute paths are used as they are.

    Returns:
        dict: maps the PDF file name without extension to
        ``{'db': <FAISS store>, 'retriever': <EnsembleRetriever>}``.
    """
    pdf_databases = {}
    unique_paths = df['Source_path'].unique()

    for path in tqdm(unique_paths, desc="Processing PDFs"):
        # Normalize the path to NFC and resolve it to a full path.
        normalized_path = normalize_path(path)
        if os.path.isabs(normalized_path):
            full_path = normalized_path
        else:
            # normpath already collapses a leading "./". The previous
            # lstrip('./') removed *any* run of '.' and '/' characters, which
            # silently rewrote paths such as "../x.pdf" or ".hidden/x.pdf".
            full_path = os.path.normpath(os.path.join(base_directory, normalized_path))

        pdf_title = os.path.splitext(os.path.basename(full_path))[0]
        print(f"Processing {pdf_title}...")

        # Parse the PDF and build the dense (FAISS) and sparse (BM25) retrievers.
        chunks = process_pdf(full_path)
        db = create_vector_db(chunks)
        kiwi_bm25_retriever = KiwiBM25Retriever.from_documents(chunks)
        faiss_retriever = db.as_retriever()

        # Combine both retrievers with equal weights.
        # NOTE(review): `search_type` does not look like an EnsembleRetriever
        # field, so it is presumably ignored here. If MMR was intended it
        # likely belongs on db.as_retriever(search_type="mmr") — confirm
        # before changing, since that alters retrieval results.
        retriever = EnsembleRetriever(
            retrievers=[kiwi_bm25_retriever, faiss_retriever],
            weights=[0.5, 0.5],
            search_type="mmr",
        )

        # Store the result under the PDF title.
        pdf_databases[pdf_title] = {
            'db': db,
            'retriever': retriever
        }
    return pdf_databases

In [3]:
# Root directory that relative PDF paths in the CSV are resolved against.
base_directory = '/home/a2024712006/dacon'
# Question sheet: one row per question, with the source PDF it refers to.
test_csv_path = '/home/a2024712006/dacon/test.csv'

df = pd.read_csv(test_csv_path)
# Build one vector DB + retriever per unique PDF referenced by the sheet.
pdf_databases = process_pdfs_from_dataframe(df=df, base_directory=base_directory)

Processing PDFs:   0%|          | 0/9 [00:00<?, ?it/s]

Processing 중소벤처기업부_혁신창업사업화자금(융자)...


  warn_deprecated(
Processing PDFs:  11%|█         | 1/9 [00:12<01:43, 12.96s/it]

Processing 보건복지부_부모급여(영아수당) 지원...


Processing PDFs:  22%|██▏       | 2/9 [00:24<01:25, 12.15s/it]

Processing 보건복지부_노인장기요양보험 사업운영...


Processing PDFs:  33%|███▎      | 3/9 [00:38<01:18, 13.09s/it]

Processing 산업통상자원부_에너지바우처...


Processing PDFs:  44%|████▍     | 4/9 [01:11<01:43, 20.79s/it]

Processing 국토교통부_행복주택출자...


Processing PDFs:  56%|█████▌    | 5/9 [01:24<01:12, 18.21s/it]

Processing 「FIS 이슈 & 포커스」 22-4호 《중앙-지방 간 재정조정제도》...


Processing PDFs:  67%|██████▋   | 6/9 [02:11<01:23, 27.99s/it]

Processing 「FIS 이슈 & 포커스」 23-2호 《핵심재정사업 성과관리》...


Processing PDFs:  78%|███████▊  | 7/9 [03:08<01:14, 37.19s/it]

Processing 「FIS 이슈&포커스」 22-2호 《재정성과관리제도》...


Processing PDFs:  89%|████████▉ | 8/9 [03:54<00:40, 40.04s/it]

Processing 「FIS 이슈 & 포커스」(신규) 통권 제1호 《우발부채》...


Processing PDFs: 100%|██████████| 9/9 [04:37<00:00, 30.78s/it]


In [None]:
import transformers
import torch


# Hugging Face model id of the generator LLM.
model_id = "rtzr/ko-gemma-2-9b-it"

# Arguments for the text-generation pipeline: bfloat16 weights, with device
# placement delegated to `device_map="auto"`.
pipeline_settings = dict(
    task="text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

# NOTE: this rebinds the name `pipeline`, shadowing the `pipeline` factory
# imported from transformers in the first cell; later cells rely on this
# object being called `pipeline`.
pipeline = transformers.pipeline(**pipeline_settings)

# Switch the underlying model to evaluation mode.
pipeline.model.eval()

In [12]:
# return: final_context, full_text
# final_context: 최종 llm에게 넘길 context
# full_doc_text: reranking하기 전 context
def llm_reranker(results, question):
    """Ask the LLM which of the top retrieved documents best answer ``question``.

    At most the first five entries of ``results`` are shown to the model,
    labelled ``문서0`` .. ``문서4``. Every non-zero digit in the model's reply is
    read as a document number; document 0 is always kept and placed first.

    Args:
        results: Retrieved documents; each must expose ``page_content``.
        question: The question the documents are ranked against.

    Returns:
        tuple[str, str]: ``(final_context, full_doc_text)`` where
        ``final_context`` is the text of the selected documents (the context
        to hand to the answering LLM) and ``full_doc_text`` is the labelled
        text of all candidate documents before reranking. Both are empty
        strings when ``results`` is empty.
    """
    max_docs = 5
    context = [doc.page_content for doc in results[:max_docs]]
    if not context:
        # Nothing to rank; the unconditional lookup of document 0 below
        # would otherwise raise IndexError.
        return "", ""

    full_doc_text = ""
    for i, page_content in enumerate(context):
        text = f"문서{i}:" + "\n\n" + page_content
        full_doc_text += f"\n\n{text}\n==="

    prompt = f"""
    정보: 
        {full_doc_text}

    위 정보에서 아래 질문에 대한 답을 유추할 수 있고 가장 유사한 문서를 2개 골라서 문서의 번호만 출력해줘

    질문: {question}

    """
    messages = [
        {"role": "user", "content": f"{prompt}"}
    ]

    prompt = pipeline.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # Stop on either the tokenizer's EOS token or the "<end_of_turn>" token.
    terminators = [
        pipeline.tokenizer.eos_token_id,
        pipeline.tokenizer.convert_tokens_to_ids("<end_of_turn>")
    ]

    outputs = pipeline(
        prompt,
        max_new_tokens=2048,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )
    # The pipeline output is sliced by the prompt length to keep only the reply.
    response = outputs[0]["generated_text"][len(prompt):]
    print(f"Answer: 0, {response}\n")

    # Document 0 is always included; add each distinct digit the model named.
    # Bounding by len(context) (not a fixed 5) prevents an IndexError when
    # fewer than five documents were retrieved.
    reranked_idx_list = [0]
    for char in response:
        if char.isdigit() and char != "0":
            idx = int(char)
            if idx not in reranked_idx_list and idx < len(context):
                reranked_idx_list.append(idx)

    final_context = ""
    for idx in reranked_idx_list:
        final_context += f"\n{context[idx]}\n"
    return final_context, full_doc_text

# return: response
def llm_answer(context, question):
    """Generate an answer to ``question`` from ``context`` with the global LLM pipeline.

    Args:
        context: Reference text inserted into the prompt.
        question: The question to answer.

    Returns:
        str: the generated text with the prompt prefix sliced off.
    """
    instruction = f"""
    다음 정보를 바탕으로 질문에 답하세요:
    {context}

    질문: {question}
    
    주어진 질문에만 답변하세요. 문장으로 답변해주세요. 답변할 때 질문의 주어를 써주세요.
    답변:
    """
    tokenizer = pipeline.tokenizer

    # Render the single user turn with the model's chat template.
    chat_prompt = tokenizer.apply_chat_template(
        [{"role": "user", "content": instruction}],
        tokenize=False,
        add_generation_prompt=True,
    )

    # Sampling settings; generation stops on EOS or "<end_of_turn>".
    generation_options = {
        "max_new_tokens": 2048,
        "eos_token_id": [
            tokenizer.eos_token_id,
            tokenizer.convert_tokens_to_ids("<end_of_turn>"),
        ],
        "do_sample": True,
        "temperature": 0.6,
        "top_p": 0.9,
    }
    generated = pipeline(chat_prompt, **generation_options)

    # Drop the leading prompt-length prefix to keep only the reply.
    full_text = generated[0]["generated_text"]
    return full_text[len(chat_prompt):]

In [None]:
# Look up the per-PDF databases by NFC-normalized title.
normalized_keys = {normalize_string(k): v for k, v in pdf_databases.items()}

# Per-question debugging aids: the reranked context handed to the LLM and the
# full candidate text shown to the reranker.
debug_context = {}
debug_full_doc_text = {}

# One record per question; consumed by the submission cell below.
# This list used to share its name with the retriever output, so it was
# overwritten on every iteration and never held the answers.
results = []

# Process every row of the question sheet.
for _, row in tqdm(df.iterrows(), total=len(df), desc="Answering Questions"):
    # Normalize the source name so it matches the normalized DB keys.
    source = normalize_string(row['Source'])
    question = row['Question']
    retriever = normalized_keys[source]['retriever']
    retrieved_docs = retriever.get_relevant_documents(question)

    context, full_doc_text = llm_reranker(retrieved_docs, question)
    # Key by the actual question; the literal key 'question' kept only the
    # last row's values.
    debug_context[question] = context
    debug_full_doc_text[question] = full_doc_text

    print(f"Question: {question}")
    response = llm_answer(context, question)
    print(f"Answer: {response}\n")

    # Store the result.
    results.append({
        "Source": row['Source'],
        "Source_path": row['Source_path'],
        "Question": question,
        "Answer": response
    })

In [None]:
# Load the sample submission file.
submit_df = pd.read_csv("./sample_submission.csv")

# Attach the generated answers (one per row, in question order).
submit_df['Answer'] = [item['Answer'] for item in results]

# Blank answers can break scoring. The model returns strings, never NaN, so
# fillna() alone does not catch them: an empty string is written as an empty
# CSV field and reads back as NaN. Replace both NaN and blank strings.
is_blank = submit_df['Answer'].isna() | (submit_df['Answer'].astype(str).str.strip() == "")
submit_df.loc[is_blank, 'Answer'] = "데이콘"

# Save the result as a CSV file.
submit_df.to_csv("./submission.csv", encoding='UTF-8-sig', index=False)