# 사전수행

## Install

In [5]:
# Install the notebook's runtime dependencies into the Colab environment.
# NOTE(review): no versions are pinned, so a fresh run may pull different releases.
!pip install accelerate
!pip install -i https://pypi.org/simple/ bitsandbytes
!pip install transformers[torch] -U

!pip install datasets
!pip install langchain
!pip install langchain_community
!pip install PyMuPDF
!pip install sentence-transformers
!pip install faiss-gpu
# langchain was already installed above; this line only forces an upgrade.
!pip install --upgrade langchain

Looking in indexes: https://pypi.org/simple/


## Google Drive

In [9]:
# Mount Google Drive at /content/drive; the data directory used below lives under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
%cd drive/MyDrive/재정정보AI검색알고리즘경진대회/data/open
!ls

baseline_submission.csv  sample_submission.csv	test.csv  test_source  train.csv  train_source
[Errno 2] No such file or directory: 'drive/MyDrive/재정정보AI검색알고리즘경진대회/data/open'
/content/drive/MyDrive/재정정보AI검색알고리즘경진대회/data/open
baseline_submission.csv  sample_submission.csv	test.csv  test_source  train.csv  train_source


# Study

## Import


In [13]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import re
import unicodedata
import fitz
from tqdm.auto import tqdm
import torch

## 전반부
# 분할
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
# 임베딩
from langchain.embeddings import HuggingFaceEmbeddings
# 벡터DB
from langchain.vectorstores import FAISS

## 후반부
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    pipeline,
    BitsAndBytesConfig
)
from langchain.llms import HuggingFacePipeline

# # RoBERTa 모델 (사용 모델 : cardiffnlp/twitter-roberta-base-sentiment)
# from transformers import RobertaModel, RobertaTokenizer

## 1. First Half

In [16]:
def normalize_path(path):
    """Return ``path`` converted to Unicode NFC (composed) form."""
    nfc_path = unicodedata.normalize('NFC', path)
    return nfc_path

## 2. 분할
def process_pdf(file_path, min_para_length=50):
    """Extract the text of a PDF and split it into paragraph-level chunks.

    Args:
        file_path: Path of the PDF file to read.
        min_para_length: Paragraphs whose stripped length is below this
            value are discarded.

    Returns:
        A list of ``Document`` objects, one per kept paragraph. An empty
        list is returned when the file cannot be opened (the error is
        printed rather than raised).
    """
    try:
        doc = fitz.open(file_path)
    except Exception as e:
        print(f"Error opening file {file_path}: {e}")
        return []

    # Always release the document handle, even if text extraction fails midway.
    try:
        text = ''.join(page.get_text() for page in doc)
    finally:
        doc.close()

    # A blank line (optionally containing whitespace) separates paragraphs.
    paragraphs = re.split(r'\n\s*\n', text)

    # Drop paragraphs that are too short and wrap the rest as Document objects.
    chunks = []
    for para in paragraphs:
        content = para.strip()
        if len(content) >= min_para_length:
            chunks.append(Document(page_content=content))

    return chunks

## 3. Embedding
# Loaded embedding models, keyed by (model name, device). The model is large,
# so it is built once and reused instead of being re-created for every PDF.
_EMBEDDINGS_CACHE = {}


def create_vector_db(chunks, model_path = "sentence-transformers/LaBSE"):  # multilingual model
    """Embed ``chunks`` and build a FAISS vector store from them.

    Args:
        chunks: Documents to index.
        model_path: Name of the sentence-embedding model to use. The default
            is the multilingual LaBSE model.

    Returns:
        The FAISS store created by ``FAISS.from_documents``.
    """
    # Use the GPU when one is available.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Reuse an already-loaded model for this (model, device) pair if possible.
    cache_key = (model_path, device)
    embeddings = _EMBEDDINGS_CACHE.get(cache_key)
    if embeddings is None:
        model_kwargs = {'device': device}
        encode_kwargs = {'normalize_embeddings': True, 'batch_size': 32}
        embeddings = HuggingFaceEmbeddings(
            model_name=model_path,
            model_kwargs=model_kwargs,
            encode_kwargs=encode_kwargs
        )
        _EMBEDDINGS_CACHE[cache_key] = embeddings

    # Build and return the FAISS store.
    db = FAISS.from_documents(chunks, embedding=embeddings)
    return db

def rag_first_half(df, base_directory):
    """Build a FAISS DB and retriever for every unique PDF referenced in ``df``.

    Args:
        df: Table with a ``Source_path`` column holding PDF paths.
        base_directory: Directory that relative ``Source_path`` values are
            resolved against.

    Returns:
        A dict keyed by PDF file name (without extension); each value is a
        dict with the entries ``'db'`` and ``'retriever'``. PDFs that yield
        no chunks are reported and left out.
    """
    pdf_databases = {}
    unique_paths = df['Source_path'].unique()

    for path in tqdm(unique_paths, desc="Processing financial documents"):
        # Normalize the Unicode form, then resolve to a usable path.
        normalized_path = normalize_path(path)
        if os.path.isabs(normalized_path):
            full_path = normalized_path
        else:
            # normpath already collapses a leading "./". The former
            # lstrip('./') removed *any* run of '.' and '/' characters and
            # therefore corrupted paths such as "../x.pdf" or ".hidden/x.pdf".
            full_path = os.path.normpath(os.path.join(base_directory, normalized_path))

        pdf_title = os.path.splitext(os.path.basename(full_path))[0]

        # Split into chunks; skip documents that produced nothing.
        chunks = process_pdf(full_path)
        if not chunks:
            print(f"Skipping {pdf_title} due to processing error.")
            continue

        db = create_vector_db(chunks)

        # MMR retriever configured with k=5 and fetch_k=20.
        retriever = db.as_retriever(search_type="mmr", search_kwargs={'k': 5, 'fetch_k': 20})

        pdf_databases[pdf_title] = {
                'db': db,
                'retriever': retriever
        }
    return pdf_databases

# FAISS 인덱스 저장 및 로드 함수
def save_faiss_index(db, path):
    """Write ``db`` to ``path`` by delegating to ``db.save_local``."""
    db.save_local(path)
    return None

def load_faiss_index(path, embeddings, allow_dangerous_deserialization=False):
    """Load a FAISS index previously written with ``save_local``.

    Args:
        path: Folder the index was saved to.
        embeddings: Embedding object to attach to the loaded store.
        allow_dangerous_deserialization: Opt-in flag forwarded to
            ``FAISS.load_local``. The saved index includes a pickle file, and
            unpickling can execute arbitrary code, so enable this only for
            index folders you created yourself.

    Returns:
        The store returned by ``FAISS.load_local``.
    """
    if allow_dangerous_deserialization:
        return FAISS.load_local(
            path, embeddings, allow_dangerous_deserialization=True
        )
    # Default call is unchanged, so callers that never opted in behave as before.
    return FAISS.load_local(path, embeddings)

# Local directory that holds the external knowledge (PDF) files.
base_directory = './'
# Questions to be answered.
df = pd.read_csv('./test.csv')
# Build one vector DB and retriever per referenced PDF.
pdf_databases = rag_first_half(df=df, base_directory=base_directory)

Processing financial documents:   0%|          | 0/9 [00:00<?, ?it/s]

modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.22k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/5.22M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.62M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 12.00 MiB. GPU 

## 2. Second Half

In [None]:

def setup_llm_pipeline(model_id="prajjwal1/bert-mini"):
    """Load a 4-bit quantized causal LM and wrap it as a LangChain LLM.

    Args:
        model_id: Hugging Face model id to load. Defaults to the model this
            notebook originally hard-coded.
            NOTE(review): the default is a BERT checkpoint; confirm it is
            really intended for text generation.

    Returns:
        A ``HuggingFacePipeline`` wrapping a ``text-generation`` pipeline.
    """
    # 4-bit quantization settings.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )

    # Never commit access tokens to a notebook: read the token from the
    # environment instead. ``None`` means anonymous access. The token that
    # used to be hard-coded here should be revoked.
    access_token = os.environ.get("HF_TOKEN")

    # Load the tokenizer and disable the default system prompt if supported.
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token)
    if hasattr(tokenizer, 'use_default_system_prompt'):
        tokenizer.use_default_system_prompt = False

    # Load the model with the quantization settings applied.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
        token=access_token)

    # eos_token_id may be None for tokenizers that define no EOS token;
    # fall back to the tokenizer's own pad token id in that case.
    pad_token_id = tokenizer.eos_token_id
    if pad_token_id is None:
        pad_token_id = getattr(tokenizer, 'pad_token_id', None)

    # Build the text-generation pipeline.
    text_generation_pipeline = pipeline(
        model=model,
        tokenizer=tokenizer,
        task="text-generation",
        temperature=0.2,
        return_full_text=False,
        max_new_tokens=1024,
        pad_token_id=pad_token_id
    )

    hf = HuggingFacePipeline(pipeline=text_generation_pipeline)

    return hf

# Build the LLM pipeline used by the question-answering loop below.
llm_pipeline = setup_llm_pipeline()


In [None]:
def normalize_string(s):
    """Return ``s`` converted to Unicode NFC (composed) form."""
    composed = unicodedata.normalize('NFC', s)
    return composed

def format_docs(docs):
    """Concatenate the ``page_content`` of ``docs``, each followed by one space."""
    return "".join(doc.page_content + ' ' for doc in docs)

# Collected answers, in the same order as the rows of ``df``.
results = []

# The NFC-normalized lookup does not depend on the row, so build it once
# instead of rebuilding the whole dict for every question.
normalized_keys = {normalize_string(k): v for k, v in pdf_databases.items()}

# Answer each question using the retriever of its source document.
for _, row in tqdm(df.iterrows(), total=len(df), desc="Answering Questions"):
    source = normalize_string(row['Source'])
    question = row['Question']

    # Look up the retriever by the normalized source name.
    retriever_instance = normalized_keys[source]['retriever']

    # Retrieve supporting passages and flatten them into one context string.
    retrieved_docs = retriever_instance.get_relevant_documents(question)
    context = format_docs(retrieved_docs)

    # Prompt combining the retrieved context and the question.
    prompt = f"""
    참고문서들: {context.strip()}
    질문: {question}
    답변을 한 문장으로 요약:
    """

    # Generate the answer; keep only the text before the model starts
    # emitting another "질문" (question) section.
    print(f"Question: {question}")
    response = llm_pipeline(prompt)
    first_line_response = response.split("질문")[0].strip()

    print(f"Answer: {first_line_response}\n")

    # Store the result.
    results.append({
        "Source": row['Source'],
        "Source_path": row['Source_path'],
        "Question": question,
        "Answer": first_line_response
    })

## File Save

In [None]:
# Load the sample submission file.
submit_df = pd.read_csv("./sample_submission.csv")

# Add the generated answers to the submission frame.
submit_df['Answer'] = [item['Answer'] for item in results]
# Answers are strings, so an empty generation is '' rather than NaN and a
# plain fillna never catches it; it would be written as an empty CSV field
# and read back as NaN. Convert blank answers to NaN first, then fill the
# placeholder, so no empty value reaches the scorer.
submit_df['Answer'] = (
    submit_df['Answer']
    .replace(r'^\s*$', np.nan, regex=True)
    .fillna("데이콘")
)

# Save the result as a CSV file.
submit_df.to_csv("./submission_0815.csv", encoding='UTF-8-sig', index=False)