In [19]:
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores.faiss import FAISS
from langchain.schema import Document
import fitz  # PyMuPDF
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import HumanMessage
from langserve import add_routes
from llm2 import llm as model
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm  # tqdm 라이브러리 추가

In [20]:
# 평가 지표 계산 함수
def calculate_bleu(reference: str, generated: str):
    reference_tokens = reference.split()  # 참고 문장 토큰화
    generated_tokens = generated.split()  # 생성된 문장 토큰화
    return sentence_bleu([reference_tokens], generated_tokens)

def calculate_meteor(reference: str, generated: str):
    # 토큰화하여 리스트로 변환
    reference_tokens = reference.split()  # 참고 문장을 단어 리스트로 변환
    generated_tokens = generated.split()  # 생성된 문장을 단어 리스트로 변환
    
    # METEOR 점수 계산
    return meteor_score([reference_tokens], generated_tokens)  # 토큰화된 리스트 전달

def calculate_rouge(reference: str, generated: str):
    # ROUGE 점수 계산
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)  # 리스트로 인수 전달
    scores = scorer.score(reference, generated)
    
    # BLEU 점수 계산 (SmoothingFunction을 사용하여 0 counts 문제 해결)
    reference_tokens = reference.split()  # 참고 문장 토큰화
    generated_tokens = generated.split()  # 생성된 문장 토큰화
    smoothing_function = SmoothingFunction().method1  # Smoothing 방법 선택

    return {
        "rouge_1": scores["rouge1"].fmeasure,
        "rouge_2": scores["rouge2"].fmeasure,
        "rouge_L": scores["rougeL"].fmeasure,
    }

In [21]:
def query_llm(context: str, question: str):
    RAG_PROMPT_TEMPLATE = """
    다음 정보를 바탕으로 질문에 답하세요:
    {context}

    질문:
    {question}

    질문의 핵심만 파악하여 간결하게 1-2문장으로 답변하고, 불필요한 설명은 피하며 동서울대학교와 관련된 정보만 제공하세요.

    답변:
    """
    # RAG_PROMPT_TEMPLATE을 사용하여 메시지 생성
    prompt = ChatPromptTemplate.from_template(RAG_PROMPT_TEMPLATE)
    formatted_prompt = prompt.format(context=context, question=question)
    message = HumanMessage(content=formatted_prompt)
    response = model([message])  # LLM 모델 호출 (리스트 형태로 전달)

    # 응답 내용에서 불필요한 접두어 제거
    response_content = response.content.strip()  # 공백 제거
    response_content = response_content.replace("Human:", "").replace("human:", "").strip()  # "Human:" 또는 "human:" 제거

    return response_content  # 응답 내용 반환

In [22]:
def perform_rag(question: str):
    # PDF에서 데이터 읽기
    doc = fitz.open("./QADataset_merged.pdf")
    text = ""
    for page in doc:
        text += page.get_text()

    # 텍스트 분할
    splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=50)
    chunks = [Document(page_content=t) for t in splitter.split_text(text)]

    # 임베딩 설정
    model_kwargs = {"device": "cuda"}
    encode_kwargs = {"normalize_embeddings": True}
    embeddings = HuggingFaceEmbeddings(
        model_name="intfloat/multilingual-e5-large",
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs,
    )

    # FAISS 벡터 데이터베이스 생성
    db = FAISS.from_documents(chunks, embedding=embeddings)
    retriever = db.as_retriever(search_kwargs={"k": 2})

    # 컨텍스트 추출
    context = retriever.get_relevant_documents(question)
    #print("Retrieved Context: ", "\n\n".join(doc.page_content for doc in context))  # context 확인
    return "\n\n".join(doc.page_content for doc in context)

In [23]:
def evaluate_model_responses(csv_file: str):
    try:
        # Try reading the CSV file with utf-8-sig encoding
        df = pd.read_csv(csv_file, encoding='utf-8-sig')  # Or try 'euc-kr' or 'ISO-8859-1'
    except UnicodeDecodeError:
        print(f"Failed to read {csv_file} with utf-8-sig encoding. Trying other encodings.")
        df = pd.read_csv(csv_file, encoding='euc-kr')  # Try with 'euc-kr' encoding
    except Exception as e:
        print(f"An error occurred: {e}")
        return

    evaluation_results = []
    
    # tqdm을 사용하여 진행 상태 표시
    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Evaluating responses"):
        question = row[0]  # 첫 번째 열: 질문
        reference = row[1]  # 두 번째 열: 정답(label)

        # RAG 수행 후 답변 생성
        context = perform_rag(question)
        generated_response = query_llm(context, question)  # LLM에서 생성된 응답

        # BLEU, METEOR, ROUGE 계산
        bleu_score = calculate_bleu(reference, generated_response)
        meteor_score_value = calculate_meteor(reference, generated_response)
        rouge_scores = calculate_rouge(reference, generated_response)

        evaluation_results.append({
            "question": question,
            "reference": reference,
            "generated": generated_response,
            "bleu": bleu_score,
            "meteor": meteor_score_value,
            "rouge_1": rouge_scores["rouge_1"],
            "rouge_2": rouge_scores["rouge_2"],
            "rouge_L": rouge_scores["rouge_L"]
        })
    
    results_df = pd.DataFrame(evaluation_results)
    results_df.to_csv("evaluation_results.csv", index=False)

    return results_df

In [None]:
# Main execution
if __name__ == "__main__":
    # Run evaluation
    evaluate_model_responses("evaluation.csv")

Evaluating responses:   0%|          | 0/421 [00:00<?, ?it/s]

  question = row[0]  # 첫 번째 열: 질문
  reference = row[1]  # 두 번째 열: 정답(label)
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
  question = row[0]  # 첫 번째 열: 질문
  reference = row[1]  # 두 번째 열: 정답(label)
  question = row[0]  # 첫 번째 열: 질문
  reference = row[1]  # 두 번째 열: 정답(label)
  question = row[0]  # 첫 번째 열: 질문
  reference = row[1]  # 두 번째 열: 정답(label)
The hypothesis contai