# RAG 시스템 보기


In [1]:
import warnings
from retrieval import load_vectorstore_retriever_embeddings, rag
from model import llm_load
import json

warnings.filterwarnings("ignore")

# 1. 벡터스토어 로드
vectorstore, retriever, embeddings = load_vectorstore_retriever_embeddings("RAG/vectorDB")

# 2. LLM 한 번만 로드 (재사용)
llm = llm_load()

  from .autonotebook import tqdm as notebook_tqdm


벡터스토어 로드 중
검색기 로드 중
검색기 로드 완료


Loading checkpoint shards: 100%|██████████| 5/5 [00:46<00:00,  9.37s/it]


In [2]:
#3. test 데이터셋 가져오기
with open("popqa_dataset/qa_dataset.json", "r") as f:
    pop_qa = json.load(f)

with open("nq_dataset/qa_dataset_origin.json", "r") as f:
    nq_qa = json.load(f)

### nq_dataset popqa랑 형식 맞추기

In [3]:
from set_data import preprocess_nq_answers

nq_qa = preprocess_nq_answers(nq_qa)

### 테스트 데이터셋 랜덤 추출

In [6]:
from set_data import extract_random_qa

pop_qa_sampled = extract_random_qa(pop_qa, num_qa=260)
nq_qa_sampled = extract_random_qa(nq_qa, num_qa=260)

ImportError: cannot import name 'extract_random_qa' from 'set_data' (/home/sehan/workspace/Knowledge-Conflicts/set_data.py)

In [None]:
pop_qa_sampled[:3]

[{'ids': '28',
  'question': "What is Thomas McMurtry's occupation?",
  'answers': '["test pilot"]'},
 {'ids': '95',
  'question': "What is Ivo Perilli's occupation?",
  'answers': '["screenwriter", "scenarist", "writer", "screen writer", "script writer", "scriptwriter"]'},
 {'ids': '135',
  'question': "What is Pierre Pansu's occupation?",
  'answers': '["mathematician"]'},
 {'ids': '194',
  'question': "What is Ron Nyswaner's occupation?",
  'answers': '["film director", "movie director", "director", "motion picture director", "screenwriter", "scenarist", "writer", "screen writer", "script writer", "scriptwriter"]'},
 {'ids': '209',
  'question': "What is Adele Capell, Countess of Essex's occupation?",
  'answers': '["socialite", "prominent person"]'}]

# Test with Base RAG

In [None]:
# 3. RAG 실행
import os
from tqdm import tqdm

# 배치 설정 (수동으로 변경)

output_file = f"output_with_base_rag.jsonl"  # JSONL 형식


for item in tqdm(test_data[:], desc="RAG 처리중"):
    question = item["question"]
    rag_output = rag(vectorstore, question, llm)
    answer = item["answers"]
    
    result = {
        "Question": question,
        "Answer": rag_output['answer'],
        "Ground_Truth": answer
    }
    
    # 한 줄씩 바로 저장 (append 모드)
    with open(output_file, "a", encoding="utf-8") as f:
        f.write(json.dumps(result, ensure_ascii=False) + "\n")
    
print(f"저장 완료: {output_file}")

RAG 처리중:   0%|          | 0/14267 [00:00<?, ?it/s]

RAG 처리중:   2%|▏         | 348/14267 [11:06<7:07:16,  1.84s/it]

### 평가하기

In [None]:
#{"Question": "In what city was Damian Pettigrew born?", "Answer": " I don't know. There is no information about Damian Pettigrew in the provided context.", "Ground_Truth": "[\"Quebec\", \"Québec\", \"QC\", \"Province of Quebec\", \"Quebec, Canada\", \"Quebec Province\"]"}
#{"Question": "In what city was Wayne G. Hammond born?", "Answer": " Wayne G. Hammond was born in Miami, Florida.", "Ground_Truth": "[\"Cleveland\", \"Cleveland, Ohio\", \"Cleveland, OH\"]"}

In [None]:
correct = 0
total = 0

with open("output_with_base_rag.jsonl", "r", encoding="utf-8") as f:
    for idx,line in enumerate(f):
        data = json.loads(line)  # JSONL 파싱 필요!
        answer = data["Answer"].lower()  # 소문자로 통일
        gold_answers = json.loads(data["Ground_Truth"])  # 문자열을 리스트로 변환

        print(f"Answer: {answer}")
        print(f"Ground Truth: {gold_answers}")
        
        # Ground Truth 중 하나라도 Answer에 포함되면 정답
        is_correct = any(gold.lower() in answer for gold in gold_answers)
        
        if is_correct:
            correct += 1
        total += 1

print(f"정확도: {correct}/{total} = {correct/total*100:.2f}%")

NameError: name 'json' is not defined

In [5]:
from retrieval import test_queries

test_queries(vectorstore, ["What is Kathy Saltzman's occupation?"],10,False,True)

Query 1: What is Kathy Saltzman's occupation?

[Result 1]
Title: Josh Saltzman
URL: https://en.wikipedia.org/wiki?curid=31744718
Content preview: Josh Saltzman is a Canadian Comedy Award winning comedian, writer and director from Toronto, Ontario who currently resides in Los Angeles. He was the head writer of DHX Media's new "Inspector Gadget" series produced for Teletoon in Canada (seen on Netflix in the United States), and has written for m...
--------------------------------------------------------------------------------

[Result 2]
Title: Katherine Saltzberg
URL: https://en.wikipedia.org/wiki?curid=41660825
Content preview: Katherine Saltzberg (née Maisnik) is an American actress, singer, and comic. She is best known for starring as the showbiz-talented 16-year-old daughter of Brian Dennehy's character in the ABC sitcom, "Star of the Family".
Theater.
In 2009, Saltzberg wrote and performed the one woman show, "Los Ange...
-----------------------------------------------------------