# BKMS2 Hands-on #4: Developing a Q&A system based on RAG

### Colab Setup

In [None]:
!unzip /content/data.zip -d /content

In [39]:
!pip install -r /content/data/requirements.txt > /dev/null

In [41]:
# Alternative for local runs: read the key from a .env file instead of prompting.
# from dotenv import load_dotenv
# import os, warnings
#
# load_dotenv('./data/.env')
# OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# warnings.filterwarnings("ignore")

from getpass import getpass

# Prompt for the key without echoing it. Strip surrounding whitespace, which
# is easy to pick up when pasting, and fail right here on an empty entry
# rather than at the first API call further down.
OPENAI_API_KEY = getpass("Please enter your OpenAI API key: ").strip()
if not OPENAI_API_KEY:
    raise ValueError("OpenAI API key must not be empty.")

Please enter your OpenAI API key: ··········


## 1. Basic Implementation

In [42]:
from openai import OpenAI
import chromadb
import chromadb.utils.embedding_functions as embedding_functions

### 1-1. Load and Split the Document

In [43]:
def loadText(file_path):
    """Return the entire content of the file at `file_path`, decoded as UTF-8."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()


def splitText(text, chunk_size, overlap_size):
    """Split `text` into consecutive chunks that overlap each other.

    Each chunk starts `chunk_size - overlap_size` characters after the
    previous one and holds at most `chunk_size` characters. The last chunk
    may be shorter, and may lie entirely inside the previous chunk.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk.
        overlap_size: Number of characters shared by neighbouring chunks.

    Returns:
        A list of chunk strings in document order (empty for empty `text`).

    Raises:
        ValueError: If `overlap_size` is not smaller than `chunk_size`.
    """
    step = chunk_size - overlap_size
    if step <= 0:
        # A non-positive step would never advance `start`, so the loop
        # below would run forever.
        raise ValueError(
            f"overlap_size ({overlap_size}) must be smaller than chunk_size ({chunk_size})"
        )

    chunks = []
    start = 0

    while start < len(text):
        end = min(start + chunk_size, len(text))
        chunks.append(text[start:end])
        start += step

    return chunks

### 1-2. Store in a Vector Database

In [46]:
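# Optional reset: uncomment to drop the "gsds" collection before re-adding chunks.
# Requires `client`, which is created in a later cell.
# client.delete_collection("gsds")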

In [45]:
# Location of the plain-text notice; point this at your own .txt file if needed.
file_path = "./data/2024학년도 2학기 데이터사이언스대학원 석사학위 논문심사 계획 공고.txt"

# Read the whole file, then cut it into overlapping chunks.
document = loadText(file_path)
chunks = splitText(text=document, chunk_size=300, overlap_size=50)

# Report how many chunks were produced.
print(len(chunks))

17


In [47]:
# Initialize a ChromaDB client whose data is persisted under ./vectordb/
client = chromadb.PersistentClient(path="./vectordb/")

# Embedding function the collection uses for both documents and queries
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=OPENAI_API_KEY,
    model_name="text-embedding-3-small",
)

collection = client.get_or_create_collection("gsds", embedding_function=openai_ef)

# Upsert instead of add: the chunk ids are fixed ("chunk_0", "chunk_1", ...)
# and the collection is persisted, so re-running this cell should refresh
# the stored chunks rather than try to insert ids that already exist.
collection.upsert(
    ids=[f"chunk_{i}" for i in range(len(chunks))],
    documents=chunks,
    # Every chunk comes from the same file; store only its base name.
    metadatas=[{"reference": file_path.split('/')[-1]} for _ in chunks],
)

### 1-3. Retrieval, Augmentation and Generation (RAG)

In [48]:
# Example query
# query = input('Question:')
query = "학위 논문 제출 기한은 언제까지인가요?"

# Retrieve top 3 relevant passages
results = collection.query(query_texts=query, n_results=3)

# Number the retrieved passages from 1 and label each with its source file
passages = "\n".join(
    f"Passage {rank} (data_source: {metadata['reference']}):\n{text.strip()}\n"
    for rank, (text, metadata) in enumerate(
        zip(results['documents'][0], results['metadatas'][0]), start=1
    )
)

# Assemble the final prompt: question, retrieved passages, then instructions
prompt = f"""
# Question: {query}

# Relevant Passages:
{passages}

# Based on the passages above, generate an answer to the question. Explicitly mention the 'data_source'.
ex) (출처: gsds_notification.pdf)
"""
# print(prompt)

In [49]:
# Display the raw query response to inspect the returned ids, distances,
# metadatas and documents for the top matches.
results

{'ids': [['chunk_7', 'chunk_8', 'chunk_15']],
 'distances': [[0.7706437870639522, 0.8314104845748295, 0.8778864412731177]],
 'metadatas': [[{'reference': '2024학년도 2학기 데이터사이언스대학원 석사학위 논문심사 계획 공고.txt'},
   {'reference': '2024학년도 2학기 데이터사이언스대학원 석사학위 논문심사 계획 공고.txt'},
   {'reference': '2024학년도 2학기 데이터사이언스대학원 석사학위 논문심사 계획 공고.txt'}]],
 'embeddings': None,
 'documents': [['하여 직접 행정실 및 심사 위원께 각각 메일로 제출\n     - 최종심사 및 구술고사: 2024. 12. 13. (금)\n     - 종결: 2025. 1. 10.(금)\n\n3. 학위논문 제출(미제출시 학위수여 대상에서 제외됨)\n 가. 학위논문심사에 합격한 자는 정해진 기간 내에 학위논문을 제출해야 함\n   * 논문 제목은 최종 논문심사요지의 제목과 반드시 일치하여야 함\n  나. 학위논문 PDF 원문 파일 제출\n    1) 제출기간 : 2025. 1. 17.(금) ∼ 2. 3.(월) 24:00 <18일간> \n    2) 책자논문 제출 ',
   '1. 17.(금) ∼ 2. 3.(월) 24:00 <18일간> \n    2) 책자논문 제출 없이 원문파일만 제출 \n    3) 마감 이후 제출/수정 불가\n    4) 논문을 제출하지 않을 경우 학위를 받을 수 없음 \n    5) 논문제출 방법, 학생 안내자료 등은 별도 공지 예정 \n\n4. 심사용 논문 작성요령<학위수여규정 제10조>\n 가

In [50]:
# Create the OpenAI client with the key entered during setup.
llm = OpenAI(api_key=OPENAI_API_KEY)

# Send the assembled prompt as a single user message.
response = llm.chat.completions.create(
    messages=[dict(role="user", content=prompt)],
    model='gpt-4o-mini',
    temperature=0,
)

# Show the text of the first returned choice.
print('Answer:', response.choices[0].message.content)

Answer: 학위 논문 제출 기한은 2025년 1월 17일(금)부터 2월 3일(월) 24:00까지입니다. 이 기간 동안 학위논문 PDF 원문 파일을 제출해야 하며, 제출하지 않을 경우 학위를 받을 수 없습니다. (출처: 2024학년도 2학기 데이터사이언스대학원 석사학위 논문 심사 계획 공고.txt)


### 1-4. Various Distance Metrics for Semantic Search

In [28]:
distance_metric = ["l2", "cosine", "ip"] # default: l2
query = "학위 논문 제출 기한은 언제까지인가요?"

# The ids and metadata are identical for every metric, so build them once.
chunk_ids = [f"chunk_{i}" for i in range(len(chunks))]
chunk_metadatas = [{"reference": file_path.split('/')[-1]} for _ in chunks]

for metric in distance_metric:
  collection_name = f"gsds-{metric}"

  # Use loop-local names so the main `collection` and `results` objects are
  # not rebound to a temporary collection that is deleted a few lines later.
  metric_collection = client.get_or_create_collection(
      name=collection_name,
      embedding_function=openai_ef,
      metadata={"hnsw:space": metric}
      )

  try:
    metric_collection.add(
      ids=chunk_ids,
      documents=chunks,
      metadatas=chunk_metadatas
      )

    metric_results = metric_collection.query(
      query_texts=query,
      n_results=3
      )
  finally:
    # Always remove the temporary collection, even if add/query failed,
    # so a later re-run starts from a clean state.
    client.delete_collection(collection_name)

  print(f'{metric}:', metric_results['ids'][0], metric_results['distances'][0])

l2: ['chunk_7', 'chunk_8', 'chunk_15'] [0.7707362031473365, 0.8313149965094855, 0.8807400922146627]
cosine: ['chunk_7', 'chunk_8', 'chunk_15'] [0.38536804035080297, 0.41565745981877245, 0.43931590873398574]
ip: ['chunk_7', 'chunk_8', 'chunk_15'] [0.3840508969754728, 0.41503858267255234, 0.4401669916432607]


## 2. Implementing RAG With Langchain

In [51]:
import nltk
# Download the NLTK 'punkt' tokenizer data.
# NOTE(review): nothing visible in this notebook calls nltk directly;
# presumably a downstream dependency needs it — confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [52]:
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma

from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_core.vectorstores import InMemoryVectorStore
from langchain_core.prompts import PromptTemplate
from langchain_core.documents import Document

from langchain_community.document_loaders import TextLoader, PyPDFLoader, AsyncHtmlLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_transformers import Html2TextTransformer

from bs4 import BeautifulSoup
import io



In [53]:
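# Optional reset: uncomment to drop the existing Chroma collection before rebuilding it.
# Requires `vector_store`, which is created in section 2-1 below.
# vector_store.delete_collection()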

### 2-1. Load and Store the data from URL

In [54]:
# Fetch the recruitment page and convert its HTML to plain text.
urls = ["https://gsds.snu.ac.kr/academics/recruitment/"]
html = AsyncHtmlLoader(urls).load()

html2text = Html2TextTransformer()
docs_transformed = html2text.transform_documents(html)

# Split the text into chunks (size 300, overlap 60).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=60)
docs_html = text_splitter.split_documents(docs_transformed)

# Embed the chunks and index them in a Chroma vector store.
vector_store = Chroma.from_documents(
    documents=docs_html,
    embedding=OpenAIEmbeddings(model='text-embedding-3-large', openai_api_key=OPENAI_API_KEY),
)

Fetching pages: 100%|##########| 1/1 [00:04<00:00,  4.24s/it]


### 2-2. Load and Store the data from TXT

In [55]:
# Load the notice text. Decode explicitly as UTF-8 so reading the Korean
# content does not depend on the platform's default encoding.
documents = TextLoader(
    "./data/2024학년도 2학기 데이터사이언스대학원 석사학위 논문심사 계획 공고.txt",
    encoding="utf-8",
).load()
# documents = PyPDFLoader("./data/2024학년도 2학기 데이터사이언스대학원 석사학위 논문심사 계획 공고.pdf").load()

# Reuse the splitter configured for the web page so both sources are chunked alike.
docs = text_splitter.split_documents(documents)

# Add the chunks to the existing store; the cell output lists the generated ids.
vector_store.add_documents(documents = docs)

['48b95005-81e0-4f07-a3aa-19ee95a91eed',
 '9e36c21a-c697-459b-9f46-fc1634ce39fd',
 '1b2ce8b6-bb9f-4dc5-a355-e84fcc885441',
 'd9241ee2-3093-4696-bb2a-041f6e43e3ff',
 '1922a6b1-d73d-47a3-bdf1-8a4f35d62d71',
 'd12da2cd-866b-4eb6-8549-7f064e20070a',
 '636b8b06-9807-4ae4-87cc-d1be5bb9e955',
 '3d535db1-0646-4baf-a949-817f79cd69e1',
 '9a6278c4-0c96-4148-8b53-1f6291bacaa6',
 'b8527af0-4420-4129-866d-4d11049525fb',
 'c92864a1-14cd-4ead-a52a-25e2341f193a',
 '5c14ce21-5b0c-4efa-b485-51ce7c5b86f7',
 '01e771d7-c496-4bfa-889c-58162181a34b',
 '99f96b59-e93e-455d-bde3-5386b8b32664',
 '2ce545d3-4c79-445c-9adc-39270a2807c6',
 '6ea65fdd-4fad-44ce-bd77-5bf57a85b30d',
 'ea106867-2c52-4a3b-95fb-d7e9195a3e6f',
 '88bc0ca2-7c09-40c3-b67f-30c8e4361e46',
 'db190982-af23-4a21-baf2-4edef7116f07',
 'f25645c8-9bca-4328-b0b2-8dd30b3136cc',
 '40e392ac-2fea-4d2c-a6fa-13c22d36de24']

### 2-3. Retrieval, Augmentation and Generation (RAG)

In [56]:
# query = input('Question: ')
query = '박사 과정 신입학 모집 인원은 몇 명이야?'

# Retrieve the 3 chunks most similar to the question.
retrieved_docs = vector_store.similarity_search(query, k=3)

# Number the passages from 1 and label each with the source it was loaded from.
passages = "\n".join(
    f"Passage {i} (data_source: {doc.metadata['source']}):\n{doc.page_content}\n"
    for i, doc in enumerate(retrieved_docs, 1)
)

# Plain (non-f) string: {query} and {passages} stay as placeholders that
# PromptTemplate fills in at invoke time. Interpolating them here with an
# f-string would make PromptTemplate parse the retrieved text itself, which
# breaks as soon as a passage or question contains '{' or '}'.
prompt_template = """
# Question: {query}

# Relevant Passages:
{passages}

# Based on the passages above, generate an answer to the question. Explicitly mention the 'data_source'.
ex) (출처: gsds_notification.pdf)
"""

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, openai_api_key=OPENAI_API_KEY)
prompt = PromptTemplate(template=prompt_template, input_variables=["query", "passages"])

# Pipe the filled-in prompt into the chat model.
chain = prompt | llm
response = chain.invoke({"query": query, "passages": passages})

print('Answer:', response.content)

Answer: 박사 과정 신입학 모집 인원은 23명입니다 (출처: https://gsds.snu.ac.kr/academics/recruitment/).


### 2-4. Run the RAG Chain

In [57]:
def execute_chain(k=5):
  """Run an interactive question-answering loop over `vector_store`.

  Reads questions from standard input until the user types 'exit'. For each
  question it retrieves the `k` most similar chunks, places them in the
  prompt and prints the model's answer. Errors raised while retrieving or
  generating are printed and the loop continues with the next question.

  Args:
    k: Number of passages to retrieve per question (default 5).
  """
  # Plain (non-f) string: the placeholders are filled by PromptTemplate at
  # invoke time, so braces inside a question or passage cannot be mistaken
  # for template variables.
  prompt_template = """
# Question: {query}

# Relevant Passages:
{passages}

# Based on the passages above, generate an answer to the question. Explicitly mention the 'data_source'.
ex) (출처: gsds_notification.pdf)
"""
  # Build the model and chain once; they do not change between questions.
  llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, openai_api_key=OPENAI_API_KEY)
  prompt = PromptTemplate(template=prompt_template, input_variables=["query", "passages"])
  chain = prompt | llm

  print("Type 'exit' to quit")
  while True:
    query = input("Enter a prompt: ")
    if query.strip().lower() == 'exit':
      print('Exiting...')
      break

    if not query.strip():
      # Nothing to search for; ask again instead of sending an empty query.
      continue

    try:
      # Retrieval calls the embedding API too, so it belongs inside the
      # try block just like generation.
      retrieved_docs = vector_store.similarity_search(query, k=k)
      # Number passages from 1, matching the other RAG cells in this notebook.
      passages = "\n".join(
        f"Passage {i} (data_source: {doc.metadata['source']}):\n{doc.page_content}\n"
        for i, doc in enumerate(retrieved_docs, 1)
      )
      response = chain.invoke({"query": query, "passages": passages})
      print('Answer:', response.content)

    except Exception as e:
      print(e)

execute_chain()

Type 'exit' to quit
Enter a prompt: 학위 논문 제출 마감이 언제야?
Answer: 학위 논문 제출 마감은 2025년 2월 3일(월) 24:00입니다. 이 기간 내에 학위논문을 제출해야 하며, 마감 이후에는 제출이나 수정이 불가합니다. 제출하지 않을 경우 학위를 받을 수 없습니다. (출처: ./data/2024학년도 2학기 데이터사이언스대학원 석사학위 논문심사 계획 공고.txt)
Enter a prompt: '박사 과정 신입학 모집 인원은 몇 명이야?'
Answer: 박사 과정 신입학 모집 인원은 23명입니다. (출처: https://gsds.snu.ac.kr/academics/recruitment/)
Enter a prompt: exit
Exiting...
