# BKMS2 Hands-on #4: Developing a Q&A system based on RAG

### Colab Setup

In [None]:
!unzip /content/data.zip -d /content

In [None]:
!pip install -r /content/data/requirements.txt # > /dev/null

In [None]:
import nltk

# Fetch the 'punkt' resource through NLTK's downloader.
# NOTE(review): nothing visible in this notebook calls nltk directly, so this is
# presumably needed by one of the installed dependencies — confirm before removing.
nltk.download('punkt')

In [None]:
from getpass import getpass

# Ask for the API key interactively so it is never hard-coded in the notebook;
# getpass reads the value without echoing it.
OPENAI_API_KEY = getpass("Please enter your OpenAI API key: ")

In [None]:
# from dotenv import load_dotenv
# import os, warnings

# load_dotenv('./data/.env')
# OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# warnings.filterwarnings("ignore")

## 1. Basic Implementation

In [None]:
from openai import OpenAI
import chromadb
import chromadb.utils.embedding_functions as embedding_functions

### 1-1. Load and Split the Document

In [None]:
def loadText(file_path):
    """Return the full contents of the UTF-8 text file at *file_path*."""
    with open(file_path, encoding='utf-8') as handle:
        return handle.read()


def splitText(text, chunk_size, overlap_size):
    """Split *text* into fixed-size character chunks that overlap their neighbours.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap_size: Number of characters each chunk shares with the previous
            one; must satisfy ``0 <= overlap_size < chunk_size``.

    Returns:
        A list of chunks in document order. An empty *text* yields ``[]``.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap_size`` is
            outside ``[0, chunk_size)``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # An overlap as large as the chunk would make the window stop advancing
    # (or move backwards), which previously resulted in an endless loop.
    if not 0 <= overlap_size < chunk_size:
        raise ValueError("overlap_size must satisfy 0 <= overlap_size < chunk_size")

    step = chunk_size - overlap_size
    text_length = len(text)
    chunks = []
    start = 0

    while start < text_length:
        end = min(start + chunk_size, text_length)
        chunks.append(text[start:end])

        # Once a chunk reaches the end of the text, any further window would
        # only repeat a suffix of this chunk, so stop here.
        if end == text_length:
            break

        start += step

    return chunks

### 1-2. Store in a Vector Database

In [None]:
# Plain-text notice that the basic pipeline indexes.
file_path = "./data/2024학년도 2학기 데이터사이언스대학원 석사학위 논문심사 계획 공고.txt"

# Read the whole notice, then cut it into 300-character windows that share
# 50 characters with their neighbour.
document = loadText(file_path)
chunks = splitText(document, overlap_size=50, chunk_size=300)

# Report how many chunks were produced.
print(len(chunks))

In [None]:
# Initialize ChromaDB client; the index is stored on disk under ./vectordb/
client = chromadb.PersistentClient(path="./vectordb/")

# Embedding function the collection uses for both stored documents and queries
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=OPENAI_API_KEY,
    model_name="text-embedding-3-small",
)

collection = client.get_or_create_collection("gsds", embedding_function=openai_ef)

# Use upsert instead of add: the collection persists across runs and the ids
# are deterministic, and add does not overwrite records whose ids already
# exist. Re-running this cell after changing the file or the chunking would
# therefore leave stale text in the index.
# NOTE: ids beyond the current chunk count from an earlier run are not removed;
# call client.delete_collection("gsds") for a full reset.
collection.upsert(
    ids=[f"chunk_{i}" for i, _ in enumerate(chunks)],
    documents=chunks,
    metadatas=[{"reference": file_path.split('/')[-1]} for _ in chunks],
)

In [None]:
# client.delete_collection("gsds")

### 1-3. Retrieval, Augmentation and Generation (RAG)

In [None]:
# Question to answer (swap in the input() line to ask interactively)
# query = input('Question:')
query = "학위 논문 제출 기한은 언제까지인가요?"

# Fetch the three stored chunks closest to the question
results = collection.query(
    n_results=3,
    query_texts=query,
)

# Number each hit from 1 and tag it with the file it came from
passages = "\n".join(
    f"Passage {rank} (data_source: {meta['reference']}):\n{text.strip()}\n"
    for rank, (text, meta) in enumerate(
        zip(results['documents'][0], results['metadatas'][0]), start=1
    )
)

# Assemble the final prompt: question, retrieved evidence, then the instruction
prompt = f"""
# Question: {query}

# Relevant Passages:
{passages}

# Based on the passages above, generate an answer to the question. Explicitly mention the 'data_source'.
ex) (출처: gsds_notification.pdf)
"""

# print(prompt)

In [None]:
# Show the current value of `results` as the cell output to inspect the raw query response.
results

In [None]:
# OpenAI client used for the generation step
llm = OpenAI(api_key=OPENAI_API_KEY)

# Send the augmented prompt as a single user message; temperature=0 is passed through as-is
response = llm.chat.completions.create(
    model='gpt-4o-mini',
    temperature=0,
    messages=[{"role": "user", "content": prompt}],
)

# Print the text of the first returned choice
print('Answer:', response.choices[0].message.content)

### 1-4. Various Distance Metrics for Semantic Search

In [None]:
distance_metric = ["l2", "cosine", "ip"]  # default: l2
query = "학위 논문 제출 기한은 언제까지인가요?"

# For every metric: build a throw-away collection, index the same chunks,
# run the same query, print the hits, and drop the collection again.
for metric in distance_metric:
    metric_collection_name = f"gsds-{metric}"

    # Loop-local names are used on purpose: rebinding the notebook-wide
    # `collection` / `results` here would leave them pointing at a collection
    # that has just been deleted and break a re-run of the earlier cells.
    metric_collection = client.get_or_create_collection(
        name=metric_collection_name,
        embedding_function=openai_ef,
        metadata={"hnsw:space": metric},
    )

    try:
        metric_collection.add(
            ids=[f"chunk_{i}" for i, _ in enumerate(chunks)],
            documents=chunks,
            metadatas=[{"reference": file_path.split('/')[-1]} for _ in chunks],
        )

        metric_results = metric_collection.query(
            query_texts=query,
            n_results=3,
        )
    finally:
        # Always remove the temporary collection, even if add/query failed.
        client.delete_collection(metric_collection_name)

    print(f'{metric}:', metric_results['ids'][0], metric_results['distances'][0])

## 2. Implementing RAG With LangChain

In [None]:
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma

from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_core.vectorstores import InMemoryVectorStore
from langchain_core.prompts import PromptTemplate
from langchain_core.documents import Document

from langchain_community.document_loaders import TextLoader, PyPDFLoader, AsyncHtmlLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_transformers import Html2TextTransformer

from bs4 import BeautifulSoup
import io

### 2-1. Load and Store the data from `URL`

In [None]:
# Page(s) to index
urls = ["https://gsds.snu.ac.kr/academics/recruitment/"]

# Download the pages, then run them through the HTML-to-text transformer
html = AsyncHtmlLoader(urls).load()
html2text = Html2TextTransformer()
docs_transformed = html2text.transform_documents(html)

# Split into 300-character chunks with a 60-character overlap
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=60, chunk_size=300)
docs_html = text_splitter.split_documents(docs_transformed)

# Embed the chunks and build the Chroma vector store from them
vector_store = Chroma.from_documents(
    documents=docs_html,
    embedding=OpenAIEmbeddings(model='text-embedding-3-large', openai_api_key=OPENAI_API_KEY),
)

### 2-2. Load and Store the data from `.txt` / `.pdf`

In [None]:
# Read the notice explicitly as UTF-8 (as loadText does in section 1) so the
# Korean text loads regardless of the platform's default encoding.
documents = TextLoader(
    "./data/2024학년도 2학기 데이터사이언스대학원 석사학위 논문심사 계획 공고.txt",
    encoding="utf-8",
).load()
# documents = PyPDFLoader("./data/2024학년도 2학기 데이터사이언스대학원 석사학위 논문심사 계획 공고.pdf").load()

# Chunk with the same splitter used for the web page and add the chunks to the existing store
docs = text_splitter.split_documents(documents)

vector_store.add_documents(documents=docs)

In [None]:
# vector_store.delete_collection()

### 2-3. Retrieval, Augmentation and Generation (RAG)

In [None]:
# query = input('Question: ')
query = '박사 과정 신입학 모집 인원은 몇 명이야?'

# Retrieve the three chunks most similar to the question
retrieved_docs = vector_store.similarity_search(query, k=3)

# Number each hit from 1 and tag it with its source
passages = "\n".join(
    f"Passage {i} (data_source: {doc.metadata['source']}):\n{doc.page_content}\n"
    for i, doc in enumerate(retrieved_docs, start=1)
)

# Plain (non-f) string: {query} and {passages} stay as placeholders and are
# filled in by PromptTemplate. Pre-formatting the text with an f-string would
# hand already-substituted content to PromptTemplate, which then treats any
# '{' or '}' inside the retrieved passages as template variables and fails.
prompt_template = """
# Question: {query}

# Relevant Passages:
{passages}

# Based on the passages above, generate an answer to the question. Explicitly mention the 'data_source'.
ex) (출처: gsds_notification.pdf)
"""

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, openai_api_key=OPENAI_API_KEY)
prompt = PromptTemplate(template=prompt_template, input_variables=["query", "passages"])

# Pipe the prompt into the model and run it with the actual values
chain = prompt | llm
response = chain.invoke({"query": query, "passages": passages})

print('Answer:', response.content)

### 2-4. Run the RAG Chain

In [None]:
def execute_chain():
    """Run an interactive question-answering loop over ``vector_store``.

    Reads questions from standard input until the user types ``exit``. For
    each question it retrieves the five most similar chunks, formats them as
    numbered passages, sends the prompt to the chat model, and prints the
    answer. Errors raised while answering a question are printed and the loop
    continues with the next question.
    """
    # Plain (non-f) string: the placeholders are filled by PromptTemplate, so
    # braces that occur inside retrieved passages are never parsed as template
    # variables.
    prompt_template = """
# Question: {query}

# Relevant Passages:
{passages}

# Based on the passages above, generate an answer to the question. Explicitly mention the 'data_source'.
ex) (출처: gsds_notification.pdf)
"""
    # The model, prompt and chain do not depend on the question, so build them once.
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, openai_api_key=OPENAI_API_KEY)
    prompt = PromptTemplate(template=prompt_template, input_variables=["query", "passages"])
    chain = prompt | llm

    print("Type 'exit' to quit")
    while True:
        query = input("Enter a prompt: ")
        if query.strip().lower() == 'exit':
            print('Exiting...')
            break

        # Nothing to search for; ask again instead of sending an empty query.
        if not query.strip():
            continue

        try:
            retrieved_docs = vector_store.similarity_search(query, k=5)
            # Number passages from 1, matching the other cells in this notebook.
            passages = "\n".join(
                f"Passage {i} (data_source: {doc.metadata['source']}):\n{doc.page_content}\n"
                for i, doc in enumerate(retrieved_docs, start=1)
            )
            response = chain.invoke({"query": query, "passages": passages})
            print('Answer:', response.content)
        except Exception as e:
            # Interactive boundary: report the failure (retrieval or generation)
            # and keep the session alive for the next question.
            print(e)

execute_chain()