<a target="_blank" href="https://colab.research.google.com/github/UpstageAI/cookbook/blob/main/cookbooks/upstage/Solar-Full-Stack LLM-101/05_3_OracleDB.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

# Retrieval Augmented Generation (RAG) Baseline
## Overview  
In this notebook, we walk through the baseline code.
The goal of this project is to provide students with hands-on experience in handling and enhancing Large Language Models (LLMs) provided by [**Upstage**](https://www.upstage.ai) (Solar).

You may use any engineering method to improve benchmark performance, except direct training (fine-tuning).

*Collecting data directly related to the test set is considered cheating, e.g., using the MMLU-Pro dataset or EWHA.pdf for the knowledge base (KB).*

In [None]:
# Mount Google Drive at /content/drive; the project files used below live under it.
# The google.colab package is only importable inside Google Colab.
from google.colab import drive
drive.mount("/content/drive")

In [None]:
# load datasets
from datasets import load_dataset

# Cache the downloaded dataset on the mounted Drive.
cache_dir = "/content/drive/MyDrive/Upstage_Project/db"
# NOTE(review): `ds` is not referenced by any later cell visible here — confirm it is still needed.
ds = load_dataset("BoltMonkey/psychology-question-answer", cache_dir=cache_dir)

In [None]:
# @title set API key
# First, enroll your API key as the colab key.
from pprint import pprint
import os

import warnings

warnings.filterwarnings("ignore")

from IPython import get_ipython

upstage_api_key_env_name = "upstage_api_key"


def load_env():
    """Return the Upstage API key for the current runtime.

    In Google Colab the key is read from the Colab secret named by
    ``upstage_api_key_env_name`` and stored in ``os.environ``; because
    ``setdefault`` is used, a value already present in the environment wins.
    Anywhere else the ``.env`` file is loaded with python-dotenv and the key
    is read from the environment, which yields ``None`` when it is not set.
    """
    running_in_colab = "google.colab" in str(get_ipython())

    if not running_in_colab:
        # Local Jupyter: pull variables from a .env file into the environment.
        from dotenv import load_dotenv

        load_dotenv()
        return os.environ.get(upstage_api_key_env_name)

    # Google Colab: the key is kept in the notebook's secret store.
    from google.colab import userdata

    colab_secret = userdata.get(upstage_api_key_env_name)
    return os.environ.setdefault(upstage_api_key_env_name, colab_secret)


UPSTAGE_API_KEY = load_env()  # Setting API Key

In [None]:
# set parameters
# Absolute paths: a later cell changes the working directory with %cd, which
# would re-root "./drive/..." style relative paths and break the CSV lookup.
db_path = "/content/drive/MyDrive/Upstage_Project/db" # folder path containing ewha.pdf
assets_path = "/content/drive/MyDrive/Upstage_Project/assets"

# Baseline

In [None]:
!pip3 install -qU openai python-dotenv PyPDF2 langchain langchain-community langchain-core langchain-text-splitters langchain_upstage oracledb python-dotenv tiktoken faiss-cpu

In [None]:
from langchain_upstage import UpstageDocumentParseLoader
import os

# Parse ewha.pdf (looked up under db_path) with the Upstage Document Parse loader.
# output_format="text" is passed, so the loaded documents are plain text rather than HTML.
layzer = UpstageDocumentParseLoader(api_key=UPSTAGE_API_KEY,file_path=os.path.join(db_path, 'ewha.pdf'), output_format="text")
docs = layzer.load()  # or layzer.lazy_load()

In [None]:
from langchain_text_splitters import (
    Language,
    RecursiveCharacterTextSplitter,
)

# 2. Split
# The PDF is loaded with output_format="text", so the documents contain no HTML
# tags. The Language.HTML separators are all tag prefixes ("<div", "<p", ...);
# on tag-free text the splitter falls through to its last-resort empty
# separator and cuts chunks at arbitrary characters. The default separators
# (paragraph, line, space) keep chunk boundaries on natural breaks instead.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500, chunk_overlap=100
)

splits = text_splitter.split_documents(docs)
print("Splits:", len(splits))


In [None]:
# Inspect the first chunk to sanity-check the split.
print(splits[0].page_content)

In [None]:
# Inspect the second chunk as well.
print(splits[1].page_content)

In [None]:
%mkdir /content/drive/MyDrive/Upstage_Project/faiss_vectorstore

In [None]:
# Switch the working directory: every relative path used after this cell
# (e.g. "faiss_index_ewha") is resolved inside this folder.
%cd /content/drive/MyDrive/Upstage_Project/faiss_vectorstore

In [None]:
# Print the current working directory to confirm the directory change.
!pwd

In [None]:
from langchain_upstage import UpstageEmbeddings
from langchain_community.vectorstores import FAISS

# Embedding client; the same object is passed again later when the saved index is loaded.
# NOTE(review): the model name ends in "-passage" and this object is presumably also used to embed
# search queries via the retriever — confirm a passage-side model is intended for queries.
upstage_embeddings = UpstageEmbeddings(api_key=UPSTAGE_API_KEY, model="solar-embedding-1-large-passage")

# create and save a FAISS vectorstore
# Built from every chunk in `splits`; "faiss_index_ewha" is a relative path, so it is
# written under the current working directory.
vectorstore = FAISS.from_documents(documents=splits,
                                   embedding=upstage_embeddings)
vectorstore.save_local("faiss_index_ewha")

In [None]:
# read the test-set CSV file (prompts and answers columns)

import pandas as pd

def read_data(data_path):
    """Load the evaluation CSV and return its question and answer columns.

    Args:
        data_path: path (or file-like object) accepted by ``pandas.read_csv``.
            The file must contain the columns ``prompts`` and ``answers``.

    Returns:
        A ``(prompts, answers)`` tuple of pandas Series, one entry per CSV row.
    """
    frame = pd.read_csv(data_path)
    return frame['prompts'], frame['answers']

In [None]:
# Load the evaluation questions and their reference answers from testset.csv under assets_path.
prompts, answers = read_data(os.path.join(assets_path, 'testset.csv'))

In [None]:
# from langchain_core.prompts import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate
from langchain_upstage import ChatUpstage
from tqdm import tqdm

# load db
# The index path is relative, so it is resolved against the current working directory.
# allow_dangerous_deserialization=True opts in to pickle-based loading of the stored index;
# only load index folders you created yourself (here: the one saved earlier in this notebook).
ewha_db = FAISS.load_local("./faiss_index_ewha",
                           upstage_embeddings,
                           allow_dangerous_deserialization=True)

# retriever
# MMR search returning k=5 chunks per query.
# NOTE(review): lambda_mult=0.15 presumably weights diversity heavily over relevance — confirm against the LangChain docs.
retriever = ewha_db.as_retriever(search_type="mmr", search_kwargs={'k': 5, 'lambda_mult': 0.15})

# llm
# Chat model that answers the questions; authenticated with the key loaded earlier.
llm = ChatUpstage(api_key = UPSTAGE_API_KEY,
                  model="solar-pro2")

# prompt
# System message: role, rules, and the mandatory "[ANSWER]: (X) ..." final line that the
# answer-extraction regex looks for. Human message: filled per call with {question} and {context}.
# NOTE(review): the "The information is not present in the context." fallback sentence carries
# no option letter, so a reply that uses it cannot match the "[ANSWER]: (X)" pattern.
prompt_template = ChatPromptTemplate.from_messages([
    ("system", """You are an expert academic advisor specializing in Ewha Womans University regulations and policies. You are smart, brilliant, and good at math.

## YOUR MISSION:
Provide a precise answer to multiple-choice questions (4-10 options) about Ewha Womans University regulations using ONLY the information from the given context.

## INSTRUCTIONS:
1. Analyze the Context: Carefully read and understand a question and all provided documents
2. Evaluate Each Option: Compare each answer choice against the context
3. Show Your Reasoning: Explain step-by-step how you arrived at your answer
4. State Your Final Answer: Clearly indicate your choice in the required format

## CONSTRAINTS:
- Use ONLY information from the provided context
- If the answer cannot be determined from the context, state: "The information is not present in the context."
- Do NOT make assumptions beyond what is explicitly stated

## REQUIRED OUTPUT FORMAT:
After your reasoning, you MUST conclude with your final answer in EXACTLY this format:

[ANSWER]: (X) [brief answer text]

Where X is the option letter (A, B, C, D, etc.)

Example:
[ANSWER]: (B) 2"""),

    ("human", """Question: {question}
    ---
    Context: {context}
    ---
    Please provide your step-by-step reasoning and final answer.""")
])

# combine documents
def format_docs(docs):
    """Join the ``page_content`` of each document into one string.

    Documents are separated by a blank line; an empty input gives ``""``.
    """
    contents = [doc.page_content for doc in docs]
    separator = '\n\n'
    return separator.join(contents)

# RAG chain
# The prompt template is piped into the chat model; retrieval is done explicitly in the loop.
rag_chain = prompt_template | llm

responses = []

for i, prompt in enumerate(prompts[:25]):
    # Retrieve the chunks relevant to the question. Retrievers are Runnables, so
    # `invoke` is the supported entry point; `get_relevant_documents` has been
    # deprecated since langchain-core 0.1.46 and is slated for removal in 1.0.
    # The result gets its own name so the parsed PDF held in `docs` is not overwritten.
    retrieved_docs = retriever.invoke(prompt)

    # Format once and reuse for both the chain input and the debug print.
    context = format_docs(retrieved_docs)

    # call RAG chain
    response = rag_chain.invoke({"question": prompt, "context": context})
    responses.append(response.content)

    # Show the first question and its retrieved context as a sanity check.
    if i == 0:
        print(f"📌 question: {prompt}")
        print(f"✅ context: {context}")

In [None]:
# Show each of the first 25 questions followed by the raw model response.
for idx, (prompt, response) in enumerate(zip(prompts[:25], responses[:25])):
    print(f"{idx + 1}. {prompt}\n     {response}\n----------")

In [None]:
# function to extract an answer from a response

import re

def extract_answer(response):
    """
    Extract the chosen option letter from a model response.

    Expected format: "[ANSWER]: (A) convolutional networks". Option letters
    A-J are accepted, because questions can have up to 10 options.

    Returns the letter of the first "[ANSWER]: (X)" occurrence. If the
    response contains no answer in that format, the result of the fallback
    extract_again() is returned, which may be None.
    """
    # One capture group holding the option letter inside the parentheses.
    pattern = r"\[ANSWER\]:\s*\(([A-J])\)"
    match = re.search(pattern, response)

    if match:
        return match.group(1) # Extract the letter inside parentheses (e.g., A)
    return extract_again(response)

def extract_again(response):
    """
    Fallback extraction: return the last standalone capital letter A-J.

    "Standalone" means delimited by word boundaries, so "(C)" and "C." match
    but the "C" inside "Context" does not. The whole response is scanned,
    including text after line breaks.

    Returns None when the response contains no standalone A-J letter.

    NOTE(review): the English words "A" and "I" also count as standalone
    letters, so this heuristic can misfire on free-form prose.
    """
    # findall scans across newlines; a `.`-based lookahead without re.DOTALL
    # would stop at the end of the first line that contains a letter.
    letters = re.findall(r"\b[A-J]\b", response)
    return letters[-1] if letters else None

In [None]:
# print accuracy

cnt = 0

# Pair each reference answer with its response. zip stops at the shorter input,
# so `total` is the number of items actually scored (at most 25) and the
# accuracy stays correct when fewer than 25 responses exist.
scored_pairs = list(zip(answers[:25], responses[:25]))
total = len(scored_pairs)

for i, (answer, response) in enumerate(scored_pairs):
    print("-"*10)
    generated_answer = extract_answer(response)

    # An extraction failure counts as a wrong answer.
    if generated_answer is None:
        print(f"{i+1}. extraction fail")
        continue

    print(f"{i+1}. generated answer: {generated_answer}, answer: {answer}")

    # Substring check against the reference answer; str() keeps a non-string
    # cell (e.g. an empty CSV value) from raising a TypeError.
    # NOTE(review): assumes the reference holds the option letter, e.g. "(A)" or "A" — confirm against testset.csv.
    if generated_answer in str(answer):
        cnt += 1

print()
# Guard against an empty evaluation set.
acc = (cnt / total) * 100 if total else 0.0
print(f"acc: {acc}%")