# Team 6 - Rag Code

In [26]:
pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-macosx_11_0_arm64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp310-cp310-macosx_11_0_arm64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1
Note: you may need to restart the kernel to use updated packages.


### -1. Import the necessary packages

In [1]:
import os
import re
import time
import pickle
import torch
import numpy as np
import pandas as pd
from collections import Counter
from sentence_transformers import util
from langchain_upstage import UpstageEmbeddings
from langchain_core.prompts import PromptTemplate
from langchain_upstage import ChatUpstage
from langchain.chains import LLMChain
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.base import Embeddings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
from sklearn.metrics import pairwise_distances  
from scipy.spatial.distance import jaccard  
from sklearn.feature_extraction.text import CountVectorizer


UPSTAGE_API_KEY = "up_Tvox3mlJzYmPHkP9mTHqWjHE2ThJJ" # Enter your Upstage api key

#Path to the folder containing the database and embedding files
folder_path = '/Users/hongjiyoung/NLP/Term_Project/final/db_files'

  from .autonotebook import tqdm as notebook_tqdm


### 0. Function to categorize between Ewha and MMLU pro domains.

In [2]:
cate_llm = ChatUpstage(api_key = UPSTAGE_API_KEY )
cate_prompt_template = PromptTemplate.from_template(
    '''
    You are now assigned the role of classifying the topics of exam questions.

    Given the [Question] and [Topics], identify which topic the question belongs to.
    Do not provide any explanation, just return the topic name EXACTLY AS GIVEN.

    [Question] {question}
    [Topics] : (philosophy, law, business, psychology, history)

    Let's think step by step.
    '''
)
cate_chain = cate_prompt_template | cate_llm

# funcion to extract an answer from response
def extract_answer_cate(response):
    if any(keyword in response for keyword in ['philo', '철학', 'logic']) : return 'philosophy'
    if any(keyword in response for keyword in ['law', '법']) : return 'law'
    if any(keyword in response for keyword in ['busin', '경영', '비즈니스', 'eco', '경제', 'math', '수학']) : return 'business'
    if any(keyword in response for keyword in ['psy', '심리', 'statistics', 'bio', 'medi', 'sci']) : return 'psychology'
    if any(keyword in response for keyword in ['his', '역사']) : return 'history'
    return 'philosophy'


def which_category(testdata, cate_chain) : 
    # build
    which_category = pd.DataFrame(columns=['index', 'question', 'pred_pmt_category', 'pred_category'])
    for index, row in testdata.iterrows():
        index = int(index)
        prompt = row.prompts
        q = prompt.partition('(A)')[0]
        q = q.partition(')')[2]
        
        which_category.loc[len(which_category)] = {'index':index, 'question': q}

    # predict using Solar
    for idx, row in which_category.iterrows() :
        response = cate_chain.invoke({"question": row.question})
        res = response.content
        res = extract_answer_cate(res.lower().replace("\n", " "))
        which_category.loc[idx, 'pred_pmt_category'] = res

    # final predicted category
    which_category['pred_category'] = which_category['pred_pmt_category']
    for idx, row in which_category.iterrows() :
        # Get the value of ‘question’
        question_text = which_category.loc[idx, 'question']
        if bool(re.search('[가-힣]', question_text)) :
            which_category.loc[idx, 'pred_category'] = 'ewha'

    testdata['category'] = which_category['pred_category']

    return testdata, which_category

### 1. Create a single DataFrame containing all the necessary information.
(question, answer, context .etc)

In [3]:
def make_df(testdata):
    embedding_model = UpstageEmbeddings(api_key=UPSTAGE_API_KEY, model="solar-embedding-1-large")
    nowtest = pd.DataFrame(columns=['index', 'category', 'embed_ques', 'question', 'prompts', 'answers', 'top1', 'top2', 'top3'])

    for index, row in testdata.iterrows():
        q = row.prompts
        a = row.answers
        question = q.partition('(A)')[0]
        question = question.partition(')')[2]

        q = q.partition(')')[2].strip()

        category = row.category
        
        try : 
            if category != "history" and category != "psychology":    
                embedded_query = embedding_model.embed_query(question) # Get the question excluding the options and perform embedding
                nowtest.loc[len(nowtest)] = {'index':index, 'category' : category, 'embed_ques' : embedded_query, 'question' : question, 'prompts' : q, 'answers' : a}
            else : 
                nowtest.loc[len(nowtest)] = {'index':index, 'category' : category, 'embed_ques' : '-', 'question' : question, 'prompts' : q, 'answers' : a}

        except :
            print(f'pass: {index}')
            continue 
        
    return nowtest

### 2. Function to load embeddings by domain and return the top 3 contexts

##### - Ewha

In [4]:
# Define a wrapper for UpstageEmbeddings to inherit from Embeddings
class UpstageEmbeddingsWrapper(Embeddings):
    def __init__(self, api_key, model):
        self.embeddings = UpstageEmbeddings(api_key=api_key, model=model)

    def embed_documents(self, texts):
        return self.embeddings.embed_documents(texts)

    def embed_query(self, text):
        return self.embeddings.embed_query(text)

    def embed_text(self, text):
        return self.embeddings.embed_query(text)  
    
# Function to clean text
def clean_text(text):
    # Remove multiple spaces and newlines
    text = re.sub(r'\s+', ' ', text)
    # Remove specific patterns if any (e.g., page numbers)
    text = re.sub(r'Page \d+', '', text)
    return text.strip()

In [5]:
def load_embeddings_ewha(UPSTAGE_API_KEY, folder_path):
    embeddings = UpstageEmbeddingsWrapper(
        api_key=UPSTAGE_API_KEY,
        model="embedding-query"
    )

    # Load and Split the PDF
    pdf_path = os.path.join(folder_path, "ewha.pdf")
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()  # Load all pages

    # Split documents into smaller chunks with optimized separators
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,  # Maximum characters per chunk
        chunk_overlap=50,  # Overlap between chunks
        separators=["\n\n", "\n", ". ", "!", "?", " "]
    )
    split_docs = text_splitter.split_documents(documents)

    # Extract and clean text content
    document_texts = [clean_text(doc.page_content) for doc in split_docs]

    # Convert documents to embeddings
    document_embeddings = embeddings.embed_documents(document_texts)

    text_embedding_pairs = list(zip(document_texts, document_embeddings))

    # Create a FAISS vector store
    vector_store = FAISS.from_embeddings(
        text_embeddings=text_embedding_pairs,  # Text-embedding pair list
        embedding=embeddings  # Pass the Embeddings object
        ) 
    return vector_store

##### - MMLU-pro : Business

In [6]:
def load_embeddings_business(folder_path):
    # Load the Business database CSV file.
    db = pd.read_csv(os.path.join(folder_path, "db_business.csv"))

    # Load the saved embeddings
    with open(os.path.join(folder_path,f"db_business_embeddings.pkl"), "rb") as f:
        context_embedding, index = pickle.load(f)
    return db, context_embedding, index

In [7]:
# Find the top 3 contexts for a given question and return them as a list.
def find_top3_contexts_business(folder_path, row):
    db, context_embedding, index = load_embeddings_business(folder_path)

    embed_ques= row.embed_ques

    similarities = util.pytorch_cos_sim(embed_ques, context_embedding)[0]
    
    # Extract the top 3 indices based on similarity.
    top3_indices = torch.topk(similarities, 3).indices.tolist()

    top3_contexts = []
    for idx in top3_indices:
        real_index = index[idx]
        best_context = db['chunks'].iloc[real_index]
        top3_contexts.append(best_context)

    return top3_contexts

##### - MMLU-pro : Law & Philosophy

In [8]:
def load_embeddings_law_philosophy(folder_path, category):
    # Load the .npy file (type == numpy).
    if category == "law" : 
        db = np.load(os.path.join(folder_path, 'db_law.npy'))
        context_embedding = np.load(os.path.join(folder_path, 'db_law_embeddings.npy'))
    elif category == "philosophy" :
        db = np.load(os.path.join(folder_path, 'db_philosophy.npy'))
        context_embedding = np.load(os.path.join(folder_path, 'db_philosophy_embeddings.npy'))

    db = db.tolist()
    context_embedding = context_embedding.tolist()
    
    return db, context_embedding

In [9]:
# Find the top 3 contexts for a given question and return them as a list.
def find_top3_contexts_law_philosophy(folder_path, category, row):

    db, context_embedding = load_embeddings_law_philosophy(folder_path, category)

    embed_ques= row.embed_ques

    # Sort by similarity in descending order.
    sorted_idx = (np.array(embed_ques) @ np.array(context_embedding).T).argsort()[::-1]

    top3_contexts = []
    top3_contexts.append(db[sorted_idx[0]])
    top3_contexts.append(db[sorted_idx[1]])
    top3_contexts.append(db[sorted_idx[2]])
        
    return top3_contexts

##### - MMLU-pro : History & Psychology

In [10]:
def create_fixed_vectorizer(history_file_path, psychology_file_path):
    """
    Combine text data from two CSV files, create a fixed TfidfVectorizer, and save it.

    Parameters:
        history_file_path (str): Path to the history CSV file.
        psychology_file_path (str): Path to the psychology CSV file.
        save_path (str, optional): Path to save the vectorizer and context embeddings (Pickle format).

    Returns:
        vectorizer (TfidfVectorizer): The fixed vectorizer object.
        context_embedding (csr_matrix): TF-IDF embeddings of the combined text.
    """
    # Load the CSV files
    df_history = pd.read_csv(history_file_path)
    df_psychology = pd.read_csv(psychology_file_path)

    # Extract the necessary text column (the text data is in the ‘text’ column)
    df_history['text'] = df_history['text'].fillna('')
    df_psychology['text'] = df_psychology['text'].fillna('')
    
    history_texts = df_history['text'].tolist()
    psychology_texts = df_psychology['text'].tolist()

    # Combine the text data.
    all_texts = history_texts + psychology_texts

    # Initialize and fit the vectorizer.
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
    context_embedding = vectorizer.fit_transform(all_texts)

    return vectorizer

In [11]:
def load_embeddings_history_psychology(folder_path, category, vectorizer):
    """
    Load the psychology database and calculate or load precomputed embeddings using TF-IDF.
    """
    if category == "history" : 
        db_path = os.path.join(folder_path,'db_history.csv')
    elif category == "psychology" : 
        db_path = os.path.join(folder_path,'db_psychology.csv')

    db = pd.read_csv(db_path)
    db_texts = db['text'].tolist()

    context_embedding = vectorizer.transform(db_texts)

    return db, context_embedding

In [12]:
def find_top3_contexts_history_psychology(row, db, context_embedding, vectorizer, top_n=3, similarity_type="cosine"):
    """
    Find the top 3 most relevant contexts for a given question.
    """
    question = row.question
    # Vectorize the question
    embed_ques = vectorizer.transform([question])

    # Calculate the similarity
    if similarity_type == "cosine":
        similarities = cosine_similarity(embed_ques, context_embedding).flatten()
    elif similarity_type == "euclidean":
        similarities = -euclidean_distances(embed_ques, context_embedding).flatten()
    elif similarity_type == "manhattan":
        similarities = -manhattan_distances(embed_ques, context_embedding).flatten()
    elif similarity_type == "jaccard":
        dense_matrix = context_embedding.toarray()
        dense_question = embed_ques.toarray()
        similarities = 1 - pairwise_distances(dense_question, dense_matrix, metric="jaccard").flatten()
    else:
        raise ValueError(f"Unknown similarity type: {similarity_type}")
    
    # Extract the top 3 indices based on similarity.
    top_indices = similarities.argsort()[-top_n:][::-1]

    # Return the top 3 contexts.
    top3_contexts = [db.iloc[i]['text'] for i in top_indices]

    return top3_contexts

##### - Function to apply find_top3_contexts to each row(question) of the DataFrame (excluding Ewha)

In [13]:
# Apply a function to find the top 3 contexts for each row of dataframe.
def apply_top3_contexts(folder_path, nowtest):
    # Load the history and psychology databases and compute their embeddings.
    history_file_path = os.path.join(folder_path,'db_history.csv')
    psychology_file_path = os.path.join(folder_path,'db_psychology.csv')
    vectorizer = create_fixed_vectorizer(history_file_path, psychology_file_path)
    history_db, history_context_embedding = load_embeddings_history_psychology(folder_path, "history", vectorizer)
    psy_db, psy_context_embedding = load_embeddings_history_psychology(folder_path, "psychology", vectorizer)


    for idx, row in nowtest.iterrows() : # Get the question
        category = row.category
        if category == "business" :
            top3_contexts = find_top3_contexts_business(folder_path, row)
            nowtest.loc[idx, 'top1'] = top3_contexts[0]
            nowtest.loc[idx, 'top2'] = top3_contexts[1]
            nowtest.loc[idx, 'top3'] = top3_contexts[2]

        elif category == "law" :
            top3_contexts = find_top3_contexts_law_philosophy(folder_path, category, row)
            nowtest.loc[idx, 'top1'] = top3_contexts[0]
            nowtest.loc[idx, 'top2'] = top3_contexts[1]
            nowtest.loc[idx, 'top3'] = top3_contexts[2]

        elif category == "philosophy" :
            top3_contexts = find_top3_contexts_law_philosophy(folder_path, category, row)
            nowtest.loc[idx, 'top1'] = top3_contexts[0]
            nowtest.loc[idx, 'top2'] = top3_contexts[1]
            nowtest.loc[idx, 'top3'] = top3_contexts[2]

        elif category == "history" :
            top3_contexts = find_top3_contexts_history_psychology(row, history_db, history_context_embedding, vectorizer)
            nowtest.loc[idx, 'top1'] = top3_contexts[0]
            nowtest.loc[idx, 'top2'] = top3_contexts[1]
            nowtest.loc[idx, 'top3'] = top3_contexts[2]

        elif category == "psychology" :
            top3_contexts = find_top3_contexts_history_psychology(row, psy_db, psy_context_embedding, vectorizer)
            nowtest.loc[idx, 'top1'] = top3_contexts[0]
            nowtest.loc[idx, 'top2'] = top3_contexts[1]
            nowtest.loc[idx, 'top3'] = top3_contexts[2]
        
    return nowtest


### 3. Define a function to receive Solar response (with Bagging applied).

##### - Ewha Bagging

In [14]:
def extract_choice(answer):
    """
    Extract only the alphabet corresponding to the choice from the answer.
    """
    # Pattern: (A), (B), ... or A, B, ...
    match = re.search(r'\(?([A-E])\)?', answer)
    return match.group(1).strip() if match else None

def bagging_ewha(row, qa_chain, bagging_pred) :
    # Check and initialize bagging_pred(dataframe) setup
    if bagging_pred is None:
        bagging_pred = pd.DataFrame(columns=['questionNum', 'answer', 'top1_1pred', 'top1_2pred', 'top1_3pred', 'top2pred', 'top3pred'])

    # Generate Responses with Feedback
    num_attempts = 5  # Number of attempts per question
    
    question = row.prompts
    print(question)
    answer_choices = []
    for attempt in range(num_attempts):
        # Generate answer using the QA chain
        try:
            answer = qa_chain.run(query=question).strip()  # Pass 'query' key
            # Validate the format
            if re.match(r'\(?[A-E]\)?', answer):
                extracted = extract_choice(str(answer))
                if extracted:
                    answer_choices.append(extracted)
            else:
                # If format is unexpected, skip or handle accordingly
                print(f"Attempt {attempt+1} for Question #{row.index+1}: Unexpected response format: {answer}. Skipping this attempt.")
        except Exception as e:
            print(f"Attempt {attempt+1} for Question #{row.index+1}: Error generating answer for question: {question}\nError: {e}")
    # Ensure answer_choices contains enough responses, else fill with None or empty strings
    while len(answer_choices) < 5:
        answer_choices.append('')  # or None if you prefer empty values
    
    # Add a new row to the bagging_pred DataFrame
    bagging_pred.loc[len(bagging_pred)] = {'questionNum': row.question, 'answer': row.answers, 'top1_1pred': answer_choices[0], 'top1_2pred': answer_choices[1], 'top1_3pred': answer_choices[2], 'top2pred' : answer_choices[3], 'top3pred' : answer_choices[4]}

    return bagging_pred

##### - MMLU pro Bagging

In [15]:
# funcion to extract an answer from response

def extract_answer(response):
    """
    extracts the answer from the response using a regular expression.
    expected format: "[ANSWER]: (A) convolutional networks"

    if there are any answers formatted like the format, it returns None.
    """
    pattern = r"\[ANSWER\]:\s*\((A|B|C|D|E)\)"  # Regular expression to capture the answer letter and text
    match = re.search(pattern, response)

    if match:
        return match.group(1) # Extract the letter inside parentheses (e.g., A)
    else:
        return extract_again(response)

def extract_again(response):
    pattern = r"\b[A-J]\b(?!.*\b[A-J]\b)"
    match = re.search(pattern, response)
    if match:
        return match.group(0)
    else:
        return None

In [17]:
def bagging_mmlu(row, mmlu_qa_chain, bagging_pred) : 

    # Check and initialize bagging_pred(dataframe) setup
    if bagging_pred is None:
        bagging_pred = pd.DataFrame(columns=['questionNum', 'answer', 'top1_1pred', 'top1_2pred', 'top1_3pred', 'top2pred', 'top3pred'])

    # Calculate the top1_1pred
    max_retries = 3  # Maximum retry attempts
    retries = 0
    while retries < max_retries:
        try:
            response = mmlu_qa_chain.invoke({"question": row.prompts, "context": row.top1})
            # Convert the response to a string
            if isinstance(response, dict):
                response = response.get('content', response.get('text', str(response)))
            else:
                response = str(response)
            print("response :", response)
            generated_answer = extract_answer(response)
            bagging_pred.loc[len(bagging_pred)] = {'questionNum': row.question, 'answer': row.answers, 'top1_1pred': generated_answer}
            break
        except Exception as e:  # API call error
            retries += 1
            print(f"Error occurred: {e}. Retrying {retries}/{max_retries} after 10 seconds...")
            time.sleep(20)  # Retry after waiting for 20 seconds
            if retries == max_retries:
                print(f"Failed after {max_retries} retries. Skipping this context.")

    # Calculate the top1_2,3pred
    for i in range(2,4) : 
        max_retries = 3  
        retries = 0
        while retries < max_retries:
            try:
                response = mmlu_qa_chain.invoke({"question": row.prompts, "context": row.top1})
                if isinstance(response, dict):
                    response = response.get('content', response.get('text', str(response)))
                else:
                    response = str(response)

                generated_answer = extract_answer(response)
                bagging_pred.loc[row.name, f'top1_{i}pred'] = generated_answer
                break
            except Exception as e:  # API call error
                retries += 1
                print(f"Error occurred: {e}. Retrying {retries}/{max_retries} after 10 seconds...")
                time.sleep(20)  # Retry after waiting for 20 seconds
                if retries == max_retries:
                    print(f"Failed after {max_retries} retries. Skipping this context.")

    # Calculate the top2pred
    context2 = ' '.join(filter(None, [str(row.top1), str(row.top2)]))
    max_retries = 3 
    retries = 0
    while retries < max_retries:
        try:
            response = mmlu_qa_chain.invoke({"question": row.prompts, "context": context2})
            if isinstance(response, dict):
                response = response.get('content', response.get('text', str(response)))
            else:
                response = str(response)
            generated_answer = extract_answer(response)
            bagging_pred.loc[row.name, 'top2pred'] = generated_answer
            break
        except Exception as e:
            retries += 1
            print(f"Error occurred: {e}. Retrying {retries}/{max_retries} after 10 seconds...")
            time.sleep(20)
            if retries == max_retries:
                print(f"Failed after {max_retries} retries. Skipping this context.")

    #Calculate the top3pred
    context3 = ' '.join(filter(None, [str(row.top1), str(row.top2), str(row.top3)]))
    max_retries = 3 
    retries = 0
    while retries < max_retries:
        try:
            response = mmlu_qa_chain.invoke({"question": row.prompts, "context": context3}) 
            if isinstance(response, dict):
                response = response.get('content', response.get('text', str(response)))
            else:
                response = str(response)

            generated_answer = extract_answer(response)
            bagging_pred.loc[row.name, 'top3pred'] = generated_answer
            break
        except Exception as e:
            retries += 1
            print(f"Error occurred: {e}. Retrying {retries}/{max_retries} after 10 seconds...")
            time.sleep(20)
            if retries == max_retries:
                print(f"Failed after {max_retries} retries. Skipping this context.")

    return bagging_pred

### 4. Prompt Engineering

##### - Ewha prompt

In [18]:
# Initialize LLM
ewha_llm = ChatUpstage(
    api_key=UPSTAGE_API_KEY,
    temperature=0,
    max_tokens=2048,
    top_p=0.0,
    frequency_penalty=0.0,
    presence_penalty=0.0
)

# Prompt Templates with One-Shot Example
ewha_qa_prompt_template = """
You are an assistant with expertise in Ewha University policies and history. Use the provided context to answer accurately.

Context: {context}

Question: {question}

Final Answer:
"""

# Prompt for context retrieval and feedback
ewha_qa_prompt = PromptTemplate(input_variables=["context", "query"], template=ewha_qa_prompt_template)

# Load the 'ewha.pdf' file and perform embedding.
vector_store = load_embeddings_ewha(UPSTAGE_API_KEY, folder_path)

In [19]:
# RAG-based QA System
retriever = vector_store.as_retriever(search_kwargs={"k": 5})

ewha_qa_chain = RetrievalQA.from_chain_type(
    llm=ewha_llm,
    retriever=retriever,
    chain_type="stuff",  # "stuff" processes all context at once
    chain_type_kwargs={"prompt": ewha_qa_prompt}  # Pass custom prompt
)

##### - MMLU Prompt

In [20]:
# Initialize LLM
mmlu_llm = ChatUpstage(
    api_key=UPSTAGE_API_KEY,
    temperature=0, 
    max_tokens=2048,
    top_p=0,
    frequency_penalty=0,
    presence_penalty=0
)

# Set up Few-shot learning and CoT-based ChatML prompts
few_shot_examples = """
    Example 1:
    <im_start>user
    [Question]: What is the capital of France?
    (A) Berlin
    (B) Madrid
    (C) Paris
    (D) Rome
    [Context]: France, officially the French Republic, is a country located primarily in Western Europe. France is a semi-presidential republic with its capital in Paris, the country's largest city and main cultural and commercial centre.

    Let's think step by step:
    1. The question asks for the capital of France.
    2. France is a country in Europe, and its capital city is Paris.
    3. Therefore, the correct answer is (C).
    <im_end>
    <im_start>assistant
    Final Answer: (C)
    <im_end>
"""
# Modify the ChatML prompt template
mmlu_answer_prompt_template = PromptTemplate.from_template(
    f"""
    {few_shot_examples}

    <im_start>user
    [Question]: {{question}}
    [Context]: {{context}}

    Let's think step by step:
    1.
    2.
    3.
    <im_end>
    <im_start>assistant
    Please provide the final answer in the format: Final Answer: (A).
    Final Answer: 
    """
)

# Set up the LLM Chain
mmlu_qa_chain = LLMChain(llm=mmlu_llm, prompt=mmlu_answer_prompt_template)

  mmlu_qa_chain = LLMChain(llm=mmlu_llm, prompt=mmlu_answer_prompt_template)


### 4. Check the accuracy for the test questions.

##### Load the test data and create a DataFrame.

In [21]:
# Testdata loading function.
def read_data(data_path):
    data = pd.read_csv(data_path)
    prompts = data['prompts'].tolist()
    answers = data['answers'].tolist()
    return prompts, answers

In [22]:
#Load the test questions CSV file. ex)testset.csv
prompts, answers = read_data(os.path.join(folder_path, 'testset.csv'))

testdata = pd.read_csv(os.path.join(folder_path, "testset.csv"))

In [23]:
# Categorize the test questions ex)ewha, history, law..
testdata, w = which_category(testdata, cate_chain)

  which_category.loc[idx, 'pred_pmt_category'] = res


In [24]:
w

Unnamed: 0,index,question,pred_pmt_category,pred_category
0,0,재학 중인 학생이 휴학을 하려면 학기 개시일로부터 며칠 이내에 휴학을 신청하야하나...,law,ewha
1,1,'재입학은 a회에 한하여 할 수 있다. 다만 제 28조제4호에 의하여 제적된 자는...,law,ewha
2,2,학생이 소속 학과 또는 전공 이외의 전공 교과목을 총장이 정하는 바에 따라 몇학점...,law,ewha
3,3,다음 보기의 학생들 중 제적을 당하지 않는 사람을 고르면?\n,law,ewha
4,4,2019학년도 휴먼기계바이오공학부의 입학 정원은 몇 명인가? \n,psychology,ewha
5,5,1980학년도 이전 입학생에 대하여 적용하는 등급에 따른 성적점으로 잘못 연결된 ...,philosophy,ewha
6,6,사회체육학과 소속 학생에게 수여하는 학위는 무엇인가? \n,psychology,ewha
7,7,복수전공 신청 자격에 해당하지 않는 것은? \n,philosophy,ewha
8,8,이화여자대학교의 설립 정신은 무엇인가요?,history,ewha
9,9,이화여자대학교의 위치는 어디인가요?,philosophy,ewha


In [25]:
testdata

Unnamed: 0,prompts,answers,category
0,QUESTION1) 재학 중인 학생이 휴학을 하려면 학기 개시일로부터 며칠 이내에 ...,(D),ewha
1,QUESTION2) '재입학은 a회에 한하여 할 수 있다. 다만 제 28조제4호에 ...,(A),ewha
2,QUESTION3) 학생이 소속 학과 또는 전공 이외의 전공 교과목을 총장이 정하는...,(C),ewha
3,QUESTION4) 다음 보기의 학생들 중 제적을 당하지 않는 사람을 고르면?\n(...,(D),ewha
4,QUESTION5) 2019학년도 휴먼기계바이오공학부의 입학 정원은 몇 명인가? \...,(C),ewha
5,QUESTION6) 1980학년도 이전 입학생에 대하여 적용하는 등급에 따른 성적점...,(B),ewha
6,QUESTION7) 사회체육학과 소속 학생에게 수여하는 학위는 무엇인가? \n(A)...,(D),ewha
7,QUESTION8) 복수전공 신청 자격에 해당하지 않는 것은? \n(A) 1학년을 ...,(C),ewha
8,QUESTION9) 이화여자대학교의 설립 정신은 무엇인가요? (A) 공산주의 이념 ...,(C),ewha
9,QUESTION10) 이화여자대학교의 위치는 어디인가요? (A) 강남구 (B) 서대...,(B),ewha


In [26]:
# Create a DataFrame for easy viewing
nowtest = make_df(testdata)

In [27]:
nowtest

Unnamed: 0,index,category,embed_ques,question,prompts,answers,top1,top2,top3
0,0,ewha,"[-0.0161285400390625, -0.00487518310546875, -0...",재학 중인 학생이 휴학을 하려면 학기 개시일로부터 며칠 이내에 휴학을 신청하야하나...,재학 중인 학생이 휴학을 하려면 학기 개시일로부터 며칠 이내에 휴학을 신청하야하나요...,(D),,,
1,1,ewha,"[-0.024200439453125, -0.0005092620849609375, -...",'재입학은 a회에 한하여 할 수 있다. 다만 제 28조제4호에 의하여 제적된 자는...,'재입학은 a회에 한하여 할 수 있다. 다만 제 28조제4호에 의하여 제적된 자는 ...,(A),,,
2,2,ewha,"[0.01136016845703125, -0.0199127197265625, -0....",학생이 소속 학과 또는 전공 이외의 전공 교과목을 총장이 정하는 바에 따라 몇학점...,학생이 소속 학과 또는 전공 이외의 전공 교과목을 총장이 정하는 바에 따라 몇학점 ...,(C),,,
3,3,ewha,"[0.029754638671875, -0.04205322265625, -0.0153...",다음 보기의 학생들 중 제적을 당하지 않는 사람을 고르면?\n,다음 보기의 학생들 중 제적을 당하지 않는 사람을 고르면?\n(A) 팜 : 징계에 ...,(D),,,
4,4,ewha,"[-0.0307464599609375, -0.04486083984375, -0.01...",2019학년도 휴먼기계바이오공학부의 입학 정원은 몇 명인가? \n,2019학년도 휴먼기계바이오공학부의 입학 정원은 몇 명인가? \n(A) 90명 \n...,(C),,,
5,5,ewha,"[-0.0005908012390136719, -0.0205078125, -0.032...",1980학년도 이전 입학생에 대하여 적용하는 등급에 따른 성적점으로 잘못 연결된 ...,1980학년도 이전 입학생에 대하여 적용하는 등급에 따른 성적점으로 잘못 연결된 것...,(B),,,
6,6,ewha,"[-5.805492401123047e-05, -0.0219268798828125, ...",사회체육학과 소속 학생에게 수여하는 학위는 무엇인가? \n,사회체육학과 소속 학생에게 수여하는 학위는 무엇인가? \n(A) 공학사 \n(B) ...,(D),,,
7,7,ewha,"[0.007785797119140625, -0.0120086669921875, -0...",복수전공 신청 자격에 해당하지 않는 것은? \n,복수전공 신청 자격에 해당하지 않는 것은? \n(A) 1학년을 마친 학생 \n(B)...,(C),,,
8,8,ewha,"[0.0002130270004272461, -0.03668212890625, -0....",이화여자대학교의 설립 정신은 무엇인가요?,이화여자대학교의 설립 정신은 무엇인가요? (A) 공산주의 이념 (B) 불교 정신 (...,(C),,,
9,9,ewha,"[0.0007071495056152344, -0.038818359375, -0.00...",이화여자대학교의 위치는 어디인가요?,이화여자대학교의 위치는 어디인가요? (A) 강남구 (B) 서대문구 (C) 종로구 (...,(B),,,


In [28]:
# Find the top 3 contexts for each question
nowtest = apply_top3_contexts(folder_path, nowtest)

On growing up.
Walter Benjamin, Martin Heidegger, Friedrich Nietzsche
Much as the study of the human brain benefits from a close examination of its
development, the study of the human psyche and thought process has much to gain from an
analysis of the thinker as he is growing up. It seems obvious, after all, that an individual‟s
perception should be affected by his experiences. Our very understanding of the objects
around us depends on our being able to name and categorize them, and to understand their
interrelation. We understand buildings, shelves, and water-glasses because we understand
gravity, and we understand complex social situations by having had experience with
people. Most importantly, through interacting with the world and forming memories of it,
we come to understand the boundaries of our mental and physical self. For that reason, a
child who has not yet learned the delimitations of his existence uses his surroundings to
gather meaning much as we use our memories.
Walter B

In [29]:
nowtest

Unnamed: 0,index,category,embed_ques,question,prompts,answers,top1,top2,top3
0,0,ewha,"[-0.0161285400390625, -0.00487518310546875, -0...",재학 중인 학생이 휴학을 하려면 학기 개시일로부터 며칠 이내에 휴학을 신청하야하나...,재학 중인 학생이 휴학을 하려면 학기 개시일로부터 며칠 이내에 휴학을 신청하야하나요...,(D),,,
1,1,ewha,"[-0.024200439453125, -0.0005092620849609375, -...",'재입학은 a회에 한하여 할 수 있다. 다만 제 28조제4호에 의하여 제적된 자는...,'재입학은 a회에 한하여 할 수 있다. 다만 제 28조제4호에 의하여 제적된 자는 ...,(A),,,
2,2,ewha,"[0.01136016845703125, -0.0199127197265625, -0....",학생이 소속 학과 또는 전공 이외의 전공 교과목을 총장이 정하는 바에 따라 몇학점...,학생이 소속 학과 또는 전공 이외의 전공 교과목을 총장이 정하는 바에 따라 몇학점 ...,(C),,,
3,3,ewha,"[0.029754638671875, -0.04205322265625, -0.0153...",다음 보기의 학생들 중 제적을 당하지 않는 사람을 고르면?\n,다음 보기의 학생들 중 제적을 당하지 않는 사람을 고르면?\n(A) 팜 : 징계에 ...,(D),,,
4,4,ewha,"[-0.0307464599609375, -0.04486083984375, -0.01...",2019학년도 휴먼기계바이오공학부의 입학 정원은 몇 명인가? \n,2019학년도 휴먼기계바이오공학부의 입학 정원은 몇 명인가? \n(A) 90명 \n...,(C),,,
5,5,ewha,"[-0.0005908012390136719, -0.0205078125, -0.032...",1980학년도 이전 입학생에 대하여 적용하는 등급에 따른 성적점으로 잘못 연결된 ...,1980학년도 이전 입학생에 대하여 적용하는 등급에 따른 성적점으로 잘못 연결된 것...,(B),,,
6,6,ewha,"[-5.805492401123047e-05, -0.0219268798828125, ...",사회체육학과 소속 학생에게 수여하는 학위는 무엇인가? \n,사회체육학과 소속 학생에게 수여하는 학위는 무엇인가? \n(A) 공학사 \n(B) ...,(D),,,
7,7,ewha,"[0.007785797119140625, -0.0120086669921875, -0...",복수전공 신청 자격에 해당하지 않는 것은? \n,복수전공 신청 자격에 해당하지 않는 것은? \n(A) 1학년을 마친 학생 \n(B)...,(C),,,
8,8,ewha,"[0.0002130270004272461, -0.03668212890625, -0....",이화여자대학교의 설립 정신은 무엇인가요?,이화여자대학교의 설립 정신은 무엇인가요? (A) 공산주의 이념 (B) 불교 정신 (...,(C),,,
9,9,ewha,"[0.0007071495056152344, -0.038818359375, -0.00...",이화여자대학교의 위치는 어디인가요?,이화여자대학교의 위치는 어디인가요? (A) 강남구 (B) 서대문구 (C) 종로구 (...,(B),,,


##### Receive the response from LLM(solar).

In [30]:
# Create a DataFrame to store the LLM predictions
try : del [[bagging_pred]]
except : pass
bagging_pred = pd.DataFrame(columns=['questionNum', 'answer', 'top1_1pred', 'top1_2pred', 'top1_3pred', 'top2pred', 'top3pred', 'predict'])

In [31]:
# Perform bagging for each question
for idx, row in nowtest.iterrows() :
    if row.category == "ewha" :
        bagging_pred = bagging_ewha(row, ewha_qa_chain, bagging_pred)
    else : 
        bagging_pred = bagging_mmlu(row, mmlu_qa_chain, bagging_pred)

재학 중인 학생이 휴학을 하려면 학기 개시일로부터 며칠 이내에 휴학을 신청하야하나요?
(A) 30일
(B) 45일 
(C) 60일
(D) 90일


  answer = qa_chain.run(query=question).strip()  # Pass 'query' key


'재입학은 a회에 한하여 할 수 있다. 다만 제 28조제4호에 의하여 제적된 자는 제적된 날부터 b년이 경과한 후 재입학 할 수 있다.' a와 b가 상수일 때 a+b의 값을 구하면?
(A) 2
(B) 3
(C) 4
(D) A,B,C 중 답 없음
학생이 소속 학과 또는 전공 이외의 전공 교과목을 총장이 정하는 바에 따라 몇학점 이상 취득한 때에 부전공을 이수한것으로 인정받을 수 있는가?
(A) 15학점
(B) 18학점
(C) 21학점
(D) 25학점
다음 보기의 학생들 중 제적을 당하지 않는 사람을 고르면?
(A) 팜 : 징계에 의해 퇴학처분을 받았음 
(B) 엘모 : 성적이 평점평균 1.2 로 학사경고를 연속 3회 받았음
(C) 라마 : 수업료 기타 납입금을 소정 기일 내 납입하지 못하였음
(D) 로라 : 휴학기간 경과 후 3주가 지났으나 갑자기 큰 사고가 났다는 정당한 이유 때문에 복학하지 못하였음
2019학년도 휴먼기계바이오공학부의 입학 정원은 몇 명인가? 
(A) 90명 
(B) 100명 
(C) 110명 
(D) 120명
1980학년도 이전 입학생에 대하여 적용하는 등급에 따른 성적점으로 잘못 연결된 것은 무엇인가? 
(A) 등급: A+, 성적점: 4 
(B) 등급: A-: 성적점: 3.5 
(C) 등급: B+, 성적점: 3 
(D) 등급: C, 성적점: 2
사회체육학과 소속 학생에게 수여하는 학위는 무엇인가? 
(A) 공학사 
(B) 문학사 
(C) 사회학사 
(D) 이학사
복수전공 신청 자격에 해당하지 않는 것은? 
(A) 1학년을 마친 학생 
(B) 평균 평점이 2.50 이상인 학생 
(C) 졸업 직전 학기에 있는 학생 
(D) 재학생 신분인 경우
이화여자대학교의 설립 정신은 무엇인가요? (A) 공산주의 이념 (B) 불교 정신 (C) 기독교 정신 (D) 자유주의 사상
이화여자대학교의 위치는 어디인가요? (A) 강남구 (B) 서대문구 (C) 종로구 (D) 송파구
학점 기준에 따르면 1학점당 수업 시간은 몇 시간 이상이어야 하나요? (A) 10시간 (

In [32]:
bagging_pred

Unnamed: 0,questionNum,answer,top1_1pred,top1_2pred,top1_3pred,top2pred,top3pred,predict
0,재학 중인 학생이 휴학을 하려면 학기 개시일로부터 며칠 이내에 휴학을 신청하야하나...,(D),D,D,D,D,D,
1,'재입학은 a회에 한하여 할 수 있다. 다만 제 28조제4호에 의하여 제적된 자는...,(A),C,B,B,B,B,
2,학생이 소속 학과 또는 전공 이외의 전공 교과목을 총장이 정하는 바에 따라 몇학점...,(C),C,C,C,C,C,
3,다음 보기의 학생들 중 제적을 당하지 않는 사람을 고르면?\n,(D),D,D,D,D,D,
4,2019학년도 휴먼기계바이오공학부의 입학 정원은 몇 명인가? \n,(C),C,C,C,C,C,
5,1980학년도 이전 입학생에 대하여 적용하는 등급에 따른 성적점으로 잘못 연결된 ...,(B),B,B,B,B,B,
6,사회체육학과 소속 학생에게 수여하는 학위는 무엇인가? \n,(D),B,B,B,B,B,
7,복수전공 신청 자격에 해당하지 않는 것은? \n,(C),C,C,C,C,C,
8,이화여자대학교의 설립 정신은 무엇인가요?,(C),C,C,C,C,C,
9,이화여자대학교의 위치는 어디인가요?,(B),B,B,B,B,B,


##### Derive the Final Prediction from the bagging results and check the accuracy.

In [33]:
final_pred = []
for idx, row in bagging_pred.iterrows() :

    pred = []
    pred.append(row.top1_1pred)
    pred.append(row.top1_2pred)
    pred.append(row.top1_3pred)
    pred.append(row.top2pred)
    pred.append(row.top3pred)

    counts = Counter(pred)

    prediction = counts.most_common(1)[0][0]
    if prediction == None :
        try : prediction = counts.most_common(2)[1][0]
        except : pass
    if prediction == None : prediction = 'A'
    final_pred.append(prediction)
    bagging_pred.loc[idx, 'predict'] = prediction

print(final_pred)

['D', 'B', 'C', 'D', 'C', 'B', 'B', 'C', 'C', 'B', 'B', 'B', 'C', 'C', 'A', 'B', 'B', 'D', 'D', 'B', 'B', 'A', 'D', 'D', 'B', 'D', 'A', 'A', 'B', 'E', 'B', 'C', 'B', 'G', 'E', 'B', 'B', 'D', 'B', 'D', 'E', 'D', 'C', 'I', 'D', 'I', 'G', 'E', 'F', 'B']


  bagging_pred.loc[idx, 'predict'] = prediction


In [35]:
# Check the answers and extract the wrong ones. Store the results in a bagging_pred(dataframe)
cnt = 0
wrong = []
for idx, (answer, response) in enumerate(zip(answers, final_pred)):
    print("-"*10)
    generated_answer = extract_answer(response)
    print(response)
    # check
    if generated_answer:
        print(f"idx: {idx} | generated answer: {generated_answer}, answer: {answer}")
    else:
        print("extraction fail")
    if generated_answer == None:
        wrong.append(idx+1)
        bagging_pred.loc[idx, 'iswrong'] = '-'
        continue
    
    if generated_answer in answer:
        cnt += 1
    else : 
        wrong.append(idx+1)
        bagging_pred.loc[idx, 'iswrong'] = '-'

# Calculate Accuracy        
acc = cnt/len(answers)*100
print(f"acc: {acc}%")
print()
print("wrong:", wrong)
bagging_pred.loc[len(bagging_pred), 'predict'] = acc

----------
D
idx: 0 | generated answer: D, answer: (D)
----------
B
idx: 1 | generated answer: B, answer: (A)
----------
C
idx: 2 | generated answer: C, answer: (C)
----------
D
idx: 3 | generated answer: D, answer: (D)
----------
C
idx: 4 | generated answer: C, answer: (C)
----------
B
idx: 5 | generated answer: B, answer: (B)
----------
B
idx: 6 | generated answer: B, answer: (D)
----------
C
idx: 7 | generated answer: C, answer: (C)
----------
C
idx: 8 | generated answer: C, answer: (C)
----------
B
idx: 9 | generated answer: B, answer: (B)
----------
B
idx: 10 | generated answer: B, answer: (B)
----------
B
idx: 11 | generated answer: B, answer: (B)
----------
C
idx: 12 | generated answer: C, answer: (C)
----------
C
idx: 13 | generated answer: C, answer: (C)
----------
A
idx: 14 | generated answer: A, answer: (A)
----------
B
idx: 15 | generated answer: B, answer: (B)
----------
B
idx: 16 | generated answer: B, answer: (B)
----------
D
idx: 17 | generated answer: D, answer: (D)
--

In [36]:
bagging_pred

Unnamed: 0,questionNum,answer,top1_1pred,top1_2pred,top1_3pred,top2pred,top3pred,predict,iswrong
0,재학 중인 학생이 휴학을 하려면 학기 개시일로부터 며칠 이내에 휴학을 신청하야하나...,(D),D,D,D,D,D,D,
1,'재입학은 a회에 한하여 할 수 있다. 다만 제 28조제4호에 의하여 제적된 자는...,(A),C,B,B,B,B,B,-
2,학생이 소속 학과 또는 전공 이외의 전공 교과목을 총장이 정하는 바에 따라 몇학점...,(C),C,C,C,C,C,C,
3,다음 보기의 학생들 중 제적을 당하지 않는 사람을 고르면?\n,(D),D,D,D,D,D,D,
4,2019학년도 휴먼기계바이오공학부의 입학 정원은 몇 명인가? \n,(C),C,C,C,C,C,C,
5,1980학년도 이전 입학생에 대하여 적용하는 등급에 따른 성적점으로 잘못 연결된 ...,(B),B,B,B,B,B,B,
6,사회체육학과 소속 학생에게 수여하는 학위는 무엇인가? \n,(D),B,B,B,B,B,B,-
7,복수전공 신청 자격에 해당하지 않는 것은? \n,(C),C,C,C,C,C,C,
8,이화여자대학교의 설립 정신은 무엇인가요?,(C),C,C,C,C,C,C,
9,이화여자대학교의 위치는 어디인가요?,(B),B,B,B,B,B,B,
