<a target="_blank" href="https://colab.research.google.com/github/UpstageAI/cookbook/blob/main/cookbooks/upstage/Solar-Full-Stack LLM-101/05_3_OracleDB.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

# RAG + Wikipedia (for MMLU)

In [1]:
# set parameters

file = open("info/api.txt", "r")
api_key = file.read()
file.close()
file = open("info/path.txt", "r")
data_path = file.read()
file.close()
file = open("info/user.txt", "r")
user = file.read()
file.close()

import torch
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print('Device:', device)

Device: cuda:0


In [2]:
import wikipediaapi

# user는 내 아이디 같은 거
wiki_wiki = wikipediaapi.Wikipedia(f'{user}', 'en')

In [3]:
from langchain_upstage import UpstageEmbeddings

# 쿼리 전용 임베딩 모델
query_embeddings = UpstageEmbeddings(api_key=api_key, model="solar-embedding-1-large-query")
# 문장 전용 임베딩 모델
passage_embeddings = UpstageEmbeddings(api_key=api_key, model="solar-embedding-1-large-passage")

## 1. build DB (using Wikipida)

### what is the main topic & specific topic of query

In [4]:
# read mmlu_pro.csv file
import pandas as pd
import os

def read_data(data_path):
    data = pd.read_csv(data_path)
    prompts = data['prompts']
    answers = data['answers']
    # returns two lists: prompts and answers
    return prompts, answers

prompts, answers = read_data(os.path.join(data_path, 'mmlu_pro.csv'))

In [5]:
testdata = pd.read_csv(data_path+'mmlu_pro.csv')
testdata

Unnamed: 0,prompts,answers
0,QUESTION0) The symmetric group $S_n$ has $\n\f...,(A)
1,QUESTION1) Let V be the set of all real polyno...,(H)
2,QUESTION2) Let A be the set of all ordered pai...,(E)
3,QUESTION3) A tank initially contains a salt so...,(I)
4,QUESTION4) A total of 30 players will play bas...,(B)
...,...,...
12097,QUESTION12252) A hot mild steel rod is placed ...,(J)
12098,QUESTION12253) The cost of making the correct ...,(H)
12099,QUESTION12254) Consider the evaporation of liq...,(F)
12100,"QUESTION12255) Air (100°F, 1atm) is flowing at...",(I)


In [6]:
nowtest = pd.DataFrame(columns=['index', 'embed_ques', 'question', 'prompts', 'answers', 'topic_b', 'topic_s', 'b_wiki', 's_wiki'])

for index, row in testdata.iterrows():
    if index % 500 != 0 : continue # 일단 실험할 땐 500개 단위로 끊어서 가져옴
    q = row.prompts
    a = row.answers
    question = q.partition('(A)')[0]
    question = question.partition(') ')[2]
    q = q.partition(') ')[2]
    try : 
        embedded_query = query_embeddings.embed_query(question) # 질문만 받아와서 embedding 하기
        nowtest.loc[len(nowtest)] = {'index':index, 'embed_ques' : embedded_query, 'question' : question, 'prompts' : q, 'answers' : a, 'topic_b': None, 'topic_s': None, 'b_wiki': None, 's_wiki':None}

    except :
        print(f'pass: {index}')
        continue


In [7]:
nowtest

Unnamed: 0,index,embed_ques,question,prompts,answers,topic_b,topic_s,b_wiki,s_wiki
0,0,"[-0.013031005859375, -0.00452423095703125, -0....",The symmetric group $S_n$ has $\n\factorial{n}...,The symmetric group $S_n$ has $\n\factorial{n}...,(A),,,,
1,500,"[-0.006580352783203125, 0.0008025169372558594,...",________________reflect a purchaser's high lev...,________________reflect a purchaser's high lev...,(G),,,,
2,1000,"[-0.018035888671875, -0.00658416748046875, -0....",A developer is the owner of a parcel of land i...,A developer is the owner of a parcel of land i...,(B),,,,
3,1500,"[-0.01137542724609375, -0.002910614013671875, ...","A defendant, a nurse at a nursing home, is cha...","A defendant, a nurse at a nursing home, is cha...",(I),,,,
4,2000,"[-0.00925445556640625, -0.036468505859375, -0....","In time-out, a disruptive child who wants to s...","In time-out, a disruptive child who wants to s...",(G),,,,
5,2500,"[-0.0265960693359375, -0.005367279052734375, 0...",Research has shown a possible connection betwe...,Research has shown a possible connection betwe...,(E),,,,
6,3000,"[-0.0291748046875, 0.0171051025390625, 0.00342...",The nucleotide sequence of a DNA molecule is 5...,The nucleotide sequence of a DNA molecule is 5...,(F),,,,
7,3500,"[-0.01071929931640625, -0.02410888671875, -0.0...",The $J=2$ to 3 rotational transition in a cert...,The $J=2$ to 3 rotational transition in a cert...,(B),,,,
8,4000,"[-0.03466796875, -0.01006317138671875, -0.0077...","For a certain liquid which obeysTrouton'srule,...","For a certain liquid which obeysTrouton'srule,...",(D),,,,
9,4500,"[0.00843048095703125, -0.0082244873046875, -0....",A gas in two dimensions is enclosed in a recta...,A gas in two dimensions is enclosed in a recta...,(D),,,,


In [8]:
nowtest.question[1]

"________________reflect a purchaser's high level of involvement in the purchase decision. There is high perceived risk in these decisions so consumers spend a great deal of time, care, and energy searching, formulating, and making the final decision.\n"

### 질문 주고, topic 뽑아내기 (using Solar) 
zero-shot으로 노선 변경

In [9]:
from langchain_core.prompts import PromptTemplate
from langchain_upstage import ChatUpstage


llm = ChatUpstage(api_key = api_key)

prompt_template = PromptTemplate.from_template(
    '''
    
    Answer according to the conditions : 
    1. Return a broad topic for the given [Sentence].
    2. Respond with ONLY ONE English Word that correspond to [Answer].
    3. DO NOT response ANY OTHER CHARACTERS.

    [Sentence] {question}
    The broad topic is

    '''

)
chain_broad = prompt_template | llm


broad_topics = []
for idx, row in nowtest.iterrows() : ############### 일단 지금은 100개 단위로 띄엄띄엄 test 중
    response = chain_broad.invoke({"question": row.question}) # 선지 전까지 받아오기
    broad_topics.append(response.content)
    nowtest.loc[idx, 'topic_b'] = response.content

broad_topics

['Algebra',
 'Decision Making',
 'Real Estate Law',
 'Evidence',
 'Discipline',
 'Mental Disorders',
 'Biology',
 'Spectroscopy',
 'Chemistry',
 'Physics',
 'Gender',
 'Decision-making',
 'Future\n\nThe houses of the future may allow you to operate virtually everything in the house from a central terminal; such a house is already in production and is referred to as',
 'Disease.',
 'Economics',
 'Economics',
 'Health',
 'Calculus',
 'Food',
 'Nuclear Physics',
 'Atmosphere',
 'Cryptography',
 'Empathy',
 'Communication',
 'Physics']

In [10]:
from langchain_core.prompts import PromptTemplate
from langchain_upstage import ChatUpstage


llm = ChatUpstage(api_key = api_key)

prompt_template = PromptTemplate.from_template(
    '''
    
    Answer according to the conditions :
    1. Return a specific topic for the given [Sentence].
    2. Respond with ONLY ONE English Word that correspond to [Answer].
    3. DO NOT response ANY OTHER CHARACTERS.

    [Sentence] {question}
    The specific topic is

    '''

)
chain_speci = prompt_template | llm


specific_topics = []

for idx, row in nowtest.iterrows() : ############### 일단 지금은 100개 단위로 띄엄띄엄 test 중
    response = chain_broad.invoke({"question": row.question}) # 선지 전까지 받아오기
    specific_topics.append(response.content)
    nowtest.loc[idx, 'topic_s'] = response.content

specific_topics

['Mathematics',
 'Decision Making',
 'RealEstate',
 'Legal',
 'Discipline',
 'Mental Disorders',
 'Biology',
 'Spectroscopy',
 'thermodynamics',
 'Physics',
 'Gender',
 'Locational',
 '\nSmart Home',
 'Disease',
 'Economics',
 'Economy',
 'health',
 'Calculus',
 'radiation',
 'Nuclear.',
 'Physics',
 'Security',
 'Empathy',
 'Communication',
 'Physics']

In [11]:
nowtest

Unnamed: 0,index,embed_ques,question,prompts,answers,topic_b,topic_s,b_wiki,s_wiki
0,0,"[-0.013031005859375, -0.00452423095703125, -0....",The symmetric group $S_n$ has $\n\factorial{n}...,The symmetric group $S_n$ has $\n\factorial{n}...,(A),Algebra,Mathematics,,
1,500,"[-0.006580352783203125, 0.0008025169372558594,...",________________reflect a purchaser's high lev...,________________reflect a purchaser's high lev...,(G),Decision Making,Decision Making,,
2,1000,"[-0.018035888671875, -0.00658416748046875, -0....",A developer is the owner of a parcel of land i...,A developer is the owner of a parcel of land i...,(B),Real Estate Law,RealEstate,,
3,1500,"[-0.01137542724609375, -0.002910614013671875, ...","A defendant, a nurse at a nursing home, is cha...","A defendant, a nurse at a nursing home, is cha...",(I),Evidence,Legal,,
4,2000,"[-0.00925445556640625, -0.036468505859375, -0....","In time-out, a disruptive child who wants to s...","In time-out, a disruptive child who wants to s...",(G),Discipline,Discipline,,
5,2500,"[-0.0265960693359375, -0.005367279052734375, 0...",Research has shown a possible connection betwe...,Research has shown a possible connection betwe...,(E),Mental Disorders,Mental Disorders,,
6,3000,"[-0.0291748046875, 0.0171051025390625, 0.00342...",The nucleotide sequence of a DNA molecule is 5...,The nucleotide sequence of a DNA molecule is 5...,(F),Biology,Biology,,
7,3500,"[-0.01071929931640625, -0.02410888671875, -0.0...",The $J=2$ to 3 rotational transition in a cert...,The $J=2$ to 3 rotational transition in a cert...,(B),Spectroscopy,Spectroscopy,,
8,4000,"[-0.03466796875, -0.01006317138671875, -0.0077...","For a certain liquid which obeysTrouton'srule,...","For a certain liquid which obeysTrouton'srule,...",(D),Chemistry,thermodynamics,,
9,4500,"[0.00843048095703125, -0.0082244873046875, -0....",A gas in two dimensions is enclosed in a recta...,A gas in two dimensions is enclosed in a recta...,(D),Physics,Physics,,


### 뽑아둔 topic으로 wikipedia 검색해서, split 하기

In [None]:
import re
from langchain_text_splitters import (Language, RecursiveCharacterTextSplitter,)
from langchain.schema import Document
import numpy as np


# Split **** (hyper param) **********************
text_splitter = RecursiveCharacterTextSplitter.from_language(
    chunk_size=5000, chunk_overlap=1000, language=Language.HTML)



for idx, row in nowtest.iterrows() :

    broad = row['topic_b']
    broad = re.sub(r'\s{2,}|\n', '', broad) # 띄어쓰기가 2개 이상 또는 엔터 없애기
    broad = re.sub(r'[^a-zA-Z\s_]', '', broad) # 얘네들 남기기
    embed_ques = row['embed_ques']

    ########### broad ###########
    # 해당 topic에 해당하는 Wikipedia가 있는지 확인 + title, 본문 길이 출력
    wiki_broad = wiki_wiki.page(f'{broad}')
    print(f"Broad | query:{idx} {wiki_broad.exists()} | Title: {wiki_broad.title} | {len(wiki_broad.text)}")

    if wiki_broad.exists() is False : continue

    broad_docs = wiki_broad.text
    broad_docs = Document(page_content=broad_docs)

    # Split **** (hyper param)
    broad_chunk = text_splitter.split_documents([broad_docs]) ## 여기에 chunking된 애들이 저장됨
    print("Broad Splits:", len(broad_chunk)) 
    print(type(broad_chunk[0])) #????????????????????????????

    chunk = []
    for index in range(0,len(broad_chunk)) : # context 받아오기
        chunk.append(broad_chunk[index]['page_content']) #????????????????
    print() 

    chunks = passage_embeddings.embed_documents(chunk)
    #유사도 기준 내림차순 정렬
    sorted_idx = (np.array(embed_ques) @ np.array(chunks).T).argsort()[::-1]
    nowtest.at[idx, 'b_wiki'] = chunk[sorted_idx[0]].page_content # 임베딩 말고 원문 저장 // 일단 지금은 유사도 1등만 넣어둠





for idx, row in nowtest.iterrows() :
    broad, specific = row['topic_b'], row['topic_s']

    broad = re.sub(r'\s{2,}|\n', '', broad) # 띄어쓰기가 2개 이상 또는 엔터 없애기
    broad = re.sub(r'[^a-zA-Z\s_]', '', broad) # 얘네들 남기기
    specific = re.sub(r'\s{2,}|\n', '', specific) # 띄어쓰기가 2개 이상이면 없애기    
    specific = re.sub(r'[^a-zA-Z\s_]', '', specific)
    embed_ques = row['embed_ques']

    ######### specific ############# 
    if broad.lower() == specific.lower() :
        print('--same as broad--')
        continue

    specific = re.sub(r'\s{2,}|\n', '', specific) # 띄어쓰기가 2개 이상이면 없애기    
    specific = re.sub(r'[^a-zA-Z\s_]', '', specific)
    wiki_speci = wiki_wiki.page(f'{specific}')
    print(f"Speci | query:{idx} {wiki_speci.exists()} | Title: {wiki_speci.title} | {len(wiki_speci.text)}")

    if wiki_speci.exists() is False : continue

    # 받아온 Wikipedia를 split함수에 적용할 수 있도록 변환
    speci_docs = wiki_speci.text
    speci_docs = Document(page_content=speci_docs)

    # Split **** (hyper param)
    speci_chunk = text_splitter.split_documents([speci_docs]) ## 여기에도 저장됨
    print("Speci Splits:", len(speci_chunk))

    chunk = []
    for index in range(0,len(speci_chunk)) : # context 받아오기
        chunk.append(speci_chunk[index].page_content)


    chunks = passage_embeddings.embed_documents(chunk)
    #유사도 기준 내림차순 정렬
    sorted_idx = (np.array(embed_ques) @ np.array(chunks).T).argsort()[::-1]
    nowtest.at[idx, 's_wiki'] = chunk[sorted_idx[0]].page_content
    
    #########################




    

Broad | query:0 True | Title: Algebra | 54943
Broad Splits: 14
<class 'langchain_core.documents.base.Document'>


TypeError: 'Document' object is not subscriptable

In [13]:
nowtest

Unnamed: 0,index,embed_ques,question,prompts,answers,topic_b,topic_s,b_wiki,s_wiki
0,0,"[-0.0130157470703125, -0.004535675048828125, -...",The symmetric group $S_n$ has $\n\factorial{n}...,The symmetric group $S_n$ has $\n\factorial{n}...,(A),Algebra,Algebra,ther with the operation of addition. The neutr...,
1,1,"[-0.00699615478515625, -0.038055419921875, -0....",Let V be the set of all real polynomials p(x)....,Let V be the set of all real polynomials p(x)....,(H),Mathematics,Mathematics,"d analytic geometry, which uses coordinates sy...",
2,2,"[0.0017080307006835938, 0.0035076141357421875,...",Let A be the set of all ordered pairs of integ...,Let A be the set of all ordered pairs of integ...,(E),Mathematics,Mathematics,haustion to calculate the area under the arc o...,
3,3,"[-0.0105438232421875, 0.0045623779296875, -0.0...",A tank initially contains a salt solution of 3...,A tank initially contains a salt solution of 3...,(I),Chemical_Engineering,Chemical_Engineering,,be part of every degree course that it accredi...
4,4,"[-0.00896453857421875, -0.0262908935546875, -0...",A total of 30 players will play basketball at ...,A total of 30 players will play basketball at ...,(B),Math,Math,d Germany. The oldest journal addressing instr...,
...,...,...,...,...,...,...,...,...,...
12096,12097,"[-0.00893402099609375, -0.002071380615234375, ...",A hot mild steel rod is placed in a carbonaceo...,A hot mild steel rod is placed in a carbonaceo...,(J),Diffusion,Diffusion,)\n ]\n .\n \n \n {\d...,
12097,12098,"[0.01129913330078125, -0.01090240478515625, -0...",The cost of making the correct decisions for t...,The cost of making the correct decisions for t...,(H),Decision,Decision,Decision may refer to:\n\nLaw and politics\nJu...,
12098,12099,"[-0.02032470703125, -0.0207061767578125, -0.01...",Consider the evaporation of liquid ammonia int...,Consider the evaporation of liquid ammonia int...,(F),Heat,HeatTransfer.,adiabatic work in terms of the statistical dis...,
12099,12100,"[-0.01406097412109375, -0.01276397705078125, -...","Air (100°F, 1atm) is flowing at a velocity of ...","Air (100°F, 1atm) is flowing at a velocity of ...",(I),Heat,HeatTransfer,"rsibility, then there is entropy production, w...",


In [16]:
nowtest.to_csv(data_path+'full_broad_test.csv', index=False)

In [None]:
from langchain_core.prompts import PromptTemplate
from langchain_upstage import ChatUpstage


llm = ChatUpstage(api_key = api_key)

prompt_template = PromptTemplate.from_template(
    '''
    
    Please provide most correct answer from the following context.
    If the answer is not present in the context, please Answer "The information is NOT present in the context." with your own answer.
    
    The Answer format is as follow (do not explain) : 
    Answer: (D) keyword.
    ---
    
    Question: {question}
    Context: {context}
    Answer :
    ---
        
    '''

)
chain = prompt_template | llm


In [None]:
# QA
responses = []
for idx, row in nowtest.iterrows() :
    prompts, b_wiki, s_wiki = row['prompts'], row['b_wiki'], row['s_wiki']


    response = chain.invoke({"question": prompts, "context": b_wiki})
    responses.append(response.content)

In [None]:
responses

['The characteristic of the ring 2Z is 2.\n\nAnswer: (G) 2.',
 'Answer: (D) $95.46.',
 'Answer: (G) A theoretical model.',
 'The information is NOT present in the context.\n\nAnswer: (D) 6.',
 'Answer: (D) 0.6t^3 + 60t^2 + 300t + 100.',
 'Answer: (G) Durable goods.',
 'To cover the floor with linoleum, you need to calculate the total length of linoleum required. The floor dimensions are 7\'6" × 11\'8", which is 87.5 inches × 140 inches. You need to add the width of the linoleum (6 feet or 72 inches) to the length of the floor to determine the total length of linoleum needed.\n\nTotal length = 87.5 inches + 140 inches + 72 inches = 299.5 inches\n\nNow, you need to convert the total length to feet and then calculate the cost.\n\nTotal length in feet = 299.5 inches / 12 = 24.96 feet\n\nCost = Total length in feet × Price per running foot\n\nCost = 24.96 feet × $1.79 per running foot = $45.02\n\nThe best way to cover the floor is to purchase 24.96 feet of linoleum, and the cost will be app

### Groundedness check

In [None]:
import os
from langchain_upstage import UpstageGroundednessCheck
 
os.environ["UPSTAGE_API_KEY"] = api_key
 
groundedness_check = UpstageGroundednessCheck()
 
request_input = {
    "context": f"{}",
    "answer": f"{}}",
}
response = groundedness_check.invoke(request_input)
print(response)

SyntaxError: f-string: empty expression not allowed (2738544965.py, line 9)

In [None]:
answers = nowtest.answers
answers

0      (A)
1      (E)
2      (J)
3      (H)
4      (J)
      ... 
117    (D)
118    (I)
119    (D)
120    (I)
121    (I)
Name: answers, Length: 122, dtype: object

## Check Accuracy

In [None]:
# funcion to extract an answer from response

import re

def extract_answer(response):
    """
    extracts the answer from the response using a regular expression.
    expected format: "[ANSWER]: (A) convolutional networks"

    if there are any answers formatted like the format, it returns None.
    """
    pattern = r"\[ANSWER\]:\s*\((A|B|C|D|E)\)"  # Regular expression to capture the answer letter and text
    match = re.search(pattern, response)

    if match:
        return match.group(1) # Extract the letter inside parentheses (e.g., A)
    else:
        return extract_again(response)

def extract_again(response):
    pattern = r"\b[A-J]\b(?!.*\b[A-J]\b)"
    match = re.search(pattern, response)
    if match:
        return match.group(0)
    else:
        return None

In [None]:
# print accuracy

cnt = 0

for answer, response in zip(answers, responses):
    print("-"*10)
    generated_answer = extract_answer(response)
    print(response)
    # check
    if generated_answer:
        print(f"generated answer: {generated_answer}, answer: {answer}")
    else:
        print("extraction fail")


    if generated_answer == None:
        continue
    if generated_answer in answer:
        cnt += 1

print()
print(f"acc: {(cnt/len(answers))*100}%")

----------
The characteristic of the ring 2Z is 2.

Answer: (G) 2.
generated answer: G, answer: (A)
----------
Answer: (D) $95.46.
generated answer: D, answer: (E)
----------
Answer: (G) A theoretical model.
generated answer: A, answer: (J)
----------
The information is NOT present in the context.

Answer: (D) 6.
generated answer: D, answer: (H)
----------
Answer: (D) 0.6t^3 + 60t^2 + 300t + 100.
generated answer: D, answer: (J)
----------
Answer: (G) Durable goods.
generated answer: G, answer: (G)
----------
To cover the floor with linoleum, you need to calculate the total length of linoleum required. The floor dimensions are 7'6" × 11'8", which is 87.5 inches × 140 inches. You need to add the width of the linoleum (6 feet or 72 inches) to the length of the floor to determine the total length of linoleum needed.

Total length = 87.5 inches + 140 inches + 72 inches = 299.5 inches

Now, you need to convert the total length to feet and then calculate the cost.

Total length in feet = 299