In [None]:
# set parameters
import os

# Upstage API key. Read from the UPSTAGE_API_KEY environment variable when it is
# set so the secret does not have to be saved in the notebook; otherwise it
# falls back to the empty string, which can be replaced with the key by hand.
api_key = os.environ.get("UPSTAGE_API_KEY", "")
# Folder containing ewha.pdf, testewha.csv and test_samples_MMLU-LAW.csv
data_path = "."

Converting pdf to text

In [None]:
# Disabled alternative: extract the PDF text locally with pdfplumber and wrap it
# in a single langchain Document. Kept commented out for reference only.
# import pdfplumber
# from langchain.docstore.document import Document  # Import Document from langchain

# # Load the PDF file
# pdf_path = "ewha.pdf"  # Replace with your file path
# with pdfplumber.open(pdf_path) as pdf:
#     pdf_text = ""
#     for page in pdf.pages:
#         pdf_text += page.extract_text()
# pdf_text = [Document(page_content=pdf_text)]
# print(pdf_text)


In [None]:
from langchain_upstage import UpstageLayoutAnalysisLoader
import os


# Layout-analysis loader for <data_path>/ewha.pdf, configured for text output.
layzer = UpstageLayoutAnalysisLoader(
    api_key=api_key,
    file_path=os.path.join(data_path, 'ewha.pdf'),
    output_type="text",
)

# Load the parsed document(s); layzer.lazy_load() is the alternative entry point.
pdf_text = layzer.load()

Split text into chunks

In [None]:
from langchain_text_splitters import (
    Language,
    RecursiveCharacterTextSplitter,
)

# 2. Split
# Chunks of size 1000 with an overlap of 100 between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

# `articles` is the list of chunks that every retrieval cell below searches.
articles = text_splitter.split_documents(pdf_text)
print("Splits:", len(articles))

Read the test data file

In [None]:
# read the testewha.csv file

import pandas as pd
import os

def read_data(data_path, limit=34):
    """Load the evaluation questions from a CSV file.

    Args:
        data_path: Path or file-like object accepted by ``pandas.read_csv``.
            The file must contain ``prompts``, ``answers`` and ``evidence``
            columns.
        limit: Number of leading rows to keep. Defaults to 34, the value that
            used to be hard-coded here; pass ``None`` to keep every row.

    Returns:
        A tuple ``(prompts, answers, evidences)`` of pandas Series holding the
        first ``limit`` rows of the respective columns.

    Raises:
        KeyError: If one of the three columns is missing.
    """
    data = pd.read_csv(data_path)
    # A positional slice keeps the original row labels, so downstream
    # ``answers[count]`` style lookups keep working.
    subset = data.iloc[:limit]
    return subset['prompts'], subset['answers'], subset['evidence']

# Load the questions, expected answers and evidence from testewha.csv.
prompts, answers, evidences = read_data(
    os.path.join(data_path, 'testewha.csv')
)

KLUE RoBERTa-base

In [None]:
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import torch

# Load a Korean language model
model_name = "klue/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)


def get_embedding(text):
    """Embed ``text`` with the KLUE RoBERTa model loaded in this cell.

    The text is tokenized with truncation enabled and the model's
    ``last_hidden_state`` is averaged over dim 1 (mean pooling).

    Args:
        text: The string to embed.

    Returns:
        A torch tensor with the mean-pooled hidden states. It carries no
        autograd history because the forward pass runs under ``no_grad``.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Inference only: without no_grad every call would keep an autograd graph
    # alive for as long as the returned embedding is referenced.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1)  # Mean pooling


contexts = []

# Compute embeddings for every chunk. The earlier `articles[:-1]` slice left
# the final chunk out, so it could never be retrieved.
article_embeddings = [get_embedding(article.page_content) for article in articles]

for count, question in enumerate(prompts, start=1):
    print(f"Processing question {count}")
    question_embedding = get_embedding(question).numpy()

    # Cosine similarity between the question and each chunk embedding
    similarities = [
        cosine_similarity(question_embedding, article_embedding.numpy())[0][0]
        for article_embedding in article_embeddings
    ]

    # Chunk indexes ordered from most to least similar
    sorted_indexes = sorted(range(len(similarities)), key=lambda i: similarities[i], reverse=True)

    # Take the top 3 chunks as context (fewer when fewer chunks exist)
    context = [articles[i].page_content for i in sorted_indexes[:3]]
    contexts.append(context)


Upstage embedding

In [None]:
from langchain_upstage import UpstageEmbeddings
 
# Embedding client for the Upstage API.
# NOTE(review): the "embedding-query" model is also used for the document
# embeddings below — confirm this is intended rather than a passage model.
embeddings = UpstageEmbeddings(api_key=api_key, model="embedding-query")

# Compute document embeddings: one vector per chunk, in the order of `articles`.
doc_result = embeddings.embed_documents(
    [chunk.page_content for chunk in articles]
)

In [None]:
import numpy as np

contexts = []

for prompt in prompts:
    query_result = embeddings.embed_query(prompt)

    # Dot product of the query embedding with every chunk embedding
    similarity_list = [
        np.dot(passage_embedding, query_result) for passage_embedding in doc_result
    ]

    # Chunk indexes ordered from highest to lowest similarity
    sorted_indexes = sorted(
        range(len(similarity_list)), key=lambda i: similarity_list[i], reverse=True
    )

    # Take the top 3 chunks as context (fewer when fewer chunks exist)
    context = [articles[i].page_content for i in sorted_indexes[:3]]
    contexts.append(context)

KoBERT

In [None]:
from kobert_transformers import get_kobert_model, get_tokenizer
import torch

# Load KoBERT tokenizer and model
# NOTE: this rebinds the global `tokenizer` and `model` names that the
# KLUE RoBERTa cell also assigns, so whichever cell ran last wins.
tokenizer = get_tokenizer()
model = get_kobert_model()

def embed_text(text, tokenizer, model):
    """
    Return a mean-pooled embedding of ``text``.

    Args:
        text: The text to embed.
        tokenizer: Callable that turns ``text`` into keyword inputs for ``model``;
            called with ``return_tensors="pt"``, padding, truncation and
            ``max_length=512``.
        model: Callable whose result exposes ``last_hidden_state``.

    Returns:
        ``last_hidden_state`` averaged over dim 1, computed without gradient
        tracking.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Forward pass only; no autograd graph is recorded.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # Mean pooling over dim 1
    return hidden_states.mean(dim=1)


# Embed articles: one row per chunk, stacked in the order of `articles`
article_embeddings = torch.vstack([embed_text(article.page_content, tokenizer, model) for article in articles])
contexts = []

for prompt in prompts:
    # Embed question
    question_embedding = embed_text(prompt, tokenizer, model)

    # Compute cosine similarity between question and articles
    cosine_sim = torch.nn.functional.cosine_similarity(article_embeddings, question_embedding)

    # Convert to plain floats once so sorting does not index the tensor on
    # every comparison.
    values = cosine_sim.tolist()
    # Get the indexes sorted by values in descending order
    sorted_indexes = sorted(range(len(values)), key=lambda i: values[i], reverse=True)

    # Take the top 3 chunks as context (fewer when fewer chunks exist)
    context = [articles[i].page_content for i in sorted_indexes[:3]]
    contexts.append(context)

Extract keywords and match using Okt

In [None]:
from konlpy.tag import  Okt

# Initialize a tagger (Okt); later cells reuse this `okt` instance
okt = Okt()

# Example question: the first prompt of the loaded test set
korean_question = prompts[0]

# Tokenize and extract nouns (keywords)
okt_keywords = okt.nouns(korean_question)

print(korean_question)
print("Okt Keywords:", okt_keywords)


In [None]:
# Extract keywords from each article
article_keywords = [okt.nouns(article.page_content) for article in articles]
# Build the keyword sets once instead of once per prompt
article_keyword_sets = [set(keywords) for keywords in article_keywords]
contexts = []

for prompt in prompts:
    # Nouns of the question, as a set for easy comparison
    question_keywords = set(okt.nouns(prompt))

    # Relevance score = number of keywords shared by the question and the chunk
    relevance_scores = [
        len(question_keywords & keyword_set) for keyword_set in article_keyword_sets
    ]

    # Chunk indexes ordered from highest to lowest score
    top_indexes = sorted(
        range(len(relevance_scores)), key=lambda i: relevance_scores[i], reverse=True
    )

    # Take the top 3 chunks as context (fewer when fewer chunks exist)
    context = [articles[i].page_content for i in top_indexes[:3]]
    contexts.append(context)



Testing

In [None]:
from langchain_core.prompts import PromptTemplate
from langchain_upstage import ChatUpstage


llm = ChatUpstage(api_key = api_key, model="solar-1-mini-chat")

prompt_template = PromptTemplate.from_template(
    """
    Please provide most correct answer from the following context. Select an option from the choices provided and answer with a single letter.
    If the answer is not present in the context, please write "The information is not present in the context."
    ---
    Question: {question}
    ---
    Context: {context}
    """
)
chain = prompt_template | llm

# `contexts` is produced by whichever retrieval cell ran last. Refuse to run
# against a list that does not line up with the current prompts, instead of
# silently pairing questions with stale contexts.
if len(contexts) != len(prompts):
    raise ValueError(
        f"Expected one context per prompt, got {len(contexts)} contexts for {len(prompts)} prompts; "
        "re-run a retrieval cell first."
    )

responses = []

for number, (prompt, context) in enumerate(zip(prompts, contexts), start=1):
    print(f"Processing question {number}")
    # The retrieved chunks are joined with newlines into one context string
    response = chain.invoke({"question": prompt, "context": "\n".join(context)})
    responses.append(response.content)

In [None]:
# Show each model response next to the expected answer.
for count, response in enumerate(responses):
    print(f"Question {count+1} : {response} \t Correct answer: {answers[count]}")

In [None]:
# Detailed report: response, expected answer, evidence, the question and the
# three retrieved context chunks.
for count, response in enumerate(responses):
    print(f"Question {count+1} : {response} \t Correct answer: {answers[count]} \t Context: {evidences[count]}")
    print(f"Question: {prompts[count]}")
    print(f"Context 1: {contexts[count][0]}")
    print(f"Context 2: {contexts[count][1]}")
    print(f"Context 3: {contexts[count][2]}")

MMLU without context

In [None]:
# read the test_samples_MMLU-LAW.csv file

import pandas as pd
import os

def read_data(data_path):
    """Read the ``prompts`` and ``answers`` columns of a CSV file.

    Note: this rebinds the name ``read_data`` that the earlier three-column
    loader also uses.

    Args:
        data_path: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        A tuple ``(prompts, answers)`` of pandas Series.
    """
    frame = pd.read_csv(data_path)
    return frame['prompts'], frame['answers']

# Replace the Ewha prompts/answers with the MMLU-LAW test set.
prompts, answers = read_data(
    os.path.join(data_path, 'test_samples_MMLU-LAW.csv')
)

In [None]:
from langchain_core.messages import HumanMessage
from langchain_upstage import ChatUpstage


llm = ChatUpstage(api_key=api_key, model="solar-1-mini-chat")

responses = []
count = 1

print(f"Processing question {count}")
count += 1

# One hand-written multiple-choice question, sent as a single human message.
messages = [
    HumanMessage(content="QUESTION1) A woman was standing in the aisle of a subway car and put her purse on the seat next to her. A man approached the woman from behind and grabbed the purse off the seat. He then pushed the woman out of the way and ran out of the subway car while carrying the purse. The man was apprehended on the subway platform while in possession of the purse. In a jurisdiction that follows the common law with respect to criminal offenses, of what crime can the man properly be convicted? (A) Fraud, because he took the purse without the woman's consent. (B) Larceny, because he took the purse without the woman's permission. (C) Burglary, because he entered the subway car with the intention of committing a theft. (D) Robbery, because he used force in leaving with the purse. (E) Robbery, because he used force to take possession of the purse. (F) Robbery, because he used force to remove the woman from the purse's vicinity. (G) Larceny, because force was not used until after he took the purse. (H) Assault, because he pushed the woman out of the way. (I) Larceny, because he made no threat to use force. (J) Robbery, because he physically took the purse from the woman's presence.")
]

response = llm.invoke(messages)
responses.append(response.content)

In [None]:
# Display the first collected model response.
print(responses[0])

In [None]:
# Disabled earlier version of the no-context MMLU loop: it sends every prompt
# as a single HumanMessage prefixed with a short instruction.
# from langchain_core.messages import HumanMessage
# from langchain_upstage import ChatUpstage


# llm = ChatUpstage(api_key = api_key,model="solar-1-mini-chat")

# responses = []
# count = 1

# for prompt in prompts:
#     print(f"Processing question {count}")
#     count += 1
#     messages = [
#     HumanMessage(
#         content = "Please answer the following question by choosing the most appropriate option from the choices provided. Keep your answer within ten words" + prompt
#         )
#     ]

#     response = llm.invoke(messages)
#     responses.append(response.content)

In [None]:
from langchain_core.messages import HumanMessage
# PromptTemplate is used below; import it here so this cell does not depend on
# the earlier testing cell having been executed first.
from langchain_core.prompts import PromptTemplate
from langchain_upstage import ChatUpstage


llm = ChatUpstage(api_key = api_key,model="solar-1-mini-chat")


# Instruction prompt with one worked multiple-choice example and a fixed
# "Answer: (<answer>)" output format.
prompt_template = PromptTemplate.from_template(
    """
    Please provide the most correct answer from the following context. Understand the entire full sentence for completeness.
    Q: Who likes to eat chips? Context says that Ha, Lam, Bun like to eat chips. 
    (A): Ha 
    (B): Lam
    (C): Bun
    (D): All of the above
    A: (D)

    If the answer is not present in the context, please  provide a reasonable guess of the answer.

    Also, once you find the answer in the context, ignore the remainder of the context.

    The answer must look like this always! 
    Answer: (<answer>)
    ---
    Question: {question}
    ---
    Context: {context}
    """
)

chain = prompt_template | llm

responses = []

for number, prompt in enumerate(prompts, start=1):
    print(f"Processing question {number}")

    # No retrieved context in this section: the prompt itself is passed as the
    # context as well as the question.
    response = chain.invoke({"question": prompt, "context": prompt})
    responses.append(response.content)





In [None]:
# Show each model response next to the expected answer.
for count, response in enumerate(responses):
    print(f"Question {count+1} : {response} \t Correct answer: {answers[count]}")

Using spacy to extract keywords and wikipedia api to retrieve keyword summaries

In [None]:
import spacy

# Load spaCy's English model
nlp = spacy.load("en_core_web_sm")


def extract_keywords(question):
    """Return the text of every noun chunk spaCy finds in ``question``.

    Args:
        question: The question text to analyse with the global ``nlp`` pipeline.

    Returns:
        A list of strings, one per entry of ``doc.noun_chunks``.
    """
    parsed = nlp(question)
    return [noun_chunk.text for noun_chunk in parsed.noun_chunks]


# Example question
question = "According to social identity theory, an individual's self-concept is primarily derived from:"
keywords = extract_keywords(question)
print("Keywords:", keywords)


In [None]:
import wikipediaapi

# Create a Wikipedia API instance (user agent string, language code)
wiki = wikipediaapi.Wikipedia("NLP Project (yanrenyu00@gmail.com)", "en")

# Look up a single page by title
page_title = "social identity theory"
page = wiki.page(page_title)

# Report a missing page, otherwise print its title and summary
if not page.exists():
    print(f"The page '{page_title}' does not exist.")
else:
    print(f"Title: {page.title}")
    print(f"Summary: {page.summary}")


Building context from wikipedia

In [None]:
import spacy
import wikipediaapi

# Load spaCy's English model
nlp = spacy.load("en_core_web_sm")

# Create a Wikipedia API instance (user agent string, language code)
wiki = wikipediaapi.Wikipedia("NLP Project (yanrenyu00@gmail.com)", "en")


def extract_keywords(question):
    """Return the text of every noun chunk spaCy finds in ``question``.

    Args:
        question: The question text to analyse with the global ``nlp`` pipeline.

    Returns:
        A list of strings, one per entry of ``doc.noun_chunks``.
    """
    parsed = nlp(question)
    return [noun_chunk.text for noun_chunk in parsed.noun_chunks]

def _question_stem(prompt):
    """Return the question text located before the "(A)" option marker.

    Everything up to and including the first ")" that appears before "(A)"
    (e.g. a "QUESTION1)" prefix) is dropped. Returns ``None`` when the prompt
    has no "(A)" marker.
    """
    options_start = prompt.find('(A)')
    if options_start == -1:
        return None
    # Only look for the prefix ")" before the options. If there is none, find()
    # returns -1 and the slice starts at 0. The old `find(')') + 1` test could
    # never equal -1 and produced an empty stem whenever the first ")" was the
    # one inside "(A)".
    prefix_end = prompt.find(')', 0, options_start)
    return prompt[prefix_end + 1:options_start].strip()


contexts = []

# Build one list of Wikipedia summaries per prompt
for count, prompt in enumerate(prompts, start=1):

    # Remove the prefix and slice until (A); fall back to the whole prompt
    question = _question_stem(prompt)
    if question is None:
        question = prompt
    else:
        print(question)
    keywords = extract_keywords(question)
    print(f"Processing question {count}, keywords: {keywords}")

    context = []
    for keyword in keywords:
        # Each keyword is used verbatim as the page title
        page = wiki.page(keyword)

        # Keep the summary only when the page exists
        if page.exists():
            context.append(page.summary)
        else:
            print(f"Page does not exist for {keyword}")
    contexts.append(context)


Testing MMLU with context

In [None]:
from langchain_core.prompts import PromptTemplate
from langchain_upstage import ChatUpstage


llm = ChatUpstage(api_key = api_key, model="solar-mini")

prompt_template = PromptTemplate.from_template(
    """
    Please provide most correct answer from the following context. Select an option from the choices provided and answer with a single letter.
    Keep your answer within ten words.
    If the answer is not present in the context, please still try you best to answer it.
    ---
    Question: {question}
    ---
    Context: {context}
    """
)
chain = prompt_template | llm

# `contexts` is produced by whichever context-building cell ran last. Refuse to
# run against a list that does not line up with the current prompts, instead
# of silently pairing questions with stale contexts.
if len(contexts) != len(prompts):
    raise ValueError(
        f"Expected one context per prompt, got {len(contexts)} contexts for {len(prompts)} prompts; "
        "re-run the context-building cell first."
    )

responses = []

for number, (prompt, context) in enumerate(zip(prompts, contexts), start=1):
    print(f"Processing question {number}")
    # The collected summaries are joined with newlines into one context string
    response = chain.invoke({"question": prompt, "context": "\n".join(context)})
    responses.append(response.content)

In [None]:
# Show each model response next to the expected answer.
for count, response in enumerate(responses):
    print(f"Question {count+1} : {response} \t Correct answer: {answers[count]}")