# Evaluating LLMs' Performance in Answering RD Exam Prep Questions
This notebook provides code examples to get the questions from the database and ask the questions in batch to the LLMs. 
The LLMs evaluated in this project are:
- GPT 4o
- Gemini 1.5 Pro
- Claude 3.5 - Sonnet

## Setup
Before running the rest of this notebook, you'll need to run the cells below to ensure the necessary libraries are installed. 


In [None]:
# Uncomment and run the lines below once to install the packages this notebook imports.
#%pip install openai
#%pip install boto3
#%pip install -q -U google-generativeai
#%pip install tenacity
#%pip install pymysql
#%pip install sqlalchemy
#%pip install pandas
# The `sklearn` import is distributed on PyPI as `scikit-learn`.
#%pip install scikit-learn

You will need to store your environment variables in the .env file.
Here is the sample .env file:

In [None]:
# Sample .env contents: replace each <...> placeholder with your own value.
# These lines belong in the .env file; they are not Python and are not meant to be run as a cell.
MYSQL_USER=<Your_MySQL_User>
MYSQL_PASSWORD=<Your_MySQL_Password>
MYSQL_HOST=<Your_MySQL_Host>
MYSQL_PORT=<Your_MySQL_Port>
DB_NAME=<Your_Database_Name>
OPENAI_API_KEY=<Your_OPENAI_API_Key>
GEMINI_API_KEY=<Your_GEMINI_API_Key>
GOOGLE_CLOUD_PROJECT=<Your_Google_Cloud_Project_ID>

You will also need to set up AWS access. Please follow the instructions here: https://boto3.amazonaws.com/v1/documentation/api/latest/guide/quickstart.html


## Connect to the LLMs

### Connect to OpenAI API for GPT 4o to Answer RD Exam Questions Using Zero-shot and CoT Prompting

In [None]:
import openai_api
import questions_mysql
import time

api = openai_api.OpenAIAPI()
qsql = questions_mysql.QuestionsMysql()
# Connect to RD Exam Questions
question_dict = qsql.get_rd_questions()

# To run 5 rounds
for i in range(1, 6):
    # One output file per round, named after the model queried in this cell
    # so it cannot collide with the output files of the other models.
    file_name = f'gpt_4o_exp{i}.txt'
    # Create (or empty) the file so a re-run of this cell starts from scratch.
    with open(file_name, 'w') as file:
        pass

    # Starts the timer
    start = time.time()

    # Prompt the questions in a batch of 1, you can adjust the question number in each batch
    for startIndex in range(1, len(question_dict) + 1, 1):
        # To use the zero-shot prompting instruction with the question
        prompt_str = qsql.get_no_explain_prompt_string(question_dict, startIndex, 1)
        # To use the CoT prompting instruction with the question
        # prompt_str = qsql.get_cot_prompt_string(question_dict, startIndex, 1)

        # Use the GPT 4o model and set temperature to 0.
        response = api.ask_chatgpt(prompt_str, "gpt-4o", 0)

        # Append the prompt and its response. Appending leaves the results
        # already on disk untouched if a write is interrupted, and avoids
        # re-reading the whole file for every question.
        with open(file_name, 'a') as file:
            file.write(prompt_str + "\n" + response + "\n\n\n")

    # Stop the timer once every question of the round has been answered.
    end = time.time()
    length = end - start  # Calculate the time used for a round
    print("Round", i, "took", length, "seconds.")

### Connect to Amazon Bedrock API for Claude 3.5 - Sonnet to Answer RD Exam Questions Using Zero-shot and CoT Prompting

In [None]:
import anthropic_bedrock_api
import questions_mysql
import time


api = anthropic_bedrock_api.AnthropicBedRockAPI()
qsql = questions_mysql.QuestionsMysql()
# Connect to RD Exam Questions
question_dict = qsql.get_rd_questions()

# To run 5 rounds
for i in range(1, 6):
    file_name = f'claude_exp{i}.txt'
    # Create (or empty) the file so a re-run of this cell starts from scratch.
    with open(file_name, 'w') as file:
        pass

    # Starts the timer
    start = time.time()

    # Prompt the questions in a batch of 1, you can adjust the question number in each batch
    for startIndex in range(1, len(question_dict) + 1, 1):
        # To use the zero-shot prompting instruction with the question
        prompt_str = qsql.get_no_explain_prompt_string(question_dict, startIndex, 1)
        # To use the CoT prompting instruction with the question
        # prompt_str = qsql.get_cot_prompt_string(question_dict, startIndex, 1)

        # Use the Claude 3.5 - Sonnet model and set temperature to 0.
        response = api.ask_claude(prompt_str, "anthropic.claude-3-5-sonnet-20240620-v1:0", 0)

        # Append the prompt and its response. Appending leaves the results
        # already on disk untouched if a write is interrupted, and avoids
        # re-reading the whole file for every question.
        with open(file_name, 'a') as file:
            file.write(prompt_str + "\n" + response + "\n\n\n")

    # Stop the timer once every question of the round has been answered.
    end = time.time()
    length = end - start  # Calculate the time used for a round
    print("Round", i, "took", length, "seconds.")

### Connect to Gemini API for Gemini 1.5 Pro to Answer RD Exam Questions Using Zero-shot and CoT Prompting

In [None]:
import gemini_api
import questions_mysql
import time

api = gemini_api.GeminiAIAPI()
qsql = questions_mysql.QuestionsMysql()
# Connect to RD Exam Questions
question_dict = qsql.get_rd_questions()

# To run 5 rounds
for i in range(1, 6):
    file_name = f'gemini_exp{i}.txt'
    # Create (or empty) the file so a re-run of this cell starts from scratch.
    with open(file_name, 'w') as file:
        pass

    # Starts the timer
    start = time.time()

    # Prompt the questions in a batch of 1, you can adjust the question number in each batch
    for startIndex in range(1, len(question_dict) + 1, 1):
        # To use the zero-shot prompting instruction with the question
        prompt_str = qsql.get_no_explain_prompt_string(question_dict, startIndex, 1)
        # To use the CoT prompting instruction with the question
        # prompt_str = qsql.get_cot_prompt_string(question_dict, startIndex, 1)

        # Use the Gemini 1.5 Pro model and set temperature to 0.
        response = api.ask_gemini(prompt_str, 'gemini-1.5-pro', 0)

        # Append the prompt and its response. Appending leaves the results
        # already on disk untouched if a write is interrupted, and avoids
        # re-reading the whole file for every question.
        with open(file_name, 'a') as file:
            file.write(prompt_str + "\n" + response + "\n\n\n")

    # Stop the timer once every question of the round has been answered.
    end = time.time()
    length = end - start  # Calculate the time used for a round
    print("Round", i, "took", length, "seconds.")


## Connect to LLMs to Ask RD Exam Questions using RAG Prompting
### Step 1: Extract Chunks from pdfs
You first need to divide your PDF contents into chunks and save them to a CSV file (with the text in a `chunk` column), and then run the code below.

In [None]:
import pandas as pd

from embedding import TitanEmbeddings

# Load the knowledge dataframe (chunks)
file_path = 'xxxx.csv'
df_knowledge = pd.read_csv(file_path)

# Check if 'chunk_embedding' column exists; if not, create it
if 'chunk_embedding' not in df_knowledge.columns:
    df_knowledge['chunk_embedding'] = None
# A column that exists but is entirely empty is read back as float64, which
# cannot hold a whole embedding in one cell; object dtype can.
df_knowledge['chunk_embedding'] = df_knowledge['chunk_embedding'].astype(object)

# Initialize the embedding model
titan_embeddings_v2 = TitanEmbeddings(model_id="amazon.titan-embed-text-v2:0")

# Define parameters
dimensions = 1024
normalize = True

total_chunks = len(df_knowledge)

# Process each chunk
for i, row in df_knowledge.iterrows():
    # Skip rows whose embedding is already stored, so an interrupted run can
    # be resumed by simply re-running this cell.
    if pd.notna(row['chunk_embedding']):
        continue

    input_text = row['chunk']

    # Obtain the embedding of the chunk
    chunk_embedding = titan_embeddings_v2(input_text, dimensions, normalize)

    # Save the embedding to the dataframe
    df_knowledge.at[i, 'chunk_embedding'] = chunk_embedding

    # Save progress after processing each chunk
    df_knowledge.to_csv(file_path, index=False)
    # NOTE(review): `i + 1` is a position only while the frame keeps the
    # default 0..n-1 index that read_csv produces.
    print(f"Processed and saved chunk {i + 1}/{total_chunks}")

print("Done!")


### Step 2: Find Similar Chunks
Now that you have the chunks and their embeddings, you can use the code below to find the top 10 most similar chunks for each question.

In [None]:
import ast
import pandas as pd
import numpy as np

from questions_mysql import QuestionsMysql
from sklearn.metrics.pairwise import cosine_similarity
from embedding import TitanEmbeddings


def find_most_similar_chunks(
        query_embedding: list,
        df: pd.DataFrame,
        number_of_chunks: int = 10) -> np.ndarray:
    """
    Return the text of the chunks whose embeddings are closest to the query.

    Closeness is measured with cosine similarity; results are ordered from
    most to least similar.

    Args:
        query_embedding (list): The embedding of the question.
        df (pd.DataFrame): The dataframe including the chunks and their
            embeddings. Must have a 'chunk' column and a 'chunk_embedding'
            column. Each embedding may be a string holding a Python list
            literal (as read back from a CSV file) or an already-parsed
            sequence of numbers.
        number_of_chunks (int): Number of similar chunks to be selected.

    Returns:
        similar_chunks (np.ndarray): The array of N most similar chunks.
    """
    # Embeddings read back from CSV are strings such as "[0.1, 0.2, ...]";
    # parse those and pass already-parsed embeddings through unchanged.
    chunk_embeddings = np.array([
        ast.literal_eval(embedding) if isinstance(embedding, str) else embedding
        for embedding in df['chunk_embedding']
    ])

    # cosine_similarity works on 2-D inputs and returns a (1, n_chunks)
    # matrix for a single query; keep its only row.
    cosine_scores = cosine_similarity([query_embedding], chunk_embeddings)[0]

    # argsort is ascending, so reverse it to put the best match first, then
    # keep only the requested number of positions.
    ranked_positions = np.argsort(cosine_scores)[::-1]
    top_positions = ranked_positions[:number_of_chunks]
    return df['chunk'].iloc[top_positions].values


if __name__ == '__main__':
    # Connect to RD Exam Questions
    qsql = QuestionsMysql()
    question_dict = qsql.get_rd_questions()

    # Load knowledge dataframe (chunks)
    FILE_PATH = 'xxxx.csv'
    columns_to_read = ['chunk', 'chunk_embedding']
    df_knowledge = pd.read_csv(FILE_PATH, usecols=columns_to_read)

    # Parameters passed to the embedding model for every question
    dimensions = 1024
    normalize = True

    titan_embeddings_v2 = TitanEmbeddings(model_id="amazon.titan-embed-text-v2:0")

    # Column layout of the output file; naming it here keeps the header
    # present even when no question was processed.
    result_columns = ['question_id', 'top_10_similar_chunks']

    data = []
    for startIndex in range(1, len(question_dict) + 1, 1):
        # The text to embed is the question followed directly by its choices.
        question_entry = question_dict[startIndex]
        input_text = question_entry['question'] + question_entry['choices']

        # Obtain the embedding of the question using the embedding model
        query_embeddings = titan_embeddings_v2(input_text, dimensions, normalize)

        # Cosine similarity
        selected_chunks = find_most_similar_chunks(query_embeddings, df_knowledge)
        # Convert the selected chunks to a string to be added to the prompt
        selected_chunks_str = np.array2string(selected_chunks, separator=', ')
        data.append({'question_id': startIndex, 'top_10_similar_chunks': selected_chunks_str})
        print(f"Processed {startIndex}/{len(question_dict)}")

    # Build the result frame once from the collected records
    results_df = pd.DataFrame(data, columns=result_columns)

    # Save the DataFrame to a new CSV file
    new_file_path = 'new_xxxxx.csv'
    results_df.to_csv(new_file_path, index=False)
    print("Completed.")


### Step 3: Send the Similar Chunks together with the Questions to LLMs

#### Claude 3.5 - Sonnet with RAG

In [None]:
import pandas as pd
import anthropic_bedrock_api
import time

from questions_mysql import QuestionsMysql

    
qsql = QuestionsMysql()
# Connect to RD Exam Questions
question_dict = qsql.get_rd_questions()

api = anthropic_bedrock_api.AnthropicBedRockAPI()

# Get the extracted similar chunks, one entry per row of the file
FILE_PATH = 'xxxx.csv'
df = pd.read_csv(FILE_PATH)
similar_chunks_list = df['top_10_similar_chunks'].tolist()

# Chunks are looked up by position (question number - 1), so check the file
# covers every question before any API call is made.
if len(similar_chunks_list) < len(question_dict):
    raise ValueError(
        f"{FILE_PATH} has {len(similar_chunks_list)} rows but there are "
        f"{len(question_dict)} questions; regenerate the similar-chunks file."
    )

# To run 5 rounds
for i in range(1, 6):
    file_name = f'claude_3.5_sonnet_rag_exp{i}.txt'
    # Create (or empty) the file so a re-run of this cell starts from scratch.
    with open(file_name, 'w') as file:
        pass

    # Starts the timer
    start = time.time()
    for startIndex in range(1, len(question_dict) + 1, 1):
        selected_chunk_str = similar_chunks_list[startIndex - 1]

        prompt_str = qsql.get_rag_prompt_string(question_dict, startIndex, 1, selected_chunk_str)

        response = api.ask_claude(prompt_str, "anthropic.claude-3-5-sonnet-20240620-v1:0", 0)
        response += "\n"

        # Append the prompt and its response. Appending leaves the results
        # already on disk untouched if a write is interrupted, and avoids
        # re-reading the whole file for every question.
        with open(file_name, 'a') as file:
            file.write(prompt_str + "\n" + response + "\n\n\n")

    end = time.time()
    length = end - start  # Time used for the whole round
    print("Round", i, "took", length, "seconds.")


#### Gemini 1.5 Pro with RAG

In [None]:
import pandas as pd
import gemini_api
import time

from questions_mysql import QuestionsMysql

    
qsql = QuestionsMysql()
# Connect to RD Exam Questions
question_dict = qsql.get_rd_questions()

api = gemini_api.GeminiAIAPI()

# Get the extracted similar chunks, one entry per row of the file
FILE_PATH = 'xxxx.csv'
df = pd.read_csv(FILE_PATH)
similar_chunks_list = df['top_10_similar_chunks'].tolist()

# Chunks are looked up by position (question number - 1), so check the file
# covers every question before any API call is made.
if len(similar_chunks_list) < len(question_dict):
    raise ValueError(
        f"{FILE_PATH} has {len(similar_chunks_list)} rows but there are "
        f"{len(question_dict)} questions; regenerate the similar-chunks file."
    )

# To run 5 rounds
for i in range(1, 6):
    file_name = f'gemini_1.5_pro_rag_exp{i}.txt'
    # Create (or empty) the file so a re-run of this cell starts from scratch.
    with open(file_name, 'w') as file:
        pass

    # Starts the timer
    start = time.time()
    for startIndex in range(1, len(question_dict) + 1, 1):
        selected_chunk_str = similar_chunks_list[startIndex - 1]

        prompt_str = qsql.get_rag_prompt_string(question_dict, startIndex, 1, selected_chunk_str)

        response = api.ask_gemini(prompt_str, 'gemini-1.5-pro', 0)
        response += "\n"

        # Append the prompt and its response. Appending leaves the results
        # already on disk untouched if a write is interrupted, and avoids
        # re-reading the whole file for every question.
        with open(file_name, 'a') as file:
            file.write(prompt_str + "\n" + response + "\n\n\n")

    end = time.time()
    length = end - start  # Time used for the whole round
    print("Round", i, "took", length, "seconds.")

#### GPT 4o with RAG

In [None]:
import pandas as pd
import openai_api
import time

from questions_mysql import QuestionsMysql

    
qsql = QuestionsMysql()
# Connect to RD Exam Questions
question_dict = qsql.get_rd_questions()

api = openai_api.OpenAIAPI()

# Get the extracted similar chunks, one entry per row of the file
FILE_PATH = 'xxxx.csv'
df = pd.read_csv(FILE_PATH)
similar_chunks_list = df['top_10_similar_chunks'].tolist()

# Chunks are looked up by position (question number - 1), so check the file
# covers every question before any API call is made.
if len(similar_chunks_list) < len(question_dict):
    raise ValueError(
        f"{FILE_PATH} has {len(similar_chunks_list)} rows but there are "
        f"{len(question_dict)} questions; regenerate the similar-chunks file."
    )

# To run 5 rounds
for i in range(1, 6):
    file_name = f'gpt_4o_rag_exp{i}.txt'
    # Create (or empty) the file so a re-run of this cell starts from scratch.
    with open(file_name, 'w') as file:
        pass

    # Starts the timer
    start = time.time()
    for startIndex in range(1, len(question_dict) + 1, 1):
        selected_chunk_str = similar_chunks_list[startIndex - 1]

        # The full prompt is written to the output file below, so it is not
        # echoed to the cell output as well.
        prompt_str = qsql.get_rag_prompt_string(question_dict, startIndex, 1, selected_chunk_str)

        response = api.ask_chatgpt(prompt_str, "gpt-4o", 0)
        response += "\n"

        # Append the prompt and its response. Appending leaves the results
        # already on disk untouched if a write is interrupted, and avoids
        # re-reading the whole file for every question.
        with open(file_name, 'a') as file:
            file.write(prompt_str + "\n" + response + "\n\n\n")

    end = time.time()
    length = end - start  # Time used for the whole round
    print("Round", i, "took", length, "seconds.")


## Extract Results of the Questions from the Response
Some of the answers in the responses may not follow the XML tag format. The code below also prints the answers that do not match the expected XML tag format. 

In [None]:
import questions_mysql
import re

# Response file to score; the answer list and score are appended to it.
RESULT_FILE = 'filename.txt'
# Number of extracted answers required before the file is scored.
# NOTE(review): presumably the number of questions in the set - confirm.
EXPECTED_ANSWER_COUNT = 1050

# Get the correct answers
qsql = questions_mysql.QuestionsMysql()
question_dict = qsql.get_rd_questions()

# Get the LLM's response from the txt file
with open(RESULT_FILE, 'r') as file:
    content = file.read()

# Get the answer list and score
choices = qsql.get_answer_xml(content)
if len(choices) == EXPECTED_ANSWER_COUNT:
    answers = '\n'.join(choices)
    score = qsql.get_score_xml(content, question_dict)
    print(score)

    # Append the answer list and score after the existing responses; the
    # responses themselves are left untouched on disk.
    with open(RESULT_FILE, 'a') as file:
        file.write("Answer List: \n" + answers + "\n" + score)
else:
    # The answer count is off: list the tags that are not in the expected
    # form so they can be corrected by hand.

    # Any <answer>...</answer> pair with no nested tag inside
    all_tags_pattern = r"<answer>[^<]*</answer>"
    # Well-formed answer: a single letter a-d (either case) or NaN,
    # optionally followed by a period and free text
    specific_pattern = r"<answer>([a-dA-D]|NaN)(?:\.[^<]*)?</answer>"

    # Find all answer tags
    all_tags = re.findall(all_tags_pattern, content)

    # Keep only the tags that are not well-formed
    non_matching_tags = [tag for tag in all_tags if not re.match(specific_pattern, tag)]

    # Print non-matching tags, except empty ones
    for tag in non_matching_tags:
        if tag != "<answer></answer>":
            print(tag)

The LLM may not have provided an answer for some of the questions; use the code below to find the ones that are missing an answer. There should be two \<answer> and two \</answer> tags for each question. An answer is missing when the second \<answer> is absent, the second \</answer> is absent, or both are absent. Please adjust the code below as needed.

In [None]:
filename = 'xxxx'  # Replace with your file name
with open(filename, 'r') as file:
    content = file.read()

# Cut the text at every 'Instructions:' marker; the piece before the first
# marker is ignored below.
questions = content.split('Instructions:')

# Segments are numbered from 1. A segment is reported when it holds exactly
# one closing tag. To report segments with no closing tag at all instead,
# test `'</answer>' not in segment`.
missing_answers = [
    number
    for number, segment in enumerate(questions[1:], start=1)
    if segment.count('</answer>') == 1
]

if not missing_answers:
    print('All questions have answers.')
else:
    print(f'The following questions are missing answers: {missing_answers}')