# Chatbot Test Notebook

This notebook is designed to test and experiment with the functionality of the chatbot implemented in the `rag_swimrules` project. It showcases various scenarios and evaluates the performance of the chatbot.

In [1]:
import textwrap
import json
import pandas as pd
import time
import random
import pickle

from langchain.prompts import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

from langchain_chroma.vectorstores import Chroma
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

from openevals.prompts import CORRECTNESS_PROMPT
from openevals.llm import create_llm_as_judge


In [2]:
RANDOM_SAMPLE_SIZE = None
VECTORDB_SUFFIX = "_semantic"   # "_techrules"  # "_semantic"

In [3]:


swimming_rules_template = """You are an expert swimming rules assistant trained on FINA and competitive swimming regulations. 
Your answers must be precise, factual, and based only on the provided rules. Follow these guidelines:

1. Answer ONLY using the verified rules provided below
2. DO NOT use information from any other sources, including the internet or personal knowledge.
3. if an action is not explicitly prohibited in the rules, assume it is allowed.
4. if an action is not explicity mentioned, assume it is allowed.
5. if an action is explicitly stated in the rule, follow what the rule says.
6. A disqualification must be based on a specific rule violation and cannot be implied.
7. If the information isn't in the rules, say "I don't have that rule in my current documentation"
8. Keep answers concise but complete
9. Reference specific rule numbers when possible

Relevant swimming rules:
{context}

Question: {question}

Answer:"""

SWIMMING_RULES_PROMPT = PromptTemplate(
    template=swimming_rules_template,
    input_variables=["context", "question"]
)


In [4]:
# reaad situations from json file
with open("../raw_data/swimming_situations2.json") as f:
    swimming_situations = json.load(f)

print(f"# of situations: {len(swimming_situations)}")

# of situations: 56


In [5]:
# Initialize embeddings model (ensure the same model used during creation)
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# retretieve the relevant rules from your database or knowledge base
# Load the existing Chroma vector store
persist_directory = "../db/swim_rules" + VECTORDB_SUFFIX
vector_store = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

# Create a retriever from the vector store
retriever = vector_store.as_retriever(search_kwargs={"k": 10})



In [6]:
# Initialize LLM
llm = ChatOpenAI(
    model="gpt-4o",
    temperature=0,
    max_tokens=1000
)


In [7]:
print(f"Swimming situations: {len(swimming_situations)}")
for situation in swimming_situations[:5]:
    print(f"Situation: {situation}")

Swimming situations: 56
Situation: {'situation_id': 9, 'prompt': 'A breaststroke swimmer moves their hands in a sculling or flipper movement at the end of the first arm stroke, both after the start and after the turn. Should they be disqualified?', 'target': 'No, the swimmer should not be disqualified. This is legal provided, "All movements of the arms shall be simultaneous without alternating movement." A sculling motion of the hands, only as a part of the arm stroke, is not considered the beginning of a new stroke.', 'rule': '101.2.2'}
Situation: {'situation_id': 10, 'prompt': 'In a 9-10 100 yard breaststroke event, a swimmer completes 50 yards and, thinking that the race is over, pushes back from the wall to read the scoreboard. At this point, realizing that the race is only halfway over, the swimmer returns to the wall, pushes off on the breast, and completes the required distance in good form. What call, if any, should be made?', 'target': 'The swimmer left the wall on the back an

In [8]:
# setup evaluator
response_evaluator = create_llm_as_judge(
    prompt=CORRECTNESS_PROMPT,
    model="openai:gpt-4o",
    feedback_key="correctness"
)

In [9]:


if RANDOM_SAMPLE_SIZE:
    # Set random seed for reproducibility
    random.seed(42)

    # Take a random sample of swimming situations
    sampled_situations = random.sample(swimming_situations, RANDOM_SAMPLE_SIZE)
else:
    # Use the entire dataset if no sample size is specified
    sampled_situations = swimming_situations

test_results = []
for situation in sampled_situations:
    query = situation["prompt"]
    test_case = {"question": query, "situation": situation["situation_id"]}
    print()
    print(textwrap.fill(f"\n>>>SITUATION: {situation['situation_id']} QUERY: {query}", width=80))
    
    # Use the retriever to get relevant documents
    docs = retriever.get_relevant_documents(query)

    # Combine the retrieved documents into a context string
    context = "\n".join([doc.page_content for doc in docs])

    # Create the prompt template
    prompt = SWIMMING_RULES_PROMPT.format(context=context, question=query)

    start_time = time.time()
    response = llm(prompt)
    test_case["query_time"] = time.time() - start_time

    start_time = time.time()
    correctness_score = response_evaluator(inputs=query, outputs=response.content, reference_outputs=situation["target"])
    test_case["response_eval_time"] = time.time() - start_time

    test_case.update({
        "llm_response": response.content,
        "correctness_score": correctness_score["score"],
        "feedback": correctness_score["comment"],
        "target": situation["target"]
    })

    # Word wrap the response content if the line is longer than 80 characters
    wrapped_response = textwrap.fill(response.content, width=80)
    print(f">>>LLM RESPONSE: {wrapped_response}")
    print(f">>>TARGET: {textwrap.fill(situation['target'], width=80)}")

    print(f">>>CORRECTNESS SCORE: {correctness_score['key']}  {correctness_score['score']}")
    print(f"FEEDBACK: {textwrap.fill(correctness_score['comment'], width=80)}")

    test_results.append(test_case)



 >>>SITUATION: 9 QUERY: A breaststroke swimmer moves their hands in a sculling
or flipper movement at the end of the first arm stroke, both after the start and
after the turn. Should they be disqualified?


  docs = retriever.get_relevant_documents(query)
  response = llm(prompt)


>>>LLM RESPONSE: No, the swimmer should not be disqualified. The rules for breaststroke do not
explicitly prohibit a sculling or flipper movement at the end of the first arm
stroke after the start and after the turn. Therefore, according to the
guidelines, if an action is not explicitly prohibited, it is allowed.
>>>TARGET: No, the swimmer should not be disqualified. This is legal provided, "All
movements of the arms shall be simultaneous without alternating movement." A
sculling motion of the hands, only as a part of the arm stroke, is not
considered the beginning of a new stroke.
>>>CORRECTNESS SCORE: correctness  False
FEEDBACK: Upon evaluating the content of the output against the rubric, several criteria
were considered:  1. **Accuracy and Completeness**:     - The response correctly
states that the swimmer should not be disqualified based on the movement
described.     - However, the explanation for why this movement does not result
in disqualification is incomplete. While it men

In [10]:
test_results_df = pd.DataFrame(test_results)
test_results_df

Unnamed: 0,question,situation,query_time,response_eval_time,llm_response,correctness_score,feedback,target
0,A breaststroke swimmer moves their hands in a ...,9,1.083981,4.614656,"No, the swimmer should not be disqualified. Th...",False,Upon evaluating the content of the output agai...,"No, the swimmer should not be disqualified. Th..."
1,"In a 9-10 100 yard breaststroke event, a swimm...",10,0.666053,4.198321,I don't have that rule in my current documenta...,False,The evaluated model output returns an incomple...,The swimmer left the wall on the back and then...
2,"In a 9-10 100 yard breaststroke event, a swimm...",11,0.608344,2.967194,I don't have that rule in my current documenta...,False,The model output fails to provide accurate and...,"No call should be made. The rule states, “It i..."
3,"In a senior 100 yard breaststroke event, the t...",12,1.527979,3.483755,"Yes, the swimmer should be disqualified. Accor...",False,The output provided in the evaluation does not...,The swimmer should not be disqualified. The ru...
4,When a swimmer left the wall at the first turn...,13,1.679277,2.049041,"Yes, the swimmer should have been disqualified...",True,The output accurately addresses the question o...,"Yes. The rule states, “After the start and aft..."
5,"After a turn in the 100 yard breaststroke, a s...",14,1.222312,2.971064,"Yes, the swimmer should be disqualified. Accor...",False,The output provided incorrectly interprets the...,No. The rule states “All movements of the arms...
6,A swimmer swims the breaststroke in a way that...,15,1.120795,2.965431,"Yes, the coach is correct. According to rule 1...",False,"The output references rule 101.2.2, asserting ...","No, the coach is not. The rule states, “The ha..."
7,To get further distance out of their breaststr...,16,0.81111,3.277771,"No, this is not legal. According to rule 101.2...",True,The output provides a clear and complete respo...,"It is not legal. The rule states, “The hands s..."
8,A swimmer in the 100 yard butterfly moves too ...,37,1.281072,2.969362,"No, this is not a valid disqualification. Acco...",False,The model output indicates that the disqualifi...,"This is a correct call. The rule states, “Both..."
9,"Near the finish of a butterfly race, a swimmer...",38,1.55395,2.125873,"Yes, this is legal. According to rule 101.3.5,...",False,The submitted output claims that the swimmer's...,"No, this is not legal. The swimmer executed an..."


In [11]:
# save the dataframe to a pickle file
with open(f"../results/test_results_df{VECTORDB_SUFFIX}.pkl", "wb") as f:
    pickle.dump(test_results_df, f)