# Chatbot Test Notebook

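This notebook tests the retrieval-augmented chatbot from the `rag_swimrules` project. It draws a sample of officiating situations, retrieves the relevant rules from the Chroma vector store, asks the LLM to rule on each situation, grades every answer against the reference answer with an LLM-as-judge evaluator, and saves the results for later analysis.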

In [12]:
import textwrap
import json
import pandas as pd
import time
import matplotlib.pyplot as plt
import random
import pickle

from langchain.prompts import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

from langchain_chroma.vectorstores import Chroma
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

from openevals.prompts import CORRECTNESS_PROMPT
from openevals.llm import create_llm_as_judge


In [13]:
# Number of situations drawn at random for a test run; a falsy value
# (0 or None) makes the evaluation cell use the whole dataset instead.
RANDOM_SAMPLE_SIZE = 20
# Suffix appended to the vector-store directory and to the results file name.
# Values used so far: "_techrules", "_semantic".
VECTORDB_SUFFIX = "_techrules"   # "_techrules"  # "_semantic"

In [14]:


# Prompt sent to the answering LLM. It carries two placeholders:
#   {context}  - the retrieved rule text
#   {question} - the situation being asked about
# The wording is runtime text: any edit here changes what the model is told.
swimming_rules_template = """You are an expert swimming rules assistant trained on FINA and competitive swimming regulations. 
Your answers must be precise, factual, and based only on the provided rules. Follow these guidelines:

1. Answer ONLY using the verified rules provided below
2. DO NOT use information from any other sources, including the internet or personal knowledge.
3. if an action is not explicitly prohibited in the rules, assume it is allowed.
4. if an action is not explicity mentioned, assume it is allowed.
5. if an action is explicitly stated in the rule, follow what the rule says.
6. A disqualification must be based on a specific rule violation and cannot be implied.
7. If the information isn't in the rules, say "I don't have that rule in my current documentation"
8. Keep answers concise but complete
9. Reference specific rule numbers when possible

Relevant swimming rules:
{context}

Question: {question}

Answer:"""

# Template object used by the evaluation cell via .format(context=..., question=...).
SWIMMING_RULES_PROMPT = PromptTemplate(
    template=swimming_rules_template,
    input_variables=["context", "question"]
)


In [15]:
# Read the test situations from the JSON file.
# The encoding is pinned to UTF-8 because the reference answers contain
# typographic quotes; without it, decoding depends on the platform default.
with open("../raw_data/swimming_situations2.json", encoding="utf-8") as f:
    swimming_situations = json.load(f)

print(f"# of situations: {len(swimming_situations)}")

# of situations: 56


In [16]:
# Embedding model for queries; it has to be the same model that was used when
# the vector store was created.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Open the persisted Chroma store for the configured rule-set variant.
persist_directory = f"../db/swim_rules{VECTORDB_SUFFIX}"
vector_store = Chroma(
    embedding_function=embeddings,
    persist_directory=persist_directory,
)

# Retriever over the store, configured with k=10 documents per query.
retriever = vector_store.as_retriever(search_kwargs=dict(k=10))



In [17]:
# Chat model that answers each situation from the retrieved rules.
llm = ChatOpenAI(
    max_tokens=1000,
    temperature=0,
    model="gpt-4o",
)


In [18]:
# Show the dataset size, then the first five raw records as a sanity check.
print(f"Swimming situations: {len(swimming_situations)}")
for preview in swimming_situations[:5]:
    print(f"Situation: {preview}")

Swimming situations: 56
Situation: {'situation_id': 9, 'prompt': 'A breaststroke swimmer moves their hands in a sculling or flipper movement at the end of the first arm stroke, both after the start and after the turn. Should they be disqualified?', 'target': 'No, the swimmer should not be disqualified. This is legal provided, "All movements of the arms shall be simultaneous without alternating movement." A sculling motion of the hands, only as a part of the arm stroke, is not considered the beginning of a new stroke.', 'rule': '101.2.2'}
Situation: {'situation_id': 10, 'prompt': 'In a 9-10 100 yard breaststroke event, a swimmer completes 50 yards and, thinking that the race is over, pushes back from the wall to read the scoreboard. At this point, realizing that the race is only halfway over, the swimmer returns to the wall, pushes off on the breast, and completes the required distance in good form. What call, if any, should be made?', 'target': 'The swimmer left the wall on the back an

In [19]:
# LLM-as-judge evaluator: grades a response against a reference answer using
# the openevals correctness prompt and reports under the "correctness" key.
response_evaluator = create_llm_as_judge(
    model="openai:gpt-4o",
    prompt=CORRECTNESS_PROMPT,
    feedback_key="correctness",
)

In [20]:


# Run the chatbot over the selected situations and grade every answer.
#
# Reads:  swimming_situations, RANDOM_SAMPLE_SIZE, retriever, llm,
#         SWIMMING_RULES_PROMPT, response_evaluator
# Writes: sampled_situations - the situations that were evaluated
#         test_results       - one dict per situation with the question, the
#                              timings, the model answer, the judge's score and
#                              comment, and the reference answer
if RANDOM_SAMPLE_SIZE:
    # Seed the RNG so every run draws the same situations.
    random.seed(42)

    # Clamp the request: random.sample raises ValueError when asked for more
    # items than the dataset contains.
    sample_size = min(RANDOM_SAMPLE_SIZE, len(swimming_situations))
    sampled_situations = random.sample(swimming_situations, sample_size)
else:
    # No sample size configured: evaluate the entire dataset.
    sampled_situations = swimming_situations

test_results = []
for situation in sampled_situations:
    query = situation["prompt"]
    test_case = {"question": query, "situation": situation["situation_id"]}
    print()
    # No "\n" inside fill(): textwrap turns it into a stray leading space.
    print(textwrap.fill(f">>>SITUATION: {situation['situation_id']} QUERY: {query}", width=80))

    # Fetch the rules relevant to this situation. invoke() replaces the
    # deprecated get_relevant_documents().
    docs = retriever.invoke(query)

    # Merge the retrieved rule texts into a single context string.
    context = "\n".join(doc.page_content for doc in docs)

    # Fill the prompt template with the rules and the question.
    prompt = SWIMMING_RULES_PROMPT.format(context=context, question=query)

    # perf_counter() is monotonic, so the interval cannot be skewed by
    # wall-clock adjustments. invoke() replaces the deprecated llm(prompt).
    start_time = time.perf_counter()
    response = llm.invoke(prompt)
    test_case["query_time"] = time.perf_counter() - start_time

    # Grade the answer against the reference answer for this situation.
    start_time = time.perf_counter()
    correctness_score = response_evaluator(inputs=query, outputs=response.content, reference_outputs=situation["target"])
    test_case["response_eval_time"] = time.perf_counter() - start_time

    test_case.update({
        "llm_response": response.content,
        "correctness_score": correctness_score["score"],
        "feedback": correctness_score["comment"],
        "target": situation["target"]
    })

    # Wrap long text at 80 columns so the cell output stays readable.
    wrapped_response = textwrap.fill(response.content, width=80)
    print(f">>>LLM RESPONSE: {wrapped_response}")
    print(f">>>TARGET: {textwrap.fill(situation['target'], width=80)}")

    print(f">>>CORRECTNESS SCORE: {correctness_score['key']}  {correctness_score['score']}")
    print(f"FEEDBACK: {textwrap.fill(correctness_score['comment'], width=80)}")

    test_results.append(test_case)



 >>>SITUATION: 109 QUERY: A swimmer begins the freestyle leg of the individual
medley by taking several butterfly kicks. The swimmer is disqualified for
swimming more than one quarter of the race in the style of butterfly. Is this a
correct disqualification?
>>>LLM RESPONSE: No, this is not a correct disqualification. According to rule 101.6.2, in the
freestyle leg of an individual medley, the swimmer may swim any style other than
butterfly, breaststroke, or backstroke. The rules do not specify that taking
several butterfly kicks constitutes swimming in the style of butterfly for more
than one quarter of the race. Therefore, the disqualification is not supported
by the provided rules.
>>>TARGET: No, this is not a correct disqualification. The butterfly kick is only one
element of butterfly; it is not the defining characteristic. A swimmer would
need to use both a butterfly kick, as well as a simultaneous double arm pull, in
which the arms recover over the surface of the water, in orde

In [21]:
# One row per evaluated situation, one column per key of the result dicts.
test_results_df = pd.DataFrame(data=test_results)
test_results_df

Unnamed: 0,question,situation,query_time,response_eval_time,llm_response,correctness_score,feedback,target
0,A swimmer begins the freestyle leg of the indi...,109,1.313711,3.084885,"No, this is not a correct disqualification. Ac...",True,The output claims that the disqualification of...,"No, this is not a correct disqualification. Th..."
1,To get further distance out of their breaststr...,16,1.016227,1.641489,"No, this is not legal. According to rule 101.2...",True,The output correctly addresses the legality of...,"It is not legal. The rule states, “The hands s..."
2,"In a 9-10 100 yard breaststroke event, a swimm...",10,1.876201,3.751092,"Based on the provided rules, there is no speci...",False,Upon reviewing the output:\n\n1. **Factual Acc...,The swimmer left the wall on the back and then...
3,A medley relay team would like to swim the rel...,126,1.09788,2.35796,"No, this is not legal. According to rule 101.7...",True,"Upon evaluation of the input and output, here ...","No, this is not legal. The rules require that ..."
4,A swimmer dives in to the pool to start the 10...,88,1.323515,1.844905,"Yes, the swimmer should be disqualified. Accor...",True,The output provided is in line with the rules ...,Yes. The rule requires the head to break the s...
5,A swimmer wants to swim in the style of backst...,86,1.43017,1.8448,"Yes, it is legal. According to rule 101.5.2, i...",False,The model output states that it is legal for a...,"No. The rule states, “The forward start shall ..."
6,"In the 200 yard butterfly, a swimmer approachi...",43,1.755013,2.66362,"Yes, the deck referee should accept the call. ...",True,The output correctly identifies the rule regar...,As long as the official can clearly see that t...
7,A swimmer in the 100 yard butterfly moves too ...,37,1.122136,2.973142,"Yes, this is a valid disqualification. Accordi...",True,The output aligns well with the referenced out...,"This is a correct call. The rule states, “Both..."
8,During the freestyle leg of the 200 yard medle...,131,1.022486,2.173128,"Yes, the relay team should be disqualified. Ac...",True,The output provided states that the relay team...,"Yes, the relay team should be disqualified. Th..."
9,A swimmer swims the breaststroke in a way that...,15,0.918803,3.791853,"Yes, the coach is correct. According to rule 1...",False,"In evaluating the given output, it is essentia...","No, the coach is not. The rule states, “The ha..."


In [22]:
# Persist the results table as a pickle; the file name carries the
# vector-store suffix so runs against different stores do not overwrite each other.
results_path = f"../results/test_results_df{VECTORDB_SUFFIX}.pkl"
with open(results_path, "wb") as results_file:
    pickle.dump(test_results_df, results_file)