In [1]:

import textwrap
import time

import pandas as pd
from langchain_chroma.vectorstores import Chroma
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from openevals.llm import create_llm_as_judge
from openevals.prompts import RETRIEVAL_HELPFULNESS_PROMPT




In [2]:
# Embedding model used for querying; it must be the same model the store was created with.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Open the Chroma vector store that already exists in the persist directory.
persist_directory = "../db/swim_rules_semantic"
vector_store = Chroma(
    embedding_function=embeddings,
    persist_directory=persist_directory,
)




## Helper Functions


In [3]:

# Function to perform semantic search
def semantic_search(query, k=10, score_threshold=0.4):
    """
    Retrieve chunks for a query from the Chroma store and grade them with the LLM judge.

    Args:
        query (str): The query string to search for.
        k (int): The maximum number of results to retrieve.
        score_threshold (float): Minimum relevance score passed to the vector
            store search. Defaults to 0.4, the value this notebook was run with.

    Returns:
        The result of ``chunk_evaluator`` for the query and its retrieved
        chunks -- not the documents themselves. The rest of the notebook reads
        the ``key``, ``score`` and ``comment`` entries from it.
    """
    results = vector_store.similarity_search_with_relevance_scores(
        query, k=k, score_threshold=score_threshold
    )

    # Each result is a (document, relevance_score) pair; the judge only needs
    # the text, labelled chunk_1, chunk_2, ... in retrieval order.
    chunks = [
        {"title": f"chunk_{position}", "content": document.page_content}
        for position, (document, _score) in enumerate(results, start=1)
    ]

    # The judge is called even when nothing passed the threshold (empty list),
    # so every query yields an evaluation record.
    relevance = chunk_evaluator(inputs=query, outputs=chunks)

    print(f"num results: {len(results)}, {[score for _document, score in results]}")

    return relevance




# LLM-as-judge evaluator built from the retrieval-helpfulness prompt; semantic_search
# calls it with the query as `inputs` and the retrieved chunks as `outputs`.
chunk_evaluator = create_llm_as_judge(
    model="openai:gpt-4o",
    feedback_key="retrieval_helpfulness",
    prompt=RETRIEVAL_HELPFULNESS_PROMPT,
)




## Test chunk retrieval


In [4]:
%%time
# Test scenarios: swim-officiating situations, each passed verbatim to semantic_search.
# Several are deliberate near-duplicates that differ in a single detail (stroke,
# facing direction, wording), which makes it possible to compare retrieval results
# across small phrasing changes.
# NOTE: the strings are the actual search inputs, so spelling and spacing are left
# exactly as they were when the recorded results were produced.
queries = [
    "Prior to the start signal the swimmer is seen rocking back and forth.  Is this a disqualification?",
    "upon the command 'take your mark', the swimmer takes their starting position and just prior to the start signal, the swimmer is seen moving forward.",
    "What happens if a swimmer touches the wall with one hand in freestyle?",
    "what happens if a swimmer touches the wall with one hand in breaststroke?",
    "A swimmer is wearing a wetsuit in a swimming competition?",
    "During a freestyle event, a swimmer does a flip turn and touches the wall with only one foot.",
    "A breaststroke swimmer moves their hands in a sculling or flipper movement at the end of the first arm stroke, both after the start and after the turn Should they be disqualified?",
    "In a 9-10 100 yard breaststroke event, a swimmer completes 50 yards with a simulatenous two-hand touch and, thinking that the race is over, pushes back from the wall to read the scoreboard. At this point, realizing that the race is only halfway over, the swimmer returns to the wall, pushes off on the breast, and completes the required distance in good form. What call, if any, should be made?",
    "In a 9-10 100 yard breaststroke event, a swimmer completes 50 yards with a simulatenous two-hand touch and, thinking that the race is over, pushes back from the wall on their back to read the scoreboard. At this point, realizing that the race is only halfway over, the swimmer returns to the wall, pushes off on the breast, and completes the required distance in good form. What call, if any, should be made?",
    "A swimmer swims the breaststroke in a way that their hands are completely underwater when they are pushed forward together from their breast.",
    "A swimmer swims the breaststroke in a way that their elbows are above the water line during the forward movement of the arms.",
    "A breaststroker’s head breaks the surface of the water during each cycle, however, the swimmer does not take a breath even if the head breaks the surface.",
    "During a freestyle event, a swimmer starts in the water such that the swimmwer is facing the starting end of the pool with one hand on the wall.",
    "During a freestyle event, a swimmer starts in the water such that the swimmer is facing the other end of the pool with one hand on the wall.",
    "During a butterfly event, a swimmer starts in the water such that the swimmer is facing the other end of the pool with one hand on the wall.",
    "During a butterfly event, a swimmer starts in the water such that the swimmer has one hand on the wall looking away from the starting block.",
    "During a butterfly event, a swimmer starts in the water such that the swimmer is looking at the opposite end of the pool.",
    "In a 200m individual medley event, a swimmer swims the first 50m in butterfly, the second 50m in backstroke, the third 50m starts swimming freestyle but then switches to breaststroke, and the last 50m in freestyle.",
    "In a 100y freestyle event, the swimmer swims the first 50y in butterfly and the last 50y in backstroke.",
]

# Run every query through retrieval + LLM grading and record how long each call takes.
relevance_list = []
for i, query in enumerate(queries):
    print(f"\nQuery: {query}")
    # perf_counter() is a monotonic, high-resolution clock intended for measuring
    # intervals; time.time() follows the wall clock and can jump if it is adjusted.
    start_time = time.perf_counter()
    relevance = semantic_search(query)
    end_time = time.perf_counter()

    # Annotate the evaluator result in place so each record carries its own
    # query text and elapsed seconds.
    relevance["query"] = query
    relevance["evaluation_time"] = end_time - start_time
    relevance_list.append(relevance)

    print(f"completed {i+1} out of {len(queries)} queries")





Query: Prior to the start signal the swimmer is seen rocking back and forth.  Is this a disqualification?
num results: 2, [0.43222538948755584, 0.42851414874192906]
completed 1 out of 19 queries

Query: upon the command 'take your mark', the swimmer takes their starting position and just prior to the start signal, the swimmer is seen moving forward.
num results: 2, [0.4263217539760894, 0.41741996635412393]
completed 2 out of 19 queries

Query: What happens if a swimmer touches the wall with one hand in freestyle?
num results: 3, [0.5160830061917044, 0.4516143727954164, 0.41716792209485676]
completed 3 out of 19 queries

Query: what happens if a swimmer touches the wall with one hand in breaststroke?


No relevant docs were retrieved using the relevance score threshold 0.4


num results: 3, [0.513873848657147, 0.46893478877836237, 0.42539928590273146]
completed 4 out of 19 queries

Query: A swimmer is wearing a wetsuit in a swimming competition?
num results: 0, []
completed 5 out of 19 queries

Query: During a freestyle event, a swimmer does a flip turn and touches the wall with only one foot.
num results: 1, [0.4583698804089851]
completed 6 out of 19 queries

Query: A breaststroke swimmer moves their hands in a sculling or flipper movement at the end of the first arm stroke, both after the start and after the turn Should they be disqualified?
num results: 2, [0.5278120395398984, 0.4294021828400536]
completed 7 out of 19 queries

Query: In a 9-10 100 yard breaststroke event, a swimmer completes 50 yards with a simulatenous two-hand touch and, thinking that the race is over, pushes back from the wall to read the scoreboard. At this point, realizing that the race is only halfway over, the swimmer returns to the wall, pushes off on the breast, and completes t

No relevant docs were retrieved using the relevance score threshold 0.4


num results: 0, []
completed 11 out of 19 queries

Query: A breaststroker’s head breaks the surface of the water during each cycle, however, the swimmer does not take a breath even if the head breaks the surface.


No relevant docs were retrieved using the relevance score threshold 0.4


num results: 0, []
completed 12 out of 19 queries

Query: During a freestyle event, a swimmer starts in the water such that the swimmwer is facing the starting end of the pool with one hand on the wall.
num results: 5, [0.653508654706495, 0.5076304425989419, 0.47857688255750197, 0.4035777129393675, 0.4009182467983301]
completed 13 out of 19 queries

Query: During a freestyle event, a swimmer starts in the water such that the swimmer is facing the other end of the pool with one hand on the wall.
num results: 3, [0.613200174372203, 0.49437513230172014, 0.44327625356076283]
completed 14 out of 19 queries

Query: During a butterfly event, a swimmer starts in the water such that the swimmer is facing the other end of the pool with one hand on the wall.
num results: 3, [0.6043454764719436, 0.4549280844658885, 0.4409715831109483]
completed 15 out of 19 queries

Query: During a butterfly event, a swimmer starts in the water such that the swimmer has one hand on the wall looking away from the s

In [5]:
# One row per evaluated query, restricted to the columns of interest in display order.
relevance_df = pd.DataFrame(relevance_list)[
    ["key", "score", "evaluation_time", "query", "comment"]
]
relevance_df

Unnamed: 0,key,score,evaluation_time,query,comment
0,retrieval_helpfulness,False,8.645577,Prior to the start signal the swimmer is seen ...,"Upon reading and analyzing the input, the esse..."
1,retrieval_helpfulness,True,6.949613,"upon the command 'take your mark', the swimmer...",To address the input query regarding the swimm...
2,retrieval_helpfulness,True,7.267491,What happens if a swimmer touches the wall wit...,The input question is focused on the rules reg...
3,retrieval_helpfulness,True,4.609283,what happens if a swimmer touches the wall wit...,The input asks about the consequence if a swim...
4,retrieval_helpfulness,False,7.681885,A swimmer is wearing a wetsuit in a swimming c...,The input seeks information about a swimmer we...
5,retrieval_helpfulness,True,6.691034,"During a freestyle event, a swimmer does a fli...",The input presents a scenario from a swimming ...
6,retrieval_helpfulness,False,4.024399,A breaststroke swimmer moves their hands in a ...,The input question is about whether a breastst...
7,retrieval_helpfulness,True,8.296239,"In a 9-10 100 yard breaststroke event, a swimm...","In the input scenario, there is a 9-10 100 yar..."
8,retrieval_helpfulness,True,17.292914,"In a 9-10 100 yard breaststroke event, a swimm...","In evaluating the outputs against the rubric, ..."
9,retrieval_helpfulness,False,6.675392,A swimmer swims the breaststroke in a way that...,The retrieved output provides details on compe...


In [6]:
# Total seconds spent in semantic_search (retrieval + LLM grading) across all queries.
relevance_df["evaluation_time"].sum()

139.90287446975708

In [7]:
# Summary statistics for relevance_df; in the recorded output only
# evaluation_time (seconds per query) is summarised.
relevance_df.describe()

Unnamed: 0,evaluation_time
count,19.0
mean,7.363309
std,2.772613
min,4.024399
25%,6.356661
50%,7.056856
75%,7.982124
max,17.292914


In [8]:
# How many queries the judge scored True vs. False.
relevance_df["score"].value_counts()

score
True     11
False     8
Name: count, dtype: int64

In [9]:
def _wrap(text, width=90):
    """Wrap ``text`` to ``width`` columns without replacing its existing whitespace."""
    # replace_whitespace=False stops textwrap from replacing newlines and tabs in
    # the text with single spaces before wrapping.
    return textwrap.fill(text, width=width, replace_whitespace=False)


# Print a readable per-query report: the query, its evaluation time, and the
# judge's verdict followed by its full comment.
for relevance in relevance_list:
    print(f"\nQuery: {_wrap(relevance['query'])}")
    print(f">>>Evaluation Time: {relevance['evaluation_time']:.2f} seconds")
    print(f">>>Relevance: {relevance['key']} {relevance['score']},\n{_wrap(relevance['comment'])}")



Query: Prior to the start signal the swimmer is seen rocking back and forth.  Is this a
disqualification?
>>>Evaluation Time: 8.65 seconds
>>>Relevance: retrieval_helpfulness False,
Upon reading and analyzing the input, the essential question posed is whether rocking back
and forth before the start signal, in a swimming event, leads to disqualification. For
relevance evaluation, one should look into the rules regarding swimmer actions before the
start and potential misconduct or false start scenarios. 

1. **Relevant Information:**
- From 'chunk_1': It is mentioned that swimmers need to assume their starting position
promptly, and a false start may lead to disqualification. However, it does not explicitly
address whether pre-signal movement is seen as a legal infraction or grounds for
disqualification. Also, the text suggests that an illegal starting position itself is not
grounds for disqualification if the race proceeds. 
    - From 'chunk_2': Discusses
potential penalties for delib