In [4]:
import os
from openai import OpenAI
from langsmith.wrappers import wrap_openai
from langsmith import traceable, evaluate
from langsmith.schemas import Example, Run
from dotenv import load_dotenv
import weaviate
from weaviate.classes.init import Auth
from weaviate.classes.query import MetadataQuery

# Pull WCD_URL, WCD_API_KEY and OPENAI_API_KEY into the environment from a
# local .env file (if one exists) before they are read below.
load_dotenv()

# Weaviate Cloud credentials; a missing variable raises KeyError immediately.
wcd_url = os.environ["WCD_URL"]
wcd_api_key = os.environ["WCD_API_KEY"]

# `client` is the Weaviate client used for vector search. OpenAI calls go
# through `openai_client` below; the two must not share a name.
# NOTE(review): the connection is never closed in this script; call
# client.close() once the evaluation has finished.
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=wcd_url,
    auth_credentials=Auth.api_key(wcd_api_key),
)

# OpenAI client passed through LangSmith's wrap_openai wrapper.
openai_client = wrap_openai(OpenAI(api_key=os.getenv('OPENAI_API_KEY')))

# Weaviate collection queried for context passages.
COLLECTION_NAME = "James"
# Chat model that generates the RAG answer.
GPT_MODEL_NAME = "gpt-4o"
# Embedding model applied to the user query before the vector search.
EMBEDDING_MODEL_NAME = "text-embedding-3-large"
# LangSmith dataset evaluated, and the prefix for the experiment name.
DATASET_NAME = "rag_evaluation_dataset"
EXPERIMENT_PREFIX = "rag_evaluation_experiment"

@traceable
def answer_with_rag(inputs: dict) -> dict:
    """Answer the latest user message using passages retrieved from Weaviate.

    Args:
        inputs: Dictionary with a "messages" list; the "content" of the last
            message is used as the question.

    Returns:
        Dictionary of the form
        {"message": {"role": "assistant", "content": <answer text>}}.
    """
    query = inputs["messages"][-1]["content"]

    # Embed the question so it can be compared against the stored vectors
    embedding_response = openai_client.embeddings.create(
        model=EMBEDDING_MODEL_NAME,
        input=query
    )
    query_embedding = embedding_response.data[0].embedding

    # Retrieve the three nearest passages from the Weaviate collection
    collection = client.collections.get(COLLECTION_NAME)
    similar_texts = collection.query.near_vector(
        near_vector=query_embedding,
        limit=3,
        return_properties=["text"],
        return_metadata=MetadataQuery(distance=True)
    )

    # Concatenate the passages, separated by a visible divider
    context_str = "\n\n---\n\n".join(
        doc.properties["text"] for doc in similar_texts.objects
    )

    # The stray closing quote that used to follow "context." has been removed
    # so that it is no longer sent to the model as part of the instructions.
    prompt = f"""Answer the question using ONLY the information provided in the context below.
    Do not add any general knowledge or information not contained in the context.

    Context:
    {context_str}

    Question: {query}

    Answer:"""

    # Generate the answer with the configured chat model; temperature 0 keeps
    # sampling as deterministic as the API allows.
    completion = openai_client.chat.completions.create(
        model=GPT_MODEL_NAME,
        messages=[
            {"role": "system", "content": "You are a helpful assistant that answers questions based on the provided context."},
            {"role": "user", "content": prompt}
        ],
        temperature=0
    )

    return {
        "message": {"role": "assistant", "content": completion.choices[0].message.content}
    }


def correctness_evaluator(run: Run, example: Example) -> dict:
    """
    Grades a generated answer against the reference answer using an LLM judge.

    The judge is asked for a rubric score of 0, 1 or 2, which is normalized
    to the 0-1 range.

    Args:
        run: Contains the run information including inputs and outputs
        example: Contains the reference example; its
            outputs["message"]["content"] is used as the reference answer

    Returns:
        Dictionary with "key", "score" (0-1) and "explanation". The score is
        0 when the judge's reply is not an integer or is outside 0-2.
    """
    # The user's question: content of the last message in the run inputs.
    # Presumably nested under "inputs" because that is the name of the
    # target function's parameter - verify against the traced run.
    query = run.inputs["inputs"]["messages"][-1]["content"]

    # The answer produced by the target function
    answer = run.outputs["message"]["content"]

    # "first response" is the reference answer, "second response" the generated one
    evaluation_prompt = f"""
    Given a query by the user and two responses, evaluate whether the two responses are basically equivalent and whether the second response satisfactorily answers the query.

    Use the following scoring rubric:
    2 = The two responses are equivalent and the second response satisfactorily answers the query.
    1 = The two responses are not equivalent but the second response does answer the query.
    0 = The two responses are not equivalent and the second response does not answer the query.
    
    Return only the number (0-2).

    The query is: {query}

    The first response is: {example.outputs["message"]["content"]}

    The second response is: {answer}
    """

    response = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": "You are a dialogue evaluation assistant. Respond only with a number 0-2.",
            },
            {"role": "user", "content": evaluation_prompt},
        ],
        temperature=0,
    )

    raw_reply = response.choices[0].message.content
    try:
        # The content may be None; treat that like any other unparseable reply
        # instead of letting .strip() raise AttributeError.
        score = int((raw_reply or "").strip())
    except ValueError:
        return {
            "key": "correctness score",
            "score": 0,
            "explanation": "Failed to parse score",
        }

    # Reject numbers outside the rubric so the normalized score stays in 0-1
    if not 0 <= score <= 2:
        return {
            "key": "correctness score",
            "score": 0,
            "explanation": f"Score out of range: {score}",
        }

    return {
        "key": "correctness score",
        "score": score / 2,  # Normalize to 0-1
        "explanation": f"Correctness score: {score}/2",
    }


# Scoring functions applied to each output of the target function
evaluators = [correctness_evaluator]

# Run the target function over the dataset and score what it returns
results = evaluate(
    answer_with_rag,
    experiment_prefix=EXPERIMENT_PREFIX,
    evaluators=evaluators,
    data=DATASET_NAME,
)


View the evaluation results for experiment: 'rag_evaluation_experiment-65e032b8' at:
https://smith.langchain.com/o/f34f0648-f907-4864-b09a-dd5f87740bd9/datasets/96a47dfc-d5cc-422a-8ea3-42cf694a2418/compare?selectedSessions=06913239-6638-44b7-91c3-dc1f1f386bca




0it [00:00, ?it/s]