In [1]:
import dspy
from dspy.retrieve.weaviate_rm import WeaviateRM
import weaviate
from dotenv import load_dotenv
import openai
import os

# Pull credentials from a local .env file into the process environment.
load_dotenv()

WCS_API_KEY = os.getenv("WCS_API_KEY")
WEAVIATE_CLUSTER_URL = os.getenv("WEAVIATE_CLUSTER_URL")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Fail fast with a readable message: passing None downstream only surfaces
# later as an opaque authentication/connection error.
_missing_env = [
    name
    for name, value in (
        ("WCS_API_KEY", WCS_API_KEY),
        ("WEAVIATE_CLUSTER_URL", WEAVIATE_CLUSTER_URL),
        ("OPENAI_API_KEY", OPENAI_API_KEY),
    )
    if not value
]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env)
    )

# Connect to a WCS instance; the OpenAI key is forwarded as a request header.
weaviate_client = weaviate.connect_to_wcs(
    cluster_url=WEAVIATE_CLUSTER_URL,
    auth_credentials=weaviate.auth.AuthApiKey(WCS_API_KEY),
    headers={
        'X-Openai-Api-Key': OPENAI_API_KEY
    },
)

# Default language model and retriever used by every DSPy module below.
llm = dspy.OpenAI(model="gpt-4o-mini")
retriever_model = WeaviateRM("WeaviateBlogChunk", weaviate_client=weaviate_client)

dspy.settings.configure(lm=llm, rm=retriever_model)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Same prompt against the globally configured LM and a temporarily swapped-in one.
poem_prompt = "Write a 3 line poem about neural networks."

print(dspy.settings.lm(poem_prompt))
context_example = dspy.OpenAI(model="gpt-4o")

# The setting key is `lm` (as in dspy.settings.configure above); the previous
# `llm=` keyword did not replace the active model. Calling dspy.settings.lm
# inside the block shows that the override is really in effect.
with dspy.context(lm=context_example):
    print(dspy.settings.lm(poem_prompt))

['In layers deep, connections weave,  \nPatterns dance, as data grieves,  \nA mind of code, where thoughts conceive.']
["In tangled webs of code they weave,\nMimicking the mind's intricate sieve,\nNeural dreams in silicon conceive."]


In [3]:
import re

# Read the FAQ source once. The context manager closes the file handle as soon
# as the text is loaded, and the explicit encoding keeps the result independent
# of the platform's default codec.
with open("faq.md", encoding="utf-8") as f:
    markdown_content = f.read()

def parse_question(markdown_content):
    """Return the text of every ``#### Q: ...`` heading in the markdown.

    Args:
        markdown_content: Full markdown document as a single string.

    Returns:
        A list of question strings in document order, without the
        ``#### Q: `` prefix or the line terminator.
    """
    # Capture only up to the end of the heading line. The character class never
    # crosses a line break, so an empty heading cannot swallow the next line,
    # and a final heading without a trailing newline is still picked up.
    question_pattern = r'#### Q: ([^\r\n]+)'
    return re.findall(question_pattern, markdown_content)

# Extract every FAQ question from the loaded markdown, then preview the first five.
questions = parse_question(markdown_content)
questions[:5]

['Why would I use Weaviate as my vector database?',
 'What is the difference between Weaviate and for example Elasticsearch?',
 'Do you offer Weaviate as a managed service?',
 'How should I configure the size of my instance?',
 'Do I need to know about Docker (Compose) to use Weaviate?']

In [4]:
# Total number of parsed questions; the next cell slices this list at fixed indices.
len(questions)

44

In [5]:
# Fixed-index split of the parsed questions: first 20 for training, next 10
# for development, the remainder for testing. Each question string is wrapped
# in a dspy.Example whose only input field is "question".
trainset, devset, testset = (
    [dspy.Example(question=text).with_inputs("question") for text in subset]
    for subset in (questions[:20], questions[20:30], questions[30:])
)

In [6]:
# Inspect the first development example to confirm the Example wrapping.
devset[0]

Example({'question': 'Is there support to multiple versions of the query/document embedding models to co-exist at a given time? (helps with live experiments of new model versions)'}) (input_keys={'question'})

In [7]:
# Separate judge model used only inside the metric (see llm_metric below).
metricLM = dspy.OpenAI(
    model="gpt-4o",
    max_tokens=1000,
    model_type="chat",
)

# Signature for the LLM-as-judge call made by llm_metric.
# NOTE: the class docstring and every `desc` string below are rendered into the
# prompt sent to the judge model, so editing them changes the metric's behavior.
class Assess(dspy.Signature):
    """Assess the quality of an answer to a question."""

    # Inputs: supporting passages (or 'N/A'), the criterion to judge by, and the answer under review.
    context = dspy.InputField(desc = "The context for answering the question")
    assessment_criterion = dspy.InputField(desc = "The evaluation criterion")
    assessed_answer = dspy.InputField(desc = "The answer to the question")
    # Output: the judge's rating as text; llm_metric converts it to a number.
    assessment_answer = dspy.OutputField(desc = "A rating between 1 and 5. Only output the rating and nothing else.")

def _parse_rating(raw_rating):
    """Return the first number found in a judge reply as a float.

    The judge is asked to output only the rating, but replies such as
    "Rating: 4" or "4/5" still occur; a bare ``float()`` would crash on those.

    Raises:
        ValueError: if the reply contains no number at all.
    """
    found = re.search(r"\d+(?:\.\d+)?", str(raw_rating))
    if found is None:
        raise ValueError(f"No numeric rating found in judge output: {raw_rating!r}")
    return float(found.group())


def llm_metric(gold, pred, trace = None):
    """Score a predicted answer with an LLM judge on three criteria.

    Args:
        gold: Example exposing the ``question`` that was asked.
        pred: Prediction exposing the generated ``answer``.
        trace: Accepted for compatibility with the metric call signature; unused.

    Returns:
        float: ``(detail + 2 * faithful + overall) / 5.0``. With ratings in
        1-5 this ranges from 0.8 to 4.0; faithfulness is weighted double.

    Raises:
        ValueError: if a judge reply contains no numeric rating.
    """
    predicted_answer = pred.answer
    question = gold.question

    print(f"Test question: {question}")
    print(f"Predicted answer: {predicted_answer}")

    # Criterion prompts get their own names so they are not shadowed by the
    # judge predictions produced below.
    detail_criterion = "Is the assessed answer detailed?"
    faithful_criterion = "Is the assessed text grounded in the context? Say no if it includes significant information not in the context."
    overall_criterion = f"Please rate how well this answer addresses the question, `{question}` based on the context.\n `{predicted_answer}`"

    # Run retrieval and all three judgments with the dedicated judge model.
    with dspy.context(lm = metricLM):
        context = dspy.Retrieve(k = 5)(question).passages
        assess = dspy.ChainOfThought(Assess)
        # Level of detail is judged on the answer alone, hence no context.
        detail = assess(context = 'N/A', assessment_criterion = detail_criterion, assessed_answer = predicted_answer)
        faithful = assess(context = context, assessment_criterion = faithful_criterion, assessed_answer = predicted_answer)
        overall = assess(context = context, assessment_criterion = overall_criterion, assessed_answer = predicted_answer)

    print(f"Faithful: {faithful.assessment_answer}")
    print(f"Detail: {detail.assessment_answer}")
    print(f"Overall: {overall.assessment_answer}")

    total = (
        _parse_rating(detail.assessment_answer)
        + _parse_rating(faithful.assessment_answer) * 2
        + _parse_rating(overall.assessment_answer)
    )

    return total / 5.0


In [8]:
# Smoke-test the metric on a hand-written question/answer pair.
test_example = dspy.Example(question="What do cross encoders do?")
test_pred = dspy.Example(answer="They re-rank documents.")

llm_metric(gold=test_example, pred=test_pred)

Test question: What do cross encoders do?
Predicted answer: They re-rank documents.
Faithful: 1
Detail: 1
Overall: 2


1.0

In [9]:
# Second hand-written pair; here only the type of the metric's return value is displayed.
test_example = dspy.Example(question = "What do cross encoders do?")
test_pred = dspy.Example(answer = "They index data.")

type(llm_metric(gold = test_example, pred = test_pred))

Test question: What do cross encoders do?
Predicted answer: They index data.
Faithful: 1
Detail: 1
Overall: 1


float

In [11]:
# Dump the judge model's recent call history (n=3) to inspect the exact prompts the metric produced.
metricLM.inspect_history(n=3)





Assess the quality of an answer to a question.

---

Follow the following format.

Context: The context for answering the question

Assessment Criterion: The evaluation criterion

Assessed Answer: The answer to the question

Reasoning: Let's think step by step in order to ${produce the assessment_answer}. We ...

Assessment Answer: A rating between 1 and 5. Only output the rating and nothing else.

---

Context:
[1] «[Cross Encoders](#cross-encoders) (collapsing the use of Large Language Models for ranking into this category as well)
1. [Metadata Rankers](#metadata-rankers)
1. [Score Rankers](#score-rankers)

## Cross Encoders
Cross Encoders are one of the most well known ranking models for content-based re-ranking. There is quite a collection of pre-trained cross encoders available on [sentence transformers](https://www.sbert.net/docs/pretrained_cross-encoders.html). We are currently envisioning interfacing cross encoders with Weaviate using the following syntax.»
[2] «Bi-Encoders 

'\n\n\nAssess the quality of an answer to a question.\n\n---\n\nFollow the following format.\n\nContext: The context for answering the question\n\nAssessment Criterion: The evaluation criterion\n\nAssessed Answer: The answer to the question\n\nReasoning: Let\'s think step by step in order to ${produce the assessment_answer}. We ...\n\nAssessment Answer: A rating between 1 and 5. Only output the rating and nothing else.\n\n---\n\nContext:\n[1] «[Cross Encoders](#cross-encoders) (collapsing the use of Large Language Models for ranking into this category as well)\n1. [Metadata Rankers](#metadata-rankers)\n1. [Score Rankers](#score-rankers)\n\n## Cross Encoders\nCross Encoders are one of the most well known ranking models for content-based re-ranking. There is quite a collection of pre-trained cross encoders available on [sentence transformers](https://www.sbert.net/docs/pretrained_cross-encoders.html). We are currently envisioning interfacing cross encoders with Weaviate using the followi