In [17]:
#!pip install dspy-ai==2.1.9 weaviate-client==3.26.2 > /dev/null

In [18]:
import dspy
from dspy.evaluate import Evaluate
from dspy.teleprompt import BootstrapFewShot, BootstrapFewShotWithRandomSearch, BootstrapFinetune
from dspy.retrieve.weaviate_rm import WeaviateRM
import weaviate
import os
from dotenv import load_dotenv
import pathlib
import re
import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score
import dsp

In [19]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

In [20]:
path_env = pathlib.Path(os.getcwd()).parent.parent / '.env'
load_dotenv(path_env)
api_key = os.getenv("OPENAI_API_KEY")

os.environ["OPENAI_API_KEY"] = api_key

In [21]:
os.environ["DSP_NOTEBOOK_CACHEDIR"] = os.path.join('.', 'cache')

In [22]:
llm = dspy.OpenAI(
    model="gpt-4-0125-preview",  #gpt-4o-2024-05-13, #"gpt-4-1106-preview", # TODO: try turbo-instruct
)

# Assumes the Weaviate collection has a text key `content`
dspy.settings.configure(lm=llm)

### Defining the training data

In [35]:
import pandas as pd
df = pd.read_excel("contrad_ex.xlsx")
df = df.astype(str)
df

Unnamed: 0,ID,Context,Text,Rationale,LabelInt,LabelStr,Faithfulness
0,0,"Testosterone is a hormone that your gonads (sex organs) mainly produce. More specifically, the testicles in people assigned male at birth (AMAB) and the ovaries in people assigned female at birth (AFAB) produce testosterone.","In the brain, the hypothalamus releases chemicals (hormones) that cause the pituitary gland to release hormones called gonadotropins. Gonadotropins stimulate the growth of the sex glands (also called gonads), which in boys are the testicles, and in girls are the ovaries. In boys, the testicles release testosterone; in girls, the ovaries release estrogen. Puberty usually starts between the ages of 8 and 13 in girls and 9 and 14 in boys.","The context highlights that ovaries produce testosterone without detailing its relatively small amount compared to estrogen, while the new text emphasizes estrogen as the primary hormone from ovaries, possibly implying that testosterone is not produced by ovaries at all.",3,Minor Discrepancy,False
1,1,"With one important exception. The exception was the shot. Compared to women using a hormonal IUD, women using the shot were three times more likely to have a blood clot.\t","Careful, not all methods are safe for new moms to use: As if they didn’t have their hands full already, women who have just given birth have to be especially careful when choosing a contraceptive. The CDC has just updated its guidelines on which methods are safe for new moms to use, and when. While hormonal methods that don’t contain estrogen—the shot, the Mirena IUD, the implant and the mini-pill—are safe for women to use immediately after giving birth, combination pills, which contain estrogen, shouldn’t be used until three weeks after giving birth, since estrogen can increase a woman’s risk of blood clots.\t","The context notes an increased risk of blood clots with the contraceptive shot compared to hormonal IUDs, the new text emphasizes its safety postpartum due to the absence of estrogen, despite the known clotting risk.",3,Minor Discrepancy,False
2,2,"Cal’s skin was described to be jaundiced again through visual assessment when he was 23 hours old, but a bilirubin test was not done. Neither was his blood typed or a Coombes test performed.\t","Diagnosis: At a minimum, babies should be checked for jaundice every 8 to 12 hours in the first 48 hours of life. It is important for your baby to be seen by a nurse or doctor when the baby is between 3 and 5 days old, because this is usually when a baby’s bilirubin level is highest. This is why, if your baby is discharged before age 72 hours, your baby should be seen within 2 days of discharge. The timing of this visit may vary depending on your baby’s age when released from the hospital and other factors.\t","The context describes a case where a baby showing signs of jaundice was not given a bilirubin test or other relevant tests before being discharged at 36 hours old, which contradicts the guidelines in the new text that recommend checking for jaundice every 8 to 12 hours in the first 48 hours and ensuring follow-up visits if discharged before 72 hours.\t",2,Major Discrepancy,False
3,3,The word prenatal means before birth. Trimester means 3 months. A normal pregnancy is around 10 months and has 3 trimesters.\t,"The average length of pregnancy is 280 days, or 40 weeks. But there is no way to know exactly when you will go into labor. Most women give birth between 38 and 41 weeks of pregnancy. The more you know about what to expect during labor, the better prepared you will be once it begins.\t","The context states that a normal pregnancy is around 10 months (which would imply 43-44 weeks), while the new text states that the average length of pregnancy is 40 weeks, with most women giving birth between 38 and 41 weeks.\t",1,Direct Contradiction,False
4,4,Key points about pregnancy loss: Pregnancy loss is the death of an unborn baby (fetus) at any time during pregnancy. Pregnancy loss occurs in up to 1 in every 4 pregnancies. Most happen during the first trimester. About half of early pregnancy losses are from defects in genes or chromosomes. Vaginal bleeding is the most common symptom of pregnancy loss.\t,"What is my risk of miscarriage by week: Your risk of pregnancy loss declines each week you’re pregnant. Around 15% of pregnancies end in miscarriage. Miscarriage risk in the second trimester (13 to 19 weeks) is between 1% and 5%. Many factors affect your risk of miscarriage such as your age and health. However, everyone’s risk of miscarriage declines each week of pregnancy if the pregnant person has no other health conditions.\t","The context states that pregnancy loss occurs in up to 25% of pregnancies, whereas the new text states that around 15% of pregnancies end in miscarriage.\t",2,Major Discrepancy,False
5,5,"IBS can be painful, but it doesn’t lead to other health problems or damage the digestive tract.\t","Luckily, diarrhea is usually short-lived, lasting no more than a few days. But when diarrhea lasts beyond a few days into weeks, it usually indicates that there's another problem — such as irritable bowel syndrome (IBS) or a more serious disorder, including persistent infection, celiac disease or inflammatory bowel disease (IBD).\t","The context states that IBS doesn't lead to other health problems or damage the digestive tract, while the new text suggests that prolonged diarrhea can indicate IBS or more serious conditions, which may imply IBS is a serious issue.\t",2,Major Discrepancy,False
6,6,All mammals have hair or fur at some stage of their development.,"Dolphins are mammals that are born with a few whiskers around their snout, which typically fall out soon after birth.","The context broadly states that all mammals have hair or fur, while the new text specifies that dolphins, as mammals, lose their initial hair quickly, suggesting minimal hair presence.",3,Minor Discrepancy,False
7,7,Regular physical activity is essential for maintaining a healthy weight.,"Weight management is primarily influenced by diet, not physical activity, according to recent studies.","The context emphasizes physical activity as essential for weight management, whereas the new text claims diet is more crucial, de-emphasizing the role of exercise.",2,Major Discrepancy,False
8,8,Decaffeinated coffee is 100% caffeine-free.,"Decaffeinated coffee still contains small amounts of caffeine, though significantly less than regular coffee.","The context suggests decaffeinated coffee has no caffeine, while the new text corrects this by noting the presence of small amounts, directly contradicting the initial claim.",1,Direct Contradiction,False
9,9,Eating carrots improves your night vision.,"Carrots contain beta-carotene, which the body converts into vitamin A, important for overall eye health, but they do not enhance night vision to significant levels.","The context claims a direct improvement in night vision from eating carrots, while the new text clarifies that while beneficial for eye health, carrots do not significantly enhance night vision.",3,Minor Discrepancy,False


In [24]:
data = []
for i, row in df.iterrows():
    data.append({
        "id": row["ID"],
        "context": row["Context"],
        "text": row["Text"],
        "rationale": row["Rationale"],
        "answer": row["LabelInt"],
        "LabelStr": row["LabelStr"],
        "Faithfulness": row["Faithfulness"]
    })
    

# Create training examples
trainset = [
    dspy.Example({**row}).with_inputs('context', 'text') for row in data[:4]
]

devset = [
    dspy.Example({**row}).with_inputs('context', 'text') for row in data[4:]
]

In [25]:
# Define the CheckContradictions signature
class CheckContradictions(dspy.Signature):
    """Identify if there are contradictions with the provided context."""
    context = dspy.InputField(desc="facts here are assumed to be true")
    text = dspy.InputField()
    Faithfulness = dspy.OutputField(desc="True/False indicating if text contradicts the context")
    rationale = dspy.OutputField(desc="Why the new text contradicts the context or not")
    
class CheckContradictionType(dspy.Signature):
    """Identify the type of contradiction detected"""
    context = dspy.InputField(desc="facts here are assumed to be true")
    text = dspy.InputField()
    rationale = dspy.OutputField(desc="Why the new text contradicts the context or not")
    LabelInt = dspy.OutputField(desc="A rating between 0 and 4: 0 = No Contradiction, 1 = Direct Contradiction, 2 = Major Discrepancy, 3 = Minor Discrepancy")
    
# Define the Assess signature for automatic assessments
class Assess(dspy.Signature):
    """Assess the quality of the contradiction detected."""
    context = dspy.InputField(desc='The context for detecting the contradiction')
    assessed_text = dspy.InputField(desc='The text on which the contradiction is being searched')
    assessed_answer = dspy.OutputField(desc="The contradiction detected, if any.")
    assessment_answer = dspy.OutputField(desc="A rating between 1 and 5. Only output the rating and nothing else.")

In [26]:
class RuleFewShot(dspy.Module):
    def __init__(self):
        super().__init__()
        self.check_contradictions = dspy.ChainOfThought(CheckContradictions)

    def forward(self, context, text):
        prediction = self.check_contradictions(context=context, text=text)
        return dspy.Prediction(context=context, text=text, Faithfulness=prediction.Faithfulness, rationale=prediction.rationale)
    
class RuleFewShotType(dspy.Module):
    def __init__(self):
        super().__init__()
        self.check_contradictions = dspy.ChainOfThought(CheckContradictionType)

    def forward(self, context, text):
        prediction = self.check_contradictions(context=context, text=text)
        return dspy.Prediction(context=context, text=text, rationale=prediction.rationale, answer=prediction.LabelInt)
    
# RAG class
class RAG(dspy.Module):
    def __init__(self, num_passages=3):
        super().__init__()
        self.retrieve = dspy.Retrieve(k=num_passages)
        self.check_contradictions = dspy.ChainOfThought(CheckContradictions)
    
    def forward(self, context, text):
        retrieved_passages = self.retrieve.forward(text).passages
        prediction = self.check_contradictions(context=retrieved_passages, text=text)
        return dspy.Prediction(context=retrieved_passages, answer=prediction.faithfulness)

In [27]:
contradiction_checker = RuleFewShotType()
output = contradiction_checker(context=devset[3].context, text=devset[3].text)
llm.inspect_history(1)





Identify the type of contradiction detected

---

Follow the following format.

Context: facts here are assumed to be true

Text: ${text}

Reasoning: Let's think step by step in order to ${produce the LabelInt}. We ...

Label Int: A rating between 0 and 4: 0 = No Contradiction, 1 = Direct Contradiction, 2 = Major Discrepancy, 3 = Minor Discrepancy

---

Context: Regular physical activity is essential for maintaining a healthy weight.

Text: Weight management is primarily influenced by diet, not physical activity, according to recent studies.

Reasoning: Let's think step by step in order to[32m produce the LabelInt. We start by acknowledging the context that emphasizes the importance of regular physical activity for maintaining a healthy weight. The text, however, presents a viewpoint that diet is the primary factor influencing weight management, suggesting that physical activity plays a lesser role. This presents a discrepancy in the emphasis on what is considered more essential f

In [39]:
def validate_answer_contained(example, pred, trace=None):
    if "3" in dsp.normalize_text(pred.answer):
        clean_pred = "3"
    elif "2" in dsp.normalize_text(pred.answer):
        clean_pred = "2"
    elif "1" in dsp.normalize_text(pred.answer):
        clean_pred = "1"
    elif "0" in dsp.normalize_text(pred.answer):
        clean_pred = "0"
    else:
        clean_pred = "-1"
    return clean_pred == example.answer


In [None]:
teleprompter = BootstrapFewShotWithRandomSearch(
    metric=validate_answer_contained,
    max_bootstrapped_demos=4,
    max_labeled_demos=16,
    num_candidate_programs=16,
    max_rounds=1,
)

compiled_classifier = teleprompter.compile(RuleFewShotType(), trainset=trainset, valset=devset)

In [41]:
llm.history[-1]

{'prompt': "Identify the type of contradiction detected\n\n---\n\nFollow the following format.\n\nContext: facts here are assumed to be true\n\nText: ${text}\n\nReasoning: Let's think step by step in order to ${produce the LabelInt}. We ...\n\nLabel Int: A rating between 0 and 4: 0 = No Contradiction, 1 = Direct Contradiction, 2 = Major Discrepancy, 3 = Minor Discrepancy\n\n---\n\nContext: Information on the logistical aspects of scheduling medical procedures and tests in healthcare facilities.\n\nText: The test will be scheduled in the surgery center, main operating room, or procedure center.\n\nReasoning: Let's think step by step in order to",
 'response': {'id': 'chatcmpl-9dqlWAvKBtVFPeTYDaIQ8c13IQ1eK',
  'choices': [{'finish_reason': 'stop',
    'index': 0,
    'logprobs': None,
    'message': {'content': 'produce the LabelInt. We know that scheduling medical procedures and tests in healthcare facilities involves selecting an appropriate location based on the type of procedure or t

In [43]:
contradiction_checker(context=devset[0].context, text=devset[0].text)

Prediction(
    context='Key points about pregnancy loss: Pregnancy loss is the death of an unborn baby (fetus) at any time during pregnancy. Pregnancy loss occurs in up to 1 in every 4 pregnancies. Most happen during the first trimester. About half of early pregnancy losses are from defects in genes or chromosomes. Vaginal bleeding is the most common symptom of pregnancy loss.\t',
    text='What is my risk of miscarriage by week: Your risk of pregnancy loss declines each week you’re pregnant. Around 15% of pregnancies end in miscarriage. Miscarriage risk in the second trimester (13 to 19 weeks) is between 1% and 5%. Many factors affect your risk of miscarriage such as your age and health. However, everyone’s risk of miscarriage declines each week of pregnancy if the pregnant person has no other health conditions.\t',
    rationale="produce the LabelInt. We start by acknowledging that the context defines pregnancy loss and mentions that it occurs in up to 1 in every 4 pregnancies, whic