In [None]:
from openai import OpenAI
import json
import requests
import os

# Credentials come from the environment when they are already set there; the
# literals below are placeholders only. Never commit a real key in this cell.
os.environ.setdefault("OPENAI_API_KEY", 'openai-key')

# Constructed without arguments, so it relies on OPENAI_API_KEY set above.
client = OpenAI()

# Settings for the Diffbot Natural Language API request built in get_request().
FIELDS = "entities"
HOST = "nl.diffbot.com"
TOKEN = os.environ.get("DIFFBOT_TOKEN", "diffbot-token")

import requests

def get_request(payload, timeout=30):
    """POST ``payload`` to the Diffbot Natural Language API and return the parsed JSON.

    Args:
        payload: JSON-serialisable request body (sent with ``json=``).
        timeout: Seconds to wait for the server before ``requests`` gives up,
            so a stalled connection cannot hang the notebook indefinitely.

    Returns:
        The decoded JSON body, or ``None`` when the body is not valid JSON
        (the raw response is printed for debugging in that case).

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
    """
    res = requests.post(
        "https://{}/v1/".format(HOST),
        # Let requests build and URL-encode the query string.
        params={"fields": FIELDS, "token": TOKEN},
        json=payload,
        timeout=timeout,
    )
    try:
        return res.json()
    except ValueError:
        # Malformed JSON surfaces as a ValueError subclass; anything else is a
        # genuine bug and must not be swallowed.
        print("Bad response: " + res.text)
        print(res.status_code)
        print(res.headers)
        return None

def get_entity_types_diffbot(res):
    """Return the type names of the first entity in a Diffbot NL response.

    Args:
        res: Parsed response dict as returned by ``get_request``. ``None``
            (which ``get_request`` returns on a bad response) and a dict
            without ``'entities'`` are treated like an empty entity list.

    Returns:
        A list with the ``'name'`` of every entry in the first entity's
        ``'allTypes'``, or the string
        ``"No valid entity type found from the answer."`` when there are no
        entities. Callers rely on that string sentinel, so it is kept as-is.
    """
    entities = (res or {}).get('entities') or []
    if not entities:
        return "No valid entity type found from the answer."

    # Only the first entity is inspected.
    all_types = entities[0].get('allTypes') or []
    return [entry['name'] for entry in all_types if 'name' in entry]

def type_evaluator_by_llm(question):
    """Ask the chat model which entity type the answer to ``question`` should have.

    The system prompt lists the allowed entity types and tells the model to
    reply "yes/no type" for yes-or-no questions. The model's reply text is
    returned unmodified.
    """
    system_prompt = "You are a helpful assistant who determines the correct entity type that the answer of the question should belongs to. Possible entity types are organization, person, product, country, city, article. And if the question is a yes-or-no type of question, answer: yes/no type. Yes/No type questions start with: Is, Was, Do, Did, etc."
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": question},
    ]
    completion = client.chat.completions.create(model="gpt-4-turbo", messages=conversation)
    # The first choice's message content is the predicted type.
    return completion.choices[0].message.content

def entity_linker(answer_vector, question):
    """Check whether an answer's entity type matches the type its question expects.

    The answer is sent to the Diffbot Natural Language API to obtain its entity
    type(s); the question is sent to the LLM to obtain the expected type.

    Args:
        answer_vector: Answer text to verify. ``None`` or the literal "none"
            (any case, surrounding whitespace ignored) short-circuits as a match.
        question: The question the answer responds to.

    Returns:
        A 3-tuple of strings ``(correct_question_type, original_answer_type,
        type_status)``: the expected type, the type(s) found for the answer,
        and the match verdict.
    """
    if answer_vector is None or answer_vector.strip().lower() == "none":
        return (
            "The entity type of the answer should be: None",
            "Entity type(s) of the answer: None",
            "entity type matched!",
        )

    # fact-check validity of vector-based answer via Diffbot Natural Language API
    res = get_request({
        "content": "Please verify the type of: " + answer_vector,
        "lang": "en",
        "format": "plain text with title"
    })

    try:
        entity_types = get_entity_types_diffbot(res)
    except (TypeError, KeyError, IndexError):
        # get_request returns None on a bad response, and the payload may lack
        # the expected keys; treat both as "no types found" instead of crashing.
        entity_types = []

    # Ask the LLM once and reuse the reply: a second call costs another request
    # and may return a different type than the one used for matching.
    question_type = type_evaluator_by_llm(question) or ""
    correct_question_type = "The entity type of the answer should be: " + question_type
    expected = question_type.lower()

    # The prompt makes the LLM reply "yes/no type" for yes-or-no questions, so
    # "yes" alone identifies them. The former `"yes" or "no" in ...` test was
    # always true and made the final branch unreachable.
    if "yes" in expected:
        original_answer_type = "Entity type(s) of the answer: " + "yes/no type"
        type_status = "entity type not required for the answer!"

    elif entity_types:
        if isinstance(entity_types, list):
            # An empty expected type would be a substring of everything.
            type_matches = bool(expected) and any(expected in name.lower() for name in entity_types)
            original_answer_type = "Entity type(s) of the answer: " + ', '.join(entity_types)
        else:
            # String sentinel from get_entity_types_diffbot: no entity found.
            type_matches = False
            original_answer_type = "Entity type(s) of the answer: " + entity_types

        type_status = "entity type matched!" if type_matches else "entity type not matched!"

    else:
        original_answer_type = "Entity type(s) of the answer: invalid."
        type_status = "entity type not matched!"

    return correct_question_type, original_answer_type, type_status


In [241]:
# Test 1: a list of people as the answer to a "who" question
entity_linker(
    answer_vector="Sam Altman, Greg Brockman, Ilya Sutskever, Andrej Karpathy, Wojciech Zaremba",
    question="Who are the other founders Elon Musk co-found OpenAI with?",
)

('The entity type of the answer should be: person',
 'Entity type(s) of the answer: person',
 'entity type matched!')

In [243]:
# Test 2: the literal answer "None" takes the early-return path
entity_linker(
    answer_vector="None",
    question="Who are the other founders Elon Musk co-found SpaceX with?",
)

('The entity type of the answer should be: None',
 'Entity type(s) of the answer: None',
 'entity type matched!')

In [248]:
# Test 3: yes/no question starting with "Did"
entity_linker(
    answer_vector="No",
    question="Did Elon co-found The Boring Company with other founders?",
)

('The entity type of the answer should be: yes/no type',
 'Entity type(s) of the answer: yes/no type',
 'entity type not required for the answer!')

In [249]:
# Test 4: yes/no question starting with "Is"
entity_linker(
    answer_vector="No",
    question="Is The Boring Company one of the companies Elon co-founded with other founders?",
)

('The entity type of the answer should be: yes/no type',
 'Entity type(s) of the answer: yes/no type',
 'entity type not required for the answer!')

In [274]:
# Test 5: gibberish answer to a question about companies
entity_linker(
    answer_vector="sdalfkfjgfldkger",
    question="List out the companies that Elon Musk co-founded with other founders.",
)

('The entity type of the answer should be: organization',
 'Entity type(s) of the answer: No valid entity type found from the answer.',
 'entity type not matched!')

In [271]:
# Test 6: answer "The Boring Company" to a question about companies
entity_linker(
    answer_vector="The Boring Company",
    question="List out one of the companies that Elon Musk founded himself.",
)

('The entity type of the answer should be: organization',
 'Entity type(s) of the answer: organization',
 'entity type matched!')

In [276]:
# Test 7: answer "PetrolCar" to the same company question
entity_linker(
    answer_vector="PetrolCar",
    question="List out one of the companies that Elon Musk founded himself.",
)

('The entity type of the answer should be: organization',
 'Entity type(s) of the answer: No valid entity type found from the answer.',
 'entity type not matched!')

In [277]:
# Test 8: answer "SpaceX" to the same company question
entity_linker(
    answer_vector="SpaceX",
    question="List out one of the companies that Elon Musk founded himself.",
)

('The entity type of the answer should be: organization',
 'Entity type(s) of the answer: organization',
 'entity type matched!')

In [279]:
# Test 9: answer "EarthY" to the same company question
entity_linker(
    answer_vector="EarthY",
    question="List out one of the companies that Elon Musk founded himself.",
)

('The entity type of the answer should be: organization',
 'Entity type(s) of the answer: No valid entity type found from the answer.',
 'entity type not matched!')

# Update Vanilla DSPy RAG with Entity Linker

In [124]:
# DSPy signature for the answer-generation step: given retrieved context and a
# question, produce a short answer.
# NOTE(review): DSPy presumably feeds the class docstring and the `desc`
# strings into the prompt, so treat them as runtime text — confirm before
# rewording any of them.
class GenerateAnswer(dspy.Signature):
    """Answer questions with short factoid answers."""
    context = dspy.InputField(desc="may contain relevant facts")  # retrieved passages
    question = dspy.InputField()  # the user's question
    answer = dspy.OutputField(desc="often between 1 and 5 words")  # generated answer

# DSPy signature for the LM-based type check: given the expected entity type,
# the question and the generated answer, produce a match verdict.
# NOTE(review): DSPy presumably feeds the class docstring and the `desc`
# strings into the prompt, so treat them as runtime text — confirm before
# rewording any of them.
class AnswerTypeValidityCheck(dspy.Signature):
    """Check if the generated answer has the valid type(s) and valid information."""
    entity_type = dspy.InputField(desc="check valid entity type for the answer")  # expected-type string from entity_linker
    question = dspy.InputField()  # the user's question
    answer = dspy.InputField()  # the generated answer being checked
    type_status = dspy.OutputField(desc="entity type matched or not")  # LM verdict

class vanilla_dspy_rag(dspy.Module):
    """Retrieve-then-answer DSPy pipeline with an entity-type check on the answer.

    ``forward`` retrieves passages for the question, generates an answer with
    chain-of-thought, then checks the answer's entity type twice: once with
    ``entity_linker`` (Diffbot + LLM) and once with the ``AnswerTypeValidityCheck``
    signature.
    """

    # The default retrieves 3 passages; pass num_passages=1 to avoid the same
    # passage being repeatedly retrieved multiple times.
    def __init__(self, num_passages=3):
        super().__init__()
        self.retrieve = dspy.Retrieve(k=num_passages)
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)
        self.check_answer_type = dspy.ChainOfThought(AnswerTypeValidityCheck)

    def forward(self, question):
        """Answer ``question`` and report whether the answer's entity type fits.

        Returns:
            A ``dspy.Prediction`` with ``context``, ``answer`` and ``type_status``
            (the ``entity_linker`` verdict), plus the diagnostics
            ``expected_type``, ``answer_type`` and ``type_validation`` (the
            verdict of the ``AnswerTypeValidityCheck`` step).
        """
        # Step 1: Retrieve context based on the question
        context = self.retrieve(question).passages

        # Step 2: Generate an answer based on the context and question
        answer = self.generate_answer(context=context, question=question).answer

        # Step 3: Validate the answer type using the entity_linker function
        correct_question_type, original_answer_type, type_status = entity_linker(answer, question)

        # Step 4: Second opinion from the AnswerTypeValidityCheck signature
        validation = self.check_answer_type(
            entity_type=correct_question_type,
            question=question,
            answer=answer,
        ).type_status

        # Return the diagnostics too, so the extra LM call and the entity_linker
        # details are not computed only to be thrown away.
        return dspy.Prediction(
            context=context,
            answer=answer,
            type_status=type_status,
            expected_type=correct_question_type,
            answer_type=original_answer_type,
            type_validation=validation,
        )


In [30]:
# nomic embedding + llama3
# Call the module itself rather than .forward() so DSPy's __call__ wrapper runs.
question_org_elon_cofounded_1b = "What are the other founders Elon Musk co-found SpaceX with?"
vanilla_dspy_rag_q1b = vanilla_dspy_rag_nomic_llama3(question=question_org_elon_cofounded_1b)
vanilla_dspy_rag_q1b

Prediction(
    context=["== Notable members ==\nMaye Musk (née Haldeman; born 1948), model and dietitian. She has appeared on the cover of several magazines, including a Time magazine health edition, Women's Day, and Vogue; Elon's mother.\nElon Musk (born 1971), entrepreneur and business magnate. Variously CEO, CTO, and/or Chairman of SpaceX, Tesla, X Corp., and Neuralink. He was Time Magazine's 2021 Person of the Year.\nKimbal Musk (born 1972), entrepreneur, philanthropist, and restaurateur. He founded Zip2 in 1998 with his brother Elon and later sold it to Compaq for $307 million. He is the co-founder and chairman of Big Green.\nTosca Musk (born 1974), filmmaker and sister of Elon; she is the co-founder of Passionflix, an OTT entertainment streaming platform and production company.\nLyndon Rive (born 1977), businessman who co-founded SolarCity and served as its CEO until 2017. He is Elon's cousin through his mother Kaye Rive, Maye Musk's twin.\n\n\n== References ==", "== Notable mem

In [119]:
# NOTE(review): the signature and module classes reference dspy, so on a fresh
# kernel this import has to run before their definitions.
import dspy

# lm_llama3 and nomic_rm are not defined in this part of the notebook; they must
# already exist in the kernel.
dspy.settings.configure(lm=lm_llama3, rm=nomic_rm)
vanilla_dspy_rag_nomic_llama3 = vanilla_dspy_rag()
question_org_elon_cofounded_1b = "What are the other founders Elon Musk co-founded SpaceX with?"
# Call the module itself rather than .forward() so DSPy's __call__ wrapper runs.
vanilla_dspy_rag_q1b = vanilla_dspy_rag_nomic_llama3(question=question_org_elon_cofounded_1b)

print("Generated Answer:", vanilla_dspy_rag_q1b.answer)
print("Entity Type Status:", vanilla_dspy_rag_q1b.type_status)


Generated Answer: Elon Musk co-founded SpaceX with Jim Cantrell.
Entity Type Status: entity type matched!


With the entity linker, we finally no longer get the answer 'PayPal' (type: organization). However, we still see hallucination: Elon Musk is the sole founder of SpaceX, yet the model names a co-founder. We therefore need to bring in ground truth from the knowledge graph and see how well the optimizer can make the LLM stick to that ground truth.