In [None]:
!pip install dspy-ai
!pip install -r dspy-requirements-2.txt
!pip install -U dspy-ai

# Environment Setup

In [1]:
import dspy
from dspy.retrieve.chromadb_rm import ChromadbRM
import json
from langchain.chains import GraphCypherQAChain
from langchain_community.graphs import Neo4jGraph
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
import os

os.environ["OPENAI_API_KEY"] = 'open-api-key'

# Elon Musk's Wikipedia Page as Our Knowledge Source

In [None]:
from langchain_community.document_loaders import WikipediaLoader

query = "Elon Musk"
raw_documents = WikipediaLoader(query=query).load()

Embed our text data in chromadb vector database

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Split
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=512, chunk_overlap=100
)
all_splits = text_splitter.split_documents(raw_documents)

import chromadb.utils.embedding_functions as embedding_functions
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

CHROMA_COLLECTION_NAME = "dspy-rag-chroma"
CHROMADB_DIR = "dspy_rag_chroma/"

# Index
vectorstore = Chroma.from_documents(
    documents=all_splits,
    collection_name=CHROMA_COLLECTION_NAME,
    embedding=embeddings,
    persist_directory=CHROMADB_DIR
)
retriever = vectorstore.as_retriever()

In [31]:
import chromadb
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
from dspy.retrieve.chromadb_rm import ChromadbRM
import os

embedding_function = OpenAIEmbeddingFunction(
    api_key=os.environ.get('OPENAI_API_KEY'),
    model_name="text-embedding-ada-002"
)

rm = ChromadbRM(CHROMA_COLLECTION_NAME, CHROMADB_DIR, embedding_function, k=3)

# Build Knowlegde Graph of Elon Musk with Diffbot's Natural Language API

In [43]:
# DiffbotGraphTransformer calls Diffbot Natural Language API to extract entities and relationships in the article
from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer
import os

diffbot_api_key = "Diffbot-token" 

diffbot_nlp = DiffbotGraphTransformer(diffbot_api_key=diffbot_api_key)

# Diffbot's Natural Language API converts unstructured text data into knowlegde graphs
graph_documents = diffbot_nlp.convert_to_graph_documents(raw_documents)

# add knowledge graph data to the neo4j database
graph.add_graph_documents(graph_documents)

In [10]:
# connect to our neo4j database
from langchain_community.graphs import Neo4jGraph

url = "url"
username = "neo4j"
password = "pwd"

graph = Neo4jGraph(url=url, username=username, password=password)

# Vanilla DSPy RAG Pipeline

In [84]:
class GenerateAnswer(dspy.Signature):
    """Answer questions with short factoid answers."""

    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField()
    answer = dspy.OutputField(desc="often between 1 and 5 words")

vanilla_dspy_rag_lm = dspy.OpenAI(model='gpt-3.5-turbo-instruct')
dspy.settings.configure(lm=vanilla_dspy_rag_lm, rm=rm)

class vanilla_dspy_rag(dspy.Module):
    
    # we set num_passages=1 to avoid the same passage being repeatedly retrieved for multiple times
    def __init__(self, num_passages=1):
        super().__init__()

        self.retrieve = dspy.Retrieve(k=num_passages)
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)
    
    def forward(self, question):
        context = self.retrieve(question).passages
        prediction = self.generate_answer(context=context, question=question)
        return dspy.Prediction(context=context, answer=prediction.answer)

# initial_context_from_knowledge_graph : fetch structured and grounded facts in knowlegde graph

In [12]:
# function to query neo4j graph database

initial_context_from_knowledge_graph = GraphCypherQAChain.from_llm(
    ChatOpenAI(model="gpt-3.5-turbo-instruct", temperature=0), graph=graph, verbose=True
)

# Complete Pipeline of DSPy RAG with Knowledge Graph

In [95]:
# Set up the LM
lm = dspy.OpenAI(model='gpt-3.5-turbo-instruct')
dspy.settings.configure(lm=lm, rm=rm)

class EnrichQueryWithKG(dspy.Signature):
    """Refines and enriches questions using knowledge graph context."""
    question = dspy.InputField()
    kg_context = dspy.InputField(type=str, desc="Knowledge graph metadata") #changed from dict to str
    enriched_query = dspy.OutputField(desc="Query enriched with KG context")

class EnhanceContextWithKG(dspy.Signature):
    """Consolidates and summarizes information from enriched queries and passages."""
    query = dspy.InputField()
    passages = dspy.InputField(desc="Passages retrieved based on the enriched query")
    enhanced_context = dspy.OutputField(desc="Context enhanced with KG data")

class KGEnrichedQueryModule(dspy.Module):
    def __init__(self, temperature: int = 0):
        super().__init__()
        self.temperature = temperature
        self.enrich_query_module = dspy.ChainOfThought(EnrichQueryWithKG)
        self.enhance_context_module = dspy.ChainOfThought(EnhanceContextWithKG)
        self.retrieve = rm

    def forward(self, question: str, kg_context: str) -> str: #change from dict to str
        enriched_query = self.enrich_query_module(question=question, kg_context=kg_context).enriched_query
        passages = self.retrieve(enriched_query)
        final_answer = self.enhance_context_module(query=enriched_query, passages=passages).enhanced_context
        return final_answer


# Question 1 : What industry or industries is Neuralink in?
# A. Vanilla DSPy RAG 

In [73]:
question_neuralink_industry = 'What industry or industries is Neuralink in?'
vanilla_dspy_rag = vanilla_dspy_rag()
vanilla_dspy_rag_q1 = vanilla_dspy_rag.forward(question_neuralink_industry)
vanilla_dspy_rag_q1

Prediction(
    context=['=== History ===\nNeuralink was founded in 2016 by Elon Musk and a founding team of seven scientists and engineers. The group of initial hires consisted of experts in areas such as neuroscience, biochemistry and robotics. The trademark "Neuralink" was purchased from its previous owners in January 2017.In April 2017, Neuralink announced that it was aiming to make devices to treat serious brain diseases in the short-term, with the eventual goal of human enhancement, sometimes called transhumanism. Musk had said his interest in the idea partly stemmed from the science fiction concept of "neural lace" in the fictional universe in The Culture, a series of 10 novels by Iain M. Banks.Musk defined the neural lace as a "digital layer above the cortex" that would not necessarily imply extensive surgical insertion but ideally an implant through a vein or artery. He said the long-term goal is to achieve "symbiosis with artificial intelligence", which he perceives as an exi

In [76]:
# We can check and see what's happening under the hood of the Vanilla DSPy RAG pipeline
vanilla_dspy_rag_lm.inspect_history()





Answer questions with short factoid answers.

---

Follow the following format.

Context: may contain relevant facts

Question: ${question}

Reasoning: Let's think step by step in order to ${produce the answer}. We ...

Answer: often between 1 and 5 words

---

Context:
«=== History ===
Neuralink was founded in 2016 by Elon Musk and a founding team of seven scientists and engineers. The group of initial hires consisted of experts in areas such as neuroscience, biochemistry and robotics. The trademark "Neuralink" was purchased from its previous owners in January 2017.In April 2017, Neuralink announced that it was aiming to make devices to treat serious brain diseases in the short-term, with the eventual goal of human enhancement, sometimes called transhumanism. Musk had said his interest in the idea partly stemmed from the science fiction concept of "neural lace" in the fictional universe in The Culture, a series of 10 novels by Iain M. Banks.Musk defined the neural lace as a "digit

# Fact-check with Our Knowledge Graph

In [17]:
# Reminder that our question is: question_neuralink_industry = 'What industry or industries is Neuralink in?'
kg_context = initial_context_from_knowledge_graph.run(question_neuralink_industry)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (o:Organization {name: "Neuralink"})-[:INDUSTRY]->(s:Skill)
RETURN s.name
[0m
Full Context:
[32;1m[1;3m[{'s.name': 'computer interfaces'}, {'s.name': 'neurotechnology'}][0m

[1m> Finished chain.[0m


In [18]:
kg_context

'Neuralink is in the computer interfaces and neurotechnology industries.'

# B. DSPy RAG with Knowledge Graphs

In [110]:
dspy_w_kg = KGEnrichedQueryModule(temperature=0)

In [73]:
kg_context

'Neuralink is in the computer interfaces and neurotechnology industries.'

In [75]:
answer = dspy_w_kg.forward(question=question_neuralink_industry, kg_context = kg_context)
print(answer)

can confidently say that Neuralink is in the technology industry, specifically in the field of neurotechnology and brain-computer interfaces.


In [79]:
# We can check and see what's happening under the hood of the DSPy RAG w/ Knowledge Graph pipeline
lm.inspect_history()





Consolidates and summarizes information from enriched queries and passages.

---

Follow the following format.

Query: ${query}

Passages: Passages retrieved based on the enriched query

Reasoning: Let's think step by step in order to ${produce the enhanced_context}. We ...

Enhanced Context: Context enhanced with KG data

---

Query: What industry or industries is Neuralink in that are related to computer interfaces and neurotechnology?

Passages: «{'long_text': 'Neuralink Corp. is an American neurotechnology company that is developing implantable brain–computer interfaces (BCIs), based in Fremont, California, as of 2024. Founded by Elon Musk and a team of seven scientists and engineers, Neuralink was launched in 2016 and was first publicly reported in March 2017.Since its founding, the company has hired several high-profile neuroscientists from various universities. By July 2019, it had received $158 million in funding (of which $100 million was from Musk) and was employing a sta

# Question 2 : List out the companies that Elon Musk co-founded with other founders.
# A. Vanilla DSPy RAG 

In [86]:
question_org_elon_cofounded = "List out the companies that Elon Musk co-founded with other founders."
vanilla_dspy_rag = vanilla_dspy_rag()
vanilla_dspy_rag_q2 = vanilla_dspy_rag.forward(question=question_org_elon_cofounded)
vanilla_dspy_rag_q2

Prediction(
    context=['In October 2002, eBay acquired PayPal for $1.5 billion, and that same year, with $100 million of the money he made, Musk founded SpaceX, a spaceflight services company. In 2004, he became an early investor in electric vehicle manufacturer Tesla Motors, Inc. (now Tesla, Inc.). He became its chairman and product architect, assuming the position of CEO in 2008. In 2006, Musk helped create SolarCity, a solar-energy company that was acquired by Tesla in 2016 and became Tesla Energy. In 2013, he proposed a hyperloop high-speed vactrain transportation system. In 2015, he co-founded OpenAI, a nonprofit artificial intelligence research company. The following year, Musk co-founded Neuralink—a neurotechnology company developing brain–computer interfaces—and the Boring Company, a tunnel construction company. In 2022, he acquired Twitter for $44 billion. He subsequently merged the company into newly created X Corp. and rebranded the service as X the following year. In Marc

In [88]:
vanilla_dspy_rag_lm.inspect_history(15)





Answer questions with short factoid answers.

---

Follow the following format.

Context: may contain relevant facts

Question: ${question}

Reasoning: Let's think step by step in order to ${produce the answer}. We ...

Answer: often between 1 and 5 words

---

Context:
«In October 2002, eBay acquired PayPal for $1.5 billion, and that same year, with $100 million of the money he made, Musk founded SpaceX, a spaceflight services company. In 2004, he became an early investor in electric vehicle manufacturer Tesla Motors, Inc. (now Tesla, Inc.). He became its chairman and product architect, assuming the position of CEO in 2008. In 2006, Musk helped create SolarCity, a solar-energy company that was acquired by Tesla in 2016 and became Tesla Energy. In 2013, he proposed a hyperloop high-speed vactrain transportation system. In 2015, he co-founded OpenAI, a nonprofit artificial intelligence research company. The following year, Musk co-founded Neuralink—a neurotechnology company developi

# Did we just spot hallucination in RAG?

This is wrong: "Elon co-founded SpaceX, Tesla, the Boring Company, xAI" 

# B. DSPy RAG with Knowledge Graph

In [42]:
# Reminder that our question is: question_org_elon_cofounded = "List out the companies that Elon Musk co-founded with other founders."
kg_context = initial_context_from_knowledge_graph(question_org_elon_cofounded)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (p:Person {name: "Elon Musk"})-[:FOUNDED_BY]-(o:Organization)-[:FOUNDED_BY]-(coFounder:Person)
WHERE p <> coFounder
RETURN o.name AS CompanyName, collect(coFounder.name) AS CoFounders
[0m
Full Context:
[32;1m[1;3m[{'CompanyName': 'SolarCity', 'CoFounders': ['Lyndon Rive']}, {'CompanyName': 'Zip2', 'CoFounders': ['Kimbal Musk']}, {'CompanyName': 'OpenAI', 'CoFounders': ['Wojciech Zaremba', 'Sam Altman', 'Andrej Karpathy', 'Trevor Blackwell', 'Ilya Sutskever']}, {'CompanyName': 'Excession LLC', 'CoFounders': ['Jared Birchall']}][0m

[1m> Finished chain.[0m


In [43]:
kg_context

{'query': 'List out the companies that Elon Musk co-founded with other founders.',
 'result': "I don't know the answer.",
 'intermediate_steps': [{'query': 'cypher\nMATCH (p:Person {name: "Elon Musk"})-[:FOUNDED_BY]-(o:Organization)-[:FOUNDED_BY]-(coFounder:Person)\nWHERE p <> coFounder\nRETURN o.name AS CompanyName, collect(coFounder.name) AS CoFounders\n'},
  {'context': [{'CompanyName': 'SolarCity', 'CoFounders': ['Lyndon Rive']},
    {'CompanyName': 'Zip2', 'CoFounders': ['Kimbal Musk']},
    {'CompanyName': 'OpenAI',
     'CoFounders': ['Wojciech Zaremba',
      'Sam Altman',
      'Andrej Karpathy',
      'Trevor Blackwell',
      'Ilya Sutskever']},
    {'CompanyName': 'Excession LLC', 'CoFounders': ['Jared Birchall']}]}]}

In [96]:
# turn our dictionary-type kg data into string-type to fit in the DSPy pipeline
kg_context = "These are the companies that Elon has co-founded with other people:" + str(kg_data['intermediate_steps'][1]['context'])
kg_context

"These are the companies that Elon has co-founded with other people:[{'CompanyName': 'SolarCity', 'CoFounders': ['Lyndon Rive']}, {'CompanyName': 'Zip2', 'CoFounders': ['Kimbal Musk']}, {'CompanyName': 'OpenAI', 'CoFounders': ['Wojciech Zaremba', 'Sam Altman', 'Andrej Karpathy', 'Trevor Blackwell', 'Ilya Sutskever']}, {'CompanyName': 'Excession LLC', 'CoFounders': ['Jared Birchall']}]"

In [97]:
dspy_w_kg = KGEnrichedQueryModule(temperature=0)
answer = dspy_w_kg.forward(question=question_org_elon_cofounded, kg_context = kg_context)
print(answer)

According to KG data, Elon Musk is listed as a co-founder of several companies, including SpaceX, Tesla, X Corp., the Boring Company, xAI, Neuralink, OpenAI, and the


In [103]:
lm.inspect_history(7)





Refines and enriches questions using knowledge graph context.

---

Follow the following format.

Question: ${question}

Kg Context: Knowledge graph metadata

Reasoning: Let's think step by step in order to ${produce the enriched_query}. We ...

Enriched Query: Query enriched with KG context

---

Question: List out the companies that Elon Musk co-founded with other founders.

Kg Context: These are the companies that Elon has co-founded with other people:[{'CompanyName': 'SolarCity', 'CoFounders': ['Lyndon Rive']}, {'CompanyName': 'Zip2', 'CoFounders': ['Kimbal Musk']}, {'CompanyName': 'OpenAI', 'CoFounders': ['Wojciech Zaremba', 'Sam Altman', 'Andrej Karpathy', 'Trevor Blackwell', 'Ilya Sutskever']}, {'CompanyName': 'Excession LLC', 'CoFounders': ['Jared Birchall']}]

Reasoning: Let's think step by step in order to[32m find out the companies that Elon Musk co-founded with other founders. First, we need to identify the companies that Elon has co-founded. Then, we need to look at t