In [1]:
from dotenv import load_dotenv
# Load settings from a local .env file (python-dotenv) so the os.getenv checks
# further down can see them.
load_dotenv()

True

In [2]:
import os

In [3]:
# Environment variables this notebook expects to be set before it runs.
required_env_vars = [
    "NEO4J_URI",
    "NEO4J_USERNAME",
    "NEO4J_PASSWORD",
    "OPENAI_API_KEY",
    "ANTHROPIC_API_KEY",
]

# Check every variable in one pass so a single run lists everything that is
# missing, instead of stopping at the first unset variable and forcing a
# fix-and-rerun cycle per variable. Empty values count as missing.
missing_env_vars = [var for var in required_env_vars if not os.getenv(var)]
assert not missing_env_vars, (
    "Environment variable(s) not set: " + ", ".join(missing_env_vars)
)

In [4]:
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic

In [5]:
# Settings for the Anthropic chat model used for graph extraction:
# temperature 0, up to 4096 output tokens, at most 2 retries.
anthropic_options = dict(
    model="claude-3-5-sonnet-20240620",
    temperature=0,
    max_tokens=4096,
    max_retries=2,
)
llm_anthropic = ChatAnthropic(**anthropic_options)

In [6]:
from langchain_core.prompts import ChatPromptTemplate

In [7]:
# System prompt for knowledge-graph extraction. It is assembled from adjacent
# string literals, so every fragment must end with its own separator (a space
# or "\n"); otherwise neighbouring sentences or bullets are glued together.
system_prompt = (
    "# Knowledge Graph Instructions\n"
    "## 1. Overview\n"
    "You are a top-tier algorithm designed for extracting information in structured "
    "formats to build a knowledge graph.\n"
    "Try to capture as much information from the text as possible without "
    "sacrificing accuracy. Do not add any information that is not explicitly "
    "mentioned in the text.\n"
    "- **Nodes** represent entities and concepts.\n"
    "- The aim is to achieve simplicity and clarity in the knowledge graph, making it\n"
    "accessible for a vast audience.\n"
    "## 2. Labeling Nodes\n"
    "- **Consistency**: Ensure you use available types for node labels.\n"
    "Ensure you use basic or elementary types for node labels.\n"
    "- For example, when you identify an entity representing a person, "
    "always label it as **'person'**. Avoid using more specific terms "
    # Trailing newline so the "Node IDs" bullet starts on its own line.
    "like 'mathematician' or 'scientist'.\n"
    "- **Node IDs**: Never utilize integers as node IDs. Node IDs should be "
    "names or human-readable identifiers found in the text.\n"
    "- **Node Names**: Create a **name** property for each node it should be names, or human-readable identifiers found in the text.\n"
    "- **Relationships** represent connections between entities or concepts.\n"
    "Ensure consistency and generality in relationship types when constructing "
    "knowledge graphs. Instead of using specific and momentary types "
    "such as 'BECAME_PROFESSOR', use more general and timeless relationship types "
    "like 'PROFESSOR'. Make sure to use general and timeless relationship types!\n"
    "## 3. Coreference Resolution\n"
    "- **Maintain Entity Consistency**: When extracting entities, it's vital to "
    "ensure consistency.\n"
    'If an entity, such as "John Doe", is mentioned multiple times in the text '
    # Trailing space so the sentence does not read '"he"),always'.
    'but is referred to by different names or pronouns (e.g., "Joe", "he"), '
    "always use the most complete identifier for that entity throughout the "
    'knowledge graph. In this example, use "John Doe" as the entity ID.\n'
    "Remember, the knowledge graph should be coherent and easily understandable, "
    "so maintaining consistency in entity references is crucial.\n"
    "## 4. Strict Compliance\n"
    "Adhere to the rules strictly. Non-compliance will result in termination."
)
# Chat template for extraction: the system prompt defined above plus a human
# message that carries the text to process in the {input} placeholder.
custom_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            system_prompt,
        ),
        (
            "human",
            (
                # Adjacent literals are concatenated verbatim, so each sentence
                # ends with a space to avoid "node.Remember" / "fit.Use".
                "Tip: Make sure to answer in the correct format and do "
                "not include any explanations. "
                "Make sure to **strictly** add name property for each node. "
                "Remember that some information may be saved as properties of nodes and relationships, think it through add few items as properties as you see fit. "
                "Use the given format to extract information from the "
                "following input: {input}"
            ),
        ),
    ]
)

In [8]:
from langchain_experimental.graph_transformers import LLMGraphTransformer

In [9]:
# Graph transformer backed by the Anthropic chat model and the custom
# extraction prompt, with property extraction switched on for both nodes
# and relationships.
graph_maker = LLMGraphTransformer(
    llm=llm_anthropic,
    prompt=custom_prompt,
    strict_mode=True,
    node_properties=True,
    relationship_properties=True,
)

In [10]:
from langchain_community.document_loaders import PyPDFLoader

# Load the sample PDF; `docs` holds whatever documents the loader returns.
pdf_path = "../sample/mango.pdf"
loader = PyPDFLoader(pdf_path)
docs = loader.load()

In [11]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Split the loaded documents into chunks (chunk_size 4000, chunk_overlap 100).
splitter_options = {"chunk_size": 4000, "chunk_overlap": 100}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
text_chunks = text_splitter.split_documents(docs)

In [12]:
# Sanity check: how many chunks the splitter produced.
len(text_chunks)

7

In [18]:
# Run the LLM-backed graph transformer over every text chunk.
graph_documents = graph_maker.convert_to_graph_documents(text_chunks)

In [26]:
# Inspect the first extracted graph document.
# NOTE(review): a cell only renders its last expression, so the `.nodes` line
# below produces no visible output (the recorded output shows relationships
# only). Wrap it in display(...) or move it to its own cell if the node list
# is meant to be shown.
graph_documents[0].nodes
graph_documents[0].relationships

[Relationship(source=Node(id='Mango', type='Fruit'), target=Node(id='Mangifera Indica', type='Plant'), type='PRODUCED_BY'),
 Relationship(source=Node(id='Mangifera Indica', type='Plant'), target=Node(id='Mango_Fruit', type='Fruit'), type='PRODUCES'),
 Relationship(source=Node(id='Mangifera Indica', type='Plant'), target=Node(id='Mango_Leaf', type='Plant_part'), type='HAS_PART'),
 Relationship(source=Node(id='Mangifera Indica', type='Plant'), target=Node(id='Mango_Flower', type='Plant_part'), type='HAS_PART'),
 Relationship(source=Node(id='Bangladesh', type='Country'), target=Node(id='Mangifera Indica', type='Plant'), type='NATIONAL_TREE')]

In [13]:
from langchain_community.graphs import Neo4jGraph

In [14]:
# Neo4j connection used for writing and inspecting the graph. No URI or
# credentials are passed here; presumably they come from the NEO4J_*
# environment variables checked earlier (confirm).
# NOTE(review): 3600 is presumably a lifetime in seconds; confirm against the
# Neo4j driver docs.
driver_settings = {"max_connection_lifetime": 3600}
graph = Neo4jGraph(driver_config=driver_settings)

In [19]:
# Write the extracted graph documents into the connected Neo4j database.
graph.add_graph_documents(graph_documents)

In [15]:
# Re-read the database schema, then show its structured form as the cell output.
graph.refresh_schema()
graph.structured_schema

{'node_props': {'Fruit': [{'property': 'id', 'type': 'STRING'},
   {'property': 'name', 'type': 'STRING'},
   {'property': 'national_fruit_of', 'type': 'STRING'},
   {'property': 'scientific_name', 'type': 'STRING'},
   {'property': 'origin', 'type': 'STRING'},
   {'property': 'length_range', 'type': 'STRING'},
   {'property': 'weight_range', 'type': 'STRING'},
   {'property': 'ripening_time', 'type': 'STRING'}],
  'Plant': [{'property': 'id', 'type': 'STRING'},
   {'property': 'name', 'type': 'STRING'},
   {'property': 'crown_radius', 'type': 'STRING'},
   {'property': 'common_name', 'type': 'STRING'},
   {'property': 'lifespan', 'type': 'STRING'},
   {'property': 'height', 'type': 'STRING'},
   {'property': 'scientific_name', 'type': 'STRING'}],
  'Plant_part': [{'property': 'id', 'type': 'STRING'},
   {'property': 'name', 'type': 'STRING'},
   {'property': 'arrangement', 'type': 'STRING'},
   {'property': 'shape', 'type': 'STRING'},
   {'property': 'length', 'type': 'STRING'},
   {'

In [16]:
import neo4jupyter
# Initialise neo4jupyter's notebook mode before any graph is drawn.
neo4jupyter.init_notebook_mode()

<IPython.core.display.Javascript object>

In [17]:
def show_graph():
    """Draw every node currently stored in the Neo4j database with neo4jupyter.

    The function reloads ``.env``, connects through py2neo using the
    ``bolt+s://`` form of ``NEO4J_URI``, counts all nodes, and passes that
    count as the ``limit`` to ``neo4jupyter.draw``.

    ``NEO4J_URI`` is removed from the process environment while py2neo is
    imported and connected, and is always put back afterwards, even when the
    connection or the count query fails.

    Returns:
        Whatever ``neo4jupyter.draw`` returns for the connected graph.

    Raises:
        RuntimeError: If ``NEO4J_URI`` is not set after loading ``.env``.
    """
    load_dotenv()
    neo4j_uri = os.getenv("NEO4J_URI")
    if not neo4j_uri:
        # Fail with a clear message instead of an AttributeError on None.replace.
        raise RuntimeError("Environment variable NEO4J_URI is not set.")
    bolt_uri = neo4j_uri.replace("neo4j+s://", "bolt+s://")

    # NOTE(review): the variable is hidden presumably because py2neo picks up
    # NEO4J_URI from the environment and does not accept the neo4j+s:// scheme;
    # confirm against the py2neo version in use.
    os.environ.pop("NEO4J_URI", None)
    try:
        from py2neo import Graph

        vis_graph = Graph(
            bolt_uri,
            auth=(os.getenv("NEO4J_USERNAME"), os.getenv("NEO4J_PASSWORD")),
        )
        result = vis_graph.run("MATCH (n) RETURN COUNT(n) AS total_nodes").data()
        total_nodes = result[0]["total_nodes"]
    finally:
        # Restore the original URI on every path so a failed connection does
        # not leave the rest of the notebook without NEO4J_URI.
        os.environ["NEO4J_URI"] = neo4j_uri

    return neo4jupyter.draw(vis_graph, {}, limit=total_nodes)
    

In [18]:
# Render the graph inline as this cell's output.
show_graph()

In [19]:

# System prompt for entity disambiguation: values that refer to the same entity
# are expected back with the same integer.
# NOTE(review): this rebinds `system_prompt`, which earlier in the notebook holds
# the knowledge-graph extraction prompt. Re-running the cell that builds
# `custom_prompt` after this one would pick up this text instead.
system_prompt = """
Act as an entity disambiguation tool and tell me which values reference the same entity.
For example if I give you

Birds
Bird
Ant

You return to me

Birds, 1
Bird, 1
Ant, 2

As the Bird and Birds values have the same integer assigned to them, it means that they reference the same entity.
"""

# Two-message chat template for disambiguation: the current `system_prompt`
# as the system message, and a human message carrying the values in {input}.
disambiguate_messages = [
    ("system", system_prompt),
    ("human", "Perform disambiguation on the following values: \n{input}"),
]
disambiguate_prompt = ChatPromptTemplate.from_messages(disambiguate_messages)