This notebook contains a modified version of LangChain's Diffbot Graph Transformer.
For the original, unmodified implementation, please refer to the official documentation:
https://api.python.langchain.com/en/latest/graph_transformers/langchain_experimental.graph_transformers.diffbot.DiffbotGraphTransformer.html

https://api.python.langchain.com/en/latest/_modules/langchain_experimental/graph_transformers/diffbot.html#

# Customize "GraphDocument" for DiffbotTransformer library

Original source:
https://api.python.langchain.com/en/latest/_modules/langchain_community/graphs/graph_document.html#

In [None]:
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
from langchain_core.load.serializable import Serializable
from langchain_core.pydantic_v1 import Field

class Node(Serializable):
    """Represents a node in a graph with associated properties.

    Only ``id`` is required when constructing a node; ``type`` and
    ``properties`` fall back to the defaults declared on the class.

    Attributes:
        id (Union[str, int]): A unique identifier for the node.
        type (str): The type or label of the node, default is "Node".
        properties (dict): Additional properties and metadata associated with
            the node, default is an empty dict.
    """

    # Required field: no default is declared for the identifier.
    id: Union[str, int]
    type: str = "Node"
    # default_factory builds a new dict per instance instead of sharing one.
    properties: dict = Field(default_factory=dict)

class Relationship(Serializable):
    """Represents a directed relationship between two nodes in a graph.

    ``source``, ``target`` and ``type`` are required; ``properties`` defaults
    to an empty dict.

    Attributes:
        source (Node): The source node of the relationship.
        target (Node): The target node of the relationship.
        type (str): The type of the relationship.
        properties (dict): Additional properties associated with the relationship.
    """

    # The direction of the relationship runs from `source` to `target`.
    source: Node
    target: Node
    type: str
    # default_factory builds a new dict per instance instead of sharing one.
    properties: dict = Field(default_factory=dict)

class GraphDocument(Serializable):
    """Represents a graph document consisting of nodes and relationships.

    Both fields are required (no defaults are declared), so an empty graph
    has to be built explicitly with ``nodes=[]`` and ``relationships=[]``.

    Attributes:
        nodes (List[Node]): A list of nodes in the graph.
        relationships (List[Relationship]): A list of relationships in the graph.
    """

    # NOTE(review): presumably the customization relative to the linked upstream
    # class is that no source-document field is kept here - confirm against the
    # original implementation.
    nodes: List[Node]
    relationships: List[Relationship]

import requests
from langchain.schema import Document
from langchain.utils import get_from_env
from langchain_community.graphs.graph_document import Node, Relationship

# Customize "process_response" under class DiffbotGraphTransformer

Original source: https://api.python.langchain.com/en/latest/graph_transformers/langchain_experimental.graph_transformers.diffbot.DiffbotGraphTransformer.html

In [None]:
def format_property_key(s: str) -> str:
    """Turn a whitespace-separated phrase into a lowerCamelCase key.

    The first word is lower-cased, every following word is capitalized
    (first letter upper-case, remainder lower-case) and the words are joined
    without separators.

    Args:
        s (str): The phrase to convert, e.g. ``"Job title"``.

    Returns:
        str: The camel-cased key, e.g. ``"jobTitle"``. A string with no words
            (empty or whitespace only) is returned unchanged.
    """
    tokens = s.split()
    if len(tokens) == 0:
        # Nothing to format: hand the input back untouched
        return s

    pieces = []
    for position, token in enumerate(tokens):
        pieces.append(token.lower() if position == 0 else token.capitalize())
    return "".join(pieces)

class NodesList:
    """
    Manages a deduplicated collection of nodes with associated properties.

    Attributes:
        nodes (Dict[Tuple, Any]): Stores nodes as keys and their properties as values.
            Each key is a tuple where the first element is the
            node ID and the second is the node type.
    """

    def __init__(self) -> None:
        """Start with no registered nodes."""
        self.nodes: Dict[Tuple[Union[str, int], str], Any] = dict()

    def add_node_property(
        self, node: Tuple[Union[str, int], str], properties: Dict[str, Any]
    ) -> None:
        """
        Adds or updates node properties.

        If the node does not exist in the list, it's added along with its properties.
        If the node already exists, its properties are updated with the new values.

        The given ``properties`` mapping is copied, never stored: later updates
        to the node therefore do not modify the dictionary the caller passed in.

        Args:
            node (Tuple): A tuple containing the node ID and node type.
            properties (Dict): A dictionary of properties to add or update for the node.
        """
        # setdefault starts every new node from a fresh dict, so the caller's
        # mapping is merged into it instead of being aliased.
        self.nodes.setdefault(node, {}).update(properties)

    def return_node_list(self) -> List[Node]:
        """
        Returns the nodes as a list of Node objects.

        Each Node object will have its ID, type, and properties populated.

        Returns:
            List[Node]: A list of Node objects, in insertion order of the nodes.
        """
        return [
            Node(id=key[0], type=key[1], properties=props)
            for key, props in self.nodes.items()
        ]



# Properties that should be treated as node properties instead of relationships.
# DiffbotGraphTransformer.process_response compares these entries against the
# first type name of a fact's value after str.capitalize() has been applied,
# so every entry must be spelled with only its first letter in upper case
# (e.g. "Job title", not "Job Title").
FACT_TO_PROPERTY_TYPE = [
    "Date",
    "Number",
    "Job title",
    "Cause of death",
    "Organization type",
    "Academic title",
]


# Pairs of (original relationship type, simplified relationship type).
# SimplifiedSchema loads these pairs into a lookup dict; types that are not
# listed here are left unchanged by SimplifiedSchema.get_type.
# The left-hand names are matched against relationship types that have been
# upper-cased with spaces replaced by underscores.
schema_mapping = [
    ("HEADQUARTERS", "ORGANIZATION_LOCATIONS"),
    ("RESIDENCE", "PERSON_LOCATION"),
    ("ALL_PERSON_LOCATIONS", "PERSON_LOCATION"),
    ("CHILD", "HAS_CHILD"),
    ("PARENT", "HAS_PARENT"),
    ("CUSTOMERS", "HAS_CUSTOMER"),
    ("SKILLED_AT", "INTERESTED_IN"),
]


class SimplifiedSchema:
    """
    Lookup helper that maps original schema types to their simplified form.

    Attributes:
        schema (Dict): Maps an original type name to its simplified type name.
    """

    def __init__(self) -> None:
        """Builds the lookup dictionary from the module-level ``schema_mapping``."""
        self.schema = {row[0]: row[1] for row in schema_mapping}

    def get_type(self, type: str) -> str:
        """
        Looks up the simplified schema type for an original type.

        Args:
            type (str): The original schema type.

        Returns:
            str: The mapped simplified type, or ``type`` itself when no
                 mapping is registered for it.
        """
        return self.schema.get(type, type)


class DiffbotGraphTransformer:
    """Transforms documents into graph documents using Diffbot's NLP API.

    A graph document transformation system takes a sequence of Documents and returns a
    sequence of Graph Documents.
    """

    def __init__(
        self,
        diffbot_api_key: Optional[str] = None,
        fact_confidence_threshold: float = 0.7,
        include_qualifiers: bool = True,
        include_evidence: bool = True,
        simplified_schema: bool = True,
        request_timeout: Optional[float] = None,
    ) -> None:
        """
        Initialize the graph transformer with various options.

        Args:
            diffbot_api_key (str):
               The API key for Diffbot's NLP services. When omitted, it is
               read from the ``DIFFBOT_API_KEY`` environment variable.
            fact_confidence_threshold (float):
                Minimum confidence level for facts to be included.
            include_qualifiers (bool):
                Whether to include qualifiers in the relationships.
            include_evidence (bool):
                Whether to include evidence for the relationships.
            simplified_schema (bool):
                Whether to use a simplified schema for relationships.
            request_timeout (Optional[float]):
                Timeout in seconds for the HTTP request to Diffbot. The
                default ``None`` keeps the previous behaviour of waiting
                indefinitely.
        """
        self.diffbot_api_key = diffbot_api_key or get_from_env(
            "diffbot_api_key", "DIFFBOT_API_KEY"
        )
        self.fact_threshold_confidence = fact_confidence_threshold
        self.include_qualifiers = include_qualifiers
        self.include_evidence = include_evidence
        self.request_timeout = request_timeout
        # None means "keep the relationship types exactly as derived from Diffbot"
        self.simplified_schema = SimplifiedSchema() if simplified_schema else None

    def nlp_request(self, text: str) -> Dict[str, Any]:
        """
        Make an API request to the Diffbot NLP endpoint.

        Args:
            text (str): The text to be processed.

        Returns:
            Dict[str, Any]: The JSON response from the API.
        """

        # Relationship extraction only works for English
        payload = {
            "content": text,
            "lang": "en",
        }

        FIELDS = "facts"
        HOST = "nl.diffbot.com"
        url = (
            f"https://{HOST}/v1/?fields={FIELDS}&"
            f"token={self.diffbot_api_key}&language=en"
        )
        result = requests.post(url, data=payload, timeout=self.request_timeout)
        return result.json()

    def process_response(self, payload: Dict[str, Any]) -> GraphDocument:
        """
        Transform a Diffbot NLP response into a graph document.

        Facts below the confidence threshold and facts whose value has no
        type are skipped. Facts whose value type is listed in
        ``FACT_TO_PROPERTY_TYPE`` become properties of the source node; all
        other facts become relationships between a source and a target node.

        Args:
            payload (Dict[str, Any]): The JSON response from Diffbot's NLP API.

        Returns:
            GraphDocument: The transformed document as a graph.
        """

        # Return empty result if there are no facts
        if "facts" not in payload or not payload["facts"]:
            return GraphDocument(nodes=[], relationships=[])

        # Nodes are a custom class because we need to deduplicate
        nodes_list = NodesList()
        # Relationships are a list because we don't deduplicate nor anything else
        relationships = []
        for record in payload["facts"]:
            # Skip if the fact is below the threshold confidence
            if record["confidence"] < self.fact_threshold_confidence:
                continue

            # TODO: It should probably be treated as a node property
            if not record["value"]["allTypes"]:
                continue

            # Define source node: prefer the first URI as id, fall back to the name
            source_id = (
                record["entity"]["allUris"][0]
                if record["entity"]["allUris"]
                else record["entity"]["name"]
            )
            source_label = record["entity"]["allTypes"][0]["name"].capitalize()
            source_name = record["entity"]["name"]
            source_node = Node(id=source_id, type=source_label)
            nodes_list.add_node_property(
                (source_id, source_label), {"name": source_name}
            )

            # Define target node: same id rule as for the source
            target_id = (
                record["value"]["allUris"][0]
                if record["value"]["allUris"]
                else record["value"]["name"]
            )
            target_label = record["value"]["allTypes"][0]["name"].capitalize()
            target_name = record["value"]["name"]

            # Some facts are better suited as node properties
            if target_label in FACT_TO_PROPERTY_TYPE:
                nodes_list.add_node_property(
                    (source_id, source_label),
                    {format_property_key(record["property"]["name"]): target_name},
                )
                continue

            # Otherwise the fact becomes a relationship to a target node
            target_node = Node(id=target_id, type=target_label)
            nodes_list.add_node_property(
                (target_id, target_label), {"name": target_name}
            )

            # Define relationship type
            rel_type = record["property"]["name"].replace(" ", "_").upper()
            if self.simplified_schema:
                rel_type = self.simplified_schema.get_type(rel_type)

            # Relationship qualifiers/properties
            rel_properties: Dict[str, Any] = {}
            if self.include_evidence:
                # Only the first passage is kept; a fact without any evidence
                # simply gets no "evidence" property instead of failing.
                evidence = record.get("evidence") or []
                if evidence:
                    rel_properties["evidence"] = evidence[0]["passage"]
            if self.include_qualifiers and record.get("qualifiers"):
                for qualifier in record["qualifiers"]:
                    prop_key = format_property_key(qualifier["property"]["name"])
                    rel_properties[prop_key] = qualifier["value"]["name"]

            relationships.append(
                Relationship(
                    source=source_node,
                    target=target_node,
                    type=rel_type,
                    properties=rel_properties,
                )
            )

        return GraphDocument(
            nodes=nodes_list.return_node_list(),
            relationships=relationships,
        )

    # Not being used in this project
    def convert_to_graph_documents(
        self, documents: Sequence[Document]
    ) -> List[GraphDocument]:
        """Convert a sequence of documents into graph documents.

        Each document's ``page_content`` is sent to Diffbot and the response
        is transformed with ``process_response``.

        Args:
            documents (Sequence[Document]): The original documents.

        Returns:
            List[GraphDocument]: The transformed documents as graphs, in the
                same order as the input documents.
        """
        results = []
        for document in documents:
            raw_results = self.nlp_request(document.page_content)
            # process_response only takes the payload; the customized
            # GraphDocument does not keep a reference to the source document.
            graph_document = self.process_response(raw_results)
            results.append(graph_document)
        return results

# Modify "add_graph_documents" -> "add_graph_from_txt" under neo4j_graph

Original source: https://api.python.langchain.com/en/latest/_modules/langchain_community/graphs/neo4j_graph.html#

In [None]:
from typing import Any, Dict, List, Optional

from langchain_core.utils import get_from_env

from langchain_community.graphs.graph_store import GraphStore

# The three Cypher statements below read schema metadata through the APOC
# procedure apoc.meta.data(). Each returns its rows under the alias `output`,
# which Neo4jGraph.refresh_schema reads back.

# Per node label: the list of its non-relationship properties and their types.
node_properties_query = """
CALL apoc.meta.data()
YIELD label, other, elementType, type, property
WHERE NOT type = "RELATIONSHIP" AND elementType = "node"
WITH label AS nodeLabels, collect({property:property, type:type}) AS properties
RETURN {labels: nodeLabels, properties: properties} AS output

"""

# Per relationship type: the list of its properties and their types.
rel_properties_query = """
CALL apoc.meta.data()
YIELD label, other, elementType, type, property
WHERE NOT type = "RELATIONSHIP" AND elementType = "relationship"
WITH label AS nodeLabels, collect({property:property, type:type}) AS properties
RETURN {type: nodeLabels, properties: properties} AS output
"""

# One row per (start label, relationship type, end label) combination.
rel_query = """
CALL apoc.meta.data()
YIELD label, other, elementType, type, property
WHERE type = "RELATIONSHIP" AND elementType = "node"
UNWIND other AS other_node
RETURN {start: label, type: property, end: toString(other_node)} AS output
"""


def value_sanitize(d: Dict[str, Any]) -> Dict[str, Any]:
    """
    Return a cleaned copy of ``d`` without embedding-like list values.

    Lists holding 128 or more elements are dropped together with their key,
    since such values are mostly irrelevant for generating answers in a LLM
    context and would only add noise and cost. Nested dictionaries, and
    dictionaries found directly inside a kept list, are cleaned recursively;
    every other value is carried over unchanged. The input is not modified.

    Args:
        d (Dict[str, Any]): The dictionary to sanitize.

    Returns:
        Dict[str, Any]: A new dictionary with the oversized lists removed.
    """
    max_items = 128
    sanitized = {}
    for name, entry in d.items():
        if isinstance(entry, dict):
            # Nested mapping: clean it with the same rules
            sanitized[name] = value_sanitize(entry)
            continue
        if not isinstance(entry, list):
            # Scalars and other objects pass through untouched
            sanitized[name] = entry
            continue
        if len(entry) >= max_items:
            # Too long to be useful: omit the key entirely
            continue
        sanitized[name] = [
            value_sanitize(member) if isinstance(member, dict) else member
            for member in entry
        ]
    return sanitized

class Neo4jGraph(GraphStore):
    """Provides a connection to a Neo4j database for various graph operations.
    Parameters:
    url (Optional[str]): The URL of the Neo4j database server.
            Falls back to the ``NEO4J_URI`` environment variable when omitted.
    username (Optional[str]): The username for database authentication.
            Falls back to ``NEO4J_USERNAME`` when omitted.
    password (Optional[str]): The password for database authentication.
            Falls back to ``NEO4J_PASSWORD`` when omitted.
    database (str): The name of the database to connect to. Default is 'neo4j'.
            A non-empty ``NEO4J_DATABASE`` environment variable is used instead.
    timeout (Optional[float]): The timeout for transactions in seconds.
            Useful for terminating long-running queries.
            By default, there is no timeout set.
    sanitize (bool): A flag to indicate whether to remove lists with
            128 or more elements from results. Useful for removing
            embedding-like properties from database responses. Default is False.

    *Security note*: Make sure that the database connection uses credentials
        that are narrowly-scoped to only include necessary permissions.
        Failure to do so may result in data corruption or loss, since the calling
        code may attempt commands that would result in deletion, mutation
        of data if appropriately prompted or reading sensitive data if such
        data is present in the database.
        The best way to guard against such negative outcomes is to (as appropriate)
        limit the permissions granted to the credentials used with this tool.

        See https://python.langchain.com/docs/security for more information.
    """

    def __init__(
        self,
        url: Optional[str] = None,
        username: Optional[str] = None,
        password: Optional[str] = None,
        database: str = "neo4j",
        timeout: Optional[float] = None,
        sanitize: bool = False,
    ) -> None:
        """Create a new Neo4j graph wrapper instance.

        Raises:
            ValueError: If the neo4j package is missing, a required connection
                setting is neither passed nor present in the environment, the
                server cannot be reached, authentication fails, or the APOC
                procedures needed for the schema are unavailable.
        """
        try:
            import neo4j
        except ImportError as e:
            raise ValueError(
                "Could not import neo4j python package. "
                "Please install it with `pip install neo4j`."
            ) from e

        # Explicitly passed connection settings take precedence; the
        # environment is only consulted for settings that were not given.
        url = url or get_from_env("url", "NEO4J_URI")
        username = username or get_from_env("username", "NEO4J_USERNAME")
        password = password or get_from_env("password", "NEO4J_PASSWORD")
        # `database` always has a value (default "neo4j"), so an explicit
        # argument cannot be told apart from the default; the environment
        # variable keeps overriding it as before.
        database = get_from_env("database", "NEO4J_DATABASE", database)

        self._driver = neo4j.GraphDatabase.driver(url, auth=(username, password))
        self._database = database
        self.timeout = timeout
        self.sanitize = sanitize
        self.schema: str = ""
        self.structured_schema: Dict[str, Any] = {}
        # Verify connection; release the driver if the instance cannot be built
        try:
            self._driver.verify_connectivity()
        except neo4j.exceptions.ServiceUnavailable as e:
            self._driver.close()
            raise ValueError(
                "Could not connect to Neo4j database. "
                "Please ensure that the url is correct"
            ) from e
        except neo4j.exceptions.AuthError as e:
            self._driver.close()
            raise ValueError(
                "Could not connect to Neo4j database. "
                "Please ensure that the username and password are correct"
            ) from e
        # Set schema
        try:
            self.refresh_schema()
        except neo4j.exceptions.ClientError as e:
            self._driver.close()
            raise ValueError(
                "Could not use APOC procedures. "
                "Please ensure the APOC plugin is installed in Neo4j and that "
                "'apoc.meta.data()' is allowed in Neo4j configuration "
            ) from e

    @property
    def get_schema(self) -> str:
        """Returns the schema of the Graph"""
        return self.schema

    @property
    def get_structured_schema(self) -> Dict[str, Any]:
        """Returns the structured schema of the Graph"""
        return self.structured_schema

    def query(self, query: str, params: Optional[dict] = None) -> List[Dict[str, Any]]:
        """Query Neo4j database.

        Args:
            query (str): The Cypher statement to run.
            params (Optional[dict]): Parameters for the statement. Omitting it
                is the same as passing an empty dict.

        Returns:
            List[Dict[str, Any]]: One dictionary per returned record, passed
                through ``value_sanitize`` when ``self.sanitize`` is set.

        Raises:
            ValueError: If Neo4j rejects the statement with a syntax error.
        """
        from neo4j import Query
        from neo4j.exceptions import CypherSyntaxError

        with self._driver.session(database=self._database) as session:
            try:
                data = session.run(
                    Query(text=query, timeout=self.timeout), params or {}
                )
                json_data = [r.data() for r in data]
                if self.sanitize:
                    json_data = [value_sanitize(el) for el in json_data]
                return json_data
            except CypherSyntaxError as e:
                raise ValueError(
                    f"Generated Cypher Statement is not valid\n{e}"
                ) from e

    def refresh_schema(self) -> None:
        """
        Refreshes the Neo4j graph schema information.

        Runs the three module-level schema queries and stores the result both
        as a dictionary (``structured_schema``) and as text (``schema``).
        """
        node_properties = [el["output"] for el in self.query(node_properties_query)]
        rel_properties = [el["output"] for el in self.query(rel_properties_query)]
        relationships = [el["output"] for el in self.query(rel_query)]

        self.structured_schema = {
            "node_props": {el["labels"]: el["properties"] for el in node_properties},
            "rel_props": {el["type"]: el["properties"] for el in rel_properties},
            "relationships": relationships,
        }

        # Format node properties
        formatted_node_props = []
        for el in node_properties:
            props_str = ", ".join(
                [f"{prop['property']}: {prop['type']}" for prop in el["properties"]]
            )
            formatted_node_props.append(f"{el['labels']} {{{props_str}}}")

        # Format relationship properties
        formatted_rel_props = []
        for el in rel_properties:
            props_str = ", ".join(
                [f"{prop['property']}: {prop['type']}" for prop in el["properties"]]
            )
            formatted_rel_props.append(f"{el['type']} {{{props_str}}}")

        # Format relationships
        formatted_rels = [
            f"(:{el['start']})-[:{el['type']}]->(:{el['end']})" for el in relationships
        ]

        self.schema = "\n".join(
            [
                "Node properties are the following:",
                ",".join(formatted_node_props),
                "Relationship properties are the following:",
                ",".join(formatted_rel_props),
                "The relationships are the following:",
                ",".join(formatted_rels),
            ]
        )

    def add_graph_from_txt(
        self, graph_document: GraphDocument
    ) -> None:
        """
        Take a GraphDocument as input and use it to construct a graph.

        Nodes are merged on their type (used as label) and ``id``;
        relationships are merged between the matching source and target
        nodes. Relationship types are upper-cased with spaces replaced by
        underscores before being written.

        Args:
            graph_document (GraphDocument): The nodes and relationships to
                write to the database.
        """
        # Import nodes; only the fields the statement reads are sent
        self.query(
            (
                "UNWIND $data AS row "
                "CALL apoc.merge.node([row.type], {id: row.id}, "
                "row.properties, {}) YIELD node "
                "RETURN distinct 'done' AS result"
            ),
            {
                "data": [
                    {
                        "id": el.id,
                        "type": el.type,
                        "properties": el.properties,
                    }
                    for el in graph_document.nodes
                ],
            },
        )
        # Import relationships
        self.query(
            "UNWIND $data AS row "
            "CALL apoc.merge.node([row.source_label], {id: row.source},"
            "{}, {}) YIELD node as source "
            "CALL apoc.merge.node([row.target_label], {id: row.target},"
            "{}, {}) YIELD node as target "
            "CALL apoc.merge.relationship(source, row.type, "
            "{}, row.properties, target) YIELD rel "
            "RETURN distinct 'done'",
            {
                "data": [
                    {
                        "source": el.source.id,
                        "source_label": el.source.type,
                        "target": el.target.id,
                        "target_label": el.target.type,
                        "type": el.type.replace(" ", "_").upper(),
                        "properties": el.properties,
                    }
                    for el in graph_document.relationships
                ]
            },
        )