<a href="https://colab.research.google.com/github/kushjaggi/AWS-EC2-Chatbot-KG/blob/main/AWS_EC2Chatbot_KG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing all the dependencies

*   langchain
*   neo4j
*   openai
*   unstructured[all-docs]
*   tiktoken





In [None]:
!pip install langchain neo4j openai unstructured[all-docs] tiktoken



# Connecting to the Neo4j Graph Database

In [None]:
from langchain.graphs import Neo4jGraph

# Connection settings are read from the environment so that credentials are
# never stored in the notebook. Set NEO4J_URI / NEO4J_USERNAME /
# NEO4J_PASSWORD before running; if no password is set, it is requested
# interactively (the prompt does not echo the input).
# NOTE: the password that was previously committed here must be considered
# leaked and should be rotated in the Neo4j console.
import os
from getpass import getpass

url = os.environ.get("NEO4J_URI", "neo4j+s://50ebd397.databases.neo4j.io")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")

# Shared graph handle used by the ingestion and query cells below.
graph = Neo4jGraph(
    url=url,
    username=username,
    password=password,
)

# Defining Property, Node and Relationship for the Knowledge Graph

In [None]:
from langchain.graphs.graph_document import (
    Node as BaseNode,
    Relationship as BaseRelationship,
    GraphDocument,
)
from langchain.schema import Document
from typing import List, Dict, Any, Optional
from langchain.pydantic_v1 import Field, BaseModel

# Key/value pair used to carry one property of a node or relationship.
# Both fields are required strings.
# NOTE(review): pydantic exposes the class docstring and the Field
# descriptions in the generated schema, which is presumably what the
# structured-output chain shows to the LLM — treat their wording as prompt
# text and change it deliberately.
class Property(BaseModel):
  """A single property consisting of key and value"""
  key: str = Field(..., description="key")
  value: str = Field(..., description="value")

# Extraction-side node: subclasses the LangChain base node and declares
# `properties` as an optional list of Property items (default None) instead
# of the dict that BaseNode is given in map_to_base_node.
# NOTE(review): a list of key/value pairs is presumably easier for the LLM
# to emit than a free-form mapping — confirm before changing the shape.
class Node(BaseNode):
    properties: Optional[List[Property]] = Field(
        None, description="List of node properties")

# Extraction-side relationship: subclasses the LangChain base relationship
# and declares `properties` as an optional list of Property items
# (default None); map_to_base_relationship turns it back into a dict.
class Relationship(BaseRelationship):
    properties: Optional[List[Property]] = Field(
        None, description="List of relationship properties"
    )

# Output schema handed to create_structured_output_chain in
# get_extraction_chain: the LLM response is parsed into this model.
# Both lists are required (`...`); `nodes` holds extraction-side Node
# objects and `rels` holds extraction-side Relationship objects.
class KnowledgeGraph(BaseModel):
    """Generate a knowledge graph with entities and relationships."""
    nodes: List[Node] = Field(
        ..., description="List of nodes in the knowledge graph")
    rels: List[Relationship] = Field(
        ..., description="List of relationships in the knowledge graph"
    )

# KG to Base format conversion utilities

In [None]:
def format_property_key(s: str) -> str:
    """Turn a whitespace-separated phrase into a camelCase property key.

    The first word is lower-cased and every following word is capitalised
    (first letter upper, rest lower); the words are then joined without
    separators. A string with no words (empty or whitespace only) is
    returned unchanged.
    """
    tokens = s.split()
    if tokens:
        lead, *rest = tokens
        return lead.lower() + "".join(piece.capitalize() for piece in rest)
    return s

def props_to_dict(props) -> dict:
    """Build a plain dict from an iterable of key/value property objects.

    Each item's ``key`` is normalised with ``format_property_key`` and mapped
    to the item's ``value``; when two items normalise to the same key the
    later one wins. A missing or empty ``props`` yields an empty dict.
    """
    if not props:
        return {}
    return {format_property_key(item.key): item.value for item in props}

def map_to_base_node(node: Node) -> BaseNode:
    """Convert an extracted ``Node`` into a LangChain ``BaseNode``.

    The property list is flattened into a dict, the node id is title-cased
    and the type is capitalised. The title-cased id is also stored under the
    ``name`` property (overwriting any extracted ``name``) so that generated
    Cypher statements can match on it.
    """
    attrs = {}
    if node.properties:
        attrs = props_to_dict(node.properties)
    display_id = node.id.title()
    # Add name property for better Cypher statement generation
    attrs["name"] = display_id
    return BaseNode(id=display_id, type=node.type.capitalize(), properties=attrs)


def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """Convert an extracted ``Relationship`` into a LangChain ``BaseRelationship``.

    Both endpoints go through ``map_to_base_node``; the relationship type is
    passed through unchanged and the property list is flattened into a dict
    (empty when there are no properties).
    """
    start_node = map_to_base_node(rel.source)
    end_node = map_to_base_node(rel.target)
    rel_props = {}
    if rel.properties:
        rel_props = props_to_dict(rel.properties)
    return BaseRelationship(
        source=start_node,
        target=end_node,
        type=rel.type,
        properties=rel_props,
    )

# Prompting the LLM for knowledge-graph extraction

In [None]:
import os
from langchain.chains.openai_functions import (
    create_openai_fn_chain,
    create_structured_output_chain,
)
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
# The OpenAI key must not be stored in the notebook: use the value already
# present in the environment, otherwise ask for it interactively (the prompt
# does not echo the input).
# NOTE: the key that was previously committed here must be considered leaked
# and should be revoked in the OpenAI dashboard.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Deterministic (temperature 0) chat model shared by the extraction chain.
llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)

def get_extraction_chain(
    allowed_nodes: Optional[List[str]] = None,
    allowed_rels: Optional[List[str]] = None
    ):
    """Build the structured-output chain that extracts a KnowledgeGraph.

    Args:
        allowed_nodes: Optional node labels; when given, they are listed in
            the system prompt as a comma-separated "Allowed Node Labels" line.
        allowed_rels: Optional relationship types; when given, they are
            listed in the system prompt as an "Allowed Relationship Types"
            line.

    Returns:
        The result of ``create_structured_output_chain`` for the
        ``KnowledgeGraph`` schema, the module-level ``llm`` and the prompt
        assembled here (``verbose=False``).
    """
    # The system message is an f-string, so the two optional "Allowed ..."
    # lines are rendered right here (and become empty lines when the argument
    # is None or empty). The `{input}` placeholder in the first human message
    # is a plain string and is left for the prompt template to fill at run time.
    prompt = ChatPromptTemplate.from_messages(
        [(
          "system",
          f"""# Knowledge Graph Instructions for GPT-3.5
## 1. Overview
You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.
- **Nodes** represent entities and concepts. They’re akin to AWS EC2 documentation nodes.
- The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.
## 2. Labeling Nodes
- **Consistency**: Ensure you use basic or elementary types for node labels.
  - For example, when you identify an entity representing a service, always label it as **"service"**. Avoid using more specific terms like "EC2 instance" or "VPC".
- **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.
{'- **Allowed Node Labels:**' + ", ".join(allowed_nodes) if allowed_nodes else ""}
{'- **Allowed Relationship Types**:' + ", ".join(allowed_rels) if allowed_rels else ""}
## 3. Handling Numerical Data and Dates
- Numerical data, like age or other related information, should be incorporated as attributes or properties of the respective nodes.
- **No Separate Nodes for Dates/Numbers**: Do not create separate nodes for dates or numerical values. Always attach them as attributes or properties of nodes.
- **Property Format**: Properties must be in a key-value format.
- **Quotation Marks**: Never use escaped single or double quotes within property values.
- **Naming Convention**: Use camelCase for property keys, e.g., `birthDate`.
## 4. Coreference Resolution
- **Maintain Entity Consistency**:When extracting entities, it’s vital to ensure consistency. If an entity, such as “Amazon EC2”, is mentioned multiple times in the text but is referred to by different names or abbreviations (e.g., “EC2”, “Elastic Compute Cloud”), always use the most complete identifier for that entity throughout the knowledge graph. In this example, use “Amazon EC2” as the entity ID. Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial.
## 5. Strict Compliance
Adhere to the rules strictly. Non-compliance will result in termination.
          """),
            ("human", "Use the given format to extract information from the following input: {input}"),
            ("human", "Tip: Make sure to answer in the correct format"),
        ])
    # KnowledgeGraph is the output schema the model's answer is parsed into.
    return create_structured_output_chain(KnowledgeGraph, llm, prompt, verbose=False)

# Graph Skeleton

In [None]:
def extract_and_store_graph(
    document: Document,
    nodes: Optional[List[str]] = None,
    rels: Optional[List[str]] = None,
) -> None:
    """Extract a knowledge graph from one document and write it to Neo4j.

    Args:
        document: Chunk whose ``page_content`` is sent to the extraction chain;
            it is also recorded as the source of the stored graph document.
        nodes: Optional allowed node labels forwarded to the extraction prompt.
        rels: Optional allowed relationship types forwarded to the prompt.
    """
    # Ask the LLM for a KnowledgeGraph describing this chunk.
    chain = get_extraction_chain(nodes, rels)
    extracted = chain.run(document.page_content)

    # Translate the extraction-side models into LangChain's base graph types.
    base_nodes = list(map(map_to_base_node, extracted.nodes))
    base_rels = list(map(map_to_base_relationship, extracted.rels))

    # Persist through the module-level Neo4j handle.
    graph.add_graph_documents(
        [GraphDocument(nodes=base_nodes, relationships=base_rels, source=document)]
    )

## Evaluation
To test the pipeline, we will extract information from the AWS EC2 documentation and construct a knowledge graph from it. We load the AWS EC2 PDF and split it into chunks using the document-loader and text-splitting modules provided by LangChain.

In [None]:
from langchain.text_splitter import TokenTextSplitter
from langchain.document_loaders import UnstructuredPDFLoader

# Load the EC2 PDF from the Colab filesystem
raw_documents = UnstructuredPDFLoader("/content/ec2-ug-11-30.pdf").load()

# Define chunking strategy: token-based chunks of size 2048 with an overlap of 24
text_splitter = TokenTextSplitter(chunk_size=2048, chunk_overlap=24)

# Only split the first three raw documents (all of them if the loader returned fewer)
documents = text_splitter.split_documents(raw_documents[:3])

In [None]:
from tqdm import tqdm

# Extract and store a graph for every chunk, with a progress bar.
# The loop index was never used, and tqdm takes the total from
# len(documents) when it is given the list directly.
for d in tqdm(documents):
    extract_and_store_graph(d)

100%|██████████| 5/5 [07:45<00:00, 93.19s/it]


# Reset Graph - Delete previous Nodes and Relationships

In [None]:
# Wipe the database: match every node and delete it together with all of
# its relationships (DETACH DELETE). This is irreversible.
graph.query("MATCH (n) DETACH DELETE n")

[]

# Specify which node labels should be extracted by the LLM

In [None]:
# Node labels the LLM is told it may use; passed to the extraction prompt as
# the "Allowed Node Labels" line.
allowed_nodes = [
    "How to Get Started with Amazon EC2",
    "Related Services",
    "Accessing Amazon EC2",
    "Features of Amazon EC2",
    "Instances and AMIs",
    "Storage for Your Instance",
    "Instances",
    "Stopping, Starting, and Terminating Instances",
    "AMIs",
    "Amazon Machine Image (AMI)",
    "Security Best Practices",
    "Regions",
    "Availability Zones",
    "Describing Your Regions and Availability Zones",
    "Specifying the Region for a Resource",
]

# Re-run the ingestion with the label restriction. The loop index was never
# used, and tqdm takes the total from len(documents) when given the list.
for d in tqdm(documents):
    extract_and_store_graph(d, allowed_nodes)

100%|██████████| 5/5 [03:33<00:00, 42.79s/it]


# Query the knowledge graph in a RAG application

In [None]:
# Query the knowledge graph in a RAG application
from langchain.chains import GraphCypherQAChain

# Re-read the schema from the database before building the chain
# (presumably so it reflects the nodes and relationships ingested above).
graph.refresh_schema()

# Question-answering chain over the graph: one model (16k-context variant)
# is used to generate Cypher, a second one to phrase the final answer.
# verbose=True prints the generated Cypher and the retrieved context.
cypher_chain = GraphCypherQAChain.from_llm(
    graph=graph,
    cypher_llm=ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k"),
    qa_llm=ChatOpenAI(temperature=0, model="gpt-3.5-turbo"),
    validate_cypher=True, # Validate relationship directions
    verbose=True
)

In [None]:
# Ask a sample question through the graph QA chain.
# NOTE(review): in the recorded run the generated Cypher returned an empty
# context ([]), so the printed answer was not grounded in the graph data.
cypher_chain.run("How to specify the region for a resource using the console?")



> Entering new GraphCypherQAChain chain...
Generated Cypher:
MATCH (s:Service {name: "Resource"})-[:HAS]->(c:Concept {name: "Region"})
RETURN c.name
Full Context:
[]

> Finished chain.


'To specify the region for a resource using the console, you can follow these steps:\n\n1. Open the console for the cloud service provider you are using.\n2. Navigate to the resource you want to specify the region for.\n3. Look for the region settings or configuration options for the resource.\n4. Select the desired region from the available options.\n5. Save or apply the changes to ensure that the resource is deployed or located in the specified region.\n\nBy following these steps, you can easily specify the region for a resource using the console.'