In [1]:
import os
import json
import pickle
from pathlib import Path
from typing import List, Optional

from dotenv import load_dotenv
from langchain_community.graphs import Neo4jGraph
from langchain_core.pydantic_v1 import BaseModel, Field
from nest_asyncio import apply as nest_asyncio_apply

# Allow nested asyncio event loops (Jupyter already runs one) and load the
# variables defined in a local .env file into the process environment.
# Written as plain statements instead of a tuple bound to `_`, so IPython's
# `_` (last output) variable is not shadowed.
nest_asyncio_apply()
load_dotenv()

# Directory holding the notebook's input files.
DATA = Path("data")

In [2]:
# Define the structure of the expected output
class AWSServicesRelationship(BaseModel):
    """Relationship between two AWS services. The relationship represents an interaction between two AWS services in an AWS architecture."""
    # Every field is typed Optional[str], so consumers should be prepared for
    # None values (the filtering cell below drops entries with unknown names).
    serviceA: Optional[str] = Field(description="Name of the first AWS service.")
    serviceB: Optional[str] = Field(description="Name of the second AWS service.")
    # The description asks for a single word; nothing in this class enforces it.
    relationship: Optional[str] = Field(description="Description of the relationship between the two services. This must be a single word.")
    evidence: Optional[str] = Field(description="Verbatim sentence of the text where the relationship is mentioned.")

class DataExtracted(BaseModel):
    """Collection of relationships between AWS services, each representing a connection between two AWS services in an AWS architecture."""
    # Required field: one AWSServicesRelationship per relationship found.
    relationships: List[AWSServicesRelationship] = Field(description="List of relationships between AWS services.")

In [3]:
# Load the previously extracted relationships.
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load relationships.pkl when it comes from a trusted run of this project.
with open(DATA / "relationships.pkl", "rb") as f:
    extracted_data = pickle.load(f)

# Load the service catalogue. The encoding is pinned explicitly because the
# service names contain non-ASCII characters (e.g. a no-break space) and the
# platform default encoding is not UTF-8 everywhere.
with open(DATA / "services_data_sc.json", encoding="utf-8") as f:
    services_raw = json.load(f)

# Plain list of service names, used to validate the extracted relationships.
services = [service["service_name"] for service in services_raw]

In [4]:
# Keep only the relationships that link two *different* services which both
# appear in the list of known service names.
known_services = set(services)  # O(1) lookups instead of rescanning the list

good_relationships = []

for relationships in extracted_data:
    for relationship in relationships.dict()["relationships"]:
        if not relationship:
            continue

        service_a = relationship["serviceA"]
        service_b = relationship["serviceB"]

        # A service linked to itself carries no information; this also drops
        # entries where both names are None.
        if service_a == service_b:
            continue

        # Discard relationships that mention a missing or unknown service name.
        if service_a in known_services and service_b in known_services:
            good_relationships.append(relationship)

len(good_relationships)

4471

#### Extra filter of good relationships (Bedrock)

In [5]:
# TODO: implement the extra Bedrock-based filter of good_relationships.

#### Creating knowledge graph

In [6]:
# Connection settings are read from the process environment.
neo4j_settings = {
    "url": os.environ.get("NEO4J_URL"),
    "username": os.environ.get("NEO4J_USERNAME"),
    "password": os.environ.get("NEO4J_PASSWORD"),
    "database": os.environ.get("NEO4J_DATABASE"),
}
kg = Neo4jGraph(**neo4j_settings)

# WARNING: destructive. This removes every node (and its relationships) from
# the target database so the graph is rebuilt from scratch below.
cypher = """
    MATCH (n)
    DETACH DELETE n
"""
kg.query(cypher)

[]

In [7]:
# Sanity check: count every node in the database, whatever its label.
# The recorded output right after the wipe is 0.
cypher = """
    MATCH (s) 
    RETURN count(s) AS number_of_services
"""
kg.query(cypher)

[{'number_of_services': 0}]

In [8]:
# Create one :Service node per entry of the services JSON.
# The values are sent as query parameters instead of being interpolated into
# the Cypher text, so names or descriptions containing quotes or backslashes
# can neither break the statement nor inject Cypher.
cypher = """
    CREATE (s:Service {
        name: $name,
        description: $description,
        categories: $categories
    })
"""

for service in services_raw:
    kg.query(
        cypher,
        params={
            "name": service["service_name"],
            "description": service["description"],
            "categories": service["service_categories"],
        },
    )

In [9]:
# Compare the number of services read from the JSON file with the number of
# nodes now stored in the graph. The query counts all nodes, not only those
# labelled :Service.
print(f"Expected number of nodes: {len(services_raw)}")
cypher = """
    MATCH (n) 
    RETURN count(n) AS number_of_services
"""
kg.query(cypher)

Expected number of nodes: 305


[{'number_of_services': 305}]

#### Testing queries

In [10]:
# Match services whose `categories` property equals the list
# ["Machine Learning"] exactly. Services that list additional categories are
# not returned by this form (compare with the IN-based query further down).
# The result is kept in `a` so the next cell can compare it with an
# equivalent query.
cypher = """
    MATCH (s:Service {categories: ["Machine Learning"]})
    RETURN s
"""
a = kg.query(cypher)
a

[{'s': {'name': 'Amazon A2I',
   'description': 'Easily implement human review of ML predictions',
   'categories': ['Machine Learning']}},
 {'s': {'name': 'Amazon Bedrock',
   'description': 'Access best-in-class foundation models to build generative AI applications',
   'categories': ['Machine Learning']}},
 {'s': {'name': 'Amazon CodeGuru',
   'description': 'Find your most expensive lines of code',
   'categories': ['Machine Learning']}},
 {'s': {'name': 'Amazon Comprehend',
   'description': 'Discover insights and relationships in text',
   'categories': ['Machine Learning']}},
 {'s': {'name': 'Amazon Comprehend Medical',
   'description': 'Detect and return useful information in unstructured clinical text',
   'categories': ['Machine Learning']}},
 {'s': {'name': 'Amazon DevOps\xa0Guru',
   'description': 'Improve application availability with ML-powered cloud operations',
   'categories': ['Machine Learning']}},
 {'s': {'name': 'Amazon Elastic Inference',
   'description': 'Deep

In [11]:
# Same exact-list comparison as the previous cell, written with a WHERE clause
# instead of an inline property map. Prints whether both forms returned the
# same rows (`a` comes from the previous cell).
cypher = """
    MATCH (s:Service)
    WHERE s.categories = ["Machine Learning"]
    RETURN s
"""
b = kg.query(cypher)
print(a == b)

True


In [12]:
# Membership test: return services that have "Machine Learning" among their
# categories, including services that belong to several categories.
# Only the first 5 rows are returned.
cypher = """
MATCH (s:Service)
WHERE "Machine Learning" IN s.categories
RETURN s.name AS service, s.description AS description, s.categories AS categories
LIMIT 5
"""
kg.query(cypher)

[{'service': 'Amazon A2I',
  'description': 'Easily implement human review of ML predictions',
  'categories': ['Machine Learning']},
 {'service': 'Amazon Bedrock',
  'description': 'Access best-in-class foundation models to build generative AI applications',
  'categories': ['Machine Learning']},
 {'service': 'Amazon CodeGuru',
  'description': 'Find your most expensive lines of code',
  'categories': ['Machine Learning']},
 {'service': 'Amazon CodeWhisperer',
  'description': 'Build applications faster with the ML-powered coding companion',
  'categories': ['Developer Tools', 'Machine Learning']},
 {'service': 'Amazon Comprehend',
  'description': 'Discover insights and relationships in text',
  'categories': ['Machine Learning']}]

#### Adding relationships to the graph

In [13]:
# Link every pair of services found in the filtered relationships.
# The service names are sent as query parameters instead of being interpolated
# into the Cypher text, so a name containing a quote cannot break the
# statement or inject Cypher.
# The MERGE pattern has no direction: an existing CAN_BE_CONNECTED relationship
# between the two nodes is reused whichever way it points, so repeated pairs
# do not create duplicates.
cypher = """
    MATCH
        (a:Service {name: $service_a}),
        (b:Service {name: $service_b})
    MERGE (a)-[:CAN_BE_CONNECTED]-(b)
"""

for relationship in good_relationships:
    kg.query(
        cypher,
        params={
            "service_a": relationship["serviceA"],
            "service_b": relationship["serviceB"],
        },
    )

In [16]:
# List the services reachable from 'Amazon Athena' through an outgoing
# CAN_BE_CONNECTED relationship. The WITH clause groups rows by (a, b); the
# collected `relationships` list itself is not used in the RETURN.
# NOTE(review): the relationships are merged without a direction when the
# graph is built, so which node ends up as the start node looks arbitrary.
# This directed pattern may therefore miss connections stored as
# (other)-[:CAN_BE_CONNECTED]->(Athena) -- confirm whether an undirected
# pattern is intended here.
cypher = """
    MATCH (a:Service {name: 'Amazon Athena'})-[r:CAN_BE_CONNECTED]->(b:Service)
    WITH a, b, COLLECT(r) AS relationships
    RETURN a.name AS source, b.name AS target
"""
kg.query(cypher)

[{'source': 'Amazon Athena', 'target': 'Amazon EMR'},
 {'source': 'Amazon Athena', 'target': 'Amazon Redshift'},
 {'source': 'Amazon Athena', 'target': 'Amazon S3'},
 {'source': 'Amazon Athena', 'target': 'AWS CloudTrail'},
 {'source': 'Amazon Athena', 'target': 'AWS Database Migration Service'},
 {'source': 'Amazon Athena', 'target': 'AWS Glue'},
 {'source': 'Amazon Athena', 'target': 'AWS KMS'},
 {'source': 'Amazon Athena', 'target': 'AWS Lambda'},
 {'source': 'Amazon Athena', 'target': 'AWS VPN'},
 {'source': 'Amazon Athena', 'target': 'Elastic Load Balancing'}]