In [1]:
import json
import os
from pathlib import Path
from typing import List, Optional

from dotenv import load_dotenv
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI
from llama_parse import LlamaParse
from nest_asyncio import apply as nest_asyncio_apply

# Notebook-wide setup, written as plain statements. The previous form bound the
# results to `_`, a name IPython reserves for the last cell output; assigning it
# by hand stops IPython from updating it for the rest of the session.
nest_asyncio_apply()  # allow nested event loops inside the already-running kernel loop
load_dotenv()  # read variables from a local .env file into the process environment

# Root directory that holds the whitepaper files, grouped in sub-folders.
DATA = Path("data/whitepapers/")

In [3]:
# Parsing client. The API key comes from the environment (never hard-coded);
# parsed output is requested as markdown.
llama_parse_key = os.getenv("LLAMA_PARSE_API_KEY")
parser = LlamaParse(
    api_key=llama_parse_key,
    result_type="markdown",
    language="en",
    num_workers=5,
    verbose=True,
)

In [4]:
file_path = [str(DATA / "2024" / file) for file in os.listdir(DATA / "2024")]
documents = await parser.aload_data(file_path=file_path)

Started parsing the file under job_id 31814d4b-9509-400e-9ab7-ecb69b82880d
Started parsing the file under job_id 2e4574e1-bbbd-4b42-8331-bbe75ededbca
Started parsing the file under job_id 2a700e38-470c-4872-ad72-ce18ef39d127
Started parsing the file under job_id 7e85df9a-4981-4737-bc23-8550f41118c1
Started parsing the file under job_id a641004d-53ba-4634-9d27-0b6c06c5695a
Started parsing the file under job_id 74019ba2-3511-4b05-a5cd-e56a23d1e9a3
Started parsing the file under job_id 1c70181f-1b18-4150-ba24-f709b163265c
...

#### Extractor Example

In [15]:
# Character count of the first parsed document's text.
first_doc_text = documents[0].text
print(len(first_doc_text))

248997


In [38]:
# Load the service catalogue and build one comma-separated string of service
# names, which later cells embed in the schema description and system prompt.
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform instead of relying on the locale default.
with open("data/services_data_sc.json", "r", encoding="utf-8") as f:
    data = json.load(f)
services = ", ".join(service["service_name"] for service in data)

In [56]:
class KGRelationship(BaseModel):
    # An f-string in docstring position is an ordinary expression, not a
    # docstring: Python leaves __doc__ as None and the list of services never
    # reaches the schema description. Assigning __doc__ explicitly makes the
    # interpolated text the real class documentation.
    __doc__ = f"""Relationship between two of the following AWS services: {services}"""

    # Source side of the directed relationship.
    service_1: Optional[str] = Field(..., description="Service 1, that has a connection with Service 2")
    # Target side of the directed relationship.
    service_2: Optional[str] = Field(..., description="Service 2, that receives the connection from Service 1")
    # Sentence quoted from the input text that supports the relationship.
    evidence: Optional[str] = Field(..., description="Verbatim sentence of the text where the relationship was found")


class Data(BaseModel):
    """Extracted relationships between AWS Services"""

    # All relationships found in one chunk of text.
    relationships: List[KGRelationship] = Field(..., description="List of relationships between AWS Services")

In [57]:
# System instructions for the extraction model; the allowed service names are
# baked into the template text when this cell runs.
# NOTE(review): because `services` is interpolated before templating, a service
# name containing `{` or `}` would be read as a template variable -- confirm the
# catalogue has none.
system_message = (
    "You are an expert at identifying relationships between AWS Services. "
    f"Only extract directed connections between services. What relationships mean is that the two services are used often in an architecture. Extract nothing if no important information can be found in the text. You must only extract relationships between the following services: {services}"
)

# Two-message chat prompt: fixed system instructions, then the chunk to analyse.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_message),
        ("human", "{text}"),
    ]
)

In [58]:
# Chat model used for extraction; temperature 0 to minimise sampling randomness.
llm = ChatOpenAI(temperature=0, model="gpt-4-0125-preview")

In [59]:
# Bind the `Data` schema to the model via function calling, then pipe the
# prompt into it. With include_raw=False only the parsed result is returned.
structured_llm = llm.with_structured_output(
    schema=Data,
    method="function_calling",
    include_raw=False,
)
extractor = prompt | structured_llm

In [60]:
from langchain_text_splitters import TokenTextSplitter

# Token-based chunking: 2000-token chunks with a 20-token overlap between
# neighbouring chunks.
splitter_options = {"chunk_size": 2000, "chunk_overlap": 20}
text_splitter = TokenTextSplitter(**splitter_options)

# Split the first parsed document into chunks for the extractor.
first_document_text = documents[0].text
texts = text_splitter.split_text(first_document_text)

In [63]:
first_few = texts[3:6]

extractions = await extractor.abatch(
    [{"text": text} for text in first_few],
    {"max_concurrency": 5},
)

In [62]:
# Flatten the per-chunk results into a single list of relationships.
relationships = [
    relationship
    for extraction in extractions
    for relationship in extraction.relationships
]

# Preview at most the first 20 relationships.
relationships[:20]

[KGRelationship(service_1='AWS Control Tower', service_2='AWS Organizations', evidence='Define your AWS accounts strategy and leverage AWS Control Tower and AWS Organizations to build landing zones to provide ongoing account management, governance, and implementation of AWS best practices.'),
 KGRelationship(service_1='AWS Well-Architected Framework', service_2='AWS Migration Hub', evidence='Your migration planning is the key to a successful migration to AWS, and needs to cover many aspects. These include ensuring you have the right skills at the points when they are needed and the capacity required to meet your timeline, scope, and budget.')]

In [64]:
# Gather the relationships from every chunk-level extraction into one list.
# NOTE(review): this cell repeats the previous one; it looks like a re-run after
# the extraction cell was executed again -- consider keeping only one copy.
relationships = []
for chunk_result in extractions:
    relationships += chunk_result.relationships

# Preview at most the first 20 relationships.
relationships[:20]

[KGRelationship(service_1='AWS Well-Architected Framework', service_2='Amazon EC2', evidence='Throughout the migration, a wide range of decisions need to be made, such as the target Amazon Elastic Compute Cloud (Amazon EC2) instance type or what type of Amazon Elastic Block Store (Amazon EBS) to use.'),
 KGRelationship(service_1='AWS Well-Architected Framework', service_2='Amazon EBS', evidence='Throughout the migration, a wide range of decisions need to be made, such as the target Amazon Elastic Compute Cloud (Amazon EC2) instance type or what type of Amazon Elastic Block Store (Amazon EBS) to use.'),
 KGRelationship(service_1='AWS Professional Services', service_2='AWS Managed Services', evidence='Alternatively, you may decide to leverage AWS Managed Services to extend your team with operational capabilities, including monitoring, incident management, AWS Incident Detection and Response, security, patch, backup, and cost optimization for migrated workloads.'),
 KGRelationship(service