#### Libraries

In [1]:
import asyncio
import json
import pickle
import random
from pathlib import Path
from typing import List, Optional

from dotenv import load_dotenv
from langchain.cache import SQLiteCache
from langchain.globals import set_llm_cache
from langchain.prompts import ChatPromptTemplate
from langchain.schema import Document
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai.chat_models import ChatOpenAI
from langchain_text_splitters import TokenTextSplitter
from nest_asyncio import apply as nest_asyncio_apply

_ = nest_asyncio_apply(), load_dotenv()
set_llm_cache(SQLiteCache(database_path=".langchain.db"))
DATA = Path("data")
random.seed(1399)

#### Define Extractor LLM

In [2]:
# GPT-4 Turbo
llm = ChatOpenAI(
    model="gpt-4-turbo-2024-04-09",
    max_tokens=512,
    temperature=0.0,
    streaming=True,
) 

In [3]:
# Inference test
async for tokens in llm.astream("Hey king, how is it going?"):
    print(tokens.content, end="", flush=True)

Hello! I'm here to help you. What's on your mind today?

#### Load parsed documents

In [4]:
# Load parsed pdfs (since these were loaded as LlamaIndex Documents, the library is needed to convert them to LC Documents)
with open(DATA / "parsed_docs.pkl", "rb") as f:
    documents = pickle.load(f)

# Convert parsed pdfs to LC documents
documents = [Document(page_content=doc.text) for doc in documents]
print(len(documents))

284


#### Split documents -> brute force approach

In [5]:
text_splitter = TokenTextSplitter(
    encoding_name="cl100k_base",
    chunk_size=2000,
    chunk_overlap=20,
)
splitted_documents = text_splitter.split_documents(documents=documents)
len(splitted_documents)

2437

#### Load list of AWS services

In [6]:
# Load parsed pdfs
with open(DATA / "services_data_sc.json", "r") as f:
    services = json.load(f)
    services = [service["service_name"] for service in services]
    services = ", ".join(services)
services[:100]

'Amazon A2I, Amazon API Gateway, Amazon AppFlow, Amazon AppStream 2.0, Amazon Athena, Amazon Aurora, '

#### Define extractor prompt

In [7]:
# Define the structure of the expected output
class AWSServicesRelationship(BaseModel):
    """Relantionship between two AWS services. The relationship represents a interaction between two AWS services in an AWS architecture."""
    serviceA: Optional[str] = Field(description="Name of the first AWS service.")
    serviceB: Optional[str] = Field(description="Name of the second AWS service.")
    relationship: Optional[str] = Field(description="Description of the relationship between the two services. This must be a single word.")
    evidence: Optional[str] = Field(description="Verbatim sentence of the text where the relationship is mentioned.")

class DataExtracted(BaseModel):
    """Relantionship between two AWS services. The relationship represents a connection between two AWS services in an AWS architecture."""
    relationships: List[AWSServicesRelationship] = Field(description="List of relationships between AWS services.")

In [8]:
# System prompt
system_prompt = f"""
As an expert in analyzing cloud architectures, your task is to identify and extract relationships between AWS services from the provided text. A 'relationship' is defined as any instance where one AWS service directly interacts with, supports, or is configured to use another AWS service.

Please follow these steps:
1. Read the text carefully to identify any sentences that describe a relationship between the listed AWS services.
2. Extract the names of the two AWS services involved in the relationship.
3. In one word only, capture the type of relationship between the two services.
4. Capture the sentence or fragment that best describes the relationship as evidence.
5. Focus solely on extracting relationships between the following AWS services:
<services> {services} </services>
"""

In [9]:
# Define prompt and extractor runnable
messages = [
    ("system", system_prompt),
    ("user", "{input}")
]
prompt = ChatPromptTemplate.from_messages(messages=messages)
extractor = prompt | llm.with_structured_output(
    schema=DataExtracted,
    method="function_calling",
    include_raw=False
)

  warn_beta(


#### Testing a sample & measuring cost

In [10]:
result = extractor.invoke({"input": splitted_documents[1000].page_content})
print(result.json(indent=4))

{
    "relationships": [
        {
            "serviceA": "AWS Organizations",
            "serviceB": "Amazon EC2",
            "relationship": "governance",
            "evidence": "customers should use a dedicated AWS Account that is governed by AWS Organizations for running the SWIFT secure zone in a production environment."
        },
        {
            "serviceA": "AWS Config",
            "serviceB": "AWS CloudTrail",
            "relationship": "monitoring",
            "evidence": "you can monitor AWS CloudTrail for any unintended changes."
        },
        {
            "serviceA": "AWS CloudFormation",
            "serviceB": "AWS Config",
            "relationship": "integration",
            "evidence": "you can use AWS Config to help you detect drifts and alterations that have happened in the SWIFT secure zone, and you can monitor AWS CloudTrail for any unintended changes."
        },
        {
            "serviceA": "AWS Control Tower",
            "serviceB": "Am

In [11]:
# GPT 4 Turbo new prices
input_ppt = 0.000001
output_ppt = 0.000003
input_tokens = 4000
output_tokens = 500
cost = (input_ppt * input_tokens) + (output_ppt * output_tokens)
print(f"Total expected cost (high approximation): {cost * len(splitted_documents):.2f} USD")

Total expected cost (high approximation): 13.40 USD


In [12]:
# Token limits
total_tokens_per_request = input_tokens + output_tokens
print(f"Expected number of total tokens: {total_tokens_per_request * len(splitted_documents):,}")

Expected number of total tokens: 10,966,500


#### Batch inference

In [16]:
# Aux function to get batches
def gen_batches(iterable, n):
    l = len(iterable)
    for ndx in range(0, l, n):
        yield iterable[ndx:min(ndx + n, l)]

In [18]:
to_batch = [{"input": chunk.page_content} for chunk in splitted_documents]
len(to_batch) == len(splitted_documents)

True

In [15]:
# batch_size = 100
# total = len(to_batch) // batch_size + 1
# relationships = []

# for idx, batch in enumerate(gen_batches(iterable=to_batch, n=batch_size), start=1):

#     print(f"Batch {idx}/{total} processing...")
#     _tmp = await extractor.abatch(inputs=batch, return_exceptions=True)
#     relationships.append(_tmp)
#     print(f"Batch {idx}/{total} done! Sleeping...")

#     await asyncio.sleep(62)
#     print("-" * 50)

#### Manual approach

In [19]:
batch_size = 100
total = len(to_batch) // batch_size + 1
batches = list(gen_batches(iterable=to_batch, n=batch_size))
len(batches)

25

In [29]:
relationships = []

In [31]:
relationships.extend(await extractor.abatch(inputs=batches[0], return_exceptions=False))

In [34]:
relationships.extend(await extractor.abatch(inputs=batches[1], return_exceptions=False))

In [36]:
relationships.extend(await extractor.abatch(inputs=batches[2], return_exceptions=False))

In [37]:
relationships.extend(await extractor.abatch(inputs=batches[3], return_exceptions=False))

In [46]:
relationships.extend(await extractor.abatch(inputs=batches[4], return_exceptions=False))

In [47]:
relationships.extend(await extractor.abatch(inputs=batches[5], return_exceptions=False))

In [53]:
relationships.extend(await extractor.abatch(inputs=batches[6], return_exceptions=False))

In [54]:
relationships.extend(await extractor.abatch(inputs=batches[7], return_exceptions=False))

In [56]:
relationships.extend(await extractor.abatch(inputs=batches[8], return_exceptions=False))

In [60]:
relationships.extend(await extractor.abatch(inputs=batches[9], return_exceptions=False))

In [62]:
relationships.extend(await extractor.abatch(inputs=batches[10], return_exceptions=False))

In [64]:
relationships.extend(await extractor.abatch(inputs=batches[11], return_exceptions=False))

In [67]:
relationships.extend(await extractor.abatch(inputs=batches[12], return_exceptions=False))

In [68]:
relationships.extend(await extractor.abatch(inputs=batches[13], return_exceptions=False))

In [69]:
relationships.extend(await extractor.abatch(inputs=batches[14], return_exceptions=False))

In [71]:
relationships.extend(await extractor.abatch(inputs=batches[15], return_exceptions=False))

In [72]:
relationships.extend(await extractor.abatch(inputs=batches[16], return_exceptions=False))

In [75]:
relationships.extend(await extractor.abatch(inputs=batches[17], return_exceptions=False))

In [76]:
relationships.extend(await extractor.abatch(inputs=batches[18], return_exceptions=False))

In [83]:
relationships.extend(await extractor.abatch(inputs=batches[19], return_exceptions=False))

In [89]:
relationships.extend(await extractor.abatch(inputs=batches[20], return_exceptions=False))

In [91]:
with open(DATA / "relationships.pkl", "wb") as f:
    pickle.dump(relationships, f)

In [None]:
# relationships.extend(await extractor.abatch(inputs=batches[21], return_exceptions=False))

In [None]:
# relationships.extend(await extractor.abatch(inputs=batches[22], return_exceptions=False))

In [None]:
# relationships.extend(await extractor.abatch(inputs=batches[23], return_exceptions=False))

In [None]:
# relationships.extend(await extractor.abatch(inputs=batches[24], return_exceptions=False))