In [13]:
import os
import shutil
import tempfile
import pandas as pd
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph
from reportlab.lib.styles import getSampleStyleSheet
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_postgres import PGVector
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOpenAI

In [15]:
def create_mock_pdf_files(directory):
    """Creates 5 mock PDF files based on aircraft claims data using pandas and reportlab."""
    if not os.path.exists(directory):
        os.makedirs(directory)
    print(f"Created directory: {directory}")

    # 1. Create DataFrame using pandas
    data = {
        'Claim_ID': ['C2024-101', 'C2024-102', 'C2024-103', 'C2024-104', 'C2024-105'],
        'Aircraft_Model': ['Boeing 737', 'Airbus A320', 'Cessna 172', 'Embraer 175', 'DHC-8 Q400'],
        'Damage_Cause': ['Hard Landing', 'FOD Ingestion', 'Hail Damage', 'Bird Strike', 'Tail Strike'],
        'Claim_Value': ['$150,000', '$4,500,000', '$25,000', '$350,000', '$900,000'],
        'Description': [
            'Minor flap damage during a hard landing in strong crosswinds. Total claim value: $150,000.',
            'Engine compressor stall due to foreign object debris (FOD) ingestion on the runway. Claim amount: $4,500,000.',
            'Hail damage to the wing surface during an unexpected storm. Simple repair. Claim value: $25,000.',
            'Damage to the nose cone and radome after hitting a flock of birds on final approach. Claim amount: $350,000.',
            'Minor structural damage to the fuselage near the rear pressure bulkhead during landing. Claim amount: $900,000.'
        ]
    }
    df = pd.DataFrame(data)

    styles = getSampleStyleSheet()

    # 2. Iterate and create a PDF for each claim
    for index, row in df.iterrows():
        filename = f"{directory}/{row['Claim_ID']}.pdf"
        doc = SimpleDocTemplate(filename, pagesize=letter)

        story = []
        # Title and basic info
        story.append(Paragraph(f"<b>AIRCRAFT CLAIMS REPORT: {row['Claim_ID']}</b>", styles['Title']))
        story.append(Paragraph("<br/>", styles['Normal']))
        story.append(Paragraph(f"<b>Aircraft Model:</b> {row['Aircraft_Model']}", styles['Normal']))
        story.append(Paragraph(f"<b>Primary Cause:</b> {row['Damage_Cause']}", styles['Normal']))
        story.append(Paragraph(f"<b>Claim Value:</b> {row['Claim_Value']}", styles['Normal']))
        story.append(Paragraph("<br/>", styles['Normal']))
        # Detailed Description
        story.append(Paragraph("<b>Damage Description:</b>", styles['Normal']))
        story.append(Paragraph(row['Description'], styles['Normal']))

        doc.build(story)

    print(f"✅ Created {len(df)} structured PDF claim files in {directory}.")
    # Return a list of created files
    return [f"{directory}/{cid}.pdf" for cid in df['Claim_ID']]

In [16]:
PDF_DIR = "./aircraft_claims_pdfs"
create_mock_pdf_files(PDF_DIR)

Created directory: ./aircraft_claims_pdfs
✅ Created 5 structured PDF claim files in ./aircraft_claims_pdfs.


['./aircraft_claims_pdfs/C2024-101.pdf',
 './aircraft_claims_pdfs/C2024-102.pdf',
 './aircraft_claims_pdfs/C2024-103.pdf',
 './aircraft_claims_pdfs/C2024-104.pdf',
 './aircraft_claims_pdfs/C2024-105.pdf']

In [17]:
# set the working directory for the project
%cd /home/vcap/app/cf-jupyterlab-workshop

/home/vcap/app/cf-jupyterlab-workshop


In [18]:
## Ingestion pipeline to load data
import os
import json
import pandas as pd
import requests
import httpx
from sqlalchemy import create_engine, text
from langchain.docstore.document import Document
from langchain_postgres.vectorstores import PGVector
from cfenv import AppEnv
import sys, os
import warnings
from tanzu_utils import CFGenAIService
warnings.filterwarnings('ignore')
# -----------------------------
# Load services from env
# -----------------------------
env = AppEnv()

# -----------------------------
# Embedding service details
# -----------------------------
embedding_service = CFGenAIService("tanzu-nomic-embed-text")

# List available models
embedding_models = embedding_service.list_models()
for m in embedding_models:
    print(f"- {m['name']} (capabilities: {', '.join(m['capabilities'])})")


api_base = embedding_service.api_base + "/openai/v1"
api_key = embedding_service.api_key
model_name = embedding_models[0]["name"]

print("Embedding model:", model_name)

- nomic-embed-text-v1025 (capabilities: EMBEDDING)
Embedding model: nomic-embed-text-v1025


In [19]:
# -----------------------------
# Database connection
# -----------------------------
db_service = env.get_service(name="vector-db")
db_credentials = db_service.credentials
db_uri = db_credentials["uri"]

print("DB URI:", db_uri)

engine = create_engine(db_uri)

# Test DB connection
with engine.connect() as conn:
    version = conn.execute(text("SELECT version();")).fetchone()
    print("Connected to:", version[0])

DB URI: postgresql://pgadmin:629PVy514m0w8rc3jq7Y@q-s0.postgres-instance.kdc01-dvs-lab-mgt-net-82.service-instance-465d60d4-e494-49a5-aace-022e92fbdc1c.bosh:5432/postgres
Connected to: PostgreSQL 16.6 (VMware Postgres 16.6.0) on x86_64-pc-linux-gnu, compiled by gcc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0, 64-bit


In [21]:
PDF_DIR = "/home/vcap/app/cf-jupyterlab-workshop/RAGPDF/aircraft_claims_pdfs"
loader = DirectoryLoader(
            path=PDF_DIR,
            glob="**/*.pdf", 
            loader_cls=PyPDFLoader,
            show_progress=True
        )
documents = loader.load()
print(f"Loaded {len(documents)} document pages/chunks.")

 83%|████████▎ | 5/6 [00:00<00:00,  8.63it/s]

Loaded 5 document pages/chunks.





In [22]:
# 3.2 Split Documents
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)
splits = text_splitter.split_documents(documents)
print(f"Split documents into {len(splits)} text chunks.")

Split documents into 5 text chunks.


In [24]:
# -----------------------------
# Embedding function (REST call)
# -----------------------------
url = api_base + "/embeddings"
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
COLLECTION_NAME = "aircraft_claims_collection"
def embed_text(text: str):
    payload = {"model": model_name, "input": text}
    resp = requests.post(url, headers=headers, json=payload, verify=False)
    resp.raise_for_status()
    return resp.json()["data"][0]["embedding"]

class CustomEmbeddings:
    def embed_documents(self, texts): return [embed_text(t) for t in texts]
    def embed_query(self, text): return embed_text(text)

embedding = CustomEmbeddings()

# -----------------------------
# PGVector setup
# -----------------------------
vectorstore = PGVector(
    embeddings=embedding,
    connection=db_uri,
    collection_name=COLLECTION_NAME,
    use_jsonb=True,
    create_extension=True,       # will create pgvector extension if not exists
    pre_delete_collection=True,  # clears old data on restart
)


vectorstore.add_documents(splits)
query = text("SELECT * FROM langchain_pg_collection LIMIT 10;")
print(pd.read_sql(query, engine))

query2 = text("SELECT * FROM langchain_pg_embedding LIMIT 10;")
print(pd.read_sql(query2, engine))
print("Ingestion complete. Data stored in PostgreSQL.")


Collection not found


                         name cmetadata                                  uuid
0     my_documents_collection      None  c202026b-4755-4e63-b4c0-f9856fdcfd01
1               aircraft_docs      None  e1e375c2-4585-4b26-9e52-39fbde99407c
2       finsmart-transactions      None  4336d726-23f9-41e0-b828-e0cb64ca01c5
3    maintenance_and_taxonomy      None  5088ed6d-678b-4f75-b2e6-84dbc31df561
4  aircraft_claims_collection      None  b6e63f4a-65f0-43ea-99b2-b97d1399bfc6
                                     id                         collection_id  \
0  04d1a97d-17d2-4c9e-98b6-af9fb772d7ef  4336d726-23f9-41e0-b828-e0cb64ca01c5   
1  69b105d6-d6f5-4044-be00-4e079a7acf8d  4336d726-23f9-41e0-b828-e0cb64ca01c5   
2  de890765-3f45-469e-aafa-b708891d1cd8  4336d726-23f9-41e0-b828-e0cb64ca01c5   
3  f218d4ff-4d02-44f7-a164-0a9673f8fa06  4336d726-23f9-41e0-b828-e0cb64ca01c5   
4  3e2084a9-2576-4a3c-9a68-7da42545eaf6  4336d726-23f9-41e0-b828-e0cb64ca01c5   
5  9ea8589d-28a9-4d1a-b165-a4a7287a6720  4336d

In [26]:
import os
import requests
import json
import httpx
from openai import OpenAI
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from datetime import date
import warnings
import ssl
from langchain_community.embeddings import OllamaEmbeddings
from openai import OpenAI
from langchain_classic.chains import RetrievalQA

# Optional: configure custom http client
httpx_client = httpx.Client(http2=True, verify=False, timeout=30.0)
# -----------------------------
# cat service details
# -----------------------------
chat_service = CFGenAIService("tanzu-gpt-oss-120b")

# List available models
chat_models = chat_service.list_models()
for m in chat_models:
    print(f"- {m['name']} (capabilities: {', '.join(m['capabilities'])})")


chat_api_base = chat_service.api_base + "/openai/v1"
chat_api_key = chat_service.api_key
chat_model_name = chat_models[0]["name"]

print("chat model:", model_name + api_base)

# Initialize LLM with credentials from cfenv
llm = ChatOpenAI(
    temperature=0.9,
    model=chat_model_name,
    base_url=chat_api_base,
    api_key=chat_api_key,
    http_client=httpx_client
)

- openai/gpt-oss-120b (capabilities: CHAT, TOOLS)
chat model: nomic-embed-text-v1025https://genai-proxy.sys.tas-ndc.kuhn-labs.com/tanzu-nomic-embed-text-v1025-4201d1d/openai/v1


In [29]:
# Create a retriever from your vectorstore
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k":3})

# Build a RetrievalQA chain
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever
)

# Ask a question
query = "What was the primary cause of damage and the value for the claim involving the Airbus A320?"
result = qa.run(query)
print(result)

  result = qa.run(query)


**Primary Cause of Damage:** FOD (Foreign Object Debris) ingestion  
**Claim Value:** $4,500,000


In [30]:
# Ask a question
query = "What is the total sum of all claim values in the database?"
result = qa.run(query)
print(result)

The total sum of all claim values is **$1,400,000**.


In [31]:
# Ask a question
query = "Summarize all hail damage incidents."
result = qa.run(query)
print(result)

**Hail Damage Incident Summary**

| Claim ID | Aircraft Model | Primary Cause | Claim Value | Damage Description |
|----------|----------------|---------------|------------|--------------------|
| C2024-103 | Cessna 172 | Hail Damage | $25,000 | Hail struck the wing surface during an unexpected storm, requiring a simple repair. |

*No other hail‑related claims are present in the provided data.*
