In [42]:
%%capture captured_output
!pip install PyPDF2 openai lancedb ipywidgets

import os
import math
import json
import numpy as np
import PyPDF2
import lancedb
from ipywidgets import Text, Button, VBox, Output
from openai import OpenAI
import pyarrow as pa
import pandas as pd
import ipywidgets as widgets
from langchain.text_splitter import TokenTextSplitter



# Prefer the OPENAI_API_KEY environment variable so the secret does not have to
# be saved inside the notebook; the placeholder remains as a fallback for users
# who would rather paste their key here.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or "ENTER_YOUR_API_KEY"
client = OpenAI(api_key=OPENAI_API_KEY)

Extracts text from a PDF file by reading each page sequentially and combines it into a single string, then splits this text into overlapping chunks of specified size for easier processing or analysis.

Insert your own OpenAI key above.

In [43]:
# %% [code]
def extract_text_from_pdf(file_path):
    """Extract the text of every page of a PDF file.

    Args:
        file_path (str): Path of the PDF file to read.

    Returns:
        str: The text of all pages in page order, each page followed by a
        newline. A page that yields no text contributes only its newline.
    """
    with open(file_path, "rb") as pdf_stream:
        reader = PyPDF2.PdfReader(pdf_stream)
        # Guard against a page whose extraction yields nothing so that one
        # unreadable page cannot abort the whole document.
        page_texts = [(page.extract_text() or "") + "\n" for page in reader.pages]
    # Join once instead of growing a string page by page.
    return "".join(page_texts)

def chunk_text(text, chunk_size=4000, overlap=256):
    """
    Split text into overlapping chunks based on token count.

    Tokens are counted with the "cl100k_base" encoding (the encoding used by
    text-embedding-ada-002), so chunk sizes are expressed in the same units
    the embedding model consumes.

    Args:
        text (str): The input text to be split.
        chunk_size (int): Maximum number of tokens per chunk. Default is 4000.
        overlap (int): Number of tokens shared by consecutive chunks. Default is 256.

    Returns:
        List[str]: A list of text chunks; an empty list when `text` is empty.
    """
    # Nothing to split: skip building the tokenizer-backed splitter entirely.
    if not text:
        return []

    text_splitter = TokenTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
        encoding_name="cl100k_base"  # the encoding used by ada-002
    )
    return text_splitter.split_text(text)

This code processes multiple PDF files by extracting text from each, chunking the content, and organizing it into a structured list of dictionaries where each dictionary contains a unique identifier, source file, text chunk, and company code derived from the filename.

If you don't have the documents, you can get them from sec.gov: view the filing as HTML, print it, and save it as a PDF (for example, https://www.sec.gov/ix?doc=/Archives/edgar/data/0000080661/000008066124000007/pgr-20231231.htm). It is important to name the files exactly as they appear in the `pdf_files` list below.

In [44]:
# PDF filings to ingest; each file's company code is derived from its filename.
pdf_files = [
    "all.pdf",
    "chubb.pdf",
    "pgr.pdf",
    "trv.pdf",
]

# Collects one dict per text chunk with the keys: id, source, text, company.
documents = list()

def extract_company_from_filename(filename):
    """
    Derive the upper-case company code from a PDF filename.

    Examples:
        'all.pdf'         -> 'ALL'
        'chubb.pdf'       -> 'CHUBB'
        'PGR.PDF'         -> 'PGR'
        'filings/trv.pdf' -> 'TRV'

    Args:
        filename (str): File name or path of the PDF.

    Returns:
        str: The company code, i.e. the file's base name without its ".pdf"
        extension, upper-cased and passed through the alias mapping.
    """
    # Only the last path component names the company, so a directory prefix
    # never ends up inside the company code.
    base_name = os.path.basename(filename)
    stem, extension = os.path.splitext(base_name)
    # Strip just a trailing ".pdf" in any letter case; names with another
    # extension are kept whole, as before.
    if extension.lower() != ".pdf":
        stem = base_name
    company = stem.upper()

    # Map company codes (extend this if a filename needs a different code).
    company_mapping = {
        'ALL': 'ALL',
        'CHUBB': 'CHUBB',
        'PGR': 'PGR',
        'TRV': 'TRV'
    }

    return company_mapping.get(company, company)

# Extract, chunk and register every expected PDF that is present on disk.
for pdf_file in pdf_files:
    if not os.path.exists(pdf_file):
        # Report missing inputs so an incomplete corpus is noticed immediately
        # instead of surfacing later as missing answers for that company.
        print(f"Warning: '{pdf_file}' not found; skipping.")
        continue

    company = extract_company_from_filename(pdf_file)  # e.g., "ALL", "CHUBB", "PGR", "TRV"
    text = extract_text_from_pdf(pdf_file)
    chunks = chunk_text(text)
    # The id combines the company code with the chunk's position in its file.
    documents.extend(
        {
            "id": f"{company}_{idx}",
            "source": pdf_file,
            "text": chunk,
            "company": company,
        }
        for idx, chunk in enumerate(chunks)
    )

print(f"Total number of chunks from all PDFs: {len(documents)}")

Total number of chunks from all PDFs: 161


Next, we generate embeddings for each chunk and add them to our document records.

In [45]:
def get_embedding(texts, model="text-embedding-ada-002", batch_size=100):
    """
    Get embeddings for one or more texts using OpenAI's embeddings API.

    Args:
        texts (str | Iterable[str]): A single string or a collection of strings.
        model (str): Embedding model to use. Defaults to
            "text-embedding-ada-002", which returns 1536-dimensional embeddings.
        batch_size (int): Maximum number of texts sent per API request.
            Defaults to 100.

    Returns:
        A single embedding when `texts` is a string, otherwise a list with one
        embedding per input text, in input order (empty for empty input).

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    single_input = isinstance(texts, str)
    # Materialise the input so generators and other iterables can be measured
    # and sliced into batches.
    texts = [texts] if single_input else list(texts)

    # Loop-invariant: computed once instead of on every iteration.
    total_batches = math.ceil(len(texts) / batch_size)
    embeddings = []

    for batch_number, start in enumerate(range(0, len(texts), batch_size), start=1):
        batch = texts[start:start + batch_size]
        response = client.embeddings.create(
            input=batch,
            model=model
        )
        embeddings.extend(data.embedding for data in response.data)
        print(f"Processed batch {batch_number} of {total_batches}")

    # A single string in means a single embedding out.
    if single_input:
        return embeddings[0]
    return embeddings

# Embed every chunk (batched inside get_embedding) and attach each vector to
# its document record.
texts = [doc["text"] for doc in documents]
embeddings = get_embedding(texts)

# zip() stops at the shorter sequence, so a count mismatch would silently leave
# some documents without a "vector"; fail loudly instead.
if len(embeddings) != len(documents):
    raise RuntimeError(
        f"Expected {len(documents)} embeddings but received {len(embeddings)}."
    )

for doc, embedding in zip(documents, embeddings):
    doc["vector"] = embedding

print("Embeddings generated for all chunks.")

Processed batch 1 of 2
Processed batch 2 of 2
Embeddings generated for all chunks.


This code sets up a LanceDB database with a specific schema for storing document data, including text and vector embeddings, by first removing any existing table and then creating a new one populated with the previously processed PDF document chunks.

In [46]:
# Connect to your LanceDB database.
db = lancedb.connect("lancedb_insurance_competition")

# Define the PyArrow schema for our documents table.
schema = pa.schema([
    ("id", pa.string()),
    ("source", pa.string()),
    ("text", pa.string()),
    ("vector", pa.list_(pa.float32(), 1536)),  # fixed size dimension 1536 for the embedding
    ("company", pa.string()),
])

table_name = "documents"

# mode="overwrite" replaces a table left over from a previous run, so there is
# no need to drop it first and swallow whatever error the drop might raise
# (which would also hide real failures such as permission problems).
if documents:
    table = db.create_table(table_name, data=documents, schema=schema, mode="overwrite")
    print(f"Created table '{table_name}' with {len(documents)} rows.")
else:
    # Still create an (empty) table so that `table` is always defined for the
    # cells below instead of being missing or stale from an earlier run.
    table = db.create_table(table_name, schema=schema, mode="overwrite")
    print(f"No documents available; created empty table '{table_name}'.")
    


Created table 'documents' with 161 rows.


extract_companies_from_query(): Identifies which insurance companies are mentioned in the user's query by looking for specific name variations (like finding both "TRAVELERS" and "TRV" to refer to the same company), ensuring accurate document retrieval for exactly the companies the user is asking about.

search_lancedb(): Performs semantic search in the database by company name, with special handling for multi-company queries, returning the most relevant document chunks based on vector similarity.

generate_answer(): Creates responses using retrieved document chunks and OpenAI's API, specifically formatting the prompt to address each mentioned company separately in the answer.

answer_query(): The main orchestration function that ties everything together - it processes the query, finds relevant documents, and generates the final answer using the retrieved context.

In [40]:

def extract_companies_from_query(query):
    """
    Extract a list of canonical company identifiers from the query using
    case-insensitive whole-word matching.

    The mapping is as follows:
      - "ALL": ["ALL", "ALLSTATE"]
      - "CHUBB": ["CHUBB"]
      - "PGR": ["PGR", "PROGRESSIVE"]
      - "TRV": ["TRV", "TVR", "TRAVELERS", "TRAVELER"]

    Matching whole words (rather than substrings) keeps words such as
    "overall", "small" or "call" from selecting the "ALL" ticker. The
    standalone word "all" still matches "ALL", since it cannot be told apart
    from the ticker.

    Args:
        query (str): The user's question.

    Returns:
        List[str]: Canonical identifiers found, in the fixed order
        ALL, CHUBB, PGR, TRV. Empty when no company is mentioned.
    """
    company_variants = {
        "ALL": ["ALL", "ALLSTATE"],
        "CHUBB": ["CHUBB"],
        "PGR": ["PGR", "PROGRESSIVE"],
        "TRV": ["TRV", "TVR", "TRAVELERS", "TRAVELER"],
    }
    # Turn every non-alphanumeric character into a separator so punctuation
    # and possessives ("PGR's", "Chubb,") do not hide a company name.
    normalized = "".join(ch if ch.isalnum() else " " for ch in query.upper())
    words = set(normalized.split())
    # Iterating the mapping (not a set) keeps the result order deterministic.
    return [
        canonical
        for canonical, variants in company_variants.items()
        if any(variant in words for variant in variants)
    ]

def search_lancedb(query_embedding, query, k=10):
    """
    Retrieve the chunks most similar to the query embedding, filtered by company.

    Phrases such as "all companies" select every known company; otherwise the
    companies are taken from the query text. Each selected company is searched
    separately with a limit of `k`, and with no company the whole table is
    searched once.

    Args:
        query_embedding: Embedding vector of the query.
        query (str): The raw query text, used only to decide which companies to filter on.
        k (int): Maximum number of chunks per search.

    Returns:
        pandas.DataFrame: The retrieved rows; for several companies the
        per-company results are concatenated with a fresh index.
    """
    group_phrases = (
        "all companies",
        "all carriers",
        "all insurers",
        "all firms",
        "all organizations",
        "all providers",
    )

    lowered = query.lower()
    wants_every_company = False
    for phrase in group_phrases:
        if phrase in lowered:
            wants_every_company = True
            break

    if wants_every_company:
        companies = ["ALL", "CHUBB", "PGR", "TRV"]
        print("Group retrieval detected. Using companies:", companies)
    else:
        companies = extract_companies_from_query(query)
        print("Extracted companies:", companies)

    # No company detected: plain similarity search over the whole table.
    if not companies:
        print("No company filter applied.")
        return table.search(query_embedding).limit(k).to_pandas()

    def _top_chunks_for(company_code):
        # Pre-filtered search restricted to one company's chunks.
        print(f"Searching for {company_code}...")
        filtered = table.search(query_embedding).where(
            f"company = '{company_code}'", prefilter=True
        )
        frame = filtered.limit(k).to_pandas()
        print(f"Found {len(frame)} chunks for {company_code}")
        return frame

    if len(companies) == 1:
        return _top_chunks_for(companies[0])

    per_company = [_top_chunks_for(code) for code in companies]
    return pd.concat(per_company, ignore_index=True)

def generate_answer(query, retrieved_chunks, model="o3-mini"):
    """
    Generate an answer using the OpenAI chat completions API,
    providing the retrieved context as part of the prompt.

    The prompt instructs the assistant to address each company mentioned in the question separately.

    Args:
        query (str): The user's question.
        retrieved_chunks (List[str]): Context passages, joined with blank lines.
        model (str): Chat model to use. Defaults to "o3-mini".

    Returns:
        str: The model's answer with surrounding whitespace removed, or an
        empty string when the response carries no text content.
    """
    context_text = "\n\n".join(retrieved_chunks)
    prompt = f"""You are an insurance competitive intelligence assistant.

Below is context extracted from financial statements of multiple companies.
Please answer the following question by addressing each company mentioned in the question separately.
If context for a company is missing, state that the relevant information is not available.

Context:
{context_text}

Question:
{query}
"""
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
    )
    # The message content can be absent; treat that as an empty answer rather
    # than failing on None.strip().
    content = response.choices[0].message.content
    return (content or "").strip()

def answer_query(query, top_k=10):
    """
    Process the query by computing its embedding, searching for relevant document chunks
    (with filtering based on detected companies), and generating an answer using the retrieved chunks.

    Args:
        query (str): The user's question.
        top_k (int): Maximum number of chunks retrieved per search.

    Returns a tuple (answer, retrieved_chunks). When the query is blank or no
    context is found, the answer is an explanatory message and the chunk list is empty.
    """
    # A blank query cannot be embedded meaningfully; answer without any API call.
    if not query or not query.strip():
        return "Please enter a query.", []

    query_embedding = get_embedding(query)
    results_df = search_lancedb(query_embedding, query, k=top_k)
    # Without usable text there is no context to ground an answer on, so do
    # not ask the model at all.
    if results_df.empty or "text" not in results_df.columns:
        return "No relevant context found for your query.", []

    retrieved_chunks = results_df["text"].tolist()
    answer = generate_answer(query, retrieved_chunks)
    return answer, retrieved_chunks


This cell creates a basic UI that lets you try the pipeline in your local Jupyter notebook instance and verify that it works.

In [41]:
# Output pane for the echoed query, the answer and the retrieved chunks.
output_area = widgets.Output()

# Button the user presses to submit the query.
submit_button = widgets.Button(description="Submit Query")

# Free-text box holding the user's question.
query_input = widgets.Text(
    description='Query:',
    placeholder='Enter your query here, e.g., "What is PGR revenue?"',
    value='',
    layout={'width': '80%'},
)

def on_submit(b):
    """
    Button callback: answer the query currently typed into `query_input`.

    Everything is printed into `output_area`: the query, the response and, when
    available, the retrieved context chunks. Any exception raised while
    answering is reported there instead of propagating.

    Args:
        b: The value passed by the button's click handler (unused).
    """
    with output_area:
        # Start from a clean pane on every submission.
        output_area.clear_output()
        user_query = query_input.value
        print("Your query:", user_query)
        try:
            response_text, chunks = answer_query(user_query)
            print("Response:", response_text)
            if not chunks:
                return
            print("\nRetrieved Context Chunks:")
            for position, passage in enumerate(chunks, start=1):
                print(f"Chunk {position}:", passage)
        except Exception as exc:
            print("Error processing query:", exc)

# Run on_submit whenever the button is clicked.
submit_button.on_click(on_submit)

# Stack the input box, the button and the output pane vertically and render them.
display(
    widgets.VBox(children=[query_input, submit_button, output_area])
)

VBox(children=(Text(value='', description='Query:', layout=Layout(width='80%'), placeholder='Enter your query …