In [66]:
# Dependencies - uncomment and run once in a fresh environment.
# %pip install langchain langchain-community openai faiss-cpu tiktoken
# %pip install pdfplumber pdfminer.six

# Extract Text from PDF

In [67]:
from pdfminer.high_level import extract_text

# Source PDF, resolved relative to the notebook's working directory.
pdf_path = "document.pdf"

# pdfminer's high-level helper returns the extracted document text as one string.
pdf_text = extract_text(pdf_path)

# Persist the extracted text so later cells can reload it without re-parsing the PDF.
with open("clean_document_text.txt", mode="w", encoding="utf-8") as text_file:
    text_file.write(pdf_text)


In [68]:
#print(pdf_text[:1000])  # Show first 1000 characters

In [69]:
# Sanity check: number of characters extracted from the PDF (displayed as the cell output).
len(pdf_text)

28975

# Chunk Text

In [70]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Reload the extracted text from disk so this cell does not depend on re-parsing the PDF.
with open("clean_document_text.txt", mode="r", encoding="utf-8") as text_file:
    pdf_text = text_file.read()

# Split with chunk_size=500 and chunk_overlap=100 so that neighbouring chunks
# share some text at their boundaries.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
)
chunks = text_splitter.split_text(pdf_text)

# Write every chunk to one file, separated by a "---" rule, for manual inspection.
with open("document_chunks.txt", mode="w", encoding="utf-8") as chunk_file:
    chunk_file.write("\n\n---\n\n".join(chunks))

In [71]:
# Report how many chunks the splitter produced and show one of them as a spot check.
print(f"Total chunks created: {len(chunks)}")
if chunks:
    # Prefer the third chunk (index 2); fall back to the last available chunk so
    # that a short document producing fewer than three chunks does not raise IndexError.
    print("\nSample Chunk:\n", chunks[min(2, len(chunks) - 1)])
else:
    print("\nNo chunks were produced - check the extracted text.")

Total chunks created: 77

Sample Chunk:
 customer in the year 2030. His digital personal assistant orders him an

autonomous vehicle for a meeting across town. Upon hopping into the arriving car,

Scott decides he wants to drive today and moves the car into “active” mode. Scott’s

personal assistant maps out a potential route and shares it with his mobility insurer,

which immediately responds with an alternate route that has a much lower likelihood

of accidents and auto damage as well as the calculated adjustment to his monthly


# Embeddings

In [None]:
#YOUR_API_KEY


In [None]:
import getpass
import os

# Never store a real key in the notebook. Reuse OPENAI_API_KEY when the
# environment already provides it (the previous version overwrote it with a
# placeholder); otherwise ask for it interactively. getpass does not echo the
# typed value, so the key does not end up in the saved cell output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

In [74]:
from openai import OpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

# Directory that will hold the serialized FAISS index.
faiss_index_path = "faiss_index"

# Embedding model wrapper, constructed with its defaults
# (presumably picks up OPENAI_API_KEY from the environment - confirm in the LangChain docs).
embeddings = OpenAIEmbeddings()

# Embed every chunk and build the FAISS vector store from the results.
vector_store = FAISS.from_texts(
    chunks,
    embedding=embeddings,
)

# Persist the index so it can be reloaded without re-embedding the document.
vector_store.save_local(faiss_index_path)

print("FAISS index saved successfully!")

FAISS index saved successfully!


# Query FAISS and use GPT-4o to answer questions

In [75]:
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings

# Reload the FAISS index that was written to disk by the embedding step.
faiss_index_path = "faiss_index"
embeddings = OpenAIEmbeddings()

# NOTE(review): allow_dangerous_deserialization=True opts in to loading pickled
# data (see the LangChain FAISS docs). That is acceptable for an index this
# notebook created itself; do not point this at an index from an untrusted source.
vector_store = FAISS.load_local(
    faiss_index_path,
    embeddings,
    allow_dangerous_deserialization=True,
)

# Function to retrieve relevant chunks from FAISS
def retrieve_context(query, k=30):
    """
    Retrieve the chunks most similar to ``query`` and join them into one context string.

    Args:
        query: Text used for the similarity search against the vector store.
        k: Number of chunks to request from the vector store (default 30).

    Returns:
        A single string in which every retrieved chunk is flattened onto one
        line (each run of whitespace collapsed to a single space, no leading or
        trailing whitespace) and chunks are separated by a "---" rule with a
        blank line on either side. An empty string if nothing is retrieved.
    """
    docs = vector_store.similarity_search(query, k=k)

    # str.split() with no argument splits on any run of whitespace, so joining
    # the pieces with one space removes the hard line breaks in the extracted
    # PDF text without leaving the double spaces that replace("\n", " ")
    # produced for blank-line-separated text.
    flattened_chunks = (" ".join(doc.page_content.split()) for doc in docs)

    return "\n\n---\n\n".join(flattened_chunks)



In [76]:
from openai import OpenAI
import os

# Initialize the OpenAI client with the key stored in the OPENAI_API_KEY
# environment variable; the lookup raises KeyError if the variable is not set.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

def ask_document(query):
    """
    Answer a question about the document using FAISS retrieval plus the gpt-4o chat model.

    The retrieved context and the question are placed in a single user message
    that instructs the model to answer only from the supplied context.

    Args:
        query: The user's question; it is also passed to ``retrieve_context``
            as the retrieval query.

    Returns:
        The model's answer with surrounding whitespace removed, or an empty
        string when the response message carries no text content.
    """
    context = retrieve_context(query)

    prompt = f"""
    You are an AI assistant that answers questions based only on the given document.
    If the answer is not found in the document, say "Sorry, no relevant information found."

    Ensure your response is formatted correctly, using full sentences. 
    DO NOT return text one letter per line.

    Context:
    {context}

    Question: {query}

    Answer:"""

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}]
    )

    # message.content is optional in the OpenAI SDK (it can be None, e.g. for a
    # refusal), so guard before calling .strip() instead of raising AttributeError.
    answer = response.choices[0].message.content
    return (answer or "").strip()

# Test Cases

In [77]:
# End-to-end check (retrieval + model answer): ask who wrote the document.
# Despite its name, `retrieved_chunks` holds the model's answer text.
query = "Who is the author of the document?"
retrieved_chunks = ask_document(query)
print(retrieved_chunks)

The authors of the document are Ramnath Balasubramanian, Ari Libarikian, and Doug McElhaney.


In [78]:
# End-to-end check (retrieval + model answer): ask how many authors there are.
# Despite its name, `retrieved_chunks` holds the model's answer text.
query = "How many authors wrote the document?"
retrieved_chunks = ask_document(query)
print(retrieved_chunks)

The document was written by three authors: Ramnath Balasubramanian, Ari Libarikian, and Doug McElhaney.


In [79]:
# End-to-end check (retrieval + model answer): ask for the document's overall topic.
# Despite its name, `retrieved_chunks` holds the model's answer text.
query = "What is the document talking about?"
retrieved_chunks = ask_document(query)
print(retrieved_chunks)

The document discusses the impact of AI on the future of the insurance industry as outlined by McKinsey & Company. It covers a range of topics including how AI will reshape claims processing, distribution, underwriting, and pricing. It also highlights the importance of preparing for these changes by adopting a strategic plan that integrates AI-related technologies, focuses on skill building, and includes regular milestones and checkpoints. The document further explores advancements in AI technologies, such as cognitive models, and their applications, as well as the evolving business landscape with the advent of IoT, blockchain, and other innovations. Additionally, it addresses the shift from "detect and repair" to "predict and prevent" in insurance processes, the need for effective data management strategies, and the development of new products and services tailored to future needs.


In [80]:
# End-to-end check (retrieval + model answer): ask for the document's title.
# Despite its name, `retrieved_chunks` holds the model's answer text.
query = "What is the name of the document?"
retrieved_chunks = ask_document(query)
print(retrieved_chunks)

The name of the document is "Insurance 2030--The impact of AI on the future of insurance | McKinsey & Company".


In [81]:
# End-to-end check (retrieval + model answer): ask for the three main ideas.
# Despite its name, `retrieved_chunks` holds the model's answer text.
query = "What are the 3 most important ideas of the document?"
retrieved_chunks = ask_document(query)
print(retrieved_chunks)

The document highlights the following three important ideas:

1. **Integration of AI in the Insurance Industry**: The document emphasizes that as AI becomes more integrated into the insurance industry, carriers must adapt to changes in claims processing, distribution, and underwriting. AI will transform the industry from a "detect and repair" model to a "predict and prevent" one, fundamentally altering the way insurers operate.

2. **Development of a Comprehensive Strategic Plan**: Insurers are encouraged to develop a strategic plan that encompasses all dimensions of analytics-based initiatives, ranging from data handling to cultural changes. This plan should include investments in skill-building, change management, and milestone tracking to adapt to evolving AI technologies and market shifts.

3. **Future Technological Trends and Their Impact**: The document outlines the impact of upcoming technologies such as additive manufacturing, autonomous vehicles, and IoT on the insurance indus