In [2]:
import os
from dotenv import load_dotenv
import openai
from openai import OpenAI
import json
import fitz
import pandas as pd

In [3]:
# Load variables from a .env file (if one is found) into the process
# environment so later cells can read them with os.getenv.
load_dotenv()

True

In [4]:
# Build the OpenAI client used by the request cells below.
# os.getenv returns None when the variable is not set.
# NOTE(review): the variable is spelled 'OPEN_API_KEY' rather than the SDK's
# usual 'OPENAI_API_KEY' — confirm it matches the name used in the .env file.
client = OpenAI(api_key=os.getenv('OPEN_API_KEY'))

In [5]:
# Smoke test: send one throwaway question to confirm the client and key work.
response = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "is it too late to get life guard certification?",
        }
    ],
    model='gpt-4o',
)

# Last expression of the cell, so the reply text is displayed as its output.
response.choices[0].message.content

"It's never too late to get lifeguard certification, as there are no specific upper age limits for becoming a lifeguard, provided you meet the necessary physical and skill requirements. Most certification programs, like those offered by the American Red Cross or YMCA, have minimum age requirements (usually around 15 or 16 years old), but they welcome students of all ages beyond that.\n\nIf you're interested in becoming a certified lifeguard, check for local classes and certification courses in your area. These courses are typically offered throughout the year, although the schedule can vary based on location and demand. As long as you can pass the physical tests and complete the training, you can become certified."

#### Load the policy

In [6]:

# Open the PDF and pull the plain text of every page, in page order.
pdf_path = "Home_Insurance_Policy.pdf"

# The context manager closes the document as soon as extraction is done;
# previously the file stayed open for the life of the kernel.
with fitz.open(pdf_path) as doc:
    # Iterating a document yields its pages in order; one string per page.
    documents = [page.get_text() for page in doc]

In [7]:
# Merge the per-page strings into one document, separated by single spaces.
text = ' '.join(documents)

# Keyword mapping later unpacked into str.format on the prompt template.
# The variable name (typo included) is what the formatting cell refers to.
insuranc_policy = dict(text=text)


In [8]:
# Prompt for the first pass: given the full policy text, ask the model for
# five question/answer pairs returned as JSON under a top-level "questions" key.
# {text} is the only str.format placeholder; the doubled braces ({{ }}) are
# escapes that render as literal braces in the example JSON.
prompt_template = """
You are emulating a homeowner who is purchasing property insurance.
Formulate 5 questions that this homeowner might ask the insurance agent about the property insurance.
The questions should be specific to the provided document and relevant to the content.
Ensure that each question is complete, specific, and not too short.
Use the text provided below to generate both the questions and their corresponding answers.

Document content:

{text}

Please provide a well-formed and valid JSON output, without any additional formatting such as code blocks or extra characters. The output should contain the questions as keys and the corresponding answers as values, as shown in the format below:

{{
  "questions": [
    {{"question": "question1", "answer": "answer1"}},
    {{"question": "question2", "answer": "answer2"}},
    {{"question": "question3", "answer": "answer3"}},
    {{"question": "question4", "answer": "answer4"}},
    {{"question": "question5", "answer": "answer5"}}
  ]
}}
""".strip()

In [9]:
# Fill the template's {text} placeholder with the full policy text.
prompt = prompt_template.format(text=insuranc_policy["text"])

In [11]:
def llm(prompt, model='gpt-4o-mini'):
    """Send a single user message to the chat API and return the reply text.

    Args:
        prompt: Text sent as the content of one "user" message.
        model: Chat model name. Defaults to 'gpt-4o-mini', the value that was
            previously hard-coded, so existing calls behave the same.

    Returns:
        The message content of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content


In [12]:
# Send the filled-in policy prompt to the model; the reply is kept as the raw
# string returned by llm() and parsed in a later cell.
questions = llm(prompt)

In [33]:
# Decode the model's JSON reply and keep the list stored under "questions".
parsed_reply = json.loads(questions)
data = parsed_reply['questions']

In [35]:
# Prompt for the second pass: given one "Question: ... / Answer: ..." pair in
# {text}, ask the model for five questions returned as JSON under a single
# "question" key (the key the parsing loop reads).
# The output instruction previously described a question-to-answer key/value
# layout copied from the first prompt, which contradicted the list format
# requested here; it now describes the list format.
prompt_template2 = """

You are emulating a homeowner who is purchasing property insurance.
Formulate 5 questions that this homeowner might ask the insurance agent about the property insurance.
The questions should be specific to the provided document and relevant to the content.
Ensure that each question is complete, specific, and not too short.
Use the text provided below to generate the questions.


{text}

Please provide a well-formed and valid JSON output, without any additional formatting such as code blocks or extra characters. The output should contain a single key "question" whose value is a list of the 5 question strings, as shown in the format below:

{{"question": ["question1", "question2", ..., "question5"]}}

""".strip()

In [36]:
# For each seed Q/A pair, ask the model for question variants and collect them
# as (id, question) rows, where id is the pair's position in `data`.
records = []

for pair_id, pair in enumerate(data):
    text_input = f"Question: {pair['question']}\nAnswer: {pair['answer']}"
    # Use a separate name so the policy-level `prompt` built in the earlier
    # cell is not overwritten when this cell runs.
    variant_prompt = prompt_template2.format(text=text_input)
    # Reuse the llm() helper (same model, same single user message) instead of
    # repeating the API call inline.
    reply = llm(variant_prompt)
    questions_list = json.loads(reply)["question"]
    records.extend(
        {'id': pair_id, 'question': question} for question in questions_list
    )

# Build the frame once from the collected rows rather than concatenating a new
# DataFrame on every iteration; the explicit columns keep the schema stable
# even when no rows were produced.
results = pd.DataFrame(records, columns=['id', 'question'])

# 'results' now holds every generated question with the id of its seed pair.
print(results)

    id                                           question
0    0  What is the maximum coverage amount for Dwelli...
1    0  Can you clarify what Other Structures Protecti...
2    0  How does Personal Property Protection work, an...
3    0  Are there additional coverages or endorsements...
4    0  What process do I follow if I need to file a c...
5    1  What specific protective devices can qualify m...
6    1  Can you explain how my claim history impacts m...
7    1  What documentation do I need to provide to con...
8    1  Are there any seasonal or geographical factors...
9    1  How frequently should I review my policy with ...
10   2  What specific actions do I need to take immedi...
11   2  How do I properly document the damaged or stol...
12   2  Is there a specific time frame within which I ...
13   2  What types of documentation would be considere...
14   2  What steps does Allstate take to assess the da...
15   3  What specific coverage options are included in...
16   3  Can yo

In [39]:
# Write the generated questions to a CSV in the working directory;
# index=False leaves the DataFrame's row index out of the file.
results.to_csv('ground-truth-data.csv', index=False)