In [1]:
import os
import numpy as np
import json
import fitz
from openai import OpenAI
import re

In [14]:
import os
from dotenv import load_dotenv
from openai import OpenAI

# Load environment variables from a local .env file so that the API key can be
# read through os.getenv() below.
load_dotenv()

# Initialize the shared OpenAI client used by the rest of the notebook.
# NOTE: the base_url override is commented out, so no custom base URL is passed
# to the client; uncomment it to target an OpenAI-compatible provider instead.
client = OpenAI(
    #base_url="https://api.studio.nebius.com/v1/",
    api_key=os.getenv("OPENAI_API_KEY")  # Read the key from the environment; never hardcode it
)

In [15]:
def load_pdfs_from_folder(folder_path):
    """
    Extracts the full text of every PDF located directly inside a folder.

    Args:
    folder_path (str): Directory to scan. Sub-directories are not searched.

    Returns:
    List[str]: One string per PDF (all pages concatenated), ordered by
    filename so repeated runs yield the same document order.
    """
    documents = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # document order (and every index derived from it) reproducible.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(".pdf"):
            continue
        pdf_path = os.path.join(folder_path, filename)
        with fitz.open(pdf_path) as doc:
            # get_text() extracts the visible text of one page
            documents.append("".join(page.get_text() for page in doc))
    return documents

In [16]:
import os
import fitz  # Ensure PyMuPDF is installed: pip install pymupdf

folder_path = "data"  # Replace with your folder name
documents = load_pdfs_from_folder(folder_path)

print(f"{len(documents)} PDFs loaded and extracted.")
# Guard the preview: documents[0] raises IndexError when the folder holds no PDFs.
if documents:
    print("Preview of first document:")
    print(documents[0][:500])  # Print the first 500 characters of the first PDF
else:
    print(f"No PDF files found in '{folder_path}'.")


6 PDFs loaded and extracted.
Preview of first document:
 
 
 
 
 
 
  
 
 
 
Improving Public Messaging 
for Evacuation and Shelter‐in‐Place 
Findings and Recommendations for Emergency Managers
from Peer-Reviewed Research 
April 2021 
 
 
 
 
Improving Public Messaging for Evacuation and Shelter-in-Place 
This page intentionally left blank 
 
 
 
 
 
 
 
  
 
 
 
 
 
 
 
 
Improving Public Messaging for Evacuation and Shelter-in-Place 
Contributors 
• 
Carol Freeman, Argonne National Laboratory, National Preparedness Analytics Center1 
• 
Nicole Nunn


In [None]:
def split_into_chunks(documents, chunk_size=300, overlap=50):
    """
    Splits documents into overlapping chunks measured in whitespace-separated words.

    Args:
    documents (List[str]): Texts to split. Chunks never span two documents.
    chunk_size (int): Maximum number of words per chunk. Must be positive.
    overlap (int): Number of words shared by consecutive chunks.
        Must satisfy 0 <= overlap < chunk_size.

    Returns:
    List[str]: Chunks of all documents, in document order. Words inside a
    chunk are joined by single spaces.

    Raises:
    ValueError: If chunk_size or overlap would prevent the window from advancing.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # overlap >= chunk_size gives a non-positive step and an endless loop
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap  # overlap improves context in retrieval
    chunks = []
    for doc in documents:
        words = doc.split()
        start = 0
        while start < len(words):
            end = start + chunk_size
            chunks.append(" ".join(words[start:end]))
            # Stop once the window reached the end; another iteration would only
            # emit a tail that is already fully contained in this chunk.
            if end >= len(words):
                break
            start += step
    return chunks

chunks = split_into_chunks(documents, chunk_size=300, overlap=50)
print(f"{len(chunks)} chunks created.")
# Guard the preview: chunks is empty when no document yielded any words.
if chunks:
    print("Preview of first chunk:\n", chunks[0][:350])
else:
    print("No chunks were produced; check that the PDFs contain extractable text.")


360 chunks created.
Preview of first chunk:
 Improving Public Messaging for Evacuation and Shelter‐in‐Place Findings and Recommendations for Emergency Managers from Peer-Reviewed Research April 2021 Improving Public Messaging for Evacuation and Shelter-in-Place This page intentionally left blank Improving Public Messaging for Evacuation and Shelter-in-Place Contributors • Carol Freeman, Argon


In [29]:
def chunk_text(text, chunk_size=1000, chunk_overlap=200):
    """
    Splits text into overlapping chunks measured in characters.

    Args:
    text (str): The text to split.
    chunk_size (int): Maximum number of characters per chunk. Must be positive.
    chunk_overlap (int): Number of characters shared by consecutive chunks.
        Must satisfy 0 <= chunk_overlap < chunk_size.

    Returns:
    List[str]: The chunks in order. Whitespace-only chunks are dropped.

    Raises:
    ValueError: If the sizes would prevent the window from advancing.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= chunk_overlap < chunk_size:
        # chunk_overlap >= chunk_size gives a non-positive step and an endless loop
        raise ValueError("chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size")

    step = chunk_size - chunk_overlap
    chunks = []
    start = 0
    text_length = len(text)
    while start < text_length:
        end = start + chunk_size
        chunk = text[start:end]
        if chunk.strip():  # Don't save empty chunks
            chunks.append(chunk)
        if end >= text_length:
            # The window already covers the end of the text
            break
        start += step
    return chunks

In [30]:
def preprocess_text(text):
    """
    Normalizes text for embedding.

    Args:
    text (str): The raw text.

    Returns:
    str: The text lowercased, with every run of whitespace collapsed to a
    single space and leading/trailing whitespace removed.
    """
    lowered_tokens = text.lower().split()
    return " ".join(lowered_tokens)

# Normalize every word-based chunk (lowercase + collapsed whitespace).
preprocessed_chunks = list(map(preprocess_text, chunks))

In [31]:
def create_embeddings(text, model="text-embedding-3-small", batch_size=128):
    """
    Creates embeddings for the given text.

    Args:
    text (str or List[str]): The input text(s) for which embeddings are to be created.
        Any non-string sequence of strings (e.g. a tuple) is treated as a list.
    model (str): The model to be used for creating embeddings.
    batch_size (int): Maximum number of texts sent per API request. Large inputs
        are split into several requests so a single call stays within the
        per-request limits of the embeddings endpoint.

    Returns:
    List[float] or List[List[float]]: The embedding vector(s). A single string
    yields one vector; a sequence yields one vector per text, in input order.
    An empty sequence yields an empty list without calling the API.

    Raises:
    ValueError: If batch_size is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # Handle both string and sequence inputs by converting everything to a list
    is_single = isinstance(text, str)
    input_text = [text] if is_single else list(text)

    # Nothing to embed: the API rejects an empty input, so short-circuit here
    if not input_text:
        return []

    embeddings = []
    for start in range(0, len(input_text), batch_size):
        batch = input_text[start:start + batch_size]
        # Create embeddings for this batch using the specified model
        response = client.embeddings.create(
            model=model,
            input=batch
        )
        embeddings.extend(item.embedding for item in response.data)

    # If the input was a single string, return just its embedding
    if is_single:
        return embeddings[0]

    # Otherwise, return all embeddings for the list of texts
    return embeddings

In [32]:
class SimpleVectorStore:
    """
    Minimal in-memory vector store with brute-force cosine-similarity search.

    Items are kept in three parallel lists (vectors, texts, metadata) that
    share the same index.
    """

    def __init__(self):
        """Creates an empty store."""
        self.vectors = []   # one np.ndarray embedding per item
        self.texts = []     # the text each embedding was computed from
        self.metadata = []  # one dict per item (empty dict when none was given)

    def add_item(self, text, embedding, metadata=None):
        """
        Adds one item to the store.

        Args:
        text (str): The text associated with the embedding.
        embedding (Sequence[float]): The embedding vector; stored as a NumPy array.
        metadata (dict, optional): Extra information about the item. Defaults to {}.
        """
        self.vectors.append(np.array(embedding))
        self.texts.append(text)
        self.metadata.append(metadata or {})

    def similarity_search(self, query_embedding, k=5):
        """
        Returns the k stored items most similar to the query by cosine similarity.

        Args:
        query_embedding (Sequence[float]): The query vector.
        k (int): Maximum number of results. Values <= 0 yield no results.

        Returns:
        List[dict]: Up to k dicts with keys "text", "metadata" and "similarity"
        (a float), sorted by descending similarity. Ties keep insertion order.
        """
        if not self.vectors or k <= 0:
            return []
        query_vec = np.array(query_embedding)
        # The query norm does not change inside the loop, so compute it once
        query_norm = np.linalg.norm(query_vec)
        similarities = []
        for i, vec in enumerate(self.vectors):
            denom = query_norm * np.linalg.norm(vec)
            # A zero-length vector has no direction; score it 0.0 rather than
            # dividing by zero, which would yield NaN and break the sort below.
            sim = float(np.dot(query_vec, vec) / denom) if denom > 0 else 0.0
            similarities.append((i, sim))
        similarities.sort(key=lambda pair: pair[1], reverse=True)
        return [
            {
                "text": self.texts[idx],
                "metadata": self.metadata[idx],
                "similarity": score,
            }
            for idx, score in similarities[:k]
        ]

In [33]:
def extract_text_from_pdf(pdf_path):
    """
    Reads a PDF and returns the text of all its pages.

    Args:
    pdf_path (str): Path to the PDF file.

    Returns:
    str: The text of every page, concatenated in page order with no separator.
    """
    with fitz.open(pdf_path) as pdf:
        page_texts = [page.get_text() for page in pdf]
    return "".join(page_texts)

In [34]:
def process_folder(folder_path, chunk_size=1000, chunk_overlap=200):
    """
    Builds a vector store from every PDF located directly inside a folder.

    Each PDF is extracted, split into character-based chunks, normalized,
    embedded, and added to the store with its source filename and chunk index.

    Args:
    folder_path (str): Directory containing the PDFs. Sub-directories are not searched.
    chunk_size (int): Maximum number of characters per chunk.
    chunk_overlap (int): Number of characters shared by consecutive chunks.

    Returns:
    SimpleVectorStore: The populated store. PDFs without any text are skipped.
    """
    store = SimpleVectorStore()
    # os.listdir() returns entries in arbitrary order; sorting keeps the order
    # of items in the store (and tie-breaking in searches) reproducible.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(".pdf"):
            continue
        pdf_path = os.path.join(folder_path, filename)
        print(f"Processing: {filename}")
        text = extract_text_from_pdf(pdf_path)
        chunks = chunk_text(text, chunk_size, chunk_overlap)
        preprocessed = [preprocess_text(chunk) for chunk in chunks]
        if not preprocessed:
            # Nothing extractable (e.g. a scanned PDF); avoid an empty API call
            continue
        embeddings = create_embeddings(preprocessed)
        for i, (chunk, embedding) in enumerate(zip(preprocessed, embeddings)):
            store.add_item(
                text=chunk,
                embedding=embedding,
                metadata={"source": filename, "chunk_index": i}
            )
    print(f"Total chunks in vector store: {len(store.texts)}")
    return store

In [35]:
def answer_query(store, question, k=3):
    """
    Retrieves the chunks most similar to a question and prints a preview of each.

    Args:
    store (SimpleVectorStore): The vector store to search.
    question (str): The natural-language query.
    k (int): Maximum number of results to retrieve.

    Returns:
    List[dict]: The search hits as returned by store.similarity_search
    (keys "text", "metadata", "similarity").
    """
    query_embedding = create_embeddings(question)
    hits = store.similarity_search(query_embedding, k=k)
    print(f"\nTop {k} results for: \"{question}\"")
    for rank, item in enumerate(hits, start=1):
        # Items may have been added without metadata (add_item defaults to {}),
        # so fall back to a placeholder instead of raising KeyError.
        source = item["metadata"].get("source", "unknown")
        print(f"\nResult {rank} [source: {source}, similarity: {item['similarity']:.3f}]")
        print(item["text"][:500], "...")  # Preview chunk
    return hits

In [36]:
# Build the vector store from every PDF in the folder. process_folder embeds
# each chunk through create_embeddings, so this step calls the embeddings API.
folder_path = "data"
store = process_folder(folder_path)
# Retrieve the 3 chunks most similar to the question. `question` and `results`
# are reused by the answer-generation cell that follows.
question = "What are the recommended steps for evacuation according to FEMA?"
results = answer_query(store, question, k=3)

Processing: fema_improving-public-messaging-for-evacuation-and-shelter-in-place_literature-review-report.pdf
Processing: fema_national-resilience-guidance_august2024.pdf
Processing: fema_npd_developing-and-maintaining-emergency_052125.pdf
Processing: fema_npd_local-elected-officials-quick-reference-guide_2025.pdf
Processing: fema_pdhi-final-report-letter-amended.pdf
Processing: fema_shelter-in-place_guidance.pdf
Total chunks in vector store: 826

Top 3 results for: "What are the recommended steps for evacuation according to FEMA?"

Result 1 [source: fema_improving-public-messaging-for-evacuation-and-shelter-in-place_literature-review-report.pdf, similarity: 0.649]
who take the appropriate protective action to evacuate or to sip. many of these research findings are likely to be familiar to emergency managers with experience in evacuation operations. key recommendations include: 1. understand the potential impediments to action and take steps to address these barriers in advance. 2. make

In [37]:
import openai

# Concatenate the retrieved chunks into one context block for the prompt.
context = "\n\n".join(item['text'] for item in results)
llm_prompt = (
    f"Answer the following question using ONLY the provided context. Cite specific steps or guidance mentioned.\n"
    f"Context:\n{context}\n\nQuestion: {question}\nAnswer:"
)

# Reuse the `client` configured at the top of the notebook instead of creating
# a second, separately configured client, so that embeddings and chat
# completions share the same API key / base URL settings.
response = client.chat.completions.create(
    model="gpt-4o-mini",  # or "gpt-3.5-turbo", etc.
    messages=[{"role": "user", "content": llm_prompt}],
    max_tokens=512,
    temperature=0.2,  # low temperature keeps the answer close to the context
)
print("LLM Answer:\n", response.choices[0].message.content)

LLM Answer:
 The recommended steps for evacuation according to FEMA include:

1. **Understand potential impediments**: Identify and address barriers to action in advance.
2. **Simplify evacuation decisions**: Issue only mandatory evacuation orders to make decisions easier.
3. **Provide information**: Ensure residents and tourists have multiple ways to know if they are in an evacuation zone.
4. **Use multiple messaging channels**: Utilize authoritative channels that include visual information about the hazard and encourage sharing.
5. **Frequent updates**: Provide regular updates to reduce stress related to the unknown aspects of evacuation.
6. **Encourage early evacuation for vulnerable populations**: Specifically advise those in manufactured or mobile homes to evacuate early and plan for assistance.
7. **Assist individuals with disabilities**: Ensure messaging includes information on support for transportation and evacuation planning for those with disabilities or access needs.
