In [2]:
# NOTE(review): dir() applied to a string returns the attribute names of the
# str object (e.g. "upper", "zfill"), not the entries of that directory.
# os.listdir() / pathlib.Path.iterdir() is presumably what was intended — confirm.
# Each iteration of this loop only re-defines read_pdf; nothing in this cell
# ever calls it.
for name in dir("/Users/aindukur/Documents/Projects/Personal/AortaGPT/data/raw"):
    # NOTE(review): takes `self` but is not defined inside a class here; it
    # looks like a method pasted from elsewhere — confirm where it belongs.
    # `os`, `logger` and `PdfReader` are not defined in this cell.
    def read_pdf(self, filename: str) -> str:
            """Return the text extracted from a PDF, caching it per filename.

            Lookup order:
              1. If ``filename`` is already a key of ``self.document_cache``,
                 the cached text is returned immediately.
              2. Otherwise the path ``self.data_folder``/``filename`` is
                 checked; if it does not exist a warning is logged and ``""``
                 is returned.
              3. Otherwise every page is read with ``PdfReader``, the page
                 texts are concatenated (each followed by a blank line), the
                 result is stored in ``self.document_cache`` and also written
                 to a ``.txt`` file.

            Args:
                filename: Name of the PDF, joined onto ``self.data_folder``.

            Returns:
                The extracted text, or ``""`` when the file is missing or any
                exception is raised while reading or writing.
            """
            if filename in self.document_cache:
                return self.document_cache[filename]

            file_path = os.path.join(self.data_folder, filename)
            if not os.path.exists(file_path):
                logger.warning(f"File not found: {file_path}")
                return ""

            try:
                reader = PdfReader(file_path)
                text = ""
                for page in reader.pages:
                    # extract_text() may yield a falsy value; treat it as an empty page.
                    page_text = page.extract_text() or ""
                    text += page_text + "\n\n"
                self.document_cache[filename] = text
                # NOTE(review): `name` is not a parameter; it resolves to the
                # enclosing loop variable at call time, so the output file name
                # does not depend on `filename` — confirm this is intended.
                txt_filename = f"{name}.txt"
                # The cache was already populated above, so if this write fails
                # the call returns "" while the cache entry remains.
                with open(txt_filename, "w", encoding="utf-8") as f:
                    f.write(text)
                return text
            except Exception as e:
                logger.error(f"Error reading PDF {filename}: {e}")
                return ""
    

In [None]:
import os
import openai
import pandas as pd
import numpy as np
from dotenv import load_dotenv

# Read the .env file first, then pass OPENAI_API_KEY from the environment
# to the openai module (None when the variable is not set).
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")

def generate_embedding(text):
    """
    Request an embedding for ``text`` from the "text-embedding-3-large" model.

    Args:
        text (str): Text to embed.
    Returns:
        np.array: The embedding of the first item in the API response.
    """
    api_result = openai.embeddings.create(
        model="text-embedding-3-large",
        input=text,
    )
    first_item = api_result.data[0]
    return np.array(first_item.embedding)

In [None]:
def cosine_similarity(vec1, vec2):
    """
    Calculate the cosine similarity between two vectors.

    The result is ``dot(vec1, vec2) / (|vec1| * |vec2|)``. When either vector
    has zero length the ratio is undefined, so ``0.0`` is returned instead of
    dividing by zero.

    Args:
        vec1 : numpy.array
            First vector
        vec2 : numpy.array
            Second vector

    Returns:
        float
            The cosine similarity between the two vectors, or 0.0 if either
            vector has zero norm.
    """
    # Use the fully-qualified name: a bare `norm` is not imported alongside
    # `numpy as np`, so calling it would raise NameError.
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        # Avoid 0/0 -> NaN (and a RuntimeWarning) for all-zero vectors.
        return 0.0
    return np.dot(vec1, vec2) / denominator

In [None]:
def search_policy_docs(query: str, top_k: int=2) -> str:
    """
    Search for company policy documents that match a query and return information from the top_k results.
    
    USE THIS TOOL ONLY FOR:
    - Responding to Noncompliance with Laws and Regulations When Providing Client Services (U.S., IND, MEX & GER)
    - Mandatory Compliance with Periodic Reporting Requirements for PPMDs, Senior Managers, and Managers (U.S., IND, CR, MEX & GER)
    - Third Party Risk Management (U.S., IND, MEX & GER)
    - Relocation Assistance for Employees (Other than Managing Directors) (U.S.)
    - Artificial Intelligence (AI) Policy (U.S., IND, CR, MEX & GER)
    - Expectations Concerning Independence Policies and Consequences of Noncompliance (U.S., IND, CR, MEX & GER)
    - Restrictions on Procuring, Downloading, Installing, Accessing, or Copying Technology (U.S. & IND)
    - Time Reporting
    - Lobbying and Related Activities (U.S.)
    - Continuing Professional Education (CPE) Compliance (U.S.)
    - Gifts and Entertainment for U.S. Public Officials (U.S.)
    - Information Security (U.S., IND, CR, MEX & GER)
    - Dealing with Suppliers, Service Providers, and Contractors (U.S., IND, CR, MEX & GER)
    - Insurance (U.S. & MEX)
    - Laptop Security (U.S., IND, CR, MEX & GER)
    - Mobile Devices (U.S.)
    - Long-Term Travel and Subsistence Expenses (U.S.)
    - Gifts and Prizes To Personnel (U.S.)
    - Physical Security and Safety (U.S., IND, MEX & GER)
    - Expense Reimbursement (U.S.)
    - Entertainment (U.S.)
    - Firm-Sponsored Activities (U.S. & IND)
    - Donations, Political Contributions, and Sponsorships (U.S., IND & MEX)
    - Certified Public Accountant Licensing (U.S.)
    - Wireless Communications (U.S., IND, CR, MEX & GER)
    - Copyright - Infringement Issues (U.S., IND, CR, MEX & GER)
    - Mandatory Broker Data Import Program for Employees Required to Maintain a Tracking & Trading Portfolio (U.S., IND & MEX)
    - Gifts To and From Clients (U.S.)
    - Personal Automobile Liability Insurance Employees (U.S.)

    DO NOT use this tool for technical troubleshooting or hardware/software questions - use ServiceNow_Researcher for those.
    
    Args:
        query: The policy-related query to search for.
        top_k: The number of top similar policy documents to retrieve. Default value is 2.
    
    Returns:
        str: A string containing results of the search
    """
    # NOTE: the docstring above is kept verbatim; it reads like a tool
    # description that may be consumed at runtime — confirm before editing it.

    # The stored embeddings are re-read from disk on every call.
    corpus = pd.read_pickle("data/policy_pdf_embeddings.pkl")
    query_embedding = generate_embedding(query)

    def _score(stored_vector):
        # Similarity of one stored document vector against the query vector.
        return cosine_similarity(query_embedding, np.array(stored_vector))

    corpus["similarity"] = corpus["vector"].apply(_score)

    # Highest similarity first, then keep only the requested number of rows.
    ranked = corpus.sort_values(by="similarity", ascending=False)
    best_matches = ranked.head(top_k)

    if best_matches.empty:
        return "No similar PDFs found."

    # Render each hit as a "title / text" section.
    sections = []
    for _, doc in best_matches.iterrows():
        sections.append(f"title: {doc['title']}\n\ntext: {doc['text']}")

    header = "here are the relevant documents, each document is separated by '<<<<<<<>>>>>>>>'\n\n"
    body = "\n<<<<<<<>>>>>>>>\n".join(sections)
    return header + body + "\n<<<<<<<>>>>>>>>"
