# Libraries


In [None]:
# Install the notebook's dependencies into the kernel's Python environment.
# NOTE(review): `unstructured` and `pydantic` are each listed twice in this command.
# NOTE(review): `poppler-utils`, `tesseract-ocr` and `libmagic-dev` look like system (apt)
# package names rather than Python distributions — confirm pip is the right installer for them.
!python -m pip install poppler-utils tesseract-ocr  unstructured unstructured-client python-dotenv langchain langchain-openai langchain-community faiss-cpu ipython rank-bm25 pydantic httpx unstructured libmagic-dev pydantic

# Data processing

In [None]:
#PARSE DATA
import os
import glob
import json
from dotenv import load_dotenv
from unstructured_client import UnstructuredClient
from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError
from unstructured.staging.base import dict_to_elements
from unstructured.chunking.title import chunk_by_title
from  langchain.schema import Document
from IPython.display import JSON, display, Markdown
from langchain_community.vectorstores import FAISS
# from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
# from  langchain.schema import Document
import json
from typing import Iterable
import logging


def process_pdfs_and_cache(input_folder, output_folder, strategy):
    """Partition every PDF in ``input_folder`` with the Unstructured API, using a JSON cache.

    The cache file is ``<output_folder>/<input folder name>_combined_content.json``.
    When it already exists it is loaded and returned without contacting the API.

    Args:
        input_folder: Directory that is searched (non-recursively) for ``*.pdf`` files.
        output_folder: Directory holding the cache file; created when missing.
        strategy: Partitioning strategy forwarded to ``shared.PartitionParameters``.

    Returns:
        The list of parsed elements from all successfully processed PDFs
        (as returned in ``res.elements``), or the cached list.

    Raises:
        ValueError: If ``UNSTRUCTURED_API_KEY`` is not set in the environment.
    """
    # Load environment variables from a .env file
    load_dotenv()

    # Get unstructured API key
    unstructured_api_key = os.getenv("UNSTRUCTURED_API_KEY")
    if not unstructured_api_key:
        raise ValueError("UNSTRUCTURED_API_KEY environment variable not found")

    # Initialize the UnstructuredClient
    # NOTE(review): the server URL is hard-coded to one deployment; consider making it configurable.
    s = UnstructuredClient(api_key_auth=unstructured_api_key, server_url='https://redhorse-d652ahtg.api.unstructuredapp.io')

    # Create cache folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Generate cache file path
    folder_name = os.path.basename(os.path.normpath(input_folder))
    cache_file_path = os.path.join(output_folder, f'{folder_name}_combined_content.json')

    # Fast path: reuse a previously saved result.
    if os.path.exists(cache_file_path):
        print(f"Loading combined content from {cache_file_path}...")
        with open(cache_file_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    # Sorted so the element order (and therefore the cache) is deterministic.
    pdf_paths = sorted(glob.glob(os.path.join(input_folder, "*.pdf")))

    combined_content = []
    failed_files = []

    for filename in pdf_paths:
        print(f"Processing {filename}...")
        # Read the bytes up front so the file handle is not held open during the API call.
        with open(filename, "rb") as file:
            file_bytes = file.read()

        req = shared.PartitionParameters(
            files=shared.Files(
                content=file_bytes,
                file_name=filename,
            ),
            strategy=strategy,
        )

        try:
            res = s.general.partition(req)
        except SDKError as e:
            print(f"Error processing {filename}: {e}")
            failed_files.append(filename)
        else:
            # Append the parsed elements to the combined content list
            combined_content.extend(res.elements)

    # Display length of combined content
    print(f"Combined content length: {len(combined_content)}")

    # Only persist complete results: caching a partial or empty run would make every
    # later call silently return incomplete data until the cache file is deleted by hand.
    if not pdf_paths:
        print(f"No PDF files found in {input_folder}; nothing was cached.")
    elif failed_files:
        print(f"{len(failed_files)} file(s) failed to process; combined content was not cached.")
    else:
        with open(cache_file_path, 'w', encoding='utf-8') as f:
            json.dump(combined_content, f)
        print(f"Combined content saved to {cache_file_path}")

    return combined_content
# Function to process and chunk the data
def process_data(combined_content):
    """Chunk parsed PDF elements by title and wrap each chunk in a LangChain ``Document``.

    Args:
        combined_content: List of element dicts, as produced by
            ``process_pdfs_and_cache``.

    Returns:
        A list of ``Document`` objects. Each carries the chunk text as
        ``page_content`` and the chunk metadata, with ``languages`` removed
        and ``source`` set to the original ``filename`` when one is present.
    """
    pdf_elements = dict_to_elements(combined_content)
    elements = chunk_by_title(
        pdf_elements,
        combine_text_under_n_chars=2000,
        max_characters=5000,
        overlap=700,
    )

    documents = []
    for element in elements:
        metadata = element.metadata.to_dict()
        # Tolerate chunks without these keys instead of raising KeyError.
        metadata.pop("languages", None)
        filename = metadata.get("filename")
        if filename is not None:
            # Downstream code reads metadata["source"] with its own fallback,
            # so it is simply left unset when no filename is known.
            metadata["source"] = filename
        documents.append(Document(page_content=element.text, metadata=metadata))

    return documents

# Example usage: parse (or load from cache) every PDF under the FAR data folder.
strategy = "auto"  # partitioning strategy passed through to the Unstructured API
combined_content = process_pdfs_and_cache("./data/far_seperatedParts", "./cache", strategy)

# PROCESS DATA: chunk the parsed elements into LangChain documents.
print(f"Chunking data...")
documents = process_data(combined_content)
print("Finished chunking data")

# LOAD INTO VECTOR STORE
# Embedding model used to index the chunks.
embeddings = OpenAIEmbeddings()

# Build an in-memory FAISS vector store from the chunked documents.
# It is rebuilt from scratch on every run of this cell.
print(f"Loading data into vector store")
vectorstore = FAISS.from_documents(documents, embeddings)
print("Finish loading data into vector store")

# Create a retriever that returns the 6 nearest chunks per query.
retriever = vectorstore.as_retriever(
    # search_type="similarity",
    search_kwargs={"k": 6})

# Smoke-test the plain vector retriever with a section-number query.
query = "1.102-2 Performance standards"
print(f"Getting answer for: {query}")
answer = retriever.invoke(query)

# Calculate the total length of all page_content
total_length = sum(len(doc.page_content) for doc in answer)
print(f"Total length of all page_content combined: {total_length}")

# Bare expression; it has no effect unless it is the last statement of a notebook cell.
answer

# Display each retrieved document for debugging.
for doc in answer:
    display(Markdown(doc.page_content))
    print("---------------------------------------------------")
    print("\n\n")

# APP


In [52]:
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from IPython.display import display, Markdown
import os
import warnings
# from pydantic import PydanticDeprecatedSince20
# warnings.filterwarnings("ignore", category=PydanticDeprecatedSince20)
import logging
from rank_bm25 import BM25Okapi
from typing import List, Tuple
import re

# Set logging level for httpx to WARNING or higher
logging.getLogger("httpx").setLevel(logging.WARNING)


class EnhancedRetriever:
    """Hybrid retriever combining vector search, BM25 keyword search and an exact-match bonus.

    Attributes are plain references to the objects passed in:
      - ``vectorstore``: must provide ``similarity_search_with_score(query, k=...)``
        returning ``(document, score)`` pairs.
      - ``documents``: the corpus; each item needs ``page_content`` and ``metadata``.
      - ``bm25``: a ``BM25Okapi`` index built over ``documents`` at construction time.
    """

    def __init__(self, vectorstore, documents):
        self.vectorstore = vectorstore
        self.documents = documents
        self.bm25 = self._create_bm25_index()

    def _create_bm25_index(self):
        """Tokenize every document and build the BM25 index over the result."""
        tokenized_docs = [self._tokenize(doc.page_content) for doc in self.documents]
        return BM25Okapi(tokenized_docs)

    def _tokenize(self, text: str) -> List[str]:
        """Lower-case ``text`` and split it into tokens.

        A dotted number optionally followed by whitespace-separated alphabetic
        words (e.g. ``"3.901 definitions"``) is kept together as one token so
        that section references survive tokenization; everything else is split
        into ``\\w+`` runs.
        """
        # Capture FAR section references and words
        tokens = re.findall(r'\b\d+(?:\.\d+)*(?:\s+[A-Za-z]+(?:\s+[A-Za-z]+)*)?\b|\w+', text.lower())
        return tokens

    @staticmethod
    def _contains_all_keywords(query_keywords, doc) -> bool:
        """Return True when every whitespace-split query word is a whitespace-split word of ``doc``."""
        return query_keywords.issubset(set(doc.page_content.lower().split()))

    @staticmethod
    def _report_exact_match(doc, found_by: str) -> None:
        """Print which search path surfaced a document containing all query words."""
        print(f"Exact keyword match found by {found_by} in document: {doc.metadata.get('source', 'Unknown source')}, "
              f"Page: {doc.metadata.get('page_number', 'Unknown page')}")

    def exact_keyword_search(self, query: str) -> List[Tuple[float, "Document"]]:
        """Score each document by how many query words occur in it as substrings.

        Returns ``(score, document)`` pairs for documents with at least one
        hit, highest score first.
        """
        query_keywords = query.lower().split()
        results = []
        for doc in self.documents:
            content = doc.page_content.lower()
            score = sum(1 for keyword in query_keywords if keyword in content)
            if score > 0:
                results.append((score, doc))
        return sorted(results, key=lambda x: x[0], reverse=True)

    def keyword_search(self, query: str, k: int = 4) -> List[Tuple[float, "Document"]]:
        """Return the ``k`` documents with the highest BM25 score as ``(score, document)`` pairs."""
        tokenized_query = self._tokenize(query)
        bm25_scores = self.bm25.get_scores(tokenized_query)
        scored_docs = [(score, self.documents[i]) for i, score in enumerate(bm25_scores)]
        return sorted(scored_docs, key=lambda x: x[0], reverse=True)[:k]

    def hybrid_search(self, query: str, k: int = 4) -> List[Tuple[float, "Document"]]:
        """Merge vector and BM25 results and rank them by a combined score.

        The combined score is
        ``(vector_similarity + keyword_score + exact_match_bonus) / 3`` where

        * ``vector_similarity`` is ``1 / (1 + score)`` for documents returned by
          the vector store and ``0`` for documents found only by keyword search.
          NOTE(review): this mapping assumes the vector store's score is a
          distance (lower is better) — confirm for the store in use.
        * ``keyword_score`` is the raw BM25 score (``0`` when the document was
          not among the top-``k`` keyword hits).
        * ``exact_match_bonus`` is ``2`` when the document contains every
          whitespace-split query word, else ``0``.

        Results are de-duplicated by ``page_content``. Returns at most ``k``
        ``(combined_score, document)`` pairs, best first.
        """
        vector_results = self.vectorstore.similarity_search_with_score(query, k=k)
        keyword_results = self.keyword_search(query, k=k)

        # Convert query to lowercase and split into keywords
        query_keywords = set(query.lower().split())

        combined_results = {}

        for doc, score in vector_results:
            exact = self._contains_all_keywords(query_keywords, doc)
            combined_results[doc.page_content] = {
                'doc': doc, 'vector_score': score, 'keyword_score': 0, 'exact_match': exact,
            }
            if exact:
                self._report_exact_match(doc, "similarity search")

        for score, doc in keyword_results:
            entry = combined_results.get(doc.page_content)
            if entry is not None:
                # Already known: its exact-match flag was computed from the same text.
                entry['keyword_score'] = score
                continue

            exact = self._contains_all_keywords(query_keywords, doc)
            # vector_score=None marks "no vector hit"; a 0 here would read as a
            # perfect distance and hand the document the maximum similarity of 1.0.
            combined_results[doc.page_content] = {
                'doc': doc, 'vector_score': None, 'keyword_score': score, 'exact_match': exact,
            }
            if exact:
                self._report_exact_match(doc, "keyword search")

        final_results = []
        for scores in combined_results.values():
            distance = scores['vector_score']
            normalized_vector_score = 0.0 if distance is None else 1 / (1 + distance)
            normalized_keyword_score = scores['keyword_score']
            exact_match_bonus = 2 if scores['exact_match'] else 0  # Increased the bonus for exact matches
            combined_score = (normalized_vector_score + normalized_keyword_score + exact_match_bonus) / 3
            final_results.append((combined_score, scores['doc']))

        return sorted(final_results, key=lambda x: x[0], reverse=True)[:k]  # Return only top k results

def organize_documents(docs):
    """Render retrieved documents as one numbered, labelled text block.

    Each document contributes its 1-based position, its ``source`` and
    ``page_number`` metadata (with textual fallbacks when missing) and its
    content, followed by blank lines. Returns ``""`` for an empty input.
    """
    sections = []
    for position, document in enumerate(docs, start=1):
        meta = document.metadata
        sections.append(
            f"Document {position}:\n"
            f"Source: {meta.get('source', 'unknown source')}\n"
            f"Page number: {meta.get('page_number', 'unknown page number')}\n"
            f"Content: {document.page_content}\n"
            "\n\n"
        )
    return "".join(sections)

def create_llm(model_name: str, streaming: bool = False):
    """Build a deterministic (temperature 0) ``ChatOpenAI`` model.

    When ``streaming`` is truthy the model is created with ``streaming=True``
    and a ``StreamingStdOutCallbackHandler`` attached; otherwise neither
    option is passed.
    """
    options = {"model_name": model_name, "temperature": 0}
    if streaming:
        options["streaming"] = True
        options["callbacks"] = [StreamingStdOutCallbackHandler()]
    return ChatOpenAI(**options)


def generate_answer_stream(query: str, relevant_data: str, llm: ChatOpenAI):
    """Stream an LLM answer to ``query`` grounded in ``relevant_data``.

    Builds a prompt containing a worked example (FAR 1.103), the retrieved
    data and the user query, pipes it through ``llm`` and a
    ``StrOutputParser``, and returns the result of ``chain.stream(...)``
    so the caller can iterate over the output as it is produced.

    NOTE(review): this prompt asks for a two-part answer, whereas the prompt
    in ``generate_answer`` asks for three parts — confirm the difference is
    intentional.

    Args:
        query: The user's original question.
        relevant_data: Retrieved context, already formatted as text.
        llm: The chat model used to generate the answer.

    Returns:
        The iterable returned by ``chain.stream``.
    """
    prompt = PromptTemplate(
        template="""
        Based on the following relevant data, please answer the user's query.
        Provide a comprehensive and accurate answer, using the information given.
        If the information is not sufficient to answer the query fully, state so clearly.
        The answer always should have 2 parts. The first part should be the easy to understand answer/explaination and the second part should be the exact wording,make sure to get the full section, do not cut off(for citing purpose).


        
        Example:
        Answer: Based on the provided data, here is a comprehensive explanation of the term "1.103" in the context of the Federal Acquisition Regulation (FAR):

        FAR 1.103 - Authority
        
        Explanation:
        FAR 1.103 outlines the authority under which the Federal Acquisition Regulation (FAR) system operates. It specifies that the authority to issue and maintain the FAR is shared among three key officials:

        The Administrator of General Services
        The Secretary of Defense
        The Administrator of the National Aeronautics and Space Administration (NASA)
        These officials operate under the broad policy guidance of the Office of Federal Procurement Policy (OFPP). The FAR is jointly prepared, issued, and maintained by these officials under their respective statutory authorities.

        Exact Wording:
        "The authority of the FAR system is divided among the following: (a) The Administrator of General Services, the Secretary of Defense, and the Administrator of the National Aeronautics and Space Administration, under the broad policy guidance of the Office of Federal Procurement Policy, are authorized to issue and maintain the FAR. (b) The FAR is prepared, issued, and maintained, and the FAR System is prescribed jointly by the Secretary of Defense, the Administrator of General Services, and the Administrator of the National Aeronautics and Space Administration, under their several statutory authorities."
        
        
        
        
        Relevant data:
        {relevant_data}
        
        User query: {query}
        
        Answer:
        """,
        input_variables=["query", "relevant_data"],
    )
    
    # Compose prompt -> model -> plain-string parser, then stream the output.
    chain = prompt | llm | StrOutputParser()
    return chain.stream({"query": query, "relevant_data": relevant_data})


def display_keyword_results(retriever: EnhancedRetriever, query: str, k: int = 4):
    """Print the top-``k`` BM25 keyword hits for ``query`` (debugging aid).

    For each hit the score, source, page number and a short content snippet
    are printed. The snippet is limited to the first 200 characters, with
    ``...`` appended only when the content was actually truncated.

    Args:
        retriever: Retriever whose ``keyword_search`` is queried.
        query: The search text.
        k: Maximum number of results to show.
    """
    snippet_length = 200
    results = retriever.keyword_search(query, k)
    print(f"Keyword search results for query: '{query}'\n")
    for i, (score, doc) in enumerate(results, 1):
        content = doc.page_content
        snippet = content[:snippet_length]
        ellipsis = "..." if len(content) > snippet_length else ""
        print(f"Result {i}:")
        print(f"Score: {score}")
        print(f"Source: {doc.metadata.get('source', 'unknown source')}")
        print(f"Page number: {doc.metadata.get('page_number', 'unknown page number')}")
        print(f"Content snippet: {snippet}{ellipsis}")
        print("\n")

def _parse_keywords_response(raw: str, fallback: str) -> str:
    """Extract the ``"keywords"`` string from the LLM's JSON reply, or return ``fallback``."""
    text = raw.strip()
    if text.startswith("```"):
        # The model sometimes wraps the object in a Markdown fence despite the
        # instruction not to; drop the backticks and an optional "json" tag.
        text = text.strip("`").strip()
        if text.lower().startswith("json"):
            text = text[4:].strip()

    try:
        payload = json.loads(text)
    except (json.JSONDecodeError, TypeError):
        payload = None

    keywords = payload.get("keywords") if isinstance(payload, dict) else None
    if isinstance(keywords, (list, tuple)):
        # A list of phrases is flattened so callers always receive one search string.
        keywords = " ".join(str(item) for item in keywords)

    if not isinstance(keywords, str) or not keywords.strip():
        print("Error parsing LLM output. Using original query.")
        return fallback
    return keywords


def extract_search_keywords(query: str, llm: ChatOpenAI) -> str:
    """Ask the LLM to distil ``query`` into search keywords for the hybrid retriever.

    The model is instructed to answer with a JSON object holding a single
    ``"keywords"`` key. If the reply cannot be parsed, is not an object,
    lacks the key, or holds an empty value, the original ``query`` is
    returned instead so retrieval can still proceed.

    Args:
        query: The user's original question.
        llm: Chat model used for the extraction.

    Returns:
        The extracted keyword string, or ``query`` as a fallback.
    """
    prompt = PromptTemplate(
        template="""
        Context: You are part of a system that processes user queries for searching through legal documents, specifically the Federal Acquisition Regulation (FAR). The system uses a hybrid search method that combines vector search and keyword search.

        Task: Extract the most relevant keywords or phrases from the user's query that would be effective for searching. Focus on specific terms, section numbers, or phrases that are likely to yield the most relevant results.

        Example:
        User query: "Tell me about the section 3.901 Definitions in exact wording, in a nice markdown format."
        Extracted keywords: "3.901 Definitions"

        Explanation: In this example, "3.901 Definitions" is extracted because it's a specific section reference that will yield precise results in the search system.

        User query: {query}

        Please extract the most relevant search keywords or phrases from this query. 
        Return your answer in JSON format with a single key "keywords".
        Return only the json object with the key "keywords", no explanation, no json tag ```json,  is needed.

        """,
        input_variables=["query"],
    )

    chain = prompt | llm | StrOutputParser()
    result = chain.invoke({"query": query})
    return _parse_keywords_response(result, query)

def rag_query_enhanced(user_query: str, enhanced_retriever: EnhancedRetriever, model_name: str = "gpt-4o", use_streaming: bool = False, k: int = 4):
    """Answer ``user_query`` with retrieval-augmented generation and show the result.

    Steps: extract search keywords with the LLM, run the hybrid search,
    format the retrieved documents, then generate the answer. In streaming
    mode the answer is printed chunk by chunk; otherwise it is rendered as
    Markdown via ``display``. Nothing is returned.

    Args:
        user_query: The user's question.
        enhanced_retriever: Retriever providing ``hybrid_search``.
        model_name: OpenAI chat model used for both LLM calls.
        use_streaming: Print the answer incrementally instead of rendering it at the end.
        k: Number of documents to retrieve.
    """
    # Create LLM instance for keyword extraction
    keyword_llm = create_llm(model_name, streaming=False)

    # Extract search keywords
    search_keywords = extract_search_keywords(user_query, keyword_llm)
    print(f"Extracted search keywords: {search_keywords}")

    # Retrieve relevant documents using the enhanced retriever with extracted keywords
    retrieved_docs = enhanced_retriever.hybrid_search(search_keywords, k=k)

    # Organize retrieved documents
    organized_text = organize_documents([doc for _, doc in retrieved_docs])
    print(f"\nOrganized text length: {len(organized_text)}")

    # The answer model is created WITHOUT the stdout streaming callback in both
    # modes. In streaming mode the loop below already prints every chunk; a
    # model that also carries StreamingStdOutCallbackHandler would write each
    # token to stdout a second time.
    answer_llm = create_llm(model_name, streaming=False)

    if use_streaming:
        # Generate answer using LLM with streaming
        for chunk in generate_answer_stream(user_query, organized_text, answer_llm):
            print(chunk, end="", flush=True)
    else:
        # Generate answer using LLM without streaming
        answer = generate_answer(user_query, organized_text, answer_llm)
        display(Markdown(answer))

def generate_answer(query: str, relevant_data: str, llm: ChatOpenAI):
    """Generate a complete (non-streamed) LLM answer to ``query`` from ``relevant_data``.

    The prompt places the retrieved data first, then instructions asking for
    a three-part answer (plain explanation with a short example, exact
    original wording, and cross-references to other sections), then the
    query. The ``\\n\\n`` inside the template is an ordinary escape sequence
    in a non-raw string, so it becomes two real newlines in the prompt.

    Args:
        query: The user's original question.
        relevant_data: Retrieved context, already formatted as text.
        llm: The chat model used to generate the answer.

    Returns:
        The result of ``chain.invoke`` after ``StrOutputParser``.
    """
    prompt = PromptTemplate(
        template="""
        #Relevant data:
        {relevant_data}
        
        \n\n
        #Instruction:
        - Based on the following relevant data, please answer the user's query.
        - Provide a comprehensive and accurate answer, using the information given.
        - If the information is not sufficient to answer the query fully, state so clearly.
        - Sometime, the full section got cut off or break down to separated parts. Carefully look at the relevant data and connect them if they belong to each other. Pay attention to the sections number to figure it out.
        -The answer always should have 3 parts:
          - The first part should be the easy to understand answer/explanation. With a concise, short example. 
          - the second part should be the exact wording and format to the original doc, make sure to get the full section, do not cut off(for citing purpose).
          - In the third part, if the content refers to any other section or clause(s), please state it out to the user. 
        
        #User query: {query}
        
        #Answer:
        """,
        input_variables=["query", "relevant_data"],
    )
    # Compose prompt -> model -> plain-string parser and run it once.
    chain = prompt | llm | StrOutputParser()
    return chain.invoke({"query": query, "relevant_data": relevant_data})

# Usage example: build the hybrid retriever over the vector store and chunked documents.
enhanced_retriever = EnhancedRetriever(vectorstore, documents)

# Optional manual test of the search function (disabled):
# print(f"Query: {query}")
# answer = enhanced_retriever.hybrid_search(query, k=4)
# # Display each retrieved document for debugging
# for doc in answer:
#     display(Markdown(doc[1].page_content))

# Sections that could not be found without keyword search:
# 1.102-2 Performance standards.
# 1.102-1 Discussion.
# 1.102 Statement of guiding principles for the Federal Acquisition System.
# 3.901 Definitions.
# 3.902 Classified information.

query = """tell me about the 1.102-2 Performance standards,
output it nicely in markdown format.
"""

# Run the full pipeline and render the answer as Markdown (non-streaming).
rag_query_enhanced(query, enhanced_retriever, model_name="gpt-4o", use_streaming=False, k=4)

Extracted search keywords: 1.102-2 Performance standards
Exact keyword match found by keyword search in document: Part 1 - Federal Acquisition Regulations System.pdf, Page: 5

Organized text length: 10609


### Explanation of 1.102-2 Performance Standards

The performance standards outlined in section 1.102-2 of the Federal Acquisition Regulations (FAR) emphasize the importance of satisfying the customer, minimizing administrative costs, conducting business with integrity, and fulfilling public policy objectives. These standards guide the acquisition process to ensure efficiency, fairness, and responsiveness to customer needs. For example, the government is encouraged to use contractors with a proven track record and to engage in early communication with industry to understand market capabilities.

### Exact Wording from the Original Document

#### 1.102-2 Performance standards.

(a) Satisfy the customer in terms of cost, quality, and timeliness of the delivered product or service.

1. The principal customers for the product or service provided by the System are the users and line managers, acting on behalf of the American taxpayer.

2. The System must be responsive and adaptive to customer needs, concerns, and feedback. Implementation of acquisition policies and procedures, as well as consideration of timeliness, quality, and cost throughout the process, must take into account the perspective of the user of the product or service.

3. When selecting contractors to provide products or perform services, the Government will use contractors who have a track record of successful past performance or who demonstrate a current superior ability to perform.

4. The Government must not hesitate to communicate with industry as early as possible in the acquisition cycle to help the Government determine the capabilities available in the marketplace. Government acquisition personnel are permitted and encouraged to engage in responsible and constructive exchanges with industry (e.g., see 10.002 and 15.201), so long as those exchanges are consistent with existing laws and regulations, and do not promote an unfair competitive advantage to particular firms.

5. The Government will maximize its use of commercial products and commercial services in meeting Government requirements.

6. It is the policy of the System to promote competition in the acquisition process.

7. The System must perform in a timely, high quality, and cost-effective manner.

8. All members of the Team are required to employ planning as an integral part of the overall process of acquiring products or services. Although advance planning is required, each member of the Team must be flexible in order to accommodate changing or unforeseen mission needs. Planning is a tool for the accomplishment of tasks, and application of its discipline should be commensurate with the size and nature of a given task.

(b) Minimize administrative operating costs.

1. In order to ensure that maximum efficiency is obtained, rules, regulations, and policies should be promulgated only when their benefits clearly exceed the costs of their development, implementation, administration, and enforcement. This applies to internal administrative processes, including reviews, and to rules and procedures applied to the contractor community.

2. The System must provide uniformity where it contributes to efficiency or where fairness or predictability is essential. The System should also, however, encourage innovation, and local adaptation where uniformity is not essential.

(c) Conduct business with integrity, fairness, and openness.

1. An essential consideration in every aspect of the System is maintaining the public’s trust. Not only must the System have integrity, but the actions of each member of the Team must reflect integrity, fairness, and openness. The foundation of integrity within the System is a competent, experienced, and well-trained, professional workforce. Accordingly, each member of the Team is responsible and accountable for the wise use of public resources as well as acting in a manner which maintains the public’s trust. Fairness and openness require open communication among team members, internal and external customers, and the public.

2. To achieve efficient operations, the System must shift its focus from "risk avoidance" to one of "risk management." The cost to the taxpayer of attempting to eliminate all risk is prohibitive. The Executive Branch will accept and manage the risk associated with empowering local procurement officials to take independent action based on their professional judgment.

3. The Government shall exercise discretion, use sound business judgment, and comply with applicable laws and regulations in dealing with contractors and prospective contractors. All contractors and prospective contractors shall be treated fairly and impartially but need not be treated the same.

(d) Fulfill public policy objectives. The System must support the attainment of public policy goals adopted by the Congress and the President. In attaining these goals, and in its overall operations, the process shall ensure the efficient use of public resources.

### References to Other Sections or Clauses

- **10.002**: Market research.
- **15.201**: Exchanges with industry before receipt of proposals.
- **42.302**: Contract administration functions.