In [6]:
# We will be using these PDF loaders but you can check out other loaded documents
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.document_loaders import UnstructuredPDFLoader

# This is the name of the report which should be in the directory
# You can download the precise PDF I am using from here https://www.pc.gov.pk/uploads/archives/PSDP_2023-24.pdf
name = 'Att 5. Air Quality Assessment 4567 Old Northern Road Maroota.pdf'

# This loader uses PyMuPDF
loader_py = PyMuPDFLoader(name)

#This loader uses Unstructured
loader_un = UnstructuredPDFLoader(name)

# Storing the loaded documents as langChain Document object
pages_py = loader_py.load()

# pages_un = loader_un.load()

In [7]:
# text splitter
from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(
    # shows how to seperate
    separator="\n",
    # Shows the document token length
    chunk_size=1000,
    # How much overlap should exist between documents
    chunk_overlap=150,
    # How to measure length
    length_function=len
)

# Applying the splitter
docs = text_splitter.split_documents(pages_py)

docs

[Document(metadata={'producer': 'Microsoft® Word 2013', 'creator': 'Microsoft® Word 2013', 'creationdate': '2017-09-29T07:53:36+10:00', 'source': 'Att 5. Air Quality Assessment 4567 Old Northern Road Maroota.pdf', 'file_path': 'Att 5. Air Quality Assessment 4567 Old Northern Road Maroota.pdf', 'total_pages': 63, 'format': 'PDF 1.5', 'title': 'Air Quality Assessment – Lot 1 DP 590937 AT 4567 Old Northern Road, Maroota Proposed Sand Quarry', 'author': 'Deanna Tuxford', 'subject': 'PF Formation', 'keywords': '', 'moddate': '2018-05-28T14:58:20+10:00', 'trapped': '', 'modDate': "D:20180528145820+10'00'", 'creationDate': "D:20170929075336+10'00'", 'page': 0}, page_content='Report \n \nAir Quality Assessment – Lot 1 DP 590937 AT 4567 Old Northern \nRoad, Maroota Proposed Sand Quarry\nPF Formation \nJob ID. 20477 \n28 September 2017 \nATTACHMENT 5'),
 Document(metadata={'producer': 'Microsoft® Word 2013', 'creator': 'Microsoft® Word 2013', 'creationdate': '2017-09-29T07:53:36+10:00', 'source'

In [1]:
# a simple function that removes \n newline from the content
def remove_ws(d):
    text = d.page_content.replace('\n','')
    d.page_content = text
    return d

# applied on the docs
docs = [remove_ws(d) for d in docs]

NameError: name 'docs' is not defined

In [9]:
from genaicore import azure_openai_text_embeddings_llm
from langchain_community.vectorstores import FAISS
embeddings=azure_openai_text_embeddings_llm
# Creates the document retriever using docs and embeddings
db = FAISS.from_documents(docs, embeddings)



# Asking the retriever to do similarity search based on Query
query = """Extract the following information from the attached PDF report, starting from page 31 (onwards) under the section 'Annual Average Concentrations':

All environment parameters such as TSP, SO2, NO2, O3, Pb, PM2.5, PM10, deposited dust, CO, and HF.

The values for each parameter.

The dates when these values were measured.

Any relevant notes related to these measurements.

Please also check for cumulative values where applicable.

Additionally, from the section starting on page 93, extract the values for each residential location (R1 to R20) and check if any of the values mentioned under R1 to R20 exceed the following limits:

50 (24 hours for PM10) and 25 (Annual for PM10)

25 (24 hours for PM2.5) and 8 (Annual for PM2.5)

90 (Annual for TSP)

2 (Annual Increment for Dust Deposition) and 4 (Annual Cumulative for Dust Deposition)

The units of measurement (UOM) for these parameters are as follows:

For Dust Deposited: g/m²/day

For others: µg/m³

Please provide a comprehensive summary of your findings."""
answer = db.similarity_search(query)

# Building the retriever
retriever = db.as_retriever(search_kwargs={'k': 3})

In [15]:
def load_text_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return text

# Example usage
file_path = 'Environment_standards.txt'
standards = load_text_from_file(file_path)


In [None]:
#Imports needed for the code to work.
#Using a simple output parser and chat prompt template
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings


# This is the prompt used
template = """

You are a information retrieval AI. Format the retrieved information as a table or text


Use only the context for your answers, do not make up information

query: {query}

{context} 
"""

# Converts the prompt into a prompt template
prompt = ChatPromptTemplate.from_template(template)
#Using OpenAI model, by default gpt 3.5 Turbo
model = ChatOpenAI(api_key =api_key)

# Construction of the chain
chain = (
# The initial dictionary uses the retriever and user supplied query
    {"context":retriever,
     "query":RunnablePassthrough()}
# Feeds that context and query into the prompt then model & lastly 
# uses the ouput parser, do query for the data.
    |  prompt  | model | StrOutputParser()
 
)

In [29]:

# Example usage
result = chain.invoke({"query": "hi"})
print(result)

TypeError: argument 'text': 'dict' object cannot be converted to 'PyString'