In [1]:
from langchain.agents import Tool
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from pydantic import BaseModel, Field
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings

In [4]:
class DocumentInput(BaseModel):
    """Argument schema for the per-document question-answering tools."""

    # The description travels with the schema, so whoever fills in the tool
    # arguments can see what is expected in this field.
    question: str = Field(
        description="A natural-language question to answer from the document's contents."
    )

from genaicore import azure_gpt4_openai_text_chat_llm, azure_openai_text_embeddings_llm

# Chat model used by every RetrievalQA chain, and the embedding model used to
# index the chunks. Both are loop-invariant, so they are bound once up front
# instead of being re-imported and re-assigned on every iteration.
llm = azure_gpt4_openai_text_chat_llm
embeddings = azure_openai_text_embeddings_llm

# A single splitter serves all documents.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
# text_splitter = SemanticChunker(embeddings)

# PDFs to index. "name" becomes the tool name the agent calls; "path" is the PDF
# file (no directory component, so it is resolved against the working directory).
files = [
    # Approved methods for the modelling and assessment of air pollutants (per the file name).
    {
        "name": "Environment-Protection-Authority-in-New-South-Wales",
        "path": "22p3963-approved-methods-for-modelling-and-assessment-of-air-pollutants.pdf",
    },
    # Air quality assessment for 4567 Old Northern Road, Maroota (per the file name).
    {
        "name": "PF_Formation",
        "path": "Att 5. Air Quality Assessment 4567 Old Northern Road Maroota.pdf",
    },
]

# Rebuilt from scratch each time the cell runs, so re-running does not duplicate tools.
tools = []

for file in files:
    # Load the PDF, split it into chunks, and index the chunks in an in-memory FAISS store.
    loader = PyPDFLoader(file["path"])
    pages = loader.load_and_split()
    docs = text_splitter.split_documents(pages)

    retriever = FAISS.from_documents(docs, embeddings).as_retriever()

    # Wrap the retriever in a question-answering Tool. The chain object itself
    # is the tool callable; the recorded agent trace shows it returning a dict
    # with 'query' and 'result' entries.
    tools.append(
        Tool(
            args_schema=DocumentInput,
            name=file["name"],
            description=f"useful when you want to answer questions about {file['name']}",
            func=RetrievalQA.from_chain_type(llm=llm, retriever=retriever),
        )
    )

In [5]:
from langchain.agents import AgentType, initialize_agent,create_tool_calling_agent,AgentExecutor

In [6]:
# Chat model that drives the agent (the same object the indexing cell bound to `llm`).
llm = azure_gpt4_openai_text_chat_llm

# OpenAI-functions agent with one question-answering tool per indexed PDF.
# verbose=True makes the executor print its tool invocations as it runs.
agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.OPENAI_FUNCTIONS,
    verbose=True,
)
# from langchain.prompts import ChatPromptTemplate

# from langchain.prompts import ChatPromptTemplate

# prompt = ChatPromptTemplate.from_messages([
#     ("system", "You are an expert in environmental regulations and air pollution assessment. Your task is to analyze whether the given company follows the approved government methods for air pollution assessment in New South Wales."),
    
#     ("human", """The company name is: {company_name}

# The government-approved method for assessment is: {approved_method}

# Retrieve the following data using the available tools:
# 1. **Reported pollution values** from the company.
# 2. **Government-approved pollution standards** for each parameter.

# After retrieving the data, perform the following tasks:
# 1. Compare the company's reported pollution values with the government-approved standards.
# 2. Generate a structured table with:
#    - **Pollution Parameter**
#    - **Reported Value (from tool)**
#    - **Government Limit (from tool)**
#    - **Difference**
#    - **Pass/Fail Status**
# 3. Provide a detailed assessment report explaining whether the company complies with the environmental regulations.

# Ensure that all retrieved data is correctly interpreted before generating the comparison.
# """),
    
#     ("placeholder", "{chat_history}"),
    
#     ("placeholder", "{agent_scratchpad}")
# ])



# # Construct the Tools agent
# agent = create_tool_calling_agent(llm, tools, prompt)


  agent = initialize_agent(


In [7]:
# Question handed to the agent.
# NOTE: this name shadows the built-in input(); it is kept because the cell that
# runs the agent reads the question from this variable.
input = (
    "Compare the pf formation pollution report, which contains information "
    "regarding pollution parameters, with Environment Protection Authority "
    "in New South Wales. Identify which parameters passed and which failed "
    "with the values comparision show the comparision values differences. "
    "Additionally, provide me the detailed report."
)

In [131]:
# # Example inputs
# company_name = "Dixon Sand (Penrith) Pty Ltd"
# approved_method = "Environment Protection Authority"

# # Format the prompt with inputs
# formatted_prompt = prompt.format(company_name=company_name, approved_method=approved_method)

In [8]:
# Run the agent on the question. `invoke` is the supported entry point; calling
# the executor object directly goes through the deprecated Chain.__call__ path.
# The returned dict carries the final answer under "output".
result = agent.invoke({"input": input})

  result=agent(input)




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `Environment-Protection-Authority-in-New-South-Wales` with `{'question': 'Can you provide detailed parameters and their thresholds for pollution monitoring?'}`


[0m[36;1m[1;3m{'query': 'Can you provide detailed parameters and their thresholds for pollution monitoring?', 'result': "The provided context does not contain specific threshold values for pollution monitoring parameters. For detailed parameters and their thresholds, you can refer to resources such as the EPA's guidelines or specific environmental protection regulations. \n\nHowever, based on the given text, some pollutants and their relevant information for impact assessments include:\n\n1. **Sulfur Dioxide (SO2)**\n2. **Nitrogen Dioxide (NO2)**\n3. **Ozone (O3)**\n4. **Particulate Matter (PM2.5, PM10)**\n5. **Total Suspended Particulates (TSP)**\n6. **Deposited Dust**\n7. **Lead (Pb)**\n8. **Carbon Monoxide (CO)**\n9. **Hydrogen Fluoride (HF)**\n\nFor

In [9]:
# Show only the agent's final answer, stored under the "output" key of the result dict.
print(result["output"])

Based on the provided information from both the Environment Protection Authority (EPA) in New South Wales and the PF Formation pollution report, below is a comparison of the pollution parameters, indicating which parameters passed and which failed, along with their respective values.

### Comparison of Pollution Parameters

| **Parameter**              | **Standard** | **EPA Threshold**      | **PF Formation Report Value** | **Status** |
|----------------------------|--------------|------------------------|-----------------------------|------------|
| **Total Suspended Particulates (TSP)** | Annual       | 90 µg/m³                | *Provided in report query* | TBD        |
| **Particulate Matter (PM10)**            | 24-Hour     | 50 µg/m³                | *Provided in report query* | TBD        |
|                                        | Annual       | 25 µg/m³                | *Provided in report query* | TBD        |
| **Particulate Matter (PM2.5)**           | 24-Hour     | 25 µg/

In [153]:
# import fitz  # PyMuPDF for reading PDF
# from langchain.llms import OpenAI
# from langchain.agents import initialize_agent, AgentType
# from langchain.prompts import PromptTemplate
# from langchain.chains import LLMChain
# from langchain.document_loaders import TextLoader
# from azure.core.credentials import AzureKeyCredential
# from azure.ai.textanalytics import TextAnalyticsClient

# # Function to extract text from PDF using PyMuPDF
# def extract_text_from_pdf(pdf_path):
#     document = fitz.open(pdf_path)
#     text = ""
#     for page in document:
#         text += page.get_text("text")
#     return text

# from genaicore import azure_gpt4_openai_text_chat_llm
# llm=azure_gpt4_openai_text_chat_llm

# # Initialize the Langchain agent with the LLM
# def run_azure_openai(pdf_text):
#     # Process the PDF text
#     # prompt_template = """The following text is an environmental standard document. Extract all the parameters and their permissible limits in a detailed way dont leave any thing  extarct the all the table information in that input just extract from the the given input dont give outside information. Format your answer with the parameter name, followed by its limit and any other relevant information.

#     # Document: 
#     # {text}
#     # """
#     prompt_template = """The following text is an environmental standard document. Extract all the parameters and their permissible limits in a detailed way. Don't leave anything out, and also extract all the tables information in the input. Just extract from the given input, don't give outside information. Format your answer with the parameter name, followed by its limit and any other relevant information.

# Document: 
# {text}
# """

#     prompt = PromptTemplate(input_variables=["text"], template=prompt_template)
#     chain = LLMChain(llm=llm, prompt=prompt)
    
#     response = chain.run(pdf_text)
#     return response

# # Function to send extracted text to Azure LLM for analysis and retrieval
# def retrieve_information_from_pdf(pdf_path):
#     # Step 1: Extract text from PDF
#     pdf_text = extract_text_from_pdf(pdf_path)
    
#     # Step 2: Send extracted text to Azure OpenAI via Langchain for detailed analysis
#     analysis_response = run_azure_openai(pdf_text)
    
#     return analysis_response


In [156]:
import fitz  # PyMuPDF for reading PDF
from langchain.llms import OpenAI
from langchain.agents import initialize_agent, AgentType
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.document_loaders import TextLoader
from azure.core.credentials import AzureKeyCredential
from azure.ai.textanalytics import TextAnalyticsClient

def extract_text_from_pdf(pdf_path, chunk_size=20):
    """Read a PDF and return its plain text grouped into multi-page chunks.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.
        chunk_size: Number of consecutive pages whose text is merged into one
            chunk. Must be at least 1.

    Returns:
        A list of strings in page order; each string is the concatenated text
        of up to ``chunk_size`` pages. A document with no pages yields ``[]``.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be at least 1, got {chunk_size}")

    chunks = []
    # The context manager closes the document even if text extraction fails.
    with fitz.open(pdf_path) as document:
        num_pages = len(document)
        for start in range(0, num_pages, chunk_size):
            stop = min(start + chunk_size, num_pages)
            # Pages are loaded one at a time instead of slicing the document,
            # and joined once instead of growing a string with +=.
            chunks.append(
                "".join(
                    document.load_page(page_number).get_text("text")
                    for page_number in range(start, stop)
                )
            )

    return chunks

# Chat model handle from genaicore; run_azure_openai passes it to LLMChain.
from genaicore import azure_gpt4_openai_text_chat_llm

llm = azure_gpt4_openai_text_chat_llm

# Single-prompt extraction step: no agent is involved, only a prompt and the LLM.
def run_azure_openai(pdf_text):
    """Ask the LLM to list the parameters, limits and tables found in ``pdf_text``.

    Args:
        pdf_text: Text extracted from the PDF; substituted for ``{text}`` in the prompt.

    Returns:
        Whatever ``LLMChain.run`` returns for the filled-in prompt.
    """
    template_text = (
        "The following text is an environmental standard document. "
        "Extract all the parameters and their permissible limits in a detailed way. "
        "Don't leave anything out, and also extract all the tables information "
        "in table format in the input. "
        "Just extract from the given input, don't give outside information. "
        "Format your answer with the parameter name, followed by its limit "
        "and any other relevant information.\n"
        "\n"
        "Document: \n"
        "{text}\n"
    )
    extraction_prompt = PromptTemplate(input_variables=["text"], template=template_text)
    extraction_chain = LLMChain(llm=llm, prompt=extraction_prompt)

    return extraction_chain.run(pdf_text)

def retrieve_information_from_pdf(pdf_path, chunk_size=None):
    """Extract a PDF's text in page chunks, run each chunk through the LLM, and join the answers.

    Args:
        pdf_path: Path of the PDF file to process.
        chunk_size: Optional number of pages per chunk. When omitted, the
            default of ``extract_text_from_pdf`` is used.

    Returns:
        The per-chunk LLM responses joined with newlines, in document order.
        Chunks that contain no text are skipped.
    """
    # Step 1: extract the text in chunks, forwarding chunk_size only when it was given.
    if chunk_size is None:
        pdf_chunks = extract_text_from_pdf(pdf_path)
    else:
        pdf_chunks = extract_text_from_pdf(pdf_path, chunk_size=chunk_size)

    # Step 2: send each chunk to the LLM. A chunk with no text has nothing to
    # extract, so it is skipped rather than sent as an empty document.
    analysis_responses = [
        run_azure_openai(chunk) for chunk in pdf_chunks if chunk.strip()
    ]

    # Step 3: concatenate all responses.
    return "\n".join(analysis_responses)


In [158]:

# Example: run the chunked extraction over the approved-methods PDF and print the answer.
# NOTE: this rebinds `result`, which earlier in the notebook held the agent's answer dict.
pdf_path = "22p3963-approved-methods-for-modelling-and-assessment-of-air-pollutants.pdf"
result = retrieve_information_from_pdf(pdf_path)

print("Extracted Information from PDF and Processed by LLM:", result, sep="\n")


Extracted Information from PDF and Processed by LLM:
### Extracted Parameters and Their Permissible Limits

#### Tables Information

**Table 1: Stack Source Release Parameters**

| Source      | Release Type | Stack Height (m) | Exit Temp. (°C) | Exit Diameter (m) | Exit Velocity (m/s) | Oxygen Conc. (%) | Moisture Content (%) | Flow Rate (Am³/s) | Flow Rate (Nm³/s) |
|-------------|--------------|------------------|-----------------|-------------------|---------------------|------------------|----------------------|-------------------|-------------------|
| Boiler No.1 | Wake-affected | 20               | 150             | 4                 | 15                  | 10               | 15                   | 188.5             | 103.4             |

**Table 2: Stack Emission Concentrations and Regulation Limits**

| Pollutant     | Emission Rate (g/s) | Emission Concentration (mg/Am³) | Corrected Emission Concentration (mg/Nm³ at stack reference conditions) | Regulation Emission Concentra

In [150]:
# Quick check of what `result` holds at this point; the recorded output of this cell is `str`.
type(result)

str

In [159]:
# Output file for the extracted text (no directory component, so it lands in the working directory).
file_path = "Environment_standards.txt"

# Write `result` as UTF-8, overwriting any existing file of the same name.
with open(file_path, mode="w", encoding="utf-8") as file:
    file.write(result)

print(f"String saved to {file_path}")


String saved to Environment_standards.txt
