In [1]:
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from genaicore import azure_openai_text_embeddings_llm
from langchain_experimental.text_splitter import SemanticChunker
# Embedding model, used for semantic chunking here and for the FAISS index later
embeddings = azure_openai_text_embeddings_llm

# Load the report (ensure the file is in the working directory)
pdf_name = "Att 5. Air Quality Assessment 4567 Old Northern Road Maroota.pdf"
loader = PyMuPDFLoader(pdf_name)
pages = loader.load()

# Split the pages into chunks with the embedding-based semantic chunker
text_splitter = SemanticChunker(embeddings)
docs = text_splitter.split_documents(pages)

# Flatten line breaks inside each chunk.
# Each newline becomes a space instead of being deleted: deleting it fuses the
# last token of one line with the first token of the next (e.g. "5\n14" -> "514"),
# which corrupts the numeric values the later prompts are asked to extract.
for doc in docs:
    doc.page_content = doc.page_content.replace("\n", " ")


In [11]:
from langchain_community.vectorstores import FAISS



# Create a FAISS vector database from the cleaned chunks
db = FAISS.from_documents(docs, embeddings)

# Build a retriever that returns the 20 most similar chunks.
# The former "semantic_search": True entry was dropped: it is not an option that
# FAISS similarity search reads, so it had no effect and only suggested a mode
# switch that does not exist.
retriever = db.as_retriever(search_kwargs={"k": 20})


In [12]:
# Markdown table of the assessment criteria (24-hour and annual limits per
# parameter). It is substituted into the {standards} slot of the report prompt.
standards="""
| Parameter | 24-hour Standard | Annual Standard |
|-----------|------------------|-----------------|
| PM10      | 50 µg/m³         | 25 µg/m³        |
| PM2.5     | 25 µg/m³         | 8 µg/m³         |
| TSP       |     -            | 90 µg/m³        |"""

In [13]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from genaicore import azure_gpt4_openai_text_chat_llm
# Chat model used for both the report-generation call and the follow-up review call
llm = azure_gpt4_openai_text_chat_llm

In [72]:
# Retrieval query: receptor locations plus the predicted concentration and
# deposition figures the report table is built from
query_sensitive_receptors ='''Retrieve receptor locations along with 
Extract the following predicted concentrations and deposition levels:
- Predicted Annual Average TSP Concentrations
- Predicted Annual Average PM10 Concentrations
- Predicted Annual Average PM2.5 Concentrations
- Predicted Annual Average Dust Deposition Levels
- Maximum predicted 24-Hour Average PM10 concentrations
- Maximum predicted 24-Hour Average PM2.5 concentrations'''

# Fetch the matching chunks and stitch their text into one context string,
# separating chunks with a blank line
retrieved_docs_receptors = retriever.invoke(query_sensitive_receptors)
retrieved_text_receptors = "\n\n".join(
    chunk.page_content for chunk in retrieved_docs_receptors
)

# Define structured prompt
# template_sensitive_receptors ="""You are an AI environmental analyst. Your task is to analyze the retrieved environmental data and present it in a structured format. The response should be written in **Markdown** format.
# ### **Context: Retrieved Environmental Data**
# {context}

# ### **Environmental Standards**
# {standards}



# now for below for the annual take only cummalative and and for the 24 hr take only worst to fill that values for respective PM10,P2.5,TSP there have there own value fill those only from the above context and compare the value with the standards and pass if there meet the standa fail if not and mention in the observatiob what parameter are not meet tge standards and at each receptor
# if the parameters values are less than the standards then there "yes" if it greater or equal to the standard value then its "NO" consider only the values
# ### ** out put format look like this Report Heading**
# [Mention extrated company name ] - Environmental Impact Assessment

# ### **Criteria Applied Metrics**
# | Parameter | 24-hour Standard | Annual Standard |
# |-----------|------------------|-----------------|
# | PM10      | 50 µg/m³         | 25 µg/m³        |
# | PM2.5     | 25 µg/m³         | 8 µg/m³         |
# | TSP       | 90 µg/m³         | -               |

# ### **Impact Assessment Criteria Applied**

# | Receptors |       PM10       |                  PM2.5                     |    TSP  | Meets Standards? | Observation                 |
# |           |------------------|-----------------------------------------   |         |                  |                            |
# |           | 24-hour | Annual | 24-hour         | Annual                   | Annual  |                  |                             |
# |-----------|---------|--------|-----------------|----------------          |---------|------------------|-----------------------------|
# | R1        | Value   | Value  | Value           | Value                    | Value   | ✅ Yes           | High cumulative PM2.5       |
# | R2        | Value   | Value  | Value           | Value                    | Value   | ❌ No            | Exceeds PM10 criteria cumulatively |
# .
# .
# ### **Conclusion**
# Summarize the findings and highlight any exceedances or concerns.
# """
# template_sensitive_receptors = """
# You are an AI environmental analyst. Your task is to analyze the retrieved environmental data and present it in a structured format. The response should be written in **Markdown** format.

# ---

# ### **Context: Retrieved Environmental Data**  
# {context}

# ### **Environmental Standards**  
# {standards}

# ---

# ### **Instructions for Data Analysis**  
# - For **annual values**, consider only the **cumulative** values.  
# - For **24-hour values**, take only the **worst-case values**.  
# - Extract the values for **PM10, PM2.5, and TSP** from the provided context.  
# - Compare the extracted values against the given environmental standards.  
# - If the all the  parameters value is **less than** the standard, mark it as **"Yes"** (✅).  
# - If the any single parameter value is **equal  ** the standard, mark it as **"No"** (❌).  
# - If the any single parameter value is **greater than ** the standard, mark it as **"No"** (❌).  
# - In the **Observations** column, write short observation on the parameter values and mention the parameters that exceed the standards at each receptor and short observation of the which parameter are close(diference is 1-5) to standards and going to be problematic

# ---
# output format exatcly like this and should be written in **Markdown** format. - 

# ## **Report Heading**  
# **[Extracted Company Name] - Environmental Impact Assessment**  

# ### **Criteria Applied Metrics**  

# | Parameter | 24-hour Standard | Annual Standard |
# |-----------|------------------|-----------------|
# | PM10      | 50 µg/m³         | 25 µg/m³        |
# | PM2.5     | 25 µg/m³         | 8 µg/m³         |
# | TSP       | -                | 90 µg/m³        |

# ---

# ### **Impact Assessment Criteria Applied**  
# |           |       PM10       |        PM2.5     |   TSP  |                  |                         |
# | Receptors |------------------|------------------|--------| Meets Standards? |        Observation      |
# |           | 24-hour | Annual | 24-hour | Annual | Annual |                  |                         |
# |-----------|---------|--------|---------|--------|--------|------------------|-------------------------|
# |    R1     |  Value  |  Value |  Value  |  Value |  Value |  ✅ Yes / ❌ No | Mention exceedances if any |
# |    R2     |  Value  |  Value |  Value  |  Value |  Value |  ✅ Yes / ❌ No | Mention exceedances if any |
# |    ...    |   ...   |  ...   |   ...   |  ...   |  ...   |       ...        |           ...           |

# ### **Conclusion**  
# Summarize the findings, highlighting any exceedances or concerns.
# """
# Report-generation prompt. Slots: {context} (retrieved report text) and
# {standards} (criteria table).
# The "Impact Assessment Criteria Applied" skeleton uses ONE header row with one
# column per value. The previous skeleton stacked two header rows (6 cells, then
# 8 cells) above the delimiter row, which is not a valid pipe table; the model
# copied that shape and produced tables whose header and data rows had different
# cell counts.
template_sensitive_receptors="""You are an AI environmental analyst. Your task is to analyze the retrieved environmental data and present it in a structured format. The response should be written in **Markdown** format.

---

### **Context: Retrieved Environmental Data**  
{context}

### **Environmental Standards**  
{standards}

---

### **Instructions for Data Analysis**  
1. For **annual values**, consider only the **cumulative** values.  
2. For **24-hour values**, take only the **worst-case values**.  
3. Extract the values for **PM10, PM2.5, and TSP** from the provided context.  
4. Compare the extracted values against the given environmental standards.  
5. Use the following rules for marking compliance for Meets:  
   - If **all parameter values** are **less than** the standard, mark it as **"Yes"** (✅).  
   - If **any single parameter value** is **equal to** or **greater than** the standard, mark it as **"No"** (❌).  
6. In the **Observations** column:  
   - Observation on the paramters and standards in short
   - Mention the parameters that **exceed** the standards.  
   - Highlight parameters that are **close to the standards or equal** (difference of 1-2 units) and could become problematic.  

---

### **Output Format**  
The output must follow the exact structure below and be written in **Markdown** format:

# **[Extracted Company Name] - Environmental Impact Assessment**  

### **Criteria Applied Metrics**  

| Parameter | 24-hour Standard | Annual Standard |
|-----------|------------------|-----------------|
| PM10      | 50 µg/m³         | 25 µg/m³        |
| PM2.5     | 25 µg/m³         | 8 µg/m³         |
| TSP       | -                | 90 µg/m³        |

---

### **Impact Assessment Criteria Applied**  

| Receptors           | PM10 24-hour | PM10 Annual | PM2.5 24-hour | PM2.5 Annual | TSP Annual | Meets | Observation                         |
|---------------------|--------------|-------------|---------------|--------------|------------|-------|-------------------------------------|
| R1                  | Value        | Value       | Value         | Value        | Value      |       | High cumulative PM2.5               |
| R2                  | Value        | Value       | Value         | Value        | Value      |       | Exceeds PM10 criteria cumulatively  |
.
.
---
"""

# Turn the template text into a chat prompt object
prompt_sensitive_receptors = ChatPromptTemplate.from_template(template_sensitive_receptors)

# Run the steps one at a time: fill the template slots, then call the model
prompt_inputs_receptors = {
    "standards": standards,
    "context": retrieved_text_receptors,
}
formatted_prompt_sensitive_receptors = prompt_sensitive_receptors.invoke(prompt_inputs_receptors)
response_sensitive_receptors = llm.invoke(formatted_prompt_sensitive_receptors)

# Parse the model response into a string
output_parser_receptors = StrOutputParser()
parsed_response_sensitive_receptors = output_parser_receptors.invoke(response_sensitive_receptors)

# Show the generated Markdown report
print(parsed_response_sensitive_receptors)


# **PF Formation - Environmental Impact Assessment**  

### **Criteria Applied Metrics**  

| Parameter | 24-hour Standard | Annual Standard |
|-----------|------------------|-----------------|
| PM10      | 50 µg/m³         | 25 µg/m³        |
| PM2.5     | 25 µg/m³         | 8 µg/m³         |
| TSP       | -                | 90 µg/m³        |

---

### **Impact Assessment Criteria Applied**  

| Receptors           |     PM10 (µg/m³)       |     PM2.5 (µg/m³)       |     TSP (µg/m³)    | Meets |                 Observation                 |
|---------------------|------------------------|-------------------------|--------------------|-------|----------------------------------------------|
|                     | 24-hour | Annual       | 24-hour | Annual        | Annual             |       |                                              |
| R1                  |  5      |  14          |  3      |  5            |  34                | ✅    | No exceedance observed                      |


In [75]:
# Review prompt for the second model call. The generated report is substituted
# into {report}; the text asks the model to re-check only the "Meets" column
# (and the observations) against the stated rules and to return just the
# corrected Markdown.
prompt="""
check the report provided in the Impact Assessment Criteria Applied table check the Meets dont change any values only you want to chnage meets
 Use the following rules for marking compliance for Meets:  
   - If **all parameter values** are **less than** the standard, mark it as **"Yes"** (✅).  
   - If **any single parameter value** is **equal to** or **greater than** the standard, mark it as **"No"** (❌).  
 In the **Observations** column:  
   - Observation on the paramters and standards in short
   - Mention the parameters that **exceed** the standards.  
   - Highlight parameters that are **close to the standards or equal** (difference of 1-2 units) and could become problematic.  
{report}


after all this after editiong the mistakes give me the extact markdow format after updated dont provide extra text or explanation other than the text in the report"""

In [76]:
# Build the review prompt from the review instructions
receptors = ChatPromptTemplate.from_template(prompt)

# Fill the {report} slot with the first-pass report.
# The filled prompt gets its own name: `prompt_sensitive_receptors` already holds
# the ChatPromptTemplate created in the report-generation cell, and rebinding it
# to a prompt value here made that name mean different things depending on
# which cell ran last.
review_prompt_value = receptors.invoke(
    {"report": parsed_response_sensitive_receptors}
)
response_receptors = llm.invoke(review_prompt_value)

# Parse the response into a string
parsed_sensitive_receptors = StrOutputParser().invoke(response_receptors)

# Print result
print(parsed_sensitive_receptors)


# **PF Formation - Environmental Impact Assessment**  

### **Criteria Applied Metrics**  

| Parameter | 24-hour Standard | Annual Standard |
|-----------|------------------|-----------------|
| PM10      | 50 µg/m³         | 25 µg/m³        |
| PM2.5     | 25 µg/m³         | 8 µg/m³         |
| TSP       | -                | 90 µg/m³        |

---

### **Impact Assessment Criteria Applied**  

| Receptors           |     PM10 (µg/m³)       |     PM2.5 (µg/m³)       |     TSP (µg/m³)    | Meets |                 Observation                 |
|---------------------|------------------------|-------------------------|--------------------|-------|----------------------------------------------|
|                     | 24-hour | Annual       | 24-hour | Annual        | Annual             |       |                                              |
| R1                  |  5      |  14          |  3      |  5            |  34                | ✅    | No exceedance observed                      |


In [77]:
import os
import subprocess
# Save markdown content to file
def save_markdown_to_file(documentation: str, file_path: str):
    """Write ``documentation`` to ``file_path`` as UTF-8 text, then print a confirmation.

    Args:
        documentation: Text to store (the Markdown report in this notebook).
        file_path: Destination path; the file is opened in write mode, so any
            existing content is replaced.
    """
    with open(file_path, mode='w', encoding='utf-8') as sink:
        sink.write(documentation)

    confirmation = f"Markdown content has been saved to {file_path}"
    print(confirmation)

# Convert Markdown file to .docx using pandoc
def convert_md_to_docx(md_file: str, docx_file: str, template_file: str = None):
    """Convert a Markdown file to .docx by invoking the ``pandoc`` command.

    Args:
        md_file: Path of the Markdown input.
        docx_file: Path of the .docx output.
        template_file: Optional reference document; when given it is passed to
            pandoc via ``--reference-doc``.

    A non-zero pandoc exit status is reported with a printed message and is
    not re-raised, so callers cannot tell failure from success by the return
    value (always ``None``).
    """
    reference_args = ["--reference-doc", template_file] if template_file else []
    pandoc_cmd = ["pandoc", md_file, "-o", docx_file, *reference_args]

    try:
        subprocess.run(pandoc_cmd, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error during conversion: {e}")
    else:
        print(f"Successfully converted {md_file} to {docx_file}")

In [78]:
# Output locations for the reviewed report
markdown_file_path = 'Ensuring Appropriate Criteria at Sensitive Receptors.md'
docx_file_path = "Ensuring Appropriate Criteria at Sensitive Receptors.docx"

# Write the Markdown to disk, then have pandoc render it as .docx
save_markdown_to_file(parsed_sensitive_receptors, markdown_file_path)
convert_md_to_docx(md_file=markdown_file_path, docx_file=docx_file_path)

Markdown content has been saved to Ensuring Appropriate Criteria at Sensitive Receptors.md
Successfully converted Ensuring Appropriate Criteria at Sensitive Receptors.md to Ensuring Appropriate Criteria at Sensitive Receptors.docx


In [79]:
from docx2pdf import convert



# Source .docx and target .pdf for the final conversion step
input_file = "Ensuring Appropriate Criteria at Sensitive Receptors.docx"
output_file = "Ensuring Appropriate Criteria at Sensitive Receptors.pdf"

# Convert the Word document to PDF with docx2pdf
convert(input_file, output_file)

print("Conversion complete!")


100%|██████████| 1/1 [00:17<00:00, 17.50s/it]

Conversion complete!





In [1]:
import base64

def generate_air_quality_report(pdf_filepath: str, output_name: str = "Air Quality Report"):
    """
    Generate an air quality assessment report from a PDF file and return it as base64 string.

    Steps: load the PDF, chunk it, index the chunks in FAISS, retrieve the chunks
    matching ``query_sensitive_receptors``, draft the report with
    ``template_sensitive_receptors``, run the review ``prompt`` over the draft,
    then convert Markdown -> .docx (pandoc) -> .pdf (docx2pdf) and encode the PDF.

    Relies on notebook-level names defined in earlier cells (``text_splitter``,
    ``embeddings``, ``llm``, ``standards``, the prompt strings, the helper
    functions and the imported loaders/converters), so those cells must have run.

    Args:
        pdf_filepath (str): Path to the input PDF file containing air quality data
        output_name (str): Base name for output files (without extension)

    Returns:
        str: Base64 encoded PDF content, or None if any step fails (the error
        is printed, not raised).
    """
    md_path = f"{output_name}.md"
    docx_path = f"{output_name}.docx"
    pdf_path = f"{output_name}.pdf"

    # Intermediate files this call has started writing. They are removed in
    # ``finally`` so a failure part-way through does not leave them behind,
    # and files this call never touched are never deleted.
    created_paths = []

    try:
        # Load and process the PDF
        loader = PyMuPDFLoader(pdf_filepath)
        pages = loader.load()

        # Split into chunks using semantic chunker
        docs = text_splitter.split_documents(pages)

        # Flatten line breaks. Replace each newline with a space rather than
        # deleting it, otherwise tokens on adjacent lines fuse ("5\n14" -> "514").
        for doc in docs:
            doc.page_content = doc.page_content.replace("\n", " ")

        # Create vector database; retrieve the 20 most similar chunks.
        # ("semantic_search" was dropped: FAISS similarity search does not read it.)
        db = FAISS.from_documents(docs, embeddings)
        retriever = db.as_retriever(search_kwargs={"k": 20})

        # Get relevant information
        retrieved_docs_receptors = retriever.invoke(query_sensitive_receptors)
        retrieved_text_receptors = "\n\n".join(
            doc.page_content for doc in retrieved_docs_receptors
        )

        # Generate initial report
        prompt_sensitive_receptors = ChatPromptTemplate.from_template(template_sensitive_receptors)
        formatted_prompt = prompt_sensitive_receptors.invoke(
            {"context": retrieved_text_receptors, "standards": standards}
        )
        response = llm.invoke(formatted_prompt)
        parsed_response = StrOutputParser().invoke(response)

        # Verify and update the report
        receptors = ChatPromptTemplate.from_template(prompt)
        prompt_verify = receptors.invoke({"report": parsed_response})
        response_verify = llm.invoke(prompt_verify)
        final_report = StrOutputParser().invoke(response_verify)

        # Save markdown
        created_paths.append(md_path)
        save_markdown_to_file(final_report, md_path)

        # Convert to docx
        created_paths.append(docx_path)
        convert_md_to_docx(md_path, docx_path)
        # convert_md_to_docx only prints when pandoc exits non-zero, so confirm
        # the .docx really exists before handing it to the PDF converter.
        if not os.path.exists(docx_path):
            raise RuntimeError(f"pandoc did not produce {docx_path}")

        # Convert to PDF
        created_paths.append(pdf_path)
        convert(docx_path, pdf_path)

        # Read and encode PDF as base64
        with open(pdf_path, "rb") as pdf_file:
            encoded_pdf = base64.b64encode(pdf_file.read()).decode('utf-8')

        return encoded_pdf

    except Exception as e:
        print(f"Error generating report: {str(e)}")
        return None

    finally:
        # Clean up temporary files on success and on failure. A cleanup problem
        # is reported but must not discard an already-encoded PDF.
        for temp_path in created_paths:
            try:
                if os.path.exists(temp_path):
                    os.remove(temp_path)
            except OSError as cleanup_error:
                print(f"Could not remove temporary file {temp_path}: {cleanup_error}")

In [None]:
import os
import subprocess
import base64
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from genaicore import azure_openai_text_embeddings_llm, azure_gpt4_openai_text_chat_llm
from docx2pdf import convert

# Global definitions
# Embedding model shared by the semantic chunker below and by the FAISS index
# built inside generate_air_quality_report.
embeddings = azure_openai_text_embeddings_llm
text_splitter = SemanticChunker(embeddings)

# Markdown table of the assessment criteria; substituted into the {standards}
# slot of template_sensitive_receptors.
standards = """
| Parameter | 24-hour Standard | Annual Standard |
|-----------|------------------|-----------------|
| PM10      | 50 µg/m³         | 25 µg/m³        |
| PM2.5     | 25 µg/m³         | 8 µg/m³         |
| TSP       |     -            | 90 µg/m³        |"""

# Retrieval query sent to the vector-store retriever to pull the receptor-level
# predictions out of the indexed report.
query_sensitive_receptors = '''Retrieve receptor locations along with 
Extract the following predicted concentrations and deposition levels:
- Predicted Annual Average TSP Concentrations
- Predicted Annual Average PM10 Concentrations
- Predicted Annual Average PM2.5 Concentrations
- Predicted Annual Average Dust Deposition Levels
- Maximum predicted 24-Hour Average PM10 concentrations
- Maximum predicted 24-Hour Average PM2.5 concentrations'''

# Report-generation prompt. Slots: {context} (retrieved report text) and
# {standards} (criteria table).
# The "Impact Assessment Criteria Applied" skeleton uses ONE header row with one
# column per value. The previous skeleton stacked two header rows (6 cells, then
# 8 cells) above the delimiter row, which is not a valid pipe table and leads
# the model to emit tables whose header and data rows differ in cell count.
template_sensitive_receptors = """You are an AI environmental analyst. Your task is to analyze the retrieved environmental data and present it in a structured format. The response should be written in **Markdown** format.

---

### **Context: Retrieved Environmental Data**  
{context}

### **Environmental Standards**  
{standards}

---

### **Instructions for Data Analysis**  
1. For **annual values**, consider only the **cumulative** values.  
2. For **24-hour values**, take only the **worst-case values**.  
3. Extract the values for **PM10, PM2.5, and TSP** from the provided context.  
4. Compare the extracted values against the given environmental standards.  
5. Use the following rules for marking compliance for Meets:  
    - If **all parameter values** are **less than** the standard, mark it as **"Yes"** (✅).  
    - If **any single parameter value** is **equal to** or **greater than** the standard, mark it as **"No"** (❌).  
6. In the **Observations** column:  
    - Observation on the paramters and standards in short  
    - Mention the parameters that **exceed** the standards.  
    - Highlight parameters that are **close to the standards or equal** (difference of 1-2 units) and could become problematic.  

---

### **Output Format**  
The output must follow the exact structure below and be written in **Markdown** format:

# **[Extracted Company Name] - Environmental Impact Assessment**  

### **Criteria Applied Metrics**  

| Parameter | 24-hour Standard | Annual Standard |
|-----------|------------------|-----------------|
| PM10      | 50 µg/m³         | 25 µg/m³        |
| PM2.5     | 25 µg/m³         | 8 µg/m³         |
| TSP       | -                | 90 µg/m³        |

---

### **Impact Assessment Criteria Applied**  

| Receptors           | PM10 24-hour | PM10 Annual | PM2.5 24-hour | PM2.5 Annual | TSP Annual | Meets | Observation                         |
|---------------------|--------------|-------------|---------------|--------------|------------|-------|-------------------------------------|
| R1                  | Value        | Value       | Value         | Value        | Value      |       | High cumulative PM2.5               |
| R2                  | Value        | Value       | Value         | Value        | Value      |       | Exceeds PM10 criteria cumulatively  |
...
---
"""

# Review prompt for the second model call. The generated report is substituted
# into {report}; the text asks the model to re-check only the "Meets" column
# (and the observations) against the stated rules and to return just the
# corrected Markdown.
prompt = """
check the report provided in the Impact Assessment Criteria Applied table check the Meets dont change any values only you want to chnage meets
Use the following rules for marking compliance for Meets:  
    - If **all parameter values** are **less than** the standard, mark it as **"Yes"** (✅).  
    - If **any single parameter value** is **equal to** or **greater than** the standard, mark it as **"No"** (❌).  
In the **Observations** column:  
    - Observation on the paramters and standards in short
    - Mention the parameters that **exceed** the standards.  
    - Highlight parameters that are **close to the standards or equal** (difference of 1-2 units) and could become problematic.  
{report}


after all this after editiong the mistakes give me the extact markdow format after updated dont provide extra text or explanation other than the text in the report
"""

def save_markdown_to_file(documentation: str, file_path: str):
    """Write ``documentation`` to ``file_path`` as UTF-8 text, then print a confirmation.

    Args:
        documentation: Text to store (the Markdown report).
        file_path: Destination path; the file is opened in write mode, so any
            existing content is replaced.
    """
    with open(file_path, mode='w', encoding='utf-8') as sink:
        sink.write(documentation)

    confirmation = f"Markdown content has been saved to {file_path}"
    print(confirmation)

def convert_md_to_docx(md_file: str, docx_file: str, template_file: str = None):
    """Convert a Markdown file to .docx by invoking the ``pandoc`` command.

    Args:
        md_file: Path of the Markdown input.
        docx_file: Path of the .docx output.
        template_file: Optional reference document; when given it is passed to
            pandoc via ``--reference-doc``.

    A non-zero pandoc exit status is reported with a printed message and is
    not re-raised, so callers cannot tell failure from success by the return
    value (always ``None``).
    """
    reference_args = ["--reference-doc", template_file] if template_file else []
    pandoc_cmd = ["pandoc", md_file, "-o", docx_file, *reference_args]

    try:
        subprocess.run(pandoc_cmd, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error during conversion: {e}")
    else:
        print(f"Successfully converted {md_file} to {docx_file}")

def generate_air_quality_report(pdf_filepath: str, output_name: str = "Air Quality Report") -> str:
    """
    Generate an air quality assessment report from a PDF file and return it as a base64 string.

    Steps: load the PDF, chunk it, index the chunks in FAISS, retrieve the chunks
    matching ``query_sensitive_receptors``, draft the report with
    ``template_sensitive_receptors``, run the review ``prompt`` over the draft,
    then convert Markdown -> .docx (pandoc) -> .pdf (docx2pdf) and encode the PDF.

    Args:
        pdf_filepath (str): Path to the input PDF file containing air quality data.
        output_name (str): Base name for output files (without extension).
    Returns:
        str: Base64 encoded PDF content. Note that None is returned instead when
        any step fails (the error is printed, not raised).
    """
    md_path = f"{output_name}.md"
    docx_path = f"{output_name}.docx"
    pdf_path = f"{output_name}.pdf"

    # Intermediate files this call has started writing. They are removed in
    # ``finally`` so a failure part-way through does not leave them behind,
    # and files this call never touched are never deleted.
    created_paths = []

    try:
        # Load and process the PDF
        loader = PyMuPDFLoader(pdf_filepath)
        pages = loader.load()

        # Split into chunks using the semantic chunker
        docs = text_splitter.split_documents(pages)

        # Flatten line breaks. Replace each newline with a space rather than
        # deleting it, otherwise tokens on adjacent lines fuse ("5\n14" -> "514").
        for doc in docs:
            doc.page_content = doc.page_content.replace("\n", " ")

        # Create vector database; retrieve the 20 most similar chunks.
        # ("semantic_search" was dropped: FAISS similarity search does not read it.)
        db = FAISS.from_documents(docs, embeddings)
        retriever = db.as_retriever(search_kwargs={"k": 20})

        # Get relevant information
        retrieved_docs = retriever.invoke(query_sensitive_receptors)
        retrieved_text = "\n\n".join(doc.page_content for doc in retrieved_docs)

        # Generate initial report using the structured prompt
        prompt_template = ChatPromptTemplate.from_template(template_sensitive_receptors)
        formatted_prompt = prompt_template.invoke({
            "context": retrieved_text,
            "standards": standards
        })
        response = azure_gpt4_openai_text_chat_llm.invoke(formatted_prompt)
        parsed_response = StrOutputParser().invoke(response)

        # Verify and update the report using the review prompt
        review_template = ChatPromptTemplate.from_template(prompt)
        review_prompt = review_template.invoke({"report": parsed_response})
        response_review = azure_gpt4_openai_text_chat_llm.invoke(review_prompt)
        final_report = StrOutputParser().invoke(response_review)

        # Save markdown
        created_paths.append(md_path)
        save_markdown_to_file(final_report, md_path)

        # Convert to docx
        created_paths.append(docx_path)
        convert_md_to_docx(md_path, docx_path)
        # convert_md_to_docx only prints when pandoc exits non-zero, so confirm
        # the .docx really exists before handing it to the PDF converter.
        if not os.path.exists(docx_path):
            raise RuntimeError(f"pandoc did not produce {docx_path}")

        # Convert to PDF
        created_paths.append(pdf_path)
        convert(docx_path, pdf_path)

        # Read and encode PDF as base64
        with open(pdf_path, "rb") as pdf_file:
            encoded_pdf = base64.b64encode(pdf_file.read()).decode('utf-8')

        return encoded_pdf

    except Exception as e:
        print(f"Error generating report: {str(e)}")
        return None

    finally:
        # Clean up temporary files on success and on failure. A cleanup problem
        # is reported but must not discard an already-encoded PDF.
        for temp_path in created_paths:
            try:
                if os.path.exists(temp_path):
                    os.remove(temp_path)
            except OSError as cleanup_error:
                print(f"Could not remove temporary file {temp_path}: {cleanup_error}")

# Example usage:
# if __name__ == "__main__":
#     pdf_file_path = "Att 5. Air Quality Assessment 4567 Old Northern Road Maroota.pdf"
#     base64_pdf = generate_air_quality_report(pdf_file_path)
#     if base64_pdf:
#         print("Report generated successfully and encoded in base64.")