In [1]:
pip install torch transformers faiss-cpu sentence-transformers pymupdf


Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.2.0-py3-none-any.whl.metadata (10 kB)
Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sentence_transformers-3.2.0-py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.2/255.2 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu, sentence-transformers
Successfully installed faiss-cpu-1.9.0 sentence-transformers-3.2.0


In [2]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, RagTokenizer, RagRetriever, RagSequenceForGeneration
from sentence_transformers import SentenceTransformer
import faiss
import fitz  # PyMuPDF
from sentence_transformers import SentenceTransformer
import numpy as np
from transformers import GPT2LMHeadModel, GPT2Tokenizer


In [9]:

import fitz  # PyMuPDF

def extract_pdf_chunks(pdf_path, words_per_chunk=100):
    """
    Extract the text of a PDF with PyMuPDF and split it into word-count chunks.

    Words are accumulated across page boundaries, so a chunk may start on one
    page and finish on the next. Whitespace (including line breaks) inside the
    PDF text is collapsed: each chunk is its words joined by single spaces.

    Parameters:
        pdf_path (str): The file path of the PDF to be processed.
        words_per_chunk (int): Number of whitespace-separated words per chunk.
            A chunk is closed as soon as it holds at least this many words, so
            any value below 1 produces one-word chunks. The final chunk may be
            shorter.

    Returns:
        list: The text chunks in document order, or an empty list if the PDF
        could not be opened or read (the error is printed, not raised).
    """
    chunks = []
    current_chunk = []

    try:
        # The context manager closes the document even when a page fails to
        # parse, so the underlying file handle is never leaked.
        with fitz.open(pdf_path) as doc:
            for page in doc:
                # Split page text into words to control chunk size
                for word in page.get_text("text").split():
                    current_chunk.append(word)
                    if len(current_chunk) >= words_per_chunk:
                        chunks.append(' '.join(current_chunk))
                        current_chunk = []  # Start the next chunk
    except Exception as e:
        # Deliberately best-effort: report the problem and hand back no chunks.
        print(f"An error occurred while processing the PDF: {e}")
        return []

    # Add any remaining words as the last (possibly short) chunk
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

# Example usage: chunk the rating-system PDF with the default chunk size.
# NOTE(review): the absolute /content/... path is presumably a Colab upload
# location — confirm, and adjust it when running elsewhere.
pdf_path = '/content/IGBC_Green_New_Buildings_Rating_System_(Version_3.0_with_Fifth_Addendum) (1).pdf'
# `chunks` is read by the later cell that classifies and embeds the text.
# extract_pdf_chunks returns [] on any read error, so an empty list here means
# extraction failed (or the PDF contained no text).
chunks = extract_pdf_chunks(pdf_path)
# Uncomment to confirm how many chunks were produced:
#print(f"Extracted {len(chunks)} chunks from the PDF.")

In [11]:

# Taxonomy of green building credits: each category name maps to the keyword
# phrases whose presence in a chunk marks the chunk as belonging to it.
green_building_credits = {
    'energy_efficiency': [
        'energy saving',
        'efficiency',
        'consumption',
        'lighting',
        'HVAC',
    ],
    'water_conservation': [
        'water saving',
        'conservation',
        'reuse',
        'rainwater',
        'irrigation',
    ],
    'sustainable_materials': [
        'recycled materials',
        'eco-friendly',
        'sustainable',
        'green materials',
    ],
    'indoor_air_quality': [
        'ventilation',
        'air quality',
        'filters',
        'pollutants',
    ],
}

# Function to classify chunks based on keywords in the taxonomy
def classify_chunk(chunk, taxonomy):
    """
    Return the taxonomy categories whose keywords occur in a chunk of text.

    Matching is a case-insensitive substring test, so a keyword also matches
    when it appears inside a longer word.

    Parameters:
        chunk (str): The text to classify.
        taxonomy (dict): Maps each category name to an iterable of keyword
            strings.

    Returns:
        list: Names of every category with at least one matching keyword, in
        the taxonomy's iteration order; empty if nothing matches.
    """
    # Lower-case the chunk once instead of once per keyword.
    chunk_lower = chunk.lower()
    return [
        category
        for category, keywords in taxonomy.items()
        if any(keyword.lower() in chunk_lower for keyword in keywords)
    ]

# Fail fast when there is nothing to index. extract_pdf_chunks returns [] on
# any read error; without this guard the problem would presumably only show up
# further down as an opaque shape/indexing error on the embeddings.
if not chunks:
    raise ValueError("No text chunks available to index; check that the PDF was extracted successfully.")

# Pair each chunk with the taxonomy categories its text mentions
classified_chunks = [(chunk, classify_chunk(chunk, green_building_credits)) for chunk in chunks]

# Load a pre-trained Sentence Transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate one embedding per chunk. FAISS works on float32, C-contiguous
# matrices, so make that layout explicit rather than relying on the encoder's
# defaults.
chunk_embeddings = np.ascontiguousarray(
    model.encode([chunk_text for chunk_text, _ in classified_chunks]),
    dtype=np.float32,
)

# Create a flat (exhaustive) L2 FAISS index over the chunk embeddings
dimension = chunk_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(chunk_embeddings)

def retrieve_relevant_chunks(query, top_n=5):
    """
    Return the indexed chunks whose embeddings are nearest to the query.

    Relies on the module-level `model` (embedder), `index` (FAISS index),
    `chunk_embeddings` and `classified_chunks` built earlier in the notebook.

    Parameters:
        query (str): The search text.
        top_n (int): Maximum number of chunks to return. Capped at the number
            of vectors in the index; a value below 1 yields no results.

    Returns:
        list: `(chunk_text, distance)` tuples in the order reported by the
        index search. `distance` is the raw value returned by the index.
        Empty (with a printed notice) when nothing was found.

    Raises:
        ValueError: If the query embedding's dimension differs from that of
            the chunk embeddings.
    """
    query_embedding = np.asarray(model.encode([query]), dtype=np.float32)

    # Check for dimension compatibility
    if query_embedding.shape[1] != chunk_embeddings.shape[1]:
        raise ValueError(f"Query embedding dimension mismatch: expected {chunk_embeddings.shape[1]}, got {query_embedding.shape[1]}")

    # Never ask for more neighbours than the index holds.
    k = min(top_n, index.ntotal)

    relevant_chunks = []
    if k > 0:
        distances, indices = index.search(query_embedding, k)

        for i, dist in zip(indices[0], distances[0]):
            if i < 0:
                # FAISS reports "no neighbour found" as label -1. Without this
                # guard, -1 would silently index the *last* chunk.
                continue
            if i < len(classified_chunks):
                relevant_chunks.append((classified_chunks[i][0], dist))
            else:
                print(f"Warning: Index {i} is out of range, skipping this result.")

    if not relevant_chunks:
        print("No relevant chunks found for the given query.")
    return relevant_chunks



def generate_response(query, relevant_chunks, model_name='gpt2', max_input_length=1024, max_new_tokens=300):
    """
    Generate text with a GPT-2 model prompted by the query and retrieved chunks.

    The prompt is the query, a blank line, then the chunk texts one per line.
    The tokenizer and model are loaded from `model_name` on every call.

    Parameters:
        query (str): The user query; placed first in the prompt.
        relevant_chunks (list): `(chunk_text, score)` tuples; only the text is
            used.
        model_name (str): Name or path passed to `from_pretrained`.
        max_input_length (int): Upper bound on prompt tokens. The effective
            bound is lowered further so that prompt plus `max_new_tokens`
            fits in the model's context window.
        max_new_tokens (int): Number of tokens to generate after the prompt.

    Returns:
        str: The decoded output sequence. Because the whole sequence is
        decoded, the text starts with the (possibly truncated) prompt.

    Raises:
        ValueError: If `max_new_tokens` alone fills the model's context
            window, leaving no room for a prompt.
    """
    # Load the pre-trained GPT-2 model and tokenizer. The local is named `lm`
    # so it does not shadow the notebook's module-level embedding `model`.
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    lm = GPT2LMHeadModel.from_pretrained(model_name)

    # GPT-2 ships without a padding token; reuse end-of-sequence for it.
    tokenizer.pad_token = tokenizer.eos_token

    # Combine the query and retrieved chunks to create the prompt
    context = query + '\n\n' + '\n'.join([chunk for chunk, _ in relevant_chunks])
    input_ids = tokenizer.encode(context, return_tensors='pt')

    # Prompt and generated tokens share one context window, so the prompt must
    # leave room for max_new_tokens or generation runs past the last position.
    prompt_budget = max_input_length
    context_window = getattr(lm.config, 'n_positions', None)
    if context_window is not None:
        if max_new_tokens >= context_window:
            raise ValueError(
                f"max_new_tokens ({max_new_tokens}) must be smaller than the model's context window ({context_window})"
            )
        prompt_budget = min(prompt_budget, context_window - max_new_tokens)

    # Keep the head of the prompt (the query comes first) when it is too long
    if input_ids.shape[1] > prompt_budget:
        print(f"Input length exceeds the maximum of {prompt_budget} tokens. Truncating the input...")
        input_ids = input_ids[:, :prompt_budget]

    # A single unpadded sequence attends to every token. Deriving the mask
    # from pad_token_id would hide genuine end-of-sequence tokens in the
    # prompt, because pad and EOS share one id.
    attention_mask = torch.ones_like(input_ids)

    # Passing pad_token_id explicitly avoids the per-call
    # "Setting `pad_token_id` to `eos_token_id`" notice.
    output = lm.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode the full sequence (prompt + continuation) into text
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example usage: retrieve the chunks nearest to the query, then let GPT-2
# continue a prompt built from the query and those chunks.
query = "energy efficiency in buildings"
relevant_chunks = retrieve_relevant_chunks(query)
# Hard-coded stand-in for the retrieval step (handy when no PDF is indexed);
# uncomment to bypass retrieve_relevant_chunks:
#relevant_chunks = [("Energy-efficient HVAC systems can reduce power consumption significantly.", 0.85),
#                   ("Lighting upgrades in commercial spaces lead to lower energy costs.", 0.75)]

# Generate the response using the function.
# The returned text begins with the prompt itself, because generate_response
# decodes the whole output sequence rather than only the newly generated part.
response = generate_response(query, relevant_chunks)
print(f"Generated response: {response}")







# Example query for retrieving relevant chunks
#query = "energy efficiency in buildings"





Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated response: energy efficiency in buildings

Green Building Council Minimum Energy Efficiency EE Mandatory Requirement 2 Intent: Optimise energy consumption, to reduce negative environmental impacts from excessive energy use. Compliance Options:  Case A - Air-conditioned Buildings: Design the building to comply with Energy Conservation Building Code (Revised Version May, 2008) (or) ASHRAE Standard 90.1-2010 (without amendments) through one of the following approaches: ¾ Option 1 - Performance based approach (Whole building simulation) ¾ Option 2 - Prescriptive approach The total annual energy consumption of the building should not exceed the total base case energy consumption computed, as per ECBC (or) ASHRAE Standard 90.1-2010. Note: •
capacity of all HVAC or refrigeration (kW) (or) Total gross AHRI rated cooling capacity of all HVAC or refrigeration • Small HVAC units (containing less than 0.25 kg of refrigerant) need not be considered in calculation. Exemplary Performance: T

Top relevant chunks: [('68\nIndian Green Building Council\nENERGY  EFFICIENCY\n', 0.571772), ('78\nIndian Green Building Council\nEnhanced Energy Efficiency\nEE Credit 2\nPoints: 1-15\nIntent:\nOptimise energy consumption, to reduce negative environmental impacts from excessive energy use.\nCompliance Options:\n\x99 Case A - Air-conditioned Buildings:\nDesign the building to comply with ASHRAE Standard 90.1-2010, Appendix - G (without amendments)\nthrough Performance based approach (Whole building simulation). Simulation is to be carried out\nat comfort temperatures of 24 + 2 deg C.\nPoints are awarded based on energy cost percentage savings as detailed below:\nPercentage of Energy Cost Savings\nover ASHRAE Standard 90.1-2010 Base case\nPoints\nOwner-occupied\nTenant-occupied\nMajor Renovation\nBuildings\nBuildings\nBuildings\n6%\n4%\n4%\n1\n8%\n6%\n6%\n2\n10%\n8%\n8%\n3\n12%\n10%\n10%\n4\n14%\n12%\n12%\n5\n16%\n14%\n14%\n6\n18%\n16%\n16%\n7\n20%\n18%\n18%\n8\n22%\n20%\n20%\n9\n24%\n22

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated response: energy efficiency in buildings

Energy-efficient HVAC systems can reduce power consumption significantly.
Lighting upgrades in commercial spaces lead to lower energy costs.

Energy efficiency in buildings

Energy-efficient HVAC systems can reduce power consumption significantly.

Lighting upgrades in commercial spaces lead to lower energy costs.

Energy efficiency in buildings

Energy-efficient HVAC systems can reduce power consumption significantly.

Lighting upgrades in commercial spaces lead to lower energy costs.

Energy efficiency in buildings

Energy-efficient HVAC systems can reduce power consumption significantly.

Lighting upgrades in commercial spaces lead to lower energy costs.

Energy efficiency in buildings

Energy-efficient HVAC systems can reduce power consumption significantly.

Lighting upgrades in commercial spaces lead to lower energy costs.

Energy efficiency in buildings

Energy-efficient HVAC systems can reduce power consumption significantly.


In [16]:
# Show the raw generated string (escape sequences visible) as the cell result.
response


'energy efficiency in buildings\n\nEnergy-efficient HVAC systems can reduce power consumption significantly.\nLighting upgrades in commercial spaces lead to lower energy costs.\n\nEnergy efficiency in buildings\n\nEnergy-efficient HVAC systems can reduce power consumption significantly.\n\nLighting upgrades in commercial spaces lead to lower energy costs.\n\nEnergy efficiency in buildings\n\nEnergy-efficient HVAC systems can reduce power consumption significantly.\n\nLighting upgrades in commercial spaces lead to lower energy costs.\n\nEnergy efficiency in buildings\n\nEnergy-efficient HVAC systems can reduce power consumption significantly.\n\nLighting upgrades in commercial spaces lead to lower energy costs.\n\nEnergy efficiency in buildings\n\nEnergy-efficient HVAC systems can reduce power consumption significantly.\n\nLighting upgrades in commercial spaces lead to lower energy costs.\n\nEnergy efficiency in buildings\n\nEnergy-efficient HVAC systems can reduce power consumption sig