# RAG Implementation

Read from mongoDB database, perform semantic search on user query using cosine similarity 

In [2]:
import os
import ast
import json
import openai
import ollama
import evaluate
import pandas as pd
import numpy as np

from openai import OpenAI
from transformers import pipeline
from pymongo import MongoClient
from FlagEmbedding import FlagReranker
from sentence_transformers import SentenceTransformer
from langchain.evaluation import load_evaluator

from IPython.display import display, HTML

from tqdm import tqdm

In [3]:
mcqInput = "./datasets/mcqs_letter_answer.json"
mcqOutput = "./data/mcqs_predicted.csv"

longInput = "./datasets/longQA.json"
longOutput = "./data/longQA_predicted.csv"

embedModelName = "BAAI/bge-large-en-v1.5"
embedModel = SentenceTransformer(embedModelName)

reRankModelName = "BAAI/bge-reranker-large"
reRankModel = FlagReranker(reRankModelName, use_fp16=True)

llmModelName = "llama3.2"

rewriter = pipeline("text2text-generation", model="castorini/t5-base-canard")

client = MongoClient("mongodb://localhost:27017/")
db = client["rag"]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


## Retrieval Module

In [3]:
def getTopN(queries, n=10, coll="doc"):
    
    dfs = []
    
    for query in queries:
        # Get embeddings of the query
        queryEmbed = embedModel.encode(query, normalize_embeddings=True)

        # Load the table holding the chunks embeddings
        collection = db[coll]
        data = list(collection.find())

        for doc in data:
            doc.pop('_id', None)

        df = pd.DataFrame(data)

        if n == -1:
            n = len(df)

        # Compute cosine similarity and return sorted DataFrame
        df['cosine_similarity'] = df['embeddings'].apply(lambda x: np.dot(x, queryEmbed))
        df_sorted = df.sort_values(by='cosine_similarity', ascending=False)
        dfTop = df_sorted.head(n)
        dfs.append(dfTop)
        
    # Combine 3 dfs of each query's cosine similarity search, remove duplicates and return
    combined_df = pd.concat(dfs, ignore_index=True).drop_duplicates(subset=["Chunks", "PageNo"])
    
    return combined_df

## Re Rank Module

In [4]:
def reRank(query, df):
    # Compute a score for each chunk based on the contents of the chunk and query
    rankMatches = [[query, df["Chunks"].iloc[idx]] for idx in range(len(df))]
    scores = reRankModel.compute_score(rankMatches)
    df["rerank_score"] = scores
    
    # Remove scores less than 0 and return sorted DataFrame
    df = df[df["rerank_score"] > 0]
    df = df.sort_values(by="rerank_score", ascending=False)

    return df

## Generation

In [5]:
def getLLMAns(query, df, llmModelName, llmInstructions, verbose=False):
  
    # Putting all the chunks in one string to pass to prompt
    allChunkText = ""
    for idx in range(len(df)):
        score = df["rerank_score"].iloc[idx]
        chunk = df["Chunks"].iloc[idx]
        allChunkText += f"Information {idx}: {chunk}\n\n"
        
    prompt = f"""\
    {llmInstructions}
    
    Using this information:
    {allChunkText}
    
    Answer this Question:
    {query}
    """
    if verbose:
        print(prompt)


    messages = [
        {
            "role": "system",
            "content": llmInstructions
        },
        {
            "role": "user",
            "content": prompt
        }
    ]
    
    # Use openai library for GPT LLM if openai key exists in enviornment
    if "gpt-" in llmModelName:
        if os.getenv("OPENAI_API_KEY") is None:
            return ""
        
        response = openai.chat.completions.create(
            model=llmModelName,
            messages=messages,
            temperature=0
        )
        ans = response.choices[0].message.content.strip()
    else:
        llmOut = ollama.chat(
            model=llmModelName,
            messages=messages,
            options={
              "temperature": 0
            }
        )
        ans = llmOut["message"]["content"]
    
    # Remove think tag for reasoning model (deepseek-r1)
    tag = "</think>"
    idx = ans.find(tag)
    if idx != -1:
        return ans[idx + len(tag):].strip()
    
    return ans

## Query Rewriting and Query Expansion

In [6]:
def reWriteExpander(query, mcq=False, llmModelName="llama3.2"):
    # Default to llama3.2 just for expansion if GPT requested 
    if "gpt-" in llmModelName:
        llmModelName="llama3.2"
    
    # Separate options from question if MCQ
    if mcq:
        lines = query.strip().split('\n')
        query = lines[0].strip()
        opts = "".join(lines[1:])   
        
        
    # ReWriter Model from HuggingFace
    rewritten_input = f"{query}"
    rewritten_query = rewriter(rewritten_input, max_length=64, clean_up_tokenization_spaces=True)[0]['generated_text']
    
    # Generate 2 more queries surrounding the context
    prompt = f"""
    Given the query: "{rewritten_query}"
    Generate 2 contextually related queries:
    A broader version of the query
    A more specific version of the query
    
    Format the output as a Python dictionary with keys: "broader", "narrower".\  
    
    """
    
    messages = [
        {
            "role": "system",
            "content": "You are a helpful assistant that generates contextual queries."
        },
        {
            "role": "user",
            "content": prompt
        }
    ]
    
    llmOut = ollama.chat(
        model=llmModelName,
        messages=messages,
        options={
          "temperature": 0
        }
    )
    ans = llmOut["message"]["content"]
    
    # If the LLM returned the correct format return all 3 queries or if not just return rewritten query
    try:
        query_dict = ast.literal_eval(ans.strip())
        broader = query_dict.get("broader", "").strip()
        narrower = query_dict.get("narrower", "").strip()
        
        retList = [rewritten_query, broader, narrower]
    except (SyntaxError, ValueError):
        retList = [rewritten_query]
    
    # Add removed options back
    if mcq:
        retList = [i+opts for i in retList]

    return retList

## Full Pipeline

In [7]:
def answer(query, mcq=False, n=10, coll="doc", llmModelName="llama3.2", htmlOut=[]):
    # Different system instructions and prompt if MCQs
    if mcq:
        llmInstructions = """
        Using ONLY the provided information, answer the following multiple-choice question.\
        If the answer is not present in the information, reply with:\
        "Answer not in given information"
        Return ONLY the letter corresponding to the correct option (A, B, C, D) — DO NOT include explanation or any other text.\
        """
    else:
        llmInstructions = """
        Using ONLY the provided information, answer the question.\
        If the answer is not present in the information, reply with:\
        "Answer not in given information"
        """
    
    # ReWrite and expand and select rewritten query as main query
    queries = reWriteExpander(query, mcq=mcq, llmModelName=llmModelName)
    query = queries[0]
  
    # Pipeline to get topN, rerank and get LLM answer
    qdf = getTopN(queries, n, coll)
    qdf = reRank(query, qdf)
    ans = getLLMAns(query=query, df=qdf, llmModelName=llmModelName, llmInstructions=llmInstructions)
    
    # Add a reference to the page number if not a MCQ
    if not mcq:
        # Chunk contains an image where image path is not NaN
        texts = qdf[qdf["imagePath"].isna()]
        imgs = qdf[qdf["imagePath"].notna()]
        
        if len(texts) > 0:
            # Page Numbers of all text references in a string
            refs = [str(i) for i in texts["PageNo"].tolist()]
            refsCont = ", ".join(refs)
            ans += f"\n\nThis answer was generated using the text references on page(s): {refsCont}"
            
        if len(imgs) > 0:
            # Page Numbers of all image references in a string
            refs = [str(i) for i in imgs["PageNo"].tolist()]
            refsCont = ", ".join(refs)
            ans += f"\n\nThis answer was generated using the image references on page(s): {refsCont}"
            
            # Optional output if passed by reference: URL String for links to all images
            paths = imgs["imagePath"].tolist()
            urls = [i.replace("\\", "/") for i in paths]  # Replace backslashes with forward slashes for URLs
            html_links = [f'<a href="{url}" target="_blank">img-page{refs[i]}</a>' for i, url in enumerate(urls)]
            html_output = " ".join(html_links)
            htmlOut.append(HTML(f"\nClick to view the images used: {html_output}"))
    
    return ans

## Model Evaluation

Extract letter from LLM Output

In [8]:
def extract_letter(value):
    # Even though prompt is strict for MCQs sometimes LLM gives surrounding text
    # Function to extract LLM's predicted option from generated answer
    
    if pd.isna(value) or len(value) == 0:
        return "X"
    elif len(value) == 1:
        return value
    else:
        match = pd.Series(value).str.extract(r'([A-Za-z])\)')[0]
        return match[0] if pd.notna(match[0]) else "X"

### 50 Deepseek generated pHD Level MCQs for accuracy

In [9]:
# Load already evaluated results if they exist
if os.path.exists(mcqOutput):
    qs = pd.read_csv(mcqOutput)
else:  
    qs = pd.read_json(mcqInput)

qs.head(5)

Unnamed: 0,query,expected,predicted-gpt-3.5-turbo,predicted-gpt-3.5-turbo-letter,predicted-llama3.2,predicted-llama3.2-letter,predicted-deepseek-llm,predicted-deepseek-llm-letter
0,What is the primary objective of the Capital W...,B,B,B,B,B,"B) To ensure cost certainty, better value for ...",B
1,"Under the new Public Works Contracts, who is r...",B,B,B,A,A,A) The Contractor,A
2,Which of the following is NOT a key change int...,C,B,B,D,D,D) Fixed-price lump sum contracts,D
3,What is the purpose of Programme Contingency?\...,B,B,B,B,B,D,D
4,"In the new contracts, what replaces the role o...",B,B,B,B,B,B) Specialists,B


In [10]:
# Different LLMs to test
testModels = ["gpt-3.5-turbo", "deepseek-llm", "llama3.2"]

In [11]:
for model in testModels:  
    colName = f"predicted-{model}"
    colLetterName = f"predicted-{model}-letter"
  
    # Only run the model if not already run before
    if colName in qs:
        continue
  
    answers = []
    # Run with tqdm to show progress bar and time per query
    for i in tqdm(range(len(qs)), desc="Processing queries"):
        query = qs["query"].iloc[i]
        ans = answer(query, mcq=True, llmModelName=model)
        answers.append(ans)
      
    # Save results to dataframe
    qs[colName] = answers
    qs[colLetterName] = qs[colName].apply(extract_letter)

In [12]:
qs.head(5)

Unnamed: 0,query,expected,predicted-gpt-3.5-turbo,predicted-gpt-3.5-turbo-letter,predicted-llama3.2,predicted-llama3.2-letter,predicted-deepseek-llm,predicted-deepseek-llm-letter
0,What is the primary objective of the Capital W...,B,B,B,B,B,"B) To ensure cost certainty, better value for ...",B
1,"Under the new Public Works Contracts, who is r...",B,B,B,A,A,A) The Contractor,A
2,Which of the following is NOT a key change int...,C,B,B,D,D,D) Fixed-price lump sum contracts,D
3,What is the purpose of Programme Contingency?\...,B,B,B,B,B,D,D
4,"In the new contracts, what replaces the role o...",B,B,B,B,B,B) Specialists,B


In [13]:
# Save to output so dont have to run models again
qs.to_csv(mcqOutput, index=False)

#### Accuracy of each Model

In [14]:
for model in testModels:  
    colName = f"predicted-{model}"
    colLetterName = f"predicted-{model}-letter"

    print(f"{model}:", (qs["expected"] == qs[colLetterName]).sum()/len(qs))

gpt-3.5-turbo: 0.84
deepseek-llm: 0.5
llama3.2: 0.76


In [15]:
#assert False

### GPT generated questions (not MCQs)

In [16]:
# Load output if already exists
if os.path.exists(longOutput):
    df = pd.read_csv(longOutput)
else:  
    df = pd.read_json(longInput)
    
df.head(5)

Unnamed: 0,Query,Expected,Predicted,LevenshtienSimilarity
0,What are the two main categories of public cap...,The two main categories of public capital work...,The two main categories of public capital work...,0.155861
1,What are Delay Events and Compensation Events ...,Delay Events are events that are not at the co...,Delay events refer to instances when work on a...,0.33982
2,What are the key structural components of the ...,The structure includes: 1. Letter of Acceptanc...,The key structural components of the new publi...,0.43294
3,What is the purpose of Programme Contingency i...,Programme Contingency is a specified delay per...,The provided information does not mention anyt...,0.323719
4,Under what circumstances can a Contracting Aut...,A Contracting Authority can allocate this risk...,A Contracting Authority can allocate the risk ...,0.324441


#### Levenshtien Similarity

In [17]:
# Run queries if not ran before
if "Predicted" not in df:
    df["Predicted"] = df["Query"].apply(lambda q: answer(q, mcq=False))

df.head(5)

Unnamed: 0,Query,Expected,Predicted,LevenshtienSimilarity
0,What are the two main categories of public cap...,The two main categories of public capital work...,The two main categories of public capital work...,0.155861
1,What are Delay Events and Compensation Events ...,Delay Events are events that are not at the co...,Delay events refer to instances when work on a...,0.33982
2,What are the key structural components of the ...,The structure includes: 1. Letter of Acceptanc...,The key structural components of the new publi...,0.43294
3,What is the purpose of Programme Contingency i...,Programme Contingency is a specified delay per...,The provided information does not mention anyt...,0.323719
4,Under what circumstances can a Contracting Aut...,A Contracting Authority can allocate this risk...,A Contracting Authority can allocate the risk ...,0.324441


In [18]:
# Calculate Langchain's Levenshtien Similarity score if not already done 
if "LevenshtienSimilarity" not in df:
    evaluator = load_evaluator("string_distance")

    examples = df[["Query", "Expected"]].rename(columns={"Query": "input", "Expected": "ideal"})
    predictions = df["Predicted"].tolist()

    results = []
    for example, prediction in zip(examples.to_dict(orient="records"), predictions):
        result = evaluator.evaluate_strings(
            prediction=prediction,
            reference=example["ideal"]
        )
        results.append(result)

    df["LevenshtienSimilarity"] = [r["score"] for r in results]

In [19]:
# Save results
df.to_csv(longOutput, index=False)

In [20]:
df.head(5)

Unnamed: 0,Query,Expected,Predicted,LevenshtienSimilarity
0,What are the two main categories of public cap...,The two main categories of public capital work...,The two main categories of public capital work...,0.155861
1,What are Delay Events and Compensation Events ...,Delay Events are events that are not at the co...,Delay events refer to instances when work on a...,0.33982
2,What are the key structural components of the ...,The structure includes: 1. Letter of Acceptanc...,The key structural components of the new publi...,0.43294
3,What is the purpose of Programme Contingency i...,Programme Contingency is a specified delay per...,The provided information does not mention anyt...,0.323719
4,Under what circumstances can a Contracting Aut...,A Contracting Authority can allocate this risk...,A Contracting Authority can allocate the risk ...,0.324441


#### Human Testing

In [21]:
row = 0

In [22]:
df["Query"].iloc[row]

'What are the two main categories of public capital works projects mentioned in the manual?'

In [23]:
df["Predicted"].iloc[row]

'The two main categories of public capital works projects mentioned in the manual are Civil Engineering Works and Building Works.\n\nThis answer was generated using the text references on page(s): 16'

In [24]:
df["Expected"].iloc[row]

'The two main categories of public capital works projects are Civil Engineering Works and Building Works. Civil engineering works include roads, tunnels, and bridges, typically designed by civil engineers. Building works include office buildings, schools, and hospitals, generally designed by architects.'

#### ROUGE/BLEU Scores

In [25]:
rouge = evaluate.load('rouge')
bleu = evaluate.load('bleu')

Using the latest cached version of the module from C:\Users\karti\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--rouge\b01e0accf3bd6dd24839b769a5fda24e14995071570870922c71970b3a6ed886 (last modified on Mon May 26 14:06:09 2025) since it couldn't be found locally at evaluate-metric--rouge, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from C:\Users\karti\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--bleu\9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Mon May 26 14:11:54 2025) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


In [26]:
results = rouge.compute(predictions=[df["Predicted"].iloc[row]], references=[df["Expected"].iloc[row]])
print(results)

{'rouge1': 0.4383561643835617, 'rouge2': 0.3943661971830986, 'rougeL': 0.4383561643835617, 'rougeLsum': 0.4383561643835617}


In [27]:
results = bleu.compute(predictions=[df["Predicted"].iloc[row]], references=[df["Expected"].iloc[row]])
print(results)

{'bleu': 0.27163511112029864, 'precisions': [0.4722222222222222, 0.42857142857142855, 0.38235294117647056, 0.3333333333333333], 'brevity_penalty': 0.6778095780054504, 'length_ratio': 0.72, 'translation_length': 36, 'reference_length': 50}


In [28]:
results = rouge.compute(predictions=df["Predicted"].tolist(), references=df["Expected"].tolist())
print(results)

{'rouge1': 0.3040668485121949, 'rouge2': 0.13563399823208128, 'rougeL': 0.22770016042126812, 'rougeLsum': 0.23266656621491083}


In [29]:
results = bleu.compute(predictions=df["Predicted"].tolist(), references=df["Expected"].tolist())
print(results)

{'bleu': 0.07188847655348057, 'precisions': [0.22792607802874743, 0.07053941908713693, 0.04612159329140461, 0.036016949152542374], 'brevity_penalty': 1.0, 'length_ratio': 1.770909090909091, 'translation_length': 487, 'reference_length': 275}


### Query to check for tables

#### GPT Generated question based on table on page 38

In [30]:
q = "Identify and explain one specific event that qualifies as a Delay Event but not a Compensation Event under all three forms of contract: Traditional, Design & Build, and Minor Works. \
Support your answer using the information.?"

q

'Identify and explain one specific event that qualifies as a Delay Event but not a Compensation Event under all three forms of contract: Traditional, Design & Build, and Minor Works. Support your answer using the information.?'

In [31]:
qrs = reWriteExpander(q, mcq=False)
qrs

['Identify and explain one specific event that qualifies as a Delay Event but not a Compensation Event under all three forms of contract: Traditional, Design & Build, and Minor Works. Support your answer using the information.']

In [32]:
df = getTopN(qrs, 10)
df.head(5)

Unnamed: 0,Chunks,PageNo,imagePath,embeddings,cosine_similarity
0,The image is a table from a research paper com...,41,images\page-41_img8.png,"[0.001292031491175294, 0.006042515393346548, -...",0.838398
1,= . 9. The Employer does not give the Contract...,39,,"[0.010430303402245045, -0.003476436948403716, ...",0.836329
2,"9.2, MINOR WORKS.Delay Event = Yes. 4. The Con...",38,,"[-0.004050837829709053, 0.0043960995972156525,...",0.827263
3,The following table is compiled from Sect...,37,,"[-0.006695794872939587, 0.022547774016857147, ...",0.819687
4,1. The Employer's Representative gives the Con...,38,,"[0.002150048268958926, 0.0009375461377203465, ...",0.817519


In [33]:
qdf = reRank(q, df)
qdf.head(5)

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Unnamed: 0,Chunks,PageNo,imagePath,embeddings,cosine_similarity,rerank_score
0,The image is a table from a research paper com...,41,images\page-41_img8.png,"[0.001292031491175294, 0.006042515393346548, -...",0.838398,3.916457
2,"9.2, MINOR WORKS.Delay Event = Yes. 4. The Con...",38,,"[-0.004050837829709053, 0.0043960995972156525,...",0.827263,2.058186
1,= . 9. The Employer does not give the Contract...,39,,"[0.010430303402245045, -0.003476436948403716, ...",0.836329,1.450883
4,1. The Employer's Representative gives the Con...,38,,"[0.002150048268958926, 0.0009375461377203465, ...",0.817519,1.426681
5,12. Loss of or damage to the Works that is at ...,40,,"[-0.012642641551792622, 0.017360514029860497, ...",0.816902,1.128211


In [34]:
# Sometimes the table was also represented by docling as an Image hence image being in Row 0
# Row 1 is how the table is represented as text by docling as chunks
qdf["Chunks"].iloc[1]

'9.2, MINOR WORKS.Delay Event = Yes. 4. The Contractor suspends work in accordance with sub-clause 12.3, TRADITIONAL.Delay Event = Yes. 4. The Contractor suspends work in accordance with sub-clause 12.3, TRADITIONAL.Compensation Event = Yes. 4. The Contractor suspends work in accordance with sub-clause 12.3, DESIGN & BUILD. = . 4. The Contractor suspends work in accordance with sub-clause 12.3, DESIGN & BUILD.Delay Event Compensation Event = Yes Yes. 4. The Contractor suspends work in accordance with sub-clause 12.3, MINOR WORKS.Delay Event = Yes. 5. There is a factual error in information about the Site or setting out information in the Works Requirements. [This does not include an error of interpretation]., TRADITIONAL.Delay Event = Yes. 5. There is a factual error in information about the Site or setting out information in the Works Requirements. [This does not include an error of interpretation]., TRADITIONAL.Compensation Event = Yes. 5. There is a factual error in information abou

In [35]:
# Reference is added later in the pipeline function so no reference shown here

llmInstructions = """
Using ONLY the provided information, answer the question.\
If the answer is not present in the information, reply with:\
"Answer not in given information"
"""

ans = getLLMAns(q, df, llmModelName="llama3.2", llmInstructions=llmInstructions)
ans

'After reviewing the table, I have identified an event that qualifies as a Delay Event but not a Compensation Event under all three forms of contract:\n\n**Event:** Loss of or damage to the Works that is at the Contractor\'s risk in accordance with clause 3.2.\n\n**Reasoning:**\n\n* Under **Traditional**, this event is listed as a Delay Event (Delay Event = Yes) and not a Compensation Event.\n* Under **Design & Build**, this event is also listed as a Delay Event (Delay Event = Yes) and not a Compensation Event.\n* Under **Minor Works**, this event is listed as a Delay Event (Delay Event = Yes) but with a note that it is "Not used" for Compensation Event.\n\n**Explanation:**\n\nThis event occurs when the Contractor is responsible for loss or damage to the Works, which is outside of their scope and risk. As such, it is not considered a Compensation Event, as it is not an unforeseen event caused by the Employer\'s actions or omissions. Instead, it is a result of the Contractor\'s own acti

In [36]:
#assert False

### Query to check for images

In [37]:
q = "What are the key control points in the Capital Works Management Framework where project reviews are mandated during the Planning and Implementation stages?"

q

'What are the key control points in the Capital Works Management Framework where project reviews are mandated during the Planning and Implementation stages?'

In [38]:
qrs = reWriteExpander(q, mcq=False)
qrs

['What are the key control points in the Capital Works Management Framework where project reviews are mandated during the Planning and Implementation stages?']

In [39]:
df = getTopN(qrs, 10)
df.head(5)

Unnamed: 0,Chunks,PageNo,imagePath,embeddings,cosine_similarity
0,The image displays a structured diagram titled...,16,images\page-16_img4.png,"[0.020727336406707764, -0.014973741956055164, ...",0.780836
1,The Capital Works Management Framework is a...,16,,"[-0.039464130997657776, 0.001380482572130859, ...",0.76765
2,"Building Works, CAPITAL WORKS MANAGEMENT FRAME...",18,,"[-0.015871873125433922, -0.02579466812312603, ...",0.671131
3,The new public works contracts are very much a...,22,,"[-0.01807848922908306, 0.015640415251255035, -...",0.667816
4,The aim of this course is to support d...,21,,"[-0.004354395437985659, 0.011255735531449318, ...",0.652762


In [40]:
# Shows LLM generated image summary
df["Chunks"].iloc[0]

'The image displays a structured diagram titled "Capital Works Management Framework," organized into three main sections: Stage, Project Management, and Procedure.\n\n1. **Stage:**\n   - Contains two levels: Appraisal and Approval in Principle.\n\n2. **Project Management:**\n   - Divided into four main phases:\n     1. **Planning (Preliminary):**\n        - **1.1 Project Definition:** Focuses on major project milestone reviews.\n        - **1.2 Procurement Strategy & Contract Type**\n        - **1.3 Consultants Appointment Process & Conditions of Engagement**\n     2. **Planning (Detailed):**\n        - **2.1 Design Development and Procurement Process:** Includes major project assessment reviews.\n        - **2.2 Tender Process**\n     3. **Implementation:**\n        - **3.1 Construction Process**\n     4. **Review:**\n        - **4.1 Post Project Review**\n\n3. **Procedure:**\n   - Each stage in Project Management is linked with a corresponding procedure:\n     - **1.1.1 Budget Develo

In [41]:
qdf = reRank(q, df)
qdf.head(5)

Unnamed: 0,Chunks,PageNo,imagePath,embeddings,cosine_similarity,rerank_score
0,The image displays a structured diagram titled...,16,images\page-16_img4.png,"[0.020727336406707764, -0.014973741956055164, ...",0.780836,3.479454
1,The Capital Works Management Framework is a...,16,,"[-0.039464130997657776, 0.001380482572130859, ...",0.76765,0.637613


In [42]:
llmInstructions = """
Using ONLY the provided information, answer the question.\
If the answer is not present in the information, reply with:\
"Answer not in given information"
"""

ans = getLLMAns(q, df, llmModelName="llama3.2", llmInstructions=llmInstructions)
ans

'Answer not in given information'

In [43]:
# Pass in optional list by reference to get htmlOutput for images
htmlOut = []
ans = answer(q, mcq=False, htmlOut=htmlOut)

In [44]:
# Print for the LLM Generated response
print(ans)
# Display for the HTML Image Link
display(htmlOut[-1])

According to Information 0:

- In the **Planning (Detailed) stage**, there is a review point: **2.2.1 Control project review at end of process**
- In the **Implementation stage**, there is a review point: **3.1.1 Construction Cost Control** and also **4.1.1 Analysis of Outturn Costs**

These are the key control points in the Capital Works Management Framework where project reviews are mandated during the Planning and Implementation stages.

This answer was generated using the text references on page(s): 16

This answer was generated using the image references on page(s): 16


In [45]:
ans = answer(q, mcq=False)

ans

'According to Information 0:\n\n- In the **Planning (Detailed) stage**, there is a review point: **2.2.1 Control project review at end of process**\n- In the **Implementation stage**, there is a review point: **3.1.1 Construction Cost Control** and also **4.1.1 Analysis of Outturn Costs**\n\nThese are the key control points in the Capital Works Management Framework where project reviews are mandated during the Planning and Implementation stages.\n\nThis answer was generated using the text references on page(s): 16\n\nThis answer was generated using the image references on page(s): 16'

# DEMO Output Step by Step

In [46]:
q = "What are the two main categories of public capital works projects mentioned in the manual?"

In [47]:
qrs = reWriteExpander(q, mcq=False)
qrs

['What are the two main categories of public capital works projects mentioned in the manual?']

In [48]:
qdf = getTopN(qrs, n=10)

qdf.head(5)

Unnamed: 0,Chunks,PageNo,imagePath,embeddings,cosine_similarity
0,The Capital Works Management Framework is a...,16,,"[-0.039464130997657776, 0.001380482572130859, ...",0.723892
1,The image displays a structured diagram titled...,16,images\page-16_img4.png,"[0.020727336406707764, -0.014973741956055164, ...",0.721038
2,When approaching the question of the appropria...,46,,"[-0.007367652840912342, -0.03172290325164795, ...",0.720151
3,"Building Works, CAPITAL WORKS MANAGEMENT FRAME...",18,,"[-0.015871873125433922, -0.02579466812312603, ...",0.716294
4,The Works Proposals are the following,222,,"[0.0034117044415324926, 0.0003231484442949295,...",0.711521


In [49]:
qdf = reRank(q, qdf)
texts = qdf[qdf["imagePath"].isna()]
imgs = qdf[qdf["imagePath"].notna()]

qdf.head(5)

Unnamed: 0,Chunks,PageNo,imagePath,embeddings,cosine_similarity,rerank_score
0,The Capital Works Management Framework is a...,16,,"[-0.039464130997657776, 0.001380482572130859, ...",0.723892,0.756045


In [50]:
llmInstructions = """
Using ONLY the provided information, answer the question.\
If the answer is not present in the information, reply with:\
"Answer not in given information"
"""

ans = getLLMAns(query=q, df=qdf, llmModelName="llama3.2", llmInstructions=llmInstructions)

# Reference building
if len(texts) > 0:
    # Page Numbers of all text references in a string
    refs = [str(i) for i in texts["PageNo"].tolist()]
    refsCont = ", ".join(refs)
    ans += f"\n\nThis answer was generated using the text references on page(s): {refsCont}"

if len(imgs) > 0:
    # Page Numbers of all image references in a string
    refs = [str(i) for i in imgs["PageNo"].tolist()]
    refsCont = ", ".join(refs)
    ans += f"\n\nThis answer was generated using the image references on page(s): {refsCont}"

    # Optional output if passed by reference: URL String for links to all images
    paths = imgs["imagePath"].tolist()
    urls = [i.replace("\\", "/") for i in paths]  # Replace backslashes with forward slashes for URLs
    html_links = [f'<a href="{url}" target="_blank">img-page{refs[i]}</a>' for i, url in enumerate(urls)]
    html_output = " ".join(html_links)

print(ans)
if len(imgs) > 0:
    display(HTML(f"\nClick to view the images used: {html_output}"))

The two main categories of public capital works projects mentioned in the manual are:

1. Civil Engineering Works
2. Building Works

This answer was generated using the text references on page(s): 16


## DEMO Output Pipeline

In [51]:
query = ""

In [52]:
# Pass in optional list by reference to get htmlOutput for images
htmlOut = []
ans = answer(query, mcq=False, htmlOut=htmlOut)

In [53]:
# Print for the LLM Generated response
print(ans)
# Display for the HTML Image Link if exists
if len(htmlOut) > 0:
    display(htmlOut[-1])

Answer not in given information.
