In [3]:
import warnings
from typing import List, Optional, Dict
from langchain_core.pydantic_v1 import BaseModel, Field

from langchain_groq import ChatGroq
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer

from langgraph.graph import END, StateGraph
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

warnings.filterwarnings("ignore", category=FutureWarning)


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)
  from tqdm.autonotebook import tqdm, trange


`1. Initializing Elastic search client, LLM model connection and Embedding model`

In [4]:
es_client = Elasticsearch(
    hosts="https://512baa73334c4127ade77bc2dfa2ef02.eastus2.azure.elastic-cloud.com:443",
    basic_auth=("elastic", "XRzYOABuG17Yv32UKAFySw41")
)

print(f"Elastic Search Connection successful: {es_client.ping()}")

llm = ChatGroq(
    temperature=0,
    model_name="llama-3.1-8b-instant",
    # api_key="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",
    api_key="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"
)

print("LLM Model loaded")

model_name = "nomic-ai/nomic-embed-text-v1"
model = SentenceTransformer(model_name, trust_remote_code=True)

print("Embedding Model loaded")

def embedding_creation(input_texts, model):
    embedding = model.encode(input_texts)
    return embedding.tolist()

Elastic Search Connection successful: True
LLM Model loaded


<All keys matched successfully>


Embedding Model loaded


In [20]:
print(embedding_creation("Hello", model))

[0.05320166051387787, 0.009270425885915756, -0.02038547769188881, -0.03376682475209236, 0.013245669193565845, 0.03141860291361809, 0.03381296619772911, 0.015163923613727093, -0.042463403195142746, -0.018505126237869263, -0.01898069493472576, 0.0423465259373188, 0.05877299606800079, 0.03942979499697685, 0.039538897573947906, -0.04362544044852257, 0.08177028596401215, -0.005457289516925812, -0.03233932703733444, -0.0020926976576447487, 0.024892674759030342, -0.05255647003650665, 5.828490247949958e-05, -0.018829913809895515, 0.1901833564043045, -0.020415714010596275, 0.02401735633611679, 0.054968249052762985, -0.0211741104722023, -0.022106170654296875, 0.012289210222661495, -0.0021186254452914, 0.023014849051833153, -0.02732783369719982, 0.012837995775043964, -0.016583681106567383, 0.016315419226884842, 0.021133868023753166, 0.01682966761291027, 0.040230657905340195, 0.02859254740178585, -0.009654542431235313, 0.03104333020746708, -0.030321625992655754, 0.019781263545155525, 0.03613482788

`2. RAG`

`2.1 Multi Query Generation`

In [6]:
query = "List of Board of Trustees of Mohamed bin Zayed University?"

In [7]:
class MultiQuerySchema(BaseModel):
    Questions: List[str] = Field(description="List of questions")


MULTI_QUERY_PROMPT = PromptTemplate(
    template="""
    <|begin_of_text|>
    <|start_header_id|>system<|end_header_id|>  
    You are an AI, specialized in generating effective multi-queries for Retrieval Augmented Generation (RAG) systems.

    Guidelines:
    1. Analyze the user's original query and generate a set of five distinct yet relevant search queries that cover:
        - Synonyms and similar terms
        - Related subtopics or specific aspects
        - Broader and narrower variations of the original query
    2. Aim to create queries that will increase the chances of retrieving diverse, relevant results by considering different interpretations and possible user intents.
    3. Provide JSON response, with list of questions like {{ "Questions": ["Question 1", "Question 2", ..] }}
    4. Provide only the list of queries without any additional explanations or commentary.

    User Query: "{user_query}"
    <|eot_id|>
    <|start_header_id|>assistant<|end_header_id|>
    """,
    input_variables=["user_query"],
)

structured_llm = llm.with_structured_output(MultiQuerySchema, method="json_mode")

multi_query_generation = MULTI_QUERY_PROMPT | structured_llm

In [8]:
multi_query_response = multi_query_generation.invoke({"user_query": query})
multi_query_response.Questions

['Mohamed bin Zayed University Board of Directors',
 'Members of the Board of Trustees of Mohamed bin Zayed University',
 'List of Trustees of Mohamed bin Zayed University Abu Dhabi',
 'Mohamed bin Zayed University governing body members',
 'Who are the members of the Board of Trustees of Mohamed bin Zayed University?']

`2.2 Agent: Enriching Queries with Synonyms`

In [16]:
# 1. Query Enriching Agent

QUERY_ENRICHING_PROMPT = PromptTemplate(
    template="""
    <|begin_of_text|>
    <|start_header_id|>system<|end_header_id|>  
    You are an AI assistant specialized in generating Elasticsearch query strings. 
    Your task is to create the most effective query string for the given user question. 
    This query string will be used to search for relevant documents in an Elasticsearch index.

    Guidelines:
    1. Analyze the user's question carefully.
    2. Generate ONLY a query string suitable for Elasticsearch's match query.
    3. Focus on key terms and concepts from the question.
    4. Include synonyms or related terms that might be in relevant documents.
    5. Use simple Elasticsearch query string syntax if helpful (e.g., OR, AND).
    6. Do not use advanced Elasticsearch features or syntax.
    7. Do not include any explanations, comments, or additional text.
    8. Provide only the query string, nothing else.

    For the question "What is Clickthrough Data?", we would expect a response like:
    clickthrough data OR click-through data OR click through rate OR CTR OR user clicks OR ad clicks OR search engine results OR web analytics

    AND operator is not allowed. Use only OR.

    User Question: {user_query}
    <|eot_id|>
    <|start_header_id|>assistant<|end_header_id|>
    """,
    input_variables=["user_query"],
)

query_enricher = QUERY_ENRICHING_PROMPT | llm | StrOutputParser()

In [17]:
query_text = query_enricher.invoke({"user_query": multi_query_response.Questions})
enriched_query = [term.strip() for term in query_text.split(' OR ')]
enriched_query

['Mohamed bin Zayed University',
 'Mohamed bin Zayed University Board of Directors',
 'Board of Trustees',
 'Board of Trustees of Mohamed bin Zayed University',
 'Trustees of Mohamed bin Zayed University Abu Dhabi',
 'governing body',
 'governing body members',
 'members of the Board of Trustees',
 'members of the Board of Directors']

`2.3 Tool: Hybrid Search`

In [21]:
def hybrid_vector_search(
        conn,
        index_name: str, 
        query_keys: str, 
        query_vector: List[float], 
        text_fields: List[str], 
        vector_field: str, 
        num_candidates: int = 100, 
        num_results: int = 10,
    ) -> Dict:


    search_body = {
            "knn": {
                "field": vector_field,
                "query_vector": query_vector,
                "k": num_candidates,
                "num_candidates": num_candidates
            },
            "query": {
                "bool": {
                    "must": [
                        {
                            "multi_match": {
                                "query": " ".join(query_keys),  
                                "fields": text_fields,
                                "type": "best_fields",
                                "operator": "or" 
                            }
                        }
                    ],
                    "should": [
                        {
                            "script_score": {
                                "query": {"match_all": {}},
                                "script": {
                                    "source": """
                                    double vector_score = cosineSimilarity(params.query_vector, params.vector_field) + 1.0;
                                    double text_score = _score;
                                    return 0.7 * vector_score + 0.3 * text_score;
                                    """,
                                    "params": {
                                        "query_vector": query_vector,  
                                        "vector_field": vector_field
                                    }
                                }
                            }
                        }
                    ]
                }
            }
        }

    response = conn.search(index=index_name, body=search_body, size=num_results)

    return response, search_body


In [22]:
query_vector = embedding_creation(query, model)

response, search_body = hybrid_vector_search(
        es_client,
        index_name="abudhabi-policies-legislations-v4", 
        query_keys= list(enriched_query),
        query_vector=query_vector, 
        text_fields=["content", "heading", "questions", "reference"], 
        vector_field="embedded_content", 
        num_candidates = 25, 
        num_results = 3
    )

result = []
all_content = ""
for j in response['hits']['hits']:
    content = j["_source"]
    _result = {}
    _result["score"] = j["_score"]
    _result["heading"] = content['heading']
    _result["content"] = content['content']
    all_content = all_content + '\n' + content['content']
    _result["reference"] = content['reference']

    result.append(_result)

result

  response = conn.search(index=index_name, body=search_body, size=num_results)


[{'score': 112.34246,
  'heading': 'executive council resolution no. (11) of 2023 concerning the reformation of the board of trustees of mohamed bin zayed university for humanities',
  'content': 'executive council resolution no. (11) of 2023 concerning the reformation of the board of trustees of mohamed bin zayed university for humanities having reviewed chairman of the executive council resolution no. (27) of 2020 concerning the formation of the board of trustees of mohamed bin zayed university for humanities. the executive council has decided the following: 1. the board of trustees of mohamed bin zayed university for humanities shall be reformed under the chairmanship of h.e. dr. mohammed rashid ahmed al hamli, and the membership of: • mubarak hamad mubarak al muhairi - vice chairman • mohammed hamza hassan al qassim. • maryam eid khamis al muhairi • abdullah aqueeda ali al muhairi • dr. omar habtoor theeb al-derei. • ghanem sultan ahmed al suwaidi. 2. the membership term of the boa

In [23]:
type(response)

elastic_transport.ObjectApiResponse

`2.4 Agent: Extracting Reference Document details`

In [201]:
def reference_search(conn, index_name, reference_index):
    search_body = {
        "size": 1,
        "query": {
            "bool": {
                "must": [
                    {
                    "match": {"heading": reference_index}
                    }
                ]
            }
        }
    }
    response = conn.search(index=index_name, body=search_body)

    return response
    

ref_doc = list()
for doc in result:
    for reference_index in doc["reference"]:
        response = reference_search(es_client, "abudhabi-policies-legislations-v4", reference_index)
        result = []
        for j in response['hits']['hits']:
            content = j["_source"]
            _result = {}
            _result["score"] = j["_score"]
            _result["heading"] = content['heading']
            _result["content"] = content['content']
            _result["reference"] = content['reference']

            ref_doc.append(_result)

In [202]:
ref_doc

[{'score': 24.451162,
  'heading': 'executive council resolution no. (11) of 2023 concerning the reformation of the board of trustees of mohamed bin zayed university for humanities',
  'content': 'executive council resolution no. (11) of 2023 concerning the reformation of the board of trustees of mohamed bin zayed university for humanities having reviewed chairman of the executive council resolution no. (27) of 2020 concerning the formation of the board of trustees of mohamed bin zayed university for humanities. the executive council has decided the following: 1. the board of trustees of mohamed bin zayed university for humanities shall be reformed under the chairmanship of h.e. dr. mohammed rashid ahmed al hamli, and the membership of: • mubarak hamad mubarak al muhairi - vice chairman • mohammed hamza hassan al qassim. • maryam eid khamis al muhairi • abdullah aqueeda ali al muhairi • dr. omar habtoor theeb al-derei. • ghanem sultan ahmed al suwaidi. 2. the membership term of the boa

`2.5 Agent: Critic Agent`

In [157]:
CRITIC_PROMPT = PromptTemplate(
    template="""
    <|begin_of_text|>
    <|start_header_id|>system<|end_header_id|>  
    You are a critical evaluator tasked with assessing the relevance of retrieved documents to a user's query. For each document, determine if it contains any keywords related to the user's question.

    Guidelines:
    1. Analyze the user's question and identify key terms and concepts.
    2. Consider synonyms and related terms that may appear in relevant documents.
    3. Provide JSON response with following format: 
    {{
      "Document Title 1": true/false,
      "Document Title 2": true/false,
      ...
    }}
    4. Do not include explanations or additional text.

    Documents: {documents}
    User Question: {question}

    <|eot_id|>
    <|start_header_id|>assistant<|end_header_id|>

    <|eot_id|>
    <|start_header_id|>assistant<|end_header_id|>
    """,
    input_variables=["question", "documents"],  
)

relevency_checking = CRITIC_PROMPT | llm

In [158]:
reference_doc = list()

for doc in ref_doc:
    reference_doc.append(doc['heading'])

reference_doc

['executive council resolution no. (11) of 2023 concerning the reformation of the board of trustees of mohamed bin zayed university for humanities',
 'executive council resolution no. (11) of 2023 concerning the reformation of the board of trustees of mohamed bin zayed university for humanities',
 'law no. (1) of 2022 concerning the amendment of some provisions of law no. (1) of 2017 concerning the financial system of abu dhabi government',
 'law no. (1) of 2022 concerning the amendment of some provisions of law no. (1) of 2017 concerning the financial system of abu dhabi government',
 'chairman of the executive council resolution no. (4) of 2021 concerning the reformation of the board of directors of abu dhabi housing authority',
 'executive council resolution no. (131) of 2021 concerning the reformation of the board of trustees of the emirates college for advanced education',
 'executive council resolution no. (135) of 2021 concerning the membership of the board of trustees of khalif

In [159]:
response = relevency_checking.invoke(
    {
        "question": query, 
        "documents": reference_doc
    }
)
print(response.content)

{
  "executive council resolution no. (11) of 2023 concerning the reformation of the board of trustees of mohamed bin zayed university for humanities": true,
  "executive council resolution no. (11) of 2023 concerning the reformation of the board of trustees of mohamed bin zayed university for humanities": true,
  "law no. (1) of 2022 concerning the amendment of some provisions of law no. (1) of 2017 concerning the financial system of abu dhabi government": false,
  "law no. (1) of 2022 concerning the amendment of some provisions of law no. (1) of 2017 concerning the financial system of abu dhabi government": false,
  "chairman of the executive council resolution no. (4) of 2021 concerning the reformation of the board of directors of abu dhabi housing authority": false,
  "executive council resolution no. (131) of 2021 concerning the reformation of the board of trustees of the emirates college for advanced education": false,
  "executive council resolution no. (135) of 2021 concerning 

In [160]:
import json
conditional_response = json.loads(response.content)
conditional_response

{'executive council resolution no. (11) of 2023 concerning the reformation of the board of trustees of mohamed bin zayed university for humanities': True,
 'law no. (1) of 2022 concerning the amendment of some provisions of law no. (1) of 2017 concerning the financial system of abu dhabi government': False,
 'chairman of the executive council resolution no. (4) of 2021 concerning the reformation of the board of directors of abu dhabi housing authority': False,
 'executive council resolution no. (131) of 2021 concerning the reformation of the board of trustees of the emirates college for advanced education': False,
 'executive council resolution no. (135) of 2021 concerning the membership of the board of trustees of khalifa university of science and technology': False,
 'executive council resolution no. (136) of 2021 concerning the reformation of the board of trustees of sorbonne university abu dhabi': False,
 'executive council resolution no. (137) of 2021 concerning the appointment of

In [161]:
for topic, value in conditional_response.items():
    if value:
        all_content += f'\n{topic}: {value}'

`2.6 Agent: Response creation`

In [162]:
# Define the prompt template
prompt = PromptTemplate(
    template="""<|begin_of_text|>
    <|start_header_id|>system<|end_header_id|>  
    You are a legal assistant, your task is to answer the user question based on given documents. 

    Legal Documents: {documents}

    Response Instructions:
    1. Formulate a concise, informative response based on the extracted information for the users question.
    2. Reference specific sections or clauses of the documents to support your answer.
    3. Do not answer the question based general knowledge, if the given documents do not have sufficient information to answer the question then send response as "I don't have knowledge to answer your question"
    4. Do not start your response like Based on the provided documents,  here are the details about ...
    5. Make sure that you are including any quantitative mesures

    User Question: {question}

    <|eot_id|>
    <|start_header_id|>assistant<|end_header_id|>
    """,
    input_variables=["question", "documents"],  # Fixed typo here
)

# Create the question router
question_router = prompt | llm | StrOutputParser() 

response = question_router.invoke(
    {
        "question": query, 
        "documents": all_content  # Use the corrected variable name
    }
)

In [163]:
print(response)

The members of the Board of Trustees of Mohamed bin Zayed University for Humanities are:

1. H.E. Dr. Mohammed Rashid Ahmed Al Hamli - Chairman
2. Mubarak Hamad Mubarak Al Muhairi - Vice Chairman
3. Mohammed Hamza Hassan Al Qassim
4. Maryam Eid Khamis Al Muhairi
5. Abdullah Aqeela Ali Al Muhairi
6. Dr. Omar Habtoor Theeb Al-Derei
7. Ghanem Sultan Ahmed Al Suwaidi

The membership term of the Board shall be three renewable years. (Executive Council Resolution No. (11) of 2023, Clause 2)


In [167]:
response = question_router.invoke(
    {
        "question": query, 
        "documents": all_content  # Use the corrected variable name
    }
)

for s in question_router.stream(    {
        "question": query, 
        "documents": all_content  # Use the corrected variable name
    }
):
    print(s, end="", flush=True)

The members of the Board of Trustees of Mohamed bin Zayed University for Humanities are:

1. H.E. Dr. Mohammed Rashid Ahmed Al Hamli - Chairman
2. Mubarak Hamad Mubarak Al Muhairi - Vice Chairman
3. Mohammed Hamza Hassan Al Qassim
4. Maryam Eid Khamis Al Muhairi
5. Abdullah Aqeela Ali Al Muhairi
6. Dr. Omar Habtoor Theeb Al-Derei
7. Ghanem Sultan Ahmed Al Suwaidi

The membership term of the Board shall be three renewable years. (Executive Council Resolution No. (11) of 2023, Clause 2)

In [166]:
s

''