In [17]:
import warnings
from typing import List, Optional, Dict
from langchain_core.pydantic_v1 import BaseModel, Field

from langchain_groq import ChatGroq
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer

from langgraph.graph import END, StateGraph
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

warnings.filterwarnings("ignore", category=FutureWarning)

In [24]:
es_client = Elasticsearch(
    hosts="https://512baa73334c4127ade77bc2dfa2ef02.eastus2.azure.elastic-cloud.com:443",
    basic_auth=("elastic", "XRzYOABuG17Yv32UKAFySw41")
)
llm = ChatGroq(
    temperature=0,
    model_name="llama-3.1-8b-instant",
    # api_key="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",
    api_key="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"
)
model_name = "nomic-ai/nomic-embed-text-v1"
model = SentenceTransformer(model_name, trust_remote_code=True)

<All keys matched successfully>


In [65]:
class MultiQuerySchema(BaseModel):
    Questions: List[str] = Field(description="List of questions")


MULTI_QUERY_PROMPT = PromptTemplate(
    template="""
    <|begin_of_text|>
    <|start_header_id|>system<|end_header_id|>  
    You are an AI, specialized in generating effective multi-queries for Retrieval Augmented Generation (RAG) systems.

    Guidelines:
    1. Analyze the user's original query and generate a set of five distinct yet relevant search queries that cover:
        - Synonyms and similar terms
        - Related subtopics or specific aspects
        - Broader and narrower variations of the original query
    2. Aim to create queries that will increase the chances of retrieving diverse, relevant results by considering different interpretations and possible user intents.
    3. Provide JSON response, with list of questions like {{ "Questions": ["Question 1", "Question 2", ..] }}
    4. Provide only the list of queries without any additional explanations or commentary.

    User Query: "{user_query}"
    <|eot_id|>
    <|start_header_id|>assistant<|end_header_id|>
    """,
    input_variables=["user_query"],
)

structured_llm = llm.with_structured_output(MultiQuerySchema, method="json_mode")

multi_query_generation_chain = MULTI_QUERY_PROMPT | structured_llm

In [66]:
QUERY_ENRICHING_PROMPT = PromptTemplate(
    template="""
    <|begin_of_text|>
    <|start_header_id|>system<|end_header_id|>  
    You are an AI assistant specialized in generating Elasticsearch query strings. 
    Your task is to create the most effective query string for the given user question. 
    This query string will be used to search for relevant documents in an Elasticsearch index.

    Guidelines:
    1. Analyze the user's question carefully.
    2. Generate ONLY a query string suitable for Elasticsearch's match query.
    3. Focus on key terms and concepts from the question.
    4. Include synonyms or related terms that might be in relevant documents.
    5. Use simple Elasticsearch query string syntax if helpful (e.g., OR, AND).
    6. Do not use advanced Elasticsearch features or syntax.
    7. Do not include any explanations, comments, or additional text.
    8. Provide only the query string, nothing else.

    For the question "What is Clickthrough Data?", we would expect a response like:
    clickthrough data OR click-through data OR click through rate OR CTR OR user clicks OR ad clicks OR search engine results OR web analytics

    AND operator is not allowed. Use only OR.

    User Question: {user_query}
    <|eot_id|>
    <|start_header_id|>assistant<|end_header_id|>
    """,
    input_variables=["user_query"],
)

query_enricher_chain = QUERY_ENRICHING_PROMPT | llm | StrOutputParser()

In [92]:
RESPONSE_GENERATOR_PROMPT = PromptTemplate(
    template="""<|begin_of_text|>
    <|start_header_id|>system<|end_header_id|>  
    You are a legal assistant, your task is to answer the user question based on given documents. 

    Legal Documents: {documents}

    Response Instructions:
    1. Formulate a concise, informative response based on the extracted information for the users question.
    2. Reference specific sections or clauses of the documents to support your answer.
    3. Do not answer the question based general knowledge, if the given documents do not have sufficient information to answer the question then send response as "I don't have knowledge to answer your question"
    4. Do not start your response like Based on the provided documents,  here are the details about ...
    5. Make sure that you are including any quantitative mesures

    User Question: {question}

    <|eot_id|>
    <|start_header_id|>assistant<|end_header_id|>
    """,
    input_variables=["question", "documents"],  # Fixed typo here
)

# Create the question router
response_generation_chain = RESPONSE_GENERATOR_PROMPT | llm | StrOutputParser() 

In [83]:
from typing import List, Dict
from typing_extensions import TypedDict


class GraphState(TypedDict):
    user_query: str
    multi_query: List[str]
    embedding_vector: List[float]
    elasticsearch_keys: List[str]
    relevent_docs: str
    num_steps: int

In [96]:
def multi_query_generation(state):
    """Generate the query for user given question"""
    print("===================================================")
    print("=============> Generate Multi Query <==============")
    print("===================================================")
    user_question = state['user_query']
    num_steps = int(state['num_steps'])
    num_steps += 1

    response = multi_query_generation_chain.invoke({"user_query": user_question})
    print(response.Questions)

    return {"multi_query": response.Questions, "num_steps": num_steps}

def query_enricher(state):
    """Query Enricher"""
    print("===================================================")
    print("================> Query Enricher <=================")
    print("===================================================")
    multi_query = state['multi_query']
    num_steps = int(state['num_steps'])
    num_steps += 1

    unique_enriched_keys = set()

    for query in multi_query:
        enriched_query = query_enricher_chain.invoke({"user_query": query})
        parsed_or_query = [term.strip() for term in enriched_query.split(' OR ')]
        unique_enriched_keys.update(parsed_or_query)

    unique_enriched_keys = list(unique_enriched_keys)
    print(unique_enriched_keys)

    return {"elasticsearch_keys": unique_enriched_keys, "num_steps": num_steps}  

def embedding_creation(state):
    """Embedding creation"""
    print("===================================================")
    print("==============> Embedding Creation <===============")
    print("===================================================")
    user_question = state['user_query']
    num_steps = int(state['num_steps'])
    num_steps += 1

    model_name = "nomic-ai/nomic-embed-text-v1"
    model = SentenceTransformer(model_name, trust_remote_code=True)

    embedding = model.encode(user_question).tolist()

    return {"embedding_vector": embedding, "num_steps": num_steps} 

def elasticsearch(state):
    """Elasticsearch"""
    print("===================================================")
    print("================> Elastic Search <=================")
    print("===================================================")
    user_question = state['user_query']
    embedding = state['embedding_vector']
    elasticsearch_keys = state['elasticsearch_keys']
    num_steps = int(state['num_steps'])
    num_steps += 1

    search_body = {
            "knn": {
                "field": "embedded_content",
                "query_vector": embedding,
                "k": 5,
                "num_candidates": 50
            },
            "query": {
                "bool": {
                    "must": [
                        {
                            "multi_match": {
                                "query": " ".join(elasticsearch_keys),  
                                "fields": ["content", "heading", "questions", "reference"],
                                "type": "best_fields",
                                "operator": "or" 
                            }
                        }
                    ],
                    "should": [
                        {
                            "script_score": {
                                "query": {"match_all": {}},
                                "script": {
                                    "source": """
                                    double vector_score = cosineSimilarity(params.query_vector, params.vector_field) + 1.0;
                                    double text_score = _score;
                                    return 0.7 * vector_score + 0.3 * text_score;
                                    """,
                                    "params": {
                                        "query_vector": embedding,  
                                        "vector_field": "embedded_content"
                                    }
                                }
                            }
                        }
                    ]
                }
            }
        }
    conn = Elasticsearch(
        hosts="https://512baa73334c4127ade77bc2dfa2ef02.eastus2.azure.elastic-cloud.com:443",
        basic_auth=("elastic", "XRzYOABuG17Yv32UKAFySw41")
    )
    response = conn.search(index="abudhabi-policies-legislations-v4", body=search_body, size=5)

    all_content = ""
    for j in response['hits']['hits']:
        content = j["_source"]
        _result = {}
        _result["score"] = j["_score"]
        _result["heading"] = content['heading']
        _result["content"] = content['content']
        all_content = all_content + '\n' + content['content']
        _result["reference"] = content['reference']

        print(_result)

    return {"relevent_docs": all_content, "num_steps": num_steps} 

def response_generate(state):
    """Generate the response for the users query"""
    print("===================================================")
    print("=============> Response Generation <===============")
    print("===================================================")
    user_question = state['user_query']
    relevent_docs = state['relevent_docs']
    num_steps = int(state['num_steps'])
    num_steps += 1

    response = response_generation_chain.invoke({"question": user_question, "documents": relevent_docs})
    print(response)

    return {"relevent_docs": response, "num_steps": num_steps}

In [97]:
workflow = StateGraph(GraphState)

workflow.add_node("multi_query_generation", multi_query_generation)
workflow.add_node("query_enricher", query_enricher)
workflow.add_node("embedding_creation", embedding_creation)
workflow.add_node("elasticsearch", elasticsearch)
workflow.add_node("response_generate", response_generate)

workflow.set_entry_point("multi_query_generation")

workflow.add_edge("multi_query_generation", "query_enricher")
workflow.add_edge("query_enricher", "embedding_creation")
workflow.add_edge("embedding_creation", "elasticsearch")
workflow.add_edge("elasticsearch", "response_generate")
workflow.add_edge("query_enricher", END)

app_workflow = workflow.compile()

In [98]:
question = "List of Board of Trustees of Mohamed bin Zayed University?"

result = app_workflow.invoke({"user_query": question, "num_steps": 0}, {"recursion_limit": 10})

['Mohamed bin Zayed University Board of Directors', 'Members of the Board of Trustees of Mohamed bin Zayed University', 'List of Trustees of Mohamed bin Zayed University Abu Dhabi', 'Mohamed bin Zayed University governing body members', 'Who are the members of the Board of Trustees of Mohamed bin Zayed University?']
['Abu Dhabi University', 'Mohammed bin Zayed University', 'Board of Trustees', 'university trustee', 'administration', 'Board', 'academy', 'academic staff', 'University', 'university faculty', 'trustee board', 'of', 'Directors', 'college', 'bin', 'Mohamed bin Zayed', 'university board', 'school board', 'school', 'MBZ', 'Mohamed bin Zayed University Abu Dhabi', 'Trustees', 'Zayed', 'University Trustees', 'board members', 'university', 'University of Abu Dhabi', 'university administration', 'Mohamed bin Zayed University', 'MBZU', 'Mohamed', 'education', 'governing body', 'institution', 'university members', 'members', 'faculty members']


<All keys matched successfully>




  response = conn.search(index="abudhabi-policies-legislations-v4", body=search_body, size=5)


{'score': 154.49675, 'heading': 'executive council resolution no. (11) of 2023 concerning the reformation of the board of trustees of mohamed bin zayed university for humanities', 'content': 'executive council resolution no. (11) of 2023 concerning the reformation of the board of trustees of mohamed bin zayed university for humanities having reviewed chairman of the executive council resolution no. (27) of 2020 concerning the formation of the board of trustees of mohamed bin zayed university for humanities. the executive council has decided the following: 1. the board of trustees of mohamed bin zayed university for humanities shall be reformed under the chairmanship of h.e. dr. mohammed rashid ahmed al hamli, and the membership of: • mubarak hamad mubarak al muhairi - vice chairman • mohammed hamza hassan al qassim. • maryam eid khamis al muhairi • abdullah aqueeda ali al muhairi • dr. omar habtoor theeb al-derei. • ghanem sultan ahmed al suwaidi. 2. the membership term of the board sh