`1. Initializing Elasticsearch and LLM client connection`

In [17]:
import os
from elasticsearch import Elasticsearch, helpers
from langchain_groq import ChatGroq

es_client = Elasticsearch(
        hosts="https://628c5bfd5b844abf882a8a6e24a04191.eastus2.azure.elastic-cloud.com:443",
        basic_auth=("elastic", "IjmfvKZxmDUfe6nksmfutJwy")
)

es_client.ping()

llm = ChatGroq(
    temperature=0,
    model_name="llama-3.1-8b-instant",#"llama3-70b-8192",
    api_key="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",
)

In [18]:
llm.invoke("Hello")

AIMessage(content='Hello. How can I assist you today?', additional_kwargs={}, response_metadata={'token_usage': {'completion_tokens': 10, 'prompt_tokens': 36, 'total_tokens': 46, 'completion_time': 0.013333333, 'prompt_time': 0.002118064, 'queue_time': 0.013077486000000001, 'total_time': 0.015451397}, 'model_name': 'llama-3.1-8b-instant', 'system_fingerprint': 'fp_f66ccb39ec', 'finish_reason': 'stop', 'logprobs': None}, id='run-eee4b671-6e0a-4f29-be42-088e42257682-0', usage_metadata={'input_tokens': 36, 'output_tokens': 10, 'total_tokens': 46})

In [19]:
es_client.ping()

True

`2. Parsing the document and creating documents`

`2.1 Extracting Table of Content from the Doc`

In [20]:
from langchain.prompts import PromptTemplate
from PyPDF2 import PdfReader

from typing import List, Optional, Dict
from langchain_core.pydantic_v1 import BaseModel, Field
# from pydantic import BaseModel, Field
from langchain_core.output_parsers import JsonOutputParser

class Index(BaseModel):
    Laws: Optional[List[str]] = Field(description="List of laws present in the legal document")
    Circulars: Optional[List[str]] = Field(description="List of circulars present in the legal document")
    Decrees: Optional[List[str]] = Field(description="List of decrees present in the legal document")
    ExecutiveCouncilResolutions: Optional[List[str]] = Field(description="List of executive council resolutions present in the legal document")


TOC_PROMPT = PromptTemplate(
    template="""
    <|begin_of_text|>
    <|start_header_id|>system<|end_header_id|>  
    You are an AI assistant specialized in identifying table of contents from the legal document paragraph.

    Guidelines:
    1. Thoroughly analyze the given document paragraph and generate the table of content.
    2. Table of content specifically target
        - Laws
        - Circulars
        - Decrees
        - ExecutiveCouncilResolutions
    3. Provide JSON response, where each dictionary key represents a primary topic (e.g., "Executive Council Resolutions," "Laws") and 
       each corresponding value contains a list of associated subtopics.
    4. Provide only the dictionary as the output, without any additional text, explanations, or commentary.
    5. If any primary topic is empty then exclude from the response and do not keep empty list.
    6. Do not change the format of the primary topic and associated subtopics, i.e., Punctuation, Comma, Full stop, quotation mark

    Sample response format:
    {{
      "Laws": ["Law No. (2) of 2022 Concerning the Reorganisation of the National Rehabilitation Centre - Abu Dhabi"],
      "ExecutiveCouncilResolutions": ["Chairman of the Executive Council Resolution No. (3) of 2022 Concerning the Appointment of the Director-General for Tourism"],
      "Circulars": ["Circular No. (1) of 2022 Concerning the Policy of Providing an Easy and Effortless Customer Experience in the Emirate of Abu Dhabi"],
      "Decrees": ["Amiri Decree No. (13) of 2023 Concerning Sending a Judge to Retirement"]
    }}

    Legal document paragraph: {paragraph}
    
    <|eot_id|>
    <|start_header_id|>assistant<|end_header_id|>
    """,
    input_variables=["paragraph"],
)

structured_llm = llm.with_structured_output(Index, method="json_mode")

toc_extractor = TOC_PROMPT | structured_llm

def table_of_content(reader):
    for page_no in range(1, 4):
        page = reader.pages[page_no]
        text = page.extract_text()
        print(text)
        return text

def parse_document(pdf_path):
    with open(pdf_path, 'rb') as file:

        reader = PdfReader(file)
        num_pages = len(reader.pages)

        index_text = table_of_content(reader)
        toc_data = toc_extractor.invoke({"paragraph": index_text})

        return toc_data


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


In [21]:
reader = PdfReader("../doc/2021/3English2021.pdf")

num_pages = len(reader.pages)
print(num_pages)
index_text = table_of_content(reader)
for page_no in range(1, 4):
    page = reader.pages[page_no]
    text = page.extract_text()
    print(text)


100
1The Official Gazette
The Third Edition - The Fiftieth Year
Contents
Executive Council Resolutions
Executive Council Resolution No. (30) of 2021 Concerning the 
Replacement of a Member of the Board of Trustees of Sorbonne 
University Abu Dhabi.
Circulars
Circular No. (2) of 2021 Concerning the Regulatory Framework for 
Government Policies in the Emirate of Abu Dhabi.Pages
5
9
1The Official Gazette
The Third Edition - The Fiftieth Year
Contents
Executive Council Resolutions
Executive Council Resolution No. (30) of 2021 Concerning the 
Replacement of a Member of the Board of Trustees of Sorbonne 
University Abu Dhabi.
Circulars
Circular No. (2) of 2021 Concerning the Regulatory Framework for 
Government Policies in the Emirate of Abu Dhabi.Pages
5
9

3 
Executive Council 
Resolutions
Executive Council Resolutions


In [22]:
directory_path = "../doc/2021/"
table_of_content_list = list()

for filename in os.listdir(directory_path):
    print(f"Processing: {filename}")
    if filename.endswith('.pdf'): 
        file_path = os.path.join(directory_path, filename) 
        
        output = parse_document(file_path)
        table_of_content_list.append(output)
        print(f"Laws: {output.Laws}")
        print(f"Executive Council Resolutions: {output.ExecutiveCouncilResolutions}")
        print(f"Circulars: {output.Circulars}")
        print(f"Decrees: {output.Decrees}")

Processing: 3English2021.pdf
1The Official Gazette
The Third Edition - The Fiftieth Year
Contents
Executive Council Resolutions
Executive Council Resolution No. (30) of 2021 Concerning the 
Replacement of a Member of the Board of Trustees of Sorbonne 
University Abu Dhabi.
Circulars
Circular No. (2) of 2021 Concerning the Regulatory Framework for 
Government Policies in the Emirate of Abu Dhabi.Pages
5
9
Laws: None
Executive Council Resolutions: None
Circulars: ['Circular No. (2) of 2021 Concerning the Regulatory Framework for Government Policies in the Emirate of Abu Dhabi.']
Decrees: None
Processing: 4English2021.pdf
1The Official Gazette
The Fourth Edition - The Fiftieth Year
Contents
Laws
Law No. (2) of 2021 Concerning The Amendment of Some Provisions 
of Law No. (2) of 2000 concerning Civil Retirement Pensions and 
Benefits in the Emirate of Abu Dhabi.
Crown Prince Chairman of the Executive Council Resolutions 
Chairman of the Executive Council Resolution No. (5) of 2021 
Concerni

In [23]:
def table_of_content(reader):
    for page_no in range(1, 4):
        page = reader.pages[page_no]
        text = page.extract_text()
        return text

def parse_document(pdf_path):
    with open(pdf_path, 'rb') as file:

        reader = PdfReader(file)
        num_pages = len(reader.pages)

        index_text = table_of_content(reader)
    
    return index_text

# index_text = parse_document("../doc/2021/3English2021.pdf")
# index_text

`2.2 Text Extraction and Doc Parsing`

In [24]:
from langchain.prompts import PromptTemplate
from PyPDF2 import PdfReader

from typing import List, Optional, Dict
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.output_parsers import JsonOutputParser

class Index(BaseModel):
    Laws: Optional[List[str]] = Field(description="List of laws present in the legal document")
    Circulars: Optional[List[str]] = Field(description="List of circulars present in the legal document")
    Decrees: Optional[List[str]] = Field(description="List of decrees present in the legal document")
    ExecutiveCouncilResolutions: Optional[List[str]] = Field(description="List of executive council resolutions present in the legal document")


TOC_PROMPT = PromptTemplate(
    template="""
    <|begin_of_text|>
    <|start_header_id|>system<|end_header_id|>  
    You are an AI assistant specialized in identifying table of contents from the legal document paragraph.

    Guidelines:
    1. Thoroughly analyze the given document paragraph and generate the table of content.
    2. Table of content specifically target
        - Laws
        - Circulars
        - Decrees
        - ExecutiveCouncilResolutions
    3. Provide JSON response, where each dictionary key represents a primary topic (e.g., "Executive Council Resolutions," "Laws") and 
       each corresponding value contains a list of associated subtopics.
    4. Provide only the dictionary as the output, without any additional text, explanations, or commentary.
    5. If any primary topic is empty then exclude from the response and do not keep empty list.

    Sample response format:
    {{
      "Laws": ["Law No. (2) of 2022 Concerning the Reorganisation of the National Rehabilitation Centre - Abu Dhabi"],
      "ExecutiveCouncilResolutions": ["Chairman of the Executive Council Resolution No. (3) of 2022 Concerning the Appointment of the Director-General for Tourism"],
      "Circulars": ["Circular No. (1) of 2022 Concerning the Policy of Providing an Easy and Effortless Customer Experience in the Emirate of Abu Dhabi"],
      "Decrees": ["Amiri Decree No. (13) of 2023 Concerning Sending a Judge to Retirement"]
    }}

    Legal document paragraph: {paragraph}
    
    <|eot_id|>
    <|start_header_id|>assistant<|end_header_id|>
    """,
    input_variables=["paragraph"],
)

structured_llm = llm.with_structured_output(Index, method="json_mode")

toc_extractor = TOC_PROMPT | structured_llm

def table_of_content(reader):
    for page_no in range(1, 4):
        page = reader.pages[page_no]
        text = page.extract_text()
        return text

def parse_document(pdf_path):
    with open(pdf_path, 'rb') as file:

        reader = PdfReader(file)
        num_pages = len(reader.pages)

        index_text = table_of_content(reader)
        toc_data = toc_extractor.invoke({"paragraph": index_text})

        return toc_data
        

In [25]:
import os
import re
import json
from PyPDF2 import PdfReader

def remove_dot(text):
    return text[:-1] if text.endswith('.') else text

def extrating_content(key_topics, output_lower, final_output):

    for i in range(len(key_topics)):

        current_topic = key_topics[i]
        current_topic = remove_dot(current_topic)
        current_topic_lower = current_topic.lower()
        current_topic = current_topic_lower.replace("'", '"')

        if i < len(key_topics) - 1:
            next_topic = key_topics[i + 1]
            next_topic = remove_dot(next_topic)
            next_topic_lower = next_topic.lower()
            
            next_topic = next_topic_lower.replace("'", '"')

            start = output_lower.find(current_topic)
            end = output_lower.find(next_topic)

            substring = output_lower[start:end]
        else:
            start = output_lower.find(current_topic) + len(current_topic)
            end = len(output_lower)
            substring = output_lower[start:end]

        final_output[current_topic] = substring
    
    return final_output


def parse_document(pdf_path):

    # Reading the file from the pdf_path
    with open(pdf_path, 'rb') as file:

        overall_content = ""
        key_topics = list()
        topic_wise_content = {}

        # Initialize the PyPDF function
        reader = PdfReader(file)
        num_pages = len(reader.pages)

        # Extracting table of content
        index_text = table_of_content(reader)
        toc_data = toc_extractor.invoke({"paragraph": index_text})
        toc = json.loads(toc_data.json(exclude_none=True))
        for key, value in toc.items():
            key_topics.extend(value)
        
        # Extracting remain document content and preparing 
        for page_no in range(3, num_pages):
            page = reader.pages[page_no]
            text = page.extract_text()
            if text and text[0].isdigit():
                text = text[1:]
                text = re.sub(r'^\d+', '', text)

            cleaned_text = re.sub(r'[\n\r\t]', ' ', text)
            cleaned_text = cleaned_text.replace("  ", " ")
            cleaned_text = cleaned_text.replace("  ", " ")
            cleaned_text = cleaned_text.replace("”", "’")
            cleaned_text = cleaned_text.replace("“", "‘")

            overall_content = overall_content + " " + cleaned_text
            overall_content = overall_content.strip()
        
        # Extract the topic wise content from the document
        topic_wise_content = extrating_content(key_topics, overall_content.lower(), topic_wise_content)
        
        return toc, key_topics, overall_content, topic_wise_content
    
# toc, key_topics, overall_content, topic_wise_content = parse_document("../doc/2021/3English2021.pdf")

In [26]:
print(f"Table of content:\n{toc}\n\nIndex:\n{key_topics}\nOverall content:\n{overall_content}\n\nTopic wise Content:\n{topic_wise_content}")

NameError: name 'toc' is not defined

In [59]:
topic_wise_content

 'executive council resolution no. (247) of 2023 concerning the expansion of the boundaries of the twentieth investment zone in the emirate of abu dhabi': '  having reviewed the executive council resolution no. (9) of 2019 concerning the twentieth investment zone in the emirate of abu dhabi, the executive council has decided the following: 1. approve the expansion of the boundaries of the twentieth investment zone in the emirate of abu dhabi, according to the attached plan, to include plots of lands belonging to land (c2) from (c2-1) to (c2-67), provided that the ownership percentage of non-nationals therein does not exceed (50%) of the real estate units. 2. instruct the department of municipalities and transport to take the necessary measures in conformity with the applicable regulations. saif saeed ghobash secretary-general issued on: 18 december 2023 corresponding to: 05 jumada al akhir 1445 hijri united arab emirates the emirate of abu dhabi issued by the gereral secretariat of the

`2.3 Initializing Embedding Model and function`

In [27]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
from sentence_transformers import SentenceTransformer

model_name = "nomic-ai/nomic-embed-text-v1"
model = SentenceTransformer(model_name, trust_remote_code=True)

def embedding_creation(input_texts, model):
    embedding = model.encode(input_texts)
    return embedding.tolist()

  from .autonotebook import tqdm as notebook_tqdm
<All keys matched successfully>


In [61]:
for topic, content in topic_wise_content.items():
    embedding = embedding_creation(content, model)
    print(f"Topic: {topic}\n")
    print(f"Content: {content}\n")
    print(f"Embedding: {embedding}\n")

Topic: law no. (20) of 2023 concerning the establishment of abu dhabi hazardous materials management centre


Embedding: [0.0266974326223135, 0.003605922684073448, -0.0016061211936175823, -0.036304254084825516, 0.049667999148368835, 0.01602492853999138, -0.009421409107744694, 0.008556302636861801, -0.01823185570538044, -0.0026875389739871025, -0.007426863070577383, 0.02563253603875637, 0.01829666830599308, -0.036809369921684265, 0.03575807064771652, -0.021337181329727173, 0.0049580419436097145, 0.004827751312404871, -0.0446166917681694, -0.017318667843937874, -0.05366512015461922, -0.064052514731884, -0.037321776151657104, -0.029014313593506813, 0.08887223154306412, -0.048279035836458206, 0.02836748957633972, 0.006409407127648592, 0.011144878342747688, 0.03610839694738388, 0.03016546368598938, -0.06818887591362, -0.053239356726408005, -0.04805111885070801, 0.0025978144258260727, -0.0962536558508873, 0.056563250720500946, 0.007663312833756208, 0.017947740852832794, 0.0279853418469429, 0

#### Reference
+ https://www.nomic.ai/blog/posts/nomic-embed-text-v1

`2.4 Data Enrichment`

In [31]:
class ReferenceSchema(BaseModel):
    Reference: Optional[List[str]] = Field(description="List of references like laws, Resolutions present in the legal document")
    Questions: Optional[List[str]] = Field(description="List of LLM generated questions")

REFERENCE_EXTRACTION_PROMPT = PromptTemplate(
    template="""
    <|begin_of_text|>
    <|start_header_id|>system<|end_header_id>
    You are an expert in analyzing legal document and identifying reference i.e., laws, resolutions from the provided document and Generating questions for Retrieval-Augmented Generation (RAG) systems

    Guidelines for reference identifying:
    1. Provide only references without additional information or summaries.
    2. Provide JSON response as a reference, corresponding value contains a list of associatedFormat the output as a Python list, e.g., ["resolution no. (106) of 2021 on economic licensing fees", "chairman of the executive council resolution no. (4) of 2021 concerning the reformation of the board of directors of abu dhabi housing authority", "law no. (1) of 1974 concerning the reorganisation of the governmental body in the emirate of abu dhabi and its amendments"].
    3. Make sure that you are including the resolution and law concerning in short summary
    4. Exclude self-reference {index} from the output.
    5. Present only law or reference numbers and their years.
    6. If a reference is not a law or resolution, ignore it and keep the empty Python list, e.g., ["Law No. (2) of 2022", "Executive Council Resolution No. (3) of 2022", "Amiri Decree No. (13) of 2023", "Circular No. (1) of 2022"]

    Guidelines for generating questions:
    1. Thoroughly analyze the entire document.
    2. Generate exactly 10 questions that cover various aspects and levels of complexity within the document's content.
    3. Create questions that specifically target:
      - Key facts and information
      - Main concepts and ideas
      - Relationships between different parts of the content
      - Potential applications or implications of the information
      - Comparisons or contrasts within the document
    4. Ensure questions require answers of varying lengths and complexity, from simple retrieval to more complex synthesis.
    5. Include questions that might require combining information from different parts of the document.
    6. Frame questions to test both literal comprehension and inferential understanding.
    7. Avoid yes/no questions; focus on open-ended questions that promote comprehensive answers.
    8. Consider including questions that might require additional context or knowledge to fully answer, to test the RAG system's ability to combine retrieved information with broader knowledge.
    9. Output only the ten questions, without any additional text, explanations, or answers.
    10.Provide JSON response as a reference, corresponding value contains a list of associatedFormat the output as a Python list

    Sample response format:
    {{
      "Reference": ["Law No. (2) of 2022", "Executive Council Resolution No. (3) of 2022", "Amiri Decree No. (13) of 2023", "Circular No. (1) of 2022", "law no. (1) of 1974", "law no. (2) of 2000"],
      "Questions": ["What is the primary purpose of Law No. (12) of 2021, as stated in the document?" , "What are the laws and regulations reviewed by the ruler of Abu Dhabi before issuing Law No. (12) of 2021?"]
    }}

    The document references {index} and contains the following content: {paragraph}
    
    <|eot_id|>
    <|start_header_id|>assistant<|end_header_id|>
    """,
    input_variables=["index", "paragraph"],
)

structured_llm = llm.with_structured_output(ReferenceSchema, method="json_mode")

reference_extraction = REFERENCE_EXTRACTION_PROMPT | structured_llm

In [32]:
import time
for topic, content in topic_wise_content.items():
    print(f"Topic: {topic}\n")
    print(f"Content: {content}\n")
    print(f"Content length: {len(content)}")
    reference = reference_extraction.invoke({"index":topic, "paragraph": content})
    print(f"Reference: {reference.Reference}\n")
    print(f"Questions: {reference.Questions}\n")
    # time.sleep(60)

Topic: law no. (20) of 2020 concerning the establishment of mohamed bin zayed university for humanity sciences

Content: law no. (20) of 2020 concerning the establishment of mohamed bin zayed university for humanity sciences  we, khalifa bin zayed al nahyan, ruler of abu dhabi having reviewed: • law no. (1) of 1974 concerning the reorganisation of the governmental body in the emirate of abu dhabi and its amendments; • law no. (2) of 1971 concerning the national advisory council, and its amendments; • law no. (2) of 2000 concerning civil retirement pensions and benefits in the emirate of abu dhabi, and its amendments; • law no. (2) of 2013 concerning the establishment of mohammed v university/ abu dhabi, and its amendments; • law no. (6) of 2016 concerning the human resources in the emirate of abu dhabi and its amendments; • law no. (1) of 2017 concerning the financial system of abu dhabi government; • law no. (9) of 2018 concerning the establishment of the department of education and k

In [37]:
import os
directory_path = '../doc/2020'

final_index_document = []

for filename in os.listdir(directory_path):
    print(filename)

9English2020.pdf


In [38]:
# directory_path = '../../doc/'

final_index_document = []

for filename in os.listdir(directory_path):
    if filename.endswith('.pdf'): 

        file_path = os.path.join(directory_path, filename) 
        print(f"Processing File Name: {file_path}\n")

        toc, key_topics, overall_content, topic_wise_content = parse_document(file_path)
        print(f"Table of Content:\n{toc}\n")
        print(f"Key Topics:\n{key_topics}\n")
        print(f"Overall Content:\n{overall_content}\n")
        print(f"Topic-wise Content:\n{topic_wise_content}\n")

        for key, value in topic_wise_content.items():
            print(f"Topic:\n{key}\n")
            if len(value) > 50000:
                print(f"Skipped: {key}")
                continue

            embedding_text = embedding_creation(value, model)
            print(f"Embedding created\n")

            data_enrich = reference_extraction.invoke(
                    {
                        "index": key, 
                        "paragraph": value
                    }
            )
            references = data_enrich.Reference
            print(f"Reference:\n{references}\n")

            questions = reference.Questions
            print(f"Questions:\n{questions}\n")

            final_index_document.append(
                {
                    "heading": key,
                    "content": value,
                    "reference": references,
                    "questions": questions,
                    "embedded_content": embedding_text,
                    "document": filename
                }
            )

Processing File Name: ../doc/2020\9English2020.pdf

Table of Content:
{'Laws': ['Law No. (19) of 2020 Concerning the Reorganisation of Abu Dhabi Accountability Authority.'], 'Circulars': [], 'Decrees': [], 'ExecutiveCouncilResolutions': ['Executive Council Resolution No. (134) of 2020 Concerning the Transfer of Affiliation of the Statistics Centre − Abu Dhabi.', 'Executive Council Resolution No. (135) of 2020 Concerning the Transfer of Some Competencies of Abu Dhabi Agriculture and Food Safety Authority to Abu Dhabi Developmental Holding Company PJSC.', 'Executive Council Resolution No. (141) of 2020 Concerning the Exempt of the Children of Workers in the First Line of Defence in the Health Sector from Tuition Fees.', 'Executive Council Resolution No. (144) of 2020 Concerning Subsidizing the Taxi Transport Sector in the Emirate of Abu Dhabi.', 'Executive Council Resolution No. (145) of 2020 Concerning the Emirates Nuclear Energy Corporation.', 'Executive Council Resolution No. (146) of

In [39]:
len(final_index_document)

8

`3. Creating new index in Elastic Search`

In [40]:
resp = helpers.bulk(
    es_client,
    final_index_document,
    chunk_size=20, 
    timeout="100m",
    index = "abudhabi-policies-legislations-latest",
)