`Loading Docling related modules`

In [88]:
from pathlib import Path
from docling.chunking import HybridChunker
from docling_core.types.doc import ImageRefMode
from docling.datamodel.base_models import InputFormat
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.document_converter import DocumentConverter, PdfFormatOption, WordFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode, EasyOcrOptions

from langchain_docling import DoclingLoader
from langchain_docling.loader import ExportType
from langchain_core.prompts import PromptTemplate
from langchain_text_splitters import MarkdownHeaderTextSplitter

from elasticsearch import Elasticsearch
from langchain_elasticsearch import ElasticsearchStore

`Initialize the Docling converter object`

In [89]:
pipeline_options = PdfPipelineOptions(
    do_ocr=True,
    images_scale=1.0,
    do_table_structure=True,
    generate_page_images=True,
    generate_picture_images=True,
    ocr_options=EasyOcrOptions(
        lang=["en"], 
        use_gpu=True,
        force_full_page_ocr=True, 
    ),
    table_structure_options=dict(
        do_cell_matching=False,
        mode=TableFormerMode.ACCURATE
    ),
)

doc_converter = DocumentConverter(
    allowed_formats=[
        InputFormat.PDF,
        InputFormat.IMAGE,
        InputFormat.DOCX,
        InputFormat.HTML,
        InputFormat.XLSX,
        InputFormat.PPTX,
        InputFormat.ASCIIDOC,
        InputFormat.MD,
    ],
    format_options={
        InputFormat.DOCX: WordFormatOption(
            pipeline_cls=SimplePipeline
        ),
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=pipeline_options
        ),
    }
)

`Parse the document and create markdown file (Optional**)`

In [90]:
# result = doc_converter.convert(
#     source="../data/table.pdf"
# )
# _filename = Path(f"{result.input.file.stem}-with-image-refs.md")
# result.document.save_as_markdown(_filename, image_mode=ImageRefMode.REFERENCED)

`Input Variables`

In [94]:
FILE_PATH = "../data/First Edition Arabic 2023-44.pdf"

TOP_K = 3
PROMPT = PromptTemplate.from_template(
    """
    Context information is below.
    {context}
    Given the context information and not prior knowledge, answer the query.
    Query: {input}
    Answer:
    """,
)
EXPORT_TYPE = ExportType.DOC_CHUNKS if FILE_PATH.endswith(".xlsx") else ExportType.MARKDOWN

`Create Object for Langchain Docling Loader and load the document`

In [95]:
docling_loaded_file = DoclingLoader(
    file_path=FILE_PATH,
    converter=doc_converter,
    export_type=EXPORT_TYPE,
).load()

`Create the documents for the indexing from the Docling loaded documents`

In [96]:
if EXPORT_TYPE == ExportType.DOC_CHUNKS:
    documents = docling_loaded_file
elif EXPORT_TYPE == ExportType.MARKDOWN:
    splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[("#", "Header_1"),("##", "Header_2"),("###", "Header_3"),],
    )
    documents = [split for doc in docling_loaded_file for split in splitter.split_text(doc.page_content)]
else:
    raise ValueError(f"Unexpected export type: {EXPORT_TYPE}")

documents[:3]

[Document(metadata={'Header_2': '2006 aiuuJ (25)'}, page_content="- ri\n- LoJ 2 L699 'U QnJl\n- 49\n- L2i")]

In [68]:
for idx in documents[:3]:
    print(idx.metadata)
    print(idx.page_content)
    print("\n")

{'source': '../data/EmployeeSampleData/Employee Sample Data.xlsx', 'dl_meta': {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/tables/0', 'parent': {'$ref': '#/groups/0'}, 'children': [], 'label': 'table', 'prov': []}], 'origin': {'mimetype': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'binary_hash': 17978778108143012858, 'filename': 'Employee Sample Data.xlsx'}}}
EEID, 1 = Full Name. EEID, 2 = Job Title. EEID, 3 = Department. EEID, 4 = Business Unit. EEID, 5 = Gender. EEID, 6 = Ethnicity. EEID, 7 = Age. EEID, 8 = Hire Date. EEID, 9 = Annual Salary. EEID, 10 = Bonus %. EEID, 11 = Country. EEID, 12 = City. EEID, 13 = Exit Date. E02387, 1 = Emily Davis. E02387, 2 = Sr. Manger. E02387, 3 = IT. E02387, 4 = Research & Development. E02387, 5 = Female. E02387, 6 = Black. E02387, 7 = 55. E02387, 8 = 2016-04-08 00:00:00. E02387, 9 = 141604. E02387, 10 = 0.15. E02387, 11 = United States. E02387, 12 = Seattle. E0

In [40]:
elasticsearch_client = Elasticsearch(
    hosts="https://360399a232d44176a9811dcf98c25240.eastus2.azure.elastic-cloud.com:443",
    basic_auth=("elastic", "zfNThaF3pna6JHmcGPIsntKP")
)

elasticsearch_client.ping()

True

In [10]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [29]:
ElasticsearchStore.from_documents(
    texts=documents,
    embedding=embedding,
    es_connection=elasticsearch_client,
    index_name="test_index_xlsx_1",
)

BulkIndexError: 269 document(s) failed to index.

In [73]:
QUERY = "List of employee under Sales depat"

query = {
    "match_all": {}
}
response = elasticsearch_client.search(index='xlsx_index', query=query, size=20)
print(response)

{'took': 0, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 1207, 'relation': 'eq'}, 'max_score': 1.0, 'hits': [{'_index': 'xlsx_index', '_id': 'phLqbpQBXUp6I3aU7ro2', '_score': 1.0, '_source': {'EEID': 'E02387', 'Full Name': 'Emily Davis', 'Job Title': 'Sr. Manger', 'Department': 'IT', 'Business Unit': 'Research & Development', 'Gender': 'Female', 'Ethnicity': 'Black', 'Age': 55, 'Hire Date': '2016-04-08T00:00:00', 'Annual Salary': 141604, 'Bonus %': 0.15, 'Country': 'United States', 'City': 'Seattle', 'Exit Date': '2021-10-16T00:00:00'}}, {'_index': 'xlsx_index', '_id': 'rRLqbpQBXUp6I3aU7ro2', '_score': 1.0, '_source': {'EEID': 'E04332', 'Full Name': 'Luke Martin', 'Job Title': 'Analyst', 'Department': 'Finance', 'Business Unit': 'Manufacturing', 'Gender': 'Male', 'Ethnicity': 'Black', 'Age': 25, 'Hire Date': '2020-05-16T00:00:00', 'Annual Salary': 41336, 'Bonus %': 0, 'Country': 'United States', 'City': 'Miami', 'E

In [74]:

content =""
results = response["hits"]["hits"]

for result in results: 
    content = content + "\n" + result["_source"]["text"]
    print(result["_source"]["text"])
    print("=============================================\n")

KeyError: 'text'

In [61]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(
    model="Phi-4",
    temperature=0,
    api_key="dummy-key",  
    base_url="http://127.0.0.1:8080/v1"
)

In [62]:
from langchain.prompts import ChatPromptTemplate
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system", 
            """
            You are an assistant for question-answering tasks. 
            Use the following pieces of retrieved context to answer the question. 
            If you don't know the answer, just say that you don't know. Keep the answer concise. 
            Context: {context} 
            """
        ),
        ("human", "Question: {question} Answer/s: ")
    ]
)

In [63]:
chain = prompt | llm

In [23]:


chain = prompt | llm

answer = chain.invoke(
    {
        "context": content,
        "question": QUERY
    }
)
print(answer.content)

Guy Holland is the Global Leader of KPMG's CIO Center of Excellence and the Group Executive of Technology and Group Chief Information Officer at the Commonwealth Bank of Australia.


`Excel`

In [36]:
import openpyxl
from elasticsearch import Elasticsearch, helpers
import json
from elasticsearch.helpers import bulk

xlsx_file = "../data/EmployeeSampleData/Employee Sample Data.xlsx"

workbook = openpyxl.load_workbook(xlsx_file)
sheet = workbook.active

def prepare_data(sheet):
    headers = [cell.value for cell in sheet[1]]
    documents = []

    for row in sheet.iter_rows(min_row=2, values_only=True):
        doc = {}
        for col, value in zip(headers, row):
            doc[col] = value
        documents.append(doc)

    return documents

documents = prepare_data(sheet)

In [42]:
import openpyxl
from elasticsearch import Elasticsearch, helpers
import json
from datetime import datetime

# # Initialize Elasticsearch client
elasticsearch_client = Elasticsearch(
    hosts="https://360399a232d44176a9811dcf98c25240.eastus2.azure.elastic-cloud.com:443",
    basic_auth=("elastic", "zfNThaF3pna6JHmcGPIsntKP")
)

elasticsearch_client.ping()

# Path to your .xlsx file
xlsx_file = "../data/EmployeeSampleData/Employee Sample Data.xlsx"

# Load the workbook and select the active sheet
workbook = openpyxl.load_workbook(xlsx_file)
sheet = workbook.active

# Helper function to handle empty dates and parse valid dates
def parse_date(date_value):
    if not date_value or date_value == '':  # Handle empty or None dates
        return None  # You can set a default date like '1970-01-01' if required
    try:
        # Attempt to parse the date (assuming the format yyyy-MM-dd)
        return datetime.strptime(str(date_value), "%Y-%m-%d").date()
    except ValueError:
        return None  # Return None if the date format is invalid

# Helper function to prepare data from the sheet
def prepare_data(sheet):
    headers = [cell.value for cell in sheet[1]]  # Extract headers from the first row
    documents = []

    for row in sheet.iter_rows(min_row=2, values_only=True):  # Start from the second row
        doc = {}
        for col, value in zip(headers, row):
            if col == "Exit Date":  # Handle the "Exit Date" field specifically
                doc[col] = parse_date(value)  # Use the date parser for the "Exit Date"
            else:
                doc[col] = value
        documents.append(doc)

    return documents

# Prepare documents
documents = prepare_data(sheet)

# Index name for Elasticsearch
index_name = 'xlsx_index'

# Bulk index function
def bulk_index_documents(documents, index_name, elasticsearch_client):
    actions = []
    for doc in documents:
        action = {
            "_op_type": "index",   # Default to "index" operation
            "_index": index_name,
            "_source": doc         # The document to index
        }
        actions.append(action)

    try:
        # Bulk helper function
        success, failed = helpers.bulk(elasticsearch_client, actions, raise_on_error=False)
        if failed:
            print(f"Failed to index {len(failed)} documents.")
            # Print the detailed error messages for failed documents
            for failure in failed:
                print(failure)
    except Exception as e:
        print(f"Error during bulk indexing: {str(e)}")

# Check if the index exists, create if not
if not elasticsearch_client.indices.exists(index=index_name):
    elasticsearch_client.indices.create(index=index_name)

# Perform bulk indexing
bulk_index_documents(documents, index_name, elasticsearch_client)

print(f"Successfully indexed {len(documents)} documents into Elasticsearch.")

Successfully indexed 1000 documents into Elasticsearch.


In [50]:
QUERY = "List of employee under Sales depat"

ELQL_PROMPT = PromptTemplate.from_template(
    """
    You are expert in Elasticsearch Query Language EL|QL. Your task is to write the EL|QL query to answer the user's question with the elasticsearch index details.
    Elasticsearch Index Mapping details: {mapping}
    User's question: {question}

    Sample EL|QL query:
    FROM library | KEEP author, name, page_count, release_date | SORT page_count DESC | LIMIT 5",

    EL|QL:
    """,
)

In [85]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(
    model="Qwen-7B",
    temperature=0,
    api_key="dummy-key",  
    base_url="http://127.0.0.1:8080/v1"
)

In [87]:
print(llm.invoke("Write the essary in Arabic about AI technology with 500 words").content)

تُعدّ تقنية الذكاء الاصطناعي من التقنيات الحديثة التي أصبحت جزءًا لا يتجزأ من حياتنا اليومية، حيث أصبحت متغلغلة في العديد من المجالات، من السيارات ذاتية القيادة إلى الأجهزة المنزلية الذكية. وتُستخدم في العديد من التطبيقات، مثل التعرف على الكلام، والترجمة، والرؤية الحاسوبية، والتحكم بالروبوتات، وغيرها.

تُعتَبر الذكاء الاصطناعي من التقنيات التي تحتاج إلى الكثير من البيانات والمعلومات لإعدادها وتدريبها، حيث يتطلب من الخبراء جمع كميات هائلة من البيانات من مصادر مختلفة، وتنظيمها وتصنيفها، ثم تدريب الأنظمة الذكية عليها. وتُعدّ هذه العملية من أكثر مراحل تطوير الذكاء الاصطناعي صعوبة، حيث تتطلب وقتًا طويلًا وتكلفة عالية.

تُستخدم تقنية الذكاء الاصطناعي في العديد من المجالات، مثل الرعاية الصحية، حيث يمكن استخدامها في تشخيص الأمراض، وتطوير علاجات جديدة، وتوفير الرعاية الصحية عن بعد. كما تُستخدم في مجال التعليم، حيث يمكن استخدامها في تخصيص التعليم، وتوفير تجربة تعليمية مخصصة لكل طالب. وتُستخدم أيضًا في مجال الأعمال، حيث يمكن استخدامها في تحليل البيانات، وتوقع الاتجاهات، وتطوير استراتيجيات الأعمال

In [75]:
mapping = """{
  "mappings": {
    "properties": {
      "Age": {
        "type": "long"
      },
      "Annual Salary": {
        "type": "long"
      },
      "Bonus %": {
        "type": "float"
      },
      "Business Unit": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "City": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "Country": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "Department": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "EEID": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "Ethnicity": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "Exit Date": {
        "type": "date"
      },
      "Full Name": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "Gender": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "Hire Date": {
        "type": "date"
      },
      "Job Title": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      }
    }
  }
}
"""

In [83]:
chain = ELQL_PROMPT | llm

answer = chain.invoke(
    {
        "mapping": mapping,
        "question": QUERY
    }
)
print(answer.content)

```ql
FROM employees | KEEP Full Name, Department | FILTER Department = "Sales" | PROJECT Full Name
```

**Explanation:**

1. **FROM employees:** Specifies the index name (`employees`).
2. **KEEP Full Name, Department:** Retains only the `FullName` and `Department` fields for the result.
3. **FILTER Department = "Sales":** Filters the documents to include only those where the `Department` field is "Sales".
4. **PROJECT Full Name:** Projects (selects) only the `FullName` field from the filtered documents.

This query will return a list of employee names (`FullName`) who are part of the "Sales" department.


In [None]:
FROM xlsx_index | WHERE Department.keyword == "Sales" | KEEP `Full Name`, EEID, `Job Title`, Department
FROM xlsx_index | KEEP `Full Name`, Department | FILTER Department = "Sales" | PROJECT `Full Name`

In [84]:
resp = elasticsearch_client.esql.query(
    format="txt",
    query="""FROM xlsx_index | KEEP `Full Name`, Department | FILTER Department = "Sales" | PROJECT `Full Name`""",
    )
print(resp)

BadRequestError: BadRequestError(400, 'parsing_exception', "line 1:50: mismatched input 'FILTER' expecting {'dissect', 'drop', 'enrich', 'eval', 'grok', 'keep', 'limit', 'mv_expand', 'rename', 'sort', 'stats', 'where'}")