In [None]:
!git clone https://github.com/scholarly360/vsc_dev_docling.git

In [None]:
!pip install -r /content/vsc_dev_docling/requirements.txt

In [1]:
import os
from dotenv import load_dotenv
load_dotenv()
import torch
from typing import Iterator, Iterable
from tempfile import TemporaryDirectory # Import TemporaryDirectory

from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document as LCDocument
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough

from docling.document_converter import DocumentConverter

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_milvus import Milvus
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_huggingface import HuggingFacePipeline


# ... (rest of the code remains the same) ...

In [2]:



class DoclingPDFLoader(BaseLoader):

    def __init__(self, file_path: str | list[str]) -> None:
        self._file_paths = file_path if isinstance(file_path, list) else [file_path]
        self._converter = DocumentConverter()

    def lazy_load(self) -> Iterator[LCDocument]:
        for source in self._file_paths:
            dl_doc = self._converter.convert(source).document
            text = dl_doc.export_to_markdown()
            yield LCDocument(page_content=text)


def format_docs(docs: Iterable[LCDocument]):
    return "\n\n".join(doc.page_content for doc in docs)


# تحميل النموذج ومعالج النصوص
#model_id = "mistralai/Mistral-7B-Instruct-v0.3"
model_id = "ricepaper/vi-gemma-2b-RAG"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")

# إنشاء Hugging Face pipeline
pipe = pipeline( # إنشاء pipeline
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# إنشاء LLM باستخدام HuggingFacePipeline مع pipeline
llm = HuggingFacePipeline(pipeline=pipe) # تمرير pipeline إلى HuggingFacePipeline


# تحميل المستندات وتقسيمها
FILE_PATH = "/content/vsc_dev_docling/docs/sample.pdf"
loader = DoclingPDFLoader(file_path=FILE_PATH)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
docs = loader.load()
splits = text_splitter.split_documents(docs)

# إنشاء متجه embeddings
HF_EMBED_MODEL_ID = "BAAI/bge-small-en-v1.5"
embeddings = HuggingFaceEmbeddings(model_name=HF_EMBED_MODEL_ID)

# إنشاء vectorstore
MILVUS_URI = os.environ.get(
    "MILVUS_URI", f"{(tmp_dir := TemporaryDirectory()).name}/milvus_demo.db"
)
vectorstore = Milvus.from_documents(
    splits,
    embeddings,
    connection_args={"uri": MILVUS_URI},
    drop_old=True,
    index_params={"index_type": "IVF_FLAT", "params": {"nlist": 128}},
)

# إنشاء RAG chain
retriever = vectorstore.as_retriever()
prompt = PromptTemplate.from_template(
    "Context information is below.\n---------------------\n{context}\n---------------------\nGiven the context information and not prior knowledge, answer the query.\nQuery: {question}\nAnswer:\n"
)
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# اختبار RAG chain
# rag_chain.invoke("Explain Duties?")
# rag_chain.invoke("What about insurance?")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/40.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/751 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

Device set to use cpu


In [3]:

rag_chain.invoke("Explain Duties?")

"Context information is below.\n---------------------\nSECONDARY FACTORS: (a) The extent of control which, by agreement, COMMISSION may exercise over the details of the work is slight rather than substantial; (b) CONSULTANT is engaged in a distinct occupation or business; (c) In the locality, the work to be done by CONSULTANT is usually done by a specialist without supervision, rather than under the direction of an employer; (d) The skill required in the particular occupation is substantial rather than slight; (e) The CONSULTANT rather than the COMMISSION supplies the instrumentalities, tools and work place; (f) The length of time for which CONSULTANT is engaged is of limited duration rather than indefinite; (g) The method of payment of CONSULTANT is by the job rather than by the time; (h) The work is part of a special or permissive activity, program, or project, rather than part of the regular business of COMMISSION; (i) CONSULTANT and COMMISSION believe they are creating an independe