In [11]:
from langchain_docling import DoclingLoader
from dotenv import load_dotenv

load_dotenv()

FILE_PATH = "Docs/plan_and_solve.pdf"

loader = DoclingLoader(file_path=FILE_PATH)

docs = loader.load()

Token indices sequence length is longer than the specified maximum sequence length for this model (625 > 512). Running this sequence through the model will result in indexing errors


In [12]:
len(docs)

95

In [13]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
doc_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=0)
splitted_docs = doc_splitter.split_documents(docs)

In [19]:
len(splitted_docs)

95

In [14]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.3)

In [7]:
from langchain_groq import ChatGroq
llm_groq = ChatGroq(model="llama-3.1-8b-instant", temperature=0.3)

In [9]:
from langchain_anthropic import ChatAnthropic

llm_anthropic = ChatAnthropic(model="claude-3-5-haiku-20241022", temperature=0.3)

In [21]:
from pydantic import BaseModel, Field
from typing import List

class SummaryResponse(BaseModel):
    summary: str = Field(description="Summary for the provided text. Summary should be strictly in urdu.")
    document_class: str = Field(description="Suggest a document class for the provided text in urdu")
    questions: List[str] = Field(description="10 questions from the provided text in urdu")

In [22]:
llm_with_structured_output = llm.with_structured_output(SummaryResponse)

In [23]:
from langchain_core.prompts import PromptTemplate

template = """
You are an expert document summarizer with experience in extracting key informations and insights from various types of documents

Document for Analysis:
{doc}
"""

summary_prompt = PromptTemplate.from_template(template)

In [24]:
summary_chain = summary_prompt | llm_with_structured_output 

In [25]:
response = summary_chain.invoke(splitted_docs[:10])

In [None]:
print(response.summary)
print(response.doc_class)
print(response.questions)

['بڑے زبان ماڈلز (LLMs) کیا ہیں؟',
 'زیرو شاٹ چین آف تھوٹ (CoT) پرامپٹنگ کی کیا خصوصیات ہیں؟',
 'پلان اینڈ سالو (PS) پرامپٹنگ کا مقصد کیا ہے؟',
 'زیرو شاٹ-CoT میں کون سی خامیاں ہیں؟',
 'PS پرامپٹنگ میں کیا شامل ہے؟',
 'PS+ پرامپٹنگ کی خصوصیات کیا ہیں؟',
 'کون سی ڈیٹا سیٹس پر تجربات کیے گئے؟',
 'PS+ پرامپٹنگ کی کارکردگی کیسی ہے؟',
 'یہ تحقیق کس مسئلے کو حل کرنے کی کوشش کر رہی ہے؟',
 'کیا PS+ پرامپٹنگ کو دستی مظاہرہ کی ضرورت ہے؟']

In [None]:
# RAG Question Answer

In [30]:
from langchain_community.document_loaders import PyPDFLoader

path = "Docs/GraphRAG_for_structured_data_1.pdf"

loader = PyPDFLoader(path)
docs = loader.load()

In [33]:
docs[0].page_content

'Enhancing Structured-Data Retrieval with GraphRAG: Soccer\nData Case Study\nZahra Sepasdar\nForzasys & SimulaMet\nOslo, Norway\nzahra.sepasdar@gmail.com\nSushant Gautam\nSimulaMet & OsloMet\nOslo, Norway\nsushant@simula.no\nCise Midoglu\nSimulaMet\nOslo, Norway\ncise@simula.no\nMichael A. Riegler\nSimulaMet & OsloMet\nOslo, Norway\nmichael@simula.no\nPål Halvorsen\nSimulaMet, Forzasys & OsloMet\nOslo, Norway\npaalh@simula.no\nABSTRACT\nExtracting meaningful insights from large and complex datasets\nposes significant challenges, particularly in ensuring the accuracy\nand relevance of retrieved information. Traditional data retrieval\nmethods such as sequential search and index-based retrieval often\nfail when handling intricate and interconnected data structures,\nresulting in incomplete or misleading outputs. To overcome these\nlimitations, we introduce Structured-GraphRAG, a versatile frame-\nwork designed to enhance information retrieval across structured\ndatasets in natural langua

In [34]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 2500,
    chunk_overlap = 100,
    is_separator_regex=False
)

In [35]:
splitted_docs = text_splitter.split_documents(docs)

In [36]:
import torch
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

model_kwargs = {"device": "cuda" if torch.cuda.is_available() else "cpu"}
model_name = "BAAI/bge-m3"
encode_kwargs = {"normalize_embeddings": True}

embedding = HuggingFaceBgeEmbeddings(

    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs 
)

  embedding = HuggingFaceBgeEmbeddings(


In [37]:
splitted_docs[0]

Document(metadata={'producer': 'pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'creator': 'LaTeX with acmart 2023/03/30 v1.90 Typesetting articles for the Association for Computing Machinery and hyperref 2023-04-22 v7.00x Hypertext links for LaTeX', 'creationdate': '2024-09-27T00:30:35+00:00', 'moddate': '2024-09-27T00:30:35+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '-  Information systems  ->  Database design and models.Information retrieval query processing.-  Computing methodologies  ->  Knowledge representation and reasoning.-  Theory of computation  ->  Data structures design and analysis.', 'title': 'Enhancing Structured-Data Retrieval with GraphRAG: Soccer Data Case Study', 'trapped': '/False', 'source': 'Docs/GraphRAG_for_structured_data_1.pdf', 'total_pages': 8, 'page': 0, 'page_label': '1'}, page_content='Enhancing Structured-Data Retrieval with GraphRAG: So

In [38]:
from langchain_qdrant import Qdrant
from qdrant_client import QdrantClient, models
import uuid

collection_name = "analytix_camp"
url = "http://localhost:6333"

file_uuid = uuid.uuid4()

document_with_payload = []

for chunk in splitted_docs:
    chunk.metadata["user_name"] = "Hasnain Ali Poonja"
    chunk.metadata["file_uuid"] = file_uuid
    chunk.metadata["no of pages"] = len(docs) 

    document_with_payload.append(chunk)

In [43]:
document_with_payload

[Document(metadata={'producer': 'pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'creator': 'LaTeX with acmart 2023/03/30 v1.90 Typesetting articles for the Association for Computing Machinery and hyperref 2023-04-22 v7.00x Hypertext links for LaTeX', 'creationdate': '2024-09-27T00:30:35+00:00', 'moddate': '2024-09-27T00:30:35+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '-  Information systems  ->  Database design and models.Information retrieval query processing.-  Computing methodologies  ->  Knowledge representation and reasoning.-  Theory of computation  ->  Data structures design and analysis.', 'title': 'Enhancing Structured-Data Retrieval with GraphRAG: Soccer Data Case Study', 'trapped': '/False', 'source': 'Docs/GraphRAG_for_structured_data_1.pdf', 'total_pages': 8, 'page': 0, 'page_label': '1', 'user_name': 'Hasnain Ali Poonja', 'file_uuid': UUID('aabbc23d-192f

In [42]:
vector_store = Qdrant.from_documents(
    document_with_payload,
    embedding,
    collection_name=collection_name,
    url=url
)

In [None]:
from qdrant_client import QdrantClient, models

qdrant_client = QdrantClient(url=url)

qdrant_client.delete(
    collection_name="analytix_camp",
    points_selector=models.FilterSelector(
        filter = models.Filter(
            must=[
                models.FieldCondition(
                    key="metadata.user_name",
                    match=models.MatchValue(value="Hasnain Ali Poonja")
                )
            ]
        )
    )
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)