In [None]:
!pip install -q torch transformers accelerate bitsandbytes langchain sentence-transformers faiss-cpu openpyxl pacmap datasets langchain-community ragatouille pypdf


### HuggingFaceH4/zephyr-7b-beta

In [None]:
# 1. Installation
#!pip install -q torch transformers accelerate bitsandbytes langchain sentence-transformers faiss-cpu openpyxl pacmap datasets langchain-community ragatouille pypdf

# 2. Libraries
from tqdm.notebook import tqdm
import pandas as pd
from typing import Optional, List, Tuple
import matplotlib.pyplot as plt
from langchain.docstore.document import Document as LangchainDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
from ragatouille import RAGPretrainedModel
from pypdf import PdfReader
import torch
# 3. Constants
EMBEDDING_MODEL_NAME = "thenlper/gte-small"  # embedding model name
READER_MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"  # generator (reader) model name
PDF_FILEPATH = "/content/The_Lightning_Thief_-_Percy_Jackson_1-10.pdf"  # 👈 replace this with the path to your PDF file
CHUNK_SIZE = 512  # chunk size handed to the text splitter
CHUNK_OVERLAP = CHUNK_SIZE // 10  # one tenth of the chunk size, rounded down

# 4. PDF processing functions

def load_pdf(filepath: str) -> List[LangchainDocument]:
    """
    Read a PDF file and wrap each of its pages in a LangchainDocument.

    Args:
        filepath: Path of the PDF file to read.

    Returns:
        One document per page, in page order. Each document holds the text
        extracted from the page and metadata with the ``source`` path and the
        1-based ``page`` number.
    """
    pdf = PdfReader(filepath)
    # tqdm only adds a progress bar; it yields the pages unchanged.
    pages = tqdm(pdf.pages, desc="Processing pages")
    return [
        LangchainDocument(
            page_content=page.extract_text(),
            metadata={"source": filepath, "page": number},
        )
        for number, page in enumerate(pages, start=1)
    ]

def split_documents(
    chunk_size: int,
    knowledge_base: List[LangchainDocument],
    tokenizer_name: Optional[str] = EMBEDDING_MODEL_NAME,
) -> List[LangchainDocument]:
    """
    Split documents into chunks of a bounded size and drop duplicate chunks.

    Args:
        chunk_size: Maximum chunk length, measured with the tokenizer.
        knowledge_base: Documents to split.
        tokenizer_name: Name of the Hugging Face tokenizer used to measure
            chunk length.

    Returns:
        The chunks in their original order. When several chunks have
        identical text, only the first one is kept.
    """
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,
        # Derive the overlap from the requested size (10%) instead of a
        # module-level constant, so a small chunk_size can never end up
        # smaller than its own overlap.
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        # Real newline characters: paragraph break, line break, space, then
        # character-level splitting as the last resort. The previous
        # "\\n" literals were a backslash followed by "n" and never matched.
        separators=["\n\n", "\n", " ", ""],
    )

    docs_processed = text_splitter.split_documents(knowledge_base)

    # Remove duplicates, keeping the first occurrence of each chunk text.
    seen_texts = set()
    docs_processed_unique = []
    for doc in docs_processed:
        if doc.page_content not in seen_texts:
            seen_texts.add(doc.page_content)
            docs_processed_unique.append(doc)

    return docs_processed_unique

# 5. Model functions

def setup_embedding_model():
    """
    Build the embedding model.

    Returns:
        A HuggingFaceEmbeddings instance for EMBEDDING_MODEL_NAME, created
        with multi_process=True, the "cuda" device and normalized embeddings.
    """
    model_options = {"device": "cuda"}
    # Normalization is important for cosine similarity.
    encode_options = {"normalize_embeddings": True}
    return HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL_NAME,
        multi_process=True,
        model_kwargs=model_options,
        encode_kwargs=encode_options,
    )

def setup_reader_model():
    """
    Load the reader model with 4-bit quantization and wrap it in a pipeline.

    Returns:
        A transformers "text-generation" pipeline for READER_MODEL_NAME,
        configured with do_sample=True, temperature=0.2,
        repetition_penalty=1.1, return_full_text=False and
        max_new_tokens=500.
    """
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    generation_options = {
        "do_sample": True,
        "temperature": 0.2,
        "repetition_penalty": 1.1,
        "return_full_text": False,
        "max_new_tokens": 500,
    }
    # Keyword arguments are evaluated left to right, so the model is loaded
    # before the tokenizer, as before.
    return pipeline(
        model=AutoModelForCausalLM.from_pretrained(
            READER_MODEL_NAME, quantization_config=quantization
        ),
        tokenizer=AutoTokenizer.from_pretrained(READER_MODEL_NAME),
        task="text-generation",
        **generation_options,
    )

def setup_reranker():
    """
    Load the reranker model.

    Returns:
        The RAGPretrainedModel loaded from the "colbert-ir/colbertv2.0"
        checkpoint.
    """
    return RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")

# 6. Knowledge base creation function

def create_knowledge_base(
    documents: List[LangchainDocument], embedding_model
) -> FAISS:
    """
    Build the FAISS vector database from the given documents.

    Args:
        documents: Documents to index.
        embedding_model: Embedding model passed to FAISS.from_documents.

    Returns:
        The FAISS vector store, created with the cosine distance strategy.
    """
    return FAISS.from_documents(
        documents,
        embedding_model,
        distance_strategy=DistanceStrategy.COSINE,
    )

# 7. Chatbot function

def answer_with_rag(
    question: str,
    llm: pipeline,
    knowledge_index: FAISS,
    reranker: Optional[RAGPretrainedModel] = None,
    num_retrieved_docs: int = 30,
    num_docs_final: int = 5,
) -> Tuple[str, List[str]]:
    """
    Answer a question with retrieval-augmented generation (RAG).

    Args:
        question: The user's question.
        llm: Text-generation pipeline used to produce the answer.
        knowledge_index: FAISS vector store searched for relevant chunks.
        reranker: Optional reranker applied to the retrieved chunks.
        num_retrieved_docs: Number of chunks fetched from the vector store.
        num_docs_final: Number of chunks placed in the prompt.

    Returns:
        A tuple ``(answer, documents)``: the generated text and the list of
        chunk texts that were placed in the prompt.
    """
    # 1. Retrieve documents.
    print("=> Retrieving documents...")
    relevant_docs = knowledge_index.similarity_search(
        query=question, k=num_retrieved_docs
    )
    relevant_docs_content = [doc.page_content for doc in relevant_docs]

    # 2. Rerank documents (optional). Skipped when nothing was retrieved,
    # since there is nothing to rank.
    if reranker and relevant_docs_content:
        print("=> Reranking documents...")
        reranked = reranker.rerank(question, relevant_docs_content, k=num_docs_final)
        relevant_docs_content = [doc["content"] for doc in reranked]

    relevant_docs_content = relevant_docs_content[:num_docs_final]

    # 3. Build the prompt. Real newlines are used here; the previous "\\n"
    # literals put a visible backslash-n into the prompt text.
    context = "\nExtracted documents:\n"
    context += "".join(
        f"Document {i}:::\n{doc}" for i, doc in enumerate(relevant_docs_content)
    )

    prompt_in_chat_format = [
        {
            "role": "system",
            "content": """Using the information contained in the context,
    give a comprehensive answer to the question.
    Respond only to the question asked, response should be concise and relevant to the question.
    Provide the number of the source document when relevant.
    If the answer cannot be deduced from the context, do not give an answer.""",
        },
        {
            "role": "user",
            "content": f"""Context:
    {context}
    ---
    Now here is the question you need to answer.

    Question: {question}""",
        },
    ]
    # Reuse the tokenizer attached to the pipeline so the chat template always
    # matches the model that generates, and nothing is re-loaded per question.
    # Fall back to loading by name only if the pipeline carries no tokenizer.
    tokenizer = getattr(llm, "tokenizer", None)
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(READER_MODEL_NAME)
    final_prompt = tokenizer.apply_chat_template(
        prompt_in_chat_format, tokenize=False, add_generation_prompt=True
    )

    # 4. Generate the answer.
    print("=> Generating answer...")
    answer = llm(final_prompt)[0]["generated_text"]

    return answer, relevant_docs_content

# 8. Main function

def main():
    """
    Run the chatbot end to end.

    Loads and chunks the PDF at PDF_FILEPATH, sets up the embedding, reader
    and reranker models, builds the vector database, asks one question and
    prints the answer followed by the source chunks used for it.
    """
    # 1. Load and process the PDF.
    raw_knowledge_base = load_pdf(PDF_FILEPATH)
    processed_docs = split_documents(CHUNK_SIZE, raw_knowledge_base)

    # 2. Set up the models.
    embedding_model = setup_embedding_model()
    reader_llm = setup_reader_model()
    reranker = setup_reranker()

    # 3. Build the knowledge base.
    knowledge_base = create_knowledge_base(processed_docs, embedding_model)

    # 4. Ask a question.
    question = "How many pages in the book?"  # 👈 put your question here
    answer, relevant_docs = answer_with_rag(
        question, reader_llm, knowledge_base, reranker=reranker
    )

    # 5. Print the results.
    print("=" * 50 + "Answer" + "=" * 50)
    print(f"{answer}")
    print("=" * 50 + "Source docs" + "=" * 50)
    # answer_with_rag returns chunk texts only, so look each page number up by
    # text. Indexing processed_docs by the result position would report the
    # page of an unrelated chunk.
    page_by_text = {}
    for chunk in processed_docs:
        page_by_text.setdefault(chunk.page_content, chunk.metadata["page"])
    for i, doc in enumerate(relevant_docs):
        page = page_by_text.get(doc, "unknown")
        print(f"Document {i} (Page: {page}) {'-' * 40}")
        print(doc)

# 9. Run the code
if __name__ == "__main__":
    main()

### meta-llama/Llama-3.2-3B-Instruct

In [None]:
# 1. Installation
#!pip install -q torch transformers accelerate bitsandbytes langchain sentence-transformers faiss-cpu openpyxl pacmap datasets langchain-community ragatouille pypdf

# 2. Libraries
from tqdm.notebook import tqdm
import pandas as pd
from typing import Optional, List, Tuple
import matplotlib.pyplot as plt
from langchain.docstore.document import Document as LangchainDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
from ragatouille import RAGPretrainedModel
from pypdf import PdfReader
import torch
# 3. Constants
EMBEDDING_MODEL_NAME = "thenlper/gte-small"  # embedding model name
READER_MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"  # generator (reader) model name
PDF_FILEPATH = "/content/The_Lightning_Thief_-_Percy_Jackson_1-10.pdf"  # 👈 replace this with the path to your PDF file
CHUNK_SIZE = 512  # chunk size handed to the text splitter
CHUNK_OVERLAP = CHUNK_SIZE // 10  # one tenth of the chunk size, rounded down

# 4. PDF processing functions

def load_pdf(filepath: str) -> List[LangchainDocument]:
    """
    Read a PDF file and wrap each of its pages in a LangchainDocument.

    Args:
        filepath: Path of the PDF file to read.

    Returns:
        One document per page, in page order. Each document holds the text
        extracted from the page and metadata with the ``source`` path and the
        1-based ``page`` number.
    """
    pdf = PdfReader(filepath)
    # tqdm only adds a progress bar; it yields the pages unchanged.
    pages = tqdm(pdf.pages, desc="Processing pages")
    return [
        LangchainDocument(
            page_content=page.extract_text(),
            metadata={"source": filepath, "page": number},
        )
        for number, page in enumerate(pages, start=1)
    ]

def split_documents(
    chunk_size: int,
    knowledge_base: List[LangchainDocument],
    tokenizer_name: Optional[str] = EMBEDDING_MODEL_NAME,
) -> List[LangchainDocument]:
    """
    Split documents into chunks of a bounded size and drop duplicate chunks.

    Args:
        chunk_size: Maximum chunk length, measured with the tokenizer.
        knowledge_base: Documents to split.
        tokenizer_name: Name of the Hugging Face tokenizer used to measure
            chunk length.

    Returns:
        The chunks in their original order. When several chunks have
        identical text, only the first one is kept.
    """
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,
        # Derive the overlap from the requested size (10%) instead of a
        # module-level constant, so a small chunk_size can never end up
        # smaller than its own overlap.
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        # Real newline characters: paragraph break, line break, space, then
        # character-level splitting as the last resort. The previous
        # "\\n" literals were a backslash followed by "n" and never matched.
        separators=["\n\n", "\n", " ", ""],
    )

    docs_processed = text_splitter.split_documents(knowledge_base)

    # Remove duplicates, keeping the first occurrence of each chunk text.
    seen_texts = set()
    docs_processed_unique = []
    for doc in docs_processed:
        if doc.page_content not in seen_texts:
            seen_texts.add(doc.page_content)
            docs_processed_unique.append(doc)

    return docs_processed_unique

# 5. Model functions

def setup_embedding_model():
    """
    Build the embedding model.

    Returns:
        A HuggingFaceEmbeddings instance for EMBEDDING_MODEL_NAME, created
        with multi_process=True, the "cuda" device and normalized embeddings.
    """
    model_options = {"device": "cuda"}
    # Normalization is important for cosine similarity.
    encode_options = {"normalize_embeddings": True}
    return HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL_NAME,
        multi_process=True,
        model_kwargs=model_options,
        encode_kwargs=encode_options,
    )

def setup_reader_model():
    """
    Load the reader model with 4-bit quantization and wrap it in a pipeline.

    Returns:
        A transformers "text-generation" pipeline for READER_MODEL_NAME,
        configured with do_sample=True, temperature=0.2,
        repetition_penalty=1.1, return_full_text=False and
        max_new_tokens=500.
    """
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    generation_options = {
        "do_sample": True,
        "temperature": 0.2,
        "repetition_penalty": 1.1,
        "return_full_text": False,
        "max_new_tokens": 500,
    }
    # Keyword arguments are evaluated left to right, so the model is loaded
    # before the tokenizer, as before.
    return pipeline(
        model=AutoModelForCausalLM.from_pretrained(
            READER_MODEL_NAME, quantization_config=quantization
        ),
        tokenizer=AutoTokenizer.from_pretrained(READER_MODEL_NAME),
        task="text-generation",
        **generation_options,
    )

def setup_reranker():
    """
    Load the reranker model.

    Returns:
        The RAGPretrainedModel loaded from the "colbert-ir/colbertv2.0"
        checkpoint.
    """
    return RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")

# 6. Knowledge base creation function

def create_knowledge_base(
    documents: List[LangchainDocument], embedding_model
) -> FAISS:
    """
    Build the FAISS vector database from the given documents.

    Args:
        documents: Documents to index.
        embedding_model: Embedding model passed to FAISS.from_documents.

    Returns:
        The FAISS vector store, created with the cosine distance strategy.
    """
    return FAISS.from_documents(
        documents,
        embedding_model,
        distance_strategy=DistanceStrategy.COSINE,
    )

# 7. Chatbot function

def answer_with_rag(
    question: str,
    llm: pipeline,
    knowledge_index: FAISS,
    reranker: Optional[RAGPretrainedModel] = None,
    num_retrieved_docs: int = 30,
    num_docs_final: int = 5,
) -> Tuple[str, List[str]]:
    """
    Answer a question with retrieval-augmented generation (RAG).

    Args:
        question: The user's question.
        llm: Text-generation pipeline used to produce the answer.
        knowledge_index: FAISS vector store searched for relevant chunks.
        reranker: Optional reranker applied to the retrieved chunks.
        num_retrieved_docs: Number of chunks fetched from the vector store.
        num_docs_final: Number of chunks placed in the prompt.

    Returns:
        A tuple ``(answer, documents)``: the generated text and the list of
        chunk texts that were placed in the prompt.
    """
    # 1. Retrieve documents.
    print("=> Retrieving documents...")
    relevant_docs = knowledge_index.similarity_search(
        query=question, k=num_retrieved_docs
    )
    relevant_docs_content = [doc.page_content for doc in relevant_docs]

    # 2. Rerank documents (optional). Skipped when nothing was retrieved,
    # since there is nothing to rank.
    if reranker and relevant_docs_content:
        print("=> Reranking documents...")
        reranked = reranker.rerank(question, relevant_docs_content, k=num_docs_final)
        relevant_docs_content = [doc["content"] for doc in reranked]

    relevant_docs_content = relevant_docs_content[:num_docs_final]

    # 3. Build the prompt. Real newlines are used here; the previous "\\n"
    # literals put a visible backslash-n into the prompt text.
    context = "\nExtracted documents:\n"
    context += "".join(
        f"Document {i}:::\n{doc}" for i, doc in enumerate(relevant_docs_content)
    )

    prompt_in_chat_format = [
        {
            "role": "system",
            "content": """Using the information contained in the context,
    give a comprehensive answer to the question.
    Respond only to the question asked, response should be concise and relevant to the question.
    Provide the number of the source document when relevant.
    If the answer cannot be deduced from the context, do not give an answer.""",
        },
        {
            "role": "user",
            "content": f"""Context:
    {context}
    ---
    Now here is the question you need to answer.

    Question: {question}""",
        },
    ]
    # Reuse the tokenizer attached to the pipeline so the chat template always
    # matches the model that generates, and nothing is re-loaded per question.
    # Fall back to loading by name only if the pipeline carries no tokenizer.
    tokenizer = getattr(llm, "tokenizer", None)
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(READER_MODEL_NAME)
    final_prompt = tokenizer.apply_chat_template(
        prompt_in_chat_format, tokenize=False, add_generation_prompt=True
    )

    # 4. Generate the answer.
    print("=> Generating answer...")
    answer = llm(final_prompt)[0]["generated_text"]

    return answer, relevant_docs_content

# 8. Main function

def main():
    """
    Run the chatbot end to end.

    Loads and chunks the PDF at PDF_FILEPATH, sets up the embedding, reader
    and reranker models, builds the vector database, asks one question and
    prints the answer followed by the source chunks used for it.
    """
    # 1. Load and process the PDF.
    raw_knowledge_base = load_pdf(PDF_FILEPATH)
    processed_docs = split_documents(CHUNK_SIZE, raw_knowledge_base)

    # 2. Set up the models.
    embedding_model = setup_embedding_model()
    reader_llm = setup_reader_model()
    reranker = setup_reranker()

    # 3. Build the knowledge base.
    knowledge_base = create_knowledge_base(processed_docs, embedding_model)

    # 4. Ask a question.
    question = "How many pages in the book?"  # 👈 put your question here
    answer, relevant_docs = answer_with_rag(
        question, reader_llm, knowledge_base, reranker=reranker
    )

    # 5. Print the results.
    print("=" * 50 + "Answer" + "=" * 50)
    print(f"{answer}")
    print("=" * 50 + "Source docs" + "=" * 50)
    # answer_with_rag returns chunk texts only, so look each page number up by
    # text. Indexing processed_docs by the result position would report the
    # page of an unrelated chunk.
    page_by_text = {}
    for chunk in processed_docs:
        page_by_text.setdefault(chunk.page_content, chunk.metadata["page"])
    for i, doc in enumerate(relevant_docs):
        page = page_by_text.get(doc, "unknown")
        print(f"Document {i} (Page: {page}) {'-' * 40}")
        print(doc)

# 9. Run the code
if __name__ == "__main__":
    main()