In [None]:
# !pip install -q torch transformers accelerate bitsandbytes langchain sentence_transformers faiss-gpu openpyxl pacmap datasets langchain-community ragatouille

In [1]:
from tqdm.notebook import tqdm
import pandas as pd
from typing import Optional, List, Tuple
from datasets import Dataset
import matplotlib.pyplot as plt

pd.set_option(
    "display.max_colwidth", None
)  # This will be helpful when visualizing retriever outputs

In [13]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from typing import List, Dict
import os

def extract_pdf_with_langchain(pdf_path: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> Dict:
    """
    Extract and process text from a PDF using LangChain.
    
    Args:
        pdf_path (str): Path to the PDF file
        chunk_size (int): Size of text chunks for splitting
        chunk_overlap (int): Overlap between chunks
        
    Returns:
        dict: Dictionary containing raw text, chunked text, and metadata
    """
    try:
        # Initialize the PDF loader
        loader = PyPDFLoader(pdf_path)
        
        # Load the document
        pages = loader.load()
        
        # Extract raw text and metadata
        raw_text = '\n'.join([page.page_content for page in pages])
        metadata = [page.metadata for page in pages]
        
        # Initialize text splitter
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=False
        )
        
        # Split text into chunks
        chunks = text_splitter.split_text(raw_text)
        
        return {
            'raw_text': raw_text,
            'chunks': chunks,
            'metadata': metadata,
            'num_pages': len(pages),
            'num_chunks': len(chunks)
        }
        
    except Exception as e:
        print(f"Error processing PDF: {str(e)}")
        return None

def batch_process_pdfs(directory_path: str) -> List[Dict]:
    """
    Process multiple PDFs in a directory.
    
    Args:
        directory_path (str): Path to directory containing PDFs
        
    Returns:
        list: List of dictionaries containing processed PDF data
    """
    results = []
    
    for filename in os.listdir(directory_path):
        if filename.lower().endswith('.pdf'):
            pdf_path = os.path.join(directory_path, filename)
            result = extract_pdf_with_langchain(pdf_path)
            if result:
                result['filename'] = filename
                results.append(result)
    
    return results


In [17]:
!ls

 Meta-2021-Sustainability-Report.pdf  'RAG (1).ipynb'	       Untitled.ipynb
'Pymongo (1).ipynb'		       RAG_GENAI.ipynb
 PyMongo_rag.ipynb		       RAG_HuggingFace.ipynb


In [18]:
# for i in range(number of pdfs you have)
pdf_path = "Meta-2021-Sustainability-Report.pdf"
result = extract_pdf_with_langchain(pdf_path)

In [21]:
# print(f"Processed PDF with {result['num_pages']} pages")
# print(f"Generated {result['num_chunks']} chunks")
# print("\nFirst chunk of text:")
# if result['chunks']:
#     print(result['chunks'][0])

# print("\nMetadata for first page:")
# if result['metadata']:
#     print(result['metadata'][0])

In [2]:
import datasets
ds = datasets.load_dataset("m-ric/huggingface_doc", split="train")
ds

Dataset({
    features: ['text', 'source'],
    num_rows: 2647
})

In [25]:
from langchain.docstore.document import Document as LangchainDocument

RAW_KNOWLEDGE_BASE = [
    LangchainDocument(page_content=doc["text"], metadata={"source": doc["source"]})
    for doc in tqdm(ds)
]

  0%|          | 0/2647 [00:00<?, ?it/s]

In [30]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# We use a hierarchical list of separators specifically tailored for splitting Markdown documents
# This list is taken from LangChain's MarkdownTextSplitter class
MARKDOWN_SEPARATORS = [
    "\n#{1,6} ",
    "```\n",
    "\n\\*\\*\\*+\n",
    "\n---+\n",
    "\n___+\n",
    "\n\n",
    "\n",
    " ",
    "",
]

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # The maximum number of characters in a chunk: we selected this value arbitrarily
    chunk_overlap=100,  # The number of characters to overlap between chunks
    add_start_index=True,  # If `True`, includes chunk's start index in metadata
    strip_whitespace=True,  # If `True`, strips whitespace from the start and end of every document
    separators=MARKDOWN_SEPARATORS,
)

docs_processed = []
for doc in RAW_KNOWLEDGE_BASE:
    docs_processed += text_splitter.split_documents([doc])

In [34]:
len(docs_processed)

31085

In [35]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("thenlper/gte-small")

In [36]:
EMBEDDING_MODEL_NAME = 'thenlper/gte-small'

def split_documents(chunk_size: int,
                knowledge_base: List[LangchainDocument],
                tokenizer_name: Optional[str] = EMBEDDING_MODEL_NAME)-> List[LangchainDocument]:


    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
          AutoTokenizer.from_pretrained(tokenizer_name),
          chunk_size=chunk_size,
          chunk_overlap=100,
          add_start_index= True,
          strip_whitespace= True,
          separators = MARKDOWN_SEPARATORS)

    docs_processed = []
    for doc in knowledge_base:
        docs_processed += text_splitter.split_documents([doc])

    return docs_processed

In [37]:
docs_processed = split_documents(512,
                                 RAW_KNOWLEDGE_BASE,
                                 tokenizer_name = EMBEDDING_MODEL_NAME)

In [39]:
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy

embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True},  # Set `True` for cosine similarity
)

KNOWLEDGE_VECTOR_DATABASE = FAISS.from_documents(
    docs_processed, embedding_model, distance_strategy=DistanceStrategy.COSINE
)

In [40]:
user_query = " How to create a transformer pipeline object?"

In [41]:
query_vector = embedding_model.embed_query(user_query)

In [43]:
related_documents = KNOWLEDGE_VECTOR_DATABASE.similarity_search(user_query, k=3)

In [44]:
len(related_documents)

3

# Second Part of RAG : Question Answering

OpenAI api - $$

In [52]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-1.5B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [53]:
model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=1536, out_features=1536, bias=True)
          (k_proj): Linear(in_features=1536, out_features=256, bias=True)
          (v_proj): Linear(in_features=1536, out_features=256, bias=True)
          (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (up_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (down_proj): Linear(in_features=8960, out_features=1536, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
      )
    )
    (norm): Qw

In [64]:

def llm(sp, prompt):
    messages = [
        {"role": "system", "content": sp},
        {"role": "user", "content": prompt}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=512
    )
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return response

In [63]:
sp="""
You are acting as a RAG Assistant. 
Using the information contained in the context,
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the number of the source document when relevant.
If the answer cannot be deduced from the context, do not give an answer.

"""

In [65]:
def get_prompt(related_document, user_query):
    return f"""<|CONTEXT START|>

                                {related_document}
                                <|CONTEXT ENDS|>
                                Now  here is the question you need to answer
                                {user_query}
                                """

In [66]:
whole_docs = ''
for r in related_documents:
    whole_docs += str(r.page_content)

In [70]:
prompt =get_prompt(whole_docs, user_query)

In [71]:
res = llm(sp , prompt)
res

'To create a Transformer pipeline object, you would typically follow these steps:\n\n1. Import the necessary classes from the `transformers` package:\n   ```python\n   from transformers import AutoTokenizer, AutoModelForCausalLM\n   ```\n\n2. Load the tokenizer and model using the appropriate names or paths:\n   ```python\n   tokenizer = AutoTokenizer.from_pretrained("model_name_or_path", subfolder="tokenizer")\n   model = AutoModelForCausalLM.from_pretrained("model_name_or_path", subfolder="model")\n   ```\n\nThe exact details will depend on the specific models and tokenizers available in the Hugging Face repository.'

In [72]:
def rag(user_query, num_of_docs):
    related_documents = KNOWLEDGE_VECTOR_DATABASE.similarity_search(user_query, k=num_of_docs)
    whole_docs = ''
    for r in related_documents:
        whole_docs += str(r.page_content)
    prompt =get_prompt(whole_docs, user_query)
    res = llm(sp , prompt)
    return res

In [73]:
rag("What is a Transformer Seq-to-Seq", 5)

'A Transformer-based Sequence-to-Sequence model uses a special type of neural network called a Transformer to process sequences of data. It consists of two main components: an Encoder and a Decoder. The Encoder takes input sequences and encodes them into fixed-length representations, while the Decoder generates output sequences based on these encoded representations. This architecture allows the model to handle long-term dependencies and contextual information effectively, making it suitable for various NLP tasks such as machine translation, text summarization, and question answering.'