# **Installing and Importing Libraries**



In [None]:
!pip install langchain transformers pdfplumber faiss-gpu sentence-transformers bitsandbytes accelerate langchain-community

Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.8-py3-none-any.whl.metadata (2.9 kB)
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Collecting SQLAlchemy<3,>=1.4 (from langchain)


In [None]:
import os
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.document_loaders import UnstructuredFileLoader
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQAWithSourcesChain
from huggingface_hub import notebook_login
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM
from langchain import HuggingFacePipeline
from langchain.text_splitter import CharacterTextSplitter
import textwrap
import sys
from transformers import BitsAndBytesConfig,AutoModelForCausalLM, AutoTokenizer
import pdfplumber
import re
import torch

Step 1: Extract text from the PDF


In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_path: Path of the PDF file; passed straight to ``pdfplumber.open``.

    Returns:
        The per-page texts joined with newlines. A page whose
        ``extract_text()`` result is empty or ``None`` contributes an empty
        string instead of raising.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # `or ""` guards against a page yielding None (e.g. a page without a
        # text layer in some pdfplumber versions), which would otherwise make
        # string concatenation raise TypeError.
        page_texts = [page.extract_text() or "" for page in pdf.pages]

    # Join with a newline so the last word of one page is not fused with the
    # first word of the next page.
    return "\n".join(page_texts)

 Step 2: Pre-process the text

In [None]:
def preprocess_text(text):
    """Normalise extracted text before it is saved and indexed.

    Every character that is not an ASCII letter, a digit 0-9 or whitespace
    is dropped, and the remainder is lower-cased. Whitespace (including
    newlines) is kept as it is.
    """
    stripped = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return stripped.lower()

Step 3: Save the pre-processed text to a file

In [None]:
# Module-level location of the intermediate text file.
file_path = "pdffile.txt"

def save_text_to_file(text, file_path):
    """Write ``text`` to ``file_path`` as UTF-8, replacing any existing content."""
    with open(file_path, mode="w", encoding="utf-8") as sink:
        sink.write(text)

Step 4: Load and split the text into chunks

In [None]:
def split_text_into_chunks(file_path, chunk_size=1000, chunk_overlap=200):
    """Load a text file and split it into overlapping chunks.

    Args:
        file_path: Path of the text file to load.
        chunk_size: ``chunk_size`` handed to ``CharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` handed to ``CharacterTextSplitter``.

    Returns:
        Whatever ``CharacterTextSplitter.split_documents`` returns for the
        loaded documents.
    """
    # Read with the same encoding save_text_to_file writes (UTF-8) instead of
    # relying on the platform default encoding.
    loader = TextLoader(file_path, encoding="utf-8")
    documents = loader.load()

    # Split on newlines so that chunk boundaries fall between lines of text.
    text_splitter = CharacterTextSplitter(
        separator='\n',
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(documents)

Step 5: Generate embeddings using HuggingFaceEmbeddings

In [None]:
def generate_embeddings(text_chunks, device=None):
    """Embed the chunks and build a FAISS vector store from them.

    Args:
        text_chunks: Documents to index; passed to ``FAISS.from_documents``.
        device: Device name for the embedding model (e.g. ``'cuda'`` or
            ``'cpu'``). When omitted, ``'cuda'`` is used if
            ``torch.cuda.is_available()`` reports a GPU, otherwise ``'cpu'``.

    Returns:
        The vector store returned by ``FAISS.from_documents``.
    """
    if device is None:
        # Do not assume a GPU runtime: fall back to CPU rather than failing.
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    embeddings = HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2',
        model_kwargs={'device': device},
    )
    return FAISS.from_documents(text_chunks, embeddings)

Step 6: Set up BitsAndBytesConfig and Load the LLM Model

In [None]:
def load_model(model_name="meta-llama/Llama-2-7b-chat-hf", token=None):
    """Load a 4-bit quantised causal language model and its tokenizer.

    Args:
        model_name: Hugging Face Hub repository id of the model to load.
        token: Hugging Face access token. When omitted, the ``HF_TOKEN`` and
            then the ``HUGGINGFACEHUB_API_TOKEN`` environment variables are
            consulted. If neither is set, ``None`` is passed on, which
            presumably lets the libraries use credentials stored by a prior
            login (e.g. ``notebook_login()``) -- confirm for your setup.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    if token is None:
        # Credentials must not be hard-coded in the notebook; read them from
        # the environment instead.
        token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACEHUB_API_TOKEN")

    # Define configuration for BitsAndBytes (4-bit NF4 quantization, fp16 compute)
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )

    # Load the tokenizer and the quantised model
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        token=token,
        trust_remote_code=True,
        padding_side="left",
        truncation=True,
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        token=token,
        quantization_config=bnb_config,
        torch_dtype=torch.float16,
        trust_remote_code=True,
    )
    # NOTE(review): disabling the KV cache is normally a fine-tuning setting
    # and likely slows generation down; kept as in the original -- confirm
    # whether it is intended for this inference-only notebook.
    model.config.use_cache = False
    return model, tokenizer

Step 7: Set up the QA Prompt

In [None]:
def create_prompt():
    """Build the prompt template used for question answering.

    The template has two placeholders, ``context`` and ``question``, asks
    for a concise answer based on the context, and names the fixed apology
    sentence to state when unsure.
    """
    # Each entry is one line of the template; joining with "\n" yields a
    # leading newline and a trailing indented line after "Answer:".
    template_lines = (
        "",
        "    Context: {context}",
        "    Question: {question}",
        '    Provide a concise and accurate answer based on the context. If unsure, state "Sorry, I didn’t understand your question. Do you want to connect with a live agent?"',
        "    Answer:",
        "    ",
    )
    return PromptTemplate(
        input_variables=["context", "question"],
        template="\n".join(template_lines),
    )

Step 8: Set up the LLM pipeline for question answering

In [None]:
def setup_qa_pipeline(model, tokenizer, prompt, vectorstore, k=2):
    """Wire the model, prompt and vector store into a RetrievalQA chain.

    Args:
        model: Language model handed to the ``text-generation`` pipeline.
        tokenizer: Tokenizer handed to the same pipeline.
        prompt: Prompt passed to the chain via ``chain_type_kwargs``.
        vectorstore: Object whose ``as_retriever`` supplies the retriever.
        k: Number of chunks requested from the retriever per query.

    Returns:
        The chain created by ``RetrievalQA.from_chain_type`` (chain type
        ``"stuff"``, source documents not returned).
    """
    # Set up the text-generation pipeline
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_length=3000,
        truncation=True,
    )

    # Wrap the pipeline with HuggingFacePipeline so LangChain can drive it
    llm = HuggingFacePipeline(pipeline=pipe)

    # The result count has to travel in `search_kwargs`; a bare `k=` keyword
    # on as_retriever is not the documented way to set it and may be ignored.
    retriever = vectorstore.as_retriever(search_kwargs={"k": k})

    # Set up the RetrievalQA chain
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        return_source_documents=False,
        retriever=retriever,
        chain_type_kwargs={"prompt": prompt},
    )

# **Final pipeline execution**

In [None]:
def run_pipeline(pdf_path):
    """Build the question-answering chain for one PDF.

    Runs the notebook's steps in order: extract the PDF text, preprocess
    it, save it to ``pdffile.txt``, split that file into chunks, index the
    chunks, load the model, create the prompt and assemble the QA chain.

    Args:
        pdf_path: Path of the PDF to process.

    Returns:
        The object returned by ``setup_qa_pipeline``.
    """
    text_file = "pdffile.txt"

    # Steps 1-3: PDF -> cleaned text -> intermediate file on disk
    cleaned = preprocess_text(extract_text_from_pdf(pdf_path))
    save_text_to_file(cleaned, text_file)

    # Steps 4-5: chunk the saved text and index the chunks
    index = generate_embeddings(split_text_into_chunks(text_file))

    # Steps 6-8: model, prompt and the final chain
    llm_model, llm_tokenizer = load_model()
    return setup_qa_pipeline(llm_model, llm_tokenizer, create_prompt(), index)

Define the query function


In [None]:
def get_answer(qa_pipeline, Question):
    """Ask the QA chain a question and return just the answer text.

    Args:
        qa_pipeline: Callable chain; invoked with ``{"query": Question}`` and
            ``return_only_outputs=True`` and expected to return a mapping
            with a ``'result'`` entry.
        Question: The question to ask.

    Returns:
        The text that follows the first ``Answer:`` (or ``Helpful Answer:``)
        marker in the chain's result, stripped of surrounding whitespace.
        If the result contains no marker, the whole result is used. If the
        answer is empty, a fixed apology message is returned instead.
    """
    fallback = "Sorry, I didn’t understand your question. Do you want to connect with a live agent?"

    # Perform question answering
    result = qa_pipeline({"query": Question}, return_only_outputs=True)
    generated = result['result'] or ""

    # This notebook's prompt ends with "Answer:", while LangChain's stock
    # prompt ends with "Helpful Answer:"; accept both. DOTALL keeps answers
    # that span several lines instead of cutting them at the first newline.
    match = re.search(r"(?:Helpful )?Answer:\s*(.*)", generated, flags=re.DOTALL)

    # No marker means the prompt was not echoed back, so the whole text is
    # taken as the answer.
    answer = (match.group(1) if match else generated).strip()
    return answer or fallback

# UPLOADING FILE

In [None]:
from google.colab import files
import os

# Upload the file through Colab's file picker
uploaded = files.upload()
if not uploaded:
    # Guard against an empty result (e.g. the dialog was dismissed) so the
    # failure is explicit instead of a bare StopIteration below.
    raise RuntimeError("No file was uploaded; re-run this cell and choose a PDF.")

# Get the uploaded file's name
original_filename = next(iter(uploaded))
custom_filename = "PDF"

# Save the uploaded bytes under the fixed name used by the rest of the notebook
with open(custom_filename, 'wb') as f:
    f.write(uploaded[original_filename])

# Remove the copy stored under the original name, but never the file that was
# just written (the upload may itself be called "PDF") and only if it exists.
if original_filename != custom_filename and os.path.exists(original_filename):
    os.remove(original_filename)

# Resolve against the current working directory instead of hard-coding
# "/content"; on a default Colab runtime this is still "/content/PDF".
pdf_path = os.path.abspath(custom_filename)

Saving SAMPLE PDF.pdf to SAMPLE PDF.pdf


# QA pipeline (Indexing the PDF)

In [None]:
# Build the QA chain for the uploaded PDF: run_pipeline extracts and indexes
# the text, loads the language model and returns the assembled chain.
qa_pipeline = run_pipeline(pdf_path)

`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# ASK QUERIES

In [None]:
# Read one question interactively, answer it with the QA chain built above,
# and echo both the question and the answer.
question = input("Ask your question: ")
answer = get_answer(qa_pipeline, question)
print("Question: ", question)
print("Answer: ", answer)

Ask your question: Who is the President of India?
Question:  Who is the President of India?
Answer:  Sorry, I didn’t understand your question. Do you want to connect with a live agent?
