### RAG using Mistral, LangChain and FAISS

In [1]:
import os
import torch
import transformers

from glob import glob
from tqdm.notebook import tqdm
from transformers import AutoTokenizer,AutoModelForCausalLM,BitsAndBytesConfig,pipeline
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.llms import HuggingFacePipeline
from langchain.chains import LLMChain
from langchain.schema.output_parser import StrOutputParser

import warnings
warnings.simplefilter('ignore')

In [2]:
# Checking if GPU is available
if torch.cuda.is_available():
    print("GPU is available.")
    print('Using GPU: ', torch.cuda.get_device_name(0))
    print('Memory Usage: ')
    print('Allocated: ', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached: ', round(torch.cuda.memory_cached(0)/1024**3,1), 'GB')

else:
    print("GPU is not available.")

GPU is available.
Using GPU:  NVIDIA A100-SXM4-40GB
Memory Usage: 
Allocated:  0.0 GB
Cached:  0.0 GB


In [20]:
device = f'cuda:{torch.cuda.current_device()}' if torch.cuda.is_available() else 'cpu'


### Setting bitsandbytes config to improve speed

In [3]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_use_double_quant=False,
)

### Loading Mistral-7B Model

In [15]:
model_id = "../pretrained_models/Mistral-7B-Instruct-v0.2"
model = AutoModelForCausalLM.from_pretrained(model_id,quantization_config=bnb_config,do_sample=True,device_map = "auto")

tokenizer = tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

### Testing model response without RAG

In [29]:
prompt = "Tell me about Transformers."

model_inputs = tokenizer([prompt], return_tensors="pt").to(device)

generated_ids = model.generate(**model_inputs, max_new_tokens=300, do_sample=True,pad_token_id=tokenizer.eos_token_id)
decoded = tokenizer.batch_decode(generated_ids)[0]

print(decoded.replace('\\n', '\n'))

<s> Tell me about Transformers.

Transformers is a line of toy robots created by Hasbro and released in 1984. These toys can transform from one form into another. The most popular Transformers are the Autobots and Decepticons, two factions engaged in an endless battle. Each Transformer is represented by distinctive vehicles and robots in both modes.

The Transformers originated from a Japanese toy line called Diaclone, which featured robots that combined into larger vehicles and back again. Hasbro acquired the distribution rights to these toys but wanted a more distinctive theme for the product. They partnered with Marvel Comics to create the backstory and narrative surrounding the Transformers.

The characters from Transformers have since been adapted into various forms of media. These include cartoons, films, comic books, video games, and novels. The story revolves around the war between the Autobots, led by Optimus Prime, and the Decepticons, led by Megatron, for control over the Al

### Loading supplementary data

In [6]:
paper_paths = glob("dataset/100-llm-papers-to-explore/*.pdf")
pages = []

# Initialize the progress bar
progress_bar = tqdm(total=len(paper_paths), desc="Processing PDFs")

for path in paper_paths:
    try:
        loader = PyPDFLoader(path)
        doc = loader.load()
        
        # Chunk text
        text_splitter = CharacterTextSplitter(chunk_size=500, 
                                              chunk_overlap=0)
        chunked_documents = text_splitter.split_documents(doc)
        
        pages.extend(chunked_documents)
    except Exception as e:
        print(f'Skipping {path} due to error: {e}')
    
    # Update the progress bar
    progress_bar.update(1)

# Close the progress bar
progress_bar.close()

Processing PDFs:   0%|          | 0/100 [00:00<?, ?it/s]

Created a chunk of size 944, which is longer than the specified 500
Created a chunk of size 714, which is longer than the specified 500
Created a chunk of size 1401, which is longer than the specified 500
Created a chunk of size 1469, which is longer than the specified 500
Created a chunk of size 1742, which is longer than the specified 500
Created a chunk of size 2142, which is longer than the specified 500
Multiple definitions in dictionary at byte 0xc1b for key /F16


### Creating a RAG Using LangChain and FAISS

In [7]:
text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
    task="text-generation",
    temperature=0.2,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=300,
    eos_token_id=tokenizer.eos_token_id,
)

mistral_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

In [8]:
prompt_template = """
Instruction: Answer the question based on the following context:
{context}

Question:
{question} 
 """

# Create prompt from prompt template 
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

In [9]:
# Create llm chain 
llm_chain = LLMChain(llm=mistral_llm, prompt=prompt)

In [10]:
# Load chunked documents into the FAISS index with sentence-transformers/all-mpnet-base-v2
embedding_model_name = "/home/jomondal/experiments/mywork/pretrained_models/all-MiniLM-L6-v2"
db = FAISS.from_documents(
    pages,
    HuggingFaceEmbeddings(model_name=embedding_model_name)
)

### Creating a RAG Chain

In [11]:
# Connect query to FAISS index using a retriever
retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs={'k': 4}
)

In [12]:
rag_chain = ( 
 {"context": retriever, "question": RunnablePassthrough()}
    | llm_chain
)

### Testing model response with RAG

In [13]:
query = "Tell me about Transformers."
response = rag_chain.invoke(query)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [14]:
print ("Question:", response["question"])
print (response["text"].replace('\\n', '\n'))

Question: Tell me about Transformers.

Answer:
Transformers are a type of neural network architecture introduced in the paper "Attention Is All You Need" by Vaswani et al. (2017). They have since become widely used in natural language processing tasks due to their ability to handle long-range dependencies in sequences. Transformers use self-attention mechanisms to allow each token in a sequence to attend to all other tokens, enabling the model to focus on relevant information regardless of its position in the sequence. This is in contrast to traditional recurrent neural networks, which process sequences one token at a time and can struggle with long-term dependencies.

Since their introduction, there have been many variations and improvements to the Transformer architecture. Some notable works include RoFormer (Devlin et al., 2017), BERT (Devlin et al., 2019), Longformer (Beltagy et al., 2020), and T5 (Raffel et al., 2020). These models have achieved state-of-the-art results on various