# RAG with Gemma using LangChain and ChromaDB

In [15]:
import bs4
import torch 
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.schema.output_parser import StrOutputParser
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import WebBaseLoader
from langchain.vectorstores import Chroma
from langchain.schema.runnable import RunnablePassthrough
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA

from IPython.display import Markdown as md
import subprocess
import time
from transformers import AutoTokenizer,AutoModelForCausalLM,BitsAndBytesConfig,pipeline

In [48]:
# URL of the Apple Newsroom article that is loaded with WebBaseLoader later on.
web_url = "https://www.apple.com/newsroom/2024/01/apple-vision-pro-available-in-the-us-on-february-2/"

In [32]:
# Report whether a CUDA device is visible and, if so, print its name and the
# memory figures PyTorch reports for device 0 (bytes divided by 1024**3).
if torch.cuda.is_available():
    print("GPU is available.")
    print('Using GPU: ', torch.cuda.get_device_name(0))
    print('Memory Usage: ')
    print('Allocated: ', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    # torch.cuda.memory_cached() is deprecated; memory_reserved() is its
    # renamed replacement. The printed label is kept unchanged.
    print('Cached: ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')
else:
    print("GPU is not available.")

GPU is available.
Using GPU:  NVIDIA A100-SXM4-40GB
Memory Usage: 
Allocated:  6.1 GB
Cached:  8.6 GB




In [33]:
# Pick the current CUDA device when one is available, otherwise use the CPU.
if torch.cuda.is_available():
    device = f'cuda:{torch.cuda.current_device()}'
else:
    device = 'cpu'


In [34]:
# bitsandbytes settings passed to from_pretrained below: 4-bit loading with
# the "nf4" quant type, "float16" compute dtype, and double quantization off.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
)

In [35]:
# Path to the local Gemma 7B instruction-tuned checkpoint.
model_id = "../pretrained_models/gemma-7b-it"

# Load the causal LM with the quantization config defined in bnb_config.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    do_sample=True,
    device_map="auto",
)

# Load the matching tokenizer (the original had a duplicated
# `tokenizer = tokenizer = ...` assignment).
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the EOS token as the padding token and pad on the right-hand side.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [51]:
# Baseline without retrieval: send the question straight to the model.
prompt_query = "Tell me about apple vision pro? in 50 words"

# Tokenize the single prompt and move the tensors to the selected device.
model_inputs = tokenizer([prompt_query], return_tensors="pt").to(device)

# Sample up to 300 new tokens, using EOS as the pad token id.
generated_ids = model.generate(
    **model_inputs,
    pad_token_id=tokenizer.eos_token_id,
    do_sample=True,
    max_new_tokens=300,
)

# Decode the first (only) sequence and turn literal "\n" into real newlines.
decoded = tokenizer.batch_decode(generated_ids)[0]
print(decoded.replace('\\n', '\n'))

<bos>Tell me about apple vision pro? in 50 words or less:

Apple Vision Pro is a powerful vision-based AI that analyzes the world through high-quality images and videos. It enables developers to build advanced visual features such as object detection, facial recognition, and scene understanding into their apps and services. Vision Pro offers a wide range of tools for image and video processing, including object bounding boxes, facial landmark detection, and scene text extraction. With its deep learning capabilities, Vision Pro empowers developers to create innovative visual experiences that enhance user engagement and improve efficiency.<eos>


In [52]:
# Wrap the already-loaded model and tokenizer in a transformers
# text-generation pipeline with the generation settings used for RAG.
text_generation_pipeline = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
    return_tensors='pt',
    max_new_tokens=300,
    temperature=0.2,
    eos_token_id=tokenizer.eos_token_id,
)

# Expose the pipeline to LangChain as an LLM object.
gemma_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

In [53]:
# Prompt text with two placeholders: {context} for the retrieved passages
# and {question} for the user's query.
prompt_template = """
Instruction: Answer the question based on the following context:
{context}

Question:
{question} 
 """

# Build the LangChain prompt object from the template above.
prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)

In [54]:
# Chain that fills the prompt and sends it to the Gemma LLM wrapper.
llm_chain = LLMChain(prompt=prompt, llm=gemma_llm)

In [55]:
# Load the web page at web_url into LangChain documents.
loader = WebBaseLoader(web_paths=(web_url,))
docs = loader.load()

# Split the collected text into chunks of size 1000 with an overlap of 200.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
splits = text_splitter.split_documents(docs)

In [56]:
# splits

In [57]:
# Path to the local all-MiniLM-L6-v2 embedding model.
embedding_model_name = "../pretrained_models/all-MiniLM-L6-v2"
# Use the device selected earlier in the notebook instead of a hard-coded
# "cuda", so this cell does not fail on a machine without a GPU.
model_kwargs = {"device": device}

embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name, model_kwargs=model_kwargs)
# Embed the document chunks and store them in a Chroma vector store.
vector_store = Chroma.from_documents(documents=splits, embedding=embeddings)

**3. Building the Retrieval System:**



In [58]:

# Expose the vector store as a retriever; it is used as the first step of
# the RAG chain to fetch the snippets relevant to a question.
retriever = vector_store.as_retriever()

def format_docs(docs):
    """Return the page_content of every document joined by blank lines."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

# RAG chain: the incoming question is sent to the retriever and the results
# are joined by format_docs to form "context", while the question itself is
# passed through unchanged as "question"; the resulting dict feeds llm_chain.
rag_chain = ( 
 {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | llm_chain
)

In [59]:
# Alternative route: LangChain's built-in RetrievalQA chain with the "stuff"
# chain type, over the same vector store and LLM.
qa = RetrievalQA.from_chain_type(
    retriever=vector_store.as_retriever(),
    chain_type="stuff",
    llm=gemma_llm,
)
# Last expression of the cell, so the result dict is displayed.
qa.invoke(prompt_query)

{'query': 'Tell me about apple vision pro? in 50 words',
 'result': ' Apple Vision Pro is a revolutionary spatial computer that transforms how people work, collaborate, connect, relive memories, and enjoy entertainment.'}

In [62]:
# Run the custom RAG chain on the same question and show the answer.
response = rag_chain.invoke(prompt_query)
print("Question:", response["question"])
# Turn literal "\n" sequences in the generated text into real newlines.
print(response["text"].replace('\\n', '\n'))

Question: Tell me about apple vision pro? in 50 words

 Apple Vision Pro is a revolutionary spatial computer that transforms how people work, collaborate, connect, relive memories, and enjoy entertainment.
