In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

Looking in indexes: https://download.pytorch.org/whl/cu121


In [None]:
!pip install langchain bitsandbytes chromadb sentence_transformers



In [None]:
# Build llama-cpp-python from source (FORCE_CMAKE=1) with the cuBLAS/CUDA
# backend switched on via CMAKE_ARGS, so that LlamaCpp can offload layers to the GPU.
# NOTE(review): newer llama.cpp releases appear to have renamed this CMake option
# (e.g. GGML_CUDA) - confirm against the version that actually gets installed.
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python



In [None]:
from torch import cuda, bfloat16
import torch
import transformers
from transformers import AutoTokenizer
from time import time
import chromadb
from chromadb.config import Settings
from langchain.llms import HuggingFacePipeline
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
from langchain.llms import LlamaCpp

from langchain.prompts import PromptTemplate

from langchain.chains import LLMChain

from langchain.callbacks.manager import CallbackManager

from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [None]:
# Question-answering prompt: the retrieved chunks are substituted for
# {context} and the user's query for {question}.
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
{context}
Question: {question}
Helpful Answer:"""

# Both placeholders used in the template must be declared; listing only
# "question" leaves {context} undeclared.
# NOTE(review): in the cells visible here `prompt` is never handed to the
# RetrievalQA chain - confirm whether it is meant to be used.
prompt = PromptTemplate(template=template, input_variables=["context", "question"])

In [None]:
!wget https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q8_0.gguf?download=true

--2024-02-25 09:45:52--  https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q8_0.gguf?download=true
Resolving huggingface.co (huggingface.co)... 65.8.178.93, 65.8.178.118, 65.8.178.12, ...
Connecting to huggingface.co (huggingface.co)|65.8.178.93|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.huggingface.co/repos/b0/ca/b0cae82fd4b3a362cab01d17953c45edac67d1c2dfb9fbb9e69c80c32dc2012e/f47dade5e86466edb66c5afe6f8e9fb1fbb2c292827b90bd46b7a1817d864bf2?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27llama-2-7b-chat.Q8_0.gguf%3B+filename%3D%22llama-2-7b-chat.Q8_0.gguf%22%3B&Expires=1709113552&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwOTExMzU1Mn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy9iMC9jYS9iMGNhZTgyZmQ0YjNhMzYyY2FiMDFkMTc5NTNjNDVlZGFjNjdkMWMyZGZiOWZiYjllNjljODBjMzJkYzIwMTJlL2Y0N2RhZGU1ZTg2NDY2ZWRiNjZjNWFmZTZmOGU5ZmIxZmJ

In [None]:

# Tunables for llama.cpp; adjust to the model size and the VRAM available.
n_gpu_layers = 4  # number of layers offloaded to the GPU
n_batch = 512  # keep within 1..n_ctx; larger batches need more VRAM

# Load the local GGUF model through LangChain's llama.cpp wrapper.
llm = LlamaCpp(
    model_path="/content/llama-2-7b-chat.Q8_0.gguf?download=true",
    n_ctx=4096,  # context window (same value the loader reports as llama.context_length)
    n_batch=n_batch,
    n_gpu_layers=n_gpu_layers,
    max_tokens=1024,
    temperature=0.4,
    stop=['USER:'],  # generation halts when this marker is produced
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
    verbose=True,
)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from /content/llama-2-7b-chat.Q8_0.gguf?download=true (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.head_coun

In [None]:
# The source document is a PDF, so it needs a PDF-aware loader.
# TextLoader with encoding="latin-1" merely decodes the file's raw bytes
# (latin-1 accepts every byte value), so the "text" that gets indexed is the
# PDF's internal structure rather than the document's actual text.
pdf_path = "/content/eg14_cats_and_people.pdf"
try:
    from langchain.document_loaders import PyPDFLoader

    # PyPDFLoader relies on the optional `pypdf` package and is expected to
    # raise ImportError when it is missing.
    loader = PyPDFLoader(pdf_path)
except ImportError as exc:
    import warnings

    warnings.warn(
        f"PDF parsing is unavailable ({exc}); falling back to TextLoader, "
        "which reads the PDF's raw bytes and will give poor retrieval results. "
        "Install `pypdf` and re-run this cell."
    )
    # Original behaviour, kept only so the notebook still runs end to end.
    loader = TextLoader(pdf_path, encoding="latin-1")

documents = loader.load()

In [None]:
# Cut the loaded documents into chunks of at most 1000 characters,
# with 20 characters of overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=20,
    chunk_size=1000,
)
all_splits = text_splitter.split_documents(documents)

In [None]:

# Sentence-embedding model used to vectorise the document chunks.
model_name = "BAAI/bge-large-en-v1.5"
# Use the GPU when one is present; fall back to the CPU so the cell still
# runs (more slowly) on a runtime without CUDA instead of raising.
model_kwargs = {"device": "cuda" if cuda.is_available() else "cpu"}

embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

In [None]:

# Embed every chunk and store the vectors in a Chroma collection that is
# persisted under ./chroma_db.
# NOTE(review): re-running this cell against an existing chroma_db directory
# presumably adds the same chunks again - confirm, and clear the directory if so.
vectordb = Chroma.from_documents(
    documents=all_splits,
    embedding=embeddings,
    persist_directory="chroma_db",
)

In [None]:
# Expose the vector store as a retriever with its default search settings.
retriever = vectordb.as_retriever()

# Retrieval-augmented QA chain using the "stuff" chain type, with the local
# Llama model as the answer generator.
qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, chain_type="stuff", verbose=True)

In [None]:
def test_rag(qa, query):
    print(f"Query: {query}\n")
    time_1 = time()
    result = qa.run(query)
    time_2 = time()
    print(f"Inference time: {round(time_2-time_1, 3)} sec.")
    print("\nResult: ", result)

In [None]:

# Sample question; `query` is kept at module level so later cells can reuse it.
query = "How are cats trained"
# Run it through the RetrievalQA chain and print the timing and the answer.
test_rag(qa, query)

Query: How are cats trained



[1m> Entering new RetrievalQA chain...[0m


Llama.generate: prefix-match hit


 Cats are not typically trained using traditional methods like dogs. Instead, they are more likely to learn through observation and imitation. Here are some tips on how to train a cat:
1. Be consistent: Cats are creatures of habit and will respond well to consistent training. Make sure to set clear boundaries and reinforce good behavior regularly.
2. Use positive reinforcement: Cats are more likely to respond to positive reinforcement than punishment. When your cat exhibits good behavior, reward them with treats or praise.
3. Be patient: Training a cat can take time, so be patient and consistent in your training efforts. It may take several weeks or even months for your cat to fully adjust to new behaviors.
4. Use clicker training: Clicker training is a type of positive reinforcement training that can be effective with cats. It involves using a small device that makes a clicking sound when pressed, which your cat will associate with good behavior.
5. Teach basic obedience commands: Whi

In [None]:
# Show which chunks the vector store returns for the same query, to check
# what context the QA chain had available.
docs = vectordb.similarity_search(query)
print(f"Query: {query}")
print(f"Retrieved documents: {len(docs)}")
for doc in docs:
    # Read the Document's attributes directly instead of going through its
    # JSON serialisation; .get() avoids a KeyError for chunks without a source.
    print("Source: ", doc.metadata.get("source", "<unknown>"))
    print("Text: ", doc.page_content, "\n")