In [None]:
!pip install peft
!pip install datasets
!pip install accelerate -U
!pip install bitsandbytes
!pip install optimum

In [None]:
!pip install llama-index
!pip install llama-index-embeddings-huggingface

In [3]:
import torch
import numpy as np
import transformers
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, GenerationConfig, TrainingArguments, Trainer, BitsAndBytesConfig
from datasets import Dataset, load_dataset
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex  #for building vector store
from llama_index.core.retrievers import VectorIndexRetriever  #for retrieval
from llama_index.core.query_engine import RetrieverQueryEngine
from  llama_index.core.postprocessor import SimilarityPostprocessor #similarities between embeddings

In [4]:
# Select the compute device: use the GPU when torch can see one,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [5]:
##loading the model and tokenizer
import os

# Checkpoint used for both the tokenizer and the model.
model_id = "google/gemma-2b"

# Load the model weights in 4-bit quantized format.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

# Never paste an access token into the notebook (it ends up in version control
# and in shared copies). Read it from the HF_TOKEN environment variable instead.
# When the variable is unset this is None, and the Hugging Face libraries fall
# back to the credentials stored by `huggingface-cli login`, if any.
hf_token = os.environ.get("HF_TOKEN")

tokenizer = AutoTokenizer.from_pretrained(model_id, token = hf_token,
                                          use_fast = True)
model = AutoModelForCausalLM.from_pretrained(model_id, token = hf_token,
                                             quantization_config = quantization_config)



tokenizer_config.json:   0%|          | 0.00/33.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [44]:
# Baseline: ask the model the question without any retrieved context.

# Switch to eval mode so dropout layers are deactivated.
model.eval()
prompt = "What do you know about LoRA?"

# Move the tokenized prompt to the same device as the model, as the later
# generation cells do; the tokenizer returns CPU tensors by default.
inputs = tokenizer(prompt,
                   return_tensors = "pt").to(device)

# Pass the attention mask along with the ids instead of discarding it.
outputs = model.generate(input_ids = inputs['input_ids'],
                         attention_mask = inputs['attention_mask'],
                         max_new_tokens = 150)

print(f'Model Output:\n  {tokenizer.batch_decode(outputs)[0]}')



Model Output:
  <bos>What do you know about LoRA?

* <strong>LoRA is a 501(c)(3) not-for-profit organization.</strong>
* <strong>LoRA is a 501(c)(3) not-for-profit organization.</strong>
* <strong>LoRA is a 501(c)(3) not-for-profit organization.</strong>
* <strong>LoRA is a 501(c)(3) not-for-profit organization.</strong>
* <strong>LoRA is a 501(c)(3) not-for-profit organization.</strong>
* <strong>LoRA is a 501(c)(3) not-for-profit organization.</strong>


Setting up embedding model for knowledge base

In [7]:
# Embedding model from Hugging Face, registered as the global LlamaIndex embedder.
embedding_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
Settings.embed_model = embedding_model

# No LlamaIndex-managed LLM: our own model is used for generation.
Settings.llm = None

# Chunking configuration: chunk size, plus an overlap between neighbouring
# chunks so that text is not cut off abruptly at a chunk boundary.
Settings.chunk_size = 256
Settings.chunk_overlap = 25

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

LLM is explicitly disabled. Using MockLLM.


In [10]:
# Shell escape: print the current working directory.
# NOTE(review): presumably used to check where the relative "articles" folder
# is resolved from — confirm, or remove from the final notebook.
!pwd

/content


In [11]:
# Put the source documents (e.g. PDF files) in the "articles" folder
# before running this cell.
reader = SimpleDirectoryReader("articles")
documents = reader.load_data()

In [14]:
# Number of document objects returned by the directory reader.
print(len(documents))

28


In [21]:
# Drop every document whose text contains the marker "min read".
#
# Build a new list instead of calling documents.remove(doc) inside a loop over
# `documents`: removing from a list while iterating over it shifts the
# remaining items left, so the element right after each removal is skipped and
# consecutive matching documents would survive the filter.
documents = [doc for doc in documents if "min read" not in doc.text]


In [22]:
# Number of documents left after removing those containing "min read".
print(len(documents))

22


In [23]:
# Build the vector store index from the filtered documents.
# NOTE(review): presumably the documents are chunked and embedded using the
# global Settings configured earlier — confirm against the LlamaIndex docs.

index = VectorStoreIndex.from_documents(documents)

In [24]:
## Retriever setup

# Number of most-similar entries to fetch from the index for each query.
top_k = 2

# Retriever over the vector index, ranked by embedding similarity.
retriever = VectorIndexRetriever(index=index, similarity_top_k=top_k)

In [27]:
## Query engine for retrieval

# Post-processing step configured with a similarity cutoff of 0.5.
similarity_filter = SimilarityPostprocessor(similarity_cutoff=0.5)

# Ties everything together: takes the user query, runs the retriever and
# applies the post-processor to the retrieved nodes.
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    node_postprocessors=[similarity_filter],
)

In [40]:
# Run the retrieval query. The response exposes the retrieved chunks through
# `source_nodes` (read by the context-building cell further down).
# NOTE(review): per the original author's note, each node also carries metadata
# such as the source file name, page number and date — confirm by inspecting
# the response object.
input_query = 'What is LoRA?'
retrieved_output = query_engine.query(input_query)

In [None]:
# Bare expression: display the response object returned by query_engine.query.
retrieved_output

In [43]:
##reformat output

# Concatenate the text of the retrieved chunks into one context string, each
# chunk followed by a blank line.
#
# Iterate over the nodes that were actually returned rather than over
# range(top_k): the similarity cutoff applied by the query engine can leave
# fewer than top_k nodes, and indexing source_nodes[i] would then raise
# IndexError. The slice keeps the original cap of top_k chunks.
retrieved_nodes = retrieved_output.source_nodes[:top_k]
context = "Context: " + "".join(node.text + "\n\n" for node in retrieved_nodes)

print(context)

Context: Selective Fine-tuning: This involves freezing most of the LLM’s weights
and focusing on training only specific layers or components. This allows
for targeted improvement while preserving the pre-trained knowledge.
Reparameterization: This method utilizes techniques like LoRA (Low-
Rank Adaptation) to reduce the number of trainable parameters
significantly. LoRA injects smaller matrices alongside the original
weights, effectively compressing the information needed for fine-tuning.
LoRA: A Powerful Ally in PEFT
LoRA, or Low-Rank Adaptation, is a groundbreaking technique within PEFT.
LoRA reduces the number of trainable parameters to be trained by freezing
most of the original LLM weights. It injects two rank decomposed matrices
alongside original weights. Dimensions of these matrices are set so that their
product is the same dimension as the weights they are modifying.
For example: A base transformer model has weights of dimension 512 *
512, which results in 262144 trainable par

In [45]:
# Prompt that asks the question and appends the retrieved context,
# followed by a trailing blank line.
prompt_with_context = (
    "Could you answer what is LoRA based on the following context:\n\n"
    + context
    + "\n\n"
)

In [46]:
# Generation WITHOUT retrieved context (baseline for comparison).
# `prompt` is the plain question defined in the earlier baseline cell.
inputs = tokenizer(prompt, return_tensors = 'pt').to(device)

# Pass the attention mask explicitly rather than discarding it, so generate()
# does not have to infer it from the input ids.
outputs = model.generate(input_ids = inputs['input_ids'],
                         attention_mask = inputs['attention_mask'],
                         max_new_tokens = 70)

print(tokenizer.batch_decode(outputs)[0])

<bos>What do you know about LoRA?

* <strong>LoRA is a 501(c)(3) not-for-profit organization.</strong>
* <strong>LoRA is a 501(c)(3) not-for-profit organization.</strong>
* <strong>LoRA is a 501(c)(3) not-for


In [47]:
# Generation WITH the retrieved context prepended to the question.
inputs = tokenizer(prompt_with_context, return_tensors = 'pt').to(device)

# Pass the attention mask explicitly rather than discarding it, so generate()
# does not have to infer it from the input ids.
outputs = model.generate(input_ids = inputs['input_ids'],
                         attention_mask = inputs['attention_mask'],
                         max_new_tokens = 70)

print(tokenizer.batch_decode(outputs)[0])

<bos>Could you answer what is LoRA based on the following context:

Context: Selective Fine-tuning: This involves freezing most of the LLM’s weights
and focusing on training only specific layers or components. This allows
for targeted improvement while preserving the pre-trained knowledge.
Reparameterization: This method utilizes techniques like LoRA (Low-
Rank Adaptation) to reduce the number of trainable parameters
significantly. LoRA injects smaller matrices alongside the original
weights, effectively compressing the information needed for fine-tuning.
LoRA: A Powerful Ally in PEFT
LoRA, or Low-Rank Adaptation, is a groundbreaking technique within PEFT.
LoRA reduces the number of trainable parameters to be trained by freezing
most of the original LLM weights. It injects two rank decomposed matrices
alongside original weights. Dimensions of these matrices are set so that their
product is the same dimension as the weights they are modifying.
For example: A base transformer model has w