# Microsoft Phi - 2 | RAG implementation

Loading microsoft phi-2 in GPU, it takes arround 2GB of RAM 🚀.

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

base_model_id = "microsoft/phi-2"
bnb_config = BitsAndBytesConfig(
   load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype='float16',
        bnb_4bit_use_double_quant=False,
)

model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=bnb_config)
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)

  from .autonotebook import tqdm as notebook_tqdm


The repository for microsoft/phi-2 contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/microsoft/phi-2.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y
The repository for microsoft/phi-2 contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/microsoft/phi-2.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.19it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Custom function for checking GPU memory

In [2]:
import torch
def check_gpu_memory(): 

    # Check if GPU is available
    if torch.cuda.is_available():
        device = torch.device("cuda")
        gpu_memory = torch.cuda.get_device_properties(device).total_memory / 1e9 
        print(f"Total GPU Memory: {gpu_memory:.2f} GB")
        used_memory = torch.cuda.max_memory_allocated(device) / 1e9 
        print(f"Used GPU Memory: {used_memory:.2f} GB")
        return True
    else:
        print("GPU not available.")
        return False

## Chroma DB setup | Reading pdf's

In [3]:
import chromadb
chroma_client = chromadb.Client()

In [4]:
collection = chroma_client.create_collection(name="my_collection")

In [5]:
from pypdf import PdfReader
reader = PdfReader("data.pdf")
number_of_pages = len(reader.pages)

lis_of_docs = []
lis_of_metadatas = []
lis_of_ids = []

def string_formater(passed_string):
    words = passed_string.split()

    formatted_string = ' '.join(words)
    # print(formatted_string)
    return formatted_string
    
c = 0
for i in range(number_of_pages):
    lis_of_docs.append(string_formater(reader.pages[c].extract_text()))
    lis_of_metadatas.append({"source": "source" + str(c)})
    lis_of_ids.append(str(c))
    c+=1

In [6]:
collection.add(
    documents = lis_of_docs,
    metadatas = lis_of_metadatas,
    ids=lis_of_ids
)

This function will craft annswer and remove random strings, May be not work correctly, but it is working in most of cases.

In [7]:
def answer_formatter(input_string, word1, word2):
    start_index = input_string.find(word1)
    if word2 in input_string:
        return input_string[start_index: input_string.find(word2)]
    return input_string[start_index:]

# Running on custom questions | Inference

In [19]:
# question = "What are the skills kavyansh has? form the sentance in list."
question = "Companies for those Kavyansh worked for?"

results = collection.query(
    query_texts=[question],
    n_results=1
)

# print(results['documents'])

context = ""
for cnt in range(len(results['documents'])):
    context += str(results['documents'][cnt]) + "\n\n"


prompt = f"""Instruct: Response as Q&A Assitant and use data: {context} as reference. Question: {question} \nOutput:"""
model_inputs = tokenizer(prompt, return_tensors="pt", return_attention_mask=False)
outputs = model.generate(**model_inputs, max_length=1000)
text = tokenizer.batch_decode(outputs)[0]
# print(text)
answer = answer_formatter(text, 'Output:','<|endoftext|>')
print(answer)


Output: The companies Kavyansh worked for are:

- Solutions
- Datopic
- Hashedin by Deloitte
- Aviz Networks



In [20]:
print(check_gpu_memory())

Total GPU Memory: 25.43 GB
Used GPU Memory: 3.10 GB
True


## Note

Make sure the pdfs, docs and files you are loading into vector DB in simple format. If they are very good quality images, videos. they will consume more memory and chances you will get CUDA - Out of Memory Error.

Thanks
