In [17]:
import faiss
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
import re  


# Hugging Face checkpoint used to encode queries.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"

# Use the GPU when one is present, otherwise fall back to the CPU instead of
# crashing inside `.cuda()` on a machine without CUDA.
embedding_device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)
embedding_model = AutoModel.from_pretrained(embedding_model_name).to(embedding_device)

# Expected width of one embedding vector; checked against the index below.
embedding_dim = 384
index = faiss.read_index("fcw_faiss_index.index")

# Fail early with a readable message if the stored index was built with a
# different vector width than the one this notebook expects.
if index.d != embedding_dim:
    raise ValueError(
        f"FAISS index dimension {index.d} does not match embedding_dim {embedding_dim}"
    )

def embed_query(query):
    """Encode a query (or a list of queries) into mean-pooled embeddings.

    Args:
        query: A string, or a list of strings, passed straight to `tokenizer`.

    Returns:
        A 2-D NumPy array with one row per query, obtained by averaging the
        encoder's last hidden states over the real (non-padding) tokens.

    Notes:
        * Padding positions are excluded from the average via the attention
          mask. For a single query there is no padding, so the result matches
          a plain `.mean(dim=1)` up to floating-point rounding.
        * The vectors are not L2-normalised.
          NOTE(review): this presumes the FAISS index was built from
          un-normalised vectors as well; confirm against the indexing code.
    """
    # Send the inputs to whichever device the model lives on rather than
    # assuming "cuda".
    inputs = tokenizer(
        query, padding=True, truncation=True, return_tensors="pt"
    ).to(embedding_model.device)

    with torch.no_grad():
        token_states = embedding_model(**inputs).last_hidden_state

        # (batch, seq) -> (batch, seq, 1) so the mask broadcasts over the hidden axis.
        mask = inputs["attention_mask"].unsqueeze(-1).to(token_states.dtype)
        summed = (token_states * mask).sum(dim=1)
        # Clamp guards against a division by zero for an all-padding row.
        token_counts = mask.sum(dim=1).clamp(min=1e-9)
        embedding = summed / token_counts

    return embedding.cpu().numpy()

def retrieve_documents(query, k=5):
    """Search the FAISS index for the entries nearest to `query`.

    Args:
        query: Text handed to `embed_query`.
        k: How many neighbours to request from the index.

    Returns:
        A `(indices, distances)` tuple. Note that the order is the reverse of
        what `index.search` itself returns.
    """
    # The embedding is cast to float32 before it is handed to the index.
    query_vector = embed_query(query).astype(np.float32)
    hit_distances, hit_indices = index.search(query_vector, k)
    return hit_indices, hit_distances

def format_text(text, width=80):
    """Collapse whitespace in `text` and hard-wrap it to `width` columns.

    Every run of whitespace (spaces, tabs, newlines) becomes a single space,
    then a newline is inserted after each full run of `width` characters.
    The wrap is purely positional, so words may be split across lines.
    Leading and trailing whitespace of the final result is stripped.

    Args:
        text: The string to tidy up for display.
        width: Number of characters per output line (default 80).

    Returns:
        The re-flowed string.

    Raises:
        ValueError: If `width` is smaller than 1 or is not an integer.
    """
    if width < 1:
        raise ValueError("width must be a positive integer")

    text = re.sub(r"\s+", " ", text)
    # Insert a line break after every `width` characters for readability.
    # `.` never has to cross a newline here because all whitespace was just
    # collapsed to single spaces.
    formatted_text = re.sub(rf"(.{{{width:d}}})", r"\1\n", text)
    return formatted_text.strip()

query = "What is the car's response to a forward collision?"  # Replace with your query
indices, distances = retrieve_documents(query)

# Rebuild the chunk list so the integer labels returned by the index can be
# mapped back to text.
# NOTE(review): this assumes the PDF is chunked exactly as it was when the
# index was built; verify against the indexing cell.
text_chunks = split_text_into_chunks(extract_text_from_pdf("fcw_rag.pdf"))

print("Retrieved documents (indices):", indices)
print("Distances to the query:", distances)
print("\nFormatted Retrieved Documents:")

# `indices` has one row per query; only a single query was issued.
for idx in indices[0]:
    # FAISS fills unused result slots with -1 when fewer than k neighbours
    # exist. Python would treat -1 as "last element" and print an unrelated
    # chunk, so skip those placeholders.
    if idx < 0:
        continue
    formatted_text = format_text(text_chunks[idx])
    print(f"Document {idx}:\n{formatted_text}\n")


Retrieved documents (indices): [[34 30  3 33  0]]
Distances to the query: [[26.819763 26.998878 27.133656 27.365654 27.848797]]

Formatted Retrieved Documents:
Document 34:
.x<(egoV ehicle.x+egoV ehicle.width)&&(object .x+object .width)>egoV ehicle.x&&o
bject .y<(egoV ehicle.y+egoV ehicle.heig ht)&&(object .y+object .heig ht)>egoV e
hicle.y){returntrue;//Collisiondetected}returnfalse;//Nocollision} //Functiontot
!Boundingboxoverlapdetected."<<std::endl;//Additionalalertlogic(audio,visual,etc
.)canbetrigger edhere}} intmain(){//MISRAC++:2023Rule5-0-4(Noglobalvariables)//S
imulat edboundingboxesforthedetectedobjectandtheegovehicleBoundingBo xdetectedOb
ject={50.0f,30.0f,5.0f,3.0f};//Objectat(50,30)withwidth5mandheight3mBoundingBo x
egoV ehicle={0.0f,0.0f,4.5f,2.0f};//Egovehicleat(0,0)withwidth4.5mandheight2m //
CheckforcollisionusingboundingboxoverlapboolcollisionDet ected=isCollision(det e
lisionW arning(collisionDet ected); return0;} E x p l a n a t i o n : 1.Bounding
Bo xDefinition: ○

In [11]:
pip install PyPDF2 transformers torch faiss-cpu sentence-transformers bitsandbytes


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting bitsandbytes
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl (69.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m206.3 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.0
Note: you may need to restart the kernel to use updated packages.


In [13]:
pip install 'accelerate>=0.26.0'


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting accelerate>=0.26.0
  Downloading accelerate-1.2.1-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.2.1-py3-none-any.whl (336 kB)
Installing collected packages: accelerate
Successfully installed accelerate-1.2.1
Note: you may need to restart the kernel to use updated packages.
