# Virtual enviroment creation

In [1]:
!rag_env\Scripts\activate

In [1]:
import torch, numpy
from sentence_transformers import SentenceTransformer
import chromadb

print("Torch:", torch.__version__)
print("Numpy:", numpy.__version__)
print("ChromaDB:", chromadb.__version__)


  from .autonotebook import tqdm as notebook_tqdm


Torch: 2.2.0+cpu
Numpy: 1.26.4
ChromaDB: 1.2.0


# Extraction from PDF

* Extracts raw text from the PDF using Unstructured.

* Splits the document into chunks based on section titles (by_title).

* Limits each chunk to 3,000 characters; merges short ones and splits long ones.

* Produces clean text blocks ready for embedding generation in the RAG pipeline.

In [2]:
from unstructured.partition.pdf import partition_pdf
from unstructured.cleaners.core import clean, replace_unicode_quotes
import re
import nltk

PDF_PATH = "458italia.pdf"

elements = partition_pdf(
    filename=PDF_PATH,
    strategy="fast",
    infer_table_structure=True,
    chunking_strategy="by_title",
    max_characters=3000,
    combine_text_under_n_chars=300,
    new_after_n_chars=2000,
    extract_images_in_pdf=False,
)

nltk.download("punkt", quiet=True)

def clean_and_split_text(text: str) -> str:
    text = re.sub(r'(Downloaded\s+from\s+www\.Manualslib\.com.*?engine)+', '', text, flags=re.I)

    text = replace_unicode_quotes(text)
    text = clean(text)

    text = re.sub(r'[\•\-\–\·]+', ' ', text)

    text = re.sub(r'\s+', ' ', text.strip())

    return text

texts = [
    clean_and_split_text(el.text)
    for el in elements
    if getattr(el, "text", None)
]

print(f"Chunk #: {len(texts)}")
print("Example: "+texts[0][:500])


Chunk #: 316
Example: Owner's manual Introduction 4 General remarksThis vehicle, which complies with EC homologation parameters, uses advanced technology and is capable of achieving high performance levels. It is equipped with sophisticated active and passive safety systems (described below).These safety features and systems do not authorise the driver to take risks other than those involved in normal driving since their preventive and protective action is guaranteed only in certain conditions. Unless otherwise instr


### create and save chuncks into the JSON

In [3]:
import json

with open("chunks.jsonl", "w", encoding="utf-8") as f:
    for text in texts:
        json.dump({"text": text}, f, ensure_ascii=False)
        f.write("\n")

print(f" Saved {len(texts)} chunk in chunks.jsonl")


 Saved 316 chunk in chunks.jsonl


## ChromaDB Embedding Pipeline

- Set database path and target collection  
- Connect to Chroma persistent client and clean old collections  
- Create collection with SentenceTransformer embedding function  
- Load text chunks from `chunks.jsonl`  
- Encode texts and store embeddings in ChromaDB  


In [17]:
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.utils import embedding_functions
import json
import os

DB_PATH = "chroma_db"
TARGET_COLLECTION = "rag_docs_final"

# Ensure folder exists
os.makedirs(DB_PATH, exist_ok=True)

# Connect to persistent DB
client = chromadb.PersistentClient(path=DB_PATH)

# Keep only the target collection
for c in client.list_collections():
    if c.name != TARGET_COLLECTION:
        print(f"Deleting collection: {c.name}")
        client.delete_collection(c.name)

# Define embedding function
embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2"
)

# Create collection with embedding function
collection = client.get_or_create_collection(
    name=TARGET_COLLECTION,
    embedding_function=embedding_fn
)

print(f"\nUsing only collection: '{TARGET_COLLECTION}' (ID: {collection.id})")

# Load text chunks
with open("chunks.jsonl", "r", encoding="utf-8") as f:
    texts = [json.loads(line)["text"] for line in f]

print(f"Loaded {len(texts)} text chunks.")

# Encode and store embeddings explicitly
model = SentenceTransformer("all-MiniLM-L6-v2")

for i, text in enumerate(texts):
    emb = model.encode(text).tolist()
    collection.add(
        ids=[str(i)],
        documents=[text],
        embeddings=[emb],
    )

print(f"Stored {len(texts)} chunks with embeddings in persistent Chroma DB.")


Deleting collection: rag_docs

Using only collection: 'rag_docs_final' (ID: 410fc847-2b65-4c63-a2a8-5670c7822193)
Loaded 316 text chunks.
Stored 316 chunks with embeddings in persistent Chroma DB.


## Visualization

In [18]:
import numpy as np

# Retrieve data from the persistent collection
data = collection.get(include=["documents", "embeddings"])

if not data["ids"]:
    print("No data found in the collection.")
else:
    print(f"\nCollection '{TARGET_COLLECTION}' contains {len(data['ids'])} items.\n")
    for i in range(min(3, len(data["ids"]))):
        doc = data["documents"][i][:300].replace("\n", " ")
        emb = np.array(data["embeddings"][i])
        print(f"ID: {data['ids'][i]}")
        print(f"Text (beginning): {doc}...")
        print(f"Embedding (first 10 dims): {emb[:10]}")
        print("-" * 100)



Collection 'rag_docs_final' contains 316 items.

ID: 0
Text (beginning): Owner's manual Introduction 4 General remarksThis vehicle, which complies with EC homologation parameters, uses advanced technology and is capable of achieving high performance levels. It is equipped with sophisticated active and passive safety systems (described below).These safety features and sys...
Embedding (first 10 dims): [-0.01569819  0.07482845 -0.08127558  0.06908744  0.03128455  0.01125603
  0.10095844  0.07520975 -0.06108274 -0.05573786]
----------------------------------------------------------------------------------------------------
ID: 1
Embedding (first 10 dims): [ 0.02050922  0.06295383 -0.05008964  0.02681744  0.03926263  0.04689884
  0.10666895  0.06132101 -0.02233545 -0.0314818 ]
----------------------------------------------------------------------------------------------------
ID: 2
Text (beginning): 5 putting road safety ﬁrst; for example, under conditions of poor or limited visibility, l

# Retrival

- Load stored documents and embeddings from ChromaDB  
- Encode query using `SentenceTransformer`  
- Compute cosine similarity between query and stored embeddings  
- Return and print top 3 most relevant documents


In [27]:
from sentence_transformers import SentenceTransformer
import numpy as np

query = "What is the fuel inertia?"

query_emb = model.encode(query).tolist()

results = collection.query(
    query_embeddings=[query_emb],
    n_results=3
)

for i, doc in enumerate(results["documents"][0]):
    print(f"\n--- Risultato {i+1} ---")
    print(doc[:400]) 



--- Risultato 1 ---

--- Risultato 2 ---

--- Risultato 3 ---
Safety The exhaust gas generated by the running engine may be hazardous, especially when in closed spaces. As well as consuming oxygen, the engine discharges carbon dioxide, carbon oxide and other toxic gases. Fuel is highly inﬂammable and emits vapours which may be noxious if inhaled. Do not use naked ﬂames or create sparks near the open fuel tank or in any other condition where fuel comes into c


# Asking

- Retrieve top 3 relevant text chunks from ChromaDB using semantic search  
- Build a contextual prompt combining the retrieved text and user query  
- Send the prompt to the `llama-3.1-8b-instant` model via Groq API  
- Generate and print an answer grounded in the provided context  

In [None]:
from groq import Groq

client = Groq(api_key="GROQ_API_KEY")  

results = collection.query(query_texts=[query], n_results=3)
context = "\n\n".join(results["documents"][0])

prompt = f"""
You are a helpful assistant.
Use only the context below to answer the question.

Context:
{context}

Question: {query}
"""

completion = client.chat.completions.create(
    model="llama-3.1-8b-instant",
    messages=[
        {"role": "system", "content": ""},
        {"role": "user", "content": prompt},
    ],
    temperature=0.2,
    max_tokens=500,
)

print(completion.choices[0].message.content)


The fuel inertia switch is a safety device that deactivates the fuel pump relays if a collision occurs.
