# 1. Set Up the Notebook

In [1]:
# test_rag_pipeline.ipynb
# Import necessary modules
from rag_skeleton.data_processing import DataProcessor
from rag_skeleton.retrieval import DocumentRetriever
from rag_skeleton.rag import RAGPipeline

# Initialize paths and model names shared by the cells below
data_path = "data/raw"  # Path where your PDFs are stored
vectordb_path = "vectordb"  # Path for vector database (deleted and rebuilt in section 3)
embedding_model_name = "Alibaba-NLP/gte-large-en-v1.5"  # Passed to both DocumentRetriever and RAGPipeline
model_name = "meta-llama/Llama-3.2-3B-Instruct"  # Passed to RAGPipeline (presumably the generator LLM; confirm)

  from .autonotebook import tqdm as notebook_tqdm


# 2. Test Loading PDFs

In [2]:
# Step 1: Load PDFs and inspect content
#processor = DataProcessor(data_path=data_path)
#documents = processor.load_documents(enrich_metadata=True)  # Enable metadata enrichment

#print("Loaded Documents:")
#for i, doc in enumerate(documents[:3]):  # Display the first 3 documents for brevity
#    print(f"Document {i+1}:")
#    print("Content:", doc.page_content[:500])  # Show the first 500 characters of the content
#    print("Metadata:", doc.metadata)
#    print("\n---\n")



# 2. Test Loading JSON Transcripts

In [3]:
from pathlib import Path
import json
from langchain.schema import Document

def load_whisper_json_transcripts(json_dir):
    """Load every ``*.json`` transcript in ``json_dir`` as a ``Document``.

    Each file is parsed as JSON and must expose a ``"text"`` key; that value
    becomes the document's ``page_content``.

    Args:
        json_dir: Directory (str or path-like) searched non-recursively for
            ``*.json`` files.

    Returns:
        list: One ``Document`` per file, ordered by file path. Each carries
        the metadata keys ``"video_id"`` (taken from the file, falling back
        to the file name stem), ``"segments"`` (taken from the file, falling
        back to ``[]``) and ``"source"`` (the file path as a string).

    Raises:
        KeyError: If a transcript has no ``"text"`` key.
    """
    docs = []
    # Path.glob() yields entries in arbitrary, filesystem-dependent order;
    # sorting keeps the document order stable from run to run.
    for json_path in sorted(Path(json_dir).glob("*.json")):
        # Only parsing needs the file handle, so release it before building
        # the Document.
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        try:
            page_content = data["text"]
        except KeyError:
            # Same exception type as before, but now it says which file is bad.
            raise KeyError(f"'text' key missing in transcript {json_path}") from None

        # NOTE(review): "segments" is stored as-is (a list by default). Some
        # vector stores only accept scalar metadata values -- confirm that
        # DocumentRetriever.index_documents copes with this.
        metadata = {
            "video_id": data.get("video_id", json_path.stem),
            "segments": data.get("segments", []),
            "source": str(json_path),
        }
        docs.append(Document(page_content=page_content, metadata=metadata))
    return docs

# Directory holding the Whisper JSON transcripts. This is a machine-specific
# absolute path: edit it here when running the notebook elsewhere.
transcripts_path = "G:/My Drive/teaching/RAG_AI_Tutor/AskSparks/transcripts_lectures_large"
documents = load_whisper_json_transcripts(transcripts_path)


# 3. Test Retriever for Context Retrieval

In [4]:
import shutil
import os

# WARNING: destructive step. If anything exists at vectordb_path, the whole
# directory tree (i.e. the previously built vector DB) is removed.
vectordb_present = os.path.exists(vectordb_path)
if vectordb_present:
    shutil.rmtree(vectordb_path)
    print(f"🧹 Deleted existing vector DB at {vectordb_path}")


🧹 Deleted existing vector DB at vectordb


In [5]:
# Step 2: Index documents into ChromaDB
retriever = DocumentRetriever(
    vectordb_path=vectordb_path,
    embedding_model_name=embedding_model_name,
)
# Indexing has to be run at least once before the retriever is queried
retriever.index_documents(documents)

print("✅ Vector store built and documents indexed.")


# Sample question used for the retrieval check
question = "Why do metals become harder with smaller grain size?"

# Ask the retriever for the documents matching the question
doc_retriever = retriever.get_retriever()
retrieved_docs = doc_retriever.invoke(question)

print("Retrieved Context:")
# Show at most three hits, each truncated to its first 500 characters
for rank, hit in enumerate(retrieved_docs[:3], start=1):
    print(f"Retrieved Document {rank}:")
    print("Content:", hit.page_content[:500])
    print("Metadata:", hit.metadata)
    print("\n---\n")

NotFoundError: Database default_database not found for tenant default_tenant. Are you sure it exists?

# 4. Test Prompt Generation with RAGPipeline

In [None]:
# Step 3: Test prompt generation with RAGPipeline
rag_pipeline = RAGPipeline(
    vectordb_path=vectordb_path,
    embedding_model_name=embedding_model_name,
    model_name=model_name,
)
rag_pipeline.setup_pipeline()  # Initialize and set up the pipeline

# No prior conversation for this check, so the history slot stays empty;
# the context slot is filled from the first retrieved document.
history = ""
first_context = retrieved_docs[0].page_content
formatted_prompt = rag_pipeline.prompt.format(
    history=history,
    context=first_context,
    question=question,
)

print("Generated Prompt:")
print(formatted_prompt)

Loading checkpoint shards: 100%|██████████| 2/2 [00:13<00:00,  6.54s/it]


RAG Pipeline is ready for queries.
Generated Prompt:

        <|start_header_id|>system<|end_header_id|>
        You are a knowledgeable assistant specializing in materials science. Answer each question directly and concisely, providing only the necessary information in a straightforward manner, using only the provided context and conversation history. Do NOT reference the question or instructions in the prompt.
        If you lack sufficient information, respond with "I do not know". Don't fabricate answers.
        <|eot_id|>
        <|start_header_id|>user<|end_header_id|>
        Question: Can you explain the environmental impact of traditional materials, especially in terms of energy consumption and pollution?
        Context: Environmental Impact and Sustainable 
Materials in Materials Science 
1. Introduction to Environmental Impact and Sustainable Materials 
The global demand for materials has grown significantly due to industrialization, population 
growth, and technological a

# 5. Full Pipeline Test: Get Response

In [None]:
# Step 4: Full pipeline test - send the question through the pipeline and show the answer
response = rag_pipeline.get_response(question)

print("Generated Response:", response, sep="\n")

Generated Response:
 The environmental impact of traditional materials is significant, resulting in resource depletion, energy consumption, pollution, and waste generation. Key impacts include:

Resource Depletion: Extraction of non-renewable resources leads to depletion of Earth's reserves.

Energy Consumption: Material production is energy-intensive, often derived from fossil fuels, increasing greenhouse gas emissions.

Pollution and Emissions: Processing materials releases pollutants into the air, water, and soil, contributing to air pollution.

Waste Generation: Industrial processes generate large amounts of waste, including hazardous waste from outdated devices and long-lasting pollution from conventional plastics.

These traditional materials have led to a growing emphasis on sustainable alternatives that aim to mitigate resource depletion, reduce emissions, and minimize waste.

For further reference, look at:
- data/raw\Environmental Impact and Sustainable Materials in Materials

# 6. Test Multiple Questions and Prompt Previews

In [None]:
from rag_skeleton.rag import RAGPipeline

# Initialize the RAG pipeline
vectordb_path = "vectordb"  # Replace with your vector database path
model_name = "meta-llama/Llama-3.2-3B-Instruct"  # Replace with your model name
# Same embedding model the vector store was indexed with in section 3. It is
# passed explicitly so retrieval does not depend on RAGPipeline's default
# embedding model.
embedding_model_name = "Alibaba-NLP/gte-large-en-v1.5"
rag_pipeline = RAGPipeline(
    vectordb_path=vectordb_path,
    embedding_model_name=embedding_model_name,
    model_name=model_name,
)
rag_pipeline.setup_pipeline()

# Define the questions you want to test with history.
# Each one builds on the previous answer, so order matters.
questions = [
    "Can you explain the environmental impact of traditional materials, especially in terms of energy consumption and pollution?",
    "Given the environmental concerns with traditional materials, what sustainable alternatives are commonly used to reduce this impact, and how do they compare in terms of recyclability and biodegradability?",
    "Among the sustainable materials you mentioned, how does green concrete specifically contribute to reducing emissions, and what are some of its unique properties compared to traditional concrete?",
    "How does the use of green concrete and other sustainable materials align with the principles of the circular economy, and what role does recycling play in this context?"
]

# Preview the prompt that will be passed to the LLM for each question.
# The loop variable is deliberately not called `question`, so the sample
# question defined in section 3 is not overwritten when this cell runs.
for number, followup in enumerate(questions, start=1):
    print(f"Preview of Prompt for Question {number}:")
    prompt_text = rag_pipeline.preview_prompt(followup)
    print(prompt_text)
    print("\n" + "="*80 + "\n")

    # Optionally, invoke the response to update the history
    response = rag_pipeline.get_response(followup)
