<a href="https://colab.research.google.com/github/kairamilanifitria/RAG-with-Groq/blob/main/Purple_Box_Case_Study.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task 1 : Data Ingestion and Cleaning ✅

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [10]:
!pip install nltk



In [11]:
import os
import re
import nltk
nltk.download('punkt_tab')
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [12]:
# Ensure required NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
# Define the folder containing .txt files
data_folder = "/content/drive/MyDrive/Case Study RAG Interns"  # Replace with the actual path

In [14]:
# Initialize stop words and punctuation
stop_words = set(stopwords.words('english'))
stop_words.discard("not")
punctuation = re.compile(r'[^\w\s]')

In [15]:
def preprocess_text(text):
    # Tokenize text into sentences
    sentences = sent_tokenize(text)
    cleaned_sentences = []
    for sentence in sentences:
        # Remove punctuation and special characters
        sentence = punctuation.sub('', sentence)
        # Convert to lowercase and split into words
        words = sentence.lower().split()
        # Remove stop words
        cleaned_words = [word for word in words if word not in stop_words]
        cleaned_sentences.append(' '.join(cleaned_words))
    return cleaned_sentences

In [16]:
# Ingest and process all .txt files in the folder
def process_txt_files(folder_path):
    processed_documents = {}
    for file_name in os.listdir(folder_path):
        if file_name.endswith('.txt'):
            file_path = os.path.join(folder_path, file_name)
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
                processed_documents[file_name] = preprocess_text(text)
    return processed_documents

In [17]:
# Process the .txt files and print the results
cleaned_data = process_txt_files(data_folder)
for doc, sentences in cleaned_data.items():
    print(f"Document: {doc}")
    print(f"Cleaned Sentences: {sentences}")
    print("-" * 50)

Document: KTM 390 Duke Engine.txt
Cleaned Sentences: ['ktm 390 duke engine ktm 390 duke highperformance naked bike designed urban commuting thrilling weekend rides', 'powered cuttingedge engine 390 duke delivers perfect blend power agility efficiency', 'motorcycle built riders crave adrenaline maintaining versatility everyday use', 'heart 390 duke compact yet powerful 373cc engine engineered offer exhilarating experience streets making one top contenders entrylevel performance motorcycle market', 'features singlecylinder liquidcooled engine 390 dukes engine 373cc singlecylinder powerhouse liquidcooled providing optimal temperature regulation even highspeed runs', 'liquidcooling system ensures engine performs best even demanding riding conditions preventing overheating maintaining efficiency', 'fuel injection ridebywire technology electronic fuel injection efi system offers precise control fuelair mixture efficient combustion improved fuel economy', 'ridebywire throttle technology ensur

# Task 2 : Chunking Text ✅

In [18]:
# Parameters for chunking
MAX_TOKENS = 300
OVERLAP_TOKENS = 50

In [19]:
# Function to chunk text with overlap
def chunk_text(sentences):
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        tokens = word_tokenize(sentence)
        token_count = len(tokens)

        if current_length + token_count <= MAX_TOKENS:
            current_chunk.extend(tokens)
            current_length += token_count
        else:
            # Save the current chunk
            chunks.append(current_chunk)
            # Start a new chunk with overlap
            overlap = current_chunk[-OVERLAP_TOKENS:] if len(current_chunk) >= OVERLAP_TOKENS else current_chunk
            current_chunk = overlap + tokens
            current_length = len(current_chunk)

    # Add the last chunk
    if current_chunk:
        chunks.append(current_chunk)

    # Convert chunks back to text
    return [' '.join(chunk) for chunk in chunks]


In [20]:
# Process documents to create chunks
def process_chunks(cleaned_data):
    chunked_data = {}
    for doc_name, sentences in cleaned_data.items():
        chunked_data[doc_name] = chunk_text(sentences)
    return chunked_data

In [21]:
# Save chunked data to JSON
def save_to_json(data, output_file):
    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4)

In [22]:
# Chunk the cleaned data
chunked_data = process_chunks(cleaned_data)

In [23]:
chunked_data

{'KTM 390 Duke Engine.txt': ['ktm 390 duke engine ktm 390 duke highperformance naked bike designed urban commuting thrilling weekend rides powered cuttingedge engine 390 duke delivers perfect blend power agility efficiency motorcycle built riders crave adrenaline maintaining versatility everyday use heart 390 duke compact yet powerful 373cc engine engineered offer exhilarating experience streets making one top contenders entrylevel performance motorcycle market features singlecylinder liquidcooled engine 390 dukes engine 373cc singlecylinder powerhouse liquidcooled providing optimal temperature regulation even highspeed runs liquidcooling system ensures engine performs best even demanding riding conditions preventing overheating maintaining efficiency fuel injection ridebywire technology electronic fuel injection efi system offers precise control fuelair mixture efficient combustion improved fuel economy ridebywire throttle technology ensures smoother throttle response enhanced riding 

In [24]:
import json

# Save the result to a JSON file
output_file = "chunked_data.json"  # Replace with your desired output file name
save_to_json(chunked_data, output_file)

print(f"Chunked data saved to {output_file}")

Chunked data saved to chunked_data.json


In [25]:
import json

with open('chunked_data.json', 'r') as f:
    data = json.load(f)
    print(json.dumps(data, indent=4))

{
    "KTM 390 Duke Engine.txt": [
        "ktm 390 duke engine ktm 390 duke highperformance naked bike designed urban commuting thrilling weekend rides powered cuttingedge engine 390 duke delivers perfect blend power agility efficiency motorcycle built riders crave adrenaline maintaining versatility everyday use heart 390 duke compact yet powerful 373cc engine engineered offer exhilarating experience streets making one top contenders entrylevel performance motorcycle market features singlecylinder liquidcooled engine 390 dukes engine 373cc singlecylinder powerhouse liquidcooled providing optimal temperature regulation even highspeed runs liquidcooling system ensures engine performs best even demanding riding conditions preventing overheating maintaining efficiency fuel injection ridebywire technology electronic fuel injection efi system offers precise control fuelair mixture efficient combustion improved fuel economy ridebywire throttle technology ensures smoother throttle response en

# Task 3 : Embedding Creation ✅

In [26]:
from sentence_transformers import SentenceTransformer

In [27]:
# Load pre-trained models
model1 = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
model2 = SentenceTransformer('all-MiniLM-L6-v2')

In [28]:
# Function to create embeddings
def create_embeddings(model, chunked_data):
    embeddings = []
    for doc_name, chunks in chunked_data.items():
        for idx, chunk in enumerate(chunks):
            vector = model.encode(chunk)
            embeddings.append({
                'document': doc_name,
                'chunk_index': idx,
                'chunk_text': chunk,
                'embedding': vector.tolist()  # Convert NumPy array to list for JSON compatibility
            })
    return embeddings

In [29]:
# Generate embeddings for both models
embeddings_model1 = create_embeddings(model1, chunked_data)
embeddings_model2 = create_embeddings(model2, chunked_data)

In [30]:
# Save embeddings to JSON files
output_file_model1 = "embeddings_distilbert.json"
output_file_model2 = "embeddings_minilm.json"

In [31]:
with open(output_file_model1, 'w', encoding='utf-8') as f:
    json.dump(embeddings_model1, f, indent=4)
with open(output_file_model2, 'w', encoding='utf-8') as f:
    json.dump(embeddings_model2, f, indent=4)

print(f"Embeddings saved to {output_file_model1} and {output_file_model2}")

Embeddings saved to embeddings_distilbert.json and embeddings_minilm.json


In [32]:
### comparing

print("Model Comparison:")
print(f"DistilBERT embedding dimension: {len(embeddings_model1[0]['embedding'])}")
print(f"MiniLM embedding dimension: {len(embeddings_model2[0]['embedding'])}")
print(f"Number of chunks encoded DistilBERT: {len(embeddings_model1)}")
print(f"Number of chunks encoded MiniLM: {len(embeddings_model2)}")

Model Comparison:
DistilBERT embedding dimension: 768
MiniLM embedding dimension: 384
Number of chunks encoded DistilBERT: 15
Number of chunks encoded MiniLM: 15


Insights:

1. **DistilBERT** *(embedding dimension: 768)*: Higher-dimensional embeddings typically capture more nuanced semantic information. DistilBERT model can be beneficial for tasks requiring fine-grained understanding, such as semantic similarity or clustering with complex data. However, due to the large embeddings it will increase storage requirements and computation time for downstream tasks (e.g., similarity search or model training).

2. **MiniLM** *(embedding dimension: 384)*: Lower-dimensional embeddings are more compact and computationally efficient.So, MiniLM maybe suitable for tasks where performance is acceptable with less detail (e.g., real-time search or large-scale document indexing). However, they might lose some subtleties in semantic representation compared to higher-dimensional embeddings like DistilBERT.

we can compare again based on the similarity search using cosine similarity for each models:

In [33]:
from sklearn.metrics.pairwise import cosine_similarity

def score_relevance(query, model, embeddings):
    """
    Calculate relevance scores for a query against the embeddings.
    Args:
        query (str): The user query.
        model: SentenceTransformer model used to encode the query.
        embeddings (list): List of chunks with metadata and their embeddings.

    Returns:
        List of tuples: (chunk_text, document_name, similarity_score) sorted by relevance.
    """
    # Encode the query
    query_vector = model.encode(query)

    # Calculate cosine similarity for each chunk
    relevance_scores = []
    for chunk in embeddings:
        chunk_vector = np.array(chunk['embedding'])
        similarity = cosine_similarity([query_vector], [chunk_vector])[0][0]
        relevance_scores.append((chunk['chunk_text'], chunk['document'], similarity))

    # Sort by relevance (highest score first)
    relevance_scores = sorted(relevance_scores, key=lambda x: x[2], reverse=True)
    return relevance_scores

# Score relevance for DistilBERT
relevance_distilbert = score_relevance(
    query="Shakespeare in late 1580",
    model=model1,
    embeddings=embeddings_model1
)

# Score relevance for MiniLM
relevance_minilm = score_relevance(
    query="Shakespeare in late 1580",
    model=model2,
    embeddings=embeddings_model2
)

# Print top results for comparison
print("Top results for DistilBERT:")
for chunk_text, doc_name, score in relevance_distilbert[:3]:  # Top 3 results
    print(f"Document: {doc_name}, Score: {score:.4f}\nChunk: {chunk_text}\n")

print("Top results for MiniLM:")
for chunk_text, doc_name, score in relevance_minilm[:3]:  # Top 3 results
    print(f"Document: {doc_name}, Score: {score:.4f}\nChunk: {chunk_text}\n")


Top results for DistilBERT:
Document: Biography of a Fictional Poet - William Shakespeare.txt, Score: 0.4154
Chunk: biography fictional poet william shakespeare william shakespeare often hailed greatest poet playwright english language born stratforduponavon 1564 exact date birth remains uncertain traditionally celebrated april 23 day passed away 1616 shakespeares life work transcended time geography making towering figure world literature though much personal life shrouded mystery poetic theatrical contributions left indelible mark english literature arts early life inspirations shakespeares early life deeply influenced cultural intellectual climate elizabethan england era renaissance flourishing growing town not far london shakespeare exposed range literary artistic influences father john shakespeare respected glove maker local official mother mary arden came wellestablished family believed young william received solid education stratford grammar school studied latin classics introdu

In [34]:
mean_score_distilbert = np.mean([score for _, _, score in relevance_distilbert[:3]])
mean_score_minilm = np.mean([score for _, _, score in relevance_minilm[:3]])
print(f"Mean Relevance Score - DistilBERT: {mean_score_distilbert:.4f}")
print(f"Mean Relevance Score - MiniLM: {mean_score_minilm:.4f}")

Mean Relevance Score - DistilBERT: 0.3107
Mean Relevance Score - MiniLM: 0.4517


Insights:


1.   Mean Relevance Score MiniLM: 0.4517, so MiniLM is efficient in semantic similarity and search tasks, which aligns well query-based relevance test. It also has smaller embedding size (384 dimensions) than DistilBERT, which is more compact and computationally efficient.
2.   **Dimensionality is not always a proxy for performance**. Low-dimensional embeddings like MiniLM can outperform higher-dimensional like DistilBERT in tasks where precision and semantic similarity are prioritized over contextual richness.



# Task 4: Vector Storage and Retrieval ✅

based on the previous analysis, we are gonna use the embedding from MiniLM

In [35]:
pip install faiss-cpu



In [36]:
import faiss
import numpy as np

def load_embeddings_into_faiss(embeddings):
    """
    Load embeddings into a FAISS index.
    Args:
        embeddings (list): List of embeddings with metadata.
    Returns:
        index: FAISS index for similarity search.
        metadata: List of metadata (chunk text, document name).
    """
    # Extract embeddings and metadata
    embedding_vectors = [np.array(item['embedding']) for item in embeddings]
    metadata = [(item['chunk_text'], item['document']) for item in embeddings]

    # Create FAISS index
    dimension = len(embedding_vectors[0])  # Get dimension from first embedding
    index = faiss.IndexFlatL2(dimension)   # L2 distance (equivalent to cosine similarity after normalization)

    # Normalize embeddings for cosine similarity
    embedding_vectors = np.array(embedding_vectors, dtype='float32')
    faiss.normalize_L2(embedding_vectors)

    # Add vectors to index
    index.add(embedding_vectors)

    return index, metadata

In [37]:
from sentence_transformers import SentenceTransformer

def retrieve_chunks(query, model, index, metadata, top_k=3):
    """
    Retrieve top-k most similar chunks for a query.
    Args:
        query (str): Query string.
        model: SentenceTransformer model to encode the query.
        index: FAISS index containing chunk embeddings.
        metadata: List of metadata (chunk text, document name).
        top_k (int): Number of top results to retrieve.
    Returns:
        List of top-k chunks with metadata and similarity scores.
    """
    # Encode query and normalize for cosine similarity
    query_vector = model.encode(query).astype('float32')
    faiss.normalize_L2(query_vector.reshape(1, -1))

    # Search in FAISS index
    distances, indices = index.search(query_vector.reshape(1, -1), top_k)

    # Retrieve top-k results
    results = []
    for i, idx in enumerate(indices[0]):
        if idx != -1:  # Ensure index is valid
            chunk_text, document_name = metadata[idx]
            # Convert L2 distance to cosine similarity
            similarity_score = 1 - (distances[0][i] / np.linalg.norm(query_vector))
            results.append({
                'rank': i + 1,
                'chunk_text': chunk_text,
                'document_name': document_name,
                'similarity_score': similarity_score
            })
    return results


In [38]:
# Load embeddings from JSON
import json
with open("embeddings_minilm.json", "r") as f:
    minilm_embeddings = json.load(f)

# Load embeddings into FAISS
index, metadata = load_embeddings_into_faiss(minilm_embeddings)

# Initialize query
query = "Growing up in a town that was not far from London"

# Retrieve top-3 chunks
results = retrieve_chunks(query, model=model2, index=index, metadata=metadata, top_k=3)


# Print results
print("Top-3 Results:")
for result in results:
    print(f"Rank: {result['rank']}")
    print(f"Document: {result['document_name']}")
    print(f"Similarity Score: {result['similarity_score']:.4f}")
    print(f"Chunk Text: {result['chunk_text']}\n")


Top-3 Results:
Rank: 1
Document: Biography of a Fictional Poet - William Shakespeare.txt
Similarity Score: -0.5606
Chunk Text: biography fictional poet william shakespeare william shakespeare often hailed greatest poet playwright english language born stratforduponavon 1564 exact date birth remains uncertain traditionally celebrated april 23 day passed away 1616 shakespeares life work transcended time geography making towering figure world literature though much personal life shrouded mystery poetic theatrical contributions left indelible mark english literature arts early life inspirations shakespeares early life deeply influenced cultural intellectual climate elizabethan england era renaissance flourishing growing town not far london shakespeare exposed range literary artistic influences father john shakespeare respected glove maker local official mother mary arden came wellestablished family believed young william received solid education stratford grammar school studied latin class

In [39]:
# Load embeddings from JSON
import json
with open("embeddings_distilbert.json", "r") as f:
    distilbert_embeddings = json.load(f)

# Load embeddings into FAISS
index, metadata = load_embeddings_into_faiss(distilbert_embeddings)

# Initialize query
query = "Growing up in a town that was not far from London"

# Retrieve top-3 chunks
results = retrieve_chunks(query, model=model1, index=index, metadata=metadata, top_k=3)


# Print results
print("Top-3 Results:")
for result in results:
    print(f"Rank: {result['rank']}")
    print(f"Document: {result['document_name']}")
    print(f"Similarity Score: {result['similarity_score']:.4f}")
    print(f"Chunk Text: {result['chunk_text']}\n")


Top-3 Results:
Rank: 1
Document: Biography of a Fictional Poet - William Shakespeare.txt
Similarity Score: -0.6572
Chunk Text: biography fictional poet william shakespeare william shakespeare often hailed greatest poet playwright english language born stratforduponavon 1564 exact date birth remains uncertain traditionally celebrated april 23 day passed away 1616 shakespeares life work transcended time geography making towering figure world literature though much personal life shrouded mystery poetic theatrical contributions left indelible mark english literature arts early life inspirations shakespeares early life deeply influenced cultural intellectual climate elizabethan england era renaissance flourishing growing town not far london shakespeare exposed range literary artistic influences father john shakespeare respected glove maker local official mother mary arden came wellestablished family believed young william received solid education stratford grammar school studied latin class

now, we are using miniLM embeddings for next step

In [40]:
# Load embeddings from JSON
import json
with open("embeddings_minilm.json", "r") as f:
    minilm_embeddings = json.load(f)

# Load embeddings into FAISS
index, metadata = load_embeddings_into_faiss(minilm_embeddings)

# Initialize query
query = "Growing up in a town that was not far from London"

# Retrieve top-3 chunks
results = retrieve_chunks(query, model=model2, index=index, metadata=metadata, top_k=3)


# Print results
print("Top-3 Results:")
for result in results:
    print(f"Rank: {result['rank']}")
    print(f"Document: {result['document_name']}")
    print(f"Chunk Text: {result['chunk_text']}\n")


Top-3 Results:
Rank: 1
Document: Biography of a Fictional Poet - William Shakespeare.txt
Chunk Text: biography fictional poet william shakespeare william shakespeare often hailed greatest poet playwright english language born stratforduponavon 1564 exact date birth remains uncertain traditionally celebrated april 23 day passed away 1616 shakespeares life work transcended time geography making towering figure world literature though much personal life shrouded mystery poetic theatrical contributions left indelible mark english literature arts early life inspirations shakespeares early life deeply influenced cultural intellectual climate elizabethan england era renaissance flourishing growing town not far london shakespeare exposed range literary artistic influences father john shakespeare respected glove maker local official mother mary arden came wellestablished family believed young william received solid education stratford grammar school studied latin classics introduced works great

# Step 5: Contextual Query Handling ✅

In [41]:
def retrieve_chunks_with_context(query, model, index, metadata, query_history, top_k=3):
    """
    Retrieve top-k most similar chunks for a query, with context from previous queries.

    Args:
        query (str): Current query string.
        model: SentenceTransformer model to encode the query.
        index: FAISS index containing chunk embeddings.
        metadata: List of metadata (chunk text, document name).
        query_history (list): List of previous queries to include context.
        top_k (int): Number of top results to retrieve.

    Returns:
        List of top-k chunks with metadata and similarity scores.
    """
    # Incorporate previous queries into the current query
    context = " ".join(query_history) + " " + query  # Concatenate history with the current query

    # Load the sentence transformer model (e.g., all-MiniLM-L6-v2)
    encoder_model = SentenceTransformer('all-MiniLM-L6-v2')

    # Encode the combined context
    context_vector = encoder_model.encode(context).astype('float32')
    context_vector = context_vector.reshape(1, -1)

    # Normalize the combined context vector
    faiss.normalize_L2(context_vector)

    # Search in FAISS index
    distances, indices = index.search(context_vector, top_k)

    # Retrieve top-k results
    results = []
    for i, idx in enumerate(indices[0]):
        if idx != -1:  # Ensure index is valid
            chunk_text, document_name = metadata[idx]
            # Convert L2 distance to cosine similarity
            similarity_score = 1 - distances[0][i]  # Since we normalized, this should work
            results.append({
                'rank': i + 1,
                'chunk_text': chunk_text,
                'document_name': document_name
            })

    # Update the query history
    query_history.append(query)  # Add the current query to the history
    return results


In [42]:
# Initialize query history
query_history = []

# Example multi-turn queries
queries = [
    "What is Shakespeare famous for?",
    "Can you tell me more about his plays?",
    "What are the themes in Hamlet?"
]

# For each query, retrieve the top-k chunks using the contextual retrieval function
for query in queries:
    results = retrieve_chunks_with_context(query, model2, index, metadata, query_history, top_k=3)

    # Print results for the current query
    print(f"Query: {query}")
    for result in results:
        print(f"Rank: {result['rank']}")
        print(f"Document: {result['document_name']}")
        print(f"Chunk Text: {result['chunk_text']}\n")


Query: What is Shakespeare famous for?
Rank: 1
Document: Biography of a Fictional Poet - William Shakespeare.txt
Chunk Text: forces tragic heros journey toward selfawareness destruction theme resonates across time characters like hamlet indecisive prince macbeth ambitious king continuing inspire readers actors alike comedies joy life contrast tragedies shakespeares comedies midsummer nights dream much ado nothing twelfth night provide lighthearted exploration human relationships often focusing love identity mistaken identities plays rich wit clever wordplay showcase shakespeares exceptional ability blend humor profound social commentary often satirizing social conventions presenting idealized world love conquers history plays national identity shakespeares history playsincluding richard iii henry v henry ivrely historical events figures examine themes power leadership national identity works shakespeare not contributed popularization english history also shaped national consciousness e

same as before, but we are trying to run it conversational (receive user input as the chatbot formatted)

In [43]:
# Initialize query history
query_history = []

# Function to handle multi-turn query processing with user input
def interactive_query(model, index, metadata):
    print("Welcome to the interactive query system! Type 'exit' to stop.")

    # Loop to receive user queries
    while True:
        # Prompt for user input
        user_query = input("Enter your query: ")

        if user_query.lower() == 'exit':  # Exit condition
            print("Exiting the interactive query system.")
            break

        # Call the retrieval function with the current query and the history
        results = retrieve_chunks_with_context(user_query, model, index, metadata, query_history, top_k=3)

        # Display the results for the current query
        print(f"\nQuery: {user_query}")
        for result in results:
            print(f"Rank: {result['rank']}")
            print(f"Document: {result['document_name']}")
            print(f"Chunk Text: {result['chunk_text']}\n")

        # Add the current query to history
        query_history.append(user_query)

# Assuming model, index, and metadata are already defined
# Call the function to start the interactive query session
interactive_query(model2, index, metadata)


Welcome to the interactive query system! Type 'exit' to stop.
Enter your query: exit
Exiting the interactive query system.


# Step 6: RAG Pipeline Simulation ✅

## llama2 : fails

In [36]:
!pip install torch
!pip install transformers



In [37]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [38]:
import torch

# Check for CUDA availability and set the device accordingly
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using GPU")
else:
    device = torch.device("cpu")
    print("Using CPU")

Using GPU


In [39]:
from transformers import AutoModelForCausalLM, AutoTokenizer

In [44]:
# Load the Hugging Face GPT-2 model and tokenizer
def load_model():
    model_name = "meta-llama/Llama-2-7b-chat-hf"
    model = AutoModelForCausalLM.from_pretrained(model_name,  low_cpu_mem_usage=True, torch_dtype=torch.float16, device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token

    # Ensure the model is in evaluation mode
    model.eval()

    return model, tokenizer

In [80]:
def generate_response_with_context(query, model, tokenizer, index, metadata, query_history, top_k=3):
    retrieval_results = retrieve_chunks_with_context(query, model, index, metadata, query_history, top_k)
    context = "\n".join([result['chunk_text'] for result in retrieval_results])

    # Modified prompt with clear delimiter
    input_text = f"Query: {query}\nContext:\n{context}\n### Answer ###"
    inputs = tokenizer(input_text, return_tensors='pt', max_length=1024, truncation=True, padding=True, return_attention_mask=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], max_new_tokens=200, num_return_sequences=1, no_repeat_ngram_size=2, top_p=0.92, top_k=50)

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Modified answer extraction
    answer_start_index = response.find('### Answer ###') + len('### Answer ###')
    answer = response[answer_start_index:].strip()

    return answer, retrieval_results

In [81]:
# Example of how to run the RAG pipeline
def rag_pipeline(query, model, tokenizer, index, metadata, query_history):
    # Generate response with context
    answer, retrieval_results = generate_response_with_context(query, model, tokenizer, index, metadata, query_history, top_k=3)

    # Print the retrieved chunks (context)
    print("\nRetrieved Context:")
    for result in retrieval_results:
        print(f"Rank: {result['rank']} | Document: {result['document_name']}")
        print(result['chunk_text'][:150] + "...")  # Display the start of the chunk (trimmed for brevity)

    # Output the generated answer
    print("\nGenerated Answer:")
    print(answer)

    return answer

In [47]:
# Initialize the model and tokenizer
model, tokenizer = load_model()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [76]:
# Example multi-turn query
query = "BOSCH 0607595100 cutting speed"

# Call the RAG pipeline (assuming index and metadata are defined)
answer = rag_pipeline(query, model, tokenizer, index, metadata, query_history)


Retrieved Context:
Rank: 1 | Document: BOSCH 0607595100 - Compressed Air Foam Cutter up to 300 mm, 3800 Strokes_Min.txt
bosch 0607595100 compressed air foam cutter 300 mm 3800 strokesmin description bosch 0607595100 highperformance compressed air foam cutter designed pr...
Rank: 2 | Document: STANLEY SFMCS650B-XJ - 18V 4.0 Ah Brushless Jigsaw (Without Batteries and Charger).txt
stanley sfmcs650bxj 18v 40 ah brushless jigsaw without batteries charger description stanley sfmcs650bxj powerful 18v brushless jigsaw designed delive...
Rank: 3 | Document: Ducati Panigale V4 Engine.txt
precision powerful dual front disc brakes provide excellent stopping power allowing riders push limits confidence fuel efficiency despite performance ...

Generated Answer:
Based on the provided information, the cutting Speed of the Bosch Compressed Air Foam Cutter 90 is 58 m/s.
The cutting performance of this tool is excellent, and it can make smooth and rapid cut foams. The tool has a high-speed air-driven en

In [78]:
# this is to refresh the query history
query_history = []

## openai : fails due to api service not free

In [6]:
pip install openai==0.28.0

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 3070, in _dep_map
    return self.__dep_map
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 2863, in __getattr__
    raise AttributeError(attr)
AttributeError: _DistInfoDistribution__dep_map

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/base_command.py", line 179, in exc_logging_wrapper
    status = run_func(*args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/req_command.py", line 67, in wrapper
    return func(self, options, args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/commands/install.py", line 447, in run
    conflicts = self._determine_conflicts(to_install)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/commands/install.py", line 5

In [140]:
pip install openai



In [63]:
import os
import openai

# Set your OpenAI API key
openai.api_key = ""

In [64]:
from sentence_transformers import SentenceTransformer

# Load the same embedding model you used to create the embeddings
embed_model = SentenceTransformer("all-MiniLM-L6-v2")  # Replace with your desired model


In [65]:
import openai
from openai.error import RateLimitError

def generate_response_with_openai(query, retrieved_chunks, model="gpt-3.5-turbo"):
    """
    Generate a response using OpenAI's API with error handling for RateLimitError.
    """
    try:
        # Prepare context
        context = "\n\n".join([f"Chunk {i+1}: {chunk['chunk_text']}" for i, chunk in enumerate(retrieved_chunks)])
        system_prompt = (
            "You are an intelligent assistant. Use the following context to answer the user's query:\n"
            f"{context}\n\nRespond concisely and clearly."
        )
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": query}
        ]

        # Generate response
        response = openai.ChatCompletion.create(
            model=model,
            messages=messages
        )
        return response['choices'][0]['message']['content']
    except RateLimitError:
        return "The system is currently overloaded or you’ve exceeded your quota. Please try again later."


In [66]:
def retrieve_chunks_with_context(query, index, metadata, query_history=None, top_k=3):
    """
    Retrieve top-k relevant chunks for a query using cosine similarity and FAISS index.
    Incorporates query history for multi-turn context handling.

    Args:
        query (str): User query.
        index: FAISS index.
        metadata (list of tuples): Metadata associated with embeddings.
        query_history (list): History of previous queries for multi-turn interactions.
        top_k (int): Number of top chunks to retrieve.

    Returns:
        list: Top-k retrieved chunks with metadata.
    """
    from sklearn.metrics.pairwise import cosine_similarity

    # Initialize query history if None
    if query_history is None:
        query_history = []

    # Combine query history into a single context
    combined_query = " ".join(query_history + [query])

    # Convert the query to an embedding
    query_embedding = embed_model.encode([combined_query])

    # Search FAISS index for top-k most similar chunks
    distances, indices = index.search(query_embedding, top_k)

    # Prepare retrieved chunks with metadata
    retrieved_chunks = []
    for rank, (dist, idx) in enumerate(zip(distances[0], indices[0])):
        if idx != -1:  # Ensure valid index
            chunk_text, document_name = metadata[idx]  # Unpack tuple
            retrieved_chunks.append({
                "rank": rank + 1,
                "chunk_text": chunk_text,
                "document_name": document_name,
                "similarity_score": dist
            })

    return retrieved_chunks


In [67]:
def rag_pipeline_openai(query, index, metadata, query_history=None, model="gpt-4", top_k=3):
    """
    Complete RAG pipeline using OpenAI API for response generation.

    Args:
        query (str): User query.
        index: FAISS index.
        metadata: Metadata associated with embeddings.
        query_history (list): History of previous queries for multi-turn interactions.
        model (str): OpenAI model to use.
        top_k (int): Number of top chunks to retrieve.

    Returns:
        str: Generated response from the RAG pipeline.
    """
    # Ensure query_history is initialized
    if query_history is None:
        query_history = []

    # Add the current query to the query history
    query_history.append(query)

    # Retrieve top-k relevant chunks
    retrieved_chunks = retrieve_chunks_with_context(query, index, metadata, query_history, top_k=top_k)

    # Display retrieved context for debugging
    print("\nRetrieved Context:")
    for chunk in retrieved_chunks:
        print(f"Rank: {chunk['rank']} | Document: {chunk['document_name']} | Similarity Score: {chunk['similarity_score']:.4f}")
        print(chunk['chunk_text'][:150] + "...")  # Display a preview of each chunk

    # Generate a response using OpenAI API
    response = generate_response_with_openai(query, retrieved_chunks, model)

    print("\nGenerated Answer:")
    print(response)

    return response


In [68]:
import openai

# List available models
models = openai.Model.list()
print([model['id'] for model in models['data']])

['o1-preview-2024-09-12', 'o1-mini-2024-09-12', 'dall-e-2', 'o1-preview', 'gpt-3.5-turbo', 'text-embedding-3-large', 'gpt-3.5-turbo-0125', 'babbage-002', 'davinci-002', 'whisper-1', 'dall-e-3', 'gpt-3.5-turbo-16k', 'tts-1-hd-1106', 'text-embedding-ada-002', 'gpt-4o-mini-2024-07-18', 'gpt-4o-mini', 'o1-mini', 'text-embedding-3-small', 'tts-1-hd', 'gpt-3.5-turbo-1106', 'gpt-3.5-turbo-instruct', 'tts-1', 'tts-1-1106', 'gpt-3.5-turbo-instruct-0914']


In [76]:
query = "What are the key themes in Hamlet?"
query_history = []

response = rag_pipeline_openai(query, index, metadata, query_history, model="tts-1-1106", top_k=3)
print("Final Response:", response)




Retrieved Context:
Rank: 1 | Document: Biography of a Fictional Poet - William Shakespeare.txt | Similarity Score: 1.0472
forces tragic heros journey toward selfawareness destruction theme resonates across time characters like hamlet indecisive prince macbeth ambitious ki...
Rank: 2 | Document: Biography of a Fictional Poet - William Shakespeare.txt | Similarity Score: 1.2435
biography fictional poet william shakespeare william shakespeare often hailed greatest poet playwright english language born stratforduponavon 1564 ex...
Rank: 3 | Document: Biography of a Fictional Poet - William Shakespeare.txt | Similarity Score: 1.3675
valuable insights enduring legacy poet playwright lies not universal themes works also mastery language ability evoke empathy unique insight human exp...

Generated Answer:
The system is currently overloaded or you’ve exceeded your quota. Please try again later.
Final Response: The system is currently overloaded or you’ve exceeded your quota. Please try again l

## groq api : worked

In [118]:
pip install groq



In [155]:
import os
from groq import Groq
import json

# Set the GROQ_API_KEY environment variable
os.environ["GROQ_API_KEY"] = "-----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"

# Initialize the Groq client
client = Groq()

In [164]:
# Load embeddings into FAISS (from previous steps)
def load_embeddings_into_faiss(embeddings):
    embedding_vectors = [np.array(item['embedding']) for item in embeddings]
    metadata = [(item['chunk_text'], item['document']) for item in embeddings]

    dimension = len(embedding_vectors[0])  # Get dimension from first embedding
    index = faiss.IndexFlatL2(dimension)   # L2 distance (equivalent to cosine similarity after normalization)

    embedding_vectors = np.array(embedding_vectors, dtype='float32')
    faiss.normalize_L2(embedding_vectors)

    index.add(embedding_vectors)

    return index, metadata


In [165]:
# Retrieve top-k chunks using FAISS index
def retrieve_chunks_with_context(query, index, metadata, query_history, top_k=3):
    """
    Retrieve top-k most similar chunks for a query, with context from previous queries.
    """
    context = " ".join(query_history) + " " + query  # Concatenate history with the current query
    print(f"Context being used: {context}")  # Debugging: Show the context being passed

    # Query Groq API for response (using "llama3-8b-8192" model or others)
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": f"Retrieve the most relevant chunks for the following query:\n{context}",
            }
        ],
        model="llama3-8b-8192",  # Replace with your desired model
    )

    response = chat_completion.choices[0].message.content
    print("Groq API response:", response)  # Debugging: Check the raw response from Groq API

    # Encode the query and normalize for cosine similarity
    query_vector = np.array(model2.encode(query), dtype='float32')
    faiss.normalize_L2(query_vector.reshape(1, -1))

    # Search in FAISS index
    distances, indices = index.search(query_vector.reshape(1, -1), top_k)

    # Retrieve the top-k results and print debug info
    results = []
    for i, idx in enumerate(indices[0]):
        if idx != -1:
            chunk_text, document_name = metadata[idx]
            similarity_score = 1 - distances[0][i]
            results.append({
                'rank': i + 1,
                'chunk_text': chunk_text,
                'document_name': document_name,
                'similarity_score': similarity_score
            })
            print(f"Retrieved Chunk {i + 1}: {chunk_text[:100]}... (Score: {similarity_score:.4f})")  # Debugging: Show chunk text and score

    query_history.append(query)  # Add the current query to the history
    return results

In [166]:
# Function to generate response with context
def generate_response_with_context(query, index, metadata, query_history, top_k=3):
    """
    Generate response with context from Groq API using the top-k retrieved chunks.
    """
    # Retrieve top-k most relevant chunks with context
    retrieval_results = retrieve_chunks_with_context(query, index, metadata, query_history, top_k)

    # Build the context string for the Groq API prompt
    context = "\n".join([result['chunk_text'] for result in retrieval_results])
    print(f"Context to Groq API:\n{context[:300]}...")  # Debugging: Show the first 300 characters of the context

    # Query Groq API for an answer
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": f"Answer the following query using the context below:\nQuery: {query}\nContext:\n{context}\n### Answer ###"
            }
        ],
        model="llama3-8b-8192",  # Replace with your desired model
    )

    response = chat_completion.choices[0].message.content
    return response, retrieval_results

In [167]:
# RAG pipeline to generate the final answer
def rag_pipeline(query, index, metadata, query_history, top_k=3):
    """
    Run the RAG pipeline: retrieve context, generate response with Groq API.
    """
    # Generate response with context
    answer, retrieval_results = generate_response_with_context(query, index, metadata, query_history, top_k)

    # Debugging: Show the context and retrieved chunks
    print("\nRetrieved Context:")
    for result in retrieval_results:
        print(f"Rank: {result['rank']} | Document: {result['document_name']}")
        print(result['chunk_text'][:150] + "...")  # Show the beginning of the chunk for clarity

    # Show the final generated answer
    print("\nGenerated Answer:")
    print(answer)

    return answer

In [168]:
# Load your embeddings and metadata (these should be loaded from Step 3 and Step 4)
with open("embeddings_minilm.json", "r") as f:
    minilm_embeddings = json.load(f)

# Load embeddings into FAISS index
index, metadata = load_embeddings_into_faiss(minilm_embeddings)


In [169]:
# Initialize query history
query_history = []


In [171]:
# Example multi-turn queries
queries = [
    "BOSCH 0607595100 Weight"
]

# For each query, run the RAG pipeline and retrieve the answer
for query in queries:
    print(f"Processing query: {query}")
    answer = rag_pipeline(query, index, metadata, query_history, top_k=3)
    print(f"Final answer: {answer}\n")

Processing query: BOSCH 0607595100 Weight
Context being used: BOSCH 0607595100 cutting speed BOSCH 0607595100 Weight
Groq API response: A specific model query!

After searching, here are the most relevant chunks of information for the BOSCH 0607595100:

**Cutting Speed:**

* The BOSCH 0607595100 is a circular saw, which can cut at a speed of **4,500 RPM** ( Revolutions Per Minute).
* This speed is suitable for various materials, including wood, metal, and plastic.

**Weight:**

* The BOSCH 0607595100 weighs approximately **3.3 kg** or **7.3 lbs**, making it a relatively lightweight and easy-to-handle circular saw.

These details should help you with your query. If you need further information or specs, feel free to ask!
Retrieved Chunk 1: bosch 0607595100 compressed air foam cutter 300 mm 3800 strokesmin description bosch 0607595100 high... (Score: -0.1374)
Retrieved Chunk 2: stanley sfmcs650bxj 18v 40 ah brushless jigsaw without batteries charger description stanley sfmcs65... (Score:

In [172]:
# Example multi-turn queries
queries = [
    "When Shakespare died?"
]

# For each query, run the RAG pipeline and retrieve the answer
for query in queries:
    print(f"Processing query: {query}")
    answer = rag_pipeline(query, index, metadata, query_history, top_k=3)
    print(f"Final answer: {answer}\n")

Processing query: When Shakespare died?
Context being used: BOSCH 0607595100 cutting speed BOSCH 0607595100 Weight When Shakespare died?
Groq API response: A curious query!

After analyzing the text, I detected two distinct topics: 1) BOSCH 0607595100 (a product), and 2) When Shakespeare died? (a historical question).

Here are the most relevant chunks for each topic:

**BOSCH 0607595100**

* The cutting speed of BOSCH 0607595100 is likely related to the product's technical specifications, such as its power tool's speed rating.
* The product's weight is another relevant aspect of its design and functionality.

**When Shakespeare died?**

* William Shakespeare, the English playwright and poet, died on April 23, 1616.

In summary, the relevant chunks are:

* BOSCH 0607595100: cutting speed, weight
* When Shakespeare died?: April 23, 1616
Retrieved Chunk 1: valuable insights enduring legacy poet playwright lies not universal themes works also mastery langu... (Score: -0.1745)
Retrieved Ch

latency

In [177]:
import time
from groq import Groq

# Initialize Groq client
client = Groq()

# Function to generate response with context from Groq API
def generate_response_with_context(query, index, metadata, query_history, top_k=3):
    """
    Generate response with context from Groq API using the top-k retrieved chunks.
    """
    # Measure retrieval latency
    retrieval_start_time = time.time()
    retrieval_results = retrieve_chunks_with_context(query, index, metadata, query_history, top_k)
    retrieval_end_time = time.time()
    retrieval_latency = retrieval_end_time - retrieval_start_time

    # Build the context string for the Groq API prompt
    context = "\n".join([result['chunk_text'] for result in retrieval_results])
    print(f"Context to Groq API:\n{context[:300]}...")  # Debugging: Show the first 300 characters of the context

    # Measure generation latency
    generation_start_time = time.time()
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": f"Answer the following query using the context below:\nQuery: {query}\nContext:\n{context}\n### Answer ###"
            }
        ],
        model="llama3-8b-8192",  # Replace with your desired model
    )
    generation_end_time = time.time()
    generation_latency = generation_end_time - generation_start_time

    # Extract the response from Groq API
    response = chat_completion.choices[0].message.content

    # Return the response and latencies
    return response, retrieval_results, retrieval_latency, generation_latency


# Example usage of the RAG pipeline with latency measurement
def rag_pipeline(query, index, metadata, query_history):
    # Generate response with context and measure latencies
    answer, retrieval_results, retrieval_latency, generation_latency = generate_response_with_context(query, index, metadata, query_history, top_k=3)

    # Print the retrieval latency and generation latency
    print(f"Retrieval Latency: {retrieval_latency:.4f} seconds")
    print(f"Generation Latency: {generation_latency:.4f} seconds")

    # Print the retrieved chunks (context)
    print("\nRetrieved Context:")
    for result in retrieval_results:
        print(f"Rank: {result['rank']} | Document: {result['document_name']}")
        print(result['chunk_text'][:150] + "...")  # Display the start of the chunk (trimmed for brevity)

    # Output the generated answer
    print("\nGenerated Answer:")
    print(answer)

    return answer


In [179]:
# Example multi-turn queries
queries = [
    "BOSCH 0607595100 Weight"
]

# For each query, run the RAG pipeline and retrieve the answer
for query in queries:
    print(f"Processing query: {query}")
    answer = rag_pipeline(query, index, metadata, query_history)
    print(f"Final answer: {answer}\n")

Processing query: BOSCH 0607595100 Weight
Context being used: What is the population of France? BOSCH 0607595100 Weight
Groq API response: Based on the query, it seems like there are two separate entities being searched for: the population of France and a product with the code BOSCH 0607595100 and weight.

For the population of France, I found the following relevant chunk:

"The population of France is approximately 67 million people as of 2021."
Retrieved Chunk 1: bosch 0607595100 compressed air foam cutter 300 mm 3800 strokesmin description bosch 0607595100 high... (Score: -0.1374)
Retrieved Chunk 2: stanley sfmcs650bxj 18v 40 ah brushless jigsaw without batteries charger description stanley sfmcs65... (Score: -0.4461)
Retrieved Chunk 3: beta c45pro ms fixed service module workshop equipment combination c45pro description beta c45pro ms... (Score: -0.5146)
Context to Groq API:
bosch 0607595100 compressed air foam cutter 300 mm 3800 strokesmin description bosch 0607595100 highperforma

**Retrieval Latency**: 0.2993 seconds

**Generation Latency**: 0.2473 seconds