# Introduction

This Jupyter notebook walks you through building a RAG (retrieval-augmented generation) system from scratch.

I strongly recommend reading the [README](./readme.md) for background on this topic before diving into the code.


# Setup dev env


## Python Virtual Environment

- [Read here](https://realpython.com/python-virtual-environments-a-primer/) why a venv is useful
- Uncomment and run the cell below to create a venv


In [None]:
# Create a Python virtual environment
#!python -m venv rag_venv

# Add the virtual environment folder to ".gitignore" file
#with open(".gitignore", "a") as f:
#    f.write("rag_venv/\n")


- Activate the virtual environment:
  - On Windows - `.\rag_venv\Scripts\activate`
  - On macOS/Linux - `source rag_venv/bin/activate`


## Install Packages


In [None]:
# Install all dependencies
!pip install -r requirements.txt


## Load environment variables

The Groq API key is loaded into environment variables (e.g. from a local `.env` file) using the [python-dotenv package](https://pypi.org/project/python-dotenv/).


In [None]:
# Load the variables managed by python-dotenv into the process environment
# so that later cells can read GROQ_API_KEY through os.environ / os.getenv.
from dotenv import load_dotenv
load_dotenv()
import os
# Debug helpers: uncomment to confirm that the variables were loaded.
# NOTE: printing GROQ_API_KEY stores the secret in the notebook output;
# clear the output before sharing or committing the notebook.
#print(os.getenv('MY_VAR'))
#print(os.getenv('GROQ_API_KEY'))


# Ask Questions


In [62]:
import os
import groq
from typing import List
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from pypdf import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import json

# Groq client; the API key is read from the GROQ_API_KEY environment variable
client = groq.Groq(api_key=os.environ.get("GROQ_API_KEY"))

# Sentence-transformer model used to embed both the document chunks and the queries
model = SentenceTransformer("all-MiniLM-L6-v2")

# Function to extract text content from a PDF file
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``.

    The text of each page is followed by a newline, in page order.
    """
    with open(pdf_path, 'rb') as pdf_file:
        pdf_pages = PdfReader(pdf_file).pages
        # Extraction happens inside the ``with`` block, while the file is still open
        return ''.join(page.extract_text() + '\n' for page in pdf_pages)

# Function to split the extracted text into smaller chunks for processing
def create_chunks(text, chunk_size=500, chunk_overlap=50):
    """Split ``text`` into chunks with a RecursiveCharacterTextSplitter.

    Args:
        text: The text to split.
        chunk_size: Maximum size of each chunk, measured with ``len``.
        chunk_overlap: Overlap between consecutive chunks.

    Returns:
        The chunks produced by the splitter's ``split_text``.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,  # chunk length is counted in characters
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)

# Process all PDF files in the specified directory and create chunks from their text
pdf_directory = './input_files/'  # Path to the directory containing PDF files
all_chunks = []
# os.listdir() returns entries in arbitrary order; sorting keeps the chunk order
# (and therefore the positions stored in the FAISS index) reproducible across runs
for filename in sorted(os.listdir(pdf_directory)):
    if filename.lower().endswith('.pdf'):  # Accept the extension regardless of case (".pdf", ".PDF")
        pdf_path = os.path.join(pdf_directory, filename)  # Get the full path of the PDF file
        text = extract_text_from_pdf(pdf_path)  # Extract text from the PDF
        chunks = create_chunks(text)  # Split the text into chunks
        all_chunks.extend(chunks)  # Add chunks to the list of all chunks

# Fail early with a clear message; with no chunks the index-building code below
# has nothing to embed and would otherwise fail with a less helpful error
if not all_chunks:
    raise ValueError(f"No text chunks were extracted from PDF files in {pdf_directory!r}")

# Generate embeddings for each text chunk using the sentence transformer model
embeddings = model.encode(all_chunks)

# Create a FAISS index for efficient similarity search based on embeddings
dimension = embeddings.shape[1]  # Determine the dimensionality of the embeddings
index = faiss.IndexFlatL2(dimension)  # Initialize a FAISS index using L2 distance (Euclidean)
index.add(embeddings.astype('float32'))  # Add the embeddings to the index

# Initialize cache
cache_file = 'semantic_cache.json'

# Function to load the cache from a JSON file
def load_cache():
    """Load the semantic cache from ``cache_file``.

    Returns:
        A dict with the parallel lists "queries", "embeddings" and "responses".
        A fresh, empty cache is returned when the file does not exist, is not
        valid JSON, or does not have that structure (three lists of equal length).
    """
    empty_cache = {"queries": [], "embeddings": [], "responses": []}
    try:
        with open(cache_file, 'r') as f:
            loaded = json.load(f)
    except FileNotFoundError:
        return empty_cache
    except ValueError:
        # json.JSONDecodeError is a ValueError: the file is truncated or corrupt
        print(f"Ignoring unreadable cache file {cache_file!r}; starting with an empty cache.")
        return empty_cache

    # The lookup code indexes "responses" by the position found in "embeddings",
    # so the three lists must exist and stay the same length
    well_formed = (
        isinstance(loaded, dict)
        and all(isinstance(loaded.get(key), list) for key in empty_cache)
        and len({len(loaded[key]) for key in empty_cache}) == 1
    )
    if not well_formed:
        print(f"Ignoring malformed cache file {cache_file!r}; starting with an empty cache.")
        return empty_cache
    return loaded

# Function to save the cache to a JSON file
def save_cache(cache):
    """Write ``cache`` to ``cache_file`` as JSON.

    Args:
        cache: JSON-serializable cache dictionary.

    Raises:
        TypeError: If ``cache`` contains a value that JSON cannot represent.
            The existing cache file is left untouched in that case.
    """
    # Serialize before opening the file: opening with 'w' truncates it, so a
    # serialization error raised mid-write would leave a corrupt cache on disk
    serialized = json.dumps(cache)
    with open(cache_file, 'w') as f:
        f.write(serialized)

# Load the cache once into a module-level dict; retrieve_from_cache() reads it
# and update_cache() appends to it and writes it back to disk
cache = load_cache()

# Function to retrieve a response from the cache based on query similarity
def retrieve_from_cache(query_embedding, threshold=0.35):
    """Return the cached response whose query embedding is nearest to ``query_embedding``.

    Args:
        query_embedding: Embedding vector of the incoming query.
        threshold: Maximum (exclusive) L2 distance for a cache hit.

    Returns:
        The response stored for the nearest cached embedding when its L2
        distance is below ``threshold``; otherwise ``None`` (also when the
        cache is empty).
    """
    cached_embeddings = cache['embeddings']
    if not cached_embeddings:
        return None

    # Compute all distances in one vectorized step instead of a Python loop
    deltas = np.asarray(cached_embeddings, dtype=float) - np.asarray(query_embedding, dtype=float)
    distances = np.linalg.norm(deltas, axis=1)

    # Use the nearest entry: the first entry under the threshold is not
    # necessarily the most similar one
    nearest = int(np.argmin(distances))
    if distances[nearest] < threshold:
        return cache['responses'][nearest]
    return None

# Function to update the cache with a new query, embedding, and response
def update_cache(query, query_embedding, response):
    """Append a new entry to the cache and write the cache to disk.

    Args:
        query: The query text.
        query_embedding: Embedding of the query (NumPy array or plain sequence).
        response: The value to return on a later cache hit.
    """
    # Convert before touching the cache: if the conversion fails, none of the
    # three parallel lists has been appended to, so they cannot get out of sync
    embedding_as_list = np.asarray(query_embedding).tolist()

    cache['queries'].append(query)
    cache['embeddings'].append(embedding_as_list)
    cache['responses'].append(response)
    save_cache(cache)

# Function to retrieve the most relevant chunks of text based on a query
def retrieve_relevant_chunks(query, top_k=5):
    """Return the text chunks most similar to ``query``.

    Args:
        query: The question to search for.
        top_k: Number of nearest chunks requested from the FAISS index.

    Returns:
        A list of chunk strings. On a cache hit the list stored for the
        similar earlier query is returned as-is.

    NOTE(review): a cache hit ignores ``top_k``; the cached list has whatever
    length was stored when that entry was created.
    """
    # Encode the query into an embedding vector
    query_vector = model.encode([query])[0]

    # Check cache for a similar query
    cached_response = retrieve_from_cache(query_vector)
    if cached_response:
        print("Answer recovered from Cache.")
        return cached_response

    # Perform a search in the FAISS index to find the top_k most similar chunks
    _distances, neighbor_ids = index.search(np.array([query_vector]).astype('float32'), top_k)
    # FAISS pads the result with -1 when the index holds fewer than top_k vectors;
    # a -1 would silently pick the last chunk, so those slots are dropped
    relevant_chunks = [all_chunks[i] for i in neighbor_ids[0] if i >= 0]

    # Update cache with the new query and its response
    update_cache(query, query_vector, relevant_chunks)
    return relevant_chunks

# Function to generate a response using the Groq API based on relevant chunks
def generate_response(query: str, relevant_chunks: List[str], model: str = "llama-3.1-8b-instant") -> tuple: #llama-3.1-8b-instant, gemma2-9b-it
    """Ask the Groq chat model to answer ``query`` from the given chunks.

    Args:
        query: The user's question.
        relevant_chunks: Context passages; they are joined with newlines.
        model: Name of the Groq model to call.

    Returns:
        A ``(response, usage_info)`` tuple: the stripped answer text and the
        ``usage`` attribute of the API response.
    """
    # Combine the relevant chunks into a single context string
    context = "\n".join(relevant_chunks)
    # Construct the prompt with the context and the query
    prompt = f"""Based on the following context, please answer the question. If the answer is not in the context, say "I don't have enough information to answer that question."

Context:
{context}

Question: {query}

Answer:"""

    # Call the Groq API to generate a completion based on the prompt
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant that answers questions based on the given context."
            },
            {
                "role": "user",
                "content": prompt
            }
        ],
        model=model,  # Specify the model to use for generating the response
        temperature=0.5,  # Control the randomness of the response
        max_tokens=1024,  # Limit the maximum number of tokens in the response
        top_p=1,  # Use the top-p sampling strategy
        stream=False,  # Disable streaming of the response
        stop=None  # Do not specify any stopping criteria
    )

    # The message content can be missing (None); treat that as an empty answer
    # instead of failing on None.strip()
    content = chat_completion.choices[0].message.content
    response = (content or "").strip()
    usage_info = chat_completion.usage
    return response, usage_info

# Function to answer a query with retrieval-augmented generation (RAG)
def rag_query(query: str, top_k: int = 5) -> str:
    """Retrieve context for ``query`` and return the generated answer.

    Args:
        query: The user's question.
        top_k: Number of chunks to retrieve as context.

    Returns:
        The answer text produced by ``generate_response``.
    """
    context_chunks = retrieve_relevant_chunks(query, top_k)
    answer, _usage_info = generate_response(query, context_chunks)

    # The usage information is not reported here; print or log ``_usage_info``
    # at this point if token accounting is needed.
    return answer

# Test the RAG-based system with a sample query.
# NOTE: rag_query() always calls generate_response(), so this issues a Groq API
# request even when the chunks are recovered from the cache.
test_query = "Am I allowed to work from home?"
result = rag_query(test_query)
print(f"Query: {test_query}")
print(f"Response: {result}")

Answer recovered from Cache.
Query: Am I allowed to work from home?
Response: According to the context, the answer is: Yes, you are allowed to work from home, but it depends on the job and the work arrangement. There are three options mentioned:

* **Fully Remote** - Work from anywhere, but show up occasionally.
* **Hybrid** - Split your time between the office and your home (or couch).


# Playground


## Full JSON response


In [34]:
import os
import json
from groq import Groq
from datetime import datetime

# Groq client authenticated with the GROQ_API_KEY environment variable
client = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)

# Request a single, non-streamed chat completion
chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "system",
            "content": "You are a helpful assistant."
        },
        {
            "role": "user",
            "content": "Give me a funny one-liner.",
        }
    ],
    model="gemma2-9b-it",  # gemma2-9b-it
    temperature=1,
    max_tokens=1024,
    top_p=1,
    stream=False,
    stop=None
)

# Create a dictionary with the desired structure
response_dict = {
    "id": chat_completion.id,
    "object": "chat.completion",
    # Report the creation time sent by the API rather than this machine's clock
    # at the moment the dictionary is built
    "created": chat_completion.created,
    "model": chat_completion.model,
    "system_fingerprint": chat_completion.system_fingerprint,  # This might be None
    "choices": [
        {
            "index": 0,
            "message": {
                "role": "assistant",
                "content": chat_completion.choices[0].message.content
            },
            "finish_reason": chat_completion.choices[0].finish_reason,
            "logprobs": None
        }
    ],
    "usage": {
        "prompt_tokens": chat_completion.usage.prompt_tokens,
        "completion_tokens": chat_completion.usage.completion_tokens,
        "total_tokens": chat_completion.usage.total_tokens,
        # Timings are rounded to milliseconds for readability
        "prompt_time": round(chat_completion.usage.prompt_time, 3),
        "completion_time": round(chat_completion.usage.completion_time, 3),
        "total_time": round(chat_completion.usage.total_time, 3)
    }
}

# Print the formatted JSON response
print(json.dumps(response_dict, indent=2))

{
  "id": "chatcmpl-8daa49bf-3a07-42fb-ab0e-c954a10d0107",
  "object": "chat.completion",
  "created": 1722858859,
  "model": "gemma2-9b-it",
  "system_fingerprint": "fp_10c08bf97d",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "I'm reading a book about anti-gravity. It's impossible to put down!  \ud83d\udcda\ud83d\ude04  \n\n"
      },
      "finish_reason": "stop",
      "logprobs": null
    }
  ],
  "usage": {
    "prompt_tokens": 28,
    "completion_tokens": 27,
    "total_tokens": 55,
    "prompt_time": 0.003,
    "completion_time": 0.054,
    "total_time": 0.057
  }
}


## rate limits

[Groq rate limits documentation](https://console.groq.com/docs/rate-limits)


In [49]:
import requests
import os
from datetime import datetime, timedelta

# Read the Groq API key from the environment.
# NOTE: if GROQ_API_KEY is not set, api_key is None and the Authorization
# header below becomes the literal string "Bearer None".
api_key = os.environ.get("GROQ_API_KEY")

# Groq chat-completions endpoint
# https://console.groq.com/docs/api-reference#chat-create
url = "https://api.groq.com/openai/v1/chat/completions"

# HTTP headers: bearer-token authentication and a JSON request body
headers = {
    "Authorization": f"Bearer {api_key}",
    "Content-Type": "application/json"
}

# Minimal request payload; the request is sent to read the rate-limit headers of its response
payload = {
    "model": "llama-3.1-8b-instant", # alternative: gemma2-9b-it
    "messages": [{"role": "user", "content": "Hello, how are you?"}]
}

def parse_time(time_str):
    """Convert a rate-limit reset duration string to seconds.

    Accepted forms:
        - a bare number, taken as seconds ("12")
        - a single-unit value ("7.66s", "500ms")
        - a compound duration ("2m59.56s", "1h2m3s")

    Args:
        time_str: Duration string taken from a rate-limit response header.

    Returns:
        The duration in seconds as a float, or 0.0 when the format is not
        recognized.
    """
    import re  # local import keeps this notebook cell self-contained

    text = str(time_str).strip()

    # A bare number is assumed to be in seconds already
    try:
        return float(text)
    except ValueError:
        pass

    seconds_per_unit = {'h': 3600.0, 'm': 60.0, 's': 1.0}
    total = 0.0
    consumed = 0
    # 'ms' is listed before 'm' so that "500ms" is not read as 500 minutes
    for part in re.finditer(r'(\d+(?:\.\d*)?|\.\d+)(ms|h|m|s)', text):
        if part.start() != consumed:
            return 0.0  # unexpected characters between components
        amount = float(part.group(1))
        unit = part.group(2)
        total += amount / 1000 if unit == 'ms' else amount * seconds_per_unit[unit]
        consumed = part.end()

    # Nothing matched, or trailing characters were left over: unrecognized format
    if consumed == 0 or consumed != len(text):
        return 0.0
    return total

def check_rate_limits():
    """Send one chat-completion request and print the rate-limit headers of the response.

    Prints the token and request limits, the remaining quota and the reset
    times, followed by the token usage of this request. For a non-200 status
    the status code and the response body are printed instead.
    """
    # A timeout keeps the cell from hanging forever when the API does not answer
    response = requests.post(url, json=payload, headers=headers, timeout=30)

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(response.text)
        return

    # Token usage limits
    token_limit = int(response.headers.get('x-ratelimit-limit-tokens', 0))
    tokens_remaining = int(response.headers.get('x-ratelimit-remaining-tokens', 0))
    token_reset = parse_time(response.headers.get('x-ratelimit-reset-tokens', '0'))

    # Daily request limits
    daily_limit = int(response.headers.get('x-ratelimit-limit-requests', 0))
    requests_remaining = int(response.headers.get('x-ratelimit-remaining-requests', 0))
    request_reset = parse_time(response.headers.get('x-ratelimit-reset-requests', '0'))

    # Calculate reset times from a single clock reading so both share one baseline
    now = datetime.now()
    token_reset_time = now + timedelta(seconds=token_reset)
    request_reset_time = now + timedelta(seconds=request_reset)

    print("Token Usage:")
    print(f"  Limit: {token_limit} tokens per minute")
    print(f"  Remaining: {tokens_remaining} tokens")
    print(f"  Resets in: {token_reset:.2f} seconds")
    print(f"  Resets at: {token_reset_time.strftime('%Y-%m-%d %H:%M:%S')}")
    print("\nDaily Request Limits:")
    print(f"  Limit: {daily_limit} requests per day")
    print(f"  Remaining: {requests_remaining} requests")
    print(f"  Resets in: {request_reset:.2f} seconds")
    print(f"  Resets at: {request_reset_time.strftime('%Y-%m-%d %H:%M:%S')}")

    # Check usage for this specific request
    usage = response.json().get('usage', {})
    print("\nThis request used:")
    print(f"  Prompt tokens: {usage.get('prompt_tokens', 0)}")
    print(f"  Completion tokens: {usage.get('completion_tokens', 0)}")
    print(f"  Total tokens: {usage.get('total_tokens', 0)}")

if __name__ == "__main__":
    check_rate_limits()


Token Usage:
  Limit: 131072 tokens per minute
  Remaining: 131063 tokens
  Resets in: 0.00 seconds
  Resets at: 2024-08-05 14:21:34

Daily Request Limits:
  Limit: 14400 requests per day
  Remaining: 14399 requests
  Resets in: 6.00 seconds
  Resets at: 2024-08-05 14:21:40

This request used:
  Prompt tokens: 16
  Completion tokens: 50
  Total tokens: 66


## rate limits + errors

- [rate limits](https://console.groq.com/docs/rate-limits)
- [errors](https://console.groq.com/docs/errors)


In [60]:
import requests
import os
from datetime import datetime, timedelta
import json

# Read the Groq API key from the environment.
# NOTE: if GROQ_API_KEY is not set, api_key is None and the Authorization
# header below becomes the literal string "Bearer None".
api_key = os.environ.get("GROQ_API_KEY")

# Groq chat-completions endpoint
url = "https://api.groq.com/openai/v1/chat/completions"

# HTTP headers: bearer-token authentication and a JSON request body
headers = {
    "Authorization": f"Bearer {api_key}",
    "Content-Type": "application/json"
}

# Example request payload; the request is sent to read the rate-limit headers of its response
payload = {
    "model": "llama-3.1-8b-instant", # alternative: gemma2-9b-it
    "messages": [{"role": "user", "content": "Describe the flower rose in detail."}]
}

def parse_time(time_str):
    """Convert a rate-limit reset duration string to seconds.

    Accepted forms:
        - a bare number, taken as seconds ("12")
        - a single-unit value ("7.66s", "500ms")
        - a compound duration ("2m59.56s", "1h2m3s")

    Args:
        time_str: Duration string taken from a rate-limit response header.

    Returns:
        The duration in seconds as a float, or 0.0 when the format is not
        recognized.
    """
    import re  # local import keeps this notebook cell self-contained

    text = str(time_str).strip()

    # A bare number is assumed to be in seconds already
    try:
        return float(text)
    except ValueError:
        pass

    seconds_per_unit = {'h': 3600.0, 'm': 60.0, 's': 1.0}
    total = 0.0
    consumed = 0
    # 'ms' is listed before 'm' so that "500ms" is not read as 500 minutes
    for part in re.finditer(r'(\d+(?:\.\d*)?|\.\d+)(ms|h|m|s)', text):
        if part.start() != consumed:
            return 0.0  # unexpected characters between components
        amount = float(part.group(1))
        unit = part.group(2)
        total += amount / 1000 if unit == 'ms' else amount * seconds_per_unit[unit]
        consumed = part.end()

    # Nothing matched, or trailing characters were left over: unrecognized format
    if consumed == 0 or consumed != len(text):
        return 0.0
    return total

def handle_error(response):
    """Print the error type and message carried by a failed API response.

    Args:
        response: Response object exposing ``json()``, ``status_code`` and
            ``text`` (here: the result of ``requests.post``).

    When the body is not valid JSON, the status code and the raw body text
    are printed instead.
    """
    try:
        body = response.json()
    except ValueError:
        # json.JSONDecodeError is a ValueError; catching the base class also
        # covers decoders that raise their own ValueError subclass
        print(f"Error: Unable to parse error response. Status code: {response.status_code}")
        print(f"Response text: {response.text}")
        return

    error_data = body.get('error', {}) if isinstance(body, dict) else {}
    if not isinstance(error_data, dict):
        # "error" given as a plain value (e.g. a string): use it as the message
        error_data = {'message': str(error_data)}

    error_message = error_data.get('message', 'Unknown error')
    error_type = error_data.get('type', 'Unknown type')
    print(f"Error Type: {error_type}")
    print(f"Error Message: {error_message}")

def check_rate_limits():
    """Send one chat-completion request and print the rate-limit headers of the response.

    HTTP errors are reported through ``handle_error``; connection problems,
    timeouts and any other exception are printed rather than raised.
    """
    try:
        # A timeout keeps the cell from hanging forever when the API does not answer;
        # a timeout is reported by the RequestException handler below
        response = requests.post(url, json=payload, headers=headers, timeout=30)
        response.raise_for_status()  # Raise an exception for 4xx and 5xx status codes

        # Token usage limits
        token_limit = int(response.headers.get('x-ratelimit-limit-tokens', 0))
        tokens_remaining = int(response.headers.get('x-ratelimit-remaining-tokens', 0))
        token_reset = parse_time(response.headers.get('x-ratelimit-reset-tokens', '0'))

        # Daily request limits
        daily_limit = int(response.headers.get('x-ratelimit-limit-requests', 0))
        requests_remaining = int(response.headers.get('x-ratelimit-remaining-requests', 0))
        request_reset = parse_time(response.headers.get('x-ratelimit-reset-requests', '0'))

        # Calculate reset times from a single clock reading so both share one baseline
        now = datetime.now()
        token_reset_time = now + timedelta(seconds=token_reset)
        request_reset_time = now + timedelta(seconds=request_reset)

        print("Token Usage:")
        print(f"  Limit (x-ratelimit-limit-tokens-allocated): {token_limit} tokens per minute")
        print(f"  Remaining (x-ratelimit-remaining-tokens): {tokens_remaining} tokens")
        print(f"  Resets in (x-ratelimit-reset-tokens): {token_reset:.2f} seconds")
        print(f"  Resets at: {token_reset_time.strftime('%Y-%m-%d %H:%M:%S')}")
        print("\nDaily Request Limits:")
        print(f"  Limit (x-ratelimit-limit-requests-allocated): {daily_limit} requests per day")
        print(f"  Remaining (x-ratelimit-remaining-requests): {requests_remaining} requests")
        print(f"  Resets in (x-ratelimit-reset-requests): {request_reset:.2f} seconds")
        print(f"  Resets at (calculated): {request_reset_time.strftime('%Y-%m-%d %H:%M:%S')}")

    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
        # Use the response attached to the exception by raise_for_status()
        handle_error(http_err.response)
    except requests.exceptions.RequestException as req_err:
        print(f"Request error occurred: {req_err}")
    except json.JSONDecodeError as json_err:
        print(f"JSON decoding error: {json_err}")
    except Exception as e:
        # Last-resort boundary for this demo cell: report instead of crashing
        print(f"An unexpected error occurred: {e}")

if __name__ == "__main__":
    check_rate_limits()


Token Usage:
  Limit (x-ratelimit-limit-tokens-allocated): 131072 tokens per minute
  Remaining (x-ratelimit-remaining-tokens): 131059 tokens
  Resets in (x-ratelimit-reset-tokens): 0.01 seconds
  Resets at: 2024-08-05 15:22:31

Daily Request Limits:
  Limit (x-ratelimit-limit-requests-allocated): 14400 requests per day
  Remaining (x-ratelimit-remaining-requests): 14399 requests
  Resets in (x-ratelimit-reset-requests): 6.00 seconds
  Resets at (calculated): 2024-08-05 15:22:36
