# Basic RAG

This is the basic setup required for retrieval augmented generation (RAG).  

The `LocalEmbeddingGenerator` class is used to generate embeddings for the input text. This class also holds the trained `Word2Vec` model of the input text. 

The `example_generate_embeddings` function is used to demonstrate the basic usage of the `LocalEmbeddingGenerator` class.



In [1]:
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from gensim.models import Word2Vec
import re

class LocalEmbeddingGenerator:
    def __init__(self, embedding_dim=384):  # Using 384 dimensions as it's common for many embedding models
        self.embedding_dim = embedding_dim
        self.word2vec_model = None
        self.lemmatizer = WordNetLemmatizer()
        
        # Download required NLTK data
        import nltk
        nltk.download('punkt')
        nltk.download('stopwords')
        nltk.download('wordnet')
        
        # Initialize stop words
        self.stop_words = set(stopwords.words('english'))
        
    def preprocess_text(self, text):
        """Preprocess the input text."""
        # Convert to lowercase
        text = text.lower()
        
        # Remove special characters and numbers
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        
        # Tokenize
        tokens = word_tokenize(text)
        
        # Remove stop words and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens 
                 if token not in self.stop_words and token not in string.punctuation]
        
        return tokens

    def train_word2vec(self, texts):
        """Train a Word2Vec model on the given texts."""
        # Preprocess all texts
        print(texts)
        processed_texts = [self.preprocess_text(text) for text in texts]
        
        # Train Word2Vec model
        self.word2vec_model = Word2Vec(sentences=processed_texts, 
                                     vector_size=self.embedding_dim,
                                     window=5,
                                     min_count=1,
                                     workers=4)

    def generate_embedding(self, text):
        """Generate embedding for the input text."""
        if self.word2vec_model is None:
            raise ValueError("Word2Vec model not trained. Please train the model first.")
        
        # Preprocess the input text
        tokens = self.preprocess_text(text)
        
        if not tokens:
            return np.zeros(self.embedding_dim)
        
        # Get embeddings for each token
        token_embeddings = []
        for token in tokens:
            try:
                token_embedding = self.word2vec_model.wv[token]
                token_embeddings.append(token_embedding)
            except KeyError:
                continue
        
        if not token_embeddings:
            return np.zeros(self.embedding_dim)
        
        # Average the token embeddings
        final_embedding = np.mean(token_embeddings, axis=0)
        
        return final_embedding

    def format_embedding(self, embedding):
        """Format the embedding vector as a comma-separated list in square brackets."""
        return f"[{','.join(map(str, embedding.tolist()))}]"

# Example usage:
def example_generate_embeddings():
    import nltk
    nltk.download('omw-1.4')
    nltk.download('punkt_tab')
    # Sample texts for training
    training_texts = [
        "This is a sample text for training.",
        "Another example of training data.",
        "More text to train the embedding model."
    ]
    
    # Initialize the embedding generator
    generator = LocalEmbeddingGenerator(embedding_dim=384)
    
    # Train the model
    generator.train_word2vec(training_texts)
    
    # Generate embedding for new text
    test_text = "This is a test sentence for embedding generation."
    embedding = generator.generate_embedding(test_text)
    
    # Format and print the embedding
    formatted_embedding = generator.format_embedding(embedding)
    print(f"Generated embedding for: '{test_text}'")
    print(f"Embedding: {formatted_embedding}")
    print(generator.word2vec_model.wv.most_similar("training"))

example_generate_embeddings()

[nltk_data] Downloading package omw-1.4 to /home/craig/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/craig/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /home/craig/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/craig/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/craig/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['This is a sample text for training.', 'Another example of training data.', 'More text to train the embedding model.']
Generated embedding for: 'This is a test sentence for embedding generation.'
Embedding: [0.0017156526446342468,0.001174519187770784,-0.0018371008336544037,-8.415989577770233e-05,0.00021731418382842094,0.0014967076713219285,-0.0004473058506846428,-0.0007308672065846622,0.0004553204926196486,0.00022061758500058204,0.00031063208007253706,-0.0006860109861008823,-0.0015587980160489678,0.0019070269772782922,0.0019758790731430054,0.002160509815439582,-0.002239283174276352,0.0006865691393613815,-0.0009270735899917781,0.002505313605070114,0.0007561895181424916,0.0012086232891306281,0.0006212539155967534,0.001720957807265222,-0.001495648524723947,0.002055836608633399,-0.000627844303380698,-0.0011879910016432405,-0.0005367162520997226,0.002534780651330948,-0.0017855704063549638,-0.000570760399568826,0.0018231769790872931,-1.451807747798739e-05,-0.0016393143450841308,-0.001664980

The `RAGSystem` class is used to generate the RAG model. This class uses the `LocalEmbeddingGenerator` class to generate embeddings for the input text. The `RAGSystem` class implements the following:

* `add_documents` - Add documents to the RAG model and generate embeddings for the documents.
* `find_similar_documents` - Find similar documents to the query.
* `save_to_disk` - Save the RAG model to disk.
* `load_from_disk` - Load the RAG model from disk.

We'll build on the `RAGSystem` class in the next notebook to implement the full RAG model.

The `example_rag_system` function is used to demonstrate the basic usage of the `RAGSystem` class.

The `example_rag_system` function demonstrates the following:

1. Create a `RAGSystem` object.
2. Create documents and associated metadata.
3. Add the documents to the `RAGSystem` object.
4. Query the `RAGSystem` object to find similar documents.
5. Output the similar documents.

In [2]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Dict, Tuple
import json

class RAGSystem:
    def __init__(self, embedding_dim=384):
        self.embedding_generator = LocalEmbeddingGenerator(embedding_dim)
        self.document_store: List[Dict] = []
        self.document_embeddings: List[np.ndarray] = []

    def add_documents(self, documents: List[str], metadata: List[Dict] = None):
        """
        Add documents to the RAG system and generate their embeddings.
        
        Args:
            documents: List of document texts
            metadata: Optional list of metadata dictionaries for each document
        """
        # Train the embedding model on all documents
        self.embedding_generator.train_word2vec(documents)

        # Generate embeddings and store documents
        for i, doc in enumerate(documents):
            embedding = self.embedding_generator.generate_embedding(doc)
            
            doc_entry = {
                'id': len(self.document_store),
                'text': doc,
                'metadata': metadata[i] if metadata else {}
            }
            
            self.document_store.append(doc_entry)
            self.document_embeddings.append(embedding)

    def find_similar_documents(self, query: str, k: int = 3) -> List[Tuple[Dict, float]]:
        """
        Find the k most similar documents to the query.
        
        Args:
            query: The search query
            k: Number of documents to retrieve
            
        Returns:
            List of tuples containing (document, similarity_score)
        """
        # Generate embedding for the query
        query_embedding = self.embedding_generator.generate_embedding(query)
        
        # Calculate similarities
        similarities = cosine_similarity(
            [query_embedding],
            self.document_embeddings
        )[0]
        
        # Get top k similar documents
        top_k_indices = np.argsort(similarities)[-k:][::-1]
        
        results = []
        for idx in top_k_indices:
            results.append((self.document_store[idx], similarities[idx]))
            
        return results

    def save_to_disk(self, filepath: str):
        """Save the RAG system to disk."""
        data = {
            'documents': self.document_store,
            'embeddings': [emb.tolist() for emb in self.document_embeddings]
        }
        
        with open(filepath, 'w') as f:
            json.dump(data, f)

    def load_from_disk(self, filepath: str):
        """Load the RAG system from disk."""
        with open(filepath, 'r') as f:
            data = json.load(f)
            
        self.document_store = data['documents']
        self.document_embeddings = [np.array(emb) for emb in data['embeddings']]

    
# Example query
def example_basic_rag_model():
    # Initialize RAG system
    rag = RAGSystem()
    
    # Sample documents
    documents = [
        "Python is a high-level programming language known for its simplicity.",
        "Machine learning is a subset of artificial intelligence.",
        "Natural language processing deals with interaction between computers and human language.",
        "Deep learning is a type of machine learning based on artificial neural networks.",
    ]
    
    # Add metadata for each document
    metadata = [
        {'source': 'programming_guide', 'category': 'programming'},
        {'source': 'ai_textbook', 'category': 'ai'},
        {'source': 'nlp_paper', 'category': 'nlp'},
        {'source': 'dl_tutorial', 'category': 'deep_learning'}
    ]
    
    # Add documents to RAG system
    rag.add_documents(documents, metadata)
    
    # Example queries and retrieval
    queries = [
        "What is Python programming?",
        "Tell me about artificial intelligence",
        "How does NLP work?"
    ]
    
    for query in queries:
        print(f"\nQuery: {query}")
        similar_docs = rag.find_similar_documents(query, k=2)
        
        print("Retrieved documents:")
        for doc, similarity in similar_docs:
            print(f"- Document: {doc['text']}")
            print(f"  Metadata: {doc['metadata']}")
            print(f"  Similarity score: {similarity:.4f}")

example_basic_rag_model()

['Python is a high-level programming language known for its simplicity.', 'Machine learning is a subset of artificial intelligence.', 'Natural language processing deals with interaction between computers and human language.', 'Deep learning is a type of machine learning based on artificial neural networks.']

Query: What is Python programming?
Retrieved documents:
- Document: Python is a high-level programming language known for its simplicity.
  Metadata: {'source': 'programming_guide', 'category': 'programming'}
  Similarity score: 0.5463
- Document: Machine learning is a subset of artificial intelligence.
  Metadata: {'source': 'ai_textbook', 'category': 'ai'}
  Similarity score: -0.0371

Query: Tell me about artificial intelligence
Retrieved documents:
- Document: Machine learning is a subset of artificial intelligence.
  Metadata: {'source': 'ai_textbook', 'category': 'ai'}
  Similarity score: 0.6557
- Document: Deep learning is a type of machine learning based on artificial neura

[nltk_data] Downloading package punkt to /home/craig/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/craig/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/craig/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
