# Install all the required packages 

In [None]:
# Notebook setup cell: installs the third-party packages used by the cells below.
# The `%pip` lines are IPython magics, so this cell only runs inside a notebook kernel.
print("Installing required packages with correct Pinecone setup...")

# Provides the `wikipediaapi` module imported by the next cell.
%pip install -q wikipedia-api
# LangChain core plus the OpenAI integration imported by the next cell.
%pip install -q langchain
%pip install -q langchain-openai
# NOTE(review): not imported directly by the visible cells; presumably supporting packages.
%pip install -q langchain-community
%pip install -q langchain-text-splitters
%pip install -q langchainhub
# Pinecone client and its LangChain vector-store integration.
%pip install -q pinecone 
%pip install -q langchain-pinecone
# NOTE(review): not imported directly by the visible cells; presumably supporting packages.
%pip install -q python-dotenv
%pip install -q tiktoken

print("All packages installed successfully!")

# Import all the required libraries

In [None]:
print("Importing libraries...")

import re
import random
from getpass import getpass
from typing import List, Dict, Optional

# Wikipedia
import wikipediaapi

# LangChain imports
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI
from langchain_pinecone import PineconeVectorStore
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Pinecone - CORRECT import
from pinecone import Pinecone, ServerlessSpec

print("All libraries imported successfully!")

# Set environment variables

In [None]:
print("Setting up environment variables...")

import os
from getpass import getpass


def _ensure_env(name: str, secret: bool = True) -> None:
    """Make sure environment variable *name* holds a non-empty value.

    An existing value is left untouched, so credentials exported before the
    notebook started are never overwritten. When the variable is missing the
    user is prompted for it: with ``getpass`` (no echo) for secrets, with
    ``input`` otherwise.

    Raises:
        ValueError: if the user submits an empty value.
    """
    if os.environ.get(name):
        return
    reader = getpass if secret else input
    value = reader(f"Enter {name}: ").strip()
    if not value:
        raise ValueError(f"{name} must not be empty")
    os.environ[name] = value


# Credentials come from the existing environment or an interactive prompt, so
# no secret (or placeholder) is ever written into the notebook source.
_ensure_env("PINECONE_API_KEY")
_ensure_env("AZURE_OPENAI_API_KEY")
_ensure_env("AZURE_OPENAI_ENDPOINT", secret=False)  # the endpoint URL is not a secret

# API version read via os.getenv("OPENAI_API_VERSION") by the Azure OpenAI clients below.
os.environ["OPENAI_API_VERSION"] = "2024-02-01"

print("✅ Environment variables set successfully!")

# Task 1: Fetch and Parse Webpages

In [None]:
# Handles fetching and cleaning Wikipedia content

class WikipediaProcessor:
    """Fetch Wikipedia pages and reduce their text to clean, single-spaced prose."""

    # A section header is a whole line wrapped in '=' runs (e.g. "== History ==").
    # Anchoring to line boundaries keeps '=' characters inside ordinary sentences
    # or equations from being mistaken for header markup.
    _HEADER_RE = re.compile(r'^[ \t]*=+[ \t]*(.*?)[ \t]*=+[ \t]*$', re.MULTILINE)

    def __init__(self):
        self.wiki_wiki = wikipediaapi.Wikipedia(
            user_agent='RAG-Pipeline/1.0',
            language='en',
            extract_format=wikipediaapi.ExtractFormat.WIKI
        )

    def clean_text(self, text: str) -> str:
        """Strip reference markers, bracketed asides and header markup from *text*.

        Args:
            text: Raw page text; ``None`` or an empty string is accepted.

        Returns:
            The cleaned text with every whitespace run collapsed to one space,
            or ``""`` when *text* is empty.
        """
        if not text:
            return ""

        # Remove Wikipedia references (e.g., [1], [citation needed])
        text = re.sub(r'\[\d+\]', '', text)
        text = re.sub(r'\[citation needed\]', '', text)
        text = re.sub(r'\[who\]', '', text)
        text = re.sub(r'\[when\]', '', text)

        # Remove text within parentheses and square brackets
        text = re.sub(r'\([^)]*\)', '', text)
        text = re.sub(r'\[[^\]]*\]', '', text)

        # Unwrap section headers, keeping only the title text. This must run
        # before whitespace is collapsed because the pattern is line-based.
        text = self._HEADER_RE.sub(r'\1', text)

        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text)

        return text.strip()

    def fetch_pages(self, page_titles: List[str]) -> Dict:
        """Fetch several Wikipedia pages and return their cleaned content.

        Args:
            page_titles: Page titles to look up.

        Returns:
            A dict keyed by the requested title. Each value is a dict with the
            keys ``content``, ``url``, ``title`` and ``length``, or ``None``
            when the page does not exist or fetching it raised an error.
        """
        results = {}

        for title in page_titles:
            try:
                print(f"Fetching: {title}")
                page = self.wiki_wiki.page(title)
                if page.exists():
                    cleaned_content = self.clean_text(page.text)
                    results[title] = {
                        'content': cleaned_content,
                        'url': page.fullurl,
                        'title': page.title,
                        'length': len(cleaned_content)
                    }
                    print(f"Successfully fetched: {title} ({len(cleaned_content)} chars)")
                else:
                    print(f"Page not found: {title}")
                    results[title] = None

            except Exception as e:
                # Best effort: one failing page must not abort the whole batch.
                print(f"Error fetching {title}: {e}")
                results[title] = None

        return results


# Build the processor and fetch the two pages used by the rest of the notebook.
# `wikipedia_data` maps each requested title to a dict with 'content', 'url',
# 'title' and 'length', or to None when the page could not be fetched.
wiki_processor = WikipediaProcessor()
page_titles = ['Artificial_intelligence', 'Machine_learning']


print("Starting Wikipedia content fetch...")
wikipedia_data = wiki_processor.fetch_pages(page_titles)

# Display summary
print("\n" + "="*60)
print("WIKIPEDIA CONTENT SUMMARY")
print("="*60)
for title, data in wikipedia_data.items():
    # Entries that failed to fetch are None and are skipped silently here.
    if data:
        print(f" {data['title']}:")
        print(f"  URL: {data['url']}")
        print(f"  Length: {data['length']:,} characters")
        # Only the first 150 characters are shown as a preview.
        print(f"  Preview: {data['content'][:150]}...")
        print()

# Task 2: Chunking with Randomized Sizes

In [None]:
from typing import List

# Custom text splitter with randomized chunk sizes.
class RandomizedTextSplitter:
    """Split text into overlapping character chunks whose sizes vary randomly.

    Args:
        chunk_size_range: Inclusive ``(min, max)`` chunk size in characters.
        chunk_overlap: Number of characters shared by consecutive chunks.
            Must be smaller than the minimum chunk size so that every chunk
            advances through the text.
        length_function: Measures the text for the "short enough to keep whole"
            check only; chunk boundaries are always character offsets.
        seed: Optional seed for reproducible chunk sizes. When omitted, the
            module-level ``random`` generator is used.

    Raises:
        ValueError: if the size range is invalid or the overlap is negative
            or not smaller than the minimum chunk size.
    """

    def __init__(self, chunk_size_range: tuple = (400, 600), chunk_overlap: int = 50,
                 length_function=len, seed: Optional[int] = None):
        min_size, max_size = chunk_size_range
        if min_size <= 0 or max_size < min_size:
            raise ValueError(
                f"chunk_size_range must be (min, max) with 0 < min <= max, got {chunk_size_range!r}"
            )
        # An overlap >= the chunk size would make the cursor stall or move
        # backwards, so the split loop would never terminate.
        if not 0 <= chunk_overlap < min_size:
            raise ValueError(
                f"chunk_overlap must be in [0, {min_size}), got {chunk_overlap!r}"
            )

        self.chunk_size_range = chunk_size_range
        self.chunk_overlap = chunk_overlap
        self.length_function = length_function
        self._rng = random.Random(seed) if seed is not None else random

    def split_text(self, text: str) -> List[str]:
        """Split *text* into chunks with randomized sizes.

        Returns:
            ``[text]`` unchanged when ``length_function(text)`` does not exceed
            the minimum chunk size; otherwise a list of chunks in which every
            chunk after the first starts ``chunk_overlap`` characters before
            the end of the previous one. Together the chunks cover all of *text*.
        """
        if self.length_function(text) <= self.chunk_size_range[0]:
            return [text]

        # Slicing works on character offsets, so the loop bound must be the
        # character length even when length_function counts something else.
        total = len(text)
        chunks = []
        start = 0

        while start < total:
            # Randomly select chunk size within range
            chunk_size = self._rng.randint(*self.chunk_size_range)
            end = min(start + chunk_size, total)
            chunks.append(text[start:end])

            if end >= total:
                break
            start = end - self.chunk_overlap

        return chunks

# Function to create document chunks from Wikipedia data
def create_chunks(wikipedia_data: Dict, use_randomized: bool = True) -> List[Document]:
    """Turn fetched Wikipedia pages into chunked ``Document`` objects.

    Args:
        wikipedia_data: Mapping of page title to a dict holding at least
            ``content``, ``url`` and ``title``; ``None`` entries and entries
            with empty content are skipped.
        use_randomized: Pick ``RandomizedTextSplitter`` (400-600 characters)
            when true, otherwise ``RecursiveCharacterTextSplitter`` (500
            characters). Both use a 50-character overlap.

    Returns:
        All chunks of all pages, in page order, each carrying ``source``,
        ``title``, ``chunk_id``, ``source_type`` and ``chunk_size`` metadata.
    """
    if use_randomized:
        # Use custom randomized splitter
        splitter = RandomizedTextSplitter(chunk_size_range=(400, 600), chunk_overlap=50)
    else:
        # Use standard recursive splitter
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50,
            length_function=len,
            separators=["\n\n", "\n", ". ", " ", ""],
        )

    documents = []
    for title, page in wikipedia_data.items():
        if not page or not page['content']:
            continue

        pieces = splitter.split_text(page['content'])

        # chunk_id restarts at 0 for every page.
        documents.extend(
            Document(
                page_content=piece,
                metadata={
                    "source": page['url'],
                    "title": page['title'],
                    "chunk_id": index,
                    "source_type": "wikipedia",
                    "chunk_size": len(piece),
                },
            )
            for index, piece in enumerate(pieces)
        )

        print(f"Created {len(pieces)} chunks from {title}")
        if use_randomized:
            sizes = [len(piece) for piece in pieces]
            print(f"   Chunk size range: {min(sizes)}-{max(sizes)} characters")

    print(f"\nTotal chunks created: {len(documents)}")
    return documents

# Create chunks with randomized sizes
print("Creating chunks with randomized sizes (400-600 characters)...")
# use_randomized must be True here: False selects the fixed 500-character
# recursive splitter, which contradicts the message printed above.
all_chunks = create_chunks(wikipedia_data, use_randomized=True)

# Task 3: Vector Database Setup

In [None]:
# Manages Pinecone vector store operations

class VectorStoreManager:
    """Own the embeddings client, the Pinecone client and the resulting vector store.

    Every ``initialize_*`` / ``store_*`` method reports failure by printing the
    error and returning ``False`` instead of raising.
    """

    def __init__(self):
        self.pc = None            # Pinecone client, set by initialize_pinecone()
        self.embeddings = None    # AzureOpenAIEmbeddings, set by initialize_embeddings()
        self.vector_store = None  # PineconeVectorStore, set by store_documents()

    def initialize_embeddings(self):
        """Create the Azure OpenAI embeddings client.

        Returns:
            ``True`` on success, ``False`` if construction raised.
        """
        try:
            self.embeddings = AzureOpenAIEmbeddings(
                azure_deployment="text-embedding-3-small",
                openai_api_version=os.getenv("OPENAI_API_VERSION"),
                chunk_size=1000
            )
            print("Azure OpenAI Embeddings initialized successfully")
            return True
        except Exception as e:
            print(f"Error initializing embeddings: {e}")
            return False

    def initialize_pinecone(self, index_name: str = "rag-pipeline", ready_timeout: float = 60.0):
        """Connect to Pinecone and create *index_name* if it does not exist yet.

        Args:
            index_name: Name of the serverless index to use or create.
            ready_timeout: Maximum number of seconds to wait for a newly
                created index to report that it is ready.

        Returns:
            ``True`` when the index exists and is usable, ``False`` on any
            error, including the readiness timeout.
        """
        import time

        try:
            self.pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
            print("Pinecone client initialized successfully")

            if index_name in self.pc.list_indexes().names():
                print(f"Using existing index: {index_name}")
                return True

            print(f"Creating new index: {index_name}")
            self.pc.create_index(
                name=index_name,
                dimension=1536,  # Must match the embedding model's output size
                metric="cosine",
                spec=ServerlessSpec(
                    cloud="aws",
                    region="us-east-1"
                )
            )

            # Poll until the index reports ready rather than sleeping a fixed
            # time: creation may take longer (upserts would fail) or shorter.
            deadline = time.monotonic() + ready_timeout
            while not self.pc.describe_index(index_name).status["ready"]:
                if time.monotonic() >= deadline:
                    raise TimeoutError(
                        f"Index '{index_name}' was not ready after {ready_timeout} seconds"
                    )
                time.sleep(1)

            print(f"Created new index: {index_name}")
            return True
        except Exception as e:
            print(f"Error initializing Pinecone: {e}")
            return False

    def store_documents(self, documents: List[Document], index_name: str = "rag-pipeline"):
        """Embed *documents* and upsert them into the Pinecone index.

        Args:
            documents: Chunks to store; an empty list is rejected.
            index_name: Target index, which must already exist.

        Returns:
            ``True`` when the documents were stored and ``self.vector_store``
            was set, ``False`` otherwise.
        """
        try:
            if not documents:
                print("No documents to store")
                return False

            if self.embeddings is None:
                print("Error storing documents: embeddings are not initialized")
                return False

            print(f"Storing {len(documents)} documents in Pinecone...")

            self.vector_store = PineconeVectorStore.from_documents(
                documents=documents,
                embedding=self.embeddings,
                index_name=index_name
            )

            print("Documents successfully stored in Pinecone!")
            return True
        except Exception as e:
            print(f"Error storing documents: {e}")
            return False

# Initialize vector store manager
vector_manager = VectorStoreManager()

# Initialize embeddings
print("Initializing Azure OpenAI Embeddings...")
vector_manager.initialize_embeddings()

# Initialize Pinecone
print("\nInitializing Pinecone...")
vector_manager.initialize_pinecone()

# Store documents in Pinecone
print("\nStoring documents in Pinecone...")
vector_manager.store_documents(all_chunks)

# The manager reports failures by printing and returning False. Stop here with
# a clear message instead of hitting an AttributeError on a missing store.
if vector_manager.vector_store is None:
    raise RuntimeError(
        "Vector store was not created; fix the errors printed above and re-run this cell."
    )

# Create retriever for later use (top 3 chunks by similarity)
retriever = vector_manager.vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3}
)

# Task 4: Similarity Search

In [None]:
# Function to search the vector database with optional source filtering

def search_db(query: str, k: int = 3, filter_source: Optional[str] = None) -> List[Dict]:
    """Run a similarity search against the vector store.

    Args:
        query: Natural-language query text.
        k: Maximum number of chunks to return.
        filter_source: When given, only chunks whose ``source`` metadata equals
            this URL are considered.

    Returns:
        Up to *k* dicts with the keys ``rank`` (1-based), ``content``,
        ``source_url``, ``title`` and ``chunk_size``. An empty list is returned
        when the search raises; the error is printed.
    """
    try:
        # Filter inside the vector store. Filtering the top-k hits afterwards
        # would return fewer than k results (often none) whenever the best
        # overall matches come from a different page.
        metadata_filter = {"source": {"$eq": filter_source}} if filter_source else None
        results = vector_manager.vector_store.similarity_search(
            query, k=k, filter=metadata_filter
        )

        # Format results
        return [
            {
                "rank": rank,
                "content": doc.page_content,
                "source_url": doc.metadata.get("source"),
                "title": doc.metadata.get("title"),
                "chunk_size": doc.metadata.get("chunk_size", "N/A"),
            }
            for rank, doc in enumerate(results, start=1)
        ]

    except Exception as e:
        print(f"Error during search: {e}")
        return []

# Smoke-test search_db with and without a source restriction.
print("Testing similarity search...")

# Get source URLs for filtering.
# NOTE(review): raises TypeError if either page failed to fetch (its entry is None).
ai_url = wikipedia_data['Artificial_intelligence']['url']
ml_url = wikipedia_data['Machine_learning']['url']

test_queries = [
    "Who is considered the father of AI?",
    "What is supervised learning?",
    "What are neural networks?"
]

for query in test_queries:
    print(f"\n{'='*50}")
    print(f"Query: '{query}'")
    print(f"{'='*50}")
    
    # Unrestricted search: the only variant that also prints the source URL.
    results = search_db(query, k=2)
    print(f"Unrestricted results ({len(results)}):")
    for result in results:
        print(f"  📍 {result['rank']}. {result['content'][:100]}...")
        print(f"     🔗 Source: {result['source_url']}")
    
    # AI-only search: restricted to chunks whose source is the AI page URL.
    ai_results = search_db(query, k=2, filter_source=ai_url)
    print(f"\nAI-only results ({len(ai_results)}):")
    for result in ai_results:
        print(f"  📍 {result['rank']}. {result['content'][:100]}...")
    
    # ML-only search: restricted to chunks whose source is the ML page URL.
    ml_results = search_db(query, k=2, filter_source=ml_url)
    print(f"\nML-only results ({len(ml_results)}):")
    for result in ml_results:
        print(f"  📍 {result['rank']}. {result['content'][:100]}...")

# Task 5: LLM Module with RetrievalQA

In [None]:
# RAG QA System with source citations

class RAGQASystem:
    """Question answering over the vector store with source citations.

    The LLM is created on construction; the RetrievalQA chain is built by
    ``create_qa_chain()`` or lazily on the first ``ask_question()`` call.
    """

    def __init__(self):
        self.llm = None
        self.qa_chain = None
        self.initialize_llm()

    def initialize_llm(self):
        """Create the Azure Chat OpenAI client; on failure print the error and leave ``self.llm`` as ``None``."""
        try:
            self.llm = AzureChatOpenAI(
                azure_deployment="gpt-35-turbo",  # Replace with your deployment name
                api_version=os.getenv("OPENAI_API_VERSION"),
                temperature=0.0,  # Low temperature for factual responses
                max_tokens=500
            )
            print("Azure Chat OpenAI initialized successfully")
        except Exception as e:
            print(f"Error initializing LLM: {e}")

    def create_qa_chain(self):
        """Build the "stuff" RetrievalQA chain over the module-level ``retriever``.

        On failure the error is printed and ``self.qa_chain`` is left unchanged.
        """

        prompt_template = """You are a helpful AI assistant. Use ONLY the following retrieved context to answer the question. 
        If the answer is not in the context, say "I cannot answer this question based on the provided information."
        If the answer comes from multiple sources, cite all relevant URLs.

        Context:
        {context}

        Question: {question}

        Answer:"""

        PROMPT = PromptTemplate(
            template=prompt_template,
            input_variables=["context", "question"]
        )

        # By default only page_content is stuffed into {context}, so the model
        # never sees the URLs it is asked to cite. Render each retrieved chunk
        # together with its "source" metadata instead.
        DOCUMENT_PROMPT = PromptTemplate(
            template="Source: {source}\nContent: {page_content}",
            input_variables=["page_content", "source"]
        )

        try:
            self.qa_chain = RetrievalQA.from_chain_type(
                llm=self.llm,
                chain_type="stuff",
                retriever=retriever,
                chain_type_kwargs={"prompt": PROMPT, "document_prompt": DOCUMENT_PROMPT},
                return_source_documents=True
            )
            print("RetrievalQA chain created successfully")
        except Exception as e:
            print(f"Error creating QA chain: {e}")

    def ask_question(self, query: str) -> Dict:
        """Answer *query* from the retrieved context.

        Returns:
            A dict with ``question``, ``answer``, ``sources`` (unique source
            URLs in retrieval order) and ``source_documents``. If anything
            raises, ``answer`` holds ``"Error: <message>"`` and both lists are
            empty.
        """
        if not self.qa_chain:
            self.create_qa_chain()

        try:
            result = self.qa_chain.invoke({"query": query})

            # De-duplicate while keeping retrieval order; a set would make the
            # order of the reported sources vary between runs.
            sources = list(dict.fromkeys(
                doc.metadata.get("source", "Unknown")
                for doc in result["source_documents"]
            ))

            return {
                "question": query,
                "answer": result["result"],
                "sources": sources,
                "source_documents": result["source_documents"]
            }
        except Exception as e:
            return {
                "question": query,
                "answer": f"Error: {str(e)}",
                "sources": [],
                "source_documents": []
            }

# Initialize QA system. The constructor already creates the LLM client.
print("Initializing RAG QA System...")
qa_system = RAGQASystem()
# Build the chain up front; ask_question() would otherwise build it on first use.
qa_system.create_qa_chain()

# Test the QA system
print("\n Testing QA System...")
test_questions = [
    "Who is considered the father of AI?",
    "What is supervised learning?",
    "What are the main differences between AI and machine learning?",
    "What is deep learning?"
]

for question in test_questions:
    print(f"\n{'='*70}")
    print(f"Question: {question}")
    print(f"{'='*70}")
    
    # ask_question never raises; failures come back in result['answer'] as "Error: ...".
    result = qa_system.ask_question(question)
    
    print(f"Answer: {result['answer']}")
    print(f"Sources:")
    for source in result['sources']:
        print(f"   - {source}")
    
    # Show the first 100 characters of every retrieved chunk.
    print(f"\n Retrieved chunks used:")
    for i, doc in enumerate(result['source_documents']):
        print(f"   {i+1}. {doc.page_content[:100]}...")

# Task 6: Extended System with Page-Specific Search

In [None]:
# Extended RAG System with Page-Specific Search
class ExtendedRAGSystem(RAGQASystem):
    """RAG QA system that can also restrict its answers to a single Wikipedia page."""

    def __init__(self, wikipedia_data: Dict):
        """Remember the fetched pages and the URLs of the AI and ML pages."""
        super().__init__()
        self.wikipedia_data = wikipedia_data
        self.ai_url = wikipedia_data['Artificial_intelligence']['url']
        self.ml_url = wikipedia_data['Machine_learning']['url']

    def ask_ai(self, query: str) -> Dict:
        """Answer *query* using only chunks from the AI page."""
        return self._ask_page_specific(query, self.ai_url, "AI")

    def ask_ml(self, query: str) -> Dict:
        """Answer *query* using only chunks from the ML page."""
        return self._ask_page_specific(query, self.ml_url, "ML")

    def _ask_page_specific(self, query: str, source_url: str, page_name: str) -> Dict:
        """Answer *query* from chunks whose source is *source_url*.

        Returns a dict shaped like ``ask_question``'s result, except that
        ``source_documents`` holds the formatted dicts produced by
        ``search_db`` rather than Document objects. Any exception is turned
        into an ``"Error: ..."`` answer.
        """
        try:
            hits = search_db(query, k=3, filter_source=source_url)

            # Nothing retrieved for this page: answer without calling the LLM.
            if not hits:
                return {
                    "question": query,
                    "answer": f"I cannot answer this question based on the {page_name} page information alone.",
                    "sources": [],
                    "source_documents": []
                }

            # One "Source/Content" section per hit, separated by blank lines.
            sections = []
            for hit in hits:
                sections.append(f"Source: {hit['source_url']}\nContent: {hit['content']}")
            context = "\n\n".join(sections)

            prompt = f"""You are a helpful AI assistant. Use ONLY the following context from the {page_name} Wikipedia page to answer the question. 
            If the answer is not in this context, say "I cannot answer this question based on the {page_name} page information alone."

            Context from {page_name} page:
            {context}

            Question: {query}

            Answer:"""

            reply = self.llm.invoke(prompt)
            # Fall back to the string form for responses without a content attribute.
            if hasattr(reply, 'content'):
                answer = reply.content
            else:
                answer = str(reply)

            return {
                "question": query,
                "answer": answer,
                "sources": [source_url],
                "source_documents": hits
            }

        except Exception as exc:
            return {
                "question": query,
                "answer": f"Error: {str(exc)}",
                "sources": [],
                "source_documents": []
            }

    def compare_retrieval_methods(self, query: str):
        """Print the unrestricted, AI-only and ML-only answers for *query*."""
        banner = '=' * 80
        print(f"\n{banner}")
        print(f"COMPARISON FOR: '{query}'")
        print(banner)

        # (heading, answering method, wording used after the source count)
        runs = (
            ("\n1. UNRESTRICTED RETRIEVAL (all pages):", self.ask_question, "sources"),
            ("\n2. AI-ONLY RETRIEVAL:", self.ask_ai, "source(s)"),
            ("\n3. ML-ONLY RETRIEVAL:", self.ask_ml, "source(s)"),
        )
        for heading, ask, noun in runs:
            print(heading)
            outcome = ask(query)
            print(f"   Answer: {outcome['answer']}")
            print(f"   Sources: {len(outcome['sources'])} {noun}")

# Initialize extended system.
# NOTE(review): the constructor reads the 'url' of both pages from wikipedia_data,
# so it fails if either page was not fetched.
print("Initializing Extended RAG System...")
extended_system = ExtendedRAGSystem(wikipedia_data)

# Test the extended system with deliberately generic queries, so the
# unrestricted, AI-only and ML-only answers can be compared side by side.
print("\n Testing Page-Specific Search...")
comparison_queries = [
    "What is the definition and history?",
    "What are the main types or categories?",
    "What are the applications and future directions?"
]

for query in comparison_queries:
    extended_system.compare_retrieval_methods(query)

# Specific test cases
print("\n" + "="*80)
print("SPECIFIC TEST CASES")
print("="*80)

# Each entry is (query, label printed before the comparison).
specific_tests = [
    ("What is the Turing Test?", "Tests AI page knowledge"),
    ("What is reinforcement learning?", "Tests ML page knowledge"),
    ("What are the ethical concerns?", "Tests cross-page knowledge")
]

for query, description in specific_tests:
    print(f"\n {description}: '{query}'")
    extended_system.compare_retrieval_methods(query)