In [None]:
!pip uninstall -y keras
!pip uninstall -y keras-nightly
!pip uninstall -y keras-preprocessing
!pip uninstall -y keras-vis

In [None]:
!pip install tf-keras

In [4]:
import os
import json
import time
import asyncio
import logging
from typing import List, Dict, Any
from dataclasses import dataclass
import warnings
warnings.filterwarnings('ignore')
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import numpy as np
import pandas as pd
import torch
import faiss
import chromadb
from sentence_transformers import SentenceTransformer
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import requests
import aiohttp
from bs4 import BeautifulSoup
from urllib.parse import quote_plus
import nltk
from nltk.tokenize import sent_tokenize
import re
from transformers import AutoTokenizer
import ollama
from datetime import datetime
import pickle
import hashlib
from tqdm import tqdm
import subprocess
import shutil
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
@dataclass
class Config:
    embedding_model_name: str = "all-MiniLM-L6-v2"
    llm_model_name: str = "deepseek-r1:14b"
    max_tokens: int = 4096
    temperature: float = 0.7
    chunk_size: int = 256
    chunk_overlap: int = 50
    max_chunks_per_doc: int = 20
    top_k_retrieval: int = 5
    similarity_threshold: float = 0.15 
    max_pages_per_query: int = 5  
    request_timeout: int = 10
    selenium_timeout: int = 15
    max_text_length: int = 5000
    cache_dir: str = "./scrape_cache"
    cache_expiry_hours: int = 48  
    vector_db_type: str = "chroma"
    persist_directory: str = "./chroma_db"
    device: str = "mps" if torch.backends.mps.is_available() else "cpu"
    batch_size: int = 64
config = Config()
class WebScraper:    
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36'
        })
        self.driver = None
        os.makedirs(config.cache_dir, exist_ok=True)
    def setup_selenium(self):
        try:
            chrome_options = Options()
            chrome_options.add_argument("--headless")
            chrome_options.add_argument("--no-sandbox")
            chrome_options.add_argument("--disable-dev-shm-usage")
            chrome_options.add_argument("--disable-gpu")
            service = Service(ChromeDriverManager().install())
            self.driver = webdriver.Chrome(service=service, options=chrome_options)
            logger.info("Selenium WebDriver initialized successfully.")
        except Exception as e:
            logger.error(f"Failed to initialize Selenium: {e}.")
    def scrape_with_requests(self, url: str) -> Dict[str, Any]:
        try:
            response = self.session.get(url, timeout=config.request_timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            for script in soup(["script", "style"]):
                script.decompose()
            title = soup.find('title')
            title_text = title.get_text().strip() if title else ""
            content = soup.get_text(separator=' ', strip=True)[:config.max_text_length]
            if len(content) < 500:
                return None
            return {
                'url': url,
                'title': title_text,
                'content': content,
                'method': 'requests',
                'timestamp': datetime.now().isoformat()}
        except Exception as e:
            logger.error(f"Error scraping {url} with requests: {e}.")
            return None
    async def scrape_with_selenium(self, url: str) -> Dict[str, Any]:
        if not self.driver:
            self.setup_selenium()
        if not self.driver:
            return None
        try:
            self.driver.get(url)
            WebDriverWait(self.driver, config.selenium_timeout).until(
                EC.presence_of_element_located((By.TAG_NAME, "body")))
            time.sleep(2)
            title = self.driver.title
            content = self.driver.find_element(By.TAG_NAME, "body").text[:config.max_text_length]
            if len(content) < 500:
                return None
            return {
                'url': url,
                'title': title,
                'content': content,
                'method': 'selenium',
                'timestamp': datetime.now().isoformat()}
        except Exception as e:
            logger.error(f"Error scraping {url} with Selenium: {e}.")
            return None
    async def scrape_bbc_search(self, query: str) -> List[Dict[str, Any]]:
        if not self.driver:
            self.setup_selenium()
        if not self.driver:
            return []
        try:
            search_url = f"https://www.bbc.co.uk/search?q={quote_plus(query)}"
            self.driver.get(search_url)
            WebDriverWait(self.driver, config.selenium_timeout).until(
                EC.presence_of_element_located((By.TAG_NAME, "body")))
            time.sleep(2)
            soup = BeautifulSoup(self.driver.page_source, 'html.parser')
            urls = []
            for a in soup.select("a[href^='https://www.bbc.']"):
                href = a['href']
                if '/news/' in href and href not in urls:
                    urls.append(href)
            urls = urls[:config.max_pages_per_query]
            results = []
            for url in urls:
                result = await self.scrape_with_selenium(url)
                if result and result.get('content'):
                    results.append(result)
                    logger.info(f"Scraped BBC News {url}: {len(result['content'])} chars")
            return results
        except Exception as e:
            logger.error(f"Error scraping BBC News search: {e}.")
            return []
    async def search_and_scrape(self, query: str) -> List[Dict[str, Any]]:
        logger.info("Web scraping...")
        results = []        
        cache_key = hashlib.md5(query.encode()).hexdigest()
        cache_path = os.path.join(config.cache_dir, f"{cache_key}.pkl")
        if os.path.exists(cache_path):
            cache_age = (time.time() - os.path.getmtime(cache_path)) / 3600
            if cache_age < config.cache_expiry_hours:
                with open(cache_path, 'rb') as f:
                    cached_results = pickle.load(f)
                if all(len(r.get('content', '')) >= 500 for r in cached_results):
                    logger.info(f"Loaded {len(cached_results)} results from cache.")
                    return cached_results
        priority_urls = [
            "https://www.bbc.com/news",
            "https://abcnews.go.com",
            "https://www.nytimes.com"
        ][:config.max_pages_per_query]
        search_query = f"{quote_plus(query)} site:*.news | site:bbc.com | site:abcnews.go.com | site:nytimes.com"
        search_url = f"https://www.google.com/search?q={search_query}&num={config.max_pages_per_query}"
        urls = []
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(search_url, headers=self.session.headers, timeout=config.request_timeout) as response:
                    response.raise_for_status()
                    soup = BeautifulSoup(await response.text(), 'html.parser')
                    for a in soup.select("a[href^='http']"):
                        href = a['href']
                        if any(keyword in href for keyword in ['.news', 'articles', 'post']) and 'google.com' not in href:
                            urls.append(href)
                    urls = list(dict.fromkeys(urls))[:config.max_pages_per_query]
                    logger.info(f"Found {len(urls)} relevant URLs from Google search.")
        except Exception as e:
            logger.error(f"Error fetching Google search results: {e}.")        
        all_urls = priority_urls + [u for u in urls if u not in priority_urls][:config.max_pages_per_query - len(priority_urls)]        
        async def scrape_url(url):
            for attempt in range(5):
                result = self.scrape_with_requests(url)
                if result and len(result.get('content', '')) >= 500:
                    return result
                logger.warning(f"Attempt {attempt+1} failed for {url} with requests, trying Selenium...")
                result = await self.scrape_with_selenium(url)
                if result and len(result.get('content', '')) >= 500:
                    return result
                await asyncio.sleep(2 ** attempt)
            return None
        tasks = [scrape_url(url) for url in all_urls]
        tasks.append(self.scrape_bbc_search(query))
        results = []
        for task in asyncio.as_completed(tasks):
            result = await task
            if isinstance(result, list):  
                results.extend(result)
            elif result:
                results.append(result)        
        if results:
            with open(cache_path, 'wb') as f:
                pickle.dump(results, f)
            logger.info(f"Cached {len(results)} results for query: {query}")
        return results[:config.max_pages_per_query]
    def cleanup(self):
        if self.driver:
            self.driver.quit()
            self.driver = None
class TextChunker:    
    def __init__(self):
        try:
            self.tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
        except Exception as e:
            logger.warning(f"Could not load tokenizer: {e}. Using basic word counting.")
            self.tokenizer = None
    def chunk_text(self, text: str, metadata: Dict = None) -> List[Dict[str, Any]]:
        text = self._clean_text(text)        
        if not metadata:
            metadata = {'source': 'unknown', 'created_at': datetime.now().isoformat()}
        if 'source' not in metadata:
            metadata['source'] = 'unknown'
        if 'created_at' not in metadata:
            metadata['created_at'] = datetime.now().isoformat()
        sentences = sent_tokenize(text)
        chunks = []
        current_chunk = []
        current_length = 0
        for sentence in sentences:
            if self.tokenizer:
                try:
                    sentence_length = len(self.tokenizer.encode(sentence))
                except:
                    sentence_length = len(sentence.split())
            else:
                sentence_length = len(sentence.split())
            if current_length + sentence_length > config.chunk_size and current_chunk:
                chunk_text = ' '.join(current_chunk)
                chunks.append({
                    'text': chunk_text,
                    'token_count': current_length,
                    'metadata': metadata.copy()})
                overlap_sentences = current_chunk[-2:] if len(current_chunk) >= 2 else current_chunk
                current_chunk = overlap_sentences + [sentence]
                if self.tokenizer:
                    try:
                        current_length = sum(len(self.tokenizer.encode(s)) for s in current_chunk)
                    except:
                        current_length = sum(len(s.split()) for s in current_chunk)
                else:
                    current_length = sum(len(s.split()) for s in current_chunk)
            else:
                current_chunk.append(sentence)
                current_length += sentence_length
        if current_chunk:
            chunk_text = ' '.join(current_chunk)
            chunks.append({
                'text': chunk_text,
                'token_count': current_length,
                'metadata': metadata.copy()})
        return chunks[:config.max_chunks_per_doc]
    def _clean_text(self, text: str) -> str:
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)]', '', text)
        return text.strip()
class EmbeddingManager:    
    def __init__(self):
        try:
            self.model = SentenceTransformer(config.embedding_model_name)
            if config.device == "mps":
                self.model = self.model.to("mps")
            logger.info(f"Embedding model loaded on {config.device}.")
        except Exception as e:
            logger.error(f"Failed to load embedding model: {e}.")
            raise
    def embed_texts(self, texts: List[str]) -> np.ndarray:
        try:
            non_empty_texts = [text for text in texts if text.strip()]
            if not non_empty_texts:
                return np.array([])
            embeddings = self.model.encode(
                non_empty_texts,
                batch_size=config.batch_size,
                show_progress_bar=True,
                convert_to_numpy=True)
            return embeddings
        except Exception as e:
            logger.error(f"Error generating embeddings: {e}.")
            return np.array([])
    def embed_query(self, query: str) -> np.ndarray:
        if not query.strip():
            return np.array([])
        return self.model.encode([query])[0]
class VectorDatabase:    
    def __init__(self, db_type: str = "chroma"):
        self.db_type = db_type
        self.chunks = []
        self.embeddings = None
        if db_type == "faiss":
            self.index = None
        elif db_type == "chroma":
            try:
                os.makedirs(config.persist_directory, exist_ok=True)
                if not os.access(config.persist_directory, os.W_OK):
                    raise PermissionError(f"Cannot write to {config.persist_directory}.")
                self.client = chromadb.PersistentClient(path=config.persist_directory)
                self.collection = self.client.get_or_create_collection(
                    name="rag_collection",
                    metadata={"hnsw:space": "cosine"}
                )
                logger.info("ChromaDB initialized successfully.")
            except Exception as e:
                logger.error(f"Failed to initialize ChromaDB: {e}.")
                raise
    def add_documents(self, chunks: List[Dict[str, Any]], embeddings: np.ndarray):
        if len(chunks) == 0 or len(embeddings) == 0:
            logger.warning("No chunks or embeddings to add.")
            return
        if self.db_type == "faiss":
            self._add_to_faiss(chunks, embeddings)
        elif self.db_type == "chroma":
            self._add_to_chroma(chunks, embeddings)
    def _add_to_faiss(self, chunks: List[Dict[str, Any]], embeddings: np.ndarray):
        if self.index is None:
            dimension = embeddings.shape[1]
            self.index = faiss.IndexFlatIP(dimension)            
        faiss.normalize_L2(embeddings)
        self.index.add(embeddings.astype('float32'))
        self.chunks.extend(chunks)
    def _add_to_chroma(self, chunks: List[Dict[str, Any]], embeddings: np.ndarray):
        try:
            documents = []
            metadatas = []
            ids = []
            valid_embeddings = []
            seen_texts = set()            
            for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
                if not chunk.get('text', '').strip():
                    continue
                text_hash = hashlib.md5(chunk['text'].encode()).hexdigest()
                if text_hash in seen_texts:
                    logger.info(f"Skipping duplicate chunk {i} with text: {chunk['text'][:50]}...")
                    continue
                seen_texts.add(text_hash)
                metadata = chunk.get('metadata', {})
                if not metadata or not isinstance(metadata, dict):
                    metadata = {
                        'source': 'unknown',
                        'created_at': datetime.now().isoformat(),
                        'chunk_index': i}
                clean_metadata = {}
                for key, value in metadata.items():
                    if value is not None:
                        clean_metadata[str(key)] = str(value)
                if 'source' not in clean_metadata:
                    clean_metadata['source'] = 'unknown'
                if 'created_at' not in clean_metadata:
                    clean_metadata['created_at'] = datetime.now().isoformat()
                documents.append(chunk['text'])
                metadatas.append(clean_metadata)
                ids.append(f"doc_{int(time.time())}_{i}_{text_hash[:8]}")
                valid_embeddings.append(embedding.tolist())
            if documents:
                self.collection.add(
                    documents=documents,
                    embeddings=valid_embeddings,
                    metadatas=metadatas,
                    ids=ids)
                logger.info(f"Added {len(documents)} unique documents to ChromaDB.")
            else:
                logger.warning("No valid documents to add to ChromaDB.")
        except Exception as e:
            logger.error(f"Error adding documents to ChromaDB: {e}.")
            raise
    def search(self, query_embedding: np.ndarray, k: int = 5) -> List[Dict[str, Any]]:
        if len(query_embedding) == 0:
            return []            
        if self.db_type == "faiss":
            return self._search_faiss(query_embedding, k)
        elif self.db_type == "chroma":
            return self._search_chroma(query_embedding, k)
    def _search_faiss(self, query_embedding: np.ndarray, k: int) -> List[Dict[str, Any]]:
        if self.index is None or len(self.chunks) == 0:
            return []            
        query_embedding = query_embedding.reshape(1, -1).astype('float32')
        faiss.normalize_L2(query_embedding)
        scores, indices = self.index.search(query_embedding, k)
        results = []
        for score, idx in zip(scores[0], indices[0]):
            if idx < len(self.chunks) and score >= config.similarity_threshold:
                result = self.chunks[idx].copy()
                result['score'] = float(score)
                results.append(result)
        return results
    def _search_chroma(self, query_embedding: np.ndarray, k: int) -> List[Dict[str, Any]]:
        try:
            results = self.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=k
            )            
            formatted_results = []
            if results['documents'] and results['documents'][0]:
                logger.info(f"ChromaDB returned {len(results['documents'][0])} documents.")
                for i, (doc, metadata, distance) in enumerate(zip(
                    results['documents'][0],
                    results['metadatas'][0] if results['metadatas'] else [{}] * len(results['documents'][0]),
                    results['distances'][0] if results['distances'] else [0.0] * len(results['documents'][0])
                )):
                    similarity_score = 1.0 - distance
                    logger.info(f"Document {i}: distance={distance:.4f}, similarity={similarity_score:.4f}, threshold={config.similarity_threshold}")
                    logger.info(f"Source: {metadata.get('url', metadata.get('source', 'Unknown'))}")
                    logger.info(f"Title: {metadata.get('title', 'Unknown')}")
                    logger.info(f"Content preview: {doc[:100]}...")
                    if similarity_score >= config.similarity_threshold:
                        formatted_results.append({
                            'text': doc,
                            'metadata': metadata if metadata else {},
                            'score': similarity_score,
                            'distance': distance
                        })
                        logger.info(f"Document {i} included (similarity={similarity_score:.4f}).")
                    else:
                        logger.info(f"Document {i} filtered out (similarity={similarity_score:.4f} < threshold={config.similarity_threshold}).")
            else:
                logger.warning("No documents returned from ChromaDB query.")
            logger.info(f"Returning {len(formatted_results)} results after filtering.")
            return formatted_results
        except Exception as e:
            logger.error(f"Error searching ChromaDB: {e}.")
            import traceback
            traceback.print_exc()
            return []
class DeepSeekLLM:    
    def __init__(self):
        try:
            self.client = ollama.Client()
            self.model_available = self._check_model_availability()
        except Exception as e:
            logger.error(f"Failed to initialize Ollama client: {e}.")
            raise
    def _check_model_availability(self) -> bool:
        model_name = config.llm_model_name
        logger.info("Testing model availability with direct generation...")
        try:
            test_response = self.client.chat(
                model=model_name,
                messages=[{"role": "user", "content": "Hello"}],
                options={"num_predict": 10})
            if test_response and 'message' in test_response and test_response['message'].get('content'):
                logger.info(f"Model {model_name} is working. Test response: {test_response['message']['content'][:50]}...")
                return True
        except Exception as e:
            logger.warning(f"Direct generation test failed: {e}.")
        try:
            response = self.client.list()
            available_models = []
            if isinstance(response, dict) and 'models' in response:
                for model in response.get('models', []):
                    model_name_field = model.get('name', model.get('model', '')).strip()
                    if model_name_field:
                        available_models.append(model_name_field)
                        if ':' in model_name_field:
                            base_name = model_name_field.split(':')[0]
                            available_models.append(base_name)
            model_variants = [
                model_name,
                model_name.lower(),
                model_name.replace('-', '_'),
                model_name.split(':')[0] if ':' in model_name else model_name,
                f"deepseek-r1",
                f"deepseek-r1:latest",
                f"deepseek-r1:14b-instruct"]
            for variant in model_variants:
                if variant in available_models:
                    logger.info(f"Found model variant: {variant}.")
                    config.llm_model_name = variant
                    return True
            logger.warning(f"Model not found in list. Available: {available_models}")
        except Exception as e:
            logger.warning(f"Failed to check model list: {e}.")
        logger.info(f"Cannot verify model {model_name}, but will attempt to use...")
        return True
    def generate(self, prompt: str, system_message: str = None) -> str:
        from concurrent.futures import ThreadPoolExecutor, TimeoutError
        def _generate():
            messages = []
            if system_message:
                messages.append({"role": "system", "content": system_message})
            messages.append({"role": "user", "content": prompt})
            response = self.client.chat(
                model=config.llm_model_name,
                messages=messages,
                options={
                    "temperature": config.temperature,
                    "num_predict": config.max_tokens,})
            if response and 'message' in response:
                content = response['message']['content']
                if '<think>' in content and '</think>' in content:
                    parts = content.split('</think>')
                    if len(parts) > 1:
                        content = parts[-1].strip()
                    else:
                        thinking_content = content.split('<think>')[1].split('</think>')[0] if '<think>' in content else content
                        content = thinking_content.strip()
                content = content.replace('<think>', '').replace('</think>', '').strip()
                if not content:
                    content = response['message']['content']
                return content if content else "No response generated by the model."    
            return "Error: Unexpected response format from model."
        for attempt in range(5):
            try:
                with ThreadPoolExecutor(max_workers=1) as executor:
                    future = executor.submit(_generate)
                    result = future.result(timeout=180)  
                logger.info(f"LLM generation successful on attempt {attempt+1}.")
                return result
            except TimeoutError:
                logger.warning(f"LLM generation timed out on attempt {attempt+1}.")
            except Exception as e:
                error_msg = str(e).lower()
                if "model" in error_msg and "not found" in error_msg:
                    return f"Error: Model '{config.llm_model_name}' not found. Please run: ollama pull {config.llm_model_name}."
                elif "connection" in error_msg or "refused" in error_msg:
                    return f"Error: Cannot connect to Ollama. Please ensure Ollama is running using 'ollama serve'."
                logger.error(f"LLM generation failed on attempt {attempt+1}: {e}.")
            time.sleep(2 ** attempt)
        return "Error: LLM generation failed after 5 attempts."
class RAGReasoner:    
    def __init__(self):
        self.scraper = WebScraper()
        self.chunker = TextChunker()
        self.embedder = EmbeddingManager()
        self.vector_db = VectorDatabase(config.vector_db_type)
        self.llm = DeepSeekLLM()
    async def process_query(self, query: str, use_web_search: bool = True) -> Dict[str, Any]:
        logger.info(f"Processing query: '{query}'.")        
        results = {
            'query': query,
            'timestamp': datetime.now().isoformat(),
            'steps': {}}
        try:
            all_chunks = []
            if use_web_search:
                logger.info("Web scraping...")
                scraped_data = await self.scraper.search_and_scrape(query)
                results['steps']['scraping'] = {
                    'num_pages': len(scraped_data),
                    'pages': [{'url': d['url'], 'title': d['title']} for d in scraped_data]
                }
                logger.info("Chunking and embedding...")
                for data in scraped_data:
                    if data and data.get('content'):
                        chunks = self.chunker.chunk_text(
                            data['content'],
                            metadata={
                                'url': data['url'],
                                'title': data['title'],
                                'scrape_method': data['method'],
                                'timestamp': data['timestamp']
                            })
                        all_chunks.extend(chunks)
                        logger.info(f"Generated {len(chunks)} chunks from {data['url']}.")
                if all_chunks:
                    texts = [chunk['text'] for chunk in all_chunks]
                    embeddings = self.embedder.embed_texts(texts)
                    if len(embeddings) > 0:
                        logger.info("Storing in vector database...")
                        self.vector_db.add_documents(all_chunks, embeddings)
                        results['steps']['embedding'] = {
                            'num_chunks': len(all_chunks),
                            'embedding_dimension': embeddings.shape[1]}
                    else:
                        logger.warning("Failed to generate embeddings.")
                else:
                    logger.warning("No chunks generated from scraped data.")
            else:
                logger.info("Skipping web search, using existing vector database.")
            logger.info("Retrieving relevant context...")
            query_embedding = self.embedder.embed_query(query)
            if len(query_embedding) > 0:
                relevant_chunks = self.vector_db.search(query_embedding, config.top_k_retrieval)
            else:
                relevant_chunks = []
            x_context = ""            
            if not relevant_chunks:
                logger.warning("No relevant chunks found in vector database.")
                context = f"I don't have specific information about '{query}' in my current knowledge base. Let me provide a general response based on my training.\n\n{x_context}"
                results['steps']['retrieval'] = {
                    'num_retrieved': 0,
                    'scores': [],
                    'note': 'No relevant chunks found.'}
            else:
                context = "\n\n".join([
                    f"Source: {chunk.get('metadata', {}).get('url', chunk.get('metadata', {}).get('source', 'Unknown'))}\n{chunk['text']}"
                    for chunk in relevant_chunks
                ])
                if self.chunker.tokenizer:
                    context_tokens = len(self.chunker.tokenizer.encode(context))
                    if context_tokens > 3000:
                        context = self.chunker.tokenizer.decode(self.chunker.tokenizer.encode(context)[:3000])
                context += f"\n\n{x_context}"
                results['steps']['retrieval'] = {
                    'num_retrieved': len(relevant_chunks),
                    'scores': [chunk['score'] for chunk in relevant_chunks]
                }
            logger.info("Performing initial reasoning...")
            reasoning_prompt = self._create_reasoning_prompt(query, context)
            initial_answer = self.llm.generate(reasoning_prompt)
            results['steps']['initial_reasoning'] = {
                'answer': initial_answer
            }
            logger.info("Self-correcting...")
            correction_prompt = self._create_correction_prompt(query, context, initial_answer)
            final_answer = self.llm.generate(correction_prompt)
            results['steps']['self_correction'] = {
                'answer': final_answer
            }
            results['final_answer'] = final_answer
            results['context_used'] = context
        except Exception as e:
            logger.error(f"Error in processing pipeline: {e}.")
            import traceback
            traceback.print_exc()
            results['error'] = str(e)
            results['final_answer'] = f"Error processing query: {str(e)}"
        finally:
            self.scraper.cleanup()
        return results
    def _create_reasoning_prompt(self, query: str, context: str) -> str:
        return f"""
You are an expert AI assistant with access to relevant information. Your task is to provide a comprehensive and accurate answer to the user's question using the provided context.

Context Information:
{context}

User Question: {query}

Instructions:
1. Analyze the provided context carefully, prioritizing verified news sources over unverified social media posts
2. Identify key information relevant to the question
3. Reason through the problem step by step
4. Provide a well-structured, comprehensive answer
5. If the context doesn't contain sufficient information, clearly state what's missing
6. Cite sources when possible, noting if information comes from unverified social media

Please provide your answer directly without any XML tags or special formatting:"""
    def _create_correction_prompt(self, query: str, context: str, initial_answer: str) -> str:
        return f"""
You are an expert AI assistant performing self-correction on a previous response. Your task is to review, analyze, and improve the initial answer.

Original Question: {query}

Context Information:
{context}

Initial Answer:
{initial_answer}

Self-Correction Instructions:
1. Critically evaluate the initial answer for:
   - Factual accuracy, especially verifying against news sources
   - Completeness
   - Logical consistency
   - Proper use of context
   - Clarity and structure

2. Check if the answer:
   - Fully addresses the question
   - Prioritizes verified news sources over unverified social media
   - Contains any contradictions or errors
   - Could be improved in any way

3. Provide an improved final answer that:
   - Corrects any identified errors
   - Adds missing important information
   - Improves clarity and structure
   - Better utilizes the available context

Provide your final corrected answer directly without any XML tags or special formatting:   
"""
class RAGSystem:    
    def __init__(self):
        self.reasoner = RAGReasoner()
    async def run_interactive(self):
        print("This is a RAG-based-Search-Augmented + Self-Correcting LLM Reasoning System.\n")
        print("Enter your queries (type 'quit' to exit).")
        while True:
            query = input("\n Query (type 'quit' to exit): ").strip()
            if query.lower() in ['quit', 'exit', 'q']:
                break 
            if not query:
                continue
            print("\nProcessing...")
            start_time = time.time()
            results = await self.reasoner.process_query(query)
            processing_time = time.time() - start_time
            self._display_results(results, processing_time)
    def _display_results(self, results: Dict[str, Any], processing_time: float):
        print("\nResults:")        
        if 'error' in results:
            print(f"Error: {results['error']}")
            return
        print(f"Processing Time: {processing_time:.2f} seconds.")
        if 'scraping' in results.get('steps', {}):
            scraping = results['steps']['scraping']
            print(f"Pages Scraped: {scraping['num_pages']}")
        if 'embedding' in results.get('steps', {}):
            embedding = results['steps']['embedding']
            print(f"Text Chunks: {embedding['num_chunks']}")
        if 'retrieval' in results.get('steps', {}):
            retrieval = results['steps']['retrieval']
            print(f"Retrieved Chunks: {retrieval['num_retrieved']}")
            if retrieval['scores']:
                print(f"Similarity Scores: {[f'{s:.3f}' for s in retrieval['scores'][:3]]}")
        print("\nFinal Answer:")
        print(results.get('final_answer', 'No answer generated'))
        print("\n")
def system_health_check():
    try:
        import os
        os.makedirs(config.cache_dir, exist_ok=True)
        os.makedirs(config.persist_directory, exist_ok=True)        
        import torch
        import chromadb
        import ollama
        print("Starting...")
        return True
    except Exception as e:
        print(f"System health check failed: {e}.")
        return False
def debug_vector_search(query: str):
    print(f"Debug: Testing vector search with query: {query}.")
async def main():
    import sys    
    if len(sys.argv) > 1 and sys.argv[1] == "debug":
        print("Debug Mode")
        debug_vector_search("What is machine learning?")
    else:
        try:
            health_check_result = system_health_check()
            if health_check_result:
                print("\nExecution Options:")
                print("1. Interactive Mode")
                print("2. Exit")
                try:
                    choice = input("\nSelect mode (1-2): ").strip()
                    print(f"Selected choice: {choice}")
                    if choice == "1":
                        system = RAGSystem()
                        print("Starting interactive mode...")
                        await system.run_interactive()
                    elif choice == "2":
                        print("\nThe system ends.")
                        return
                    else:
                        print("Invalid choice. Please select 1 or 2.")
                except KeyboardInterrupt:
                    print("\n\nGoodbye.")
                except Exception as e:
                    print(f"\nUnexpected error in main(): {e}.")
                    import traceback
                    traceback.print_exc()
            else:
                print("\nSystem health check failed.")
        except Exception as e:
            print(f"Error during system health check: {e}.")
            import traceback
            traceback.print_exc()
if __name__ == "__main__":
    import asyncio
    import sys        
    try:
        get_ipython()        
    except NameError:
        try:
            asyncio.run(main())
        except Exception as e:
            print(f"Error in script execution: {e}.")
            import traceback
            traceback.print_exc()
await main()

Starting...

Execution Options:
1. Interactive Mode
2. Exit
Selected choice: 1


2025-06-21 12:16:27,644 - INFO - Use pytorch device_name: mps
2025-06-21 12:16:27,644 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2025-06-21 12:16:30,781 - INFO - Embedding model loaded on mps.
2025-06-21 12:16:30,789 - INFO - ChromaDB initialized successfully.
2025-06-21 12:16:30,799 - INFO - Testing model availability with direct generation...
2025-06-21 12:16:35,010 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
2025-06-21 12:16:35,038 - INFO - Model deepseek-r1:14b is working. Test response: <think>

</think>

Hello! How can I assist...


Starting interactive mode...
This is a RAG-based-Search-Augmented + Self-Correcting LLM Reasoning System.

Enter your queries (type 'quit' to exit).


2025-06-21 12:16:50,288 - INFO - Processing query: 'Did a plane crash in Ahmedabad?'.
2025-06-21 12:16:50,291 - INFO - Web scraping...
2025-06-21 12:16:50,292 - INFO - Web scraping...
2025-06-21 12:16:50,304 - INFO - Loaded 9 results from cache.
2025-06-21 12:16:50,304 - INFO - Chunking and embedding...
2025-06-21 12:16:50,475 - INFO - Generated 7 chunks from https://www.bbc.com/news.



Processing...


2025-06-21 12:16:50,619 - INFO - Generated 7 chunks from https://www.bbc.com/news/articles/c0l484l40gyo.
2025-06-21 12:16:50,771 - INFO - Generated 5 chunks from https://www.nytimes.com.
2025-06-21 12:16:50,999 - INFO - Generated 6 chunks from https://www.bbc.co.uk/news/articles/c0l484l40gyo.
2025-06-21 12:16:51,200 - INFO - Generated 5 chunks from https://www.bbc.co.uk/news/articles/c5y5nq170z4o.
2025-06-21 12:16:51,396 - INFO - Generated 6 chunks from https://www.bbc.co.uk/news/articles/cvgn23757jzo.
2025-06-21 12:16:51,547 - INFO - Generated 7 chunks from https://www.bbc.co.uk/news/articles/clyzn0gjz5lo.
2025-06-21 12:16:51,591 - INFO - Generated 4 chunks from https://abcnews.go.com.
2025-06-21 12:16:51,709 - INFO - Generated 7 chunks from https://www.bbc.com/news/articles/ce818jlz5mlo.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-21 12:16:52,452 - INFO - Storing in vector database...
2025-06-21 12:16:52,589 - INFO - Added 54 unique documents to ChromaDB.
2025-06-21 12:16:52,590 - INFO - Retrieving relevant context...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-21 12:16:52,730 - INFO - ChromaDB returned 5 documents.
2025-06-21 12:16:52,731 - INFO - Document 0: distance=0.3376, similarity=0.6624, threshold=0.15
2025-06-21 12:16:52,731 - INFO - Source: https://www.bbc.co.uk/news/articles/c5y5nq170z4o
2025-06-21 12:16:52,731 - INFO - Title: What we know after Air India flight from Ahmedabad to London crashes
2025-06-21 12:16:52,731 - INFO - Content preview: It was scheduled to land at London Gatwick at 18:25 BST. Moments after departing Ahmedabad, the plan...
2025-06-21 12:16:52,732 - INFO - Document 0 included (similarity=0.6624).
2025-06-21 12:16:52,732 - INFO - Document 1: distance=0.3376, similarity=0.6624, threshold=0.15
2025-06-21 12:16:52,732 - INFO - Source: https://www.bbc.co.uk/news/articles/c5y5nq170z4o
2025-06-21 12:16:52,732 - INFO - Title: What we know after Air India flight from Ahmedabad to London crashes
2025-06-21 12:16:52,733 - INFO - Content preview: It was scheduled to land at London Gatwick at 18:25 BST. Moments aft


Results:
Processing Time: 123.24 seconds.
Pages Scraped: 9
Text Chunks: 54
Retrieved Chunks: 5
Similarity Scores: ['0.662', '0.662', '0.662']

Final Answer:
Yes, a plane crash occurred in Ahmedabad. The incident took place moments after departure when the aircraft lost altitude and crashed into Meghani Nagar, a residential area. The flight was scheduled to land at London Gatwick but crashed shortly after takeoff. The plane struck a doctors' hostel at Byramjee Jeejeebhoy Medical College and Civil Hospital on Thursday during lunch break. Parts of the plane impacted the dining hall's roof, leaving abandoned tables and plates from the mealtime scattered in the canteen.

Flight tracking data showed the signal was lost less than a minute after takeoff at an altitude of 625ft (190m), with no response to the mayday call. The crash resulted in casualties, as reported by the BBC News.




2025-06-21 12:19:16,977 - INFO - Processing query: 'What's going on with Trump and Musk?'.
2025-06-21 12:19:16,978 - INFO - Web scraping...
2025-06-21 12:19:16,979 - INFO - Web scraping...
2025-06-21 12:19:16,996 - INFO - Loaded 5 results from cache.
2025-06-21 12:19:16,996 - INFO - Chunking and embedding...



Processing...


2025-06-21 12:19:17,201 - INFO - Generated 5 chunks from https://www.nytimes.com.
2025-06-21 12:19:17,300 - INFO - Generated 7 chunks from https://www.bbc.com/news.
2025-06-21 12:19:17,342 - INFO - Generated 4 chunks from https://abcnews.go.com.
2025-06-21 12:19:17,360 - INFO - Generated 1 chunks from https://www.bbc.co.uk/news/videos/c8d18dm967qo.
2025-06-21 12:19:17,516 - INFO - Generated 5 chunks from https://www.bbc.co.uk/news/articles/ceqgdnd2g9xo.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-21 12:19:18,304 - INFO - Storing in vector database...
2025-06-21 12:19:18,391 - INFO - Added 22 unique documents to ChromaDB.
2025-06-21 12:19:18,392 - INFO - Retrieving relevant context...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-21 12:19:18,486 - INFO - ChromaDB returned 5 documents.
2025-06-21 12:19:18,487 - INFO - Document 0: distance=0.3432, similarity=0.6568, threshold=0.15
2025-06-21 12:19:18,487 - INFO - Source: https://www.bbc.co.uk/news/articles/ceqgdnd2g9xo
2025-06-21 12:19:18,487 - INFO - Title: Will Musk's explosive row with Trump help or harm his businesses?
2025-06-21 12:19:18,488 - INFO - Content preview: Register Sign In Home News Sport Business Innovation Culture Arts Travel Earth Audio Video Live ADVE...
2025-06-21 12:19:18,488 - INFO - Document 0 included (similarity=0.6568).
2025-06-21 12:19:18,488 - INFO - Document 1: distance=0.3432, similarity=0.6568, threshold=0.15
2025-06-21 12:19:18,494 - INFO - Source: https://www.bbc.co.uk/news/articles/ceqgdnd2g9xo
2025-06-21 12:19:18,522 - INFO - Title: Will Musk's explosive row with Trump help or harm his businesses?
2025-06-21 12:19:18,523 - INFO - Content preview: Register Sign In Home News Sport Business Innovation Culture Arts Travel E


Results:
Processing Time: 180.69 seconds.
Pages Scraped: 5
Text Chunks: 22
Retrieved Chunks: 5
Similarity Scores: ['0.657', '0.657', '0.651']

Final Answer:
The relationship between Elon Musk and Donald Trump has recently been marked by significant public disputes that have impacted both their personal reputations and business interests. Here's an organized summary of the situation:

1. **Public Dispute**: A heated social media argument between Musk and Trump escalated, with comments from Musk about the White House leading to a high-profile feud.

2. **Impact on Tesla**: The conflict resulted in a 14% drop in Tesla's stock price during trading hours on June 7, 2025. Although there was a slight recovery as tensions eased, investor confidence remained shaken.

3. **Investor Concerns**: Investors, including long-time supporters like Ross Gerber, have expressed doubts about Musk's focus on his businesses. Some have reduced their holdings in Tesla due to concerns over Musk's political invo