In [None]:
import faiss
import numpy as np

class KnowledgeRetriever:
    """Bundle a graph driver and an embedding model with (initially empty) index state."""

    def __init__(self, driver, embedder):
        """
        Keep references to the collaborators and reset all index bookkeeping.

        Args:
            driver: Neo4j driver instance
            embedder: SentenceTransformer (or similar) embedding model
        """
        self.driver, self.embedder = driver, embedder

        # The index itself is not built here; it stays None until assigned.
        self.faiss_index = None

        # Two separate (empty) lookup tables, one per direction.
        self.chunk_id_to_index = dict()
        self.index_to_chunk_id = dict()



In [2]:
%pip install clip-by-openai


Collecting clip-by-openai
  Downloading clip_by_openai-1.1-py3-none-any.whl.metadata (369 bytes)
INFO: pip is looking at multiple versions of clip-by-openai to determine which version is compatible with other requirements. This could take a while.
  Downloading clip_by_openai-1.0.1-py3-none-any.whl.metadata (407 bytes)
  Downloading clip_by_openai-0.1.1.5-py3-none-any.whl.metadata (8.6 kB)
  Downloading clip_by_openai-0.1.1.4-py3-none-any.whl.metadata (8.6 kB)
  Downloading clip_by_openai-0.1.1.3-py3-none-any.whl.metadata (8.7 kB)
  Downloading clip_by_openai-0.1.1.2-py3-none-any.whl.metadata (9.0 kB)
  Downloading clip_by_openai-0.1.1-py3-none-any.whl.metadata (9.0 kB)
  Downloading clip_by_openai-0.1.0-py3-none-any.whl.metadata (9.0 kB)
INFO: pip is still looking at multiple versions of clip-by-openai to determine which version is compatible with other requirements. This could take a while.
[31mERROR: Cannot install clip-by-openai==0.1.0, clip-by-openai==0.1.1, clip-by-openai==0.1.1

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from collections import deque
import re
import io
from PyPDF2 import PdfReader
import os
import numpy as np
from PIL import Image
import torch
from typing import Dict, List, Optional, Tuple, Any
import hashlib
import json
from dataclasses import dataclass, asdict
from datetime import datetime

# Embedding and ML imports

from sentence_transformers import SentenceTransformer
import clip
from transformers import BlipProcessor, BlipForConditionalGeneration

# LangChain imports
from langchain.embeddings.base import Embeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

# Graph database (Neo4j example)


2025-08-25 02:24:56.743076: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756088696.959947    4824 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756088697.025210    4824 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1756088697.605197    4824 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1756088697.605259    4824 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1756088697.605265    4824 computation_placer.cc:177] computation placer alr

ModuleNotFoundError: No module named 'clip'

In [None]:
# neo4j is treated as an optional dependency: if the import fails, a warning
# is printed and GraphDatabase is bound to None so that code using it can
# check for None instead of hitting a NameError.
try:
    from neo4j import GraphDatabase
except ImportError:
    print("Warning: neo4j not installed. Graph DB functionality will be limited.")
    GraphDatabase = None


@dataclass
class ContentNode:
    """Structured content node for graph database.

    Only ``url``, ``title`` and ``content_type`` are required; every other
    field defaults to ``None``. ``timestamp`` is filled in automatically
    when it is not supplied.
    """
    url: str
    title: str
    content_type: str  # 'page', 'pdf', 'image', 'table'
    text_content: Optional[str] = None
    image_urls: Optional[List[str]] = None
    metadata: Optional[Dict[str, Any]] = None
    embeddings: Optional[Dict[str, np.ndarray]] = None
    relationships: Optional[List[str]] = None
    # ISO-8601 string; None means "set it in __post_init__", so the
    # annotation has to admit None.
    timestamp: Optional[str] = None

    def __post_init__(self):
        """Default ``timestamp`` to ``datetime.now().isoformat()`` when omitted."""
        if self.timestamp is None:
            self.timestamp = datetime.now().isoformat()


class MultimodalEmbeddings:
    """Handle different types of embeddings for multimodal content.

    Text is embedded with a SentenceTransformer model, images with CLIP, and
    each image additionally gets a BLIP-generated caption.
    """

    def __init__(self, device="cpu"):
        """Load the text, CLIP and BLIP models.

        Args:
            device: Device passed to ``clip.load`` and used for the CLIP
                input tensors. The BLIP model is loaded without an explicit
                device.
        """
        self.device = device

        # Initialize models
        self.text_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.clip_model, self.clip_preprocess = clip.load("ViT-B/32", device=device)

        # BLIP for image captioning
        self.blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
        self.blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

    def encode_text(self, texts: List[str]) -> np.ndarray:
        """Encode text using sentence transformers.

        Args:
            texts: Strings to embed.

        Returns:
            Whatever ``text_model.encode(..., convert_to_numpy=True)`` returns
            for ``texts``, or an empty array when ``texts`` is empty.
        """
        if not texts:
            return np.array([])
        return self.text_model.encode(texts, convert_to_numpy=True)

    def _clip_vector(self, image) -> np.ndarray:
        """Return the flattened CLIP embedding of one already-loaded image."""
        image_tensor = self.clip_preprocess(image).unsqueeze(0).to(self.device)
        with torch.no_grad():
            clip_embedding = self.clip_model.encode_image(image_tensor)
        return clip_embedding.cpu().numpy().flatten()

    def _caption(self, image) -> str:
        """Return the BLIP caption (at most 50 tokens) of one already-loaded image."""
        inputs = self.blip_processor(image, return_tensors="pt")
        with torch.no_grad():
            caption_ids = self.blip_model.generate(**inputs, max_length=50)
        return self.blip_processor.decode(caption_ids[0], skip_special_tokens=True)

    def encode_images(self, image_urls: List[str]) -> Tuple[np.ndarray, List[str]]:
        """Encode images using CLIP and generate captions with BLIP.

        Images that cannot be downloaded (non-200 response) or processed are
        skipped, so the result may be shorter than ``image_urls``.

        Args:
            image_urls: URLs of the images to download and process.

        Returns:
            ``(embeddings, captions)`` where row ``i`` of ``embeddings`` and
            ``captions[i]`` always belong to the same image.
        """
        if not image_urls:
            return np.array([]), []

        embeddings = []
        captions = []

        for img_url in image_urls:
            try:
                # Download image
                response = requests.get(img_url, timeout=10,
                                        headers={"User-Agent": "MultimodalCrawler/1.0"})
                if response.status_code != 200:
                    continue

                image = Image.open(io.BytesIO(response.content)).convert('RGB')

                vector = self._clip_vector(image)
                caption = self._caption(image)
            except Exception as e:
                print(f"Failed to process image {img_url}: {e}")
                continue

            # Append only once BOTH steps succeeded. Appending the CLIP vector
            # before captioning would leave the two lists misaligned whenever
            # captioning raised.
            embeddings.append(vector)
            captions.append(caption)

        return np.array(embeddings), captions

    def encode_multimodal(self, text: str, image_urls: List[str]) -> Dict[str, np.ndarray]:
        """Combine text and image embeddings.

        Args:
            text: Page text; ignored when empty or whitespace-only.
            image_urls: Image URLs; ignored when empty.

        Returns:
            Dict that may contain ``'text'`` (embedding of ``text``),
            ``'images'`` (CLIP embeddings) and ``'captions'`` (text embeddings
            of the BLIP captions). Keys are only present when the
            corresponding content could be embedded.
        """
        embeddings = {}

        if text and text.strip():
            embeddings['text'] = self.encode_text([text])[0]

        if image_urls:
            img_embeddings, captions = self.encode_images(image_urls)
            if len(img_embeddings) > 0:
                embeddings['images'] = img_embeddings
                embeddings['captions'] = self.encode_text(captions) if captions else np.array([])

        return embeddings


class GraphDatabaseManager:
    """Manage graph database operations.

    Every public method is a no-op when no driver is available (neo4j not
    installed, or driver construction failed), so callers do not have to
    check for that themselves.
    """

    def __init__(self, uri="bolt://localhost:7687", user="neo4j", password="password"):
        """Create the Neo4j driver, or fall back to ``self.driver = None``.

        Args:
            uri: Bolt URI of the Neo4j server.
            user: Neo4j user name.
            password: Neo4j password. The default is a placeholder; pass the
                real credential explicitly.
        """
        if GraphDatabase is None:
            print("Neo4j driver not available. Graph operations will be skipped.")
            self.driver = None
            return

        try:
            self.driver = GraphDatabase.driver(uri, auth=(user, password))
        except Exception as e:
            print(f"Failed to connect to Neo4j: {e}")
            self.driver = None

    def close(self):
        """Close the driver if one was created."""
        if self.driver:
            self.driver.close()

    @staticmethod
    def _run_write(session, work, *args):
        """Run ``work(tx, *args)`` in a managed write transaction.

        ``Session.write_transaction`` is deprecated since driver 5.0 in
        favour of ``Session.execute_write``; use the new name when the
        session offers it and fall back to the old one otherwise.
        """
        runner = getattr(session, "execute_write", None) or session.write_transaction
        return runner(work, *args)

    def create_content_node(self, node: "ContentNode"):
        """Create (or update) a content node in the graph database."""
        if not self.driver:
            return

        with self.driver.session() as session:
            self._run_write(session, self._create_node, node)

    @staticmethod
    def _create_node(tx, node: "ContentNode"):
        """Upsert the ``Content`` node keyed by ``node.url``."""
        query = """
        MERGE (n:Content {url: $url})
        SET n.title = $title,
            n.content_type = $content_type,
            n.text_content = $text_content,
            n.timestamp = $timestamp
        """
        # Send only the parameters the query binds. Passing every dataclass
        # field would also ship embeddings and crawl metadata that the query
        # never reads.
        tx.run(
            query,
            url=node.url,
            title=node.title,
            content_type=node.content_type,
            text_content=node.text_content,
            timestamp=node.timestamp,
        )

    def create_relationship(self, from_url: str, to_url: str, relationship_type: str):
        """Create relationship between nodes.

        Args:
            from_url: URL of the source ``Content`` node.
            to_url: URL of the target ``Content`` node.
            relationship_type: Stored in the ``type`` property of the
                ``LINKS_TO`` relationship.
        """
        if not self.driver:
            return

        with self.driver.session() as session:
            self._run_write(session, self._create_relationship, from_url, to_url, relationship_type)

    @staticmethod
    def _create_relationship(tx, from_url: str, to_url: str, rel_type: str):
        """Upsert a ``LINKS_TO`` relationship between two ``Content`` nodes."""
        # MERGE (not MATCH) the endpoints: a link is usually recorded before
        # its target has been stored, and a MATCH on a missing node would make
        # the whole statement a silent no-op. A node created here only carries
        # its url; the remaining properties are set when the page is stored.
        query = """
        MERGE (a:Content {url: $from_url})
        MERGE (b:Content {url: $to_url})
        MERGE (a)-[r:LINKS_TO {type: $rel_type}]->(b)
        """
        tx.run(query, from_url=from_url, to_url=to_url, rel_type=rel_type)


class LangChainRetriever:
    """LangChain integration for retrieval.

    Splits the text of content nodes into overlapping chunks, embeds them
    with the supplied embedding model and stores them in a Chroma vector
    store that can then be queried by similarity.
    """

    def __init__(self, embedding_model: "MultimodalEmbeddings"):
        """
        Args:
            embedding_model: Object whose ``encode_text(list_of_str)`` returns
                one vector per input string.
        """
        self.embedding_model = embedding_model
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )
        self.vectorstore = None

    def create_vectorstore(self, documents: List["ContentNode"], persist_directory="./chroma_db"):
        """Create vector store from content nodes.

        Args:
            documents: Content nodes; nodes without ``text_content`` are skipped.
            persist_directory: Directory handed to Chroma for persistence.

        Returns:
            The new Chroma vector store (also kept in ``self.vectorstore``),
            or ``None`` when none of the nodes produced any text chunk. In
            that case ``self.vectorstore`` is left as it was.
        """
        langchain_docs = []

        for node in documents:
            if not node.text_content:
                continue

            # Split text into chunks; chunk_id is the position within the node.
            for i, chunk in enumerate(self.text_splitter.split_text(node.text_content)):
                langchain_docs.append(Document(
                    page_content=chunk,
                    metadata={
                        "url": node.url,
                        "title": node.title,
                        "content_type": node.content_type,
                        "chunk_id": i,
                        "timestamp": node.timestamp
                    }
                ))

        # Nothing to index (e.g. the crawl yielded no text). Building a Chroma
        # collection from zero documents is not meaningful and is expected to
        # be rejected by the backend, so stop here instead.
        if not langchain_docs:
            print("No text content to index; vector store was not created.")
            return None

        # Custom embedding wrapper
        class CustomEmbeddings(Embeddings):
            """Adapter exposing the text encoder through LangChain's interface."""

            def __init__(self, model):
                self.model = model

            def embed_documents(self, texts: List[str]) -> List[List[float]]:
                embeddings = self.model.encode_text(texts)
                return [emb.tolist() for emb in embeddings]

            def embed_query(self, text: str) -> List[float]:
                embedding = self.model.encode_text([text])[0]
                return embedding.tolist()

        custom_embeddings = CustomEmbeddings(self.embedding_model)

        self.vectorstore = Chroma.from_documents(
            documents=langchain_docs,
            embedding=custom_embeddings,
            persist_directory=persist_directory
        )

        return self.vectorstore

    def similarity_search(self, query: str, k: int = 5) -> List["Document"]:
        """Perform similarity search.

        Returns an empty list when no vector store has been created yet.
        """
        if not self.vectorstore:
            return []
        return self.vectorstore.similarity_search(query, k=k)


class EnhancedWebCrawler:
    """Enhanced web crawler with multimodal embeddings and graph DB integration.

    Crawls same-domain HTML pages breadth-first, turns every page into a
    ``ContentNode`` (text, images, tables, linked PDFs), mirrors pages and
    links into the graph database, and can index the collected text for
    similarity search.
    """

    def __init__(self, use_gpu=False, neo4j_config=None):
        """
        Args:
            use_gpu: Use ``"cuda"`` when True and CUDA is available, else ``"cpu"``.
            neo4j_config: Optional keyword arguments for ``GraphDatabaseManager``;
                its defaults are used when this is None or empty.
        """
        device = "cuda" if use_gpu and torch.cuda.is_available() else "cpu"
        self.embeddings = MultimodalEmbeddings(device=device)

        # Initialize graph database
        self.graph_db = GraphDatabaseManager(**(neo4j_config or {}))

        # Initialize retriever
        self.retriever = LangChainRetriever(self.embeddings)

        # Store processed nodes
        self.content_nodes: List["ContentNode"] = []

    def extract_pdf_data(self, pdf_url: str) -> Optional[Dict]:
        """Enhanced PDF extraction with embeddings.

        Args:
            pdf_url: URL of the PDF to download and parse.

        Returns:
            Dict with ``title``, ``download_link``, ``content`` (None when no
            text could be extracted) and ``embeddings``; or None when the
            download or the parsing fails.
        """
        try:
            resp = requests.get(pdf_url, timeout=10,
                                headers={"User-Agent": "EnhancedCrawler/1.0"})
            if resp.status_code != 200:
                return None

            reader = PdfReader(io.BytesIO(resp.content))

            # Title from the PDF metadata, else the file name from the URL.
            title = reader.metadata.title if reader.metadata and reader.metadata.title else None
            if not title:
                filename = os.path.basename(urlparse(pdf_url).path)
                title = filename if filename else "Untitled PDF"

            # Extract text page by page; one bad page must not lose the rest.
            page_texts = []
            for page_number, page in enumerate(reader.pages, start=1):
                try:
                    page_texts.append(page.extract_text() or "")
                except Exception as e:
                    print(f"Skipping page {page_number} of {pdf_url}: {e}")
            text = "\n".join(page_texts).strip()

            # Generate embeddings
            embeddings = {}
            if text:
                embeddings = self.embeddings.encode_multimodal(text, [])

            return {
                "title": title,
                "download_link": pdf_url,
                "content": text if text else None,
                "embeddings": embeddings
            }
        except Exception as e:
            print(f"Failed to parse PDF {pdf_url}: {e}")
            return None

    def extract_enhanced_content(self, url: str, soup: "BeautifulSoup") -> "ContentNode":
        """Extract content with multimodal embeddings.

        The caller's ``soup`` is not modified: boilerplate removal happens on
        a private copy, so the caller can still see header/nav/footer links.

        Args:
            url: URL the page was fetched from; base for relative links.
            soup: Parsed page.

        Returns:
            A ``ContentNode`` of type ``"page"``. ``text_content`` is None
            when the cleaned text is 50 characters or shorter.
        """
        import copy  # local import: not among the notebook's top-level imports

        # Get title
        title_tag = soup.find("title")
        title = title_tag.get_text().strip() if title_tag else urlparse(url).path

        # Strip boilerplate on a copy so the caller's tree stays intact.
        page = copy.copy(soup)
        for tag in page(["script", "style", "noscript", "header", "footer", "nav"]):
            tag.extract()
        text = page.get_text(" ", strip=True)
        text = re.sub(r"\s+", " ", text)
        text_content = text if len(text) > 50 else None

        # Collect images
        image_urls = [urljoin(url, img["src"]) for img in page.find_all("img", src=True)]

        # Generate multimodal embeddings
        embeddings = self.embeddings.encode_multimodal(
            text_content or "",
            image_urls[:5]  # Limit to first 5 images for performance
        )

        # Collect additional metadata
        metadata = {
            "images": image_urls,
            "tables": [str(table) for table in page.find_all("table")],
            "pdfs": [],
            "other_files": []
        }

        # Process file links
        for a in page.find_all("a", href=True):
            href = urljoin(url, a["href"])
            if href.lower().endswith(".pdf"):
                pdf_data = self.extract_pdf_data(href)
                if pdf_data:
                    metadata["pdfs"].append(pdf_data)
            elif re.search(r"\.(zip|docx?|xlsx?|pptx?)$", href, re.I):
                metadata["other_files"].append(href)

        # Create content node
        return ContentNode(
            url=url,
            title=title,
            content_type="page",
            text_content=text_content,
            image_urls=image_urls if image_urls else None,
            metadata=metadata,
            embeddings=embeddings
        )

    def crawl_site_tree(self, start_url: str, max_pages: int = 30) -> Dict[str, "ContentNode"]:
        """Enhanced crawling with graph relationships.

        Breadth-first crawl restricted to the domain of ``start_url``.

        Args:
            start_url: First page to fetch.
            max_pages: Upper bound on the number of URLs attempted (failed
                fetches count as well).

        Returns:
            Mapping from page URL to its ``ContentNode``; each node's
            ``relationships`` lists the new same-domain URLs found on it.
        """
        domain = urlparse(start_url).netloc
        visited = set()
        crawl_tree = {}

        queue = deque([start_url])

        while queue and len(visited) < max_pages:
            url = queue.popleft()
            if url in visited:
                continue
            visited.add(url)

            try:
                resp = requests.get(url,
                                    headers={"User-Agent": "EnhancedCrawler/1.0"},
                                    timeout=10)
                if resp.status_code != 200 or "text/html" not in resp.headers.get("Content-Type", ""):
                    continue
            except Exception as e:
                print(f"Failed to crawl {url}: {e}")
                continue

            soup = BeautifulSoup(resp.text, "html.parser")

            # Extract enhanced content
            content_node = self.extract_enhanced_content(url, soup)
            self.content_nodes.append(content_node)

            # Create graph node
            self.graph_db.create_content_node(content_node)

            # Extract and process child links
            children = []
            seen_on_page = set()
            for a in soup.find_all("a", href=True):
                # A fragment only addresses a position inside a page; drop it
                # so "page#a" and "page#b" are not crawled as separate pages.
                absolute = urljoin(url, a["href"]).split("#", 1)[0]
                anchor_text = a.get_text(" ", strip=True)

                if (urlparse(absolute).netloc != domain or
                        absolute in visited or
                        absolute in seen_on_page or
                        not anchor_text or anchor_text.isspace()):
                    continue

                # Record each target once per page, however often it is linked.
                seen_on_page.add(absolute)
                children.append(absolute)

                # Create relationship in graph
                self.graph_db.create_relationship(url, absolute, "LINKS_TO")

                queue.append(absolute)

            content_node.relationships = children
            crawl_tree[url] = content_node

        return crawl_tree

    def create_retrieval_system(self, persist_directory="./chroma_db"):
        """Create the retrieval system using LangChain.

        Returns whatever ``self.retriever.create_vectorstore`` returns for the
        content nodes collected so far.
        """
        return self.retriever.create_vectorstore(
            self.content_nodes,
            persist_directory
        )

    def search(self, query: str, k: int = 5) -> List["Document"]:
        """Search the crawled content."""
        return self.retriever.similarity_search(query, k)

    def close(self):
        """Clean up resources"""
        self.graph_db.close()




In [None]:
# Usage example
if __name__ == "__main__":
    # Configuration: connection settings come from the environment when set,
    # so real credentials never have to be written into the notebook. The
    # fallbacks are the local-development defaults.
    neo4j_config = {
        "uri": os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
        "user": os.environ.get("NEO4J_USER", "neo4j"),
        "password": os.environ.get("NEO4J_PASSWORD", "password")
    }

    # Initialize crawler
    crawler = EnhancedWebCrawler(
        use_gpu=True,  # Set to False if no GPU
        neo4j_config=neo4j_config
    )

    try:
        # Crawl website
        site_map = crawler.crawl_site_tree("https://example.com", max_pages=10)

        # Create retrieval system
        vectorstore = crawler.create_retrieval_system()

        # Example search
        results = crawler.search("machine learning algorithms", k=3)
        for result in results:
            print(f"URL: {result.metadata['url']}")
            print(f"Content: {result.page_content[:200]}...")
            print("-" * 50)
    finally:
        # Clean up even when crawling, indexing or searching raised.
        crawler.close()