In [1]:
!pip install requests
!pip install bs4
!pip install emoji
!pip install python-docx



In [4]:
import heapq
import json
import time
from collections import deque
from urllib.parse import urljoin

import emoji
import requests
from bs4 import BeautifulSoup
from docx import Document

In [5]:
# Root URL of the site to crawl: passed to both crawler classes as their
# starting page.
DOMAIN = "https://webhopper-client.vercel.app"

In [8]:
class WebHopperCrawler:
    """Crawl every same-site page reachable from a start URL and export its text.

    For each visited page the text of the ``id="title"`` and ``id="text"``
    elements is stored in ``self.paragraphs`` (title -> list of texts).
    ``execute`` performs the crawl twice, depth-first and then breadth-first,
    and writes one ``.docx`` file per traversal.

    Args:
        url: Start URL. Links are only followed while they stay under it.
        timeout: Seconds to wait for each HTTP response before giving up.
    """

    def __init__(self, url, timeout=5):
        self.domain = url
        self.timeout = timeout
        self.paragraphs = {}
        self.visited = set()

    def paragraph_extractor(self, soup):
        """Record the page's title/text in ``self.paragraphs`` and return the title.

        Args:
            soup: Parsed page exposing ``find(id=...)``.

        Returns:
            The title text, or None when the page has no ``id="title"`` element.
            A missing ``id="text"`` element is stored as None.
        """
        title_node = soup.find(id='title')
        text_node = soup.find(id='text')

        title = title_node.get_text() if title_node is not None else None
        text = text_node.get_text() if text_node is not None else None

        self.paragraphs.setdefault(title, []).append(text)
        return title

    def _fetch_soup(self, url):
        """Download and parse ``url``; log and return None if the request fails."""
        try:
            response = requests.get(url, timeout=self.timeout)
        except requests.RequestException as error:
            print(f"Failed to fetch {url}: {error}")
            return None
        return BeautifulSoup(response.content, 'html.parser')

    def _extract_links(self, soup, base_url):
        """Return the page's same-site links as absolute URLs, in document order."""
        links = []
        for anchor in soup.find_all('a', href=True):
            try:
                # Resolve against the page the link was found on so relative,
                # root-relative and absolute hrefs are all handled correctly.
                absolute = urljoin(base_url, anchor.get('href'))
            except ValueError:
                # Malformed href (e.g. a broken IPv6 literal): skip this link.
                continue
            # Stay on the site being crawled; this also drops mailto:/javascript:.
            if absolute.startswith(self.domain):
                links.append(absolute)
        return links

    def _visit(self, url):
        """Fetch one page, record its content and return its outgoing links.

        Returns an empty list when the page could not be downloaded.
        """
        soup = self._fetch_soup(url)
        if soup is None:
            return []
        title = self.paragraph_extractor(soup)
        # The literal already contains the emoji, so no emojize() call is needed.
        print("✅ crawling completed: ", title, url)
        return self._extract_links(soup, url)

    def DFS_crawler(self, url):
        """Depth First Search Algorithm.

        Uses an explicit stack instead of recursion so that long link chains
        cannot exceed Python's recursion limit. Children are pushed in reverse
        so pages are visited in the same pre-order a recursive walk would use.
        """
        stack = [url]
        while stack:
            current = stack.pop()
            if current in self.visited:
                continue
            self.visited.add(current)
            stack.extend(reversed(self._visit(current)))

    def BFS_crawler(self, url):
        """Breadth First Search Algorithm."""
        queue = deque([url])
        while queue:
            current = queue.popleft()
            if current in self.visited:
                continue
            self.visited.add(current)
            for next_url in self._visit(current):
                if next_url not in self.visited:
                    queue.append(next_url)

    def document_generator(self, algoname):
        """Write the collected pages to ``webhopper_<algoname>.docx``.

        Every title becomes a heading followed by each distinct, non-empty text
        collected for it, in the order the pages were crawled.
        """
        doc = Document()
        for title, texts in self.paragraphs.items():
            doc.add_heading(title or "")
            # dict.fromkeys keeps first-seen order while dropping exact repeats
            # (e.g. the same page reached through two URL spellings).
            for text in dict.fromkeys(texts):
                if text:
                    doc.add_paragraph(text)
        doc.save(f"webhopper_{algoname}.docx")

    def reset(self):
        """Reset the state of the class."""
        self.paragraphs = {}
        self.visited = set()

    def _run_and_save(self, label, crawl, algoname):
        """Run one traversal from the start URL, save its document, print timing."""
        print(label)
        start_time = time.time()
        crawl(self.domain)
        print(f"{algoname} Completed, Saving document...")
        self.document_generator(algoname)
        end_time = time.time()
        print(f"Time elasped: {end_time - start_time}s")

    def execute(self):
        """Execute the program step by step: DFS crawl, then BFS crawl."""
        print("Starting webhopper...")

        # Start clean so execute() can be called more than once on an instance.
        self.reset()
        self._run_and_save("Depth First Search (DFS)", self.DFS_crawler, "DFS")

        self.reset()
        self._run_and_save("Breadth First Search (BFS)", self.BFS_crawler, "BFS")

        print("Webhopper Completed!")
        

In [12]:
# Crawl the site from its root with both traversals; execute() prints progress
# and saves one document per traversal.
webhopper = WebHopperCrawler(DOMAIN)
webhopper.execute()

Starting webhopper...
Depth First Search (DFS)
✅ crawling completed:  WebHopper | Articles https://webhopper-client.vercel.app
✅ crawling completed:  Data Structure https://webhopper-client.vercel.app/articles/data-structures
✅ crawling completed:  Computer Science https://webhopper-client.vercel.app/articles/computer-science
✅ crawling completed:  Computation https://webhopper-client.vercel.app/articles/computation
✅ crawling completed:  Algorithm https://webhopper-client.vercel.app/articles/algorithm
✅ crawling completed:  Computer https://webhopper-client.vercel.app/articles/computer
✅ crawling completed:  Information https://webhopper-client.vercel.app/articles/information
✅ crawling completed:  Decision Making https://webhopper-client.vercel.app/articles/decision-making
✅ crawling completed:  Automation https://webhopper-client.vercel.app/articles/automation
✅ crawling completed:  Human Intervention https://webhopper-client.vercel.app/articles/human-intervention
✅ crawling complet

In [11]:
class WebHopperSearch:
    """Best-first (A*-style) crawl that looks for a page mentioning a keyword.

    Pages are expanded in order of ``depth + heuristic(url)``, so URLs that
    contain the keyword are tried before other pages at a similar depth.

    Args:
        start_url: Page to start from; only links under this URL are followed.
        goal_keyword: Phrase searched for (case-insensitively) in each page's
            ``id="title"`` and ``id="text"`` elements.
        max_depth: Pages at this link distance from the start are still
            checked, but their links are not followed.
    """

    # Seconds to wait for each HTTP response.
    REQUEST_TIMEOUT = 5

    def __init__(self, start_url, goal_keyword, max_depth=5):
        self.start_url = start_url
        self.goal_keyword = goal_keyword
        self.max_depth = max_depth
        self.visited = set()

    def heuristic(self, url):
        """Heuristic function: prioritize URLs with the goal keyword.

        Matching is case-insensitive and also tries the hyphenated form of the
        keyword, because URL slugs use hyphens where the phrase has spaces.

        Returns:
            Minus the number of keyword occurrences in ``url`` (0 when the
            keyword is empty), so that better candidates sort first.
        """
        keyword = self.goal_keyword.lower()
        if not keyword:
            return 0
        haystack = url.lower()
        # A set, so a keyword without spaces is not counted twice.
        variants = {keyword, keyword.replace(" ", "-")}
        return -sum(haystack.count(variant) for variant in variants)

    def _fetch_soup(self, url):
        """Download and parse ``url``; log and return None if the request fails."""
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as e:
            print(f"Failed to fetch links from {url}: {e}")
            return None
        return BeautifulSoup(response.content, 'html.parser')

    def _links_in(self, soup, url):
        """Return the distinct links under ``start_url``, in document order."""
        links = {}
        for anchor in soup.find_all('a', href=True):
            try:
                absolute = urljoin(url, anchor['href'])
            except ValueError:
                # Malformed href: skip this link rather than the whole page.
                continue
            if absolute.startswith(self.start_url):
                links[absolute] = None  # dict keeps first-seen order
        return list(links)

    def _page_matches(self, soup):
        """Return True if the keyword occurs in the page's title or text element."""
        keyword = self.goal_keyword.lower()
        for element_id in ('title', 'text'):
            node = soup.find(id=element_id)
            # Check whichever elements exist; a page with only one of them can
            # still be a match.
            if node is not None and keyword in node.get_text().lower():
                return True
        return False

    def fetch_links(self, url):
        """Fetch all hyperlinks under ``start_url`` from a given URL.

        Returns:
            A set of absolute URLs; empty if the page could not be downloaded.
        """
        soup = self._fetch_soup(url)
        if soup is None:
            return set()
        return set(self._links_in(soup, url))

    def find_match(self, url):
        """Find the goal on the page.

        Returns:
            True if the keyword occurs in the page's title or text, False
            otherwise (including when the page could not be downloaded).
        """
        soup = self._fetch_soup(url)
        if soup is None:
            return False
        return self._page_matches(soup)

    def crawl(self):
        """A* algorithm for crawling and searching.

        Each page is downloaded once and used both for the goal check and for
        link discovery.

        Returns:
            The URL of the first page that contains the keyword, or None if no
            such page is found within ``max_depth``.
        """
        # Fresh state so the search can be re-run on the same instance.
        self.visited = set()
        g_scores = {self.start_url: 0}

        # Heap entries: (g + h, insertion number, url, g). The insertion number
        # breaks ties in discovery order and keeps URLs from being compared.
        pushes = 0
        open_set = [(self.heuristic(self.start_url), pushes, self.start_url, 0)]

        while open_set:
            _, _, current_url, current_cost = heapq.heappop(open_set)

            # A URL can be queued more than once; only the first pop counts.
            if current_url in self.visited:
                continue
            self.visited.add(current_url)
            print(f"Visiting: {current_url}")

            soup = self._fetch_soup(current_url)
            if soup is None:
                continue

            # Check if goal is found
            if self._page_matches(soup):
                print(f"Goal found: {current_url}")
                return current_url

            # Fetch and process neighbors
            if current_cost < self.max_depth:
                tentative_g_score = current_cost + 1
                for neighbor in self._links_in(soup, current_url):
                    if neighbor not in g_scores or tentative_g_score < g_scores[neighbor]:
                        g_scores[neighbor] = tentative_g_score
                        pushes += 1
                        priority = tentative_g_score + self.heuristic(neighbor)
                        heapq.heappush(
                            open_set,
                            (priority, pushes, neighbor, tentative_g_score),
                        )

        print("Goal not found within max depth.")
        return None


In [12]:
# Search outward from the site root for the first page that mentions the phrase.
goal_keyword = "human intervention"
webhopper_search = WebHopperSearch(start_url=DOMAIN, goal_keyword=goal_keyword)
webhopper_search.crawl()

Visiting: https://webhopper-client.vercel.app
Visiting: https://webhopper-client.vercel.app/articles/data-structures
Visiting: https://webhopper-client.vercel.app/articles/data-structures/types
Visiting: https://webhopper-client.vercel.app/articles/algebraic-structure
Visiting: https://webhopper-client.vercel.app/articles/function
Visiting: https://webhopper-client.vercel.app/articles/computer-science
Visiting: https://webhopper-client.vercel.app/articles/array
Visiting: https://webhopper-client.vercel.app/articles/hash-table
Visiting: https://webhopper-client.vercel.app/articles/trees
Visiting: https://webhopper-client.vercel.app/articles/graphs
Visiting: https://webhopper-client.vercel.app/articles/record
Visiting: https://webhopper-client.vercel.app/articles/linked-list
Visiting: https://webhopper-client.vercel.app/articles/stack-queue
Visiting: https://webhopper-client.vercel.app/articles/information
Visiting: https://webhopper-client.vercel.app/articles/automation
Goal found: http