# Exploring Crawling

In [1]:
# URL = 'https://realpython.com/headless-selenium-testing-with-python-and-phantomjs/'

In [2]:
URL = 'https://site-hossain.web.app'

## Libs for processing data.

In [3]:
import selenium.webdriver
import selenium as se

In [4]:
from collections import defaultdict
from abc import ABCMeta
from abc import abstractmethod
from typing import Iterable
from copy import deepcopy as dc

In [5]:
class BaseCrawler():
    """
    A Crawler wrapper of selenium Chrome WebDriver.
    """
    def __init__(self, url=''):
        self.driver = None
        self.url = url
        pass

    def setup(self, options=None):
        if not options:
            options = selenium.webdriver.ChromeOptions()
            options.add_argument('headless')
        self.driver = selenium.webdriver.Chrome(options=options)
    
    def get(self, url=''):
        self._lazy_init()
        if url:
            self.url = url
        return self.driver.get(self.url)
    
    def _lazy_init(self, options=None):
        if not self.driver:
            self.setup(options)
        
    def find_element(self, tag, attributes=[], keep_dom=False):
        self._lazy_init()
        all_elements = self.driver.find_elements_by_tag_name(tag)
        if keep_dom or not attributes:
            yield from all_elements
        else:   
            yield from map(lambda x: tuple(x.get_attribute(attr) for attr in attributes), all_elements)
    
    def find_anchors(self, *args, **kwargs):
        yield from self.find_element(tag='a', *args, **kwargs)

    def find_images(self, *args, **kwargs):
        yield from self.find_element(tag='img', *args, **kwargs)

    def find_all_text(self):
        yield from map(lambda x: x.text, self.find_element(tag='body', keep_dom=True))
    
    def driver(self):
        return self.driver
    
    def __str__(self):
        return f'Chrome crawler for: {self.url} \n Initialized: \t {self.url is not None}'

    def done(self):
        self.driver.close()
        self.driver = None

### Get links from an html document.

# Visualization stuff

In [6]:
import pandas as pd

# Graph stuff

In [7]:
class Edge(metaclass=ABCMeta):
    def __init__(self):
        pass

    @abstractmethod
    def value(self):
        return

    @abstractmethod
    def label(self):
        return
    
    @abstractmethod
    def weight(self):
        return
    
    def __hash__(self):
        return hash(self.value())

    def __str__(self):
        return str(self.label())

In [8]:
class Node(metaclass=ABCMeta):
    def __init__(self):
        pass
        
    @abstractmethod
    def value(self):
        return

    @abstractmethod
    def label(self):
        return
    
    @abstractmethod
    def data(self):
        return
    
    def __hash__(self):
        return hash(self.value())
    
    def __str__(self):
        return str(self.label())

In [9]:
class SearchParams:
    def __init__(self, **kwargs):
        _not_allowed_keys = set()
        self.__dict__.update((k, v) for k, v in kwargs.items() if k not in _not_allowed_keys)
        pass
    
    def __str__(self):
        return self.__dict__.__str__()

In [10]:
from math import inf
from queue import PriorityQueue


class Graph(metaclass=ABCMeta):
    def __init__(self, name='', directed=True):
        self.adjacencies = dict()
        self.nodes = set()
        self.directed = directed
        self.edges = defaultdict(dict)
        pass
    
    def neighbors(self, node: Node):
        if node in self.adjacencies:
            yield from self.adjacencies[node]
    
    def add_node(self, node: Node):
        if node not in self.nodes:
            self.nodes |= {node}
            self.adjacencies[node] = set()
#         else:
#             raise Exception(f'Node {str(node)} already exists.')
    
    def add_nodes(self, nodes: Iterable[Node]):
        for node in nodes:
            self.add_node(node)
        pass
    
    def add_edge(self, start: Node, end: Node, *args, **kwargs):
        _wt = kwargs.get('weight', 1.0)
        edge = GraphEdge(start, end, *args, **kwargs)
        
        self.adjacencies[start] |= {(end, _wt)}
        self.edges[edge] |= {(start, end, _wt)}
        
        if not self.directed:
            self.adjacencies[end] |= {(start, _wt)}
    
    def add_edges(self, edges: Iterable[Edge]):
        for start, end, *rest in edges:
            self.add_edge(start, end, *rest)
    
    def bfs_explore(self, start: Node, use_neighbors=None, **search_params: SearchParams):
        _max_depth = search_params.get('max_depth', 5)
        _max_nodes_bound = search_params.get('max_nodes_bound', 10)
        _max_edges_bound = search_params.get('max_edges_bound', 100)
        
        if use_neighbors:
            neighbor_function = use_neighbors(self)
        else:
            neighbor_function = self.neighbors
        _queue = PriorityQueue()
        _visited_nodes = set()
        _visited_edges = set()
        _distances = defaultdict(lambda:defaultdict(lambda:(inf, 0)))

        _distances[start][start] = (0, 0)
        
        _queue.put((0, start, None))
        
        while _queue.qsize() and len(_visited_nodes) <= _max_nodes_bound and len(_visited_edges) <= _max_edges_bound:
            weight, current, parent = _queue.get()
            _visited_nodes |= {current}
            
            for nbr, edge in neighbor_function(current):
                if nbr not in _visited_nodes and edge not in _visited_edges:
                    wt, depth = _distances[start][nbr]
                    wt_prev, depth_prev = _distances[start][current]
                    if depth_prev >= _max_depth:
                        continue
                    if wt_prev + edge.weight() < wt:
                        _distances[start][nbr] = (wt_prev + edge.weight(), depth_prev + 1)
                    _visited_edges |= {edge}
                    _queue.put((_distances[start][nbr], nbr, current))
        return _distances
        
    def __getitem__(self, key):
        if key in self.adjacencies:
            return self.adjacencies[key]
        raise Exception(f'No node of key: {key}')
    
    def __iter__(self):
        return iter(self.adjacencies)
    
    def __len__(self):
        return len(self.adjacencies)
    
    def adj(self):
        return self.adjacencies
    
    def size(self):
        return len(self.edges)
    
    def __str__(self):
        return str(self.adjacencies.keys())
    

In [11]:
params = SearchParams(_max_depth=2, _max_nodes_bound=3, _max_edges_bound=3)

In [12]:
class GraphNode(Node):
    def __init__(self, dom_node: se.webdriver.remote.webelement.WebElement, data=None):
        super().__init__()
        
        self._value = None
        self._label = None
        self._data = data
        self._underlying_dom_node = dom_node
        self._node_id = dom_node.id
        
    
        if dom_node.tag_name == 'a':
            self._value = dom_node.get_attribute('href')
            temp = dom_node.get_attribute('text')
            if temp.strip() != '':
                self._label = temp
            else:
                self._label = 'unlabelled/link'
            
        elif dom_node.tag_name == 'img':
            self._value = dom_node.get_attribute('src')
            temp = dom_node.get_attribute('alt')
            if temp.strip() != '':
                self._label = temp
            else:
                self._label = 'no-alt/image'
        else:
            pass
    
    def value(self):
        return self._value
    
    def label(self):
        return self._label
    
    def data(self):
        return self._data
    
    def get_underlying_dom_node(self):
        return self._underlying_dom_node
    
    def get_node_id(self):
        return self._node_id
    
    def __str__(self):
        return str(self._label)

In [13]:
class GraphEdge(Edge):
    def __init__(self, start: GraphNode, end: GraphNode, weight: float = 1.0, label: str = ''):    
        super().__init__()
        
        self._value = f'({str(start)} ==> {str(end)}, weight: {weight})'
        self._label = label if label else self._value
        self._weight = weight
        self._start = start
        self._end = end
        self._edge_id = f'{start.get_node_id()}==>{end.get_node_id()}'
    
    def value(self):
        return self._value
    
    def label(self):
        return self._label

    def weight(self):
        return self._weight
    
    def start(self):
        return self._start
    
    def end(self):
        return self._end
    
    def get_edge_id(self):
        return self._edge_id
    
    def __str__(self):
        return f'Label: {self._label}\nValue: {self._value}'
    
    def __hash__(self):
        return hash(self.get_edge_id())
    
        

In [14]:
class DOMNodeGraph(Graph):
    def __init__(self, name='', directed=True):
        self.name = name
        self.directed = directed
        super().__init__(name=name, directed=directed)

    
    def __str__(self):
        return super().__str__()
    

In [15]:
def make_node(dom_node, data=None):
    return GraphNode(dom_node, data=data)

def make_edge(start, end, *args, **kwargs):
    return GraphEdge(start, end, *args, **kwargs)

In [16]:
class GraphCrawler(BaseCrawler):
    def __init__(self, url='', name=''):
        super().__init__(url=url)
        
        self.name = name
        self.graph = DOMNodeGraph(name=name)
        pass

    def initialize_root(self):
        curr = self.get()
        self.graph.add_node(make_node(dom_node=curr, data=self.find_all_text()))
        print(self.graph)
        _anchors = self.find_anchors(attributes=["href", "text"])
        _anchor_nodes = map(lambda x: make_node(url=x[0], data=''), _anchors)
        self.graph.add_nodes(_anchor_nodes)
        self.graph.add_edges(map(lambda a: make_edge(curr, a), self.find_anchors(keep_dom=True)))
        pass
    
    def __repr__(self):
        return self.__str__()
    
    @staticmethod
    def use_neighbors(graph):
        
        print(graph)
        return lambda x: [(None, None)]
    
    def __str__(self):
        return f'GraphCrawler: {self.name}, rooted at: {self.url}.\nMy cache is: {str(self.cache)}'

In [17]:
gc = GraphCrawler(url=URL, name='Crawley')

In [18]:
gc.initialize_root()

AttributeError: 'NoneType' object has no attribute 'id'

In [None]:
list(map(str, list(gc.graph)))

In [None]:
len(gc.graph)

In [None]:
gc

In [None]:
root = make_node(URL, '\n'.join(gc.find_all_text()))

In [None]:
print(root.data())

In [None]:
gc.graph.bfs_explore(start=root, use_neighbors=GraphCrawler.use_neighbors, search_params=params)

In [None]:
# params = SearchParams(_max_depth=2, _max_nodes_bound=3, _max_edges_bound=3)

In [None]:
# print(gc)

In [None]:
# for x in geb:
#     print(x)

In [None]:
# help(gc)