# Exploring Crawling

In [None]:
# URL = 'https://realpython.com/headless-selenium-testing-with-python-and-phantomjs/'

In [140]:
URL = 'https://site-hossain.web.app'

## Libs for processing data.

In [141]:
import selenium.webdriver
import selenium as se
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

In [142]:
from collections import defaultdict
from abc import ABCMeta
from abc import abstractmethod
from typing import Iterable
from copy import deepcopy as dc
import itertools
import time

In [143]:
class BaseCrawler():
    """
    A Crawler wrapper of selenium Firefox WebDriver.
    """
    def __init__(self, url='', headless=True):
        self.driver = None
        self.url = url
        self.headless = headless
        self.setup()
        pass

    def setup(self, options=None):
        if not options:
            options = selenium.webdriver.ChromeOptions()
            if self.headless:
                options.add_argument('headless')
        self.driver = selenium.webdriver.Chrome(options=options)
        self.driver.implicitly_wait(10)
        self.driver.get(self.url)
    
    def get(self, url=''):
        if url:
            self.url = url
        self.driver.get(self.url)
        return self.driver
        
    def find_element(self, tag, attributes=[]):
        all_elements = self.driver.find_elements_by_tag_name(tag)
#         all_elements = WebDriverWait(self.driver, 5).until(EC.visibility_of_all_elements_located((By.TAG_NAME, tag)))
        it1, it2 = itertools.tee(all_elements, 2)
        yield from zip(it1, map(lambda x: {attr: x.get_attribute(attr) for attr in attributes}, it2))
    
    def find_anchors(self, *args, **kwargs):
        yield from self.find_element(tag='a', *args, **kwargs)

    def find_images(self, *args, **kwargs):
        yield from self.find_element(tag='img', *args, **kwargs)

    def find_all_text(self):
        for _doms, _dict in self.find_element(tag='body'):
            yield _doms.text
    
    def driver(self):
        return self.driver
    
    def __str__(self):
        return f'Chrome crawler for: {self.url} \n Initialized: \t {self.driver is not None}'

    def done(self):
        self.driver.close()
        self.driver = None

### Get links from an html document.

# Visualization stuff

In [144]:
import pandas as pd

# Graph stuff

In [145]:
class Node(metaclass=ABCMeta):
    def __init__(self):
        pass
        
    @abstractmethod
    def value(self):
        return

    @abstractmethod
    def label(self):
        return
    
    @abstractmethod
    def data(self):
        return
    
    def __hash__(self):
        return hash(self.value())
    
    def __str__(self):
        return str(self.label())

In [146]:
class Edge(metaclass=ABCMeta):
    def __init__(self, start: Node, end: Node):
        self._start = start
        self._end = end
        pass

    def start(self):
        return self._start
    
    def end(self):
        return self._end
    
    @abstractmethod
    def value(self):
        return

    @abstractmethod
    def label(self):
        return
    
    @abstractmethod
    def weight(self):
        return
    
    
    
    def __hash__(self):
        return hash(self.value())

    def __str__(self):
        return str(self.label())

In [181]:
class SearchParams:
    def __init__(self, **kwargs):
        _not_allowed_keys = set()
        self.__dict__.update((k, v) for k, v in kwargs.items() if k not in _not_allowed_keys)
        pass
    
    def __str__(self):
        return self.__dict__.__str__()

In [182]:
from math import inf


class Graph(metaclass=ABCMeta):
    def __init__(self, name='', directed=True):
        self.adjacencies = dict()
        self.nodes = set()
        self.directed = directed
        self.edges = defaultdict(set)
        pass
    
    def neighbors(self, node: Node):
        if node in self.adjacencies:
            yield from self.adjacencies[node]
    
    def add_node(self, node: Node):
        if node not in self.nodes:
            self.nodes |= {node}
            self.adjacencies[node] = set()
    
    def add_nodes(self, nodes: Iterable[Node]):
        for node in nodes:
            self.add_node(node)
        pass
    
    def add_edge(self, start: Node, end: Node, *args, **kwargs):
        edge = GraphEdge(start, end, *args, **kwargs)
        
        self.adjacencies[start] |= {(end, edge)}
        self.edges[edge] |= {(start, end)}
        
        if not self.directed:
            self.adjacencies[end] |= {(start, edge)}
    
    def add_edges(self, edges: Iterable[Edge]):
        for start, end, *rest in edges:
            self.add_edge(start, end, *rest)
    
    def bfs_explore(self, start: Node, **search_params: SearchParams):
        _max_depth = search_params.get('max_depth', 5)
        _max_nodes_bound = search_params.get('max_nodes_bound', 1000)
        _max_edges_bound = search_params.get('max_edges_bound', 1000)
            
        _queue = [(start, None)]
        _visited_nodes = set()
        _visited_edges = set()
        _distances = defaultdict(lambda:inf)
        _distances[start] = 0
        
        def rest_of_bounds():
            return all(
                (
                    len(_visited_nodes) <= _max_nodes_bound,
                    len(_visited_edges) <= _max_edges_bound
                )
            )
        
        while _queue and rest_of_bounds():
            current, parent = _queue.pop(0)
            _visited_nodes |= {current}

            for nbr, edge in self.neighbors(current):
                if nbr not in _visited_nodes and edge not in _visited_edges:
                    if _distances[nbr] >= _max_depth:
                        continue
                    elif _distances[current] + 1 < _distances[nbr]:
                        _distances[nbr] = _distances[current] + 1
                        _visited_edges |= {edge}
                        _queue.append((nbr, current))
        for x in _distances:
            print(x, _distances[x])
        return _distances
        
    def __getitem__(self, key):
        if key in self.adjacencies:
            return self.adjacencies[key]
        raise Exception(f'No node of key: {key}')
    
    def __iter__(self):
        return iter(self.adjacencies)
    
    def __len__(self):
        return len(self.adjacencies)
    
    def adj(self):
        return self.adjacencies
    
    def size(self):
        return len(self.edges)
    
    def __str__(self):
        return str(self.adjacencies.keys())
    

In [183]:
params = SearchParams(_max_depth=20, _max_nodes_bound=300, _max_edges_bound=300)

In [184]:
class GraphNode(Node):
    def __init__(
        self,
        dom_node: se.webdriver.remote.webelement.WebElement,
        value=None,
        label=None,
        data=None
    ):
        super().__init__()
        
        self._value = value
        self._label = label
        self._data = data
        self._node_id = None
        self._underlying_dom_node = None
        
        if dom_node is None:
            return
        
        self._underlying_dom_node = dom_node
        self._node_id = dom_node.id


        
        if dom_node.tag_name == 'a':
            self._value = dom_node.get_attribute('href')
            temp = dom_node.get_attribute('text')
            if temp.strip() != '':
                self._label = temp
            else:
                self._label = 'unlabelled/link'
            
        elif dom_node.tag_name == 'img':
            self._value = dom_node.get_attribute('src')
            temp = dom_node.get_attribute('alt')
            if temp.strip() != '':
                self._label = temp
            else:
                self._label = 'no-alt/image'
        else:
            pass
    
    def value(self):
        return self._value
    
    def label(self):
        return self._label
    
    def data(self):
        return self._data
    
    def get_underlying_dom_node(self):
        return self._underlying_dom_node
    
    def get_node_id(self):
        return self._node_id
    
    def __str__(self):
        return str(self.label())

In [185]:
class GraphEdge(Edge):
    def __init__(
        self,
        start: GraphNode,
        end: GraphNode,
        weight: float = 1.0,
        value: str = '',
        label: str = ''
    ):    
        super().__init__(start=start, end=end)
        
        self._label = label if label else f'({str(start)} ==> {str(end)}, weight: {weight})'
        self._value = value if value else self._label
        self._weight = weight
        self._start = start
        self._end = end
        self._edge_id = f'{start.get_node_id()}==>{end.get_node_id()}'
    
    def value(self):
        return self._value
    
    def label(self):
        return self._label

    def weight(self):
        return self._weight
    
    def start(self):
        return self._start
    
    def end(self):
        return self._end
    
    def get_edge_id(self):
        return self._edge_id
    
    def __str__(self):
        return f'Label: {self._label}\nValue: {self._value}'
    
    def __hash__(self):
        return hash(self.get_edge_id())
    
        

In [186]:
class DOMNodeGraph(Graph):
    def __init__(self, name='', directed=True):
        self.name = name
        self.directed = directed
        super().__init__(name=name, directed=directed)

    
    def __str__(self):
        return super().__str__()
    

In [187]:
class GraphCrawler(BaseCrawler):
    def __init__(self, url='', name='', **kwargs):
        super().__init__(url=url, **kwargs)
        self.root = None
        self.name = name
        self.cache = dict()
        self.graph = DOMNodeGraph(name=name)
        pass

    def initialize_root(self):
        if not self.root:
            self.root = GraphNode(
                dom_node=None,
                value=self.url,
                label=f'Page: {self.url}',
                data=self.find_all_text()
            )
            self.graph.add_node(self.root)
            self.cache[self.root] = self.root.data()

        for _dom_elem, _dict in self.find_anchors(attributes=["href", "text"]):
            node = GraphNode(dom_node=_dom_elem, data='')
            self.graph.add_node(node)
            self.graph.add_edge(start=self.root, end=node, value=_dict['text'])
        
        pass
    
    def __repr__(self):
        return self.__str__()
    
#     @staticmethod
#     def use_neighbors(graph):
        
#         print(graph)
#         return lambda x: [(None, None)]
    
    def __str__(self):
        return f'GraphCrawler: {self.name}, rooted at: {self.url}.\nMy cache is: {str(self.cache)}'

In [None]:
gc = GraphCrawler(url=URL, name='Crawley')

In [None]:
gc.initialize_root()

In [None]:
len(gc.graph)

In [179]:
for nbr, wt in gc.graph.neighbors(gc.root):
    print(nbr.value())

https://site-hossain.web.app/contact#contact
https://site-hossain.web.app/contact#contact
https://site-hossain.web.app/students#students
https://site-hossain.web.app/students#students
https://www.researchgate.com/
https://www.twitter.com/
https://site-hossain.web.app/teaching#teaching
https://site-hossain.web.app/#home
https://site-hossain.web.app/research#funding
https://www.linkedin.com/
https://site-hossain.web.app/research#interests
https://site-hossain.web.app/research#funding
https://www.googlescholar.com/
https://site-hossain.web.app/publications#publications
https://site-hossain.web.app/#home
https://site-hossain.web.app/publications#publications
https://site-hossain.web.app/teaching#teaching
https://www.researchgate.com/
https://www.linkedin.com/
https://www.googlescholar.com/
https://site-hossain.web.app/research#interests
https://www.twitter.com/
https://www.github.com/
https://www.github.com/
https://site-hossain.web.app/#home


In [180]:
gc.graph.bfs_explore(start=gc.root, use_neighbors=None, search_params=params)

Page: https://site-hossain.web.app 0
     |     Contact inf
Contact inf
Students inf
     |     Students inf
unlabelled/link inf
unlabelled/link inf
Teaching inf
     |     Home inf
     |     Funding inf
unlabelled/link inf
Research inf
Funding inf
unlabelled/link inf
     |     Publications inf
Home inf
Publications inf
     |     Teaching inf
unlabelled/link inf
unlabelled/link inf
unlabelled/link inf
     |     Research inf
unlabelled/link inf
unlabelled/link inf
unlabelled/link inf
DoeD inf


defaultdict(<function __main__.Graph.bfs_explore.<locals>.<lambda>()>,
            {<__main__.GraphNode at 0x7f92209e9cd0>: 0,
             <__main__.GraphNode at 0x7f92209d7e50>: inf,
             <__main__.GraphNode at 0x7f92209c9ee0>: inf,
             <__main__.GraphNode at 0x7f92209c9820>: inf,
             <__main__.GraphNode at 0x7f92209d7fd0>: inf,
             <__main__.GraphNode at 0x7f92209d7f70>: inf,
             <__main__.GraphNode at 0x7f92209d72b0>: inf,
             <__main__.GraphNode at 0x7f92209c99d0>: inf,
             <__main__.GraphNode at 0x7f92209d79d0>: inf,
             <__main__.GraphNode at 0x7f92209d7fa0>: inf,
             <__main__.GraphNode at 0x7f92209d7340>: inf,
             <__main__.GraphNode at 0x7f92209c9190>: inf,
             <__main__.GraphNode at 0x7f92209c9f70>: inf,
             <__main__.GraphNode at 0x7f92209c9790>: inf,
             <__main__.GraphNode at 0x7f92209d7bb0>: inf,
             <__main__.GraphNode at 0x7f92209c9e80>: inf,
   

In [None]:
# params = SearchParams(_max_depth=2, _max_nodes_bound=3, _max_edges_bound=3)

In [None]:
# print(gc)

In [None]:
# for x in geb:
#     print(x)

In [None]:
# help(gc)