# Focused Web crawling and anchor text extraction

The task is to build a simple focused web crawler that crawls pages from a given website (i.e., without leaving a given domain) and extracts anchor text from each of the pages discovered.

In [8]:
DOMAIN = "wikipedia.org"  # this is the site we want to crawl; we don't follow links outside this domain
START = "https://en.wikipedia.org/wiki/Stavanger"  # we start crawling from this URL

### This utility function crawls a given URL and extracts all links from it.

In [4]:
from bs4 import BeautifulSoup
import urllib.request

In [22]:
def extract_links(url):
    resp = urllib.request.urlopen(url)
    soup = BeautifulSoup(resp, "lxml", from_encoding=resp.info().get_param('charset'))
    links = []
    for link in soup.find_all('a', href=True):
        links.append((link['href'], link.text))
        #print(link['href'], link.text)
    return links

## Crawler main

In [23]:
import time

In [24]:
def crawl(start_url, domain):
    visited = set()  # set of URLs visited so far
    atext = {}  # anchor text extracted for URLs, i.e., the key is the URL the value is a list of anchor texts
    queue = [start_url]
    
    while len(queue) > 0:
        url = queue.pop(0)
        print("Crawling {} ...".format(url))
        links = extract_links(url)
        # TODO for each link
        # - add anchor text to the page that is pointed by the link
        # - add link to queue if 
        #   - (i) it is within the target domain 
        #   - (ii) it hasn't been visited yet
        
        time.sleep(1)  # wait 1 sec before moving to next page

### Start crawling

In [25]:
crawl(START, DOMAIN)

Crawling https://en.wikipedia.org/wiki/Stavanger ...
