# Focused Web crawling and anchor text extraction

The task is to build a simple focused web crawler that crawls pages from a given website (i.e., without leaving a given domain) and extracts anchor text from each of the pages discovered.

In [1]:
DOMAIN = "wikipedia.org"  # this is the site we want to crawl; we don't follow links outside this domain
START = "https://en.wikipedia.org/wiki/Stavanger"  # we start crawling from this URL

### This utility function crawls a given URL and extracts all links from it.

In [2]:
from bs4 import BeautifulSoup
import urllib.request

In [3]:
def extract_links(url):
    resp = urllib.request.urlopen(url)
    soup = BeautifulSoup(resp, "lxml", from_encoding=resp.info().get_param('charset'))
    links = []
    for link in soup.find_all('a', href=True):
        links.append((link['href'], link.text))
        #print(link['href'], link.text)
    return links

## Crawler main

In [4]:
import time
from urllib.parse import urljoin
import pprint

Utility function for saving anchor text

In [5]:
def save_anchor_text(link_url, link_text, atext):
    if DOMAIN not in link_url:  # we collect anchor text only for in-domain pages
        return
    if link_url not in atext:
        atext[link_url] = []
    atext[link_url].append(link_text)    

Utility function for adding URL to queue (if it's not there and hasn't been visited already)

In [6]:
def add_to_queue(url, visited, queue):
    if (url not in visited) and (url not in queue):
        queue.append(url)

In [7]:
def crawl(start_url, domain):
    visited = set()  # set of URLs visited so far
    atext = {}  # anchor text extracted for URLs, i.e., the key is the URL the value is a list of anchor texts
    queue = [start_url]
    
    while len(queue) > 0:
        url = queue.pop(0)
        print("Crawling {} ...".format(url))
        links = extract_links(url)
        visited.add(url)
        for link_url, link_text in links:
            if link_url.startswith("/"):  # relative link
                # resolve link
                link_url = urljoin(url, link_url)
                # save anchor text
                save_anchor_text(link_url, link_text, atext)
                # add URL to queue 
                add_to_queue(link_url, visited, queue)       
                
            elif link_url.startswith("http"):  # absolute link
                if DOMAIN in link_url:  # a naive way of checking if it's within domain link
                    # save anchor text
                    save_anchor_text(link_url, link_text, atext)
                    # add URL to queue 
                    add_to_queue(link_url, visited, queue)       
            else:
                # we don't care about other types of links for now
                pass
        
        # stop after 10 pages visited
        if len(visited) > 10:
            break
        
        time.sleep(1)  # wait 1 sec before moving to next page
     
    # uncomment this line to see the collected anchor text
    #pprint.pprint(atext)

### Start crawling

In [8]:
crawl(START, DOMAIN)

Crawling https://en.wikipedia.org/wiki/Stavanger ...
Crawling https://en.wikipedia.org/wiki/File:Stavangercollage01.jpg ...
Crawling https://en.wikipedia.org/wiki/File:Stavanger_komm.svg ...
Crawling https://en.wikipedia.org/wiki/File:Norway_Rogaland_location_map.svg ...
Crawling https://en.wikipedia.org/wiki/File:Norway_location_map.svg ...
Crawling https://tools.wmflabs.org/geohack/geohack.php?pagename=Stavanger&params=58_57_48_N_5_43_8_E_region:NO_type:city(130426) ...
Crawling https://en.wikipedia.org/wiki/Geographic_coordinate_system ...
Crawling https://en.wikipedia.org/wiki/Country ...
Crawling https://en.wikipedia.org/wiki/Norway ...
Crawling https://en.wikipedia.org/wiki/List_of_municipalities_of_Norway ...
Crawling https://en.wikipedia.org/wiki/Counties_of_Norway ...
