In [1]:
from BeautifulSoup import BeautifulSoup
from collections import deque
import logging
import numpy as np
import pandas as pd
import re
import urllib2
import os

In [2]:
host = 'https://en.wikipedia.org'
url_path = '/wiki/Data_science'
MAX_DEGREE = 5

# logs
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.FileHandler('scrap.log')
handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)

In [3]:
def clean_url(url):
    return re.sub('[^\w]+', '', url)

def get_page(url):
    page = None
    file_path = 'cache/{url}'.format(url=clean_url(url))
    if not os.path.isfile(file_path):
        with open(file_path, 'w') as f:
            page = urllib2.urlopen('{host}{url_path}'.format(host=host, url_path=url_path)).read()
            f.write(page)
    with open(file_path, 'r') as f:
        page = f.read()
    return page


def get_links(url):
    page = get_page(url)
    soup = BeautifulSoup(page)
    soup.prettify()
    return [anchor['href'] 
            for anchor in soup.findAll('a', href=True) 
            if re.match(r'^/wiki/\w+$', anchor['href'])]

In [4]:
data = []
visited_urls = []
target_urls = deque([(url_path, 0)])

In [5]:
while target_urls:
    url, degree = target_urls.pop()    
    if degree <= MAX_DEGREE:
        logger.info('Degree: {degree}, URL: {url}'.format(degree=degree, url=url))
        visited_urls.append(url)
        for link in get_links('{host}{url}'.format(host=host, url=url)):
            data.append((url, link, degree))
            if link not in visited_urls and link not in [i for i, _ in target_urls]:
                target_urls.append((link, degree+1))

In [6]:
pd.DataFrame(data)

Unnamed: 0,0,1,2
0,/wiki/Data_science,/wiki/Information_science,0
1,/wiki/Data_science,/wiki/Statistics,0
2,/wiki/Data_science,/wiki/Data_visualization,0
3,/wiki/Data_science,/wiki/Exploratory_data_analysis,0
4,/wiki/Data_science,/wiki/Information_design,0
5,/wiki/Data_science,/wiki/Interactive_data_visualization,0
6,/wiki/Data_science,/wiki/Descriptive_statistics,0
7,/wiki/Data_science,/wiki/Statistical_inference,0
8,/wiki/Data_science,/wiki/Statistical_graphics,0
9,/wiki/Data_science,/wiki/Data_analysis,0
