In [4]:
import requests
from bs4 import BeautifulSoup
from pprint import pprint

In [53]:
url = 'https://en.wikipedia.org/wiki/Fylde_College,_Lancaster'

In [3]:
# Always pass a timeout: without one, requests waits indefinitely if the
# server never answers, which would hang the notebook.
response = requests.get(url, timeout=10)
print(response.status_code)

200


In [125]:
# pprint(response.content.decode())

In [6]:
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed (and to avoid bs4's parser-guessing warning).
soup = BeautifulSoup(response.content, 'html.parser')

# First try at collecting references
refs = soup.find('div', class_='reflist')
# refs

In [11]:
# Second approach
refs_other = soup.find('ol', class_='references')

In [14]:
# Confirm the length matches what I see in the browser
len(refs_other.find_all('li'))

9

In [15]:
refs = refs_other.find_all('li')

In [18]:
refs[0].find_all('a')

[<a href="#cite_ref-1">^</a>,
 <a class="external text" href="http://www.lancaster.ac.uk/colleges/fylde/scr/" rel="nofollow">"SCR - Fylde College"</a>]

In [25]:
# Write a function to perform the same steps 
def collect_refs(url):
    """Scrape reference links from a wikipedia page.

    Args:
        url: Address of the page to download.

    Returns:
        A list of href strings taken from the anchors inside the page's
        first ``<ol class="references">`` element; an empty list when the
        page has no such element; ``None`` when the response status is
        not 200.
    """
    # A timeout keeps the call from hanging forever on an unresponsive server.
    response = requests.get(url, timeout=10)
    if response.status_code != 200:
        return None

    # Explicit parser: avoids bs4 guessing one based on what is installed.
    soup = BeautifulSoup(response.content, 'html.parser')
    references = soup.find('ol', class_='references')
    if references is None:
        # Page has no reference list -- nothing to collect.
        return []

    # Passing a list to class_ matches anchors carrying either class.
    anchors = references.find_all('a', class_=['external', 'text'])

    # Keep only anchors that actually carry a non-empty href.
    return [anchor.get('href') for anchor in anchors if anchor.get('href')]
    

In [27]:
# Testing
collect_refs(url)

['http://www.lancaster.ac.uk/colleges/fylde/scr/',
 'http://www.lancaster.ac.uk/colleges/fylde/about-us/deanery/',
 'https://web.archive.org/web/20110605231852/http://www.lancs.ac.uk/unihistory/growth/fyldecollegelink.htm',
 'http://www.lancs.ac.uk/unihistory/growth/fyldecollegelink.htm',
 'http://www.lancs.ac.uk/users/fylde/allabout/abouthistory.htm',
 'https://www.insidermedia.com/news/north-west/novus-completes-on-lancaster-university-schemes',
 'http://www.civicheraldry.co.uk/lancs_pre74.html',
 'https://web.archive.org/web/20110816224729/http://fylde.lusu.co.uk/fyldesport/',
 'http://fylde.lusu.co.uk/fyldesport/',
 'https://www.lancaster.ac.uk/news/alumni-awards-for-high-flying-lancaster-graduates']

In [37]:
# At this point, we're looking through the paragraphs of the page body
# to find a link to navigate to

paragraphs = soup.find('div', class_='mw-body-content').find_all('p')

In [48]:
# Iterate over 'paragraphs' until finding a link

link = None
# A for-loop stops cleanly at the end of the list; indexing with a manual
# counter would raise IndexError on a page where no paragraph has a link.
for paragraph in paragraphs:
    anchor = paragraph.find('a')
    # Keep looking if the paragraph has no anchor or the anchor has no href
    if anchor and anchor.get('href'):
        link = anchor.get('href')
        break
link

'/wiki/College'

In [49]:
prepend = 'https://en.wikipedia.org'

In [50]:
prepend+link

'https://en.wikipedia.org/wiki/College'

In [51]:
# Refactoring code to separate 'making soup' from 'collecting references'
def get_soup(url, timeout=10):
    """Get page HTML and load into BeautifulSoup.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before requests gives up.
            Without a timeout the call can block indefinitely.

    Returns:
        A BeautifulSoup object for the page, or ``None`` when the response
        status is not 200.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return None
    # Name the parser explicitly so parsing does not depend on which
    # optional parsers are installed (and bs4 does not warn about guessing).
    return BeautifulSoup(response.content, 'html.parser')

In [85]:
# Take two of the collect_refs function
def collect_refs(soup):
    """Scrape reference links from a wikipedia page.

    Args:
        soup: Parsed page (as returned by get_soup), or ``None`` when the
            page could not be fetched.

    Returns:
        A list of href strings found in the page's reference list. Empty
        when ``soup`` is ``None`` or the page has no
        ``<ol class="references">`` element.
    """
    if soup is None:
        # get_soup returns None for a failed request; nothing to scrape.
        return []

    reference_lists = soup.find_all('ol', class_='references')
    if not reference_lists:
        return []

    # When the page has a "Notes" heading, the first list holds the notes and
    # the second one the references -- but only switch if a second list exists.
    if soup.find('span', id='Notes') and len(reference_lists) > 1:
        references = reference_lists[1]
    else:
        references = reference_lists[0]

    # Passing a list to class_ matches anchors carrying either class.
    anchors = references.find_all('a', class_=['external', 'text'])

    # Keep only anchors that actually carry a non-empty href.
    return [anchor.get('href') for anchor in anchors if anchor.get('href')]
    

In [54]:
# Testing the refactored functions 
soup = get_soup(url)
collect_refs(soup)

In [61]:
# First attempt at link-finding function
def get_first_link(soup):
    """Find the first internal link to another wiki page.

    NOTE: this first attempt is kept as it was written during the
    live-coding session and contains a known bug (see the comment inside
    the loop). The post-livecoding notes further down explain it, and a
    corrected definition follows them.
    """
    intro_paragraphs = soup.find('div', class_='mw-body-content').find_all('p')
    
    # Link is relative, so adding the rest of the URL
    prepend = 'https://en.wikipedia.org'
    link = None
    start = 0
    while not link:
        # BUG: this reads the notebook-level `paragraphs` variable rather than
        # the local `intro_paragraphs`, so the returned link does not depend
        # on `soup` -- every call yields the link from whatever page
        # `paragraphs` was built from.
        a = paragraphs[start].find('a')
        if a:
            link = a.get('href')
        start += 1
    return prepend + link
    
    

In [62]:
# First trial
get_first_link(soup)

'https://en.wikipedia.org/wiki/College'

In [86]:
# First test of crawling multiple pages to get references.. we have bugs! 
import time

def crawl_references(start_url, n_pages):
    """Walk a chain of wikipedia pages, gathering reference URLs from each.

    Starting at ``start_url``, every visited page contributes one list of
    reference links; the crawl then hops to the link returned by
    get_first_link for that page. ``n_pages`` pages are visited in total.
    Returns a list with one list of URLs per visited page.
    """
    collected = []
    current_url = start_url

    for _ in range(n_pages):
        page = get_soup(current_url)
        collected.append(collect_refs(page))

        # Pick the next hop, report it, then pause before the next request
        current_url = get_first_link(page)
        print(current_url)
        time.sleep(5)

    return collected
        
    
    
    

In [88]:
crawl_references(url, 3)

https://en.wikipedia.org/wiki/College
https://en.wikipedia.org/wiki/College
https://en.wikipedia.org/wiki/College


[['http://www.lancaster.ac.uk/colleges/fylde/scr/',
  'http://www.lancaster.ac.uk/colleges/fylde/about-us/deanery/',
  'https://web.archive.org/web/20110605231852/http://www.lancs.ac.uk/unihistory/growth/fyldecollegelink.htm',
  'http://www.lancs.ac.uk/unihistory/growth/fyldecollegelink.htm',
  'http://www.lancs.ac.uk/users/fylde/allabout/abouthistory.htm',
  'https://www.insidermedia.com/news/north-west/novus-completes-on-lancaster-university-schemes',
  'http://www.civicheraldry.co.uk/lancs_pre74.html',
  'https://web.archive.org/web/20110816224729/http://fylde.lusu.co.uk/fyldesport/',
  'http://fylde.lusu.co.uk/fyldesport/',
  'https://www.lancaster.ac.uk/news/alumni-awards-for-high-flying-lancaster-graduates'],
 ['https://www.usnews.com/education/community-colleges/articles/2015/02/06/frequently-asked-questions-community-college',
  'https://web.archive.org/web/20211028010848/https://www.usnews.com/education/community-colleges/articles/2015/02/06/frequently-asked-questions-communit

---

## Bug hunting starts here

In [68]:
collect_refs(get_soup('https://en.wikipedia.org/wiki/College'))

[]

In [69]:
soup = get_soup('https://en.wikipedia.org/wiki/College')

In [70]:
soup.find

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-zebra-design-disabled vector-feature-custom-font-size-clientpref-0 vector-feature-client-preferences-disabled vector-feature-typography-survey-disabled vector-toc-available" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>College - Wikipedia</title>
<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1

In [76]:
references = soup.find('div', class_='mw-references-wrap')

In [77]:
references

<div class="mw-references-wrap"><ol class="references">
<li id="cite_note-12"><span class="mw-cite-backlink"><b><a href="#cite_ref-12">^</a></b></span> <span class="reference-text">Exceptions are made for "mature" student, meaning 21 years of age or over, and out of the educational system for at least 2 years.</span>
</li>
</ol></div>

In [None]:
references = references.find_all('a', class_=['external', 'text'])

In [71]:
reference_list = []
for reference in references:
    if reference.get('href'):
        reference_list.append(reference.get('href'))

In [79]:
soup.find('span', id='Notes')

<span class="mw-headline" id="Notes">Notes</span>

In [None]:
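# NOTE: this looks like the previous cell's output pasted into a code cell;
# commented out so that running the cell does not raise a SyntaxError.
# <span class="mw-headline" id="Notes">Notes</span>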

In [80]:
if soup.find('span', id='Notes'):
    references = soup.find_all('ol', class_='references')[1]

In [81]:
references

<ol class="references">
<li id="cite_note-1"><span class="mw-cite-backlink"><b><a href="#cite_ref-1">^</a></b></span> <span class="reference-text"><style data-mw-deduplicate="TemplateStyles:r1133582631">.mw-parser-output cite.citation{font-style:inherit;word-wrap:break-word}.mw-parser-output .citation q{quotes:"\"""\"""'""'"}.mw-parser-output .citation:target{background-color:rgba(0,127,255,0.133)}.mw-parser-output .id-lock-free a,.mw-parser-output .citation .cs1-lock-free a{background:url("//upload.wikimedia.org/wikipedia/commons/6/65/Lock-green.svg")right 0.1em center/9px no-repeat}.mw-parser-output .id-lock-limited a,.mw-parser-output .id-lock-registration a,.mw-parser-output .citation .cs1-lock-limited a,.mw-parser-output .citation .cs1-lock-registration a{background:url("//upload.wikimedia.org/wikipedia/commons/d/d6/Lock-gray-alt-2.svg")right 0.1em center/9px no-repeat}.mw-parser-output .id-lock-subscription a,.mw-parser-output .citation .cs1-lock-subscription a{background:url("//

## Bug hunting ends. Post-livecode notes & solutions below
---

In [None]:
######### post-livecoding #########

# The mistake was in get_first_link: it read a global variable (`paragraphs`) that I forgot to rename!

#     while not link:
#         a = paragraphs[start].find('a')
#         if a:
#             link = a.get('href')

# This should be intro_paragraphs[start], not paragraphs[start].
# Please learn from my mistakes ;_;

# I also added a condition to ignore help pages. 

In [123]:
######### Fixed function and testing below #########

def get_soup(url, timeout=10):
    """Get page HTML and load into BeautifulSoup.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before requests gives up.
            Without a timeout the call can block indefinitely.

    Returns:
        A BeautifulSoup object for the page, or ``None`` when the response
        status is not 200.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return None
    # Name the parser explicitly so parsing does not depend on which
    # optional parsers are installed (and bs4 does not warn about guessing).
    return BeautifulSoup(response.content, 'html.parser')

def get_first_link(soup):
    """Find the first internal link to another wiki page.

    Walks the ``<p>`` elements inside the ``mw-body-content`` div in order
    and looks at the first anchor of each one. The first anchor whose href
    is non-empty and does not start with ``/wiki/Help`` wins.

    Args:
        soup: Parsed wikipedia page.

    Returns:
        The absolute URL (``https://en.wikipedia.org`` + href) of that link.

    Raises:
        IndexError: If no paragraph yields a usable link. The exception type
            matches what the earlier index-based loop raised when it ran
            past the last paragraph.
    """
    # Link is relative, so adding the rest of the URL
    prepend = 'https://en.wikipedia.org'

    content = soup.find('div', class_='mw-body-content')
    # A page without the content div is treated as having no paragraphs.
    intro_paragraphs = content.find_all('p') if content is not None else []

    for paragraph in intro_paragraphs:
        # Only the first anchor of each paragraph is considered.
        a = paragraph.find('a')
        if not a:
            continue
        href = a.get('href')
        # Anchors without an href are skipped rather than crashing on
        # .startswith; help pages are ignored.
        if href and not href.startswith('/wiki/Help'):
            return prepend + href

    raise IndexError('no usable link found in the page paragraphs')

def crawl_references(start_url, n_pages, delay=5):
    """Crawl wikipedia pages and collect URLs from the reference section
    of the page. After scraping the start page, moves on to the first
    URL linked in the intro. Continues until n_pages have been scraped.

    Args:
        start_url: URL of the first page to scrape.
        n_pages: Maximum number of pages to scrape.
        delay: Seconds to pause between consecutive page requests.

    Returns:
        A list with one list of reference URLs per scraped page. It can be
        shorter than ``n_pages`` when a page cannot be fetched or a page
        offers no link to continue with.
    """
    next_url = start_url
    all_refs = []

    for i in range(n_pages):
        soup = get_soup(next_url)
        if soup is None:
            # get_soup returns None for a non-200 response; keep what we have.
            print('Could not fetch ' + str(next_url) + '; stopping early.')
            break

        refs = collect_refs(soup)
        all_refs.append(refs)

        try:
            next_url = get_first_link(soup)
        except IndexError:
            # get_first_link raises IndexError when no paragraph yields a
            # link; return the references gathered so far instead of losing them.
            print('No onward link found; stopping early.')
            break
        print(next_url)

        # Pause between requests; nothing follows the last page, so skip it there.
        if i < n_pages - 1:
            time.sleep(delay)

    return all_refs

In [124]:
crawl_references(url, 3)

https://en.wikipedia.org/wiki/College
https://en.wikipedia.org/wiki/Latin
https://en.wikipedia.org/wiki/Fusional_language


[['http://www.lancaster.ac.uk/colleges/fylde/scr/',
  'http://www.lancaster.ac.uk/colleges/fylde/about-us/deanery/',
  'https://web.archive.org/web/20110605231852/http://www.lancs.ac.uk/unihistory/growth/fyldecollegelink.htm',
  'http://www.lancs.ac.uk/unihistory/growth/fyldecollegelink.htm',
  'http://www.lancs.ac.uk/users/fylde/allabout/abouthistory.htm',
  'https://www.insidermedia.com/news/north-west/novus-completes-on-lancaster-university-schemes',
  'http://www.civicheraldry.co.uk/lancs_pre74.html',
  'https://web.archive.org/web/20110816224729/http://fylde.lusu.co.uk/fyldesport/',
  'http://fylde.lusu.co.uk/fyldesport/',
  'https://www.lancaster.ac.uk/news/alumni-awards-for-high-flying-lancaster-graduates'],
 ['https://www.usnews.com/education/community-colleges/articles/2015/02/06/frequently-asked-questions-community-college',
  'https://web.archive.org/web/20211028010848/https://www.usnews.com/education/community-colleges/articles/2015/02/06/frequently-asked-questions-communit