In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup 

html = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon')
bs = BeautifulSoup(html, 'html.parser')
for link in bs.find_all('a'):
    if 'href' in link.attrs:
        print(link.attrs['href'])

#bodyContent
/wiki/Main_Page
/wiki/Wikipedia:Contents
/wiki/Portal:Current_events
/wiki/Special:Random
/wiki/Wikipedia:About
//en.wikipedia.org/wiki/Wikipedia:Contact_us
https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en
/wiki/Help:Contents
/wiki/Help:Introduction
/wiki/Wikipedia:Community_portal
/wiki/Special:RecentChanges
/wiki/Wikipedia:File_upload_wizard
/wiki/Main_Page
/wiki/Special:Search
/w/index.php?title=Special:CreateAccount&returnto=Kevin+Bacon
/w/index.php?title=Special:UserLogin&returnto=Kevin+Bacon
/w/index.php?title=Special:CreateAccount&returnto=Kevin+Bacon
/w/index.php?title=Special:UserLogin&returnto=Kevin+Bacon
/wiki/Help:Introduction
/wiki/Special:MyContributions
/wiki/Special:MyTalk
#
#Early_life_and_education
#Acting_career
#Early_work
#1980s
#1990s
#2000s
#2010s
#Advertising_work
#Six_Degrees_of_Kevin_Bacon
#Music
#Personal_life
#Accolades
#Awards_and_nominations
#Other_

## Retrieving Articles Only

In [1]:
from urllib.request import urlopen 
from bs4 import BeautifulSoup 
import re

html = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon')
bs = BeautifulSoup(html, 'html.parser')
for link in bs.find('div', {'id':'bodyContent'}).find_all(
    'a', href=re.compile('^(/wiki/)((?!:).)*$')):
    print(link.attrs['href'])

/wiki/Kevin_Bacon_(disambiguation)
/wiki/Philadelphia
/wiki/Kevin_Bacon_filmography
/wiki/Kyra_Sedgwick
/wiki/Sosie_Bacon
/wiki/Edmund_Bacon_(architect)
/wiki/Michael_Bacon_(musician)
/wiki/Leading_man
/wiki/Character_actor
/wiki/Golden_Globe_Award
/wiki/Screen_Actors_Guild_Award
/wiki/Primetime_Emmy_Award
/wiki/National_Lampoon%27s_Animal_House
/wiki/Footloose_(1984_film)
/wiki/Diner_(1982_film)
/wiki/JFK_(film)
/wiki/A_Few_Good_Men
/wiki/Apollo_13_(film)
/wiki/Mystic_River_(film)
/wiki/Frost/Nixon_(film)
/wiki/Friday_the_13th_(1980_film)
/wiki/Tremors_(1990_film)
/wiki/The_River_Wild
/wiki/The_Woodsman_(2004_film)
/wiki/Crazy,_Stupid,_Love
/wiki/Patriots_Day_(film)
/wiki/Losing_Chase
/wiki/Loverboy_(2005_film)
/wiki/Golden_Globe_Award_for_Best_Actor_%E2%80%93_Miniseries_or_Television_Film
/wiki/Screen_Actors_Guild_Award_for_Outstanding_Performance_by_a_Male_Actor_in_a_Miniseries_or_Television_Movie
/wiki/Michael_Strobl
/wiki/HBO
/wiki/Taking_Chance
/wiki/Fox_Broadcasting_Company
/wik

## Random Walk

In [248]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import random
import re

random.seed(datetime.datetime.now().strftime('%s'))
def getLinks(articleUrl):
    html = urlopen(f'http://en.wikipedia.org{articleUrl}')
    bs = BeautifulSoup(html, 'html.parser')
    return bs.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))

links = getLinks('/wiki/Kevin_Bacon')
while len(links) > 0:
    newArticle = links[random.randint(0, len(links)-1)].attrs['href']
    print(newArticle)
    links = getLinks(newArticle)

/wiki/Albert_Finney
/wiki/Traffic_(2000_film)
/wiki/Grammy_Award_for_Best_Score_Soundtrack_for_Visual_Media
/wiki/Shandi_Sinnamon
/wiki/Sharon_Robinson_(songwriter)
/wiki/Everybody_Knows_(Leonard_Cohen_song)
/wiki/Concrete_Blonde
/wiki/Live_in_Brazil_2002
/wiki/Tomorrow_Wendy_(song)
/wiki/Musidisc
/wiki/Universal_Music_Group
/wiki/Big_Hit_Music
/wiki/Arirang_TV
/wiki/NewsNet
/wiki/Digi-TV
/wiki/Satellite_television
/wiki/Cable_television
/wiki/Bandwidth_(computing)
/wiki/Discrete_cosine_transform
/wiki/Moving_Picture_Experts_Group
/wiki/MPEG-G
/wiki/IEC_62351
/wiki/ISO/IEC_15504
/wiki/COBOL
/wiki/Salt_spray_test
/wiki/Metal
/wiki/Crystal_structure
/wiki/Chemical_elements
/wiki/Block_(periodic_table)
/wiki/Radon
/wiki/Group_8_element


KeyboardInterrupt: 

## Recursively crawling an entire site

In [250]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()
def getLinks(pageUrl):
    html = urlopen(f'http://en.wikipedia.org{pageUrl}')
    bs = BeautifulSoup(html, 'html.parser')
    for link in bs.find_all('a', href=re.compile('^(/wiki/)')):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                #We have encountered a new page
                newPage = link.attrs['href']
                print(newPage)
                pages.add(newPage)
                getLinks(newPage)
getLinks('')

/wiki/Main_Page
/wiki/Wikipedia:Contents
/wiki/Portal:Current_events
/wiki/Special:Random
/wiki/Wikipedia:About
/wiki/Help:Contents
/wiki/Help:Introduction
/wiki/Wikipedia:Community_portal
/wiki/Special:RecentChanges
/wiki/Wikipedia:File_upload_wizard
/wiki/Special:Search
/wiki/Special:MyContributions
/wiki/Special:MyTalk
/wiki/Special:WhatLinksHere/User_talk:2003:E9:6F41:8B00:458C:A677:4A39:9BCD


HTTPError: HTTP Error 404: Not Found

## Collecting Data Across an Entire Site

In [251]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()
def getLinks(pageUrl):
    html = urlopen(f'http://en.wikipedia.org{pageUrl}')
    bs = BeautifulSoup(html, 'html.parser')
    try:
        print(bs.h1.get_text())
        #mw-parser-output
        bodyContent = bs.find('div', {'id':'bodyContent'}).find_all('p')
        if len(bodyContent):
            print(bodyContent[0])
        print(bs.find(id='ca-edit').find('a').attrs['href'])
    except AttributeError:
        print('This page is missing something! Continuing.')
    
    for link in bs.find_all('a', href=re.compile('^(/wiki/)')):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                #We have encountered a new page
                newPage = link.attrs['href']
                print('-'*20)
                print(newPage)
                pages.add(newPage)
                getLinks(newPage)
getLinks('/wiki/General-purpose_programming_language') 

General-purpose programming language
<p>In <a class="mw-redirect" href="/wiki/Computer_software" title="Computer software">computer software</a>, a <b>general-purpose programming language</b> (<b>GPL</b>) is a <a href="/wiki/Programming_language" title="Programming language">programming language</a> for building <a href="/wiki/Software" title="Software">software</a> in a wide variety of application <a href="/wiki/Domain_(software_engineering)" title="Domain (software engineering)">domains</a>. Conversely, a <a href="/wiki/Domain-specific_language" title="Domain-specific language">domain-specific programming language</a> (DSL) is used within a specific area. For example, <a href="/wiki/Python_(programming_language)" title="Python (programming language)">Python</a> is a GPL, while <a href="/wiki/SQL" title="SQL">SQL</a> is a DSL for <a href="/wiki/Query_language" title="Query language">querying relational databases</a>.
</p>
/w/index.php?title=General-purpose_programming_language&action=

HTTPError: HTTP Error 404: Not Found

## Crawling across the Internet

In [256]:
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
import datetime
import random

#Retrieves a list of all Internal links found on a page
def getInternalLinks(bs, url):
    netloc = urlparse(url).netloc
    scheme = urlparse(url).scheme
    internalLinks = set()
    for link in bs.find_all('a'):
        if not link.attrs.get('href'):
            continue
        parsed = urlparse(link.attrs['href'])
        if parsed.netloc == '':
            internalLinks.add(f'{scheme}://{netloc}/{link.attrs["href"].strip("/")}')
        elif parsed.netloc == netloc:
            internalLinks.add(link.attrs['href'])
    return list(internalLinks)
            
#Retrieves a list of all external links found on a page
def getExternalLinks(bs, url):
    netloc = urlparse(url).netloc
    externalLinks = set()
    for link in bs.find_all('a'):
        if not link.attrs.get('href'):
            continue
        parsed = urlparse(link.attrs['href'])
        if parsed.netloc != '' and parsed.netloc != netloc:
            externalLinks.add(link.attrs['href'])
    return list(externalLinks)

def getRandomExternalLink(startingPage):
    bs = BeautifulSoup(urlopen(startingPage), 'html.parser')
    externalLinks = getExternalLinks(bs, startingPage)
    if not len(externalLinks):
        print('No external links, looking around the site for one')
        internalLinks = getInternalLinks(bs, startingPage)
        return getRandomExternalLink(random.choice(internalLinks))
    else:
        return random.choice(externalLinks)
    
def followExternalOnly(startingSite):
    externalLink = getRandomExternalLink(startingSite)
    print(f'Random external link is: {externalLink}')
    followExternalOnly(externalLink)


followExternalOnly('https://www.astro-seek.com')


Random external link is: https://es.astro-seek.com/horoscopos-gratuitos-cartas-astrologicas-online
Random external link is: https://tr.astro-seek.com/
Random external link is: https://de.astro-seek.com/
Random external link is: https://www.astro-seek.com/
Random external link is: https://horoscopes.astro-seek.com/planet-ingresses-and-particular-degree-returns-calculator
Random external link is: https://www.astro-seek.com/contact
Random external link is: https://horoscopes.astro-seek.com/sabian-symbols-calculator-calendar
Random external link is: https://mooncalendar.astro-seek.com/solar-and-lunar-eclipses-2024
Random external link is: https://horoscopes.astro-seek.com/astrology-secondary-progressions-directions-chart
Random external link is: https://www.astro-seek.com/registration
Random external link is: https://horoscopes.astro-seek.com/retrograde-planets-astrology-calendar-2024
Random external link is: https://es.astro-seek.com/horoscopos-gratuitos-cartas-astrologicas-online
Random 

KeyboardInterrupt: 

## Collect all External Links from a Site

In [339]:
# Collects a list of all external URLs found on the site
allExtLinks = []
allIntLinks = []


def getAllExternalLinks(url):
    bs = BeautifulSoup(urlopen(url), 'html.parser')
    internalLinks = getInternalLinks(bs, url)
    externalLinks = getExternalLinks(bs, url)
    for link in externalLinks:
        if link not in allExtLinks:
            allExtLinks.append(link)
            print(link)

    for link in internalLinks:
        if link not in allIntLinks:
            allIntLinks.append(link)
            getAllExternalLinks(link)


allIntLinks.append('https://www.astro.com')
getAllExternalLinks('https://www.astro.com')

https://twitter.com/share?text=Astrodienst%20-%20Die%20besten%20Horoskope%20auf%20diesem%20Planeten&url=https%3A%2F%2Fwww.astro.com%2Findex_g.htm
https://www.facebook.com/sharer/sharer.php?u=https%3A%2F%2Fwww.astro.com%2Findex_g.htm
https://twitter.com/share?text=Beziehungs-Horoskope%20im%20Astro%20shop&url=https%3A%2F%2Fwww.astro.com%2Fpro%2Fpr_partner_g.htm
https://www.facebook.com/sharer/sharer.php?u=https%3A%2F%2Fwww.astro.com%2Fpro%2Fpr_partner_g.htm
https://twitter.com/share?text=L%27équipe%20d%27Astrodienst&url=https%3A%2F%2Fwww.astro.com%2Fpeople%2Fstaff_f.htm
https://www.facebook.com/sharer/sharer.php?u=https%3A%2F%2Fwww.astro.com%2Fpeople%2Fstaff_f.htm
http://www.klangfeld.ch
http://www.astrolog.ch
https://twitter.com/share?text=Oroscopi%20della%20personalità&url=https%3A%2F%2Fwww.astro.com%2Fpro%2Fpr_personal_i.htm
https://www.facebook.com/sharer/sharer.php?u=https%3A%2F%2Fwww.astro.com%2Fpro%2Fpr_personal_i.htm
https://twitter.com/share?text=Love%20Horoscope&url=https%3A%2F

HTTPError: HTTP Error 404: Not Found