In [1]:
# Retrieve a given Wikipedia page and produce a list of links on that page!

from urllib.request import urlopen
from bs4 import BeautifulSoup 

top_level_page = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon')
bs = BeautifulSoup(top_level_page, 'html.parser')
for link in bs.find_all('a'):
    if 'href' in link.attrs:
        print(link.attrs['href'])

#bodyContent
/wiki/Main_Page
/wiki/Wikipedia:Contents
/wiki/Portal:Current_events
/wiki/Special:Random
/wiki/Wikipedia:About
//en.wikipedia.org/wiki/Wikipedia:Contact_us
https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en
/wiki/Help:Contents
/wiki/Help:Introduction
/wiki/Wikipedia:Community_portal
/wiki/Special:RecentChanges
/wiki/Wikipedia:File_upload_wizard
/wiki/Main_Page
/wiki/Special:Search
/w/index.php?title=Special:CreateAccount&returnto=Kevin+Bacon
/w/index.php?title=Special:UserLogin&returnto=Kevin+Bacon
/w/index.php?title=Special:CreateAccount&returnto=Kevin+Bacon
/w/index.php?title=Special:UserLogin&returnto=Kevin+Bacon
/wiki/Help:Introduction
/wiki/Special:MyContributions
/wiki/Special:MyTalk
#
#Early_life_and_education
#Acting_career
#Early_work
#1980s
#1990s
#2000s
#2010s
#Other_ventures
#Six_Degrees_of_Kevin_Bacon
#Personal_life
#Accolades
#Awards_and_nominations
#Other_honors
#S

In [4]:
#Generalization: Retrieve an arbitrary Wikipedia page and produce (or retrieve) a list of links on that page using a function!

from urllib.request import urlopen
from bs4 import BeautifulSoup 

def retrieve_an_arbitrary_wikipedia_page_and_produce_a_list_of_links_on_that_page(wikipedia_page : str):
    html = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon')
    bs = BeautifulSoup(html, 'html.parser')
    collection_that_can_be_mapped_to_links = bs.find_all('a')
    links = [link.attrs['href'] for link in collection_that_can_be_mapped_to_links if 'href' in link.attrs]
    return links

retrieve_an_arbitrary_wikipedia_page_and_produce_a_list_of_links_on_that_page('http://en.wikipedia.org/wiki/Kevin_Bacon') 

#TODO: Make a type that accepts only wikipedia pages, and use that type to make sure that correct pages are given as an input! (Types can be used to specify languages, specifying strings)
#TODO: links apply attrs['href'] should apply attrs['href'] on links. Find the closest expression in Python to do this after bs to simplify the results.
#Some links don't contain different articles.

['#bodyContent',
 '/wiki/Main_Page',
 '/wiki/Wikipedia:Contents',
 '/wiki/Portal:Current_events',
 '/wiki/Special:Random',
 '/wiki/Wikipedia:About',
 '//en.wikipedia.org/wiki/Wikipedia:Contact_us',
 'https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en',
 '/wiki/Help:Contents',
 '/wiki/Help:Introduction',
 '/wiki/Wikipedia:Community_portal',
 '/wiki/Special:RecentChanges',
 '/wiki/Wikipedia:File_upload_wizard',
 '/wiki/Main_Page',
 '/wiki/Special:Search',
 '/w/index.php?title=Special:CreateAccount&returnto=Kevin+Bacon',
 '/w/index.php?title=Special:UserLogin&returnto=Kevin+Bacon',
 '/w/index.php?title=Special:CreateAccount&returnto=Kevin+Bacon',
 '/w/index.php?title=Special:UserLogin&returnto=Kevin+Bacon',
 '/wiki/Help:Introduction',
 '/wiki/Special:MyContributions',
 '/wiki/Special:MyTalk',
 '#',
 '#Early_life_and_education',
 '#Acting_career',
 '#Early_work',
 '#1980s',
 '#1990s',
 '#2000s',
 

## Retrieving Articles Only

In [2]:
#Revise the previous process by retrieving only desired links by using the regular expression '^(/wiki/)((?!:).)*$")'!

from urllib.request import urlopen 
from bs4 import BeautifulSoup 
import re

top_level_page = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon')
bs = BeautifulSoup(top_level_page, 'html.parser')
for link in bs.find('div', {'id':'bodyContent'}).find_all(
    'a', href=re.compile('^(/wiki/)((?!:).)*$')):
    if 'href' in link.attrs:
        print(link.attrs['href'])

/wiki/Kevin_Bacon_(disambiguation)
/wiki/San_Diego_Comic-Con
/wiki/Philadelphia
/wiki/Pennsylvania
/wiki/Kyra_Sedgwick
/wiki/Sosie_Bacon
/wiki/Edmund_Bacon_(architect)
/wiki/Michael_Bacon_(musician)
/wiki/Footloose_(1984_film)
/wiki/JFK_(film)
/wiki/A_Few_Good_Men
/wiki/Apollo_13_(film)
/wiki/Mystic_River_(film)
/wiki/Sleepers
/wiki/The_Woodsman_(2004_film)
/wiki/Fox_Broadcasting_Company
/wiki/The_Following
/wiki/HBO
/wiki/Taking_Chance
/wiki/Golden_Globe_Award
/wiki/Screen_Actors_Guild_Award
/wiki/Primetime_Emmy_Award
/wiki/The_Guardian
/wiki/Academy_Award
/wiki/Hollywood_Walk_of_Fame
/wiki/Social_networks
/wiki/Six_Degrees_of_Kevin_Bacon
/wiki/SixDegrees.org
/wiki/Philadelphia
/wiki/Edmund_Bacon_(architect)
/wiki/Pennsylvania_Governor%27s_School_for_the_Arts
/wiki/Bucknell_University
/wiki/Glory_Van_Scott
/wiki/Circle_in_the_Square
/wiki/Nancy_Mills
/wiki/Cosmopolitan_(magazine)
/wiki/Fraternities_and_sororities
/wiki/Animal_House
/wiki/Search_for_Tomorrow
/wiki/Guiding_Light
/wiki/F

## Random Walk

In [10]:
#Generalization: Retrieve an arbitrary Wikipedia page and produce (or retrieve) a list of only desired links determined by the regex <regex> - a string - on that page using a function!
#Assumptions and their translation:
#   Desired URL's reside within the div with the id set to bodyContent. - is_subcollection_of(bs.find('div', {'id':'bodyContent'}), the_desired_links)
#   The URLs do not contain colons. - 
#   The URLs begin with /wiki/. - <regex>[:9] == '^(/wiki/)'
from urllib.request import urlopen
from bs4 import BeautifulSoup 
import re

def is_subcollection_of(sub, main):
    return all(item in main for item in sub)

def retrieve_an_arbitrary_wikipedia_page_and_produce_a_list_of_desired_links_on_that_page_using_the_tag_known_to_contain_the_desired_URLs_and_the_constraints_on_that_tag_and_a_regex_pattern(wikipedia_page : str, tag_known_to_contain_the_desired_urls : str, constraints_on_that_tag : dict, regex_pattern : str):
    html = urlopen(wikipedia_page)
    bs = BeautifulSoup(html, 'html.parser')
    collection_that_can_be_mapped_to_links = bs.find(tag_known_to_contain_the_desired_urls, constraints_on_that_tag).find_all('a', href=re.compile(regex_pattern))
    links = [link.attrs['href'] for link in collection_that_can_be_mapped_to_links if 'href' in link.attrs]
    return links
retrieve_an_arbitrary_wikipedia_page_and_produce_a_list_of_desired_links_on_that_page_using_the_tag_known_to_contain_the_desired_URLs_and_the_constraints_on_that_tag_and_a_regex_pattern('http://en.wikipedia.org/wiki/Kevin_Bacon', 'div', {'id':'bodyContent'}, '^(/wiki/)((?!:).)*$')

#Show an example for an arbitrary, different wikipedia page!
retrieve_an_arbitrary_wikipedia_page_and_produce_a_list_of_desired_links_on_that_page_using_the_tag_known_to_contain_the_desired_URLs_and_the_constraints_on_that_tag_and_a_regex_pattern('https://en.wikipedia.org/wiki/Oppenheimer_security_hearing', 'div', {'id':'bodyContent'}, '^(/wiki/)((?!:).)*$')
#TODO: Formulate the type "a dict that can only create dicts acceptablae by find_all in the second parameter!"
#TODO: Formulate the type "A string that's a regex."

['/wiki/J._Robert_Oppenheimer',
 '/wiki/Hearing_(law)',
 '/wiki/Lewis_Strauss',
 '/wiki/Kenneth_D._Nichols',
 '/wiki/United_States_Atomic_Energy_Commission',
 '/wiki/Roger_Robb',
 '/wiki/J._Robert_Oppenheimer',
 '/wiki/Lloyd_K._Garrison',
 '/wiki/Gordon_Gray_(politician)',
 '/wiki/Ward_V._Evans',
 '/wiki/Security_clearance',
 '/wiki/United_States_Atomic_Energy_Commission',
 '/wiki/J._Robert_Oppenheimer',
 '/wiki/Los_Alamos_Laboratory',
 '/wiki/World_War_II',
 '/wiki/Manhattan_Project',
 '/wiki/Atomic_bomb',
 '/wiki/Q_clearance',
 '/wiki/McCarthyism',
 '/wiki/Communist_front',
 '/wiki/Communist_Party_USA',
 '/wiki/United_States_Army_Counterintelligence',
 '/wiki/Hydrogen_bomb',
 '/wiki/Lewis_Strauss',
 '/wiki/John_F._Kennedy',
 '/wiki/Lyndon_B._Johnson',
 '/wiki/Dissent',
 '/wiki/United_States_Secretary_of_Energy',
 '/wiki/Jennifer_Granholm',
 '/wiki/Oppenheimer_(film)',
 '/wiki/World_War_II',
 '/wiki/J._Robert_Oppenheimer',
 '/wiki/University_of_California,_Berkeley',
 '/wiki/Harvard_U

In [13]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import random
import re

random.seed(datetime.datetime.now())
def getLinks(articleUrl):
    html = urlopen('http://en.wikipedia.org{}'.format(articleUrl))
    bs = BeautifulSoup(html, 'html.parser')
    return bs.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))

links = getLinks('/wiki/Kevin_Bacon')
while len(links) > 0:
    newArticle = links[random.randint(0, len(links)-1)].attrs['href']
    print(newArticle)
    links = getLinks(newArticle)

/wiki/Screen_Actors_Guild_Award_for_Outstanding_Performance_by_a_Cast_in_a_Motion_Picture
/wiki/Madeline_Kahn
/wiki/Jennifer_Ehle
/wiki/Justin_Chang
/wiki/Variety_(magazine)
/wiki/ISSN_(identifier)
/wiki/JPEG_XL


KeyboardInterrupt: 

## Recursively crawling an entire site

In [14]:
#Take in a Wikipedia article URL
#of the form /wiki/<Article_Name> and return a list of all linked
#article URLs in the same form with a function named 'getLinks'!
#Call getLinks with a starting article, choose
#a random article link from the returned list, and call getLinks
#again, until you stop the program or until no article links are found
#on the new page with the main function!


from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import random
import re

random.seed(datetime.datetime.now())
def getLinks(articleUrl):
    html = urlopen('http://en.wikipedia.org{}'.format(articleUrl))
    bs = BeautifulSoup(html, 'html.parser')
    return bs.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))

def main():
    links = getLinks('/wiki/Kevin_Bacon')
    while len(links) > 0:
        newArticle = links[random.randint(0, len(links)-1)].attrs['href']
        print(newArticle)
        links = getLinks(newArticle)

main()

URLError: <urlopen error [Errno -2] Name or service not known>

In [4]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()
def getLinks(pageUrl):
    global pages
    html = urlopen('http://en.wikipedia.org{}'.format(pageUrl))
    bs = BeautifulSoup(html, 'html.parser')
    for link in bs.find_all('a', href=re.compile('^(/wiki/)')):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                #We have encountered a new page
                newPage = link.attrs['href']
                print(newPage)
                pages.add(newPage)
                getLinks(newPage)
getLinks('')

/wiki/Wikipedia
/wiki/Wikipedia:Protection_policy#semi
/wiki/Wikipedia:Requests_for_page_protection
/wiki/Wikipedia:Requests_for_permissions


KeyboardInterrupt: 

## Collecting Data Across an Entire Site

In [4]:
#Exhaustive site crawl geneeralized (55 minutes 42 seconds)

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

from typing import List, Tuple

pages = set()

nodes = pages

edges = set()

def search_for_a_list_of_all_internal_links_on_that_page(bs : BeautifulSoup, regex_string):
    return bs.find_all('a', href=re.compile(regex_string))

def extend_edges_with_an_edge(first_node, second_node):#Assuming that edges is a global variable
    edges[first_node].append(second_node)

def extend_nodes_with_a_node(node):#Assuming that nodes are the global variable named "pages"
    nodes.add(node)

def extend_the_graph_with_a_node_and_an_edge(node, first_node_and_second_node : Tuple[object, object]):
    extend_nodes_with_a_node(node)
    extend_edges_with_an_edge(first_node_and_second_node[0], first_node_and_second_node[1])

def trigger_another_round_of_crawling_for_the_new_page_found(newPage):
    getLinks(newPage)


def crawl_every_one_of_those_links_and_find_additional_lists_of_links_on_each_one_of_them_then_trigger_another_round_of_crawling_for_every_new_page_found(bs : BeautifulSoup, all_internal_links_on_that_page : List[str], top_level_page):
    for link in all_internal_links_on_that_page:
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                #We have encountered a new page
                current_page = top_level_page
                newPage = link.attrs['href']

                extend_the_graph_with_a_node_and_an_edge(newPage, (current_page, newPage))

                print('-'*20)
                print(newPage)
                
                trigger_another_round_of_crawling_for_the_new_page_found(newPage)

def getLinks(pageUrl):
    global pages

    global edges 

    edges = dict() # Is interpreted as a function mapping a page to its neighbours.


    top_level_page = urlopen('http://en.wikipedia.org{}'.format(pageUrl))

    if top_level_page not in edges.keys():
        edges[top_level_page] = []

    bs = BeautifulSoup(top_level_page, 'html.parser')
    try:
        print(bs.h1.get_text())
        print(bs.find(id ='mw-content-text').find_all('p')[0])
        print(bs.find(id='ca-edit').find('span').find('a').attrs['href'])
    except AttributeError:
        print('This page is missing something! Continuing.')

    all_internal_links_on_that_page = search_for_a_list_of_all_internal_links_on_that_page(bs, '^(/wiki/)')
    crawl_every_one_of_those_links_and_find_additional_lists_of_links_on_each_one_of_them_then_trigger_another_round_of_crawling_for_every_new_page_found(bs, all_internal_links_on_that_page, top_level_page)


 #getLinks('') 
#Example
getLinks('/wiki/Kevin_Bacon')
#print(edges)

#TODO: Figure out why does the crawler stop!

Kevin Bacon
<p class="mw-empty-elt">
</p>
This page is missing something! Continuing.
{<http.client.HTTPResponse object at 0x7f4e7cc17e80>: ['/wiki/Main_Page']}
{'/wiki/Main_Page'}
--------------------
/wiki/Main_Page
Main Page
<p><i><b><a href="/wiki/Not_My_Responsibility" title="Not My Responsibility">Not My Respon­si­bility</a></b></i> is a 2020 American <a href="/wiki/Short_film" title="Short film">short film</a> written and produced by singer-songwriter <a href="/wiki/Billie_Eilish" title="Billie Eilish">Billie Eilish</a>. A commentary on <a href="/wiki/Body_shaming" title="Body shaming">body shaming</a> and double standards placed upon young women's appearances, it features a monologue from Eilish about the media scrutiny surrounding her body. The film is <a href="/wiki/Spoken_word" title="Spoken word">spoken-word</a> and stars Eilish in a dark room, where she gradually undresses before submerging herself in black substance. The film premiered during Eilish's <a href="/wiki/Where

IndexError: list index out of range

In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()
def getLinks(pageUrl):
    global pages
    #Obtain the parse tree of the HTML file!
    html = urlopen('http://en.wikipedia.org{}'.format(pageUrl))
    bs = BeautifulSoup(html, 'html.parser')
    #Check whether a title, a paragraph, or and edit link exists on the page!
    try:
        #Print the title!
        print(bs.h1.get_text())
        #Print just the first paragraph of text!
        print(bs.find(id ='mw-content-text').find_all('p')[0])
        #Print the edit links!
        print(bs.find(id='ca-edit').find('span').find('a').attrs['href'])
    except AttributeError:
        print('This page is missing a title or paragraphs or edit links! Continuing.')
    #search_for_a_list_of_all_internal_links_on_that_page, then crawl_every_one_of_those_links_and_find_additional_lists_of_links_on_each_one_of_them_then_trigger_another_round_of_crawling_for_every_new_page_found!
    for link in bs.find_all('a', href=re.compile('^(/wiki/)')):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                #We have encountered a new page.
                newPage = link.attrs['href']
                #Separate the printed content!
                print('-'*20)
                print(newPage)
                pages.add(newPage)
                getLinks(newPage)
#Call the getLinks function with '' parameter!
getLinks('') 

Main Page
<p><b><a href="/wiki/Tolui" title="Tolui">Tolui</a></b> (<abbr title="circa">c.</abbr> 1191 – 1232) was a prominent general and prince of the early <a href="/wiki/Mongol_Empire" title="Mongol Empire">Mongol Empire</a>. The fourth son of <a href="/wiki/Genghis_Khan" title="Genghis Khan">Genghis Khan</a> and his first wife <a href="/wiki/B%C3%B6rte" title="Börte">Börte</a>, Tolui came to prominence in 1221 during the <a href="/wiki/Mongol_invasion_of_the_Khwarazmian_Empire" title="Mongol invasion of the Khwarazmian Empire">Mongol invasion of the Khwarazmian Empire</a>—contemporary chroniclers claimed that his army killed more than three million people while capturing <a href="/wiki/Merv" title="Merv">Merv</a> and <a href="/wiki/Nishapur" title="Nishapur">Nishapur</a> in <a href="/wiki/Greater_Khorasan" title="Greater Khorasan">Khorasan</a>. While modern historians consider this figure exaggerated, Tolui's campaign was undoubtedly brutal. A candidate to inherit his father's empi

IndexError: list index out of range

## Crawling across the Internet

In [9]:
#Functions:
#   getInternalLinks(bs, includeURL)
#       format, urlparse, 
#       find_all, re.compile
#       startswith
#       append
#   getExternalLinks
#   getRandomExternalLink

from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
import datetime
import random

pages = set()
random.seed(datetime.datetime.now())

#Retrieves a list of all Internal links found on a page
def getInternalLinks(bs, includeUrl):
    includeUrl = '{}://{}'.format(urlparse(includeUrl).scheme, urlparse(includeUrl).netloc)
    internalLinks = []
    #Finds all links that begin with a "/"
    for link in bs.find_all('a', href=re.compile('^(/|.*'+includeUrl+')')):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                if(link.attrs['href'].startswith('/')):
                    internalLinks.append(includeUrl+link.attrs['href'])
                else:
                    internalLinks.append(link.attrs['href'])
    return internalLinks
            
#Retrieves a list of all external links found on a page
def getExternalLinks(bs, excludeUrl):
    externalLinks = []
    #Finds all links that start with "http" that do
    #not contain the current URL
    for link in bs.find_all('a', href=re.compile('^(http|www)((?!'+excludeUrl+').)*$')):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks

def getRandomExternalLink(startingPage):#Variables are predicates, that are either applied on a single file, such as html, or bs, or on multiple files, such as externalLinks
    #Initialize the html file, then the parse-tree, then the extarnalLinks!
    html = urlopen(startingPage)
    bs = BeautifulSoup(html, 'html.parser')
    externalLinks = getExternalLinks(bs, urlparse(startingPage).netloc)
    no_external_links_found = len(externalLinks) == 0
    if no_external_links_found:
        print('No external links, looking around the site for one.')#Behaviour of code is being printed by the program in first-person singular!
        domain = '{}://{}'.format(urlparse(startingPage).scheme, urlparse(startingPage).netloc)
        internalLinks = getInternalLinks(bs, domain)
        return getRandomExternalLink(internalLinks[random.randint(0,
                                    len(internalLinks)-1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks)-1)]
    
def followExternalOnly(startingSite):
    externalLink = getRandomExternalLink(startingSite)
    print('Random external link is: {}'.format(externalLink))
    followExternalOnly(externalLink)
            
followExternalOnly('http://oreilly.com')

SyntaxError: invalid syntax (<ipython-input-9-fa256841566e>, line 26)

## Collect all External Links from a Site

In [8]:
# Collects a list of all external URLs found on the site
allExtLinks = set()
allIntLinks = set()


def getAllExternalLinks(siteUrl):
    html = urlopen(siteUrl)
    domain = '{}://{}'.format(urlparse(siteUrl).scheme,
                              urlparse(siteUrl).netloc)
    bs = BeautifulSoup(html, 'html.parser')
    internalLinks = getInternalLinks(bs, domain)
    externalLinks = getExternalLinks(bs, domain)

    for link in externalLinks:
        if link not in allExtLinks:
            allExtLinks.add(link)
            print(link)
    for link in internalLinks:
        if link not in allIntLinks:
            allIntLinks.add(link)
            getAllExternalLinks(link)


allIntLinks.add('http://oreilly.com')
getAllExternalLinks('http://oreilly.com')

https://www.oreilly.com
http://www.oreilly.com/ideas
https://www.safaribooksonline.com/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170601+nav
http://www.oreilly.com/conferences/
http://shop.oreilly.com/
http://members.oreilly.com
https://www.oreilly.com/topics
https://www.safaribooksonline.com/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170505+homepage+get+started+now
https://www.safaribooksonline.com/accounts/login/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170203+homepage+sign+in
https://www.safaribooksonline.com/live-training/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170201+homepage+take+a+live+online+course
https://www.safaribooksonline.com/learning-paths/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170201+homepage+follow+a+path
https://www.safaribooksonline.com/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_c

KeyboardInterrupt: 

In [11]:
allExtLinks = set()
allIntLinks = set()

def getAllExternalLinks(siteUrl):
    html = urlopen(siteUrl)
    domain = '{}://{}'.format(urlparse(siteUrl).scheme,
    urlparse(siteUrl).netloc)
    bs = BeautifulSoup(html, 'html.parser')
    internalLinks = getInternalLinks(bs, domain)
    externalLinks = getExternalLinks(bs, domain)
    for link in externalLinks:
        if link not in allExtLinks:
            allExtLinks.add(link)
            print(link)
    for link in internalLinks:
        if link not in allIntLinks:
            allIntLinks.add(link)
            getAllExternalLinks(link)

allIntLinks.add('http://oreilly.com')
getAllExternalLinks('http://oreilly.com')

https://www.oreilly.com
https://www.oreilly.com/member/login/
https://www.oreilly.com/online-learning/try-now.html
https://www.oreilly.com/online-learning/teams.html
https://www.oreilly.com/online-learning/government.html
https://www.oreilly.com/online-learning/academic.html
https://www.oreilly.com/online-learning/individuals.html
https://www.oreilly.com/online-learning/features.html
https://www.oreilly.com/online-learning/courses.html
https://www.oreilly.com/online-learning/feature-certification.html
https://www.oreilly.com/online-learning/intro-interactive-learning.html
https://www.oreilly.com/online-learning/live-events.html
https://www.oreilly.com/online-learning/feature-answers.html
https://www.oreilly.com/online-learning/insights-dashboard.html
https://www.oreilly.com/radar/
https://www.oreilly.com/content-marketing-solutions.html
https://www.oreilly.com/ceros/554134-holiday-card-2023.html
https://learning.oreilly.com/start-trial/
https://www.oreilly.com/about/oreilly-approach-to

KeyboardInterrupt: 