In [2]:
#Imports
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import os

In [3]:
# Set to uniquely store internal and external urls
internal_urls = set()
external_urls = set()

In [4]:
def is_url_valid(url):
   #Checks if the url is valid or not
    parsedURL = urlparse(url)
    return bool(parsedURL.netloc) and bool(parsedURL.scheme)

#The above function works as follows. We check if the scheme is present and there is a value in the network location part
# url = "https://umd.edu/virusinfo"
# urlparse(url)
# ParseResult(scheme='https', netloc='umd.edu', path='/virusinfo', 
# params='', query='', fragment='')

In [5]:

#The function gives all urls
def get_all_urls(url):
    urls = set()
    # domain name of the URL without the protocol (umd.edu in this case)
    domain_name = urlparse(url).netloc
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    for a_tag in soup.findAll("a"):
        href = a_tag.attrs.get("href")
        if href == "" or href is None:
            #href is empty and we don't need that a element
            continue
        #if the link is not absolute, make it by joining relative to the base
        href = urljoin(url, href)
        parsed_href = urlparse(href)
        #constructing an absolute URL from parsed data
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if not is_url_valid(href):
            #in valid url
            continue
        if href in internal_urls:
            #it is already in the set, so we don't need to add
            continue
        if domain_name not in href:
            #it is an external link. i.e
            # Check if it is already there 
            if href not in external_urls:
                print(f"[EXT] External link: {href}")
                external_urls.add(href)
            continue
        print(f"[INT] Internal link: {href}")
        urls.add(href)
        internal_urls.add(href)
    return urls


In [6]:
total_urls_visited = 0
def crawl(url, max_urls=50):
    #Max URL is just to decrease the time if there are a lot of pages.
    #The following code was openly available of github and I found this
    #idea useful to inhibit crawling time
    global total_urls_visited
    total_urls_visited += 1
    links = get_all_urls(url)
    for link in links:
        if total_urls_visited > max_urls:
            break
        crawl(link, max_urls=max_urls)


In [7]:
#base_url = "https://umd.edu/virusinfo"
#Input
base_url = "input("Enter the URL : ")"
# base_url = "https://www.ucf.edu/coronavirus/"

In [8]:
parsedurl = urlparse(base_url)
# base_url_text = base_url.split("//",1)[1]
base_url_text = parsedurl.netloc+parsedurl.path
#print(base_url_text)
# base_url_text_domain = base_url_text.split("/",1)[0]
base_url_text_domain = parsedurl.netloc
#print(base_url_text_domain)

In [9]:
#Crawler call
crawl(base_url)
print("[+] Total External links:", len(external_urls))
print("[+] Total Internal links:", len(internal_urls))
print("[+] Total:", len(external_urls) + len(internal_urls))

[INT] Internal link: https://www.ucf.edu/coronavirus/
[INT] Internal link: https://www.ucf.edu/coronavirus/faq/
[INT] Internal link: https://www.ucf.edu/coronavirus/whats-open-at-ucf/
[INT] Internal link: https://www.ucf.edu/coronavirus/faculty-toolkit/
[INT] Internal link: https://www.ucf.edu/coronavirus/working-remotely/
[EXT] External link: https://digitallearning.ucf.edu/newsroom/keeplearning/
[INT] Internal link: https://www.ucf.edu/coronavirus/updates/
[INT] Internal link: https://www.ucf.edu/coronavirus/returning-to-ucf/
[EXT] External link: https://corona.research.ucf.edu/principles-for-reopening-labs/
[INT] Internal link: https://www.ucf.edu/news/antibody-testing-coming-to-ucfs-garage-a-testing-site/
[EXT] External link: tel://1-407-823-2701
[EXT] External link: tel://1-407-266-3627
[EXT] External link: tel://1-407-986-0133
[INT] Internal link: https://www.ucf.edu/news/covid-19-testing-site-to-open-at-ucf-monday-in-partnership-with-aventus-biolabs/
[EXT] External link: https:/

[INT] Internal link: https://www.ucf.edu/coronavirus/question/which-employees-will-be-allowed-to-return-to-work/tel:4078232000
[INT] Internal link: https://www.ucf.edu/feedback
[INT] Internal link: https://www.ucf.edu/academics/
[INT] Internal link: https://www.ucf.edu/admissions/
[INT] Internal link: https://www.ucf.edu/research/
[INT] Internal link: https://www.ucf.edu/locations/
[INT] Internal link: https://www.ucf.edu/students/
[INT] Internal link: https://www.ucf.edu/alumni-giving/
[INT] Internal link: https://www.ucf.edu/athletics/
[INT] Internal link: https://www.ucf.edu/apply-to-ucf/
[EXT] External link: https://admissions.ucf.edu/
[EXT] External link: https://graduate.ucf.edu/
[EXT] External link: https://hr.ucf.edu/
[INT] Internal link: https://www.ucf.edu/jobs/
[INT] Internal link: https://www.ucf.edu/tel:4078232000
[INT] Internal link: https://www.ucf.edu/academics/us-news-rankings/
[INT] Internal link: https://www.ucf.edu/degree-search
[EXT] External link: http://studentun

[EXT] External link: http://www.facebook.com/sharer.php
[INT] Internal link: https://www.ucf.edu/degree/aerospace-engineering-bsae/
[INT] Internal link: https://www.ucf.edu/college/engineering-computer-science/
[EXT] External link: https://apply.ucf.edu/application/
[EXT] External link: http://www.mae.ucf.edu
[EXT] External link: mailto://MAEadvising@ucf.edu
[EXT] External link: mailto://hjcho@ucf.edu
[EXT] External link: mailto://Lynn.Grabenhorst@ucf.edu
[EXT] External link: https://catalog.ucf.edu/preview_program.php
[EXT] External link: http://today.ucf.edu/princeton-review-names-ucf-1-nations-best-values/
[EXT] External link: http://www.kiplinger.com/tool/college/T014-S001-find-best-colleges-value-rankings/end_page.php
[EXT] External link: https://www.forbes.com/lists/2010/94/best-colleges-10_University-of-Central-Florida_94438.html
[EXT] External link: https://studentaccounts.ucf.edu/tf-tuitionratescurrent/
[INT] Internal link: https://www.ucf.edu/college/burnett-honors-college/
[

[INT] Internal link: https://www.ucf.edu/news/engineer-named-national-fellow-for-work-in-materials-science/
[INT] Internal link: https://www.ucf.edu/news/how-to-be-a-better-ally-against-racism/
[INT] Internal link: https://www.ucf.edu/news/ucf-1st-in-florida-3rd-nationally-for-coveted-national-science-foundation-career-grants/
[INT] Internal link: https://www.ucf.edu/news/faculty-members-recognized-for-commitment-to-undergraduate-research/
[INT] Internal link: https://www.ucf.edu/news/tag/faculty-excellence/tel:4078232000
[EXT] External link: https://our.ucf.edu/summer-research-on-campus-one-students-experience-in-the-surf-program/
[EXT] External link: https://sciences.ucf.edu/psychology/acat/
[EXT] External link: https://healthprofessions.ucf.edu/healthsciences/
[EXT] External link: https://sciences.ucf.edu/biology/
[EXT] External link: https://our.ucf.edu/learn/
[INT] Internal link: https://www.ucf.edu/news/tag/research/
[INT] Internal link: https://www.ucf.edu/news/faculty-members-r

[EXT] External link: https://www.facebook.com/LoveYourShorts/
[INT] Internal link: https://www.ucf.edu/news/motion-capture-to-on-screen-acting-theatre-animation-and-film-programs-team-up/
[INT] Internal link: https://www.ucf.edu/news/from-army-to-animation/
[INT] Internal link: https://www.ucf.edu/news/ucf-celebrates-arts-educational-component-benefit-k-12-students/
[INT] Internal link: https://www.ucf.edu/news/cheating-online-can-be-a-problem-but-there-are-ways-to-stifle-that/
[INT] Internal link: https://www.ucf.edu/news/tag/pegasus-briefs/
[INT] Internal link: https://www.ucf.edu/news/tag/stella-sung/
[INT] Internal link: https://www.ucf.edu/news/tag/animation/
[INT] Internal link: https://www.ucf.edu/news/ucf-represents-love-shorts-film-festival-sanford/tel:4078232000
[EXT] External link: https://arts.cah.ucf.edu/
[INT] Internal link: https://www.ucf.edu/news/ucf-celebrates-the-arts-2020-features-inspiration-from-dorothy-gillespie/
[INT] Internal link: https://www.ucf.edu/news/exod

In [10]:
immediate_urls = [] #Linked Associated with the current page, https://umd.edu/virusinfo/ in this case

for url in internal_urls:
    if base_url_text in url:
        immediate_urls.append(url)

for immediate_url in immediate_urls:
    print(f'{immediate_url}\n')


https://www.ucf.edu/coronavirus/question/what-if-i-have-concerns-about-disregard-of-safety-precautions/

https://www.ucf.edu/coronavirus/question/will-i-be-told-if-a-student-or-colleague-has-tested-positive-for-covid-19/

https://www.ucf.edu/coronavirus/question/when-will-i-receive-notification-about-whether-my-class-is-offered-face-to-face-or-remotely/

https://www.ucf.edu/coronavirus/faq/

https://www.ucf.edu/coronavirus/returning-to-ucf/tel:14072663627

https://www.ucf.edu/coronavirus/question/will-i-need-to-be-tested-for-covid-19-before-returning-to-work/

https://www.ucf.edu/coronavirus/question/will-study-abroad-programs-resume-in-the-fall/

https://www.ucf.edu/coronavirus/bookstore-update-summer-textbook-orders-and-returns/

https://www.ucf.edu/coronavirus/question/i-know-that-official-university-travel-has-been-suspended-are-there-any-restrictions-on-my-personal-travel/

https://www.ucf.edu/coronavirus/question/will-face-coverings-be-required/

https://www.ucf.edu/coronavirus/q

In [None]:
count = 0
name_of_folder = f'{base_url_text_domain}_Screenshots'
for i in immediate_urls:
    options = webdriver.ChromeOptions()
    options.headless = True
    driver = webdriver.Chrome(options=options,executable_path='chromedriver.exe') # #Local Path of chrome driver
    url = i
    file_name = url.replace(base_url,'')
    file_name = file_name.replace('/','')
    print(f'Visiting {base_url_text}/{file_name}')
    print(f'...Taking a screenshot')
    driver.get(url)
    if not os.path.exists(name_of_folder):
        os.makedirs(name_of_folder)
    S = lambda X: driver.execute_script('return document.body.parentNode.scroll'+X)
    driver.set_window_size(S('Width'),S('Height')) # May need manual adjustment                                                                                                                
    driver.find_element_by_tag_name('body').screenshot(f'{name_of_folder}\\{base_url_text_domain}-{file_name}.png')
    print(f'Screenshot of {file_name} page taken! \n')
    driver.quit()

print(f'Task Completed! Files stored in the {name_of_folder} Folder')

Visiting www.ucf.edu/coronavirus//questionwhat-if-i-have-concerns-about-disregard-of-safety-precautions
...Taking a screenshot
Screenshot of questionwhat-if-i-have-concerns-about-disregard-of-safety-precautions page taken! 

Visiting www.ucf.edu/coronavirus//questionwill-i-be-told-if-a-student-or-colleague-has-tested-positive-for-covid-19
...Taking a screenshot
Screenshot of questionwill-i-be-told-if-a-student-or-colleague-has-tested-positive-for-covid-19 page taken! 

Visiting www.ucf.edu/coronavirus//questionwhen-will-i-receive-notification-about-whether-my-class-is-offered-face-to-face-or-remotely
...Taking a screenshot
Screenshot of questionwhen-will-i-receive-notification-about-whether-my-class-is-offered-face-to-face-or-remotely page taken! 

Visiting www.ucf.edu/coronavirus//faq
...Taking a screenshot
Screenshot of faq page taken! 

Visiting www.ucf.edu/coronavirus//returning-to-ucftel:14072663627
...Taking a screenshot
Screenshot of returning-to-ucftel:14072663627 page taken! 


Visiting www.ucf.edu/coronavirus//questionwhat-about-employees-with-children-at-home
...Taking a screenshot
Screenshot of questionwhat-about-employees-with-children-at-home page taken! 

Visiting www.ucf.edu/coronavirus//returning-to-ucf
...Taking a screenshot
Screenshot of returning-to-ucf page taken! 

Visiting www.ucf.edu/coronavirus//documentreturn-to-campus-plan
...Taking a screenshot
Screenshot of documentreturn-to-campus-plan page taken! 

Visiting www.ucf.edu/coronavirus//documentjune-10-return-to-campus-presentation
...Taking a screenshot
Screenshot of documentjune-10-return-to-campus-presentation page taken! 

Visiting www.ucf.edu/coronavirus//working-remotely
...Taking a screenshot
Screenshot of working-remotely page taken! 

Visiting www.ucf.edu/coronavirus//tel:18772406863
...Taking a screenshot
Screenshot of tel:18772406863 page taken! 

Visiting www.ucf.edu/coronavirus//questionwhat-about-supplying-my-lab-with-personal-protective-equipment-and-cleaning-supplies
...Taking