In [1]:
#Imports
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import os

In [2]:
# Crawl-wide registries: sets guarantee each discovered URL is stored only once.
# internal_urls -> links on the crawled site; external_urls -> links leaving it.
internal_urls, external_urls = set(), set()

In [3]:
def is_url_valid(url):
    """Return True when ``url`` has both a scheme and a network location.

    Relative paths, bare host names without a scheme, and scheme-only
    strings are all reported as invalid.
    """
    parts = urlparse(url)
    # Both components must be non-empty for the URL to count as absolute.
    return all((parts.scheme, parts.netloc))

# How the function above works: a URL is treated as valid only when urlparse finds both a scheme and a non-empty network location (netloc). For example:
# url = "https://umd.edu/virusinfo"
# urlparse(url)
# ParseResult(scheme='https', netloc='umd.edu', path='/virusinfo', 
# params='', query='', fragment='')

In [4]:

#The function gives all urls
def get_all_urls(url, timeout=10):
    """Collect the not-yet-seen internal links found on the page at ``url``.

    Every ``<a href>`` on the page is resolved against ``url`` and reduced to
    ``scheme://netloc/path`` (query string and fragment are dropped). A link
    is internal when its host equals the host of ``url`` or is a subdomain of
    it; every other link is external.

    Side effects: each newly discovered link is printed and recorded in the
    module-level ``internal_urls`` or ``external_urls`` set.

    Args:
        url: Absolute URL of the page to download and scan.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        set: Internal links that were not already present in ``internal_urls``.

    Raises:
        requests.RequestException: Propagated from ``requests.get`` when the
            page cannot be fetched (including on timeout).
    """
    urls = set()
    # host of the page being scanned, without the protocol (e.g. umd.edu)
    domain_name = urlparse(url).netloc
    # A timeout keeps one unresponsive server from stalling the whole crawl.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")
    for a_tag in soup.find_all("a"):
        href = a_tag.attrs.get("href")
        if not href:
            # href is missing or empty, so this <a> element is of no use
            continue
        # if the link is not absolute, make it so by joining it to the page URL
        absolute_href = urljoin(url, href)
        if not is_url_valid(absolute_href):
            # Validate BEFORE rebuilding the URL: rebuilding a host-less link
            # such as "mailto:x@umd.edu" would yield "mailto://x@umd.edu",
            # which looks valid and would wrongly be queued for crawling.
            continue
        parsed_href = urlparse(absolute_href)
        # rebuild the URL without its query string and fragment
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if href in internal_urls:
            # it is already in the set, so we don't need to add it again
            continue
        # Compare hosts rather than searching the whole URL text, so a foreign
        # site whose path merely contains our domain is not seen as internal.
        link_host = parsed_href.netloc
        is_internal = link_host == domain_name or link_host.endswith("." + domain_name)
        if not is_internal:
            # external link: report and record it only the first time it is seen
            if href not in external_urls:
                print(f"[EXT] External link: {href}")
                external_urls.add(href)
            continue
        print(f"[INT] Internal link: {href}")
        urls.add(href)
        internal_urls.add(href)
    return urls


In [5]:
# Number of pages fetched so far. It is module-level state shared by every
# crawl() call, so it keeps its value until this cell is run again.
total_urls_visited = 0
def crawl(url, max_urls=50):
    """Recursively crawl ``url`` and the internal links discovered from it.

    Each call fetches one page through ``get_all_urls`` and then recurses
    into the new internal links that page produced.

    Args:
        url: Page to fetch on this call. It is always fetched, even when the
            budget is already used up.
        max_urls: Upper bound on the total number of pages fetched, counted
            in the module-level ``total_urls_visited``. It only exists to cap
            crawling time on sites with many pages.
    """
    #The max_urls idea comes from code openly available on github; it is
    #useful to limit crawling time
    global total_urls_visited
    total_urls_visited += 1
    links = get_all_urls(url)
    for link in links:
        # ">=" rather than ">": with ">" the crawl fetched one page more
        # than max_urls before stopping.
        if total_urls_visited >= max_urls:
            break
        crawl(link, max_urls=max_urls)


In [7]:
# Starting page for the crawl, e.g. https://umd.edu/virusinfo
# Surrounding whitespace from a pasted URL is stripped: later cells match and
# remove this exact string inside the discovered links.
base_url = input("Enter the URL : ").strip()

Enter the URL : https://umd.edu/virusinfo


In [8]:
# Split the entered URL once and keep the two pieces the later cells use.
parsedurl = urlparse(base_url)
# host only, e.g. "umd.edu"
base_url_text_domain = parsedurl.netloc
# host followed by path (no scheme), e.g. "umd.edu/virusinfo"
base_url_text = f"{base_url_text_domain}{parsedurl.path}"

In [9]:
# Run the crawler from the entered page, then summarise what it collected.
crawl(base_url)
external_total = len(external_urls)
internal_total = len(internal_urls)
print("[+] Total External links:", external_total)
print("[+] Total Internal links:", internal_total)
print("[+] Total:", external_total + internal_total)

[INT] Internal link: https://umd.edu/virusinfo
[INT] Internal link: https://umd.edu/
[EXT] External link: https://www.youtube.com/user/UMD2101
[EXT] External link: https://www.facebook.com/UnivofMaryland
[EXT] External link: https://twitter.com/UofMaryland
[EXT] External link: https://www.instagram.com/univofmaryland/
[INT] Internal link: https://umd.edu/virusinfo/campus-guidance
[INT] Internal link: https://umd.edu/virusinfo/stay-healthy
[INT] Internal link: https://umd.edu/virusinfo/communications-news
[INT] Internal link: https://umd.edu/virusinfo/confirmed-cases
[INT] Internal link: https://umd.edu/virusinfo/campus-resources
[INT] Internal link: https://umd.edu/virusinfo/help-students-crisis
[INT] Internal link: https://umd.edu/virusinfo/contact-information
[EXT] External link: https://www.cdc.gov/coronavirus/2019-ncov/index.html
[EXT] External link: https://phpa.health.maryland.gov/Pages/Novel-coronavirus.aspx
[INT] Internal link: https://umd.edu/covid-19-updates-and-resources/cas

[INT] Internal link: https://www.umd.edu/
[EXT] External link: http://cvs.umd.edu/visitors/maps.html
[EXT] External link: http://cvs.umd.edu/visitors/
[EXT] External link: http://www.visitmaryland.org/
[EXT] External link: https://visitmontgomery.com/
[EXT] External link: http://www.visitprincegeorges.com/
[EXT] External link: https://washington.org/
[EXT] External link: http://terpfamily.umd.edu/
[EXT] External link: http://www.commencement.umd.edu/
[EXT] External link: http://www.provost.umd.edu/
[EXT] External link: http://www.testudo.umd.edu/ScheduleOfClasses.html
[EXT] External link: http://shopcollegepark.org/search/
[EXT] External link: http://www.collegeparkmd.gov/
[EXT] External link: http://www.collegeparkmd.gov/programs_and_initiatives/livesmart/index.php
[EXT] External link: http://shopcollegepark.org/
[INT] Internal link: https://www.umd.edu/node/152
[INT] Internal link: https://www.umd.edu/web-accessibility
[INT] Internal link: https://www.umd.edu/administration/public-in

[INT] Internal link: https://www.umd.edu/privacy-notice/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/152
[INT] Internal link: https://www.umd.edu/privacy-notice/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/152
[INT] Internal link: https://www.umd.edu/privacy-notice/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/152
[INT] Internal link: https://www.umd.edu/privacy-notice/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/node/152
[INT] Internal link: https://w

In [11]:
# Internal links whose text contains the entered host + path, i.e. the pages
# associated with the starting page (https://umd.edu/virusinfo in this run).
immediate_urls = [found for found in internal_urls if base_url_text in found]

# One URL per line, each followed by a blank line.
for immediate_url in immediate_urls:
    print(f'{immediate_url}\n')


https://umd.edu/virusinfo/campus-guidance

https://umd.edu/virusinfo/campus-resources

https://umd.edu/virusinfo/help-students-crisis

https://umd.edu/virusinfo/communications-news

https://umd.edu/virusinfo/contact-information

https://umd.edu/virusinfo/stay-healthy

https://umd.edu/virusinfo/academic-faqs

https://umd.edu/virusinfo/confirmed-cases

https://umd.edu/virusinfo



In [16]:
count = 0  # kept from the original cell; nothing in this cell reads it
name_of_folder = f'{base_url_text_domain}_Screenshots'
# Create the output folder once, instead of re-checking it for every page.
os.makedirs(name_of_folder, exist_ok=True)

options = webdriver.ChromeOptions()
options.headless = True
# NOTE(review): executable_path / find_element_by_tag_name are kept exactly as
# in the original cell; confirm they match the installed Selenium version.
# A single browser is reused for all pages rather than launching one per URL.
driver = webdriver.Chrome(options=options,executable_path='chromedriver.exe') # Local path of chrome driver
try:
    for url in immediate_urls:
        # Page name = the URL with the entered base URL and all slashes removed.
        file_name = url.replace(base_url,'').replace('/','')
        print(f'Visiting {base_url_text}/{file_name}')
        print(f'...Taking a screenshot')
        driver.get(url)
        # Resize the window to the full document so the <body> screenshot
        # covers the whole page and not just the initial viewport.
        page_width = driver.execute_script('return document.body.parentNode.scrollWidth')
        page_height = driver.execute_script('return document.body.parentNode.scrollHeight')
        driver.set_window_size(page_width, page_height) # May need manual adjustment
        # The base page itself yields an empty name; label it "index" so the
        # file is not saved as "<domain>-.png".
        screenshot_name = file_name or 'index'
        # os.path.join picks the right separator on every operating system.
        screenshot_path = os.path.join(name_of_folder, f'{base_url_text_domain}-{screenshot_name}.png')
        driver.find_element_by_tag_name('body').screenshot(screenshot_path)
        print(f'Screenshot of {screenshot_name} page taken! \n')
finally:
    # Always shut the browser down, even when a page fails to load or save.
    driver.quit()

print(f'Task Completed! Files stored in the {name_of_folder} Folder')

Visiting umd.edu/virusinfo/campus-guidance
...Taking a screenshot
Screenshot of campus-guidance page taken! 

Visiting umd.edu/virusinfo/campus-resources
...Taking a screenshot
Screenshot of campus-resources page taken! 

Visiting umd.edu/virusinfo/help-students-crisis
...Taking a screenshot
Screenshot of help-students-crisis page taken! 

Visiting umd.edu/virusinfo/communications-news
...Taking a screenshot
Screenshot of communications-news page taken! 

Visiting umd.edu/virusinfo/contact-information
...Taking a screenshot
Screenshot of contact-information page taken! 

Visiting umd.edu/virusinfo/stay-healthy
...Taking a screenshot
Screenshot of stay-healthy page taken! 

Visiting umd.edu/virusinfo/academic-faqs
...Taking a screenshot
Screenshot of academic-faqs page taken! 

Visiting umd.edu/virusinfo/confirmed-cases
...Taking a screenshot
Screenshot of confirmed-cases page taken! 

Visiting umd.edu/virusinfo/
...Taking a screenshot
Screenshot of  page taken! 

Task Completed! Files 